From a4e7b836b8d2124763b0a672fc0fed446a091f66 Mon Sep 17 00:00:00 2001 From: jetsparrow Date: Sat, 17 Aug 2019 22:23:13 +0300 Subject: [PATCH] Make unbleeper testable - separate unbleeper as dependency of aasb - fix algorithm - tests --- .../AntiAntiSwearingBot.Tests.csproj | 19 +++++ AntiAntiSwearingBot.Tests/FilterTests.cs | 74 +++++++++++++++++++ AntiAntiSwearingBot.sln | 8 +- AntiAntiSwearingBot/AntiAntiSwearingBot.cs | 37 +--------- AntiAntiSwearingBot/CommandRouter.cs | 5 +- AntiAntiSwearingBot/Config.cs | 47 ++++++------ AntiAntiSwearingBot/Language.cs | 51 +++---------- AntiAntiSwearingBot/Program.cs | 1 + AntiAntiSwearingBot/SearchDictionary.cs | 12 ++- AntiAntiSwearingBot/Unbleeper.cs | 59 +++++++++++++++ AntiAntiSwearingBot/aasb.cfg.json | 4 +- .../dict/ObsceneDictionaryRu.txt | 1 - 12 files changed, 213 insertions(+), 105 deletions(-) create mode 100644 AntiAntiSwearingBot.Tests/AntiAntiSwearingBot.Tests.csproj create mode 100644 AntiAntiSwearingBot.Tests/FilterTests.cs create mode 100644 AntiAntiSwearingBot/Unbleeper.cs diff --git a/AntiAntiSwearingBot.Tests/AntiAntiSwearingBot.Tests.csproj b/AntiAntiSwearingBot.Tests/AntiAntiSwearingBot.Tests.csproj new file mode 100644 index 0000000..d6f5994 --- /dev/null +++ b/AntiAntiSwearingBot.Tests/AntiAntiSwearingBot.Tests.csproj @@ -0,0 +1,19 @@ + + + + netcoreapp2.1 + + false + + + + + + + + + + + + + diff --git a/AntiAntiSwearingBot.Tests/FilterTests.cs b/AntiAntiSwearingBot.Tests/FilterTests.cs new file mode 100644 index 0000000..cb95e09 --- /dev/null +++ b/AntiAntiSwearingBot.Tests/FilterTests.cs @@ -0,0 +1,74 @@ +using System; +using Xunit; + +namespace AntiAntiSwearingBot.Tests +{ + public class FilterTests + { + Unbleeper ubl { get; } + public FilterTests() + { + var cfg = Config.Load("aasb.cfg.json", "aasb.cfg.secret.json"); + var dict = new SearchDictionary(cfg); + ubl = new Unbleeper(dict, cfg.Unbleeper); + } + + [Theory] + [InlineData("бл**ь", "*блядь")] + [InlineData("ж**а", "*жопа")] + public void UnbleepSimpleSwears(string word, string expected) + { + var unbleep = ubl.UnbleepSwears(word).TrimEnd(Environment.NewLine.ToCharArray()); + Assert.Equal(expected, unbleep); + } + + [Theory] + [InlineData("*")] + [InlineData("**#")] + [InlineData("@**#")] + public void IgnoreShortGrawlixes(string text) => Assert.Null(ubl.UnbleepSwears(text)); + + [Theory] + [InlineData("@pvkuznetsov https://github.com/jacksondunstan/UnityNativeScripting")] + [InlineData("@JohnnyMnemonic")] + [InlineData("@Artyom по поводу")] + [InlineData("@Laima прошу блины!")] + [InlineData("эй админ @harry0xfefecaca верни бота")] + public void IgnoreMentions(string text) => Assert.Null(ubl.UnbleepSwears(text)); + + [Theory] + [InlineData("x - floor(abs(x)) * sign(x) -- вроде такая формула для frac(x)")] + public void IgnoresWeirdShit(string text) => Assert.Null(ubl.UnbleepSwears(text)); + + [Theory] + [InlineData("/poll")] + [InlineData("/roll 2d6")] + [InlineData("/award medal")] + [InlineData("/status@MinecraftServerBot")] + [InlineData("/broadcast@MinecraftServerBot пи#*ец вы понастроили тут")] + [InlineData("/ban@MinecraftServerBot @dirty_johnny86")] + public void IgnoreCommands(string text) => Assert.Null(ubl.UnbleepSwears(text)); + + [Theory] + [InlineData("#UEeğitimKarazin")] + [InlineData("#KöksalBabaCafeTrabzonda")] + [InlineData("#ZehraHanımSüresizeKadro")] + [InlineData("#define")] + [InlineData("#ifndef")] + [InlineData("#trashtag")] + [InlineData("#MeToo")] + [InlineData("#инстаграм")] + [InlineData("#битваБлогеров")] + [InlineData("#зенитахмат")] + [InlineData("#HappyKWONJIYONGDay")] + [InlineData("#MCITOT")] + [InlineData("#ТамбовКраснодар")] + [InlineData("#JRockконвент2019")] + [InlineData("#DonaldTrumpAgain")] + [InlineData("#ZodiacKillerStrikesAgain")] + [InlineData("#ThanksObama")] + [InlineData("#BalıkBurcuKızıylaEvlenmek")] + public void IgnoreHashtags(string text) => Assert.Null(ubl.UnbleepSwears(text)); + + } +} diff --git a/AntiAntiSwearingBot.sln b/AntiAntiSwearingBot.sln index 7ea08cc..886cca0 100644 --- a/AntiAntiSwearingBot.sln +++ b/AntiAntiSwearingBot.sln @@ -3,7 +3,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.28010.2036 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AntiAntiSwearingBot", "AntiAntiSwearingBot\AntiAntiSwearingBot.csproj", "{66AFFD7B-5B2D-4C85-8523-770702255511}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AntiAntiSwearingBot", "AntiAntiSwearingBot\AntiAntiSwearingBot.csproj", "{66AFFD7B-5B2D-4C85-8523-770702255511}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AntiAntiSwearingBot.Tests", "AntiAntiSwearingBot.Tests\AntiAntiSwearingBot.Tests.csproj", "{AA3CB2CB-05F1-46C4-8710-2702BD663A8B}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -15,6 +17,10 @@ Global {66AFFD7B-5B2D-4C85-8523-770702255511}.Debug|Any CPU.Build.0 = Debug|Any CPU {66AFFD7B-5B2D-4C85-8523-770702255511}.Release|Any CPU.ActiveCfg = Release|Any CPU {66AFFD7B-5B2D-4C85-8523-770702255511}.Release|Any CPU.Build.0 = Release|Any CPU + {AA3CB2CB-05F1-46C4-8710-2702BD663A8B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {AA3CB2CB-05F1-46C4-8710-2702BD663A8B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {AA3CB2CB-05F1-46C4-8710-2702BD663A8B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {AA3CB2CB-05F1-46C4-8710-2702BD663A8B}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/AntiAntiSwearingBot/AntiAntiSwearingBot.cs b/AntiAntiSwearingBot/AntiAntiSwearingBot.cs index 5dbfe7b..89d98d6 100644 --- a/AntiAntiSwearingBot/AntiAntiSwearingBot.cs +++ b/AntiAntiSwearingBot/AntiAntiSwearingBot.cs @@ -16,14 +16,13 @@ namespace AntiAntiSwearingBot { Config Config { get; } SearchDictionary Dict { get; } + Unbleeper Unbleeper { get; } public AntiAntiSwearingBot(Config cfg, SearchDictionary dict) { Config = cfg; Dict = dict; - BleepedSwearsRegex = new Regex(cfg.BleepedSwearsRegex, RegexOptions.Compiled); - NonWordRegex = new Regex("\\W", RegexOptions.Compiled); - MentionRegex = new Regex("@[a-zA-Z0-9_]+", RegexOptions.Compiled); + Unbleeper = new Unbleeper(dict, cfg.Unbleeper); } TelegramBotClient Client { get; set; } @@ -49,41 +48,11 @@ namespace AntiAntiSwearingBot public async Task Stop() { - Dict.Save(); Dispose(); } #region service - Regex BleepedSwearsRegex { get; } - Regex NonWordRegex { get; } - Regex MentionRegex { get; } - - string UnbleepSwears(string text) - { - if (string.IsNullOrWhiteSpace(text)) - return null; - - var words = BleepedSwearsRegex.Matches(text) - .Select(m => m.Value) - .Where(m => NonWordRegex.IsMatch(m)) - .Where(m => !MentionRegex.IsMatch(m)) - .ToArray(); - - if (words.Any()) - { - var response = new StringBuilder(); - for (int i = 0; i < words.Length; ++i) - { - var m = Dict.Match(words[i]); - response.AppendLine(new string('*', i + 1) + m.Word + new string('?', m.Distance)); - } - return response.ToString(); - } - else - return null; - } - void BotOnMessageReceived(object sender, MessageEventArgs args) { var msg = args.Message; @@ -104,7 +73,7 @@ namespace AntiAntiSwearingBot } else { - var unbleepResponse = UnbleepSwears(msg.Text); + var unbleepResponse = Unbleeper.UnbleepSwears(msg.Text); if (unbleepResponse != null) Client.SendTextMessageAsync( args.Message.Chat.Id, diff --git a/AntiAntiSwearingBot/CommandRouter.cs b/AntiAntiSwearingBot/CommandRouter.cs index eae4a84..800a8de 100644 --- a/AntiAntiSwearingBot/CommandRouter.cs +++ b/AntiAntiSwearingBot/CommandRouter.cs @@ -29,10 +29,7 @@ namespace AntiAntiSwearingBot if (cmd.UserName != null && cmd.UserName != Username) return null; if (Commands.ContainsKey(cmd.Command)) - { - try { return Commands[cmd.Command].Execute(cmd, args); } - catch { } - } + return Commands[cmd.Command].Execute(cmd, args); } return null; } diff --git a/AntiAntiSwearingBot/Config.cs b/AntiAntiSwearingBot/Config.cs index bb86862..c8f4f03 100644 --- a/AntiAntiSwearingBot/Config.cs +++ b/AntiAntiSwearingBot/Config.cs @@ -3,30 +3,33 @@ public class Config : ConfigBase { public string ApiKey { get; private set; } - - public string BleepedSwearsRegex { get; private set; } - - public struct ProxySettings - { - public string Url { get; private set; } - public int Port { get; private set; } - public string Login { get; private set; } - public string Password { get; private set; } - } - public ProxySettings Proxy { get; private set; } - - public struct SearchDictionarySettings - { - public string DictionaryPath { get; private set; } - - public double LearnNudgeFactor { get; private set; } - public double LearnInitialRating { get; private set; } - public int MinUnlearnNudge { get; private set; } - public double UnlearnNudgeFactor { get; private set; } - } - public SearchDictionarySettings SearchDictionary { get; private set; } + public UnbleeperSettings Unbleeper { get; private set; } } + + public struct UnbleeperSettings + { + public string BleepedSwearsRegex { get; private set; } + } + + public struct SearchDictionarySettings + { + public string DictionaryPath { get; private set; } + + public double LearnNudgeFactor { get; private set; } + public double LearnInitialRating { get; private set; } + public int MinUnlearnNudge { get; private set; } + public double UnlearnNudgeFactor { get; private set; } + } + + public struct ProxySettings + { + public string Url { get; private set; } + public int Port { get; private set; } + public string Login { get; private set; } + public string Password { get; private set; } + } + } diff --git a/AntiAntiSwearingBot/Language.cs b/AntiAntiSwearingBot/Language.cs index 76e6b59..b4cf7fe 100644 --- a/AntiAntiSwearingBot/Language.cs +++ b/AntiAntiSwearingBot/Language.cs @@ -2,11 +2,14 @@ using System.Collections.Generic; using System.Linq; using System.Text; +using System.Text.RegularExpressions; namespace AntiAntiSwearingBot { - static class Language + public static class Language { + static int min(int a, int b, int c) { return Math.Min(Math.Min(a, b), c); } + public static int HammingDistance(string a, string b) { if (string.IsNullOrEmpty(a)) @@ -26,8 +29,6 @@ namespace AntiAntiSwearingBot return leftover + dist; } - static int min(int a, int b, int c) { return Math.Min(Math.Min(a, b), c); } - public static int LevenshteinDistance(string a, string b) { int[] prevRow = new int[b.Length + 1]; @@ -55,46 +56,18 @@ namespace AntiAntiSwearingBot public static bool CharMatch(char a, char b) => a == b || !char.IsLetterOrDigit(a) || !char.IsLetterOrDigit(b); - /// - /// Compute the distance between two strings. - /// - public static int Compute(string s, string t) - { - int n = s.Length; - int m = t.Length; - int[,] d = new int[n + 1, m + 1]; + static readonly Regex MentionRegex = new Regex("^@[a-zA-Z0-9_]+$", RegexOptions.Compiled); - if (n == 0) - return m; - if (m == 0) - return n; + static readonly Regex HashTagRegex = new Regex("^#\\w+$", RegexOptions.Compiled); - // Step 2 - for (int i = 0; i <= n; d[i, 0] = i++) - { - } + public static bool IsTelegramMention(string word) => MentionRegex.IsMatch(word); - for (int j = 0; j <= m; d[0, j] = j++) - { - } + public static bool IsHashTag(string word) => HashTagRegex.IsMatch(word); + + public static bool HasNonWordChars(string arg) => arg.Any(c => !char.IsLetterOrDigit(c)); + + public static bool HasWordChars(string arg) => arg.Any(char.IsLetter); - // Step 3 - for (int i = 1; i <= n; i++) - { - //Step 4 - for (int j = 1; j <= m; j++) - { - // Step 5 - int cost = (t[j - 1] == s[i - 1]) ? 0 : 1; - // Step 6 - d[i, j] = Math.Min( - Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), - d[i - 1, j - 1] + cost); - } - } - // Step 7 - return d[n, m]; - } } } diff --git a/AntiAntiSwearingBot/Program.cs b/AntiAntiSwearingBot/Program.cs index 622d6d0..dc86087 100644 --- a/AntiAntiSwearingBot/Program.cs +++ b/AntiAntiSwearingBot/Program.cs @@ -26,6 +26,7 @@ namespace AntiAntiSwearingBot Console.ReadKey(); Console.WriteLine("Waiting for exit..."); bot.Stop().Wait(); + dict.Save(); return (int)ExitCode.Ok; } catch (Exception ex) diff --git a/AntiAntiSwearingBot/SearchDictionary.cs b/AntiAntiSwearingBot/SearchDictionary.cs index 76eaca8..b4be9a1 100644 --- a/AntiAntiSwearingBot/SearchDictionary.cs +++ b/AntiAntiSwearingBot/SearchDictionary.cs @@ -11,6 +11,8 @@ namespace AntiAntiSwearingBot { var s = cfg.SearchDictionary; path = s.DictionaryPath; + tmppath = path + ".tmp"; + learnInitialRating = Math.Clamp(s.LearnInitialRating, 0,1); learnNudgeFactor = Math.Clamp(s.LearnNudgeFactor, 0, 1); unlearnNudgeFactor = Math.Clamp(s.UnlearnNudgeFactor, 0, 1); @@ -21,8 +23,12 @@ namespace AntiAntiSwearingBot public void Save() { - File.WriteAllLines(path + ".tmp", words); - File.Move(path + ".tmp", path); + if (File.Exists(tmppath)) + File.Delete(tmppath); + File.WriteAllLines(tmppath, words); + if (File.Exists(path)) + File.Delete(path); + File.Move(tmppath, path); } public struct WordMatch @@ -93,7 +99,7 @@ namespace AntiAntiSwearingBot #region service - string path; + readonly string path, tmppath; double learnInitialRating = 0.75; double learnNudgeFactor = 0.5; diff --git a/AntiAntiSwearingBot/Unbleeper.cs b/AntiAntiSwearingBot/Unbleeper.cs new file mode 100644 index 0000000..239e53e --- /dev/null +++ b/AntiAntiSwearingBot/Unbleeper.cs @@ -0,0 +1,59 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; + +namespace AntiAntiSwearingBot +{ + public class Unbleeper + { + SearchDictionary Dict { get; } + UnbleeperSettings Cfg { get; } + + public Unbleeper(SearchDictionary dict, UnbleeperSettings cfg) + { + Dict = dict; + Cfg = cfg; + BleepedSwearsRegex = new Regex("^" + Cfg.BleepedSwearsRegex + "$", RegexOptions.Compiled); + } + + Regex BleepedSwearsRegex { get; } + + public string UnbleepSwears(string text) + { + if (string.IsNullOrWhiteSpace(text)) + return null; + + text = text.Trim(); + + if (text.StartsWith('/')) // is chat command + return null; + + var words = text.Split(new char[0], StringSplitOptions.RemoveEmptyEntries); + var candidates = words + .Where(w => + !Language.IsTelegramMention(w) + && Language.HasNonWordChars(w) + && !Language.IsHashTag(w) + && (Language.HasWordChars(w) || w.Length > 5) + && w.Length > 2 + && BleepedSwearsRegex.IsMatch(w) + ) + .ToArray(); + + if (candidates.Any()) + { + var response = new StringBuilder(); + for (int i = 0; i < candidates.Length; ++i) + { + var m = Dict.Match(candidates[i]); + response.AppendLine(new string('*', i + 1) + m.Word + new string('?', m.Distance)); + } + return response.ToString(); + } + else + return null; + } + } +} diff --git a/AntiAntiSwearingBot/aasb.cfg.json b/AntiAntiSwearingBot/aasb.cfg.json index 4898bb5..d07d182 100644 --- a/AntiAntiSwearingBot/aasb.cfg.json +++ b/AntiAntiSwearingBot/aasb.cfg.json @@ -1,5 +1,7 @@ { - "BleepedSwearsRegex": "[а-яА-Я@\\*#]+", + "Unbleeper": { + "BleepedSwearsRegex": "[а-яА-Я@\\*#]+" + }, "SearchDictionary": { "DictionaryPath": "dict/ObsceneDictionaryRu.txt", "LearnNudgeFactor": 0.5, diff --git a/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt b/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt index 0847a2f..ff62eac 100644 --- a/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt +++ b/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt @@ -1,4 +1,3 @@ -еб бля хуй блядь