From ed5be399da157abc50b37d33189fff9cf91b79c3 Mon Sep 17 00:00:00 2001 From: jetsparrow Date: Mon, 19 Aug 2019 21:37:34 +0300 Subject: [PATCH] separate words from punctuation --- AntiAntiSwearingBot.Tests/DetectTests.cs | 40 +++++++++++++++++++ AntiAntiSwearingBot.Tests/FilterTests.cs | 34 ++++++++++------ AntiAntiSwearingBot/Unbleeper.cs | 4 +- .../dict/ObsceneDictionaryRu.txt | 3 ++ 4 files changed, 67 insertions(+), 14 deletions(-) create mode 100644 AntiAntiSwearingBot.Tests/DetectTests.cs diff --git a/AntiAntiSwearingBot.Tests/DetectTests.cs b/AntiAntiSwearingBot.Tests/DetectTests.cs new file mode 100644 index 0000000..effa499 --- /dev/null +++ b/AntiAntiSwearingBot.Tests/DetectTests.cs @@ -0,0 +1,40 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Xunit; + +namespace AntiAntiSwearingBot.Tests +{ + public class DetectTests + { + Unbleeper ubl { get; } + Config cfg { get; } + SearchDictionary dict { get; } + + public DetectTests() + { + cfg = Config.Load("aasb.cfg.json"); + dict = new SearchDictionary(cfg); + ubl = new Unbleeper(dict, cfg.Unbleeper); + } + + [Theory] + [InlineData("бл**ь", "*блядь")] + [InlineData("ж**а", "*жопа")] + public void UnbleepSimpleSwears(string word, string expected) + { + var unbleep = ubl.UnbleepSwears(word).TrimEnd(Environment.NewLine.ToCharArray()); + Assert.Equal(expected, unbleep); + } + + [Theory] + [InlineData("Просто пи**ец, как хочется кушать.", "*пиздец")] + [InlineData("Ужас на*уй!", "*нахуй")] + [InlineData("Сергей опять вы**нулся своим знанием тонкостей русского языка; в окно еб*шил стылый ноябрьский ветер. ", "*выебнулся\n**ебашил")] + public void DetectWordsWithPunctuation(string text, string expected) + { + var unbleep = ubl.UnbleepSwears(text).Replace("\r\n", "\n").Trim(); + Assert.Equal(expected, unbleep); + } + } +} diff --git a/AntiAntiSwearingBot.Tests/FilterTests.cs b/AntiAntiSwearingBot.Tests/FilterTests.cs index cb95e09..11ef2a6 100644 --- a/AntiAntiSwearingBot.Tests/FilterTests.cs +++ b/AntiAntiSwearingBot.Tests/FilterTests.cs @@ -6,27 +6,35 @@ namespace AntiAntiSwearingBot.Tests public class FilterTests { Unbleeper ubl { get; } + Config cfg { get; } + SearchDictionary dict { get; } + public FilterTests() { - var cfg = Config.Load("aasb.cfg.json", "aasb.cfg.secret.json"); - var dict = new SearchDictionary(cfg); + cfg = Config.Load("aasb.cfg.json", "aasb.cfg.secret.json"); + dict = new SearchDictionary(cfg); ubl = new Unbleeper(dict, cfg.Unbleeper); } - [Theory] - [InlineData("бл**ь", "*блядь")] - [InlineData("ж**а", "*жопа")] - public void UnbleepSimpleSwears(string word, string expected) - { - var unbleep = ubl.UnbleepSwears(word).TrimEnd(Environment.NewLine.ToCharArray()); - Assert.Equal(expected, unbleep); - } - [Theory] [InlineData("*")] [InlineData("**#")] - [InlineData("@**#")] - public void IgnoreShortGrawlixes(string text) => Assert.Null(ubl.UnbleepSwears(text)); + [InlineData("@*#")] + public void IgnoreShortGrawlixesWithoutLetters(string text) + { + if (text.Length < cfg.Unbleeper.MinAmbiguousWordLength) + Assert.Null(ubl.UnbleepSwears(text)); + } + + [Theory] + [InlineData("*")] + [InlineData("*б")] + [InlineData("х#")] + public void IgnoreShortWords(string text) + { + if (text.Length < cfg.Unbleeper.MinWordLength) + Assert.Null(ubl.UnbleepSwears(text)); + } [Theory] [InlineData("@pvkuznetsov https://github.com/jacksondunstan/UnityNativeScripting")] diff --git a/AntiAntiSwearingBot/Unbleeper.cs b/AntiAntiSwearingBot/Unbleeper.cs index 2ffb392..473ca9e 100644 --- a/AntiAntiSwearingBot/Unbleeper.cs +++ b/AntiAntiSwearingBot/Unbleeper.cs @@ -20,6 +20,8 @@ namespace AntiAntiSwearingBot Regex BleepedSwearsRegex { get; } + static readonly char[] WORD_SEPARATORS = { ' ', '\t', '\r', '\n', '.', ',', '!', '?', ';' }; + public string UnbleepSwears(string text) { if (string.IsNullOrWhiteSpace(text)) @@ -30,7 +32,7 @@ namespace AntiAntiSwearingBot if (text.StartsWith('/')) // is chat command return null; - var words = text.Split(new char[0], StringSplitOptions.RemoveEmptyEntries); + var words = text.Split(WORD_SEPARATORS, StringSplitOptions.RemoveEmptyEntries); var candidates = words .Where(w => !Language.IsTelegramMention(w) diff --git a/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt b/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt index ff62eac..3687723 100644 --- a/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt +++ b/AntiAntiSwearingBot/dict/ObsceneDictionaryRu.txt @@ -84,6 +84,7 @@ выебанный выебат выебаться +выебнулся высрать высраться выссать @@ -217,6 +218,7 @@ ебат ебаться ебатьс +ебашил ебитесь ебло еблом @@ -457,6 +459,7 @@ натрахаться натрахивать натрахиваться +нахуй нахуякать нахуякаться нахуякивать