You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
73 lines
2.9 KiB
73 lines
2.9 KiB
# -*- encoding: utf-8 -*-
|
|
|
|
version = "0.0.1"
|
|
version_info = (0,0,1)
|
|
"""
|
|
Модуль для поиска нецензурных слов (мата) в тексте
|
|
|
|
Лицензия: LGPL (http://www.opensource.org/licenses/lgpl-2.1.php)
|
|
|
|
Пример:
|
|
|
|
from matfilter import matfilter
|
|
some_data = "любой текст для проверки"
|
|
if len(matfilter(some_data)):
|
|
print "Пожалуйста, уберите из текста нецензурные выражения."
|
|
|
|
Источник:
|
|
https://bitbucket.org/spanasik/django-matfilter
|
|
"""
|
|
import re
|
|
|
|
PATTERNS = (ur"(\b[сs]{1}[сsц]{0,1}[uуy](?:[ч4]{0,1}[иаakк][^ц])\w*\b)",
|
|
ur"(\b(?!пло|стра|[тл]и)(\w(?!(у|пло)))*[хx][уy](й|йа|[еeё]|и|я|ли|ю)(?!га)\w*\b)",
|
|
ur"(\b(п[oо]|[нз][аa])*[хx][eе][рp]\w*\b)",
|
|
ur"(\b[мm][уy][дd]([аa][кk]|[oо]|и)\w*\b)",
|
|
ur"(\b\w*д[рp](?:[oо][ч4]|[аa][ч4])(?!л)\w*\b)",
|
|
ur"(\b(?!(?:кило)?[тм]ет)(?!смо)[а-яa-z]*(?<!с)т[рp][аa][хx]\w*\b)",
|
|
ur"(\b[к|k][аaoо][з3z]+[eе]?ё?л\w*\b)",
|
|
ur"(\b(?!со)\w*п[еeё]р[нд](и|иc|ы|у|н|е|ы)\w*\b)",
|
|
ur"(\b\w*[бп][ссз]д\w+\b)",
|
|
ur"(\b([нnп][аa]?[оo]?[xх])\b)",
|
|
ur"(\b([аa]?[оo]?[нnпбз][аa]?[оo]?)?([cс][pр][аa][^зжбсвм])\w*\b)",
|
|
ur"(\b\w*([оo]т|вы|[рp]и|[оo]|и|[уy]){0,1}([пnрp][iиеeё]{0,1}[3zзсcs][дd])\w*\b)",
|
|
ur"(\b(вы)?у?[еeё]?би?ля[дт]?[юоo]?\w*\b)",
|
|
ur"(\b(?!вело|ски|эн)\w*[пpp][eеиi][дd][oaоаеeирp](?![цянгюсмйчв])[рp]?(?![лт])\w*\b)",
|
|
ur"(\b(?!в?[ст]{1,2}еб)(?:(?:в?[сcз3о][тяaа]?[ьъ]?|вы|п[рp][иоo]|[уy]|р[aа][з3z][ьъ]?|к[оo]н[оo])?[её]б[а-яa-z]*)|(?:[а-яa-z]*[^хлрдв][еeё]б)\b)",
|
|
ur"(\b[з3z][аaоo]л[уy]п[аaeеин]\w*\b)",)
|
|
|
|
|
|
def CheckMatches(matches):
|
|
if len(matches):
|
|
result = []
|
|
for match in matches:
|
|
if type(match) == tuple:
|
|
result.append(match[0].strip())
|
|
else:
|
|
result.append(match.strip())
|
|
return result
|
|
return ()
|
|
|
|
|
|
def matfilter(text, npattern=None):
|
|
"""Находит в тексте мат.
|
|
|
|
Возвращает список найденных слов"""
|
|
text = text.replace("\r\n", " ")
|
|
text = text.replace("\n", " ")
|
|
|
|
if npattern is not None:
|
|
result = CheckMatches(re.findall(
|
|
PATTERNS[npattern], text,
|
|
re.IGNORECASE | re.VERBOSE | re.UNICODE | re.DOTALL))
|
|
if len(result):
|
|
return result
|
|
else:
|
|
for pattern in PATTERNS:
|
|
result = CheckMatches(re.findall(
|
|
pattern, text,
|
|
re.IGNORECASE | re.VERBOSE | re.UNICODE | re.DOTALL))
|
|
if len(result):
|
|
return result
|
|
|
|
return ()
|
|
|