Spaces:
Sleeping
Sleeping
Update rule_based_filter.py
Browse files- rule_based_filter.py +9 -37
rule_based_filter.py
CHANGED
|
@@ -7,46 +7,26 @@ from lexicons import (
|
|
| 7 |
HATE_EMOJIS
|
| 8 |
)
|
| 9 |
|
| 10 |
-
# Load
|
| 11 |
with open("en.txt", encoding="utf-8") as f:
|
| 12 |
OFFENSIVE_PHRASES = set(line.strip().lower() for line in f if line.strip())
|
| 13 |
|
| 14 |
-
# Tokenized version of the phrases for n-gram match
|
| 15 |
-
TOKENIZED_OFFENSIVE_PHRASES = set(
|
| 16 |
-
' '.join(re.findall(r"\b\w+\b", phrase)) for phrase in OFFENSIVE_PHRASES
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
# -------------------- Preprocessing Utilities --------------------
|
| 20 |
|
| 21 |
def normalize_text(text: str) -> str:
|
| 22 |
text = text.lower()
|
| 23 |
-
text = re.sub(r"[.]{2,}", " ", text)
|
| 24 |
return text
|
| 25 |
|
| 26 |
def clean_and_tokenize(text: str):
|
| 27 |
text = normalize_text(text)
|
| 28 |
-
return re.findall(r"\b\w+\b", text)
|
| 29 |
-
|
| 30 |
-
def generate_ngrams(tokens, n):
|
| 31 |
-
return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
|
| 32 |
-
|
| 33 |
-
def phrase_match(text: str, phrases: set) -> str:
|
| 34 |
-
tokens = clean_and_tokenize(text)
|
| 35 |
-
max_n = max(len(p.split()) for p in phrases)
|
| 36 |
-
|
| 37 |
-
for n in range(1, max_n + 1):
|
| 38 |
-
ngrams = generate_ngrams(tokens, n)
|
| 39 |
-
for ngram in ngrams:
|
| 40 |
-
for phrase in phrases:
|
| 41 |
-
if phrase in ngram:
|
| 42 |
-
return phrase # return the triggering phrase
|
| 43 |
-
return ""
|
| 44 |
|
| 45 |
# -------------------- DEBUG Rule-Based Filter --------------------
|
| 46 |
|
| 47 |
def debug_rule_based_check(text: str) -> bool:
|
| 48 |
text_norm = normalize_text(text)
|
| 49 |
-
tokens =
|
| 50 |
|
| 51 |
if tokens & SWEAR_WORDS:
|
| 52 |
print(f"🔍 Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
|
|
@@ -60,10 +40,10 @@ def debug_rule_based_check(text: str) -> bool:
|
|
| 60 |
print(f"🔍 Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
|
| 61 |
return True
|
| 62 |
|
| 63 |
-
phrase
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
|
| 68 |
for tag in HATE_HASHTAGS:
|
| 69 |
if tag in text_norm:
|
|
@@ -76,12 +56,4 @@ def debug_rule_based_check(text: str) -> bool:
|
|
| 76 |
return True
|
| 77 |
|
| 78 |
print("✅ No match found.")
|
| 79 |
-
return False
|
| 80 |
-
|
| 81 |
-
# # -------------------- Optional: Test Suite --------------------
|
| 82 |
-
|
| 83 |
-
# if __name__ == "__main__":
|
| 84 |
-
# for text in test_cases_1:
|
| 85 |
-
# print(f"\n🧾 Text: {text}")
|
| 86 |
-
# result = debug_rule_based_check(text)
|
| 87 |
-
# print(f"Result: {'🔥 Flagged' if result else '✅ Safe'}")
|
|
|
|
| 7 |
HATE_EMOJIS
|
| 8 |
)
|
| 9 |
|
| 10 |
+
# Load offensive phrases from file
|
| 11 |
with open("en.txt", encoding="utf-8") as f:
|
| 12 |
OFFENSIVE_PHRASES = set(line.strip().lower() for line in f if line.strip())
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# -------------------- Preprocessing Utilities --------------------
|
| 15 |
|
| 16 |
def normalize_text(text: str) -> str:
|
| 17 |
text = text.lower()
|
| 18 |
+
text = re.sub(r"[.]{2,}", " ", text) # Normalize "..."
|
| 19 |
return text
|
| 20 |
|
| 21 |
def clean_and_tokenize(text: str):
|
| 22 |
text = normalize_text(text)
|
| 23 |
+
return set(re.findall(r"\b\w+\b", text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# -------------------- DEBUG Rule-Based Filter --------------------
|
| 26 |
|
| 27 |
def debug_rule_based_check(text: str) -> bool:
|
| 28 |
text_norm = normalize_text(text)
|
| 29 |
+
tokens = clean_and_tokenize(text_norm)
|
| 30 |
|
| 31 |
if tokens & SWEAR_WORDS:
|
| 32 |
print(f"🔍 Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
|
|
|
|
| 40 |
print(f"🔍 Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
|
| 41 |
return True
|
| 42 |
|
| 43 |
+
for phrase in OFFENSIVE_PHRASES:
|
| 44 |
+
if phrase in text_norm:
|
| 45 |
+
print(f"🔍 Matched OFFENSIVE_PHRASE: '{phrase}'")
|
| 46 |
+
return True
|
| 47 |
|
| 48 |
for tag in HATE_HASHTAGS:
|
| 49 |
if tag in text_norm:
|
|
|
|
| 56 |
return True
|
| 57 |
|
| 58 |
print("✅ No match found.")
|
| 59 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|