Spaces:
Sleeping
Sleeping
File size: 1,661 Bytes
75276f3 dc3ed32 75276f3 167edec 03ad36d dc3ed32 167edec dc3ed32 167edec dc3ed32 167edec dc3ed32 167edec dc3ed32 03ad36d dc3ed32 167edec | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | import re
from lexicons import (
SWEAR_WORDS,
HATE_KEYWORDS,
HATE_HASHTAGS,
HATE_SHORT_FORMS,
HATE_EMOJIS
)
# Load offensive phrases from file: one phrase per line, lowercased,
# with blank lines skipped. Loaded once at import time.
# NOTE(review): "en.txt" is resolved relative to the CWD — confirm the
# working directory when this module is imported.
with open("en.txt", encoding="utf-8") as f:
    # Set comprehension instead of set(generator) — same result, clearer idiom.
    OFFENSIVE_PHRASES = {line.strip().lower() for line in f if line.strip()}
# -------------------- Preprocessing Utilities --------------------
def normalize_text(text: str) -> str:
    """Lowercase *text* and collapse runs of two or more dots into a space.

    Single dots are left alone; only ellipsis-like runs (".." , "...") are
    treated as separators.
    """
    lowered = text.lower()
    # Equivalent to the original character class [.]{2,}.
    return re.sub(r"\.{2,}", " ", lowered)
def clean_and_tokenize(text: str):
    """Return the set of word tokens from the normalized form of *text*.

    Normalization is lowercasing plus collapsing dot-runs to spaces
    (the same transform as ``normalize_text``, inlined here).
    """
    lowered = text.lower()
    cleaned = re.sub(r"\.{2,}", " ", lowered)
    words = re.findall(r"\b\w+\b", cleaned)
    return set(words)
# -------------------- DEBUG Rule-Based Filter --------------------
def debug_rule_based_check(text: str) -> bool:
    """Return True if *text* matches any rule-based offensive-content filter.

    Rules are checked in order of cost: exact-token lookups against the
    swear/keyword/short-form lexicons first, then substring scans over the
    normalized text for offensive phrases and hate hashtags, and finally
    substring scans over the *raw* text for hate emojis (normalization
    could alter emoji characters). Prints the first rule that matches,
    for debugging.
    """
    text_norm = normalize_text(text)
    # Pass the raw text: clean_and_tokenize normalizes internally, so the
    # original clean_and_tokenize(text_norm) normalized twice. The result
    # is identical (normalization is idempotent) but the extra pass is
    # wasted work.
    tokens = clean_and_tokenize(text)

    # The three token-set checks were copy-pasted; drive them from one
    # table instead, preserving each exact debug message.
    token_lexicons = (
        ("SWEAR_WORDS", SWEAR_WORDS),
        ("HATE_KEYWORDS", HATE_KEYWORDS),
        ("HATE_SHORT_FORMS", HATE_SHORT_FORMS),
    )
    for label, lexicon in token_lexicons:
        matched = tokens & lexicon
        if matched:
            print(f"π Matched {label}: {matched}")
            return True

    # Substring lexicons run against the normalized (lowercased) text.
    for phrase in OFFENSIVE_PHRASES:
        if phrase in text_norm:
            print(f"π Matched OFFENSIVE_PHRASE: '{phrase}'")
            return True
    for tag in HATE_HASHTAGS:
        if tag in text_norm:
            print(f"π Matched HATE_HASHTAG: {tag}")
            return True

    # Emojis are matched against the raw input.
    for emoji in HATE_EMOJIS:
        if emoji in text:
            print(f"π Matched HATE_EMOJI: {emoji}")
            return True

    # NOTE(review): the "π"/"β" prefixes look like mojibake of status
    # emojis (likely 🚫 / ✅) — confirm against the upstream source. The
    # original literal here was broken across two lines (a syntax error
    # as scraped); rejoined into one valid string.
    print("β No match found.")
    return False