# Rule-based offensive-content filter: lexicon, phrase, hashtag and emoji checks.
| import re | |
| from lexicons import ( | |
| SWEAR_WORDS, | |
| HATE_KEYWORDS, | |
| HATE_HASHTAGS, | |
| HATE_SHORT_FORMS, | |
| HATE_EMOJIS | |
| ) | |
# Load the offensive-phrase lexicon once at import time.
# One phrase per line in en.txt; blank lines are skipped.
with open("en.txt", encoding="utf-8") as phrase_file:
    OFFENSIVE_PHRASES = {
        line.strip().lower() for line in phrase_file if line.strip()
    }
| # -------------------- Preprocessing Utilities -------------------- | |
def normalize_text(text: str) -> str:
    """Lowercase *text* and collapse every run of two or more dots into a single space."""
    lowered = text.lower()
    # "..." (ellipses and longer runs) become a word separator; single dots are kept.
    return re.sub(r"\.{2,}", " ", lowered)
def clean_and_tokenize(text: str):
    """Normalize *text* and return the set of word tokens it contains."""
    normalized = normalize_text(text)
    words = re.findall(r"\b\w+\b", normalized)
    return set(words)
| # -------------------- DEBUG Rule-Based Filter -------------------- | |
def debug_rule_based_check(text: str) -> bool:
    """Run every rule-based offensive-content check over *text*, printing
    which rule matched (debug aid).

    Checks, in order: swear-word tokens, hate-keyword tokens, hate short
    forms, offensive phrases (substring), hate hashtags (substring), and
    hate emojis (substring on the raw text). Returns True on the first
    match, False if nothing fires.
    """
    text_norm = normalize_text(text)
    # clean_and_tokenize normalizes internally, so pass the raw text once
    # instead of normalizing twice (the original normalized, then tokenized
    # the already-normalized string — redundant work, same result).
    tokens = clean_and_tokenize(text)

    # Token-level rules: one set intersection per lexicon, computed once
    # (the original evaluated each intersection twice — test + message).
    token_rules = (
        ("SWEAR_WORDS", SWEAR_WORDS),
        ("HATE_KEYWORDS", HATE_KEYWORDS),
        ("HATE_SHORT_FORMS", HATE_SHORT_FORMS),
    )
    for label, lexicon in token_rules:
        hits = tokens & lexicon
        if hits:
            print(f"π Matched {label}: {hits}")
            return True

    # Substring rules on the normalized text.
    # NOTE(review): plain `in` can fire inside longer words (e.g. a phrase
    # embedded mid-token) — confirm this is intended.
    for phrase in OFFENSIVE_PHRASES:
        if phrase in text_norm:
            print(f"π Matched OFFENSIVE_PHRASE: '{phrase}'")
            return True
    for tag in HATE_HASHTAGS:
        if tag in text_norm:
            print(f"π Matched HATE_HASHTAG: {tag}")
            return True

    # Emojis are matched against the raw text, as in the original
    # (normalization only affects case and dot runs, so either works).
    for emoji in HATE_EMOJIS:
        if emoji in text:
            print(f"π Matched HATE_EMOJI: {emoji}")
            return True

    print("β No match found.")
    return False