Spaces:

medoxz543
/

hate-endpoint

Sleeping

App Files Files Community

medoxz543 committed on Jun 2, 2025

Commit

167edec

verified ·

1 Parent(s): ec447c9

Update rule_based_filter.py

Browse files

Files changed (1) hide show

rule_based_filter.py +9 -37

rule_based_filter.py CHANGED Viewed

@@ -7,46 +7,26 @@ from lexicons import (
     HATE_EMOJIS
 )
-# Load and preprocess phrases from file
 with open("en.txt", encoding="utf-8") as f:
     OFFENSIVE_PHRASES = set(line.strip().lower() for line in f if line.strip())
-# Tokenized version of the phrases for n-gram match
-TOKENIZED_OFFENSIVE_PHRASES = set(
-    ' '.join(re.findall(r"\b\w+\b", phrase)) for phrase in OFFENSIVE_PHRASES
-)
 # -------------------- Preprocessing Utilities --------------------
 def normalize_text(text: str) -> str:
     text = text.lower()
-    text = re.sub(r"[.]{2,}", " ", text)              # Normalize "..."
     return text
 def clean_and_tokenize(text: str):
     text = normalize_text(text)
-    return re.findall(r"\b\w+\b", text)
-def generate_ngrams(tokens, n):
-    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
-def phrase_match(text: str, phrases: set) -> str:
-    tokens = clean_and_tokenize(text)
-    max_n = max(len(p.split()) for p in phrases)
-    for n in range(1, max_n + 1):
-        ngrams = generate_ngrams(tokens, n)
-        for ngram in ngrams:
-            for phrase in phrases:
-                if phrase in ngram:
-                    return phrase  # return the triggering phrase
-    return ""
 # -------------------- DEBUG Rule-Based Filter --------------------
 def debug_rule_based_check(text: str) -> bool:
     text_norm = normalize_text(text)
-    tokens = set(clean_and_tokenize(text_norm))
     if tokens & SWEAR_WORDS:
         print(f"🔍 Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
@@ -60,10 +40,10 @@ def debug_rule_based_check(text: str) -> bool:
         print(f"🔍 Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
         return True
-    phrase = phrase_match(text_norm, TOKENIZED_OFFENSIVE_PHRASES)
-    if phrase:
-        print(f"🔍 Matched OFFENSIVE_PHRASE: '{phrase}'")
-        return True
     for tag in HATE_HASHTAGS:
         if tag in text_norm:
@@ -76,12 +56,4 @@ def debug_rule_based_check(text: str) -> bool:
             return True
     print("✅ No match found.")
-    return False
-# # -------------------- Optional: Test Suite --------------------
-# if __name__ == "__main__":
-#     for text in test_cases_1:
-#         print(f"\n🧾 Text: {text}")
-#         result = debug_rule_based_check(text)
-#         print(f"Result: {'🔥 Flagged' if result else '✅ Safe'}")

     HATE_EMOJIS
 )
+# Load offensive phrases from file
 with open("en.txt", encoding="utf-8") as f:
     OFFENSIVE_PHRASES = set(line.strip().lower() for line in f if line.strip())
 # -------------------- Preprocessing Utilities --------------------
 def normalize_text(text: str) -> str:
     text = text.lower()
+    text = re.sub(r"[.]{2,}", " ", text)  # Normalize "..."
     return text
 def clean_and_tokenize(text: str):
     text = normalize_text(text)
+    return set(re.findall(r"\b\w+\b", text))
 # -------------------- DEBUG Rule-Based Filter --------------------
 def debug_rule_based_check(text: str) -> bool:
     text_norm = normalize_text(text)
+    tokens = clean_and_tokenize(text_norm)
     if tokens & SWEAR_WORDS:
         print(f"🔍 Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
         print(f"🔍 Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
         return True
+    for phrase in OFFENSIVE_PHRASES:
+        if phrase in text_norm:
+            print(f"🔍 Matched OFFENSIVE_PHRASE: '{phrase}'")
+            return True
     for tag in HATE_HASHTAGS:
         if tag in text_norm:
             return True
     print("✅ No match found.")
+    return False