medoxz543 committed on
Commit
167edec
·
verified ·
1 Parent(s): ec447c9

Update rule_based_filter.py

Browse files
Files changed (1) hide show
  1. rule_based_filter.py +9 -37
rule_based_filter.py CHANGED
@@ -7,46 +7,26 @@ from lexicons import (
7
  HATE_EMOJIS
8
  )
9
 
10
# Load the offensive-phrase lexicon: one phrase per line, lowercased,
# with blank lines skipped.
with open("en.txt", encoding="utf-8") as f:
    OFFENSIVE_PHRASES = {ln.strip().lower() for ln in f if ln.strip()}
13
 
14
# Canonical, whitespace-joined token form of each phrase, so n-grams built
# from a tokenized message can be compared against them directly.
TOKENIZED_OFFENSIVE_PHRASES = {
    " ".join(re.findall(r"\b\w+\b", phrase))
    for phrase in OFFENSIVE_PHRASES
}
18
-
19
  # -------------------- Preprocessing Utilities --------------------
20
 
21
def normalize_text(text: str) -> str:
    """Return *text* lowercased, with any run of 2+ dots ("..", "...")
    collapsed into a single space."""
    collapsed = re.sub(r"[.]{2,}", " ", text.lower())
    return collapsed
25
 
26
def clean_and_tokenize(text: str):
    """Normalize *text* (lowercase, collapse dot runs to spaces) and return
    its word tokens as an ordered list."""
    # Inlined normalize_text: lowercase first, then squash ellipsis runs.
    prepared = re.sub(r"[.]{2,}", " ", text.lower())
    return re.findall(r"\b\w+\b", prepared)
29
-
30
def generate_ngrams(tokens, n):
    """Return every contiguous n-gram of *tokens* as a space-joined string,
    in left-to-right order ([] when fewer than n tokens exist)."""
    grams = []
    for start in range(len(tokens) - n + 1):
        grams.append(" ".join(tokens[start:start + n]))
    return grams
32
-
33
def phrase_match(text: str, phrases: set) -> str:
    """Return the first phrase from *phrases* that occurs as a contiguous
    token run in *text*, or "" when nothing matches.

    *phrases* must contain whitespace-joined token strings (as built for
    TOKENIZED_OFFENSIVE_PHRASES), so each candidate n-gram can be checked
    with an exact set lookup.
    """
    # Guard: max() over an empty iterable raises ValueError.
    if not phrases:
        return ""

    tokens = clean_and_tokenize(text)
    max_n = max(len(p.split()) for p in phrases)

    for n in range(1, max_n + 1):
        for ngram in generate_ngrams(tokens, n):
            # Exact O(1) membership test. The previous `phrase in ngram`
            # substring check produced false positives whenever a short
            # phrase was embedded inside an innocent longer word, and
            # scanned every phrase per n-gram.
            if ngram in phrases:
                return ngram  # the triggering phrase
    return ""
44
 
45
  # -------------------- DEBUG Rule-Based Filter --------------------
46
 
47
  def debug_rule_based_check(text: str) -> bool:
48
  text_norm = normalize_text(text)
49
- tokens = set(clean_and_tokenize(text_norm))
50
 
51
  if tokens & SWEAR_WORDS:
52
  print(f"πŸ” Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
@@ -60,10 +40,10 @@ def debug_rule_based_check(text: str) -> bool:
60
  print(f"πŸ” Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
61
  return True
62
 
63
- phrase = phrase_match(text_norm, TOKENIZED_OFFENSIVE_PHRASES)
64
- if phrase:
65
- print(f"πŸ” Matched OFFENSIVE_PHRASE: '{phrase}'")
66
- return True
67
 
68
  for tag in HATE_HASHTAGS:
69
  if tag in text_norm:
@@ -76,12 +56,4 @@ def debug_rule_based_check(text: str) -> bool:
76
  return True
77
 
78
  print("βœ… No match found.")
79
- return False
80
-
81
- # # -------------------- Optional: Test Suite --------------------
82
-
83
- # if __name__ == "__main__":
84
- # for text in test_cases_1:
85
- # print(f"\n🧾 Text: {text}")
86
- # result = debug_rule_based_check(text)
87
- # print(f"Result: {'πŸ”₯ Flagged' if result else 'βœ… Safe'}")
 
7
  HATE_EMOJIS
8
  )
9
 
10
# Load offensive phrases from file: one phrase per line, lowercased,
# with blank lines skipped.
with open("en.txt", encoding="utf-8") as f:
    OFFENSIVE_PHRASES = {ln.strip().lower() for ln in f if ln.strip()}
13
 
 
 
 
 
 
14
  # -------------------- Preprocessing Utilities --------------------
15
 
16
def normalize_text(text: str) -> str:
    """Lowercase *text* and replace each run of 2+ dots with one space."""
    lowered = text.lower()
    return re.sub(r"[.]{2,}", " ", lowered)
20
 
21
def clean_and_tokenize(text: str):
    """Normalize *text* (lowercase, collapse dot runs to spaces) and return
    the set of distinct word tokens."""
    # Inlined normalize_text: lowercase first, then squash ellipsis runs.
    prepared = re.sub(r"[.]{2,}", " ", text.lower())
    return {tok for tok in re.findall(r"\b\w+\b", prepared)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # -------------------- DEBUG Rule-Based Filter --------------------
26
 
27
  def debug_rule_based_check(text: str) -> bool:
28
  text_norm = normalize_text(text)
29
+ tokens = clean_and_tokenize(text_norm)
30
 
31
  if tokens & SWEAR_WORDS:
32
  print(f"πŸ” Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
 
40
  print(f"πŸ” Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
41
  return True
42
 
43
+ for phrase in OFFENSIVE_PHRASES:
44
+ if phrase in text_norm:
45
+ print(f"πŸ” Matched OFFENSIVE_PHRASE: '{phrase}'")
46
+ return True
47
 
48
  for tag in HATE_HASHTAGS:
49
  if tag in text_norm:
 
56
  return True
57
 
58
  print("βœ… No match found.")
59
+ return False