File size: 1,661 Bytes
75276f3
dc3ed32
 
 
 
 
 
 
75276f3
167edec
03ad36d
 
 
dc3ed32
 
 
 
167edec
dc3ed32
 
 
 
167edec
dc3ed32
 
 
 
 
167edec
dc3ed32
 
 
 
 
 
 
 
 
 
 
 
 
167edec
 
 
 
dc3ed32
 
 
 
 
 
 
 
 
03ad36d
 
dc3ed32
167edec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
from lexicons import (
    SWEAR_WORDS,
    HATE_KEYWORDS,
    HATE_HASHTAGS,
    HATE_SHORT_FORMS,
    HATE_EMOJIS
)

# Build the offensive-phrase lookup from en.txt: one entry per non-blank line,
# with surrounding whitespace stripped and the text lowercased.
with open("en.txt", encoding="utf-8") as phrase_file:
    OFFENSIVE_PHRASES = {
        entry.lower() for entry in map(str.strip, phrase_file) if entry
    }

# -------------------- Preprocessing Utilities --------------------

def normalize_text(text: str) -> str:
    """Lowercase *text* and replace each run of two or more periods with a space.

    Args:
        text: The raw input string.

    Returns:
        The lowercased string in which every sequence of at least two
        consecutive "." characters has been replaced by a single space.
        Single periods are left in place.
    """
    return re.sub(r"[.]{2,}", " ", text.lower())

def clean_and_tokenize(text: str):
    """Return the distinct word tokens of *text* after normalization.

    The text is first passed through normalize_text, then every maximal run
    of word characters is extracted. Duplicates are dropped and order is not
    preserved, because the result is a set.
    """
    word_runs = re.findall(r"\b\w+\b", normalize_text(text))
    return set(word_runs)

# -------------------- DEBUG Rule-Based Filter --------------------

def debug_rule_based_check(text: str) -> bool:
    """Report whether *text* trips any rule-based lexicon, printing the first hit.

    Checks run in a fixed order and stop at the first match:

    1. Token overlap with SWEAR_WORDS, HATE_KEYWORDS, then HATE_SHORT_FORMS.
    2. Substring search for each entry of OFFENSIVE_PHRASES, then of
       HATE_HASHTAGS, in the normalized text.
    3. Substring search for each entry of HATE_EMOJIS in the raw text.

    Steps 2 and 3 use plain substring containment, so an entry also matches
    when it appears inside a longer word.

    Args:
        text: The input string to inspect.

    Returns:
        True if any check matched, False otherwise. A one-line diagnostic is
        printed in either case.
    """
    normalized = normalize_text(text)
    words = clean_and_tokenize(normalized)

    # Token-level lexicons: intersect once and reuse the result for the message.
    swear_hits = words & SWEAR_WORDS
    if swear_hits:
        print(f"πŸ” Matched SWEAR_WORDS: {swear_hits}")
        return True

    keyword_hits = words & HATE_KEYWORDS
    if keyword_hits:
        print(f"πŸ” Matched HATE_KEYWORDS: {keyword_hits}")
        return True

    short_form_hits = words & HATE_SHORT_FORMS
    if short_form_hits:
        print(f"πŸ” Matched HATE_SHORT_FORMS: {short_form_hits}")
        return True

    # Substring lexicons: take the first entry (in iteration order) that occurs.
    # Compare against None rather than truthiness so an empty-string entry,
    # which is contained in every string, still counts as a match.
    phrase_hit = next((p for p in OFFENSIVE_PHRASES if p in normalized), None)
    if phrase_hit is not None:
        print(f"πŸ” Matched OFFENSIVE_PHRASE: '{phrase_hit}'")
        return True

    tag_hit = next((t for t in HATE_HASHTAGS if t in normalized), None)
    if tag_hit is not None:
        print(f"πŸ” Matched HATE_HASHTAG: {tag_hit}")
        return True

    # Emojis are looked up in the raw input, not the normalized text.
    emoji_hit = next((e for e in HATE_EMOJIS if e in text), None)
    if emoji_hit is not None:
        print(f"πŸ” Matched HATE_EMOJI: {emoji_hit}")
        return True

    print("βœ… No match found.")
    return False