import re
from lexicons import (
    SWEAR_WORDS,
    HATE_KEYWORDS,
    HATE_HASHTAGS,
    HATE_SHORT_FORMS,
    HATE_EMOJIS
)

# Load offensive phrases from file
with open("en.txt", encoding="utf-8") as f:
    OFFENSIVE_PHRASES = set(line.strip().lower() for line in f if line.strip())

# -------------------- Preprocessing Utilities --------------------

def normalize_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[.]{2,}", " ", text)  # Normalize "..."
    return text

def clean_and_tokenize(text: str):
    text = normalize_text(text)
    return set(re.findall(r"\b\w+\b", text))

# -------------------- DEBUG Rule-Based Filter --------------------

def debug_rule_based_check(text: str) -> bool:
    text_norm = normalize_text(text)
    tokens = clean_and_tokenize(text_norm)

    if tokens & SWEAR_WORDS:
        print(f"🔍 Matched SWEAR_WORDS: {tokens & SWEAR_WORDS}")
        return True

    if tokens & HATE_KEYWORDS:
        print(f"🔍 Matched HATE_KEYWORDS: {tokens & HATE_KEYWORDS}")
        return True

    if tokens & HATE_SHORT_FORMS:
        print(f"🔍 Matched HATE_SHORT_FORMS: {tokens & HATE_SHORT_FORMS}")
        return True

    for phrase in OFFENSIVE_PHRASES:
        if phrase in text_norm:
            print(f"🔍 Matched OFFENSIVE_PHRASE: '{phrase}'")
            return True

    for tag in HATE_HASHTAGS:
        if tag in text_norm:
            print(f"🔍 Matched HATE_HASHTAG: {tag}")
            return True

    for emoji in HATE_EMOJIS:
        if emoji in text:
            print(f"🔍 Matched HATE_EMOJI: {emoji}")
            return True

    print("✅ No match found.")
    return False