# Rule-based offensive-content filter: lexicon, phrase, hashtag and emoji checks.
| import re | |
| from lexicons import ( | |
| SWEAR_WORDS, | |
| HATE_KEYWORDS, | |
| HATE_HASHTAGS, | |
| HATE_SHORT_FORMS, | |
| HATE_EMOJIS | |
| ) | |
# Load the offensive-phrase lexicon once at import time.
# One phrase per line in en.txt; blank lines are skipped.
with open("en.txt", encoding="utf-8") as phrase_file:
    OFFENSIVE_PHRASES = {
        line.strip().lower() for line in phrase_file if line.strip()
    }
| # -------------------- Preprocessing Utilities -------------------- | |
def normalize_text(text: str) -> str:
    """Lowercase *text* and collapse every run of two or more dots into a single space."""
    lowered = text.lower()
    # "..." (ellipses and longer runs) become a word separator; single dots are kept.
    return re.sub(r"\.{2,}", " ", lowered)
def clean_and_tokenize(text: str):
    """Normalize *text* and return the set of word tokens it contains."""
    normalized = normalize_text(text)
    words = re.findall(r"\b\w+\b", normalized)
    return set(words)
| # -------------------- DEBUG Rule-Based Filter -------------------- | |
def debug_rule_based_check(text: str) -> bool:
    """Run every rule-based offensive-content check over *text*, printing
    which rule matched (debug aid).

    Checks, in order: swear-word tokens, hate-keyword tokens, hate short
    forms, offensive phrases (substring), hate hashtags (substring), and
    hate emojis (substring on the raw text). Returns True on the first
    match, False if nothing fires.
    """
    text_norm = normalize_text(text)
    # clean_and_tokenize normalizes internally, so pass the raw text once
    # instead of normalizing twice (the original normalized, then tokenized
    # the already-normalized string — redundant work, same result).
    tokens = clean_and_tokenize(text)

    # Token-level rules: one set intersection per lexicon, computed once
    # (the original evaluated each intersection twice — test + message).
    token_rules = (
        ("SWEAR_WORDS", SWEAR_WORDS),
        ("HATE_KEYWORDS", HATE_KEYWORDS),
        ("HATE_SHORT_FORMS", HATE_SHORT_FORMS),
    )
    for label, lexicon in token_rules:
        hits = tokens & lexicon
        if hits:
            print(f"π Matched {label}: {hits}")
            return True

    # Substring rules on the normalized text.
    # NOTE(review): plain `in` can fire inside longer words (e.g. a phrase
    # embedded mid-token) — confirm this is intended.
    for phrase in OFFENSIVE_PHRASES:
        if phrase in text_norm:
            print(f"π Matched OFFENSIVE_PHRASE: '{phrase}'")
            return True
    for tag in HATE_HASHTAGS:
        if tag in text_norm:
            print(f"π Matched HATE_HASHTAG: {tag}")
            return True

    # Emojis are matched against the raw text, as in the original
    # (normalization only affects case and dot runs, so either works).
    for emoji in HATE_EMOJIS:
        if emoji in text:
            print(f"π Matched HATE_EMOJI: {emoji}")
            return True

    print("β No match found.")
    return False