Spaces:
Sleeping
Sleeping
Commit ·
b8300d6
0
Parent(s):
Deploy: Comment Guard API - FastAPI + MuRIL BERT
Browse files- .dockerignore +6 -0
- Dockerfile +16 -0
- admin_manager.py +84 -0
- clean_dataset.py +88 -0
- data/bad_emojis.txt +94 -0
- data/secure_words.bin +1 -0
- data/telugu_badwords.txt +425 -0
- export_badwords.py +50 -0
- inspect_data.py +18 -0
- kaggle_training_v3.py +327 -0
- main.py +294 -0
- merge_datasets.py +53 -0
- requirements.txt +11 -0
- train_model.py +418 -0
- verify_model.py +58 -0
.dockerignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
env/
|
| 3 |
+
venv/
|
| 4 |
+
.git
|
| 5 |
+
.gitignore
|
| 6 |
+
*.pyc
|
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
# Pre-download the model to cache it in the image (optional but good for speed)
|
| 9 |
+
# We can run a small python script to trigger the download or just let it download on first run.
|
| 10 |
+
# For simplicity, we let it download on first run.
|
| 11 |
+
|
| 12 |
+
COPY . .
|
| 13 |
+
|
| 14 |
+
EXPOSE 8000
|
| 15 |
+
|
| 16 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
admin_manager.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 6 |
+
PLAIN_FILE = os.path.join(DATA_DIR, "telugu_badwords.txt")
|
| 7 |
+
SECURE_FILE = os.path.join(DATA_DIR, "secure_words.bin")
|
| 8 |
+
|
| 9 |
+
def load_secure_words():
    """Read the stored word list from SECURE_FILE.

    The file holds a base64-encoded, newline-separated UTF-8 word list
    (base64 is an encoding, not encryption).

    Returns:
        A list of non-empty, whitespace-stripped entries. An empty list is
        returned when the file does not exist, and also when reading or
        decoding fails (the error is printed in that case).
    """
    if os.path.exists(SECURE_FILE):
        try:
            with open(SECURE_FILE, "rb") as handle:
                raw = handle.read()
            text = base64.b64decode(raw).decode("utf-8")
            stripped = (entry.strip() for entry in text.splitlines())
            return [entry for entry in stripped if entry]
        except Exception as e:
            # Best-effort loader: report the problem and fall through to [].
            print(f"Error loading secure file: {e}")
    return []
|
| 20 |
+
|
| 21 |
+
def save_secure_words(words):
    """Persist *words* to SECURE_FILE as base64-encoded UTF-8 text.

    The words are joined with newlines, base64-encoded and written to a
    temporary sibling file which is then moved over SECURE_FILE with
    ``os.replace``. An interrupted or failed write therefore leaves the
    previous word list intact instead of a truncated file.

    Args:
        words: Sized sequence of strings to store, one entry per line.

    Errors are printed rather than raised, matching the loader's
    best-effort behaviour.
    """
    tmp_path = SECURE_FILE + ".tmp"
    try:
        content = "\n".join(words)
        encoded_data = base64.b64encode(content.encode("utf-8"))
        with open(tmp_path, "wb") as f:
            f.write(encoded_data)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, SECURE_FILE)
        print(f"Successfully saved {len(words)} words to secure storage.")
    except Exception as e:
        print(f"Error saving secure file: {e}")
        # Don't leave a half-written temp file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
|
| 30 |
+
|
| 31 |
+
def migrate():
    """Copy the plain-text word list (PLAIN_FILE) into the encoded store.

    Blank lines and comment lines are skipped. The comment test is applied
    to the stripped line so that indented ``#`` comments are not migrated
    as words. The result is handed to ``save_secure_words``.
    """
    if not os.path.exists(PLAIN_FILE):
        print(f"No plain text file found at {PLAIN_FILE}")
        return

    print(f"Migrating {PLAIN_FILE} to secure storage...")
    # utf-8-sig also reads plain UTF-8, but drops a leading BOM so it does
    # not end up glued to the first word.
    with open(PLAIN_FILE, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        words = [w for w in stripped_lines if w and not w.startswith("#")]

    # NOTE(review): save_secure_words reports failures by printing only, so
    # the message below is shown even if the save did not succeed.
    save_secure_words(words)
    print("Migration complete. You can now safely delete the .txt file.")
|
| 42 |
+
|
| 43 |
+
def view_words():
    """Print the stored words, one per line, between a header and a footer.

    The header includes the number of words returned by
    ``load_secure_words``.
    """
    entries = load_secure_words()
    header = f"--- SECURE WORD LIST ({len(entries)} words) ---"
    footer = "-------------------------------------------"
    print("\n".join([header, *entries, footer]))
|
| 49 |
+
|
| 50 |
+
def add_word(word):
    """Add *word* to the stored list unless it is already present.

    The word is stripped first because the loader strips every stored
    entry; without this, a padded word would bypass the duplicate check
    and be stored again. Empty input and input spanning several lines are
    rejected, since the storage format is one entry per line.

    Args:
        word: The word or phrase to add.
    """
    word = word.strip()
    if not word or word.splitlines() != [word]:
        print("Invalid word: it must be non-empty and fit on a single line.")
        return

    words = load_secure_words()
    if word in words:
        print(f"'{word}' is already in the list.")
        return

    words.append(word)
    save_secure_words(words)
    print(f"Added '{word}'.")
|
| 58 |
+
|
| 59 |
+
def remove_word(word):
    """Remove every occurrence of *word* from the stored list.

    The argument is stripped before matching because stored entries are
    always stripped by the loader, so a padded argument could otherwise
    never match.

    Args:
        word: The word or phrase to remove.
    """
    word = word.strip()
    words = load_secure_words()
    if word not in words:
        print(f"'{word}' not found in the list.")
        return

    remaining = [w for w in words if w != word]
    save_secure_words(remaining)
    print(f"Removed '{word}'.")
|
| 67 |
+
|
| 68 |
+
if __name__ == "__main__":
    # Command-line entry point: migrate | view | add <word> | remove <word>.
    if len(sys.argv) < 2:
        print("Usage: python admin_manager.py [migrate|view|add <word>|remove <word>]")
        sys.exit(1)

    command = sys.argv[1]
    # Join all remaining arguments so an unquoted multi-word phrase is
    # treated as one entry instead of being cut at the first token.
    phrase = " ".join(sys.argv[2:])

    if command == "migrate":
        migrate()
    elif command == "view":
        view_words()
    elif command == "add" and phrase:
        add_word(phrase)
    elif command == "remove" and phrase:
        remove_word(phrase)
    else:
        print("Invalid command or missing argument.")
        # Signal the failure to calling shells and scripts.
        sys.exit(1)
|
clean_dataset.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import traceback

# One-off maintenance script: cleans the Telugu hate-speech training
# spreadsheet, appends hand-written augmentation rows and overwrites the
# spreadsheet in place. Any failure is written to error_log.txt.
try:
    import pandas as pd

    path = 'data/training_data_telugu-hate.xlsx'
    backup_path = 'data/training_data_telugu-hate_backup.xlsx'
    print(f"Loading {path}...")
    df = pd.read_excel(path)
    print(f"Original shape: {df.shape}")

    # 1. Back up the original file just in case. Only do this when no
    #    backup exists yet: `path` is overwritten below, so a second run
    #    would otherwise replace the pristine backup with augmented data.
    if not os.path.exists(backup_path):
        df.to_excel(backup_path, index=False)

    # Clean duplicates and nans
    df = df.dropna(subset=['Comments', 'Label'])
    df['Comments'] = df['Comments'].astype(str).str.strip()
    df['Label'] = df['Label'].astype(str).str.strip().str.lower()
    df = df[df['Label'].isin(['hate', 'non-hate'])]
    df = df.drop_duplicates(subset=['Comments'], keep='first')

    print(f"Shape after cleaning: {df.shape}")

    # New words
    toxic = [
        "rey mental puku", "ni edava veshalu", "konda erri hook", "thu ni brathuku cheda", "panimashiva ra nuvu",
        "erri puku nayala", "nuvu oka pedda jaffa", "siggu ledu ra neeku", "pichi pulka gadu", "waste fellow ra nuvu",
        "dengay ra lathkor", "ni yamma kadupula koti", "adangi vedhava", "gudda balupu", "boku gadu vidu",
        "rey chetha na kodaka", "poramboku nayala", "ni mokam chudu elagundo", "chapri gadu lanjodka", "lavada lo panulu",
        "modda em kadu le", "pachi boothulu tidutha", "daridrudu", "tuppas gadu", "chavata chavata",
        "mental gadu ra vidu", "sannasi", "bewarse gadu", "ne bondha ra ne bondha", "rey puku",
        "vedava sannaasi", "guddalo em ledha", "ni amma", "ni abba", "rey lanjodoka", "addamina waste gadu",
        "rotta gadu", "faltu gadu", "picha light teesko ra puku", "lathkor gadu", "erri pusa",
        "bazar munda", "rey kojja nayala", "ni ayya ki cheppu", "solu gadu", "sollu cheppaku nayala",
        "arey howle", "bhadcow gadu", "puka musko", "rey ni amma", "denga beta",
        "ni puku lo na modda", "erri guda", "nuvvu oka waste puku", "ni yabba", "dunnapothu nayala",
        "munda mokam", "sulli gadu", "arey erri", "pedda puku", "mental na kodaka", "lanja kodaka",
        "ni amma ranku", "chethana kodaka", "musali puku", "gudda chimputha", "ni amma ninnu kaninda",
        "rey neeku guddalo dammu leda", "ni mokam meda umma", "chepaleni boothulu", "thu ni bathuku", "kukka brathuku",
        "ni bathuku bus stand", "picchi puku", "hook gani laga unnav", "gadida kodaka",
        "donga puku", "munda edava", "musko ra jaffa", "bocchu gadu", "ni ayya puku", "naa modda guduvu",
        "lavadalo comments", "item gani laga unnav", "loffer gadu", "ni face ki dippa okate takkuva", "pakodi gadu",
        "mental hospital ki ellu", "rey pichi guda", "bithiri", "buffoon gadu", "420 gadu",
        "ne kamma", "ni bondha pettu", "kothi na kodaka", "labor na kodaka", "signal daggara adukko", "Footpath gadu"
    ]

    safe = [
        "super undi bro", "congrats macha", "all the best ra", "chala bagundi", "kekaa",
        "thanks anna", "subram ga undi", "awesome work", "good job keep it up", "nice explanation",
        "this is very helpful", "mee video lu ante chala ishtam", "first comment ra", "video super", "nice editing",
        "super ga chepparu", "meeru inka goppavallu avvali", "waiting for next part", "good morning everyone", "have a nice day",
        "really nice bro", "bhale cheppav", "good point", "manchi maata", "exactly macha",
        "agreed", "well said anna", "proud of you", "jai hind", "super hit",
        "very informative", "hats off to you", "good lesson learned", "superb acting", "next level",
        "mind blowing performance", "keep soaring high", "bagundi", "baga chesaaru", "congratulations brother",
        "so beautiful", "very nice song", "loved this", "manchi content idhi", "thank you so much",
        "keep going", "amazing as always", "very true words", "good luck", "edo oka roju sadhistavu",
        "meeru goppa anna", "salute anna", "inspiring video", "bhale tisaaru", "cinematography peaks",
        "this made my day", "chala happy ga undi", "super star nvvu", "naaku idi chala use aindi", "respect",
        "god bless you", "super anna", "keep doing videos", "nenu subscribe chesa", "like kottandi",
        "miku manchi jargali", "great progress", "awesome efforts", "very nice tutorial", "fantastic",
        "proud moment", "excellent work", "bhale undi kada", "super ga navvu", "nice smile",
        "thanks for your support", "manchi advice", "helpful tips", "very clear", "super bro super",
        "love from hyd", "amazing talent", "keep rocking", "gret job", "so soothing",
        "wonderful video", "sweet comments", "very kind of you", "thank you akka", "wow super",
        "masterpiece", "great info", "good stuff", "so positive", "happy for you", "best wishes",
        "take care", "always supporting you", "superb explanation", "nice tutorial bro", "you are the best"
    ]

    # Map to new rows
    new_rows = []

    for t in toxic:
        new_rows.append({'S.No': 'AUGMENTED_HATE', 'Comments': t, 'Label': 'hate'})
    for s in safe:
        new_rows.append({'S.No': 'AUGMENTED_SAFE', 'Comments': s, 'Label': 'non-hate'})

    augment_df = pd.DataFrame(new_rows)
    final_df = pd.concat([df, augment_df], ignore_index=True)

    # De-duplicate again after augmenting: an augmentation phrase may
    # already be in the dataset (always true on a re-run, because the
    # spreadsheet is overwritten below). keep='first' lets existing
    # dataset rows win over the appended ones.
    final_df = final_df.drop_duplicates(subset=['Comments'], keep='first')
    final_df = final_df.reset_index(drop=True)

    # Overwrite
    final_df.to_excel(path, index=False)
    print(f"Final shape: {final_df.shape}")
    print("✅ Augmentation complete! Successfully wrote to Excel.")

except Exception:
    # Explicit UTF-8 so a traceback containing non-ASCII text can always
    # be written, whatever the platform's default encoding is.
    with open('error_log.txt', 'w', encoding='utf-8') as f:
        f.write(traceback.format_exc())
    print("Script failed. See error_log.txt")
|
data/bad_emojis.txt
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Offensive Emojis Blacklist
|
| 2 |
+
# Emojis that should be blocked in comments/chat
|
| 3 |
+
# Add or remove as needed for your moderation policy
|
| 4 |
+
|
| 5 |
+
# ── Offensive Gestures ────────────────────────────────────────────
|
| 6 |
+
🖕
|
| 7 |
+
🖕🏻
|
| 8 |
+
🖕🏼
|
| 9 |
+
🖕🏽
|
| 10 |
+
🖕🏾
|
| 11 |
+
🖕🏿
|
| 12 |
+
🤙
|
| 13 |
+
🤏
|
| 14 |
+
|
| 15 |
+
# ── Threats / Violence / Weapons ─────────────────────────────────
|
| 16 |
+
🔪
|
| 17 |
+
🗡️
|
| 18 |
+
🔫
|
| 19 |
+
🪃
|
| 20 |
+
💣
|
| 21 |
+
🧨
|
| 22 |
+
⚰️
|
| 23 |
+
🪦
|
| 24 |
+
☠️
|
| 25 |
+
💀
|
| 26 |
+
🩸
|
| 27 |
+
🪓
|
| 28 |
+
🏹
|
| 29 |
+
⚔️
|
| 30 |
+
🛡️
|
| 31 |
+
|
| 32 |
+
# ── Aggression / Anger ────────────────────────────────────────────
|
| 33 |
+
👊
|
| 34 |
+
🤜
|
| 35 |
+
🤛
|
| 36 |
+
💢
|
| 37 |
+
😡
|
| 38 |
+
🤬
|
| 39 |
+
😤
|
| 40 |
+
👿
|
| 41 |
+
😾
|
| 42 |
+
|
| 43 |
+
# ── Explicit / Sexual Content ─────────────────────────────────────
|
| 44 |
+
🍆
|
| 45 |
+
🍑
|
| 46 |
+
💦
|
| 47 |
+
🔞
|
| 48 |
+
🥵
|
| 49 |
+
👅
|
| 50 |
+
💋
|
| 51 |
+
🍒
|
| 52 |
+
🌮
|
| 53 |
+
🌭
|
| 54 |
+
🍌
|
| 55 |
+
🍫
|
| 56 |
+
🛏️
|
| 57 |
+
🔑
|
| 58 |
+
📸
|
| 59 |
+
🩲
|
| 60 |
+
🩳
|
| 61 |
+
👙
|
| 62 |
+
💊
|
| 63 |
+
|
| 64 |
+
# ── Harassment / Mocking ─────────────────────────────────────────
|
| 65 |
+
🤡
|
| 66 |
+
🤢
|
| 67 |
+
🤮
|
| 68 |
+
💩
|
| 69 |
+
🐷
|
| 70 |
+
🐖
|
| 71 |
+
🐮
|
| 72 |
+
🐄
|
| 73 |
+
🐒
|
| 74 |
+
🙊
|
| 75 |
+
🐸
|
| 76 |
+
🐀
|
| 77 |
+
🐁
|
| 78 |
+
🦠
|
| 79 |
+
🐛
|
| 80 |
+
🪲
|
| 81 |
+
🪳
|
| 82 |
+
|
| 83 |
+
# ── Hate Symbols ─────────────────────────────────────────────────
|
| 84 |
+
卐
|
| 85 |
+
卍
|
| 86 |
+
|
| 87 |
+
# ── Dangerous / Risk ─────────────────────────────────────────────
|
| 88 |
+
🧪
|
| 89 |
+
💉
|
| 90 |
+
🩺
|
| 91 |
+
☢️
|
| 92 |
+
☣️
|
| 93 |
+
⚠️
|
| 94 |
+
🚨
|
data/secure_words.bin
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ZG9uZ2EKdmVkaGF2YQp2ZWRhdmEKcGFuZGkKa3Vra2EKbmVlIGFtbWEKbmVlIGFiYmEKY2hhdHRhCndhc3RlIGZlbGxvdwppdGVtCmxvdyBjbGFzcwpwaWNjaGkKZXJyaXBhcHBhCmVycmkKbXVuZGFtb3BpCmRhcmlkcnlhCnNhbmkKcGFuaWtpIG1hbGluYQp0aGlra2EKd29yc3QgZmVsbG93CmJsb29keSBmb29sCnVzZWxlc3MgZmVsbG93CmxhbmphCmxhbmpha29kYWthCmxhbmphIGtvZGFrYQptdW5kYQptb2RkYQpkZW5ndQpkZW5nZXkKZGVuZ3V0aGEKbmVlIHlhYmJhCmNoYXZhdGEKc2FubmFzaQpsdWNjaGEKaG93bGUKcHVrdQpwdWsKbWFkZGEKbGF2YWRhCmtvamphCmhpanJhCmJvY2NodQpuZSB5YW1tYQpuZSBheXlhCmJva3UKYmFkY293CmVycmkgcHVrdQpwaWNoaSBsYW5qYWtvZGFrYQpib2t1bG8KZ3VkZGEKbXVzYWxpCm5pIGJvbmRoYQpuaSBhYmJhCmNoZXR0YW5hIGtvZGFrYQpkdXJtYXJndWR1Cm5lZSBheXlhCmNoYXR0YSBuYSBrb2Rha2EKcGljaGkgcHVsa2EKZXJyaSBwdXNocGFtCndhc3RlIGdhZHUKbmUga2FtbWEKd2FzdGUgbmEga29kYWthCnBvcmFtYm9rdQpzaWdndSBsZW5pCmxhamphCnllcnJpCmJld2Fyc2kKYmV3YXJzCnBha29kaQpwdWxrYQpidWZmb29uCnNjb3VuZHJlbApyYXNjYWwKaWRpb3QKc3R1cGlkCmxvc2VyCmxvYWZlcgpyb3dkeQo0MjAKZG9uZ2FuYSBrb2Rha2EKbmVlIGZ1a3UKa29uZGEgZXJyaQpwb29rCnBvb2t1Cm1vZGRhbG8KbGF2YWRhbG8Kc3VsbGkKc3VsbGlnYQpsYWJvciBuYSBrb2Rha2EKY2hhcHJpCmNoYXByaSBnYWR1CmVycmlob29rCmhvb2sgZ2FkdQpiaGFkY293CmJoYWRrYXcKaG93bGEKamFmZmEKZ2FqdWxhdGhvCmtvamphIG5hIGtvZGFrYQpzaGlrYW5kaQpmYWtlIGdhZHUKZnJhdWQgZ2FkdQpkdW5uYXBvdGh1CmdhYWRpZGEKZ2FkaWRhCmJ1ZmZhbG8KbW9ua2V5CmtvdGhpCmtvdGhpIHZlZGhhdmEKc29sbHUKc29sdQpzb2xsdSBnYWR1CnZhZGh1cmEKb2RpeWFtbWEKeWFkYXZhCnllZGF2YQp0dWR1bXUKd2FzdGUgYm9keQpjaGV0aGEKY2hldHRhCnBlbmR1CnRyYXNoCmdhcmJhZ2UKZGlydHkgZmVsbG93Cm5hc3R5CmNoZWFwIGZlbGxvdwpsb3cgY2xhc3MgZmVsbG93CnRoaXJkIGNsYXNzCjNyZCBjbGFzcwo0dGggY2xhc3MKbWVudGFsbwpwc3ljaG8Kc2FkaXN0CnRodXB1awp3b3JzdCBnYWR1CnBpY2hpIG5hIGtvZGFrYQplcnJpIG5hIGtvZGFrYQpkb25nYSBuYSBrb2Rha2EKZG9uZ2EgbXVuZGEKcmFua3UgbXVuZGEKYmF6YXJ1IG11bmRhCmJhemFyIGRhbmEKcm9hZCBtZWVkYSB0aWdlIGRhbmEKdGlydWd1Ym90aHUKdGhpcnVndWJvdGh1CnRhYWd1Ym90aHUKdGFndWJvdGh1Cmp1bGUKanVsYXlpCmF2YWxhbmphCmFkZGFtaW5hCmFkZGFtaW5hIHBhbnVsdQpuZWVrdSBlbmR1a3UgcmEKbmVla3UgZW5kdWt1Cm11c3Vrb25pIGt1cmNobwptdXN1a28Kc2h1dCB1cApjbG9zZSB5b3VyIG1vdXRoCm5vcnUgbXV5eWkKbm9ydSBtdXN1a28Kbm90bG8KZ3VkZGFsbwpiYXN0aGkK
c2x1bQpzbHVtIGZlbGxvdwpsb2NhbCBnYWR1CnVuY2l2aWxpemVkCmJhcmJhcmlhbgpicnV0ZQpzYXZhZ2UKcmFrc2hhc3VkYQpyYWtzaGFzaQp3aXRjaApiaXRjaApzbHV0Cndob3JlCnByb3N0aXR1dGUKYmFzdGFyZAphc3Nob2xlCmZ1Y2tlcgptb3RoZXJmdWNrZXIKc2lzdGVyIGZ1Y2tlcgpicm90aGVyIGZ1Y2tlcgpmYXRoZXIgZnVja2VyCmRpY2sKY29jawpwdXNzeQpjdW50CnRpdHMKYm9vYnMKbmlwcGxlCnBlbmlzCnZhZ2luYQpmdWNrCmZ1Y2tpbmcKZnVja2VkCnNjcmV3ZWQKc2hhZ2dlZApodW1wZWQKY3JlYW1waWUKc3Blcm0Kc2VtZW4Kaml6egpzcHVuawpzcXVpcnQKaG9ybnkKcmFwZQptb2xlc3QKaGFyYXNzCmFzc2F1bHQKYWJ1c2UKdmlvbGF0ZQpkZWdyYWRlCmh1bWlsaWF0ZQpzdWljaWRlCmt5cwpjaG9rZQpzdHJhbmdsZQpzdWZmb2NhdGUKc2xhcApzcGl0CnNoaXQKZmlsdGgKZ3JpbWUKbXVjawpzbGltZQpzY3VtCnZlcm1pbgpwZXN0CnBhcmFzaXRlCmxlZWNoCm1hZ2dvdApqYWNrYXNzCm11bGUKb3gKYnVsbAp2dWx0dXJlCnNuYWtlCmxpemFyZApiYXN0aGkgZ2FkdQpiYXN0aGkgZmVsbG93CnJvYWQgZmVsbG93CnJvYWQgZ2FkdQpzdHJlZXQgZmVsbG93CnBhdmVtZW50IGZlbGxvdwpmb290cGF0aCBnYWR1CnNpZ25hbCBnYWR1CnRyYWZmaWMgZmVsbG93CmF1dG8gZ2FkdQpyaWtzaGEgZ2FkdQpjb29saWUKY29vbGllIGdhZHUKbGFib3IgZ2FkdQpzd2VlcGVyIGdhZHUKZ2FyYmFnZSBnYWR1CmR1c3RiaW4gZ2FkdQp0b2lsZXQgZ2FkdQpndXR0ZXIgZ2FkdQpkcmFpbiBnYWR1CnNld2VyIGdhZHUKbWFuaG9sZSBnYWR1Cm5lZSBpbnRsbwpuZWUgaW50aSB2YWx1Cm5lZSBmYW1pbHkKbmVlIHBhcmVudHMKbmVlIGZhdGhlcgpuZWUgbW90aGVyCm5lZSBzaXN0ZXIKbmVlIGJyb3RoZXIKbmVlIHdpZmUKbmVlIGh1c2JhbmQKZ3VkZGEgbG8KcHVrdSBsbwptb2RkYSBsbwpsYXZhZGEgbG8KYm9ra2EgbG8Kbm90bG8gcGV0dGkKZ3VkZGFsbyBwZXR0aQpwdWt1bG8gcGV0dGkKbW9kZGFsbyBwZXR0aQpkZW5ndXRhbnUKZGVuZ2VzdGEKZGVuZ2FsaQpkZW5naWNodWtvCmRlbmdleSByYQpkZW5nZXkgbGUKZGVuZ2lwb3RoYQpkZW5naXBveWEKZGVuZ2lwb3lpbmEKZGVuZ2luY2h1a3VubmEKZGVuZ2ljaHVrdW50dW5uYQpkZW5ndXR1bm5hCmRlbmd1dHVubmFudQpkZW5ndXR1bm5hdgpkZW5ndXR1bm5hZHUKZGVuZ3V0dW5uYWRpCmRlbmd1dHVubmFtCmRlbmd1dHVubmFydQpkZW5ndXR1bm5haQpkZW5nYW51CmRlbmdhdgpkZW5nYWR1CmRlbmdpbmRpCmRlbmdhbQpkZW5nYXJ1CmRlbmdhaQpkZW5naW5hCmRlbmdpbmF2CmRlbmdpbmFkdQpkZW5naW5hZGkKZGVuZ2luYW0KZGVuZ2luYXJ1CmRlbmdpbmFpCmRlbmdlc2FudQpkZW5nZXNhdgpkZW5nZXNhZHUKZGVuZ2VzYWRpCmRlbmdlc2FtCmRlbmdlc2FydQpkZW5nZXNhaQpkZW5nZXN0YXYKZGVuZ2VzdGFkdQpkZW5nZXN0YWRpCmRlbmdlc3RhbQpkZW5nZXN0YXJ1CmRlbmdlc3Rh
aQpkZW5ndXRhdgpkZW5ndXRhZHUKZGVuZ3V0YWRpCmRlbmd1dGFtCmRlbmd1dGFydQpkZW5ndXRhaQpwdWt1bG8gZGVuZ3V0YW51Cmd1ZGRhbG8gZGVuZ3V0YW51Cm5vdGxvIGRlbmd1dGFudQpib2trYWxvIGRlbmd1dGFudQpsYXZhZGFsbyBkZW5ndXRhbnUKbW9kZGFsbyBkZW5ndXRhbnUKcHVrdSBkZW5ndXRhbnUKZ3VkZGEgZGVuZ3V0YW51CmJva2thIGRlbmd1dGFudQpsYXZhZGEgZGVuZ3V0YW51Cm1vZGRhIGRlbmd1dGFudQpub3J1IGRlbmd1dGFudQpwdWt1IHJhCmd1ZGRhIHJhCmJva2thIHJhCmxhdmFkYSByYQptb2RkYSByYQpub3J1IHJhCnB1a3UgbGUKZ3VkZGEgbGUKYm9ra2EgbGUKbGF2YWRhIGxlCm1vZGRhIGxlCm5vcnUgbGUKcHVrdSBsYW5qYQpndWRkYSBsYW5qYQpib2trYSBsYW5qYQpsYXZhZGEgbGFuamEKbW9kZGEgbGFuamEKbm9ydSBsYW5qYQpwdWt1IGtvZGFrYQpndWRkYSBrb2Rha2EKYm9ra2Ega29kYWthCmxhdmFkYSBrb2Rha2EKbW9kZGEga29kYWthCm5vcnUga29kYWthCnB1a3UgbXVuZGEKZ3VkZGEgbXVuZGEKYm9ra2EgbXVuZGEKbGF2YWRhIG11bmRhCm1vZGRhIG11bmRhCm5vcnUgbXVuZGEKcHVrdSBkb25nYQpndWRkYSBkb25nYQpib2trYSBkb25nYQpsYXZhZGEgZG9uZ2EKbW9kZGEgZG9uZ2EKbm9ydSBkb25nYQpwdWt1IGVycmkKZ3VkZGEgZXJyaQpib2trYSBlcnJpCmxhdmFkYSBlcnJpCm1vZGRhIGVycmkKbm9ydSBlcnJpCnB1a3UgcGljY2hpCmd1ZGRhIHBpY2NoaQpib2trYSBwaWNjaGkKbGF2YWRhIHBpY2NoaQptb2RkYSBwaWNjaGkKbm9ydSBwaWNjaGkKcHVrdSB3YXN0ZQpndWRkYSB3YXN0ZQpib2trYSB3YXN0ZQpsYXZhZGEgd2FzdGUKbW9kZGEgd2FzdGUKbm9ydSB3YXN0ZQpwdWt1bG8gcGV0dGkgZGVuZ3V0YW51Cmd1ZGRhbG8gcGV0dGkgZGVuZ3V0YW51Cm5vdGxvIHBldHRpIGRlbmd1dGFudQpib2trYWxvIHBldHRpIGRlbmd1dGFudQpsYXZhZGFsbyBwZXR0aSBkZW5ndXRhbnUKbW9kZGFsbyBwZXR0aSBkZW5ndXRhbnUKeW91IGFyZSBzdHVwaWQKeW91IGFyZSBhbiBpZGlvdAp5b3UncmUgc28gZHVtYgp3aGF0IGEgbG9zZXIKaSB3aWxsIGZpbmQgeW91CnlvdSBkZXNlcnZlIHRvIGRpZQppIGhhdGUgeW91CnlvdSdyZSBkaXNndXN0aW5nCm5vYm9keSBsaWtlcyB5b3UKeW91J3JlIHBhdGhldGljCmdldCBsb3N0Cm5vYm9keSBhc2tlZAp5b3UncmUgd29ydGhsZXNzCnlvdSdyZSB0cmFzaApraWxsIHlvdXJzZWxmCnlvdSdyZSB1Z2x5CnlvdSdyZSBhbm5veWluZwpnbyB0byBoZWxsCnN0dXBpZCBnYSB1bm5hdgp0aGlzIGlzIGdhcmJhZ2UKbm9ib2R5IGFza2VkIGZvciB5b3VyIG9waW5pb24K
|
data/telugu_badwords.txt
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
donga
|
| 2 |
+
vedhava
|
| 3 |
+
vedava
|
| 4 |
+
pandi
|
| 5 |
+
kukka
|
| 6 |
+
nee amma
|
| 7 |
+
nee abba
|
| 8 |
+
chatta
|
| 9 |
+
waste fellow
|
| 10 |
+
item
|
| 11 |
+
low class
|
| 12 |
+
picchi
|
| 13 |
+
erripappa
|
| 14 |
+
erri
|
| 15 |
+
mundamopi
|
| 16 |
+
daridrya
|
| 17 |
+
sani
|
| 18 |
+
paniki malina
|
| 19 |
+
thikka
|
| 20 |
+
worst fellow
|
| 21 |
+
bloody fool
|
| 22 |
+
useless fellow
|
| 23 |
+
lanja
|
| 24 |
+
lanjakodaka
|
| 25 |
+
lanja kodaka
|
| 26 |
+
munda
|
| 27 |
+
modda
|
| 28 |
+
dengu
|
| 29 |
+
dengey
|
| 30 |
+
dengutha
|
| 31 |
+
nee yabba
|
| 32 |
+
chavata
|
| 33 |
+
sannasi
|
| 34 |
+
luccha
|
| 35 |
+
howle
|
| 36 |
+
puku
|
| 37 |
+
puk
|
| 38 |
+
madda
|
| 39 |
+
lavada
|
| 40 |
+
kojja
|
| 41 |
+
hijra
|
| 42 |
+
bocchu
|
| 43 |
+
ne yamma
|
| 44 |
+
ne ayya
|
| 45 |
+
boku
|
| 46 |
+
badcow
|
| 47 |
+
erri puku
|
| 48 |
+
pichi lanjakodaka
|
| 49 |
+
bokulo
|
| 50 |
+
gudda
|
| 51 |
+
musali
|
| 52 |
+
ni bondha
|
| 53 |
+
ni abba
|
| 54 |
+
chettana kodaka
|
| 55 |
+
durmargudu
|
| 56 |
+
nee ayya
|
| 57 |
+
chatta na kodaka
|
| 58 |
+
pichi pulka
|
| 59 |
+
erri pushpam
|
| 60 |
+
waste gadu
|
| 61 |
+
ne kamma
|
| 62 |
+
waste na kodaka
|
| 63 |
+
poramboku
|
| 64 |
+
siggu leni
|
| 65 |
+
lajja
|
| 66 |
+
yerri
|
| 67 |
+
bewarsi
|
| 68 |
+
bewars
|
| 69 |
+
pakodi
|
| 70 |
+
pulka
|
| 71 |
+
buffoon
|
| 72 |
+
scoundrel
|
| 73 |
+
rascal
|
| 74 |
+
idiot
|
| 75 |
+
stupid
|
| 76 |
+
loser
|
| 77 |
+
loafer
|
| 78 |
+
rowdy
|
| 79 |
+
420
|
| 80 |
+
dongana kodaka
|
| 81 |
+
nee fuku
|
| 82 |
+
konda erri
|
| 83 |
+
pook
|
| 84 |
+
pooku
|
| 85 |
+
moddalo
|
| 86 |
+
lavadalo
|
| 87 |
+
sulli
|
| 88 |
+
sulliga
|
| 89 |
+
labor na kodaka
|
| 90 |
+
chapri
|
| 91 |
+
chapri gadu
|
| 92 |
+
errihook
|
| 93 |
+
hook gadu
|
| 94 |
+
bhadcow
|
| 95 |
+
bhadkaw
|
| 96 |
+
howla
|
| 97 |
+
jaffa
|
| 98 |
+
gajulatho
|
| 99 |
+
kojja na kodaka
|
| 100 |
+
shikandi
|
| 101 |
+
fake gadu
|
| 102 |
+
fraud gadu
|
| 103 |
+
dunnapothu
|
| 104 |
+
gaadida
|
| 105 |
+
gadida
|
| 106 |
+
buffalo
|
| 107 |
+
monkey
|
| 108 |
+
kothi
|
| 109 |
+
kothi vedhava
|
| 110 |
+
sollu
|
| 111 |
+
solu
|
| 112 |
+
sollu gadu
|
| 113 |
+
vadhura
|
| 114 |
+
odiyamma
|
| 115 |
+
yadava
|
| 116 |
+
yedava
|
| 117 |
+
tudumu
|
| 118 |
+
waste body
|
| 119 |
+
chetha
|
| 120 |
+
chetta
|
| 121 |
+
pendu
|
| 122 |
+
trash
|
| 123 |
+
garbage
|
| 124 |
+
dirty fellow
|
| 125 |
+
nasty
|
| 126 |
+
cheap fellow
|
| 127 |
+
low class fellow
|
| 128 |
+
third class
|
| 129 |
+
3rd class
|
| 130 |
+
4th class
|
| 131 |
+
mentalo
|
| 132 |
+
psycho
|
| 133 |
+
sadist
|
| 134 |
+
thupuk
|
| 135 |
+
worst gadu
|
| 136 |
+
pichi na kodaka
|
| 137 |
+
erri na kodaka
|
| 138 |
+
donga na kodaka
|
| 139 |
+
donga munda
|
| 140 |
+
ranku munda
|
| 141 |
+
bazaru munda
|
| 142 |
+
bazar dana
|
| 143 |
+
road meeda tige dana
|
| 144 |
+
tirugubothu
|
| 145 |
+
thirugubothu
|
| 146 |
+
taagubothu
|
| 147 |
+
tagubothu
|
| 148 |
+
jule
|
| 149 |
+
julayi
|
| 150 |
+
avalanja
|
| 151 |
+
addamina
|
| 152 |
+
addamina panulu
|
| 153 |
+
neeku enduku ra
|
| 154 |
+
neeku enduku
|
| 155 |
+
musukoni kurcho
|
| 156 |
+
musuko
|
| 157 |
+
shut up
|
| 158 |
+
close your mouth
|
| 159 |
+
noru muyyi
|
| 160 |
+
noru musuko
|
| 161 |
+
notlo
|
| 162 |
+
guddalo
|
| 163 |
+
basthi
|
| 164 |
+
slum
|
| 165 |
+
slum fellow
|
| 166 |
+
local gadu
|
| 167 |
+
uncivilized
|
| 168 |
+
barbarian
|
| 169 |
+
brute
|
| 170 |
+
savage
|
| 171 |
+
rakshasuda
|
| 172 |
+
rakshasi
|
| 173 |
+
witch
|
| 174 |
+
bitch
|
| 175 |
+
slut
|
| 176 |
+
whore
|
| 177 |
+
prostitute
|
| 178 |
+
bastard
|
| 179 |
+
asshole
|
| 180 |
+
fucker
|
| 181 |
+
motherfucker
|
| 182 |
+
sister fucker
|
| 183 |
+
brother fucker
|
| 184 |
+
father fucker
|
| 185 |
+
dick
|
| 186 |
+
cock
|
| 187 |
+
pussy
|
| 188 |
+
cunt
|
| 189 |
+
tits
|
| 190 |
+
boobs
|
| 191 |
+
nipple
|
| 192 |
+
penis
|
| 193 |
+
vagina
|
| 194 |
+
fuck
|
| 195 |
+
fucking
|
| 196 |
+
fucked
|
| 197 |
+
screwed
|
| 198 |
+
shagged
|
| 199 |
+
humped
|
| 200 |
+
creampie
|
| 201 |
+
sperm
|
| 202 |
+
semen
|
| 203 |
+
jizz
|
| 204 |
+
spunk
|
| 205 |
+
squirt
|
| 206 |
+
horny
|
| 207 |
+
rape
|
| 208 |
+
molest
|
| 209 |
+
harass
|
| 210 |
+
assault
|
| 211 |
+
abuse
|
| 212 |
+
violate
|
| 213 |
+
degrade
|
| 214 |
+
humiliate
|
| 215 |
+
suicide
|
| 216 |
+
kys
|
| 217 |
+
choke
|
| 218 |
+
strangle
|
| 219 |
+
suffocate
|
| 220 |
+
slap
|
| 221 |
+
spit
|
| 222 |
+
shit
|
| 223 |
+
filth
|
| 224 |
+
grime
|
| 225 |
+
muck
|
| 226 |
+
slime
|
| 227 |
+
scum
|
| 228 |
+
vermin
|
| 229 |
+
pest
|
| 230 |
+
parasite
|
| 231 |
+
leech
|
| 232 |
+
maggot
|
| 233 |
+
jackass
|
| 234 |
+
mule
|
| 235 |
+
ox
|
| 236 |
+
bull
|
| 237 |
+
vulture
|
| 238 |
+
snake
|
| 239 |
+
lizard
|
| 240 |
+
basthi gadu
|
| 241 |
+
basthi fellow
|
| 242 |
+
road fellow
|
| 243 |
+
road gadu
|
| 244 |
+
street fellow
|
| 245 |
+
pavement fellow
|
| 246 |
+
footpath gadu
|
| 247 |
+
signal gadu
|
| 248 |
+
traffic fellow
|
| 249 |
+
auto gadu
|
| 250 |
+
riksha gadu
|
| 251 |
+
coolie
|
| 252 |
+
coolie gadu
|
| 253 |
+
labor gadu
|
| 254 |
+
sweeper gadu
|
| 255 |
+
garbage gadu
|
| 256 |
+
dustbin gadu
|
| 257 |
+
toilet gadu
|
| 258 |
+
gutter gadu
|
| 259 |
+
drain gadu
|
| 260 |
+
sewer gadu
|
| 261 |
+
manhole gadu
|
| 262 |
+
nee intlo
|
| 263 |
+
nee inti valu
|
| 264 |
+
nee family
|
| 265 |
+
nee parents
|
| 266 |
+
nee father
|
| 267 |
+
nee mother
|
| 268 |
+
nee sister
|
| 269 |
+
nee brother
|
| 270 |
+
nee wife
|
| 271 |
+
nee husband
|
| 272 |
+
gudda lo
|
| 273 |
+
puku lo
|
| 274 |
+
modda lo
|
| 275 |
+
lavada lo
|
| 276 |
+
bokka lo
|
| 277 |
+
notlo petti
|
| 278 |
+
guddalo petti
|
| 279 |
+
pukulo petti
|
| 280 |
+
moddalo petti
|
| 281 |
+
dengutanu
|
| 282 |
+
dengesta
|
| 283 |
+
dengali
|
| 284 |
+
dengichuko
|
| 285 |
+
dengey ra
|
| 286 |
+
dengey le
|
| 287 |
+
dengipotha
|
| 288 |
+
dengipoya
|
| 289 |
+
dengipoyina
|
| 290 |
+
denginchukunna
|
| 291 |
+
dengichukuntunna
|
| 292 |
+
dengutunna
|
| 293 |
+
dengutunnanu
|
| 294 |
+
dengutunnav
|
| 295 |
+
dengutunnadu
|
| 296 |
+
dengutunnadi
|
| 297 |
+
dengutunnam
|
| 298 |
+
dengutunnaru
|
| 299 |
+
dengutunnai
|
| 300 |
+
denganu
|
| 301 |
+
dengav
|
| 302 |
+
dengadu
|
| 303 |
+
dengindi
|
| 304 |
+
dengam
|
| 305 |
+
dengaru
|
| 306 |
+
dengai
|
| 307 |
+
dengina
|
| 308 |
+
denginav
|
| 309 |
+
denginadu
|
| 310 |
+
denginadi
|
| 311 |
+
denginam
|
| 312 |
+
denginaru
|
| 313 |
+
denginai
|
| 314 |
+
dengesanu
|
| 315 |
+
dengesav
|
| 316 |
+
dengesadu
|
| 317 |
+
dengesadi
|
| 318 |
+
dengesam
|
| 319 |
+
dengesaru
|
| 320 |
+
dengesai
|
| 321 |
+
dengestav
|
| 322 |
+
dengestadu
|
| 323 |
+
dengestadi
|
| 324 |
+
dengestam
|
| 325 |
+
dengestaru
|
| 326 |
+
dengestai
|
| 327 |
+
dengutav
|
| 328 |
+
dengutadu
|
| 329 |
+
dengutadi
|
| 330 |
+
dengutam
|
| 331 |
+
dengutaru
|
| 332 |
+
dengutai
|
| 333 |
+
pukulo dengutanu
|
| 334 |
+
guddalo dengutanu
|
| 335 |
+
notlo dengutanu
|
| 336 |
+
bokkalo dengutanu
|
| 337 |
+
lavadalo dengutanu
|
| 338 |
+
moddalo dengutanu
|
| 339 |
+
puku dengutanu
|
| 340 |
+
gudda dengutanu
|
| 341 |
+
bokka dengutanu
|
| 342 |
+
lavada dengutanu
|
| 343 |
+
modda dengutanu
|
| 344 |
+
noru dengutanu
|
| 345 |
+
puku ra
|
| 346 |
+
gudda ra
|
| 347 |
+
bokka ra
|
| 348 |
+
lavada ra
|
| 349 |
+
modda ra
|
| 350 |
+
noru ra
|
| 351 |
+
puku le
|
| 352 |
+
gudda le
|
| 353 |
+
bokka le
|
| 354 |
+
lavada le
|
| 355 |
+
modda le
|
| 356 |
+
noru le
|
| 357 |
+
puku lanja
|
| 358 |
+
gudda lanja
|
| 359 |
+
bokka lanja
|
| 360 |
+
lavada lanja
|
| 361 |
+
modda lanja
|
| 362 |
+
noru lanja
|
| 363 |
+
puku kodaka
|
| 364 |
+
gudda kodaka
|
| 365 |
+
bokka kodaka
|
| 366 |
+
lavada kodaka
|
| 367 |
+
modda kodaka
|
| 368 |
+
noru kodaka
|
| 369 |
+
puku munda
|
| 370 |
+
gudda munda
|
| 371 |
+
bokka munda
|
| 372 |
+
lavada munda
|
| 373 |
+
modda munda
|
| 374 |
+
noru munda
|
| 375 |
+
puku donga
|
| 376 |
+
gudda donga
|
| 377 |
+
bokka donga
|
| 378 |
+
lavada donga
|
| 379 |
+
modda donga
|
| 380 |
+
noru donga
|
| 381 |
+
puku erri
|
| 382 |
+
gudda erri
|
| 383 |
+
bokka erri
|
| 384 |
+
lavada erri
|
| 385 |
+
modda erri
|
| 386 |
+
noru erri
|
| 387 |
+
puku picchi
|
| 388 |
+
gudda picchi
|
| 389 |
+
bokka picchi
|
| 390 |
+
lavada picchi
|
| 391 |
+
modda picchi
|
| 392 |
+
noru picchi
|
| 393 |
+
puku waste
|
| 394 |
+
gudda waste
|
| 395 |
+
bokka waste
|
| 396 |
+
lavada waste
|
| 397 |
+
modda waste
|
| 398 |
+
noru waste
|
| 399 |
+
pukulo petti dengutanu
|
| 400 |
+
guddalo petti dengutanu
|
| 401 |
+
notlo petti dengutanu
|
| 402 |
+
bokkalo petti dengutanu
|
| 403 |
+
lavadalo petti dengutanu
|
| 404 |
+
moddalo petti dengutanu
|
| 405 |
+
you are stupid
|
| 406 |
+
you are an idiot
|
| 407 |
+
you're so dumb
|
| 408 |
+
what a loser
|
| 409 |
+
i will find you
|
| 410 |
+
you deserve to die
|
| 411 |
+
i hate you
|
| 412 |
+
you're disgusting
|
| 413 |
+
nobody likes you
|
| 414 |
+
you're pathetic
|
| 415 |
+
get lost
|
| 416 |
+
nobody asked
|
| 417 |
+
you're worthless
|
| 418 |
+
you're trash
|
| 419 |
+
kill yourself
|
| 420 |
+
you're ugly
|
| 421 |
+
you're annoying
|
| 422 |
+
go to hell
|
| 423 |
+
stupid ga unnav
|
| 424 |
+
this is garbage
|
| 425 |
+
nobody asked for your opinion
|
export_badwords.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import base64
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
def export_badwords_to_excel(output_filename="custom_badwords_dataset.xlsx"):
    """Export every known offensive term to an Excel sheet labelled 'toxic'.

    Terms are gathered from three optional files under ``data/``:

    * ``telugu_badwords.txt`` - UTF-8 text, one term per line.
    * ``secure_words.bin``    - base64-encoded UTF-8 text, one term per line.
    * ``bad_emojis.txt``      - UTF-8 text, one entry per line; lines starting
      with ``#`` are skipped.

    Files that do not exist are skipped. The workbook is written to
    ``data/<output_filename>`` with the columns ``text`` and ``label``.

    Args:
        output_filename: Name of the workbook created inside ``data/``.

    Returns:
        None. Nothing is written when no terms were found.
    """
    data_dir = Path("data")
    toxic_words = []

    # 1. Load regular badwords
    p1 = data_dir / "telugu_badwords.txt"
    if p1.exists():
        with open(p1, "r", encoding="utf-8") as f:
            toxic_words.extend(l.strip() for l in f if l.strip())

    # 2. Load secure base64 badwords
    p2 = data_dir / "secure_words.bin"
    if p2.exists():
        with open(p2, "rb") as f:
            decoded = base64.b64decode(f.read()).decode("utf-8")
        toxic_words.extend(l.strip() for l in decoded.splitlines() if l.strip())

    # 3. Load bad emojis
    p3 = data_dir / "bad_emojis.txt"
    if p3.exists():
        with open(p3, "r", encoding="utf-8") as f:
            toxic_words.extend(
                l.strip() for l in f if l.strip() and not l.strip().startswith("#")
            )

    # Remove duplicates while keeping first-seen order. A plain set() would
    # reorder the rows differently on each run (string hashes are randomised
    # per process), which makes exported files impossible to diff.
    toxic_words = list(dict.fromkeys(toxic_words))
    print(f"Total unique offensive terms gathered: {len(toxic_words)}")

    if not toxic_words:
        print("No words found to export.")
        return

    # Create a DataFrame
    # Here we are just exporting the raw words as 'toxic'
    df = pd.DataFrame({
        'text': toxic_words,
        'label': 'toxic'
    })

    # Save to Excel
    output_path = data_dir / output_filename
    df.to_excel(output_path, index=False)
    print(f"Successfully exported {len(toxic_words)} words to {output_path}")
|
| 48 |
+
|
| 49 |
+
if __name__ == "__main__":
|
| 50 |
+
export_badwords_to_excel()
|
inspect_data.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
# Dump a quick structural summary of the training workbook into
# inspect_out.txt: columns, shape, label distribution, the first rows and the
# per-column count of missing values. Any failure is written to the same file.
with open('inspect_out.txt', 'w', encoding='utf-8') as report:
    emit = report.write
    emit("Loading dataset...\n")
    try:
        frame = pd.read_excel('data/training_data_telugu-hate.xlsx')

        column_names = frame.columns.tolist()
        emit("Columns: " + str(column_names) + "\n")
        emit("Shape: " + str(frame.shape) + "\n")

        # The label distribution is only reported when a 'label' column exists.
        if 'label' in frame.columns:
            label_counts = frame['label'].value_counts()
            emit("Value Counts for 'label':\n" + str(label_counts) + "\n")

        emit("\nFirst 5 rows:\n")
        emit(str(frame.head()) + "\n")

        # Look for missing values
        missing_per_column = frame.isnull().sum()
        emit("\nMissing Values:\n" + str(missing_per_column) + "\n")
    except Exception as err:
        emit("Error: " + str(err) + "\n")
|
kaggle_training_v3.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
KAGGLE MODEL V3: Aiming for 90%+ Accuracy without Overfitting
|
| 3 |
+
Optimizations:
|
| 4 |
+
1. Increased Dataset Size: More diverse templates and safe phrases for data augmentation.
|
| 5 |
+
2. Data Text Cleaning: Removed URLs, extra spaces, and user mentions to reduce noise.
|
| 6 |
+
3. Class Balancing: Automatically oversamples the minority class to perfectly balance the dataset.
|
| 7 |
+
4. Overfitting Prevention: Added Label Smoothing, Cosine Learning Rate Scheduler,
|
| 8 |
+
Warmup steps, and appropriate Weight Decay.
|
| 9 |
+
5. Model: Using 'google/muril-base-cased' which is highly optimized for Indian languages
|
| 10 |
+
including Telugu, better for code-mixed text. Added custom dropout to config.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
+
import base64
|
| 17 |
+
import random
|
| 18 |
+
import re
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
# Force UTF-8 stdout so emoji/Telugu log lines print safely (flushing is done per print via flush=True)
|
| 22 |
+
try:
|
| 23 |
+
if hasattr(sys.stdout, 'reconfigure'):
|
| 24 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
| 25 |
+
except Exception:
|
| 26 |
+
pass
|
| 27 |
+
|
| 28 |
+
print("DEBUG: Kaggle V3 Training Script started", flush=True)
|
| 29 |
+
|
| 30 |
+
# ── Paths ────────────────────────────────────────────────────────────────────
|
| 31 |
+
KAGGLE_INPUT = Path("/kaggle/input")
|
| 32 |
+
KAGGLE_OUTPUT = Path("/kaggle/working")
|
| 33 |
+
|
| 34 |
+
DATA_DIR = None
|
| 35 |
+
print(f"DEBUG: Checking for data in {KAGGLE_INPUT}...", flush=True)
|
| 36 |
+
|
| 37 |
+
for p in KAGGLE_INPUT.glob("*"):
|
| 38 |
+
if p.is_dir() and any(p.glob("*training_data*")):
|
| 39 |
+
DATA_DIR = p
|
| 40 |
+
break
|
| 41 |
+
|
| 42 |
+
if not DATA_DIR:
|
| 43 |
+
for p in KAGGLE_INPUT.rglob("*training_data*"):
|
| 44 |
+
DATA_DIR = p.parent
|
| 45 |
+
break
|
| 46 |
+
|
| 47 |
+
if not DATA_DIR:
|
| 48 |
+
DATA_DIR = KAGGLE_INPUT / "comment-guard-data"
|
| 49 |
+
|
| 50 |
+
OUTPUT_DIR = KAGGLE_OUTPUT / "model_output_v3"
|
| 51 |
+
|
| 52 |
+
# ── Dependencies ─────────────────────────────────────────────────────────────
|
| 53 |
+
try:
|
| 54 |
+
import torch
|
| 55 |
+
import transformers
|
| 56 |
+
from transformers import (
|
| 57 |
+
AutoTokenizer,
|
| 58 |
+
AutoModelForSequenceClassification,
|
| 59 |
+
AutoConfig,
|
| 60 |
+
TrainingArguments,
|
| 61 |
+
Trainer,
|
| 62 |
+
EarlyStoppingCallback
|
| 63 |
+
)
|
| 64 |
+
import pandas as pd
|
| 65 |
+
import openpyxl
|
| 66 |
+
import sklearn
|
| 67 |
+
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
| 68 |
+
import numpy as np
|
| 69 |
+
from torch.utils.data import Dataset as TorchDataset
|
| 70 |
+
from sklearn.model_selection import train_test_split
|
| 71 |
+
except ImportError:
|
| 72 |
+
print("⚠ Please run: !pip install transformers torch scikit-learn accelerate openpyxl pandas -q")
|
| 73 |
+
sys.exit(1)
|
| 74 |
+
|
| 75 |
+
# ── Config ────────────────────────────────────────────────────────────────────
|
| 76 |
+
BASE_MODEL = "google/muril-base-cased" # Great for Telugu/Code-mixed
|
| 77 |
+
MAX_LENGTH = 128
|
| 78 |
+
EPOCHS = 10 # High max epochs, relying on early stopping
|
| 79 |
+
LEARNING_RATE = 2e-5
|
| 80 |
+
WEIGHT_DECAY = 0.05
|
| 81 |
+
LABEL_SMOOTHING = 0.1 # Helps prevent overfitting by softening labels
|
| 82 |
+
WARMUP_RATIO = 0.1 # Gradual learning rate increase
|
| 83 |
+
|
| 84 |
+
# ── Functions ────────────────────────────────────────────────────────────────
|
| 85 |
+
|
| 86 |
+
def clean_text(text):
    """Return *text* lowercased with URLs, @mentions and #hashtags removed.

    The input is first converted with ``str()``, so non-string values are
    accepted. Runs of whitespace are collapsed to one space and the result is
    stripped at both ends.
    """
    cleaned = str(text).lower()
    # URLs first, then @mentions, then #hashtags.
    for noise in (r'http\S+', r'@\w+', r'#\w+'):
        cleaned = re.sub(noise, '', cleaned)
    # Collapse the gaps left behind by the removals above.
    return re.sub(r'\s+', ' ', cleaned).strip()
|
| 93 |
+
|
| 94 |
+
def is_code_mixed(text):
    """Return True when *text* contains at least one ASCII letter (A-Z or a-z).

    Used as a filter for romanised / code-mixed comments: text with no Latin
    letters at all is rejected.

    The check is per character with ``isascii() and isalpha()``. A plain range
    test from 'A' to 'z' would also accept the six punctuation characters that
    sit between 'Z' and 'a' in ASCII ([ \\ ] ^ _ `), so strings such as "___"
    would wrongly count as Latin text.

    Args:
        text: Any value; it is converted with ``str()`` before inspection.

    Returns:
        bool: True if an ASCII letter is present, otherwise False (including
        for empty or whitespace-only input).
    """
    return any(ch.isascii() and ch.isalpha() for ch in str(text))
|
| 101 |
+
|
| 102 |
+
def load_data(files):
    """Load labelled comments from CSV/Excel files into one DataFrame.

    For every file (and every sheet of an Excel workbook) the first column
    that looks like a text column and the first *other* column that looks like
    a label column are selected. Rows with missing values are dropped, the text
    is passed through ``clean_text`` and rows failing ``is_code_mixed`` are
    removed.

    Labels are mapped to ``label_int``: 1 when the normalised label is one of
    hate / offensive / hof / 1 / yes / toxic, otherwise 0.

    Args:
        files: Iterable of ``pathlib.Path`` objects; ``.csv`` is read with
            ``pd.read_csv``, anything else with ``pd.ExcelFile``.

    Returns:
        DataFrame with columns ``text``, ``label`` and ``label_int``. It is
        empty (but has those columns) when nothing could be loaded. A file that
        fails to load is reported on stdout and skipped.
    """
    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}
    frames = []
    TEXT_NAMES = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}
    TEXT_HINTS = ('text', 'comment', 'sentence')
    LABEL_HINTS = ('label', 'categor', 'class')

    def _find_column(columns, exact_names, hints):
        """Return the first column whose lowercased name matches, else None."""
        for col in columns:
            name = str(col).lower()
            if name in exact_names or any(h in name for h in hints):
                return col
        return None

    def _normalize_label(value):
        """Lowercase/strip a raw label and collapse integer-valued floats.

        A numeric label column that pandas loaded as float yields '1.0' when
        stringified; without this step it would never match '1' and every
        hateful row would silently be labelled 0.
        """
        label = str(value).strip().lower()
        try:
            number = float(label)
        except ValueError:
            return label
        return str(int(number)) if number.is_integer() else label

    for excel_file in files:
        try:
            if excel_file.suffix == '.csv':
                df = pd.read_csv(excel_file)
                sheets_data = [('csv', df)]
            else:
                xl = pd.ExcelFile(excel_file)
                sheets_data = [(sheet, xl.parse(sheet)) for sheet in xl.sheet_names]

            for sheet, df in sheets_data:
                text_col = _find_column(df.columns, TEXT_NAMES, TEXT_HINTS)
                # Never pick the text column as the label column as well
                # (e.g. a header such as "comment_class" matches both rules).
                label_col = _find_column(
                    [c for c in df.columns if c != text_col], LABEL_NAMES, LABEL_HINTS
                )

                # Compare against None so a falsy header (e.g. integer 0) still counts.
                if text_col is not None and label_col is not None:
                    sub = df[[text_col, label_col]].copy()
                    sub.columns = ['text', 'label']
                    sub = sub.dropna()
                    sub['text'] = sub['text'].apply(clean_text)
                    sub['label_int'] = sub['label'].apply(
                        lambda x: 1 if _normalize_label(x) in hate_labels_set else 0
                    )
                    sub = sub[sub['text'].apply(is_code_mixed)].reset_index(drop=True)
                    frames.append(sub)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error loading {excel_file}: {e}")

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=['text', 'label', 'label_int'])
|
| 134 |
+
|
| 135 |
+
def load_badwords_augmented():
    """V3: Build synthetic training rows from the bad-word lists.

    Reads the optional files ``telugu_badwords.txt``, ``secure_words.bin``
    (base64-encoded UTF-8) and ``bad_emojis.txt`` (``#`` lines skipped) from
    ``DATA_DIR``. For every unique term it emits 4 toxic sentences built from
    randomly chosen templates (label 1) plus 4 randomly chosen safe phrases
    (label 0; the same phrase may be drawn more than once).

    Returns:
        DataFrame with columns ``text`` and ``label_int``, or an empty
        DataFrame when no terms were found.
    """
    toxic_words = []
    p1, p2, p3 = DATA_DIR / "telugu_badwords.txt", DATA_DIR / "secure_words.bin", DATA_DIR / "bad_emojis.txt"
    if p1.exists():
        with open(p1, "r", encoding="utf-8") as f:
            toxic_words.extend([l.strip() for l in f if l.strip()])
    if p2.exists():
        with open(p2, "rb") as f:
            decoded = base64.b64decode(f.read()).decode("utf-8")
        toxic_words.extend([l.strip() for l in decoded.splitlines() if l.strip()])
    if p3.exists():
        with open(p3, "r", encoding="utf-8") as f:
            toxic_words.extend([l.strip() for l in f if l.strip() and not l.strip().startswith("#")])

    if not toxic_words:
        return pd.DataFrame()

    random.seed(42)
    # Increased variety
    toxic_templates = [
        "{word}", "you are a {word}", "{word} ga unnav", "enti ra {word}",
        "nuvvu {word}", "{word} fellow", "worst {word}", "rey {word}",
        "ni yamma {word} nayala", "nuvvu pedda {word}", "chi {word} badava",
        "endira ee {word} panulu", "tuppas {word} mokam", "nee lanti {word} inka evaru leru"
    ]

    safe_phrases = [
        "bagundi bro", "keep it up", "manchi video", "super explanation", "thanks for sharing",
        "helpful information", "nice edit", "waiting for next video", "super ga undi",
        "love from ap", "good job", "congratulations brother", "beautiful video", "awesome music",
        "next video eppudu?", "very interesting topic", "I learned a lot today", "nice talk",
        "informative content", "meeru chala baga chepparu", "meeru chala handsome", "super anna",
        "daily chustanu mee videos", "proud of you", "all the best for your future", "fantastic editing",
        "thank you so much", "very nice presentation", "please upload more", "hello everyone",
        "good morning brother", "have a great day ahead", "chala upayoga padindi", "excellent work"
    ]

    rows = []
    # Iterate in sorted order: the iteration order of a set of strings differs
    # between interpreter runs (hash randomisation), so without sorting the
    # seeded RNG would pair words with different templates on every run.
    for word in sorted(set(toxic_words)):
        # Generate 4 toxic examples per word
        for t in random.sample(toxic_templates, min(4, len(toxic_templates))):
            rows.append({'text': t.format(word=word), 'label_int': 1})
        # Generate 4 safe examples to match
        for _ in range(4):
            rows.append({'text': random.choice(safe_phrases), 'label_int': 0})

    return pd.DataFrame(rows)
|
| 178 |
+
|
| 179 |
+
# ── Main Execution ───────────────────────────────────────────────────────────
|
| 180 |
+
|
| 181 |
+
if not DATA_DIR.exists():
|
| 182 |
+
print(f"✗ ERROR: DATA_DIR {DATA_DIR} not found. Ensure dataset is added to notebook.")
|
| 183 |
+
sys.exit(1)
|
| 184 |
+
|
| 185 |
+
train_files = [f for f in DATA_DIR.iterdir() if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]
|
| 186 |
+
all_data = load_data(train_files)
|
| 187 |
+
aug_data = load_badwords_augmented()
|
| 188 |
+
if not aug_data.empty:
|
| 189 |
+
all_data = pd.concat([all_data, aug_data], ignore_index=True)
|
| 190 |
+
|
| 191 |
+
all_data = all_data.drop_duplicates(subset='text').reset_index(drop=True)
|
| 192 |
+
|
| 193 |
+
# V3: TRAIN/TEST SPLIT FIRST, THEN OVERSAMPLE THE TRAINING SPLIT ONLY
# Oversampling duplicates minority rows. If it ran before the split, copies of
# the same comment would land in both train and test, and the reported
# accuracy/F1 would partly measure memorisation. The held-out split therefore
# keeps the natural, de-duplicated class distribution.
train_df, test_df = train_test_split(all_data, test_size=0.10, random_state=42, stratify=all_data['label_int'])

counts = train_df['label_int'].value_counts()
if len(counts) == 2:
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()
    majority_count = counts[majority_class]
    minority_count = counts[minority_class]

    if minority_count < majority_count:
        df_majority = train_df[train_df['label_int'] == majority_class]
        df_minority = train_df[train_df['label_int'] == minority_class]

        # Keep every original minority row and top up with resampled copies,
        # so no real example is lost to sampling with replacement.
        df_minority_extra = df_minority.sample(majority_count - minority_count, replace=True, random_state=42)
        train_df = pd.concat([df_majority, df_minority, df_minority_extra], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"DEBUG: Oversampled class {minority_class} to {majority_count} in the training split. Total training rows balanced: {len(train_df)}")
|
| 212 |
+
|
| 213 |
+
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
| 214 |
+
|
| 215 |
+
# Incorporating Dropout into config to prevent overfitting
|
| 216 |
+
config = AutoConfig.from_pretrained(BASE_MODEL, num_labels=2, problem_type="single_label_classification")
|
| 217 |
+
config.hidden_dropout_prob = 0.2
|
| 218 |
+
config.attention_probs_dropout_prob = 0.2
|
| 219 |
+
|
| 220 |
+
model = AutoModelForSequenceClassification.from_pretrained(
|
| 221 |
+
BASE_MODEL,
|
| 222 |
+
config=config,
|
| 223 |
+
ignore_mismatched_sizes=True
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
class CommentDataset(TorchDataset):
    """Torch dataset over pre-tokenised comments and their integer labels.

    All texts are tokenised once in the constructor with the module-level
    ``tokenizer`` (truncation on, ``padding=True``, ``max_length=MAX_LENGTH``,
    PyTorch tensors). Each item is a dict holding one row of every encoding
    tensor plus a ``labels`` entry of dtype ``torch.long``.
    """

    def __init__(self, texts, labels):
        # Raw texts are kept alongside the encodings.
        self.texts = texts
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
|
| 236 |
+
|
| 237 |
+
train_dataset = CommentDataset(train_df['text'].tolist(), train_df['label_int'].tolist())
|
| 238 |
+
test_dataset = CommentDataset(test_df['text'].tolist(), test_df['label_int'].tolist())
|
| 239 |
+
|
| 240 |
+
def compute_metrics(eval_pred):
    """Turn an evaluation prediction into accuracy, F1, precision and recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        F1, precision and recall are computed with ``zero_division=0``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}
    scorers = (('f1', f1_score), ('precision', precision_score), ('recall', recall_score))
    for metric_name, scorer in scorers:
        metrics[metric_name] = scorer(labels, predicted, zero_division=0)
    return metrics
|
| 249 |
+
|
| 250 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 251 |
+
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
| 252 |
+
|
| 253 |
+
# Early stopping needs a "best" checkpoint to return to. With
# save_strategy="no" and load_best_model_at_end=False the callback either
# fails its own precondition check (older transformers releases assert that
# load_best_model_at_end is True) or stops training and leaves the weights
# from `early_stopping_patience` epochs AFTER the best one to be saved.
# To keep disk usage bounded, only model weights are checkpointed
# (save_only_model) and old checkpoints are rotated out (save_total_limit).
# NOTE(review): the best and the most recent checkpoint can coexist, so this
# still needs room for roughly two weight-only copies of the model in
# /kaggle/working - confirm against the Kaggle disk quota.
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=16 if device == 'cuda' else 8,
    per_device_eval_batch_size=32 if device == 'cuda' else 8,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type='cosine',               # Cosine learning rate scheduler
    label_smoothing_factor=LABEL_SMOOTHING,   # Softens the targets to reduce overconfidence
    eval_strategy="epoch",
    save_strategy="epoch",                    # Must match eval_strategy for best-model tracking
    save_total_limit=1,                       # Rotate old checkpoints out to limit disk usage
    save_only_model=True,                     # Skip optimizer/scheduler state in checkpoints
    load_best_model_at_end=True,              # Restore the best-F1 epoch after early stopping
    metric_for_best_model="f1",
    report_to="none",
    fp16=(device == 'cuda'),
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Stop after 3 consecutive evaluations without an F1 improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
|
| 280 |
+
|
| 281 |
+
print(f"Starting V3 training on {device}...")
|
| 282 |
+
trainer.train()
|
| 283 |
+
|
| 284 |
+
# Evaluate & Print Results
|
| 285 |
+
print("\n📊 EVALUATING MODEL V3...")
|
| 286 |
+
results = trainer.evaluate()
|
| 287 |
+
print(f"\n{'='*50}\n🏆 V3 FINAL ACCURACY: {results.get('eval_accuracy', 0)*100:.2f}%\n{'='*50}")
|
| 288 |
+
|
| 289 |
+
# --- CRITICAL KAGGLE STORAGE FIX ---
# Free up disk space before saving by clearing the HuggingFace cache and previous runs
print("\n🧹 Clearing disk space...")
import shutil
import gc

# 1. Clear large dataframes and run garbage collection
# NOTE(review): `trainer` still references the two datasets, so this probably
# frees only the dataframes - confirm if memory pressure matters here.
del all_data, train_df, test_df, train_dataset, test_dataset
gc.collect()


def _remove_tree(path, success_message):
    """Best-effort recursive delete that reports, rather than hides, a failure."""
    if not path.exists():
        return
    try:
        shutil.rmtree(path)
        print(success_message)
    except Exception as e:
        # Cleanup must never abort the run, but a failure here explains a
        # later "no space left on device" error, so make it visible.
        print(f"⚠ Could not remove {path}: {e}")


# 2. Clear known cache directories
for cache_path in [".cache/huggingface", ".cache/torch"]:
    cache_dir = Path.home() / cache_path
    _remove_tree(cache_dir, f"✅ Cleared {cache_dir}")

# 3. Aggressively delete OLD model outputs in /kaggle/working to free up 100s of MBs
for old_dir in ["model_output", "model_output_v2", "wandb"]:
    old_path = KAGGLE_OUTPUT / old_dir
    _remove_tree(old_path, f"✅ Deleted old directory: {old_path}")
|
| 318 |
+
|
| 319 |
+
# Save
|
| 320 |
+
try:
|
| 321 |
+
trainer.save_model(str(OUTPUT_DIR))
|
| 322 |
+
tokenizer.save_pretrained(str(OUTPUT_DIR))
|
| 323 |
+
with open(OUTPUT_DIR / "eval_results.json", 'w') as f: json.dump(results, f, indent=2)
|
| 324 |
+
print(f"✅ Model saved successfully to: {OUTPUT_DIR}")
|
| 325 |
+
except OSError as e:
|
| 326 |
+
print(f"\n❌ FATAL SAVING ERROR: {e}")
|
| 327 |
+
print("Kaggle ran out of disk space again! Try restarting your session or using a smaller BASE_MODEL.")
|
main.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
from better_profanity import profanity
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
# Mild/acceptable words that better_profanity should NOT flag.
|
| 10 |
+
# Using the library's built-in whitelist_words param is the most reliable fix.
|
| 11 |
+
MILD_WORDS_WHITELIST = [
|
| 12 |
+
"damn", "hell", "crap", "dang", "heck", "shoot", "frick", "freaking",
|
| 13 |
+
"sucks", "suck", "bloody", "piss", "pissed",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
# Initialize profanity filter with whitelisted mild words so they never trigger
|
| 17 |
+
profanity.load_censor_words(whitelist_words=MILD_WORDS_WHITELIST)
|
| 18 |
+
|
| 19 |
+
# Keep a set for the manual cleanup fallback (covers multi-word phrases)
|
| 20 |
+
PROFANITY_WHITELIST = set(MILD_WORDS_WHITELIST) | {"keep it up", "great post"}
|
| 21 |
+
|
| 22 |
+
# Pre-compiled regex patterns for profanity whitelist
|
| 23 |
+
PROFANITY_WHITELIST_PATTERNS = {word: re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE) for word in PROFANITY_WHITELIST}
|
| 24 |
+
|
| 25 |
+
def is_whitelisted(text: str) -> bool:
    """Return True if *text* is free of profanity once whitelisted terms are removed.

    The text is lowercased, every match of the pre-compiled whitelist patterns
    is deleted, and the remainder is re-checked with the profanity filter.
    """
    remainder = text.lower()
    for mild_pattern in PROFANITY_WHITELIST_PATTERNS.values():
        remainder = mild_pattern.sub("", remainder)
    still_profane = profanity.contains_profanity(remainder)
    return not still_profane
|
| 31 |
+
|
| 32 |
+
# Keyword-based insult/threat detector to catch what the ML model misses.
|
| 33 |
+
# Unicode apostrophe class ['‘’] handles both ASCII (') and curly (’) apostrophes.
|
| 34 |
+
INSULT_KEYWORDS = [
|
| 35 |
+
# --- English insults / threats ---
|
| 36 |
+
r"\byou['‘’]?re so dumb\b",
|
| 37 |
+
r"\bwhat a loser\b",
|
| 38 |
+
r"\bi will find you\b",
|
| 39 |
+
r"\byou deserve to die\b",
|
| 40 |
+
r"\bi hate you\b",
|
| 41 |
+
r"\byou['‘’]?re disgusting\b",
|
| 42 |
+
r"\bnobody likes you\b",
|
| 43 |
+
r"\byou['‘’]?re pathetic\b",
|
| 44 |
+
r"\bget lost\b",
|
| 45 |
+
r"\bnobody asked\b",
|
| 46 |
+
r"\byou['‘’]?re worthless\b",
|
| 47 |
+
r"\byou['‘’]?re trash\b",
|
| 48 |
+
r"\bkill yourself\b",
|
| 49 |
+
r"\bgo kill yourself\b",
|
| 50 |
+
r"\byou['‘’]?re ugly\b",
|
| 51 |
+
r"\bshut up\b",
|
| 52 |
+
r"\byou['‘’]?re annoying\b",
|
| 53 |
+
r"\bgo to hell\b",
|
| 54 |
+
r"\bstupid ga\b",
|
| 55 |
+
r"\bwaste fellow\b",
|
| 56 |
+
r"\byou['‘’]?re an idiot\b",
|
| 57 |
+
r"\bthis is garbage\b",
|
| 58 |
+
r"\byou are stupid\b",
|
| 59 |
+
r"\byou are an idiot\b",
|
| 60 |
+
r"\byou['‘’]?re dumb\b",
|
| 61 |
+
r"\bstupid idiot\b",
|
| 62 |
+
r"\bbloody fool\b",
|
| 63 |
+
# --- Telugu-English compound insults: [insult word] + gadu/fellow/vaadu ---
|
| 64 |
+
r"\b(?:buffalo|monkey|mental|psycho|cheap|nasty|dirty|useless|worst|scoundrel)"
|
| 65 |
+
r"\s+(?:gadu|fellow|vaadu|ra)\b",
|
| 66 |
+
r"\b(?:rascal|buffoon|loafer|fraud|basthi|chapri|local|rowdy|420|kothi|waste)"
|
| 67 |
+
r"\s+(?:gadu|fellow|vaadu|ra)\b",
|
| 68 |
+
r"\b(?:third\s+class|low\s+class|third-class|low-class)\s+(?:gadu|fellow|vaadu)\b",
|
| 69 |
+
r"\b(?:buffalo|monkey|mental|psycho|cheap|nasty|dirty|useless|worst|scoundrel|rascal|buffoon|loafer|fraud)\s+fellow\b",
|
| 70 |
+
# --- Telugu standalone insult suffixes ---
|
| 71 |
+
r"\bkothi\s+vedhava\b",
|
| 72 |
+
]
|
| 73 |
+
INSULT_PATTERN = re.compile("|".join(INSULT_KEYWORDS), re.IGNORECASE | re.UNICODE)
|
| 74 |
+
|
| 75 |
+
def contains_insult_keyword(text: str) -> bool:
    """Return True when any pattern from INSULT_KEYWORDS matches anywhere in *text*."""
    match = INSULT_PATTERN.search(text)
    return match is not None
|
| 78 |
+
|
| 79 |
+
# Load Custom Telugu-English Bad Words (Secure)
|
| 80 |
+
import base64
|
| 81 |
+
import os
|
| 82 |
+
|
| 83 |
+
try:
|
| 84 |
+
secure_file_path = "data/secure_words.bin"
|
| 85 |
+
if os.path.exists(secure_file_path):
|
| 86 |
+
with open(secure_file_path, "rb") as f:
|
| 87 |
+
encoded_data = f.read()
|
| 88 |
+
decoded_data = base64.b64decode(encoded_data).decode("utf-8")
|
| 89 |
+
custom_words = [line.strip() for line in decoded_data.splitlines() if line.strip()]
|
| 90 |
+
profanity.add_censor_words(custom_words)
|
| 91 |
+
print(f"Loaded {len(custom_words)} custom bad words from secure storage.")
|
| 92 |
+
else:
|
| 93 |
+
print("Warning: Secure bad words file not found.")
|
| 94 |
+
except Exception as e:
|
| 95 |
+
print(f"Warning: Could not load custom bad words: {e}")
|
| 96 |
+
|
| 97 |
+
# Load Offensive Emojis
|
| 98 |
+
offensive_emojis = set()
|
| 99 |
+
try:
|
| 100 |
+
emoji_file_path = "data/bad_emojis.txt"
|
| 101 |
+
if os.path.exists(emoji_file_path):
|
| 102 |
+
with open(emoji_file_path, "r", encoding="utf-8") as f:
|
| 103 |
+
for line in f:
|
| 104 |
+
line = line.strip()
|
| 105 |
+
if line and not line.startswith("#"):
|
| 106 |
+
offensive_emojis.add(line)
|
| 107 |
+
print(f"Loaded {len(offensive_emojis)} offensive emojis.")
|
| 108 |
+
else:
|
| 109 |
+
print("Warning: Offensive emojis file not found.")
|
| 110 |
+
except Exception as e:
|
| 111 |
+
print(f"Warning: Could not load offensive emojis: {e}")
|
| 112 |
+
|
| 113 |
+
def contains_offensive_emoji(text: str) -> bool:
    """Return True if any entry of ``offensive_emojis`` occurs as a substring of *text*."""
    return any(bad_emoji in text for bad_emoji in offensive_emojis)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
app = FastAPI(title="AI Comment Moderation API")
|
| 122 |
+
|
| 123 |
+
# CORS: the API is open to any origin. Credentials (cookies / Authorization
# headers) are therefore NOT allowed: a wildcard origin combined with
# allow_credentials=True lets any website make credentialed cross-origin
# calls, and the FastAPI docs require explicit origins, methods and headers
# whenever credentials are enabled.
# NOTE(review): if a frontend really needs credentialed requests, list its
# exact origin(s) in allow_origins and turn allow_credentials back on.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 130 |
+
|
| 131 |
+
# Initialize the toxicity classification pipeline
|
| 132 |
+
# We use 'original' to keep the original distilbert-base-uncased-finetuned-sst-2-english if we wanted simple sentiment
|
| 133 |
+
# However, for toxicity detection in Telugu-English code-mixed content, MuRIL (Multilingual
|
| 134 |
+
# Representations for Indian Languages) BERT is preferred over standard DistilBERT or toxic-bert.
|
| 135 |
+
# MuRIL is specifically trained on Indian languages and handles code-switching much better.
|
| 136 |
+
# Current production model: google/muril-base-cased (fine-tuned)
|
| 137 |
+
import torch
|
| 138 |
+
|
| 139 |
+
# Optimizations to prevent PyTorch from lagging the entire OS when running on CPU.
# `device` is handed to transformers.pipeline below: 0 = first CUDA GPU, -1 = CPU.
device = -1
try:
    if torch.cuda.is_available():
        device = 0  # Use GPU
        print("✓ CUDA GPU detected, running models on GPU for faster inference.")
    else:
        device = -1  # Use CPU
        # Cap the thread count rather than maxing out the CPU. The cap
        # defaults to 4 and can be overridden with the CPU_THREADS
        # environment variable.
        cpu_threads = int(os.environ.get("CPU_THREADS", "4"))
        torch.set_num_threads(cpu_threads)
        print(f"✓ CPU detected, limited PyTorch to {torch.get_num_threads()} threads to prevent system lag.")
except Exception as e:
    # Fall back to CPU, but say why instead of failing silently.
    device = -1
    print(f"Warning: device/thread setup failed, falling back to CPU defaults: {e}")
|
| 151 |
+
|
| 152 |
+
try:
|
| 153 |
+
# Use fine-tuned model if available (produced by train_model.py)
|
| 154 |
+
fine_tuned_path = os.path.join(os.path.dirname(__file__), "model_output")
|
| 155 |
+
if os.path.exists(fine_tuned_path) and os.path.exists(os.path.join(fine_tuned_path, "config.json")):
|
| 156 |
+
print(f"✓ Loading fine-tuned model from: {fine_tuned_path}")
|
| 157 |
+
classifier = pipeline("text-classification", model=fine_tuned_path, top_k=None, device=device)
|
| 158 |
+
else:
|
| 159 |
+
print("Loading default model: google/muril-base-cased (Fallback)")
|
| 160 |
+
print("Note: MuRIL is highly recommended for Telugu-English code-mixed content.")
|
| 161 |
+
classifier = pipeline("text-classification", model="google/muril-base-cased", top_k=None, device=device)
|
| 162 |
+
except Exception as e:
|
| 163 |
+
print(f"Error loading model: {e}")
|
| 164 |
+
classifier = None
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class CommentRequest(BaseModel):
|
| 168 |
+
text: str
|
| 169 |
+
strictness: str = "high" # "high" (Celeb) or "low" (Friend)
|
| 170 |
+
|
| 171 |
+
class Score(BaseModel):
|
| 172 |
+
label: str
|
| 173 |
+
score: float
|
| 174 |
+
|
| 175 |
+
class AnalysisResponse(BaseModel):
|
| 176 |
+
text: str
|
| 177 |
+
results: List[Score]
|
| 178 |
+
is_toxic: bool
|
| 179 |
+
|
| 180 |
+
@app.get("/")
|
| 181 |
+
def read_root():
|
| 182 |
+
return {"message": "AI Comment Moderation API is running"}
|
| 183 |
+
|
| 184 |
+
@app.post("/analyze", response_model=AnalysisResponse)
def analyze_comment(request: CommentRequest):
    """Decide whether a comment is toxic.

    The checks run from cheapest to most expensive and stop at the first hit:

    1.  Profanity list (re-checked after whitelisted phrases are removed).
    1b. Insult/threat keywords.
    2.  Offensive emojis.
    3.  Very short text (< 5 characters) is accepted without running the model.
    4.  ML classifier, thresholded according to ``request.strictness``.

    Args:
        request: Comment text plus strictness ("high" uses a 0.4 threshold,
            any other value uses 0.7).

    Returns:
        AnalysisResponse echoing the submitted text, the scores that led to
        the verdict, and the ``is_toxic`` flag. When the classifier is not
        loaded the ML step is skipped and the comment is reported as clean.

    Raises:
        HTTPException: 400 when the text is empty or whitespace only.
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    def rule_hit(label):
        # Every rule-based detector reports its hit with a fixed score of 1.0.
        return AnalysisResponse(
            text=request.text,
            results=[Score(label=label, score=1.0)],
            is_toxic=True,
        )

    # 1. Strict "Bad Word" Check (Rule-based)
    # MILD_WORDS_WHITELIST is already removed from the profanity library's censor list,
    # so only genuine profanity (slurs, explicit words) will be flagged here.
    if profanity.contains_profanity(text):
        # Extra safety: strip remaining multi-word safe phrases with the
        # precompiled patterns and re-check what is left.
        cleaned_text = text.lower()
        for pattern in PROFANITY_WHITELIST_PATTERNS.values():
            cleaned_text = pattern.sub("", cleaned_text)

        if profanity.contains_profanity(cleaned_text):
            return rule_hit("profanity_strict")
        # Only a multi-word mild phrase triggered it — continue to deeper checks.

    # 1b. Keyword-based insult/threat detector (catches ML model blind spots)
    if contains_insult_keyword(text):
        return rule_hit("insult_keyword")

    # 2. Offensive Emoji Check
    if contains_offensive_emoji(text):
        return rule_hit("offensive_emoji")

    # 3. Short Text Heuristic: too little context for the model to judge.
    if len(text) < 5:
        return AnalysisResponse(text=request.text, results=[], is_toxic=False)

    # 4. ML Model Check (Context-based)
    if not classifier:
        print("Classifier not loaded, skipping ML check.")
        return AnalysisResponse(text=request.text, results=[], is_toxic=False)

    # Truncate instead of failing on very long comments.
    # NOTE(review): 512 assumes a BERT-base style position limit (MuRIL) — confirm
    # if the model is ever swapped.
    raw_output = classifier(text, truncation=True, max_length=512)
    # Depending on the transformers version, a single string yields either
    # [[{label, score}, ...]] or [{label, score}, ...]; accept both shapes.
    if raw_output and isinstance(raw_output[0], list):
        scores = raw_output[0]
    else:
        scores = raw_output

    # Threshold based on strictness:
    #   High (Celeb)  = 0.4 (Strict)
    #   Low  (Friend) = 0.7 (Balanced)
    threshold = 0.4 if request.strictness == "high" else 0.7

    # Labels that indicate toxicity. Ignores 'LABEL_0', 'non-toxic', 'neutral', etc.
    TOXIC_LABELS = {"toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate", "LABEL_1"}

    formatted_scores = [Score(label=item['label'], score=item['score']) for item in scores]
    # Toxic only if a label from the toxic set exceeds the threshold.
    is_toxic = any(
        item['label'] in TOXIC_LABELS and item['score'] > threshold
        for item in scores
    )

    return AnalysisResponse(
        text=request.text,
        results=formatted_scores,
        is_toxic=is_toxic,
    )
|
| 265 |
+
|
| 266 |
+
@app.post("/submit")
def submit_comment(request: CommentRequest):
    """Accept a comment for posting after re-checking it server-side.

    This is a mock endpoint; a real application would persist the comment.
    The toxicity check is repeated here so that a client cannot bypass the
    frontend by calling /submit directly.

    The check delegates to ``analyze_comment`` so both endpoints agree.
    Previously this endpoint tested ``score > 0.5`` across every label the
    classifier returns (including the non-toxic one), which rejected clean
    comments, and it skipped the rule-based profanity/keyword/emoji checks.

    Args:
        request: Comment text plus strictness.

    Returns:
        A confirmation dict with the posted text.

    Raises:
        HTTPException: 500 when the model is not loaded; 400 when the text is
            empty (raised by ``analyze_comment``) or judged toxic.
    """
    if not classifier:
        raise HTTPException(status_code=500, detail="Model not loaded")

    analysis = analyze_comment(request)
    if analysis.is_toxic:
        raise HTTPException(status_code=400, detail="Comment rejected due to toxicity.")

    return {"message": "Comment posted successfully", "text": request.text}
|
| 280 |
+
|
| 281 |
+
if __name__ == "__main__":
    import uvicorn
    import os

    # Look for SSL certificates in data/ first, then in the working directory.
    key_file = "data/key.pem"
    if not os.path.exists(key_file):
        key_file = "key.pem"
    cert_file = "data/cert.pem"
    if not os.path.exists(cert_file):
        cert_file = "cert.pem"

    # Options shared by the HTTP and HTTPS launch paths.
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}

    if os.path.exists(key_file) and os.path.exists(cert_file):
        print(f"Starting server with SSL/HTTPS enabled using {cert_file} and {key_file}...")
        server_options["ssl_keyfile"] = key_file
        server_options["ssl_certfile"] = cert_file
    else:
        print("SSL certificates not found. Starting server in HTTP mode.")

    uvicorn.run("main:app", **server_options)
|
merge_datasets.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
def merge_datasets():
    """Append the custom bad-words dataset to the main training dataset.

    Reads ``data/custom_badwords_dataset.xlsx`` and
    ``data/training_data_telugu-hate.xlsx``, aligns the custom sheet's
    ``text``/``label`` columns with the main dataset's column names,
    concatenates both, drops rows whose text repeats an earlier row, and
    overwrites the main dataset file. A backup of the pre-merge main dataset
    is written first unless one already exists.

    Prints an error and returns without writing anything when an input file
    is missing or the text/label columns cannot be identified.
    """
    data_dir = Path("data")
    custom_words_file = data_dir / "custom_badwords_dataset.xlsx"
    main_dataset_file = data_dir / "training_data_telugu-hate.xlsx"

    for required_file in (custom_words_file, main_dataset_file):
        if not required_file.exists():
            print(f"Error: {required_file} not found.")
            return

    # Load both datasets
    print("Loading data...")
    custom_df = pd.read_excel(custom_words_file)
    main_df = pd.read_excel(main_dataset_file)

    print(f"Original main dataset size: {len(main_df)}")
    print(f"Custom badwords size: {len(custom_df)}")

    def find_column(df, candidates):
        # First column whose lower-cased name is one of the candidates, else None.
        return next((c for c in df.columns if str(c).lower() in candidates), None)

    # Identify column names in the main dataset (usually text/comment and label/category).
    text_col_main = find_column(main_df, {'text', 'comment', 'comments', 'sentence'})
    label_col_main = find_column(main_df, {'label', 'labels', 'category', 'class'})
    if text_col_main is None or label_col_main is None:
        # Guessing a name here would concatenate misaligned columns and let
        # drop_duplicates discard real rows, so stop instead.
        print(f"Error: could not identify text/label columns in {main_dataset_file} "
              f"(columns={list(main_df.columns)}).")
        return

    print(f"Identified columns in main dataset -> Text: '{text_col_main}', Label: '{label_col_main}'")

    # Rename custom dataset columns to match the main dataset. A column that
    # already carries the main dataset's name is left as is.
    rename_map = {}
    for source_name, target_name in (('text', text_col_main), ('label', label_col_main)):
        if target_name in custom_df.columns:
            continue
        if source_name not in custom_df.columns:
            print(f"Error: {custom_words_file} has no '{source_name}' or '{target_name}' column "
                  f"(columns={list(custom_df.columns)}).")
            return
        rename_map[source_name] = target_name
    custom_df = custom_df.rename(columns=rename_map)

    # Combine the dataframes
    merged_df = pd.concat([main_df, custom_df], ignore_index=True)

    # Remove rows whose text duplicates an earlier row
    merged_df = merged_df.drop_duplicates(subset=[text_col_main]).reset_index(drop=True)

    print(f"New merged dataset size: {len(merged_df)}")

    # Back up the original. An existing backup is kept: re-running the script
    # would otherwise replace it with already-merged data.
    backup_path = data_dir / "training_data_telugu-hate_backup2.xlsx"
    if backup_path.exists():
        print(f"Backup already exists at {backup_path}; keeping it untouched")
    else:
        main_df.to_excel(backup_path, index=False)
        print(f"Saved backup of original to {backup_path}")

    # Overwrite the main dataset
    merged_df.to_excel(main_dataset_file, index=False)
    print(f"Successfully merged and saved updated dataset to {main_dataset_file}")
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
|
| 53 |
+
merge_datasets()
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
transformers
|
| 4 |
+
torch
|
| 5 |
+
pydantic
|
| 6 |
+
better-profanity
|
| 7 |
+
tf-keras
|
| 8 |
+
scikit-learn
|
| 9 |
+
requests
|
| 10 |
+
datasets
|
| 11 |
+
accelerate
|
train_model.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fine-tune MuRIL (google/muril-base-cased) on the HOLD-Telugu (Dravidian CodeMix) dataset.
|
| 3 |
+
(MuRIL handles Telugu significantly better than standard toxic-bert)
|
| 4 |
+
SETUP:
|
| 5 |
+
1. Place the downloaded dataset in: backend/data/ (file name must contain 'training_data'; .xlsx, .xls or .csv)
|
| 6 |
+
2. Install deps: pip install transformers torch scikit-learn accelerate openpyxl pandas
|
| 7 |
+
|
| 8 |
+
USAGE:
|
| 9 |
+
cd backend
|
| 10 |
+
python train_model.py
|
| 11 |
+
|
| 12 |
+
OUTPUT:
|
| 13 |
+
Fine-tuned model saved to: backend/model_output/
|
| 14 |
+
The backend auto-loads this model on next restart.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
+
import json
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
# Force UTF-8 on stdout so the status symbols and Telugu text print on any console
|
| 23 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
| 24 |
+
|
| 25 |
+
print("DEBUG: Script started", flush=True)
|
| 26 |
+
|
| 27 |
+
# ── Import dependencies (exit with a message if any is missing) ──────────────
|
| 28 |
+
print("DEBUG: Importing dependencies...", flush=True)
|
| 29 |
+
# Import each dependency in turn and log progress, so that a missing or broken
# package is identified by the last DEBUG line before the script exits.
try:
    import torch
    # torch.version is a module; the version string lives in torch.__version__.
    print(f"DEBUG: Torch imported (v{torch.__version__})", flush=True)

    # Import transformers early
    import transformers
    print(f"DEBUG: transformers imported (v{transformers.__version__})", flush=True)

    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback
    )
    print("DEBUG: HuggingFace classes imported", flush=True)

    import pandas as pd
    print(f"DEBUG: pandas imported (v{pd.__version__})", flush=True)

    import openpyxl
    print("DEBUG: openpyxl imported", flush=True)

    import sklearn
    print(f"DEBUG: sklearn imported (v{sklearn.__version__})", flush=True)
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    print("DEBUG: sklearn metrics imported", flush=True)

    import numpy as np
    print(f"DEBUG: numpy imported (v{np.__version__})", flush=True)

    from torch.utils.data import Dataset as TorchDataset
    print("DEBUG: TorchDataset imported", flush=True)

except ImportError as e:
    print(f"DEBUG: ImportError: {e}", flush=True)
    sys.exit(1)
except Exception as e:
    print(f"DEBUG: Exception during import: {e}", flush=True)
    sys.exit(1)
|
| 69 |
+
|
| 70 |
+
# ── Paths ─────────────────────────────────────────────────────────────────────
|
| 71 |
+
BASE_DIR = Path(__file__).parent
|
| 72 |
+
DATA_DIR = BASE_DIR / "data"
|
| 73 |
+
OUTPUT_DIR = BASE_DIR / "model_output"
|
| 74 |
+
|
| 75 |
+
# ── Config ────────────────────────────────────────────────────────────────────
|
| 76 |
+
BASE_MODEL = "google/muril-base-cased" # MuRIL (Multilingual BERT) for Indian languages
|
| 77 |
+
# BASE_MODEL = "unitary/toxic-bert" # Fallback to general toxic-bert if needed
|
| 78 |
+
MAX_LENGTH = 128 # Longer context = better understanding of comments
|
| 79 |
+
EPOCHS = 8 # More epochs with early stopping patience=2
|
| 80 |
+
LEARNING_RATE = 3e-5 # Slightly higher LR for faster convergence
|
| 81 |
+
# TEST_SPLIT = 0.15 # Not needed if we use explicit files
|
| 82 |
+
|
| 83 |
+
# ── Find Excel files ─────────────────────────────────────────────────────
|
| 84 |
+
# Locate the training files: anything in DATA_DIR whose name contains
# 'training_data' and whose extension is .xlsx, .xls or .csv.
print(f"DEBUG: Searching for data in {DATA_DIR}", flush=True)
if not DATA_DIR.is_dir():
    # Without this check iterdir() would abort with a raw traceback.
    print(f"✗ Data directory not found: {DATA_DIR}")
    sys.exit(1)

# Sorted so the load order (and therefore which duplicate row survives
# de-duplication) does not depend on filesystem enumeration order.
all_files = sorted(DATA_DIR.iterdir())
print(f"DEBUG: Found files: {[f.name for f in all_files]}", flush=True)

train_files = [f for f in all_files if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]

if not train_files:
    print("✗ No training file found (looking for 'training_data*' with .xlsx, .xls or .csv)")
    sys.exit(1)
else:
    print(f"✓ Training files: {[f.name for f in train_files]}")
    # The split performed later in this script uses test_size=0.10.
    print("ℹ Test set will be a stratified 10% split from training data (same distribution)")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ── Helper to load data ──────────────────────────────────────────────────────
|
| 99 |
+
|
| 100 |
+
def is_code_mixed(text):
    """Return True if *text* looks like Telugu-English code-mixed content.

    A text qualifies when it contains at least one ASCII letter and at most
    80% of its non-whitespace characters are Telugu script (U+0C00-U+0C7F).
    Empty or whitespace-only input returns False.

    The value is passed through ``str()`` first, so non-string cells
    (numbers, None) are judged by their string form.
    """
    text = str(text)
    visible = [c for c in text if not c.isspace()]
    if not visible:
        return False

    # The earlier range test 'A' <= c <= 'z' also matched the punctuation
    # between 'Z' and 'a' ([ \ ] ^ _ `), so text without any letter passed.
    if not any(c.isascii() and c.isalpha() for c in visible):
        return False

    telugu = sum(1 for c in visible if '\u0C00' <= c <= '\u0C7F')
    # Skip text that is (almost) purely Telugu script.
    return telugu / len(visible) <= 0.8
|
| 118 |
+
|
| 119 |
+
def load_data(files):
    """Load labelled comments from the given Excel/CSV files.

    For every sheet (or CSV file) the text and label columns are detected by
    name, rows with missing values are dropped, labels are mapped to
    1 (hate/offensive) or 0, and only rows accepted by ``is_code_mixed`` are
    kept. A file that cannot be read is reported and skipped.

    Args:
        files: Iterable of ``pathlib.Path`` objects pointing at .csv or
            Excel files.

    Returns:
        DataFrame with columns ``text``, ``label`` (normalised label string)
        and ``label_int``; empty if no sheet provided usable columns.
    """
    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}

    TEXT_NAMES = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}

    def pick_column(df, exact_names, fragments):
        # First column whose lower-cased name is an exact match or contains a fragment.
        for column in df.columns:
            name = str(column).lower()
            if name in exact_names or any(fragment in name for fragment in fragments):
                return column
        return None

    frames = []
    for data_file in files:
        print(f" Loading: {data_file.name}", flush=True)
        try:
            # Support both Excel and CSV files
            if data_file.suffix.lower() == '.csv':
                sheets_data = [('csv', pd.read_csv(data_file))]
            else:
                # Context manager closes the workbook handle after parsing.
                with pd.ExcelFile(data_file) as xl:
                    sheets_data = [(sheet, xl.parse(sheet)) for sheet in xl.sheet_names]

            for sheet, df in sheets_data:
                text_col = pick_column(df, TEXT_NAMES, ('text', 'comment', 'sentence'))
                label_col = pick_column(df, LABEL_NAMES, ('label', 'categor', 'class'))

                if text_col is None or label_col is None:
                    print(f" ⚠ Sheet '{sheet}': Skipped (cols={list(df.columns)})", flush=True)
                    continue

                sub = df[[text_col, label_col]].copy()
                sub.columns = ['text', 'label']
                sub = sub.dropna()
                # Tokenization later expects plain strings.
                sub['text'] = sub['text'].astype(str)

                labels = sub['label'].astype(str).str.strip().str.lower()
                # A numeric label column read as float stringifies 1 as '1.0';
                # drop the trailing '.0' so it still matches '1'.
                sub['label'] = labels.str.replace(r'\.0+$', '', regex=True)
                sub['label_int'] = sub['label'].isin(hate_labels_set).astype(int)

                # ── Filter: keep only Telugu-English code-mixed rows ──────
                before = len(sub)
                sub = sub[sub['text'].apply(is_code_mixed)].reset_index(drop=True)
                after = len(sub)
                print(f" ✓ Sheet '{sheet}': {after} code-mixed rows kept (filtered out {before - after} non-code-mixed rows)", flush=True)

                frames.append(sub)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the others.
            print(f" ✗ Error reading {data_file.name}: {e}", flush=True)

    if not frames:
        return pd.DataFrame(columns=['text', 'label', 'label_int'])

    return pd.concat(frames, ignore_index=True)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ── Load Bad Words / Emojis as Additional Training Data ──────────────────────
|
| 178 |
+
def load_badwords_as_training_data():
    """Build synthetic training rows from the bad-word and emoji lists.

    Reads ``telugu_badwords.txt``, ``secure_words.bin`` (base64-encoded text)
    and ``bad_emojis.txt`` from ``DATA_DIR``; files that do not exist are
    skipped. Each entry is placed into three sentence templates and labelled
    toxic, and an equal number of safe rows is produced by cycling through a
    fixed list of harmless phrases.

    Returns:
        DataFrame with columns ``text``, ``label`` and ``label_int``; empty
        when no word or emoji could be loaded.

    NOTE(review): the safe rows repeat a short phrase list, and the script
    de-duplicates on ``text`` after concatenating this frame, so most safe
    rows are dropped again and the intended balance does not survive.
    """
    import base64
    import random

    # Local generator: same sequence as random.seed(42) on the module-level
    # RNG, without reseeding global state for the rest of the process.
    rng = random.Random(42)

    toxic_words = []

    # 1. Load telugu_badwords.txt
    badwords_path = DATA_DIR / "telugu_badwords.txt"
    if badwords_path.exists():
        with open(badwords_path, "r", encoding="utf-8") as f:
            toxic_words.extend(word for word in (line.strip() for line in f) if word)
        print(f" ✓ Loaded {len(toxic_words)} words from telugu_badwords.txt", flush=True)

    # 2. Load secure_words.bin (base64 encoded)
    secure_path = DATA_DIR / "secure_words.bin"
    secure_count = 0
    if secure_path.exists():
        with open(secure_path, "rb") as f:
            encoded_data = f.read()
        decoded_data = base64.b64decode(encoded_data).decode("utf-8")
        # Set lookup replaces the per-word scan of the growing list.
        seen = set(toxic_words)
        for line in decoded_data.splitlines():
            word = line.strip()
            if word and word not in seen:
                seen.add(word)
                toxic_words.append(word)
                secure_count += 1
        print(f" ✓ Loaded {secure_count} additional words from secure_words.bin", flush=True)

    # 3. Load bad_emojis.txt (lines starting with '#' are comments)
    emoji_path = DATA_DIR / "bad_emojis.txt"
    emoji_count = 0
    if emoji_path.exists():
        with open(emoji_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith("#"):
                    toxic_words.append(line)
                    emoji_count += 1
        print(f" ✓ Loaded {emoji_count} offensive emojis from bad_emojis.txt", flush=True)

    if not toxic_words:
        return pd.DataFrame(columns=['text', 'label', 'label_int'])

    # Create toxic training examples with natural sentence patterns
    toxic_templates = [
        "{word}",
        "you are a {word}",
        "{word} ga unnav",
        "enti ra {word}",
        "orey {word}",
        "nuvvu {word}",
        "{word} fellow",
        "this {word}",
    ]

    # Three randomly chosen templates per word give varied examples.
    templates_per_word = min(3, len(toxic_templates))
    toxic_rows = [
        {'text': template.format(word=word), 'label': 'hate', 'label_int': 1}
        for word in toxic_words
        for template in rng.sample(toxic_templates, templates_per_word)
    ]

    # Generate matching SAFE examples to keep the dataset balanced
    safe_phrases = [
        "good morning everyone", "nice video", "great content bro",
        "keep it up", "super ga undi", "chala bagundi",
        "love this", "awesome work", "thank you for sharing",
        "very helpful", "bagundi", "nice one", "well done",
        "interesting topic", "manchi video", "super explanation",
        "thanks for this", "really useful", "good job",
        "happy birthday", "congratulations", "best wishes",
        "nice song", "beautiful", "amazing performance",
        "very informative", "subscribed", "waiting for next video",
        "loved it", "manchi content", "edo oka roju",
        "nenu chala happy", "meeru bagunnara", "thanks anna",
        "thanks akka", "super bro", "nice edit",
        "first comment", "who is watching in 2024",
        "please make more videos", "this helped me a lot",
        "I learned something new", "great tutorial", "perfect",
    ]

    # One safe row per toxic row, cycling through the phrase list.
    safe_rows = [
        {'text': safe_phrases[i % len(safe_phrases)], 'label': 'not-hate', 'label_int': 0}
        for i in range(len(toxic_rows))
    ]

    all_rows = toxic_rows + safe_rows
    print(f" ✓ Generated {len(toxic_rows)} toxic + {len(safe_rows)} safe training examples from bad words/emojis", flush=True)
    return pd.DataFrame(all_rows)
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# ── Load and Split ───────────────────────────────────────────────────────────
|
| 284 |
+
# Read the labelled files; without any usable row there is nothing to train on.
print("\nLoading training data...", flush=True)
all_data = load_data(train_files)
if all_data.empty:
    print("✗ Error: No usable data found.", flush=True)
    sys.exit(1)

# Add the synthetic rows derived from the bad-word / emoji lists.
print("\nLoading bad words/emojis as training data...", flush=True)
badwords_data = load_badwords_as_training_data()
if not badwords_data.empty:
    all_data = pd.concat([all_data, badwords_data], ignore_index=True)
    print(f" Combined dataset size: {len(all_data)}", flush=True)

# Keep a single row per distinct text.
len_before = len(all_data)
all_data = all_data.drop_duplicates(subset='text')
print(f" Deduplicated: {len_before} -> {len(all_data)}")

# ── Stratified 90/10 split ────────────────────────────────────────────────────
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(
    all_data,
    test_size=0.10,
    random_state=42,
    stratify=all_data['label_int'],
)

print(f"\nFinal Split: Train={len(train_df)} | Test={len(test_df)}")
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"Class Dist ({split_name}): {split_df['label_int'].value_counts().to_dict()}")

# Plain Python lists are what the tokenizer call and the dataset class consume.
train_texts, train_labels = train_df['text'].tolist(), train_df['label_int'].tolist()
test_texts, test_labels = test_df['text'].tolist(), test_df['label_int'].tolist()
|
| 316 |
+
|
| 317 |
+
# ── Load tokenizer & model ────────────────────────────────────────────────────
|
| 318 |
+
print(f"\nLoading model: {BASE_MODEL}", flush=True)

# Two-class head on top of the base checkpoint.
model_kwargs = {
    "num_labels": 2,
    "ignore_mismatched_sizes": True,
    # Forces CrossEntropyLoss (fixes transformers v5 bug)
    "problem_type": "single_label_classification",
}
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, **model_kwargs)
print("✓ Model loaded", flush=True)
|
| 328 |
+
|
| 329 |
+
# ── Dataset ───────────────────────────────────────────────────────────────────
|
| 330 |
+
class CommentDataset(TorchDataset):
    """Torch dataset that tokenizes all comments up front.

    Uses the module-level ``tokenizer`` and ``MAX_LENGTH``; each item is a
    dict of the tokenizer's output tensors for one comment plus a ``labels``
    entry.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        # Tokenize the whole list in one call, truncated to MAX_LENGTH and padded.
        self.encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
|
| 344 |
+
|
| 345 |
+
print("Tokenizing datasets...", flush=True)
|
| 346 |
+
train_dataset = CommentDataset(train_texts, train_labels)
|
| 347 |
+
test_dataset = CommentDataset(test_texts, test_labels)
|
| 348 |
+
|
| 349 |
+
# ── Metrics ───────────────────────────────────────────────────────────────────
|
| 350 |
+
def compute_metrics(eval_pred):
    """Return accuracy, F1, precision and recall for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, preds)}
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    for name, scorer in (('f1', f1_score), ('precision', precision_score), ('recall', recall_score)):
        metrics[name] = scorer(labels, preds, zero_division=0)
    return metrics
|
| 359 |
+
|
| 360 |
+
# ── Training ──────────────────────────────────────────────────────────────────
|
| 361 |
+
use_gpu = torch.cuda.is_available()
device = 'cuda' if use_gpu else 'cpu'
print(f"\nTraining on: {device.upper()}", flush=True)

OUTPUT_DIR.mkdir(exist_ok=True)
batch_size = 16 if use_gpu else 8   # Smaller batch = better generalization on small datasets
eval_batch_size = 64                # No gradients during eval → can use larger batch

# Warm up over the first 10% of the planned optimisation steps.
total_steps = (len(train_dataset) // batch_size) * EPOCHS
warmup_steps = int(total_steps * 0.1)

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    weight_decay=0.05,              # Stronger regularization to prevent overfitting
    # Evaluate and checkpoint every epoch so the best-F1 checkpoint can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=25,
    report_to="none",
    fp16=use_gpu,
)

# Stop early before overfitting
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Starting training...", flush=True)
trainer.train()

# ── Final evaluation ──────────────────────────────────────────────────────────
print("\nEvaluating on test set...", flush=True)
results = trainer.evaluate()
separator = '=' * 60
print(f"\n{separator}")
print("FINAL RESULTS:")
print(f" Accuracy: {results.get('eval_accuracy', 0)*100:.2f}%")
print(f" F1 Score: {results.get('eval_f1', 0):.4f}")
print(f" Precision: {results.get('eval_precision', 0):.4f}")
print(f" Recall: {results.get('eval_recall', 0):.4f}")
print(separator)

# ── Save ──────────────────────────────────────────────────────────────────────
# Model weights, tokenizer files and the evaluation metrics all go to OUTPUT_DIR.
trainer.save_model(str(OUTPUT_DIR))
tokenizer.save_pretrained(str(OUTPUT_DIR))
with open(OUTPUT_DIR / "eval_results.json", 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n✅ Done! Model saved to: {OUTPUT_DIR}", flush=True)
|
verify_model.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MODEL VERIFICATION SCRIPT
|
| 3 |
+
Use this to test your trained model locally on your PC.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from transformers import pipeline
|
| 8 |
+
|
| 9 |
+
def test_model():
    """Interactively classify comments typed at the prompt.

    Loads the text-classification pipeline from ``./model_output`` on CPU
    and prints the top label and its confidence for every comment entered.
    Typing ``quit``, or sending EOF / Ctrl-C at the prompt, ends the session.
    Returns early with a message if the model folder is missing or loading
    fails.
    """
    # 1. Path to your model folder
    # Change this to 'model_output_v2' if testing the new version
    model_path = "./model_output"

    if not os.path.exists(model_path):
        print(f"❌ Error: Model folder '{model_path}' not found.")
        print("Please ensure you have moved your Kaggle/Colab output into the 'backend' folder.")
        return

    print("🔄 Loading model (this may take a few seconds)...")
    try:
        # Load the toxicity classifier
        classifier = pipeline(
            "text-classification",
            model=model_path,
            tokenizer=model_path,
            device=-1  # Use -1 for CPU, 0 for first GPU
        )
        print("✅ Model loaded successfully!\n")
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        return

    print("Enter 'quit' to exit.")
    while True:
        try:
            text = input("\n📝 Enter a comment to test: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave quietly instead of with a traceback.
            print()
            break

        if text.lower() == 'quit':
            break
        if not text:
            continue

        # Get prediction (the pipeline returns a one-element list for one string)
        result = classifier(text)[0]
        label = result['label']
        score = result['score']

        # Map labels to human-readable text
        # LABEL_1 is usually Toxic, LABEL_0 is Safe
        is_toxic = "TOXIC 🔴" if label == "LABEL_1" else "SAFE 🟢"

        print("-" * 30)
        print(f"Result: {is_toxic}")
        print(f"Confidence: {score*100:.2f}%")
        print("-" * 30)
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
|
| 58 |
+
test_model()
|