Spaces:

tejesh916K
/

comment-guard-api

Sleeping

App Files Files Community

tejesh916K committed on Apr 7

Commit

b8300d6

0 Parent(s):

Deploy: Comment Guard API - FastAPI + MuRIL BERT

Browse files

Files changed (15) hide show

.dockerignore +6 -0
Dockerfile +16 -0
admin_manager.py +84 -0
clean_dataset.py +88 -0
data/bad_emojis.txt +94 -0
data/secure_words.bin +1 -0
data/telugu_badwords.txt +425 -0
export_badwords.py +50 -0
inspect_data.py +18 -0
kaggle_training_v3.py +327 -0
main.py +294 -0
merge_datasets.py +53 -0
requirements.txt +11 -0
train_model.py +418 -0
verify_model.py +58 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,6 @@

+__pycache__
+env/
+venv/
+.git
+.gitignore
+*.pyc

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Runtime image for the FastAPI service (started by the uvicorn CMD below).
+FROM python:3.9-slim
+WORKDIR /app
+# requirements.txt is copied and installed before the rest of the source is copied.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Pre-download the model to cache it in the image (optional but good for speed)
+# We can run a small python script to trigger the download or just let it download on first run.
+# For simplicity, we let it download on first run.
+COPY . .
+# NOTE(review): Hugging Face Docker Spaces default to port 7860 unless `app_port` is set in the
+# Space's README metadata — confirm the Space is configured for 8000.
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

admin_manager.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import base64
+import os
+import sys
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+PLAIN_FILE = os.path.join(DATA_DIR, "telugu_badwords.txt")
+SECURE_FILE = os.path.join(DATA_DIR, "secure_words.bin")
+def load_secure_words():
+    if not os.path.exists(SECURE_FILE):
+        return []
+    try:
+        with open(SECURE_FILE, "rb") as f:
+            encoded_data = f.read()
+            decoded_data = base64.b64decode(encoded_data).decode("utf-8")
+            return [w.strip() for w in decoded_data.splitlines() if w.strip()]
+    except Exception as e:
+        print(f"Error loading secure file: {e}")
+        return []
+def save_secure_words(words):
+    try:
+        content = "\n".join(words)
+        encoded_data = base64.b64encode(content.encode("utf-8"))
+        with open(SECURE_FILE, "wb") as f:
+            f.write(encoded_data)
+        print(f"Successfully saved {len(words)} words to secure storage.")
+    except Exception as e:
+        print(f"Error saving secure file: {e}")
+def migrate():
+    if not os.path.exists(PLAIN_FILE):
+        print(f"No plain text file found at {PLAIN_FILE}")
+        return
+    print(f"Migrating {PLAIN_FILE} to secure storage...")
+    with open(PLAIN_FILE, "r", encoding="utf-8") as f:
+        words = [line.strip() for line in f if line.strip() and not line.startswith("#")]
+    save_secure_words(words)
+    print("Migration complete. You can now safely delete the .txt file.")
+def view_words():
+    words = load_secure_words()
+    print(f"--- SECURE WORD LIST ({len(words)} words) ---")
+    for w in words:
+        print(w)
+    print("-------------------------------------------")
+def add_word(word):
+    words = load_secure_words()
+    if word in words:
+        print(f"'{word}' is already in the list.")
+        return
+    words.append(word)
+    save_secure_words(words)
+    print(f"Added '{word}'.")
+def remove_word(word):
+    words = load_secure_words()
+    if word not in words:
+        print(f"'{word}' not found in the list.")
+        return
+    words = [w for w in words if w != word]
+    save_secure_words(words)
+    print(f"Removed '{word}'.")
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python admin_manager.py [migrate|view|add <word>|remove <word>]")
+        sys.exit(1)
+    command = sys.argv[1]
+    if command == "migrate":
+        migrate()
+    elif command == "view":
+        view_words()
+    elif command == "add" and len(sys.argv) > 2:
+        add_word(sys.argv[2])
+    elif command == "remove" and len(sys.argv) > 2:
+        remove_word(sys.argv[2])
+    else:
+        print("Invalid command or missing argument.")

clean_dataset.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import traceback
+try:
+    import pandas as pd
+    path = 'data/training_data_telugu-hate.xlsx'
+    print(f"Loading {path}...")
+    df = pd.read_excel(path)
+    print(f"Original shape: {df.shape}")
+    # 1. Back up the original file just in case
+    df.to_excel('data/training_data_telugu-hate_backup.xlsx', index=False)
+    # Clean duplicates and nans
+    df = df.dropna(subset=['Comments', 'Label'])
+    df['Comments'] = df['Comments'].astype(str).str.strip()
+    df['Label'] = df['Label'].astype(str).str.strip().str.lower()
+    df = df[df['Label'].isin(['hate', 'non-hate'])]
+    df = df.drop_duplicates(subset=['Comments'], keep='first')
+    print(f"Shape after cleaning: {df.shape}")
+    # New words
+    toxic = [
+        "rey mental puku", "ni edava veshalu", "konda erri hook", "thu ni brathuku cheda", "panimashiva ra nuvu",
+        "erri puku nayala", "nuvu oka pedda jaffa", "siggu ledu ra neeku", "pichi pulka gadu", "waste fellow ra nuvu",
+        "dengay ra lathkor", "ni yamma kadupula koti", "adangi vedhava", "gudda balupu", "boku gadu vidu",
+        "rey chetha na kodaka", "poramboku nayala", "ni mokam chudu elagundo", "chapri gadu lanjodka", "lavada lo panulu",
+        "modda em kadu le", "pachi boothulu tidutha", "daridrudu", "tuppas gadu", "chavata chavata",
+        "mental gadu ra vidu", "sannasi", "bewarse gadu", "ne bondha ra ne bondha", "rey puku",
+        "vedava sannaasi", "guddalo em ledha", "ni amma", "ni abba", "rey lanjodoka", "addamina waste gadu",
+        "rotta gadu", "faltu gadu", "picha light teesko ra puku", "lathkor gadu", "erri pusa",
+        "bazar munda", "rey kojja nayala", "ni ayya ki cheppu", "solu gadu", "sollu cheppaku nayala",
+        "arey howle", "bhadcow gadu", "puka musko", "rey ni amma", "denga beta",
+        "ni puku lo na modda", "erri guda", "nuvvu oka waste puku", "ni yabba", "dunnapothu nayala",
+        "munda mokam", "sulli gadu", "arey erri", "pedda puku", "mental na kodaka", "lanja kodaka",
+        "ni amma ranku", "chethana kodaka", "musali puku", "gudda chimputha", "ni amma ninnu kaninda",
+        "rey neeku guddalo dammu leda", "ni mokam meda umma", "chepaleni boothulu", "thu ni bathuku", "kukka brathuku",
+        "ni bathuku bus stand", "picchi puku", "hook gani laga unnav", "gadida kodaka",
+        "donga puku", "munda edava", "musko ra jaffa", "bocchu gadu", "ni ayya puku", "naa modda guduvu",
+        "lavadalo comments", "item gani laga unnav", "loffer gadu", "ni face ki dippa okate takkuva", "pakodi gadu",
+        "mental hospital ki ellu", "rey pichi guda", "bithiri", "buffoon gadu", "420 gadu",
+        "ne kamma", "ni bondha pettu", "kothi na kodaka", "labor na kodaka", "signal daggara adukko", "Footpath gadu"
+    ]
+    safe = [
+        "super undi bro", "congrats macha", "all the best ra", "chala bagundi", "kekaa",
+        "thanks anna", "subram ga undi", "awesome work", "good job keep it up", "nice explanation",
+        "this is very helpful", "mee video lu ante chala ishtam", "first comment ra", "video super", "nice editing",
+        "super ga chepparu", "meeru inka goppavallu avvali", "waiting for next part", "good morning everyone", "have a nice day",
+        "really nice bro", "bhale cheppav", "good point", "manchi maata", "exactly macha",
+        "agreed", "well said anna", "proud of you", "jai hind", "super hit",
+        "very informative", "hats off to you", "good lesson learned", "superb acting", "next level",
+        "mind blowing performance", "keep soaring high", "bagundi", "baga chesaaru", "congratulations brother",
+        "so beautiful", "very nice song", "loved this", "manchi content idhi", "thank you so much",
+        "keep going", "amazing as always", "very true words", "good luck", "edo oka roju sadhistavu",
+        "meeru goppa anna", "salute anna", "inspiring video", "bhale tisaaru", "cinematography peaks",
+        "this made my day", "chala happy ga undi", "super star nvvu", "naaku idi chala use aindi", "respect",
+        "god bless you", "super anna", "keep doing videos", "nenu subscribe chesa", "like kottandi",
+        "miku manchi jargali", "great progress", "awesome efforts", "very nice tutorial", "fantastic",
+        "proud moment", "excellent work", "bhale undi kada", "super ga navvu", "nice smile",
+        "thanks for your support", "manchi advice", "helpful tips", "very clear", "super bro super",
+        "love from hyd", "amazing talent", "keep rocking", "gret job", "so soothing",
+        "wonderful video", "sweet comments", "very kind of you", "thank you akka", "wow super",
+        "masterpiece", "great info", "good stuff", "so positive", "happy for you", "best wishes",
+        "take care", "always supporting you", "superb explanation", "nice tutorial bro", "you are the best"
+    ]
+    # Map to new rows
+    new_rows = []
+    for t in toxic:
+        new_rows.append({'S.No': 'AUGMENTED_HATE', 'Comments': t, 'Label': 'hate'})
+    for s in safe:
+        new_rows.append({'S.No': 'AUGMENTED_SAFE', 'Comments': s, 'Label': 'non-hate'})
+    augment_df = pd.DataFrame(new_rows)
+    final_df = pd.concat([df, augment_df], ignore_index=True)
+    # Overwrite
+    final_df.to_excel(path, index=False)
+    print(f"Final shape: {final_df.shape}")
+    print("✅ Augmentation complete! Successfully wrote to Excel.")
+except Exception as e:
+    with open('error_log.txt', 'w') as f:
+        f.write(traceback.format_exc())
+    print("Script failed. See error_log.txt")

data/bad_emojis.txt ADDED Viewed

	@@ -0,0 +1,94 @@

+# Offensive Emojis Blacklist
+# Emojis that should be blocked in comments/chat
+# Add or remove as needed for your moderation policy
+# ── Offensive Gestures ────────────────────────────────────────────
+🖕
+🖕🏻
+🖕🏼
+🖕🏽
+🖕🏾
+🖕🏿
+🤙
+🤏
+# ── Threats / Violence / Weapons ─────────────────────────────────
+🔪
+🗡️
+🔫
+🪃
+💣
+🧨
+⚰️
+🪦
+☠️
+💀
+🩸
+🪓
+🏹
+⚔️
+🛡️
+# ── Aggression / Anger ───────────────────────────────────────────
+👊
+🤜
+🤛
+💢
+😡
+🤬
+😤
+👿
+😾
+# ── Explicit / Sexual Content ─────────────────────────────────────
+🍆
+🍑
+💦
+🔞
+🥵
+👅
+💋
+🍒
+🌮
+🌭
+🍌
+🍫
+🛏️
+🔑
+📸
+🩲
+🩳
+👙
+💊
+# ── Harassment / Mocking ─────────────────────────────────────────
+🤡
+🤢
+🤮
+💩
+🐷
+🐖
+🐮
+🐄
+🐒
+🙊
+🐸
+🐀
+🐁
+🦠
+🐛
+🪲
+🪳
+# ── Hate Symbols ─────────────────────────────────────────────────
+卐
+卍
+# ── Dangerous / Risk ─────────────────────────────────────────────
+🧪
+💉
+🩺
+☢️
+☣️
+⚠️
+🚨

data/secure_words.bin ADDED Viewed

	@@ -0,0 +1 @@

+ ZG9uZ2EKdmVkaGF2YQp2ZWRhdmEKcGFuZGkKa3Vra2EKbmVlIGFtbWEKbmVlIGFiYmEKY2hhdHRhCndhc3RlIGZlbGxvdwppdGVtCmxvdyBjbGFzcwpwaWNjaGkKZXJyaXBhcHBhCmVycmkKbXVuZGFtb3BpCmRhcmlkcnlhCnNhbmkKcGFuaWtpIG1hbGluYQp0aGlra2EKd29yc3QgZmVsbG93CmJsb29keSBmb29sCnVzZWxlc3MgZmVsbG93CmxhbmphCmxhbmpha29kYWthCmxhbmphIGtvZGFrYQptdW5kYQptb2RkYQpkZW5ndQpkZW5nZXkKZGVuZ3V0aGEKbmVlIHlhYmJhCmNoYXZhdGEKc2FubmFzaQpsdWNjaGEKaG93bGUKcHVrdQpwdWsKbWFkZGEKbGF2YWRhCmtvamphCmhpanJhCmJvY2NodQpuZSB5YW1tYQpuZSBheXlhCmJva3UKYmFkY293CmVycmkgcHVrdQpwaWNoaSBsYW5qYWtvZGFrYQpib2t1bG8KZ3VkZGEKbXVzYWxpCm5pIGJvbmRoYQpuaSBhYmJhCmNoZXR0YW5hIGtvZGFrYQpkdXJtYXJndWR1Cm5lZSBheXlhCmNoYXR0YSBuYSBrb2Rha2EKcGljaGkgcHVsa2EKZXJyaSBwdXNocGFtCndhc3RlIGdhZHUKbmUga2FtbWEKd2FzdGUgbmEga29kYWthCnBvcmFtYm9rdQpzaWdndSBsZW5pCmxhamphCnllcnJpCmJld2Fyc2kKYmV3YXJzCnBha29kaQpwdWxrYQpidWZmb29uCnNjb3VuZHJlbApyYXNjYWwKaWRpb3QKc3R1cGlkCmxvc2VyCmxvYWZlcgpyb3dkeQo0MjAKZG9uZ2FuYSBrb2Rha2EKbmVlIGZ1a3UKa29uZGEgZXJyaQpwb29rCnBvb2t1Cm1vZGRhbG8KbGF2YWRhbG8Kc3VsbGkKc3VsbGlnYQpsYWJvciBuYSBrb2Rha2EKY2hhcHJpCmNoYXByaSBnYWR1CmVycmlob29rCmhvb2sgZ2FkdQpiaGFkY293CmJoYWRrYXcKaG93bGEKamFmZmEKZ2FqdWxhdGhvCmtvamphIG5hIGtvZGFrYQpzaGlrYW5kaQpmYWtlIGdhZHUKZnJhdWQgZ2FkdQpkdW5uYXBvdGh1CmdhYWRpZGEKZ2FkaWRhCmJ1ZmZhbG8KbW9ua2V5CmtvdGhpCmtvdGhpIHZlZGhhdmEKc29sbHUKc29sdQpzb2xsdSBnYWR1CnZhZGh1cmEKb2RpeWFtbWEKeWFkYXZhCnllZGF2YQp0dWR1bXUKd2FzdGUgYm9keQpjaGV0aGEKY2hldHRhCnBlbmR1CnRyYXNoCmdhcmJhZ2UKZGlydHkgZmVsbG93Cm5hc3R5CmNoZWFwIGZlbGxvdwpsb3cgY2xhc3MgZmVsbG93CnRoaXJkIGNsYXNzCjNyZCBjbGFzcwo0dGggY2xhc3MKbWVudGFsbwpwc3ljaG8Kc2FkaXN0CnRodXB1awp3b3JzdCBnYWR1CnBpY2hpIG5hIGtvZGFrYQplcnJpIG5hIGtvZGFrYQpkb25nYSBuYSBrb2Rha2EKZG9uZ2EgbXVuZGEKcmFua3UgbXVuZGEKYmF6YXJ1IG11bmRhCmJhemFyIGRhbmEKcm9hZCBtZWVkYSB0aWdlIGRhbmEKdGlydWd1Ym90aHUKdGhpcnVndWJvdGh1CnRhYWd1Ym90aHUKdGFndWJvdGh1Cmp1bGUKanVsYXlpCmF2YWxhbmphCmFkZGFtaW5hCmFkZGFtaW5hIHBhbnVsdQpuZWVrdSBlbmR1a3UgcmEKbmVla3UgZW5kdWt1Cm11c3Vrb25pIGt1cmNobwptdXN1a28Kc2h1dCB1cApjbG9zZSB5b3VyIG1vdXRoCm5vcnUgbXV5eWkKbm9ydSBtdXN1a28Kbm90bG8KZ3VkZGFsbwpiYXN0aG
kKc2x1bQpzbHVtIGZlbGxvdwpsb2NhbCBnYWR1CnVuY2l2aWxpemVkCmJhcmJhcmlhbgpicnV0ZQpzYXZhZ2UKcmFrc2hhc3VkYQpyYWtzaGFzaQp3aXRjaApiaXRjaApzbHV0Cndob3JlCnByb3N0aXR1dGUKYmFzdGFyZAphc3Nob2xlCmZ1Y2tlcgptb3RoZXJmdWNrZXIKc2lzdGVyIGZ1Y2tlcgpicm90aGVyIGZ1Y2tlcgpmYXRoZXIgZnVja2VyCmRpY2sKY29jawpwdXNzeQpjdW50CnRpdHMKYm9vYnMKbmlwcGxlCnBlbmlzCnZhZ2luYQpmdWNrCmZ1Y2tpbmcKZnVja2VkCnNjcmV3ZWQKc2hhZ2dlZApodW1wZWQKY3JlYW1waWUKc3Blcm0Kc2VtZW4Kaml6egpzcHVuawpzcXVpcnQKaG9ybnkKcmFwZQptb2xlc3QKaGFyYXNzCmFzc2F1bHQKYWJ1c2UKdmlvbGF0ZQpkZWdyYWRlCmh1bWlsaWF0ZQpzdWljaWRlCmt5cwpjaG9rZQpzdHJhbmdsZQpzdWZmb2NhdGUKc2xhcApzcGl0CnNoaXQKZmlsdGgKZ3JpbWUKbXVjawpzbGltZQpzY3VtCnZlcm1pbgpwZXN0CnBhcmFzaXRlCmxlZWNoCm1hZ2dvdApqYWNrYXNzCm11bGUKb3gKYnVsbAp2dWx0dXJlCnNuYWtlCmxpemFyZApiYXN0aGkgZ2FkdQpiYXN0aGkgZmVsbG93CnJvYWQgZmVsbG93CnJvYWQgZ2FkdQpzdHJlZXQgZmVsbG93CnBhdmVtZW50IGZlbGxvdwpmb290cGF0aCBnYWR1CnNpZ25hbCBnYWR1CnRyYWZmaWMgZmVsbG93CmF1dG8gZ2FkdQpyaWtzaGEgZ2FkdQpjb29saWUKY29vbGllIGdhZHUKbGFib3IgZ2FkdQpzd2VlcGVyIGdhZHUKZ2FyYmFnZSBnYWR1CmR1c3RiaW4gZ2FkdQp0b2lsZXQgZ2FkdQpndXR0ZXIgZ2FkdQpkcmFpbiBnYWR1CnNld2VyIGdhZHUKbWFuaG9sZSBnYWR1Cm5lZSBpbnRsbwpuZWUgaW50aSB2YWx1Cm5lZSBmYW1pbHkKbmVlIHBhcmVudHMKbmVlIGZhdGhlcgpuZWUgbW90aGVyCm5lZSBzaXN0ZXIKbmVlIGJyb3RoZXIKbmVlIHdpZmUKbmVlIGh1c2JhbmQKZ3VkZGEgbG8KcHVrdSBsbwptb2RkYSBsbwpsYXZhZGEgbG8KYm9ra2EgbG8Kbm90bG8gcGV0dGkKZ3VkZGFsbyBwZXR0aQpwdWt1bG8gcGV0dGkKbW9kZGFsbyBwZXR0aQpkZW5ndXRhbnUKZGVuZ2VzdGEKZGVuZ2FsaQpkZW5naWNodWtvCmRlbmdleSByYQpkZW5nZXkgbGUKZGVuZ2lwb3RoYQpkZW5naXBveWEKZGVuZ2lwb3lpbmEKZGVuZ2luY2h1a3VubmEKZGVuZ2ljaHVrdW50dW5uYQpkZW5ndXR1bm5hCmRlbmd1dHVubmFudQpkZW5ndXR1bm5hdgpkZW5ndXR1bm5hZHUKZGVuZ3V0dW5uYWRpCmRlbmd1dHVubmFtCmRlbmd1dHVubmFydQpkZW5ndXR1bm5haQpkZW5nYW51CmRlbmdhdgpkZW5nYWR1CmRlbmdpbmRpCmRlbmdhbQpkZW5nYXJ1CmRlbmdhaQpkZW5naW5hCmRlbmdpbmF2CmRlbmdpbmFkdQpkZW5naW5hZGkKZGVuZ2luYW0KZGVuZ2luYXJ1CmRlbmdpbmFpCmRlbmdlc2FudQpkZW5nZXNhdgpkZW5nZXNhZHUKZGVuZ2VzYWRpCmRlbmdlc2FtCmRlbmdlc2FydQpkZW5nZXNhaQpkZW5nZXN0YXYKZGVuZ2VzdGFkdQpkZW5nZXN0YWRpCmRlbmdlc3RhbQpkZW5nZXN0YXJ1CmRlbmdlc3
RhaQpkZW5ndXRhdgpkZW5ndXRhZHUKZGVuZ3V0YWRpCmRlbmd1dGFtCmRlbmd1dGFydQpkZW5ndXRhaQpwdWt1bG8gZGVuZ3V0YW51Cmd1ZGRhbG8gZGVuZ3V0YW51Cm5vdGxvIGRlbmd1dGFudQpib2trYWxvIGRlbmd1dGFudQpsYXZhZGFsbyBkZW5ndXRhbnUKbW9kZGFsbyBkZW5ndXRhbnUKcHVrdSBkZW5ndXRhbnUKZ3VkZGEgZGVuZ3V0YW51CmJva2thIGRlbmd1dGFudQpsYXZhZGEgZGVuZ3V0YW51Cm1vZGRhIGRlbmd1dGFudQpub3J1IGRlbmd1dGFudQpwdWt1IHJhCmd1ZGRhIHJhCmJva2thIHJhCmxhdmFkYSByYQptb2RkYSByYQpub3J1IHJhCnB1a3UgbGUKZ3VkZGEgbGUKYm9ra2EgbGUKbGF2YWRhIGxlCm1vZGRhIGxlCm5vcnUgbGUKcHVrdSBsYW5qYQpndWRkYSBsYW5qYQpib2trYSBsYW5qYQpsYXZhZGEgbGFuamEKbW9kZGEgbGFuamEKbm9ydSBsYW5qYQpwdWt1IGtvZGFrYQpndWRkYSBrb2Rha2EKYm9ra2Ega29kYWthCmxhdmFkYSBrb2Rha2EKbW9kZGEga29kYWthCm5vcnUga29kYWthCnB1a3UgbXVuZGEKZ3VkZGEgbXVuZGEKYm9ra2EgbXVuZGEKbGF2YWRhIG11bmRhCm1vZGRhIG11bmRhCm5vcnUgbXVuZGEKcHVrdSBkb25nYQpndWRkYSBkb25nYQpib2trYSBkb25nYQpsYXZhZGEgZG9uZ2EKbW9kZGEgZG9uZ2EKbm9ydSBkb25nYQpwdWt1IGVycmkKZ3VkZGEgZXJyaQpib2trYSBlcnJpCmxhdmFkYSBlcnJpCm1vZGRhIGVycmkKbm9ydSBlcnJpCnB1a3UgcGljY2hpCmd1ZGRhIHBpY2NoaQpib2trYSBwaWNjaGkKbGF2YWRhIHBpY2NoaQptb2RkYSBwaWNjaGkKbm9ydSBwaWNjaGkKcHVrdSB3YXN0ZQpndWRkYSB3YXN0ZQpib2trYSB3YXN0ZQpsYXZhZGEgd2FzdGUKbW9kZGEgd2FzdGUKbm9ydSB3YXN0ZQpwdWt1bG8gcGV0dGkgZGVuZ3V0YW51Cmd1ZGRhbG8gcGV0dGkgZGVuZ3V0YW51Cm5vdGxvIHBldHRpIGRlbmd1dGFudQpib2trYWxvIHBldHRpIGRlbmd1dGFudQpsYXZhZGFsbyBwZXR0aSBkZW5ndXRhbnUKbW9kZGFsbyBwZXR0aSBkZW5ndXRhbnUKeW91IGFyZSBzdHVwaWQKeW91IGFyZSBhbiBpZGlvdAp5b3UncmUgc28gZHVtYgp3aGF0IGEgbG9zZXIKaSB3aWxsIGZpbmQgeW91CnlvdSBkZXNlcnZlIHRvIGRpZQppIGhhdGUgeW91CnlvdSdyZSBkaXNndXN0aW5nCm5vYm9keSBsaWtlcyB5b3UKeW91J3JlIHBhdGhldGljCmdldCBsb3N0Cm5vYm9keSBhc2tlZAp5b3UncmUgd29ydGhsZXNzCnlvdSdyZSB0cmFzaApraWxsIHlvdXJzZWxmCnlvdSdyZSB1Z2x5CnlvdSdyZSBhbm5veWluZwpnbyB0byBoZWxsCnN0dXBpZCBnYSB1bm5hdgp0aGlzIGlzIGdhcmJhZ2UKbm9ib2R5IGFza2VkIGZvciB5b3VyIG9waW5pb24K

data/telugu_badwords.txt ADDED Viewed

	@@ -0,0 +1,425 @@

+donga
+vedhava
+vedava
+pandi
+kukka
+nee amma
+nee abba
+chatta
+waste fellow
+item
+low class
+picchi
+erripappa
+erri
+mundamopi
+daridrya
+sani
+paniki malina
+thikka
+worst fellow
+bloody fool
+useless fellow
+lanja
+lanjakodaka
+lanja kodaka
+munda
+modda
+dengu
+dengey
+dengutha
+nee yabba
+chavata
+sannasi
+luccha
+howle
+puku
+puk
+madda
+lavada
+kojja
+hijra
+bocchu
+ne yamma
+ne ayya
+boku
+badcow
+erri puku
+pichi lanjakodaka
+bokulo
+gudda
+musali
+ni bondha
+ni abba
+chettana kodaka
+durmargudu
+nee ayya
+chatta na kodaka
+pichi pulka
+erri pushpam
+waste gadu
+ne kamma
+waste na kodaka
+poramboku
+siggu leni
+lajja
+yerri
+bewarsi
+bewars
+pakodi
+pulka
+buffoon
+scoundrel
+rascal
+idiot
+stupid
+loser
+loafer
+rowdy
+420
+dongana kodaka
+nee fuku
+konda erri
+pook
+pooku
+moddalo
+lavadalo
+sulli
+sulliga
+labor na kodaka
+chapri
+chapri gadu
+errihook
+hook gadu
+bhadcow
+bhadkaw
+howla
+jaffa
+gajulatho
+kojja na kodaka
+shikandi
+fake gadu
+fraud gadu
+dunnapothu
+gaadida
+gadida
+buffalo
+monkey
+kothi
+kothi vedhava
+sollu
+solu
+sollu gadu
+vadhura
+odiyamma
+yadava
+yedava
+tudumu
+waste body
+chetha
+chetta
+pendu
+trash
+garbage
+dirty fellow
+nasty
+cheap fellow
+low class fellow
+third class
+3rd class
+4th class
+mentalo
+psycho
+sadist
+thupuk
+worst gadu
+pichi na kodaka
+erri na kodaka
+donga na kodaka
+donga munda
+ranku munda
+bazaru munda
+bazar dana
+road meeda tige dana
+tirugubothu
+thirugubothu
+taagubothu
+tagubothu
+jule
+julayi
+avalanja
+addamina
+addamina panulu
+neeku enduku ra
+neeku enduku
+musukoni kurcho
+musuko
+shut up
+close your mouth
+noru muyyi
+noru musuko
+notlo
+guddalo
+basthi
+slum
+slum fellow
+local gadu
+uncivilized
+barbarian
+brute
+savage
+rakshasuda
+rakshasi
+witch
+bitch
+slut
+whore
+prostitute
+bastard
+asshole
+fucker
+motherfucker
+sister fucker
+brother fucker
+father fucker
+dick
+cock
+pussy
+cunt
+tits
+boobs
+nipple
+penis
+vagina
+fuck
+fucking
+fucked
+screwed
+shagged
+humped
+creampie
+sperm
+semen
+jizz
+spunk
+squirt
+horny
+rape
+molest
+harass
+assault
+abuse
+violate
+degrade
+humiliate
+suicide
+kys
+choke
+strangle
+suffocate
+slap
+spit
+shit
+filth
+grime
+muck
+slime
+scum
+vermin
+pest
+parasite
+leech
+maggot
+jackass
+mule
+ox
+bull
+vulture
+snake
+lizard
+basthi gadu
+basthi fellow
+road fellow
+road gadu
+street fellow
+pavement fellow
+footpath gadu
+signal gadu
+traffic fellow
+auto gadu
+riksha gadu
+coolie
+coolie gadu
+labor gadu
+sweeper gadu
+garbage gadu
+dustbin gadu
+toilet gadu
+gutter gadu
+drain gadu
+sewer gadu
+manhole gadu
+nee intlo
+nee inti valu
+nee family
+nee parents
+nee father
+nee mother
+nee sister
+nee brother
+nee wife
+nee husband
+gudda lo
+puku lo
+modda lo
+lavada lo
+bokka lo
+notlo petti
+guddalo petti
+pukulo petti
+moddalo petti
+dengutanu
+dengesta
+dengali
+dengichuko
+dengey ra
+dengey le
+dengipotha
+dengipoya
+dengipoyina
+denginchukunna
+dengichukuntunna
+dengutunna
+dengutunnanu
+dengutunnav
+dengutunnadu
+dengutunnadi
+dengutunnam
+dengutunnaru
+dengutunnai
+denganu
+dengav
+dengadu
+dengindi
+dengam
+dengaru
+dengai
+dengina
+denginav
+denginadu
+denginadi
+denginam
+denginaru
+denginai
+dengesanu
+dengesav
+dengesadu
+dengesadi
+dengesam
+dengesaru
+dengesai
+dengestav
+dengestadu
+dengestadi
+dengestam
+dengestaru
+dengestai
+dengutav
+dengutadu
+dengutadi
+dengutam
+dengutaru
+dengutai
+pukulo dengutanu
+guddalo dengutanu
+notlo dengutanu
+bokkalo dengutanu
+lavadalo dengutanu
+moddalo dengutanu
+puku dengutanu
+gudda dengutanu
+bokka dengutanu
+lavada dengutanu
+modda dengutanu
+noru dengutanu
+puku ra
+gudda ra
+bokka ra
+lavada ra
+modda ra
+noru ra
+puku le
+gudda le
+bokka le
+lavada le
+modda le
+noru le
+puku lanja
+gudda lanja
+bokka lanja
+lavada lanja
+modda lanja
+noru lanja
+puku kodaka
+gudda kodaka
+bokka kodaka
+lavada kodaka
+modda kodaka
+noru kodaka
+puku munda
+gudda munda
+bokka munda
+lavada munda
+modda munda
+noru munda
+puku donga
+gudda donga
+bokka donga
+lavada donga
+modda donga
+noru donga
+puku erri
+gudda erri
+bokka erri
+lavada erri
+modda erri
+noru erri
+puku picchi
+gudda picchi
+bokka picchi
+lavada picchi
+modda picchi
+noru picchi
+puku waste
+gudda waste
+bokka waste
+lavada waste
+modda waste
+noru waste
+pukulo petti dengutanu
+guddalo petti dengutanu
+notlo petti dengutanu
+bokkalo petti dengutanu
+lavadalo petti dengutanu
+moddalo petti dengutanu
+you are stupid
+you are an idiot
+you're so dumb
+what a loser
+i will find you
+you deserve to die
+i hate you
+you're disgusting
+nobody likes you
+you're pathetic
+get lost
+nobody asked
+you're worthless
+you're trash
+kill yourself
+you're ugly
+you're annoying
+go to hell
+stupid ga unnav
+this is garbage
+nobody asked for your opinion

export_badwords.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import os
+import base64
+import pandas as pd
+from pathlib import Path
+def export_badwords_to_excel(output_filename="custom_badwords_dataset.xlsx"):
+    data_dir = Path("data")
+    toxic_words = []
+    # 1. Load regular badwords
+    p1 = data_dir / "telugu_badwords.txt"
+    if p1.exists():
+        with open(p1, "r", encoding="utf-8") as f:
+            toxic_words.extend([l.strip() for l in f if l.strip()])
+    # 2. Load secure base64 badwords
+    p2 = data_dir / "secure_words.bin"
+    if p2.exists():
+        with open(p2, "rb") as f:
+            decoded = base64.b64decode(f.read()).decode("utf-8")
+            toxic_words.extend([l.strip() for l in decoded.splitlines() if l.strip()])
+    # 3. Load bad emojis
+    p3 = data_dir / "bad_emojis.txt"
+    if p3.exists():
+        with open(p3, "r", encoding="utf-8") as f:
+            toxic_words.extend([l.strip() for l in f if l.strip() and not l.strip().startswith("#")])
+    # Remove duplicates
+    toxic_words = list(set(toxic_words))
+    print(f"Total unique offensive terms gathered: {len(toxic_words)}")
+    if not toxic_words:
+        print("No words found to export.")
+        return
+    # Create a DataFrame
+    # Here we are just exporting the raw words as 'toxic'
+    df = pd.DataFrame({
+        'text': toxic_words,
+        'label': 'toxic'
+    })
+    # Save to Excel
+    output_path = data_dir / output_filename
+    df.to_excel(output_path, index=False)
+    print(f"Successfully exported {len(toxic_words)} words to {output_path}")
+# Run the export with the default output filename when executed as a script.
+if __name__ == "__main__":
+    export_badwords_to_excel()

inspect_data.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import pandas as pd
+import sys
+with open('inspect_out.txt', 'w', encoding='utf-8') as f:
+    f.write("Loading dataset...\n")
+    try:
+        df = pd.read_excel('data/training_data_telugu-hate.xlsx')
+        f.write("Columns: " + str(df.columns.tolist()) + "\n")
+        f.write("Shape: " + str(df.shape) + "\n")
+        if 'label' in df.columns:
+            f.write("Value Counts for 'label':\n" + str(df['label'].value_counts()) + "\n")
+        f.write("\nFirst 5 rows:\n")
+        f.write(str(df.head()) + "\n")
+        # Look for missing values
+        f.write("\nMissing Values:\n" + str(df.isnull().sum()) + "\n")
+    except Exception as e:
+        f.write("Error: " + str(e) + "\n")

kaggle_training_v3.py ADDED Viewed

	@@ -0,0 +1,327 @@

+"""
+KAGGLE MODEL V3: Aiming for 90%+ Accuracy without Overfitting
+Optimizations:
+1. Increased Dataset Size: More diverse templates and safe phrases for data augmentation.
+2. Data Text Cleaning: Removed URLs, extra spaces, and user mentions to reduce noise.
+3. Class Balancing: Automatically oversamples the minority class to perfectly balance the dataset.
+4. Overfitting Prevention: Added Label Smoothing, Cosine Learning Rate Scheduler,
+   Warmup steps, and appropriate Weight Decay.
+5. Model: Using 'google/muril-base-cased' which is highly optimized for Indian languages
+   including Telugu, better for code-mixed text. Added custom dropout to config.
+"""
+import os
+import sys
+import json
+import base64
+import random
+import re
+from pathlib import Path
+# Switch stdout to UTF-8 when the stream supports reconfigure(); any failure is
+# ignored. (This sets the encoding only — the prints below pass flush=True
+# themselves to get immediate output.)
+try:
+    if hasattr(sys.stdout, 'reconfigure'):
+        sys.stdout.reconfigure(encoding='utf-8')
+except Exception:
+    pass
+print("DEBUG: Kaggle V3 Training Script started", flush=True)
+# ── Paths ────────────────────────────────────────────────────────────────────
+KAGGLE_INPUT = Path("/kaggle/input")
+KAGGLE_OUTPUT = Path("/kaggle/working")
+# DATA_DIR is resolved in three steps, first match wins:
+#   1. a direct sub-directory of KAGGLE_INPUT holding a "*training_data*" entry,
+#   2. the parent of any "*training_data*" entry found recursively,
+#   3. the fixed fallback KAGGLE_INPUT / "comment-guard-data".
+DATA_DIR = None
+print(f"DEBUG: Checking for data in {KAGGLE_INPUT}...", flush=True)
+for p in KAGGLE_INPUT.glob("*"):
+    if p.is_dir() and any(p.glob("*training_data*")):
+        DATA_DIR = p
+        break
+if not DATA_DIR:
+    for p in KAGGLE_INPUT.rglob("*training_data*"):
+        DATA_DIR = p.parent
+        break
+if not DATA_DIR:
+    # Fallback path; its existence is only checked later in the script.
+    DATA_DIR = KAGGLE_INPUT / "comment-guard-data"
+OUTPUT_DIR = KAGGLE_OUTPUT / "model_output_v3"
+# ── Dependencies ─────────────────────────────────────────────────────────────
+try:
+    import torch
+    import transformers
+    from transformers import (
+        AutoTokenizer,
+        AutoModelForSequenceClassification,
+        AutoConfig,
+        TrainingArguments,
+        Trainer,
+        EarlyStoppingCallback
+    )
+    import pandas as pd
+    import openpyxl
+    import sklearn
+    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+    import numpy as np
+    from torch.utils.data import Dataset as TorchDataset
+    from sklearn.model_selection import train_test_split
+except ImportError:
+    print("⚠ Please run: !pip install transformers torch scikit-learn accelerate openpyxl pandas -q")
+    sys.exit(1)
+# ── Config ────────────────────────────────────────────────────────────────────
+# NOTE(review): apart from BASE_MODEL and MAX_LENGTH, these values are presumably
+# consumed by the training setup further down the file — not visible here.
+BASE_MODEL    = "google/muril-base-cased" # Great for Telugu/Code-mixed
+MAX_LENGTH    = 128     # tokenizer max_length; used together with truncation=True
+EPOCHS        = 10      # High max epochs, relying on early stopping
+LEARNING_RATE = 2e-5
+WEIGHT_DECAY  = 0.05
+LABEL_SMOOTHING = 0.1   # Helps prevent overfitting by softening labels
+WARMUP_RATIO  = 0.1     # Gradual learning rate increase
+# ── Functions ────────────────────────────────────────────────────────────────
+def clean_text(text):
+    text = str(text).lower()
+    text = re.sub(r'http\S+', '', text) # Remove URLs
+    text = re.sub(r'@\w+', '', text) # Remove mentions
+    text = re.sub(r'#\w+', '', text) # Remove hashtags
+    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
+    return text.strip()
+def is_code_mixed(text):
+    text = str(text)
+    has_latin = any('\u0041' <= c <= '\u007A' for c in text)
+    total = len([c for c in text if c.strip()])
+    # Simply require that it has some Latin characters (English alphabet)
+    if total == 0 or not has_latin: return False
+    return True
+def load_data(files):
+    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}
+    frames = []
+    TEXT_NAMES  = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
+    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}
+    for excel_file in files:
+        try:
+            if excel_file.suffix == '.csv':
+                df = pd.read_csv(excel_file)
+                sheets_data = [('csv', df)]
+            else:
+                xl = pd.ExcelFile(excel_file)
+                sheets_data = [(sheet, xl.parse(sheet)) for sheet in xl.sheet_names]
+            for sheet, df in sheets_data:
+                text_col = next((c for c in df.columns if str(c).lower() in TEXT_NAMES or any(t in str(c).lower() for t in ['text', 'comment', 'sentence'])), None)
+                label_col = next((c for c in df.columns if str(c).lower() in LABEL_NAMES or any(t in str(c).lower() for t in ['label', 'categor', 'class'])), None)
+                if text_col and label_col:
+                    sub = df[[text_col, label_col]].copy()
+                    sub.columns = ['text', 'label']
+                    sub = sub.dropna()
+                    sub['text'] = sub['text'].apply(clean_text)
+                    sub['label_int'] = sub['label'].astype(str).str.strip().str.lower().apply(lambda x: 1 if x in hate_labels_set else 0)
+                    sub = sub[sub['text'].apply(is_code_mixed)].reset_index(drop=True)
+                    frames.append(sub)
+        except Exception as e:
+            print(f"Error loading {excel_file}: {e}")
+            pass
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=['text', 'label', 'label_int'])
+def load_badwords_augmented():
+    """V3: Massively expanded safe phrases and toxic templates to increase dataset robustness."""
+    toxic_words = []
+    p1, p2, p3 = DATA_DIR / "telugu_badwords.txt", DATA_DIR / "secure_words.bin", DATA_DIR / "bad_emojis.txt"
+    if p1.exists():
+        with open(p1, "r", encoding="utf-8") as f: toxic_words.extend([l.strip() for l in f if l.strip()])
+    if p2.exists():
+        with open(p2, "rb") as f: toxic_words.extend([l.strip() for l in base64.b64decode(f.read()).decode("utf-8").splitlines() if l.strip()])
+    if p3.exists():
+        with open(p3, "r", encoding="utf-8") as f: toxic_words.extend([l.strip() for l in f if l.strip() and not l.strip().startswith("#")])
+    if not toxic_words: return pd.DataFrame()
+    random.seed(42)
+    # Increased variety
+    toxic_templates = [
+        "{word}", "you are a {word}", "{word} ga unnav", "enti ra {word}",
+        "nuvvu {word}", "{word} fellow", "worst {word}", "rey {word}",
+        "ni yamma {word} nayala", "nuvvu pedda {word}", "chi {word} badava",
+        "endira ee {word} panulu", "tuppas {word} mokam", "nee lanti {word} inka evaru leru"
+    ]
+    safe_phrases = [
+        "bagundi bro", "keep it up", "manchi video", "super explanation", "thanks for sharing",
+        "helpful information", "nice edit", "waiting for next video", "super ga undi",
+        "love from ap", "good job", "congratulations brother", "beautiful video", "awesome music",
+        "next video eppudu?", "very interesting topic", "I learned a lot today", "nice talk",
+        "informative content", "meeru chala baga chepparu", "meeru chala handsome", "super anna",
+        "daily chustanu mee videos", "proud of you", "all the best for your future", "fantastic editing",
+        "thank you so much", "very nice presentation", "please upload more", "hello everyone",
+        "good morning brother", "have a great day ahead", "chala upayoga padindi", "excellent work"
+    ]
+    rows = []
+    for word in list(set(toxic_words)):
+        # Generate 4 toxic examples per word
+        for t in random.sample(toxic_templates, min(4, len(toxic_templates))):
+            rows.append({'text': t.format(word=word), 'label_int': 1})
+        # Generate 4 safe examples to match
+        for _ in range(4):
+            rows.append({'text': random.choice(safe_phrases), 'label_int': 0})
+    return pd.DataFrame(rows)
+# ── Main Execution ───────────────────────────────────────────────────────────
+if not DATA_DIR.exists():
+    print(f"✗ ERROR: DATA_DIR {DATA_DIR} not found. Ensure dataset is added to notebook.")
+    sys.exit(1)
+train_files = [f for f in DATA_DIR.iterdir() if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]
+all_data = load_data(train_files)
+aug_data = load_badwords_augmented()
+if not aug_data.empty:
+    all_data = pd.concat([all_data, aug_data], ignore_index=True)
+all_data = all_data.drop_duplicates(subset='text').reset_index(drop=True)
+# V3: DYNAMIC OVERSAMPLING & BALANCING
+counts = all_data['label_int'].value_counts()
+if len(counts) == 2:
+    majority_class = counts.idxmax()
+    minority_class = counts.idxmin()
+    majority_count = counts[majority_class]
+    minority_count = counts[minority_class]
+    if minority_count < majority_count:
+        df_majority = all_data[all_data['label_int'] == majority_class]
+        df_minority = all_data[all_data['label_int'] == minority_class]
+        # Oversample minority
+        df_minority_over = df_minority.sample(majority_count, replace=True, random_state=42)
+        all_data = pd.concat([df_majority, df_minority_over], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
+        print(f"DEBUG: Oversampled class {minority_class} to {majority_count}. Total rows symmetrically balanced: {len(all_data)}")
+# Train/Test Split
+train_df, test_df = train_test_split(all_data, test_size=0.10, random_state=42, stratify=all_data['label_int'])
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+# Incorporating Dropout into config to prevent overfitting
+config = AutoConfig.from_pretrained(BASE_MODEL, num_labels=2, problem_type="single_label_classification")
+config.hidden_dropout_prob = 0.2
+config.attention_probs_dropout_prob = 0.2
+model = AutoModelForSequenceClassification.from_pretrained(
+    BASE_MODEL,
+    config=config,
+    ignore_mismatched_sizes=True
+)
+class CommentDataset(TorchDataset):
+    def __init__(self, texts, labels):
+        self.texts = texts # Store raw texts as well
+        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
+        self.labels = labels
+    def __len__(self): return len(self.labels)
+    def __getitem__(self, idx):
+        item = {k: v[idx] for k, v in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
+        return item
+train_dataset = CommentDataset(train_df['text'].tolist(), train_df['label_int'].tolist())
+test_dataset  = CommentDataset(test_df['text'].tolist(), test_df['label_int'].tolist())
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    preds = np.argmax(logits, axis=-1)
+    return {
+        'accuracy': accuracy_score(labels, preds),
+        'f1': f1_score(labels, preds, zero_division=0),
+        'precision': precision_score(labels, preds, zero_division=0),
+        'recall': recall_score(labels, preds, zero_division=0),
+    }
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+training_args = TrainingArguments(
+    output_dir=str(OUTPUT_DIR),
+    num_train_epochs=EPOCHS,
+    per_device_train_batch_size=16 if device == 'cuda' else 8,
+    per_device_eval_batch_size=32 if device == 'cuda' else 8,
+    learning_rate=LEARNING_RATE,
+    weight_decay=WEIGHT_DECAY,
+    warmup_ratio=WARMUP_RATIO,
+    lr_scheduler_type='cosine', # Cosine learning rate scheduler helps avoid overfitting and local minima
+    label_smoothing_factor=LABEL_SMOOTHING, # Distributes a bit of probability mass to other classes, reducing overconfidence
+    eval_strategy="epoch",
+    save_strategy="no",          # CHANGED: Don't save checkpoints to prevent KAGGLE STORAGE OVERFLOW
+    load_best_model_at_end=False, # CHANGED: Must be false if we aren't saving checkpoints
+    metric_for_best_model="f1",
+    report_to="none",
+    fp16=(device == 'cuda'),
+    logging_steps=50,
+)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=test_dataset,
+    compute_metrics=compute_metrics,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
+)
+print(f"Starting V3 training on {device}...")
+trainer.train()
+# Evaluate & Print Results
+print("\n📊 EVALUATING MODEL V3...")
+results = trainer.evaluate()
+print(f"\n{'='*50}\n🏆 V3 FINAL ACCURACY: {results.get('eval_accuracy', 0)*100:.2f}%\n{'='*50}")
+# --- CRITICAL KAGGLE STORAGE FIX ---
+# Free up disk space before saving by clearing the HuggingFace cache and previous runs
+print("\n🧹 Clearing disk space...")
+import shutil
+import gc
+# 1. Clear large dataframes and run garbage collection
+del all_data, train_df, test_df, train_dataset, test_dataset
+gc.collect()
+# 2. Clear known cache directories
+for cache_path in [".cache/huggingface", ".cache/torch"]:
+    cache_dir = Path.home() / cache_path
+    if cache_dir.exists():
+        try:
+            shutil.rmtree(cache_dir)
+            print(f"✅ Cleared {cache_dir}")
+        except Exception as e:
+            pass
+# 3. Aggressively delete OLD model outputs in /kaggle/working to free up 100s of MBs
+for old_dir in ["model_output", "model_output_v2", "wandb"]:
+    old_path = KAGGLE_OUTPUT / old_dir
+    if old_path.exists():
+        try:
+            shutil.rmtree(old_path)
+            print(f"✅ Deleted old directory: {old_path}")
+        except Exception as e:
+            pass
+# Save
+try:
+    trainer.save_model(str(OUTPUT_DIR))
+    tokenizer.save_pretrained(str(OUTPUT_DIR))
+    with open(OUTPUT_DIR / "eval_results.json", 'w') as f: json.dump(results, f, indent=2)
+    print(f"✅ Model saved successfully to: {OUTPUT_DIR}")
+except OSError as e:
+    print(f"\n❌ FATAL SAVING ERROR: {e}")
+    print("Kaggle ran out of disk space again! Try restarting your session or using a smaller BASE_MODEL.")

main.py ADDED Viewed

	@@ -0,0 +1,294 @@

+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from transformers import pipeline
+from better_profanity import profanity
+from typing import List, Dict
+import re
+# Mild/acceptable words that better_profanity should NOT flag.
+# They are passed to the library through its whitelist_words parameter below.
+MILD_WORDS_WHITELIST = [
+    "damn", "hell", "crap", "dang", "heck", "shoot", "frick", "freaking",
+    "sucks", "suck", "bloody", "piss", "pissed",
+]
+# Load the profanity word list with the mild words whitelisted. This runs at import
+# time, before the custom words are added further down in this module.
+profanity.load_censor_words(whitelist_words=MILD_WORDS_WHITELIST)
+# Superset of the mild words plus multi-word safe phrases; used by the manual
+# strip-and-recheck fallback.
+PROFANITY_WHITELIST = set(MILD_WORDS_WHITELIST) | {"keep it up", "great post"}
+# One pre-compiled, case-insensitive, word-bounded pattern per whitelisted entry
+# (re.escape keeps each phrase literal).
+PROFANITY_WHITELIST_PATTERNS = {word: re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE) for word in PROFANITY_WHITELIST}
+def is_whitelisted(text: str) -> bool:
+    """Check if the text only triggers profanity due to whitelisted mild words."""
+    cleaned = text.lower()
+    for pattern in PROFANITY_WHITELIST_PATTERNS.values():
+        cleaned = pattern.sub("", cleaned)
+    return not profanity.contains_profanity(cleaned)
+# Keyword-based insult/threat detector to catch what the ML model misses.
+# Each entry is a regex fragment; all fragments are OR-ed into INSULT_PATTERN below.
+# Unicode apostrophe class ['‘’] handles both ASCII (') and curly (’) apostrophes;
+# the trailing "?" also accepts the form without any apostrophe.
+INSULT_KEYWORDS = [
+    # --- English insults / threats ---
+    r"\byou['‘’]?re so dumb\b",
+    r"\bwhat a loser\b",
+    r"\bi will find you\b",
+    r"\byou deserve to die\b",
+    r"\bi hate you\b",
+    r"\byou['‘’]?re disgusting\b",
+    r"\bnobody likes you\b",
+    r"\byou['‘’]?re pathetic\b",
+    r"\bget lost\b",
+    r"\bnobody asked\b",
+    r"\byou['‘’]?re worthless\b",
+    r"\byou['‘’]?re trash\b",
+    r"\bkill yourself\b",
+    r"\bgo kill yourself\b",
+    r"\byou['‘’]?re ugly\b",
+    r"\bshut up\b",
+    r"\byou['‘’]?re annoying\b",
+    r"\bgo to hell\b",
+    r"\bstupid ga\b",
+    r"\bwaste fellow\b",
+    r"\byou['‘’]?re an idiot\b",
+    r"\bthis is garbage\b",
+    r"\byou are stupid\b",
+    r"\byou are an idiot\b",
+    r"\byou['‘’]?re dumb\b",
+    r"\bstupid idiot\b",
+    r"\bbloody fool\b",
+    # --- Telugu-English compound insults: [insult word] + gadu/fellow/vaadu ---
+    # NOTE: each pair of adjacent literals below has no comma between its two lines,
+    # so Python concatenates them into ONE pattern (word group + suffix group).
+    r"\b(?:buffalo|monkey|mental|psycho|cheap|nasty|dirty|useless|worst|scoundrel)"
+    r"\s+(?:gadu|fellow|vaadu|ra)\b",
+    r"\b(?:rascal|buffoon|loafer|fraud|basthi|chapri|local|rowdy|420|kothi|waste)"
+    r"\s+(?:gadu|fellow|vaadu|ra)\b",
+    r"\b(?:third\s+class|low\s+class|third-class|low-class)\s+(?:gadu|fellow|vaadu)\b",
+    # The "... fellow" cases here are already matched by the two suffix patterns above.
+    r"\b(?:buffalo|monkey|mental|psycho|cheap|nasty|dirty|useless|worst|scoundrel|rascal|buffoon|loafer|fraud)\s+fellow\b",
+    # --- Telugu standalone insult suffixes ---
+    r"\bkothi\s+vedhava\b",
+]
+# Single case-insensitive alternation of every fragment, compiled once at import time.
+INSULT_PATTERN = re.compile("|".join(INSULT_KEYWORDS), re.IGNORECASE | re.UNICODE)
+def contains_insult_keyword(text: str) -> bool:
+    """Check if text contains known insult/threat patterns."""
+    return bool(INSULT_PATTERN.search(text))
+# Load Custom Telugu-English Bad Words (Secure)
+import base64
+import os
+try:
+    secure_file_path = "data/secure_words.bin"
+    if os.path.exists(secure_file_path):
+        with open(secure_file_path, "rb") as f:
+            encoded_data = f.read()
+            decoded_data = base64.b64decode(encoded_data).decode("utf-8")
+            custom_words = [line.strip() for line in decoded_data.splitlines() if line.strip()]
+            profanity.add_censor_words(custom_words)
+        print(f"Loaded {len(custom_words)} custom bad words from secure storage.")
+    else:
+        print("Warning: Secure bad words file not found.")
+except Exception as e:
+    print(f"Warning: Could not load custom bad words: {e}")
+# Load Offensive Emojis
+offensive_emojis = set()
+try:
+    emoji_file_path = "data/bad_emojis.txt"
+    if os.path.exists(emoji_file_path):
+        with open(emoji_file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith("#"):
+                    offensive_emojis.add(line)
+        print(f"Loaded {len(offensive_emojis)} offensive emojis.")
+    else:
+        print("Warning: Offensive emojis file not found.")
+except Exception as e:
+    print(f"Warning: Could not load offensive emojis: {e}")
+def contains_offensive_emoji(text: str) -> bool:
+    """Check if text contains any offensive emojis"""
+    for emoji in offensive_emojis:
+        if emoji in text:
+            return True
+    return False
+# FastAPI application object; the route decorators below register on it.
+app = FastAPI(title="AI Comment Moderation API")
+# CORS is fully open: any origin, method and header is accepted.
+# NOTE(review): a wildcard origin combined with allow_credentials=True is a common
+# security review finding — confirm whether credentials are needed at all, or replace
+# "*" with the explicit front-end origins.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize the toxicity classification pipeline
+# We use 'original' to keep the original distilbert-base-uncased-finetuned-sst-2-english if we wanted simple sentiment
+# However, for toxicity detection in Telugu-English code-mixed content, MuRIL (Multilingual
+# Representations for Indian Languages) BERT is preferred over standard DistilBERT or toxic-bert.
+# MuRIL is specifically trained on Indian languages and handles code-switching much better.
+# Current production model: google/muril-base-cased (fine-tuned)
+import torch
+# Optimizatons to prevent PyTorch from lagging the entire OS when running on CPU
+try:
+    if torch.cuda.is_available():
+        device = 0 # Use GPU
+        print("✓ CUDA GPU detected, running models on GPU for faster inference.")
+    else:
+        device = -1 # Use CPU
+        torch.set_num_threads(config.get("cpu_threads", 4)) # Limit to 4 threads rather than maxing out CPU
+        print(f"✓ CPU detected, limited PyTorch to {torch.get_num_threads()} threads to prevent system lag.")
+except Exception as e:
+    device = -1
+    pass
+# Build the module-level `classifier` pipeline once at import time.
+# top_k=None asks the pipeline to return a score for every label.
+# On any failure `classifier` is set to None and the endpoints handle that case themselves.
+try:
+    # Use fine-tuned model if available (produced by train_model.py): the directory
+    # must exist next to this file and contain a config.json.
+    fine_tuned_path = os.path.join(os.path.dirname(__file__), "model_output")
+    if os.path.exists(fine_tuned_path) and os.path.exists(os.path.join(fine_tuned_path, "config.json")):
+        print(f"✓ Loading fine-tuned model from: {fine_tuned_path}")
+        classifier = pipeline("text-classification", model=fine_tuned_path, top_k=None, device=device)
+    else:
+        # NOTE(review): this fallback loads the base checkpoint, not a fine-tuned
+        # classifier; presumably its classification head is untrained, so its scores
+        # may not be meaningful for moderation — confirm before relying on it.
+        print("Loading default model: google/muril-base-cased (Fallback)")
+        print("Note: MuRIL is highly recommended for Telugu-English code-mixed content.")
+        classifier = pipeline("text-classification", model="google/muril-base-cased", top_k=None, device=device)
+except Exception as e:
+    print(f"Error loading model: {e}")
+    classifier = None
+# Request body shared by /analyze and /submit.
+class CommentRequest(BaseModel):
+    # Raw comment text to moderate.
+    text: str
+    # Any value other than "high" is treated as the lenient setting by /analyze.
+    strictness: str = "high" # "high" (Celeb) or "low" (Friend)
+# One (label, score) pair: either a rule name with score 1.0 or a model label with its score.
+class Score(BaseModel):
+    label: str
+    score: float
+# Response body of /analyze.
+class AnalysisResponse(BaseModel):
+    # The text exactly as it was submitted (not stripped).
+    text: str
+    # Empty when the request was decided without a rule hit or model scores.
+    results: List[Score]
+    is_toxic: bool
+@app.get("/")
+def read_root():
+    return {"message": "AI Comment Moderation API is running"}
+@app.post("/analyze", response_model=AnalysisResponse)
+def analyze_comment(request: CommentRequest):
+    text = request.text.strip()
+    if not text:
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+    # 1. Strict "Bad Word" Check (Rule-based)
+    # MILD_WORDS_WHITELIST is already removed from the profanity library's censor list,
+    # so only genuine profanity (slurs, explicit words) will be flagged here.
+    if profanity.contains_profanity(text):
+        # Extra safety: remove any remaining multi-word safe phrases and re-check using PRECOMPILED regex
+        cleaned_text = text.lower()
+        for pattern in PROFANITY_WHITELIST_PATTERNS.values():
+            cleaned_text = pattern.sub("", cleaned_text)
+        if profanity.contains_profanity(cleaned_text):
+            return AnalysisResponse(
+                text=request.text,
+                results=[Score(label="profanity_strict", score=1.0)],
+                is_toxic=True
+            )
+        # Only multi-word mild phrase triggered it — continue to deeper checks
+    # 1b. Keyword-based insult/threat detector (catches ML model blind spots)
+    if contains_insult_keyword(text):
+        return AnalysisResponse(
+            text=request.text,
+            results=[Score(label="insult_keyword", score=1.0)],
+            is_toxic=True
+        )
+    # 2. Offensive Emoji Check
+    if contains_offensive_emoji(text):
+        return AnalysisResponse(
+            text=request.text,
+            results=[Score(label="offensive_emoji", score=1.0)],
+            is_toxic=True
+        )
+    # 2. Short Text Heuristic
+    if len(text) < 5:
+        return AnalysisResponse(
+            text=request.text,
+            results=[],
+            is_toxic=False
+        )
+    # 3. ML Model Check (Context-based)
+    if not classifier:
+         print("Classifier not loaded, skipping ML check.")
+         return AnalysisResponse(text=request.text, results=[], is_toxic=False)
+    results = classifier(text)
+    scores = results[0]
+    is_toxic = False
+    formatted_scores = []
+    # Define Threshold based on Strictness
+    # High (Celeb) = 0.4 (Strict)
+    # Low (Friend) = 0.7 (Balanced)
+    threshold = 0.4 if request.strictness == "high" else 0.7
+    # Labels that indicate toxicity. Ignores 'LABEL_0', 'non-toxic', 'neutral', etc.
+    TOXIC_LABELS = {"toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate", "LABEL_1"}
+    for item in scores:
+        label = item['label']
+        score = item['score']
+        formatted_scores.append(Score(label=label, score=score))
+        # Only mark as toxic if the label is in our toxic set AND exceeds threshold
+        if label in TOXIC_LABELS and score > threshold:
+            is_toxic = True
+    return AnalysisResponse(
+        text=request.text,
+        results=formatted_scores,
+        is_toxic=is_toxic
+    )
+@app.post("/submit")
+def submit_comment(request: CommentRequest):
+    # This is a mock endpoint. In a real app, this would save to DB.
+    # We re-check toxicity here to prevent bypassing frontend
+    if not classifier:
+         raise HTTPException(status_code=500, detail="Model not loaded")
+    results = classifier(request.text)[0]
+    is_toxic = any(item['score'] > 0.5 for item in results)
+    if is_toxic:
+        raise HTTPException(status_code=400, detail="Comment rejected due to toxicity.")
+    return {"message": "Comment posted successfully", "text": request.text}
+if __name__ == "__main__":
+    import uvicorn
+    import os
+    # Check for SSL certificates in data directory or root
+    key_file = "data/key.pem" if os.path.exists("data/key.pem") else "key.pem"
+    cert_file = "data/cert.pem" if os.path.exists("data/cert.pem") else "cert.pem"
+    if os.path.exists(key_file) and os.path.exists(cert_file):
+        print(f"Starting server with SSL/HTTPS enabled using {cert_file} and {key_file}...")
+        uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True, ssl_keyfile=key_file, ssl_certfile=cert_file)
+    else:
+        print("SSL certificates not found. Starting server in HTTP mode.")
+        uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)

merge_datasets.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import pandas as pd
+from pathlib import Path
+def merge_datasets():
+    data_dir = Path("data")
+    custom_words_file = data_dir / "custom_badwords_dataset.xlsx"
+    main_dataset_file = data_dir / "training_data_telugu-hate.xlsx"
+    if not custom_words_file.exists():
+        print(f"Error: {custom_words_file} not found.")
+        return
+    if not main_dataset_file.exists():
+        print(f"Error: {main_dataset_file} not found.")
+        return
+    # Load both datasets
+    print("Loading data...")
+    custom_df = pd.read_excel(custom_words_file)
+    main_df = pd.read_excel(main_dataset_file)
+    print(f"Original main dataset size: {len(main_df)}")
+    print(f"Custom badwords size: {len(custom_df)}")
+    # Identify column names in main_dataset (usually text/comment and label/category)
+    # Based on kaggle_model script, we know text could be 'text' or 'comment'
+    text_col_main = next((c for c in main_df.columns if str(c).lower() in ['text', 'comment', 'comments', 'sentence']), 'text')
+    label_col_main = next((c for c in main_df.columns if str(c).lower() in ['label', 'labels', 'category', 'class']), 'label')
+    print(f"Identified columns in main dataset -> Text: '{text_col_main}', Label: '{label_col_main}'")
+    # Rename custom dataset columns to match main dataset
+    custom_df = custom_df.rename(columns={'text': text_col_main, 'label': label_col_main})
+    # Combine the dataframes
+    merged_df = pd.concat([main_df, custom_df], ignore_index=True)
+    # Remove any absolute duplicates just in case
+    merged_df = merged_df.drop_duplicates(subset=[text_col_main]).reset_index(drop=True)
+    print(f"New merged dataset size: {len(merged_df)}")
+    # Make a backup of the original just in case we need it
+    backup_path = data_dir / "training_data_telugu-hate_backup2.xlsx"
+    main_df.to_excel(backup_path, index=False)
+    print(f"Saved backup of original to {backup_path}")
+    # Overwrite the main dataset
+    merged_df.to_excel(main_dataset_file, index=False)
+    print(f"Successfully merged and saved updated dataset to {main_dataset_file}")
+if __name__ == "__main__":
+    merge_datasets()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+fastapi
+uvicorn
+transformers
+torch
+pydantic
+better-profanity
+tf-keras
+scikit-learn
+requests
+datasets
+accelerate

train_model.py ADDED Viewed

	@@ -0,0 +1,418 @@

+"""
+Fine-tune MuRIL (google/muril-base-cased) on the HOLD-Telugu (Dravidian CodeMix) dataset.
+(MuRIL handles Telugu significantly better than standard toxic-bert)
+SETUP:
+1. Place the training file in: backend/data/  (.xlsx, .xls or .csv whose file name contains "training_data")
+2. Install deps: pip install transformers torch scikit-learn accelerate openpyxl pandas
+USAGE:
+  cd backend
+  python train_model.py
+OUTPUT:
+  Fine-tuned model saved to: backend/model_output/
+  The backend auto-loads this model on next restart.
+"""
+import os
+import sys
+import json
+from pathlib import Path
+# Force unbuffered output
+sys.stdout.reconfigure(encoding='utf-8')
+print("DEBUG: Script started", flush=True)
+# ── Install dependencies if needed ───────────────────────────────────────────
+print("DEBUG: Importing dependencies...", flush=True)
+try:
+    import torch
+    print(f"DEBUG: Torch imported (v{torch.version})", flush=True)
+    # Import transformers early
+    import transformers
+    print(f"DEBUG: transformers imported (v{transformers.__version__})", flush=True)
+    from transformers import (
+        AutoTokenizer,
+        AutoModelForSequenceClassification,
+        TrainingArguments,
+        Trainer,
+        EarlyStoppingCallback
+    )
+    print("DEBUG: HuggingFace classes imported", flush=True)
+    import pandas as pd
+    print(f"DEBUG: pandas imported (v{pd.__version__})", flush=True)
+    import openpyxl
+    print("DEBUG: openpyxl imported", flush=True)
+    import sklearn
+    print(f"DEBUG: sklearn imported (v{sklearn.__version__})", flush=True)
+    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+    print("DEBUG: sklearn metrics imported", flush=True)
+    import numpy as np
+    print(f"DEBUG: numpy imported (v{np.__version__})", flush=True)
+    from torch.utils.data import Dataset as TorchDataset
+    print("DEBUG: TorchDataset imported", flush=True)
+except ImportError as e:
+    print(f"DEBUG: ImportError: {e}", flush=True)
+    sys.exit(1)
+except Exception as e:
+    print(f"DEBUG: Exception during import: {e}", flush=True)
+    sys.exit(1)
+# ── Paths ─────────────────────────────────────────────────────────────────────
+BASE_DIR   = Path(__file__).parent
+DATA_DIR   = BASE_DIR / "data"
+OUTPUT_DIR = BASE_DIR / "model_output"
+# ── Config ────────────────────────────────────────────────────────────────────
+BASE_MODEL    = "google/muril-base-cased"   # MuRIL (Multilingual BERT) for Indian languages
+# BASE_MODEL    = "unitary/toxic-bert"         # Fallback to general toxic-bert if needed
+MAX_LENGTH    = 128  # Longer context = better understanding of comments
+EPOCHS        = 8    # More epochs with early stopping patience=2
+LEARNING_RATE = 3e-5 # Slightly higher LR for faster convergence
+# TEST_SPLIT    = 0.15 # Not needed if we use explicit files
+# ── Find Excel files ─────────────────────────────────────────────────────
+print(f"DEBUG: Searching for data in {DATA_DIR}", flush=True)
+all_files = list(DATA_DIR.iterdir())
+print(f"DEBUG: Found files: {[f.name for f in all_files]}", flush=True)
+train_files = [f for f in all_files if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]
+if not train_files:
+    print("✗ No training file found (looking for 'training_data*.xlsx')")
+    sys.exit(1)
+else:
+    print(f"✓ Training files: {[f.name for f in train_files]}")
+    print("ℹ Test set will be a stratified 20% split from training data (same distribution)")
+# ── Helper to load data ──────────────────────────────────────────────────────
+def is_code_mixed(text):
+    """
+    Returns True if text is Telugu-English code-mixed.
+    Keeps rows that have at least some Latin (English) characters.
+    Removes rows that are purely in Telugu script (U+0C00-U+0C7F).
+    """
+    text = str(text)
+    has_latin = any('\u0041' <= c <= '\u007A' for c in text)   # A-z
+    total     = len([c for c in text if c.strip()])
+    telugu    = len([c for c in text if '\u0C00' <= c <= '\u0C7F'])
+    # Skip if purely Telugu (>80% Telugu script chars) or has no Latin at all
+    if total == 0:
+        return False
+    if not has_latin:
+        return False
+    if telugu / total > 0.8:
+        return False
+    return True
+def load_data(files):
+    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}
+    frames = []
+    TEXT_NAMES  = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
+    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}
+    for excel_file in files:
+        print(f"  Loading: {excel_file.name}", flush=True)
+        try:
+            # Support both Excel and CSV files
+            if excel_file.suffix == '.csv':
+                sheets_data = [('csv', pd.read_csv(excel_file))]
+            else:
+                xl = pd.ExcelFile(excel_file)
+                sheets_data = [(sheet, xl.parse(sheet)) for sheet in xl.sheet_names]
+            for sheet, df in sheets_data:
+                # Column matching
+                text_col = next(
+                    (c for c in df.columns if str(c).lower() in TEXT_NAMES or
+                     any(t in str(c).lower() for t in ['text', 'comment', 'sentence'])), None
+                )
+                label_col = next(
+                    (c for c in df.columns if str(c).lower() in LABEL_NAMES or
+                     any(t in str(c).lower() for t in ['label', 'categor', 'class'])), None
+                )
+                if text_col and str(text_col).lower() in ['s.no', 'no', 'id', 'index', 'sr']:
+                    text_col = None
+                if text_col and label_col:
+                    sub = df[[text_col, label_col]].copy()
+                    sub.columns = ['text', 'label']
+                    sub = sub.dropna()
+                    sub['label'] = sub['label'].astype(str).str.strip().str.lower()
+                    sub['label_int'] = sub['label'].apply(lambda x: 1 if x in hate_labels_set else 0)
+                    # ── Filter: keep only Telugu-English code-mixed rows ──────
+                    before = len(sub)
+                    sub = sub[sub['text'].apply(is_code_mixed)].reset_index(drop=True)
+                    after = len(sub)
+                    print(f"    ✓ Sheet '{sheet}': {after} code-mixed rows kept (filtered out {before - after} pure Telugu rows)", flush=True)
+                    frames.append(sub)
+                else:
+                    print(f"    ⚠ Sheet '{sheet}': Skipped (cols={list(df.columns)})", flush=True)
+        except Exception as e:
+            print(f"    ✗ Error reading {excel_file.name}: {e}", flush=True)
+    if not frames:
+        return pd.DataFrame(columns=['text', 'label', 'label_int'])
+    combined = pd.concat(frames, ignore_index=True)
+    return combined
+# ── Load Bad Words / Emojis as Additional Training Data ──────────────────────
+def load_badwords_as_training_data():
+    """Build synthetic training rows from the bad-word and emoji lists.
+
+    Reads telugu_badwords.txt, secure_words.bin (base64 encoded) and
+    bad_emojis.txt from DATA_DIR; files that do not exist are skipped. Every
+    collected entry is inserted into three randomly chosen sentence templates and
+    labelled toxic, and the same number of rows is generated from a fixed list of
+    safe phrases.
+
+    Returns:
+        DataFrame with columns ``text``, ``label`` and ``label_int``; empty (with
+        those columns) when no word or emoji was loaded.
+    """
+    import base64
+    import random
+    # Fixed seed so the template sampling below is reproducible between runs.
+    random.seed(42)
+    toxic_words = []
+    # 1. Load telugu_badwords.txt (one word per line, blank lines ignored)
+    badwords_path = DATA_DIR / "telugu_badwords.txt"
+    if badwords_path.exists():
+        with open(badwords_path, "r", encoding="utf-8") as f:
+            for line in f:
+                word = line.strip()
+                if word:
+                    toxic_words.append(word)
+        print(f"  ✓ Loaded {len(toxic_words)} words from telugu_badwords.txt", flush=True)
+    # 2. Load secure_words.bin (base64 encoded); only words not already collected are added
+    secure_path = DATA_DIR / "secure_words.bin"
+    secure_count = 0
+    if secure_path.exists():
+        with open(secure_path, "rb") as f:
+            encoded_data = f.read()
+            decoded_data = base64.b64decode(encoded_data).decode("utf-8")
+            for line in decoded_data.splitlines():
+                word = line.strip()
+                if word and word not in toxic_words:
+                    toxic_words.append(word)
+                    secure_count += 1
+        print(f"  ✓ Loaded {secure_count} additional words from secure_words.bin", flush=True)
+    # 3. Load bad_emojis.txt ("#" lines are comments; no duplicate check is done here)
+    emoji_path = DATA_DIR / "bad_emojis.txt"
+    emoji_count = 0
+    if emoji_path.exists():
+        with open(emoji_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith("#"):
+                    toxic_words.append(line)
+                    emoji_count += 1
+        print(f"  ✓ Loaded {emoji_count} offensive emojis from bad_emojis.txt", flush=True)
+    if not toxic_words:
+        return pd.DataFrame(columns=['text', 'label', 'label_int'])
+    # Create toxic training examples with natural sentence patterns
+    toxic_templates = [
+        "{word}",
+        "you are a {word}",
+        "{word} ga unnav",
+        "enti ra {word}",
+        "orey {word}",
+        "nuvvu {word}",
+        "{word} fellow",
+        "this {word}",
+    ]
+    toxic_rows = []
+    for word in toxic_words:
+        # Pick 3 distinct random templates per word (min(3, 8) is always 3 with the
+        # list above). The rows produced depend on the order of these RNG calls.
+        templates = random.sample(toxic_templates, min(3, len(toxic_templates)))
+        for template in templates:
+            toxic_rows.append({
+                'text': template.format(word=word),
+                'label': 'hate',
+                'label_int': 1
+            })
+    # Generate matching SAFE examples to keep the dataset balanced
+    safe_phrases = [
+        "good morning everyone", "nice video", "great content bro",
+        "keep it up", "super ga undi", "chala bagundi",
+        "love this", "awesome work", "thank you for sharing",
+        "very helpful", "bagundi", "nice one", "well done",
+        "interesting topic", "manchi video", "super explanation",
+        "thanks for this", "really useful", "good job",
+        "happy birthday", "congratulations", "best wishes",
+        "nice song", "beautiful", "amazing performance",
+        "very informative", "subscribed", "waiting for next video",
+        "loved it", "manchi content", "edo oka roju",
+        "nenu chala happy", "meeru bagunnara", "thanks anna",
+        "thanks akka", "super bro", "nice edit",
+        "first comment", "who is watching in 2024",
+        "please make more videos", "this helped me a lot",
+        "I learned something new", "great tutorial", "perfect",
+    ]
+    safe_rows = []
+    # Create enough safe examples to match toxic count by cycling through the phrases.
+    # NOTE(review): the phrases repeat once the list is exhausted, and this script later
+    # de-duplicates on 'text', so presumably only one copy of each phrase survives —
+    # confirm the intended class balance.
+    target_safe = len(toxic_rows)
+    for i in range(target_safe):
+        phrase = safe_phrases[i % len(safe_phrases)]
+        safe_rows.append({
+            'text': phrase,
+            'label': 'not-hate',
+            'label_int': 0
+        })
+    all_rows = toxic_rows + safe_rows
+    print(f"  ✓ Generated {len(toxic_rows)} toxic + {len(safe_rows)} safe training examples from bad words/emojis", flush=True)
+    return pd.DataFrame(all_rows)
+# ── Load and Split ───────────────────────────────────────────────────────────
+# Load the labelled spreadsheet rows; the script cannot continue without them.
+print("\nLoading training data...", flush=True)
+all_data = load_data(train_files)
+if all_data.empty:
+    print("✗ Error: No usable data found.", flush=True)
+    sys.exit(1)
+# Load bad words as additional training data (synthetic toxic + safe rows)
+print("\nLoading bad words/emojis as training data...", flush=True)
+badwords_data = load_badwords_as_training_data()
+if not badwords_data.empty:
+    all_data = pd.concat([all_data, badwords_data], ignore_index=True)
+    print(f"  Combined dataset size: {len(all_data)}", flush=True)
+# Remove duplicates: keep the first row for every distinct text value
+len_before = len(all_data)
+all_data = all_data.drop_duplicates(subset='text')
+print(f"  Deduplicated: {len_before} -> {len(all_data)}")
+# ── Stratified 90/10 split (more training data = higher accuracy) ─────────────
+# Stratifying on label_int keeps the class ratio the same in both splits;
+# random_state makes the split reproducible.
+from sklearn.model_selection import train_test_split
+train_df, test_df = train_test_split(
+    all_data, test_size=0.10, random_state=42, stratify=all_data['label_int']
+)
+print(f"\nFinal Split: Train={len(train_df)} | Test={len(test_df)}")
+print(f"Class Dist (Train): {train_df['label_int'].value_counts().to_dict()}")
+print(f"Class Dist (Test):  {test_df['label_int'].value_counts().to_dict()}")
+# Plain Python lists are what the tokenizer-backed dataset below consumes.
+train_texts  = train_df['text'].tolist()
+train_labels = train_df['label_int'].tolist()
+test_texts   = test_df['text'].tolist()
+test_labels  = test_df['label_int'].tolist()
+# ── Load tokenizer & model ────────────────────────────────────────────────────
+print(f"\nLoading model: {BASE_MODEL}", flush=True)
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+# Two output labels: 0 = not hate, 1 = hate (matching label_int).
+model = AutoModelForSequenceClassification.from_pretrained(
+    BASE_MODEL,
+    num_labels=2,
+    ignore_mismatched_sizes=True,
+    problem_type="single_label_classification"  # Forces CrossEntropyLoss (fixes transformers v5 bug)
+)
+print(f"✓ Model loaded", flush=True)
+# ── Dataset ───────────────────────────────────────────────────────────────────
+class CommentDataset(TorchDataset):
+    def __init__(self, texts, labels):
+        self.encodings = tokenizer(
+            texts, truncation=True, padding=True,
+            max_length=MAX_LENGTH, return_tensors='pt'
+        )
+        self.labels = labels
+    def __len__(self): return len(self.labels)
+    def __getitem__(self, idx):
+        item = {k: v[idx] for k, v in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
+        return item
+# CommentDataset tokenizes its whole input inside __init__, so both splits are
+# fully encoded by the time these two constructors return.
+print("Tokenizing datasets...", flush=True)
+train_dataset = CommentDataset(train_texts, train_labels)
+test_dataset  = CommentDataset(test_texts,  test_labels)
+# ── Metrics ───────────────────────────────────────────────────────────────────
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    preds = np.argmax(logits, axis=-1)
+    return {
+        'accuracy':  accuracy_score(labels, preds),
+        'f1':        f1_score(labels, preds, zero_division=0),
+        'precision': precision_score(labels, preds, zero_division=0),
+        'recall':    recall_score(labels, preds, zero_division=0),
+    }
+# ── Training ──────────────────────────────────────────────────────────────────
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f"\nTraining on: {device.upper()}", flush=True)
+OUTPUT_DIR.mkdir(exist_ok=True)
+batch_size      = 16 if device == 'cuda' else 8   # Smaller batch = better generalization on small datasets
+eval_batch_size = 64   # No gradients during eval → can use larger batch
+# 10% warmup steps
+total_steps = (len(train_dataset) // batch_size) * EPOCHS
+warmup_steps = int(total_steps * 0.1)
+training_args = TrainingArguments(
+    output_dir=str(OUTPUT_DIR),
+    num_train_epochs=EPOCHS,
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=eval_batch_size,
+    learning_rate=LEARNING_RATE,
+    warmup_steps=warmup_steps,
+    weight_decay=0.05,              # Stronger regularization to prevent overfitting
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    metric_for_best_model="f1",
+    logging_steps=25,
+    report_to="none",
+    fp16=(device == 'cuda'),
+)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=test_dataset,
+    compute_metrics=compute_metrics,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop early before overfitting
+)
+print(f"Starting training...", flush=True)
+trainer.train()
+# ── Final evaluation ──────────────────────────────────────────────────────────
+print("\nEvaluating on test set...", flush=True)
+results = trainer.evaluate()
+print(f"\n{'='*60}")
+print("FINAL RESULTS:")
+print(f"  Accuracy:  {results.get('eval_accuracy', 0)*100:.2f}%")
+print(f"  F1 Score:  {results.get('eval_f1', 0):.4f}")
+print(f"  Precision: {results.get('eval_precision', 0):.4f}")
+print(f"  Recall:    {results.get('eval_recall', 0):.4f}")
+print(f"{'='*60}")
+# ── Save ──────────────────────────────────────────────────────────────────────
+trainer.save_model(str(OUTPUT_DIR))
+tokenizer.save_pretrained(str(OUTPUT_DIR))
+with open(OUTPUT_DIR / "eval_results.json", 'w') as f:
+    json.dump(results, f, indent=2)
+print(f"\n✅ Done! Model saved to: {OUTPUT_DIR}", flush=True)

verify_model.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""
+MODEL VERIFICATION SCRIPT
+Use this to test your trained model locally on your PC.
+"""
+import os
+from transformers import pipeline
+def test_model():
+    # 1. Path to your model folder
+    # Change this to 'model_output_v2' if testing the new version
+    model_path = "./model_output"
+    if not os.path.exists(model_path):
+        print(f"❌ Error: Model folder '{model_path}' not found.")
+        print("Please ensure you have moved your Kaggle/Colab output into the 'backend' folder.")
+        return
+    print("🔄 Loading model (this may take a few seconds)...")
+    try:
+        # Load the toxicity classifier
+        classifier = pipeline(
+            "text-classification",
+            model=model_path,
+            tokenizer=model_path,
+            device=-1 # Use -1 for CPU, 0 for first GPU
+        )
+        print("✅ Model loaded successfully!\n")
+    except Exception as e:
+        print(f"❌ Failed to load model: {e}")
+        return
+    print("Enter 'quit' to exit.")
+    while True:
+        text = input("\n📝 Enter a comment to test: ")
+        if text.lower() == 'quit':
+            break
+        if not text.strip():
+            continue
+        # Get prediction
+        result = classifier(text)[0]
+        label = result['label']
+        score = result['score']
+        # Map labels to human-readable text
+        # LABEL_1 is usually Toxic, LABEL_0 is Safe
+        is_toxic = "TOXIC 🔴" if label == "LABEL_1" else "SAFE 🟢"
+        print("-" * 30)
+        print(f"Result: {is_toxic}")
+        print(f"Confidence: {score*100:.2f}%")
+        print("-" * 30)
+# Start the interactive tester only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    test_model()