tejesh916K commited on
Commit
b8300d6
·
0 Parent(s):

Deploy: Comment Guard API - FastAPI + MuRIL BERT

Browse files
.dockerignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__
2
+ env/
3
+ venv/
4
+ .git
5
+ .gitignore
6
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ # Pre-download the model to cache it in the image (optional but good for speed)
9
+ # We can run a small python script to trigger the download or just let it download on first run.
10
+ # For simplicity, we let it download on first run.
11
+
12
+ COPY . .
13
+
14
+ EXPOSE 8000
15
+
16
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
admin_manager.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import sys
4
+
5
+ DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
6
+ PLAIN_FILE = os.path.join(DATA_DIR, "telugu_badwords.txt")
7
+ SECURE_FILE = os.path.join(DATA_DIR, "secure_words.bin")
8
+
9
def load_secure_words():
    """Return the word list stored in SECURE_FILE.

    The file holds base64-encoded UTF-8 text with one entry per line.
    Entries are stripped and blank lines are dropped.

    Returns:
        A list of strings. The list is empty when the file does not exist
        or when reading/decoding fails (the error is printed).
    """
    if os.path.exists(SECURE_FILE):
        try:
            with open(SECURE_FILE, "rb") as handle:
                raw = handle.read()
                text = base64.b64decode(raw).decode("utf-8")
            stripped = (entry.strip() for entry in text.splitlines())
            return [entry for entry in stripped if entry]
        except Exception as e:
            print(f"Error loading secure file: {e}")
    return []
20
+
21
def save_secure_words(words):
    """Store *words* in SECURE_FILE as base64-encoded, newline-joined UTF-8.

    The payload is written to a temporary sibling file and then moved into
    place with ``os.replace``, so an interrupted write cannot truncate the
    existing word list.

    NOTE: base64 is an encoding, not encryption; the file is only obscured.

    Args:
        words: Iterable of strings, one entry per word or phrase.

    Returns:
        True when the file was written, False when an error was reported.
    """
    temp_path = SECURE_FILE + ".tmp"
    try:
        words = list(words)
        content = "\n".join(words)
        encoded_data = base64.b64encode(content.encode("utf-8"))
        # The data directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(SECURE_FILE) or ".", exist_ok=True)
        with open(temp_path, "wb") as f:
            f.write(encoded_data)
        os.replace(temp_path, SECURE_FILE)
        print(f"Successfully saved {len(words)} words to secure storage.")
        return True
    except Exception as e:
        print(f"Error saving secure file: {e}")
        # Best effort: do not leave a stale temporary file behind.
        try:
            if os.path.exists(temp_path):
                os.remove(temp_path)
        except OSError:
            pass
        return False
30
+
31
def migrate():
    """Copy the plain-text word list (PLAIN_FILE) into secure storage.

    Blank lines and comment lines (first non-blank character ``#``) are
    skipped. After saving, the secure file is read back and compared with
    the source entries; the user is only told the plain file can be deleted
    when the two match.
    """
    if not os.path.exists(PLAIN_FILE):
        print(f"No plain text file found at {PLAIN_FILE}")
        return

    print(f"Migrating {PLAIN_FILE} to secure storage...")
    with open(PLAIN_FILE, "r", encoding="utf-8") as f:
        # Test the stripped line so indented comments are skipped as well.
        stripped = (line.strip() for line in f)
        words = [w for w in stripped if w and not w.startswith("#")]

    save_secure_words(words)

    # Verify the round trip before suggesting that the source be removed.
    if load_secure_words() != words:
        print("Migration failed: secure storage does not match the source file. Keep the .txt file.")
        return
    print("Migration complete. You can now safely delete the .txt file.")
42
+
43
def view_words():
    """Print every entry from secure storage between a header and a footer line."""
    entries = load_secure_words()
    report = [f"--- SECURE WORD LIST ({len(entries)} words) ---"]
    report.extend(entries)
    report.append("-------------------------------------------")
    for line in report:
        print(line)
49
+
50
def add_word(word):
    """Add *word* to secure storage unless it is already present.

    The storage format is one entry per line and entries are stripped when
    loaded, so the input is stripped first and rejected when it is empty or
    spans several lines; otherwise the stored list would not round-trip.

    Args:
        word: The word or phrase to add.
    """
    word = word.strip()
    if not word or len(word.splitlines()) != 1:
        print("Invalid word: it must be a single non-empty line.")
        return

    words = load_secure_words()
    if word in words:
        print(f"'{word}' is already in the list.")
        return
    words.append(word)
    save_secure_words(words)
    print(f"Added '{word}'.")
58
+
59
def remove_word(word):
    """Remove every occurrence of *word* from secure storage.

    Stored entries are stripped when loaded, so the input is stripped too
    before it is compared.

    Args:
        word: The word or phrase to remove.
    """
    word = word.strip()
    words = load_secure_words()
    remaining = [w for w in words if w != word]
    if len(remaining) == len(words):
        print(f"'{word}' not found in the list.")
        return
    save_secure_words(remaining)
    print(f"Removed '{word}'.")
67
+
68
if __name__ == "__main__":
    # Command-line entry point: migrate | view | add <word> | remove <word>.
    USAGE = "Usage: python admin_manager.py [migrate|view|add <word>|remove <word>]"

    if len(sys.argv) < 2:
        print(USAGE)
        sys.exit(1)

    command = sys.argv[1]
    # Join the remaining arguments so an unquoted multi-word phrase
    # (e.g. `add waste fellow`) is handled as one entry.
    phrase = " ".join(sys.argv[2:])

    if command == "migrate":
        migrate()
    elif command == "view":
        view_words()
    elif command == "add" and len(sys.argv) > 2:
        add_word(phrase)
    elif command == "remove" and len(sys.argv) > 2:
        remove_word(phrase)
    else:
        print("Invalid command or missing argument.")
        print(USAGE)
        # Signal the failure to calling shells and scripts.
        sys.exit(1)
clean_dataset.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import traceback
2
+
3
# Clean the Telugu hate-speech training workbook and append hand-written
# augmentation rows. Any failure is written to error_log.txt.
try:
    from pathlib import Path

    import pandas as pd

    path = 'data/training_data_telugu-hate.xlsx'
    backup_path = 'data/training_data_telugu-hate_backup.xlsx'
    print(f"Loading {path}...")
    df = pd.read_excel(path)
    print(f"Original shape: {df.shape}")

    # 1. Back up the original file just in case. An existing backup is kept:
    #    on a re-run the workbook is already augmented, and overwriting the
    #    backup would lose the pristine copy.
    if Path(backup_path).exists():
        print(f"Backup already exists at {backup_path}; leaving it untouched.")
    else:
        df.to_excel(backup_path, index=False)

    # Clean duplicates and nans
    df = df.dropna(subset=['Comments', 'Label'])
    df['Comments'] = df['Comments'].astype(str).str.strip()
    df['Label'] = df['Label'].astype(str).str.strip().str.lower()
    df = df[df['Label'].isin(['hate', 'non-hate'])]
    df = df.drop_duplicates(subset=['Comments'], keep='first')

    print(f"Shape after cleaning: {df.shape}")

    # New words
    toxic = [
        "rey mental puku", "ni edava veshalu", "konda erri hook", "thu ni brathuku cheda", "panimashiva ra nuvu",
        "erri puku nayala", "nuvu oka pedda jaffa", "siggu ledu ra neeku", "pichi pulka gadu", "waste fellow ra nuvu",
        "dengay ra lathkor", "ni yamma kadupula koti", "adangi vedhava", "gudda balupu", "boku gadu vidu",
        "rey chetha na kodaka", "poramboku nayala", "ni mokam chudu elagundo", "chapri gadu lanjodka", "lavada lo panulu",
        "modda em kadu le", "pachi boothulu tidutha", "daridrudu", "tuppas gadu", "chavata chavata",
        "mental gadu ra vidu", "sannasi", "bewarse gadu", "ne bondha ra ne bondha", "rey puku",
        "vedava sannaasi", "guddalo em ledha", "ni amma", "ni abba", "rey lanjodoka", "addamina waste gadu",
        "rotta gadu", "faltu gadu", "picha light teesko ra puku", "lathkor gadu", "erri pusa",
        "bazar munda", "rey kojja nayala", "ni ayya ki cheppu", "solu gadu", "sollu cheppaku nayala",
        "arey howle", "bhadcow gadu", "puka musko", "rey ni amma", "denga beta",
        "ni puku lo na modda", "erri guda", "nuvvu oka waste puku", "ni yabba", "dunnapothu nayala",
        "munda mokam", "sulli gadu", "arey erri", "pedda puku", "mental na kodaka", "lanja kodaka",
        "ni amma ranku", "chethana kodaka", "musali puku", "gudda chimputha", "ni amma ninnu kaninda",
        "rey neeku guddalo dammu leda", "ni mokam meda umma", "chepaleni boothulu", "thu ni bathuku", "kukka brathuku",
        "ni bathuku bus stand", "picchi puku", "hook gani laga unnav", "gadida kodaka",
        "donga puku", "munda edava", "musko ra jaffa", "bocchu gadu", "ni ayya puku", "naa modda guduvu",
        "lavadalo comments", "item gani laga unnav", "loffer gadu", "ni face ki dippa okate takkuva", "pakodi gadu",
        "mental hospital ki ellu", "rey pichi guda", "bithiri", "buffoon gadu", "420 gadu",
        "ne kamma", "ni bondha pettu", "kothi na kodaka", "labor na kodaka", "signal daggara adukko", "Footpath gadu"
    ]

    safe = [
        "super undi bro", "congrats macha", "all the best ra", "chala bagundi", "kekaa",
        "thanks anna", "subram ga undi", "awesome work", "good job keep it up", "nice explanation",
        "this is very helpful", "mee video lu ante chala ishtam", "first comment ra", "video super", "nice editing",
        "super ga chepparu", "meeru inka goppavallu avvali", "waiting for next part", "good morning everyone", "have a nice day",
        "really nice bro", "bhale cheppav", "good point", "manchi maata", "exactly macha",
        "agreed", "well said anna", "proud of you", "jai hind", "super hit",
        "very informative", "hats off to you", "good lesson learned", "superb acting", "next level",
        "mind blowing performance", "keep soaring high", "bagundi", "baga chesaaru", "congratulations brother",
        "so beautiful", "very nice song", "loved this", "manchi content idhi", "thank you so much",
        "keep going", "amazing as always", "very true words", "good luck", "edo oka roju sadhistavu",
        "meeru goppa anna", "salute anna", "inspiring video", "bhale tisaaru", "cinematography peaks",
        "this made my day", "chala happy ga undi", "super star nvvu", "naaku idi chala use aindi", "respect",
        "god bless you", "super anna", "keep doing videos", "nenu subscribe chesa", "like kottandi",
        "miku manchi jargali", "great progress", "awesome efforts", "very nice tutorial", "fantastic",
        "proud moment", "excellent work", "bhale undi kada", "super ga navvu", "nice smile",
        "thanks for your support", "manchi advice", "helpful tips", "very clear", "super bro super",
        "love from hyd", "amazing talent", "keep rocking", "gret job", "so soothing",
        "wonderful video", "sweet comments", "very kind of you", "thank you akka", "wow super",
        "masterpiece", "great info", "good stuff", "so positive", "happy for you", "best wishes",
        "take care", "always supporting you", "superb explanation", "nice tutorial bro", "you are the best"
    ]

    # Map to new rows
    new_rows = [{'S.No': 'AUGMENTED_HATE', 'Comments': t, 'Label': 'hate'} for t in toxic]
    new_rows += [{'S.No': 'AUGMENTED_SAFE', 'Comments': s, 'Label': 'non-hate'} for s in safe]

    augment_df = pd.DataFrame(new_rows)
    final_df = pd.concat([df, augment_df], ignore_index=True)
    # De-duplicate again AFTER appending: on a re-run the workbook already
    # contains the augmented rows, and appending them a second time would
    # otherwise write duplicates. keep='first' lets existing rows win.
    final_df = final_df.drop_duplicates(subset=['Comments'], keep='first').reset_index(drop=True)

    # Overwrite
    final_df.to_excel(path, index=False)
    print(f"Final shape: {final_df.shape}")
    print("✅ Augmentation complete! Successfully wrote to Excel.")

except Exception:
    # Top-level boundary: keep the full traceback for later inspection.
    with open('error_log.txt', 'w', encoding='utf-8') as f:
        f.write(traceback.format_exc())
    print("Script failed. See error_log.txt")
data/bad_emojis.txt ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Offensive Emojis Blacklist
2
+ # Emojis that should be blocked in comments/chat
3
+ # Add or remove as needed for your moderation policy
4
+
5
+ # ── Offensive Gestures ────────────────────────────────────────────
6
+ 🖕
7
+ 🖕🏻
8
+ 🖕🏼
9
+ 🖕🏽
10
+ 🖕🏾
11
+ 🖕🏿
12
+ 🤙
13
+ 🤏
14
+
15
+ # ── Threats / Violence / Weapons ─────────────────────────────────
16
+ 🔪
17
+ 🗡️
18
+ 🔫
19
+ 🪃
20
+ 💣
21
+ 🧨
22
+ ⚰️
23
+ 🪦
24
+ ☠️
25
+ 💀
26
+ 🩸
27
+ 🪓
28
+ 🏹
29
+ ⚔️
30
+ 🛡️
31
+
32
+ # ── Aggression / Anger ────────────────────────────────────────────
33
+ 👊
34
+ 🤜
35
+ 🤛
36
+ 💢
37
+ 😡
38
+ 🤬
39
+ 😤
40
+ 👿
41
+ 😾
42
+
43
+ # ── Explicit / Sexual Content ─────────────────────────────────────
44
+ 🍆
45
+ 🍑
46
+ 💦
47
+ 🔞
48
+ 🥵
49
+ 👅
50
+ 💋
51
+ 🍒
52
+ 🌮
53
+ 🌭
54
+ 🍌
55
+ 🍫
56
+ 🛏️
57
+ 🔑
58
+ 📸
59
+ 🩲
60
+ 🩳
61
+ 👙
62
+ 💊
63
+
64
+ # ── Harassment / Mocking ─────────────────────────────────────────
65
+ 🤡
66
+ 🤢
67
+ 🤮
68
+ 💩
69
+ 🐷
70
+ 🐖
71
+ 🐮
72
+ 🐄
73
+ 🐒
74
+ 🙊
75
+ 🐸
76
+ 🐀
77
+ 🐁
78
+ 🦠
79
+ 🐛
80
+ 🪲
81
+ 🪳
82
+
83
+ # ── Hate Symbols ─────────────────────────────────────────────────
84
+
85
+
86
+
87
+ # ── Dangerous / Risk ─────────────────────────────────────────────
88
+ 🧪
89
+ 💉
90
+ 🩺
91
+ ☢️
92
+ ☣️
93
+ ⚠️
94
+ 🚨
data/secure_words.bin ADDED
@@ -0,0 +1 @@
 
 
1
+ ZG9uZ2EKdmVkaGF2YQp2ZWRhdmEKcGFuZGkKa3Vra2EKbmVlIGFtbWEKbmVlIGFiYmEKY2hhdHRhCndhc3RlIGZlbGxvdwppdGVtCmxvdyBjbGFzcwpwaWNjaGkKZXJyaXBhcHBhCmVycmkKbXVuZGFtb3BpCmRhcmlkcnlhCnNhbmkKcGFuaWtpIG1hbGluYQp0aGlra2EKd29yc3QgZmVsbG93CmJsb29keSBmb29sCnVzZWxlc3MgZmVsbG93CmxhbmphCmxhbmpha29kYWthCmxhbmphIGtvZGFrYQptdW5kYQptb2RkYQpkZW5ndQpkZW5nZXkKZGVuZ3V0aGEKbmVlIHlhYmJhCmNoYXZhdGEKc2FubmFzaQpsdWNjaGEKaG93bGUKcHVrdQpwdWsKbWFkZGEKbGF2YWRhCmtvamphCmhpanJhCmJvY2NodQpuZSB5YW1tYQpuZSBheXlhCmJva3UKYmFkY293CmVycmkgcHVrdQpwaWNoaSBsYW5qYWtvZGFrYQpib2t1bG8KZ3VkZGEKbXVzYWxpCm5pIGJvbmRoYQpuaSBhYmJhCmNoZXR0YW5hIGtvZGFrYQpkdXJtYXJndWR1Cm5lZSBheXlhCmNoYXR0YSBuYSBrb2Rha2EKcGljaGkgcHVsa2EKZXJyaSBwdXNocGFtCndhc3RlIGdhZHUKbmUga2FtbWEKd2FzdGUgbmEga29kYWthCnBvcmFtYm9rdQpzaWdndSBsZW5pCmxhamphCnllcnJpCmJld2Fyc2kKYmV3YXJzCnBha29kaQpwdWxrYQpidWZmb29uCnNjb3VuZHJlbApyYXNjYWwKaWRpb3QKc3R1cGlkCmxvc2VyCmxvYWZlcgpyb3dkeQo0MjAKZG9uZ2FuYSBrb2Rha2EKbmVlIGZ1a3UKa29uZGEgZXJyaQpwb29rCnBvb2t1Cm1vZGRhbG8KbGF2YWRhbG8Kc3VsbGkKc3VsbGlnYQpsYWJvciBuYSBrb2Rha2EKY2hhcHJpCmNoYXByaSBnYWR1CmVycmlob29rCmhvb2sgZ2FkdQpiaGFkY293CmJoYWRrYXcKaG93bGEKamFmZmEKZ2FqdWxhdGhvCmtvamphIG5hIGtvZGFrYQpzaGlrYW5kaQpmYWtlIGdhZHUKZnJhdWQgZ2FkdQpkdW5uYXBvdGh1CmdhYWRpZGEKZ2FkaWRhCmJ1ZmZhbG8KbW9ua2V5CmtvdGhpCmtvdGhpIHZlZGhhdmEKc29sbHUKc29sdQpzb2xsdSBnYWR1CnZhZGh1cmEKb2RpeWFtbWEKeWFkYXZhCnllZGF2YQp0dWR1bXUKd2FzdGUgYm9keQpjaGV0aGEKY2hldHRhCnBlbmR1CnRyYXNoCmdhcmJhZ2UKZGlydHkgZmVsbG93Cm5hc3R5CmNoZWFwIGZlbGxvdwpsb3cgY2xhc3MgZmVsbG93CnRoaXJkIGNsYXNzCjNyZCBjbGFzcwo0dGggY2xhc3MKbWVudGFsbwpwc3ljaG8Kc2FkaXN0CnRodXB1awp3b3JzdCBnYWR1CnBpY2hpIG5hIGtvZGFrYQplcnJpIG5hIGtvZGFrYQpkb25nYSBuYSBrb2Rha2EKZG9uZ2EgbXVuZGEKcmFua3UgbXVuZGEKYmF6YXJ1IG11bmRhCmJhemFyIGRhbmEKcm9hZCBtZWVkYSB0aWdlIGRhbmEKdGlydWd1Ym90aHUKdGhpcnVndWJvdGh1CnRhYWd1Ym90aHUKdGFndWJvdGh1Cmp1bGUKanVsYXlpCmF2YWxhbmphCmFkZGFtaW5hCmFkZGFtaW5hIHBhbnVsdQpuZWVrdSBlbmR1a3UgcmEKbmVla3UgZW5kdWt1Cm11c3Vrb25pIGt1cmNobwptdXN1a28Kc2h1dCB1cApjbG9zZSB5b3VyIG1vdXRoCm5vcnUgbXV5eWkKbm9ydSBtdXN1a28Kbm90bG8KZ3VkZGFsbwpiYXN0aG
kKc2x1bQpzbHVtIGZlbGxvdwpsb2NhbCBnYWR1CnVuY2l2aWxpemVkCmJhcmJhcmlhbgpicnV0ZQpzYXZhZ2UKcmFrc2hhc3VkYQpyYWtzaGFzaQp3aXRjaApiaXRjaApzbHV0Cndob3JlCnByb3N0aXR1dGUKYmFzdGFyZAphc3Nob2xlCmZ1Y2tlcgptb3RoZXJmdWNrZXIKc2lzdGVyIGZ1Y2tlcgpicm90aGVyIGZ1Y2tlcgpmYXRoZXIgZnVja2VyCmRpY2sKY29jawpwdXNzeQpjdW50CnRpdHMKYm9vYnMKbmlwcGxlCnBlbmlzCnZhZ2luYQpmdWNrCmZ1Y2tpbmcKZnVja2VkCnNjcmV3ZWQKc2hhZ2dlZApodW1wZWQKY3JlYW1waWUKc3Blcm0Kc2VtZW4Kaml6egpzcHVuawpzcXVpcnQKaG9ybnkKcmFwZQptb2xlc3QKaGFyYXNzCmFzc2F1bHQKYWJ1c2UKdmlvbGF0ZQpkZWdyYWRlCmh1bWlsaWF0ZQpzdWljaWRlCmt5cwpjaG9rZQpzdHJhbmdsZQpzdWZmb2NhdGUKc2xhcApzcGl0CnNoaXQKZmlsdGgKZ3JpbWUKbXVjawpzbGltZQpzY3VtCnZlcm1pbgpwZXN0CnBhcmFzaXRlCmxlZWNoCm1hZ2dvdApqYWNrYXNzCm11bGUKb3gKYnVsbAp2dWx0dXJlCnNuYWtlCmxpemFyZApiYXN0aGkgZ2FkdQpiYXN0aGkgZmVsbG93CnJvYWQgZmVsbG93CnJvYWQgZ2FkdQpzdHJlZXQgZmVsbG93CnBhdmVtZW50IGZlbGxvdwpmb290cGF0aCBnYWR1CnNpZ25hbCBnYWR1CnRyYWZmaWMgZmVsbG93CmF1dG8gZ2FkdQpyaWtzaGEgZ2FkdQpjb29saWUKY29vbGllIGdhZHUKbGFib3IgZ2FkdQpzd2VlcGVyIGdhZHUKZ2FyYmFnZSBnYWR1CmR1c3RiaW4gZ2FkdQp0b2lsZXQgZ2FkdQpndXR0ZXIgZ2FkdQpkcmFpbiBnYWR1CnNld2VyIGdhZHUKbWFuaG9sZSBnYWR1Cm5lZSBpbnRsbwpuZWUgaW50aSB2YWx1Cm5lZSBmYW1pbHkKbmVlIHBhcmVudHMKbmVlIGZhdGhlcgpuZWUgbW90aGVyCm5lZSBzaXN0ZXIKbmVlIGJyb3RoZXIKbmVlIHdpZmUKbmVlIGh1c2JhbmQKZ3VkZGEgbG8KcHVrdSBsbwptb2RkYSBsbwpsYXZhZGEgbG8KYm9ra2EgbG8Kbm90bG8gcGV0dGkKZ3VkZGFsbyBwZXR0aQpwdWt1bG8gcGV0dGkKbW9kZGFsbyBwZXR0aQpkZW5ndXRhbnUKZGVuZ2VzdGEKZGVuZ2FsaQpkZW5naWNodWtvCmRlbmdleSByYQpkZW5nZXkgbGUKZGVuZ2lwb3RoYQpkZW5naXBveWEKZGVuZ2lwb3lpbmEKZGVuZ2luY2h1a3VubmEKZGVuZ2ljaHVrdW50dW5uYQpkZW5ndXR1bm5hCmRlbmd1dHVubmFudQpkZW5ndXR1bm5hdgpkZW5ndXR1bm5hZHUKZGVuZ3V0dW5uYWRpCmRlbmd1dHVubmFtCmRlbmd1dHVubmFydQpkZW5ndXR1bm5haQpkZW5nYW51CmRlbmdhdgpkZW5nYWR1CmRlbmdpbmRpCmRlbmdhbQpkZW5nYXJ1CmRlbmdhaQpkZW5naW5hCmRlbmdpbmF2CmRlbmdpbmFkdQpkZW5naW5hZGkKZGVuZ2luYW0KZGVuZ2luYXJ1CmRlbmdpbmFpCmRlbmdlc2FudQpkZW5nZXNhdgpkZW5nZXNhZHUKZGVuZ2VzYWRpCmRlbmdlc2FtCmRlbmdlc2FydQpkZW5nZXNhaQpkZW5nZXN0YXYKZGVuZ2VzdGFkdQpkZW5nZXN0YWRpCmRlbmdlc3RhbQpkZW5nZXN0YXJ1CmRlbmdlc3
RhaQpkZW5ndXRhdgpkZW5ndXRhZHUKZGVuZ3V0YWRpCmRlbmd1dGFtCmRlbmd1dGFydQpkZW5ndXRhaQpwdWt1bG8gZGVuZ3V0YW51Cmd1ZGRhbG8gZGVuZ3V0YW51Cm5vdGxvIGRlbmd1dGFudQpib2trYWxvIGRlbmd1dGFudQpsYXZhZGFsbyBkZW5ndXRhbnUKbW9kZGFsbyBkZW5ndXRhbnUKcHVrdSBkZW5ndXRhbnUKZ3VkZGEgZGVuZ3V0YW51CmJva2thIGRlbmd1dGFudQpsYXZhZGEgZGVuZ3V0YW51Cm1vZGRhIGRlbmd1dGFudQpub3J1IGRlbmd1dGFudQpwdWt1IHJhCmd1ZGRhIHJhCmJva2thIHJhCmxhdmFkYSByYQptb2RkYSByYQpub3J1IHJhCnB1a3UgbGUKZ3VkZGEgbGUKYm9ra2EgbGUKbGF2YWRhIGxlCm1vZGRhIGxlCm5vcnUgbGUKcHVrdSBsYW5qYQpndWRkYSBsYW5qYQpib2trYSBsYW5qYQpsYXZhZGEgbGFuamEKbW9kZGEgbGFuamEKbm9ydSBsYW5qYQpwdWt1IGtvZGFrYQpndWRkYSBrb2Rha2EKYm9ra2Ega29kYWthCmxhdmFkYSBrb2Rha2EKbW9kZGEga29kYWthCm5vcnUga29kYWthCnB1a3UgbXVuZGEKZ3VkZGEgbXVuZGEKYm9ra2EgbXVuZGEKbGF2YWRhIG11bmRhCm1vZGRhIG11bmRhCm5vcnUgbXVuZGEKcHVrdSBkb25nYQpndWRkYSBkb25nYQpib2trYSBkb25nYQpsYXZhZGEgZG9uZ2EKbW9kZGEgZG9uZ2EKbm9ydSBkb25nYQpwdWt1IGVycmkKZ3VkZGEgZXJyaQpib2trYSBlcnJpCmxhdmFkYSBlcnJpCm1vZGRhIGVycmkKbm9ydSBlcnJpCnB1a3UgcGljY2hpCmd1ZGRhIHBpY2NoaQpib2trYSBwaWNjaGkKbGF2YWRhIHBpY2NoaQptb2RkYSBwaWNjaGkKbm9ydSBwaWNjaGkKcHVrdSB3YXN0ZQpndWRkYSB3YXN0ZQpib2trYSB3YXN0ZQpsYXZhZGEgd2FzdGUKbW9kZGEgd2FzdGUKbm9ydSB3YXN0ZQpwdWt1bG8gcGV0dGkgZGVuZ3V0YW51Cmd1ZGRhbG8gcGV0dGkgZGVuZ3V0YW51Cm5vdGxvIHBldHRpIGRlbmd1dGFudQpib2trYWxvIHBldHRpIGRlbmd1dGFudQpsYXZhZGFsbyBwZXR0aSBkZW5ndXRhbnUKbW9kZGFsbyBwZXR0aSBkZW5ndXRhbnUKeW91IGFyZSBzdHVwaWQKeW91IGFyZSBhbiBpZGlvdAp5b3UncmUgc28gZHVtYgp3aGF0IGEgbG9zZXIKaSB3aWxsIGZpbmQgeW91CnlvdSBkZXNlcnZlIHRvIGRpZQppIGhhdGUgeW91CnlvdSdyZSBkaXNndXN0aW5nCm5vYm9keSBsaWtlcyB5b3UKeW91J3JlIHBhdGhldGljCmdldCBsb3N0Cm5vYm9keSBhc2tlZAp5b3UncmUgd29ydGhsZXNzCnlvdSdyZSB0cmFzaApraWxsIHlvdXJzZWxmCnlvdSdyZSB1Z2x5CnlvdSdyZSBhbm5veWluZwpnbyB0byBoZWxsCnN0dXBpZCBnYSB1bm5hdgp0aGlzIGlzIGdhcmJhZ2UKbm9ib2R5IGFza2VkIGZvciB5b3VyIG9waW5pb24K
data/telugu_badwords.txt ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ donga
2
+ vedhava
3
+ vedava
4
+ pandi
5
+ kukka
6
+ nee amma
7
+ nee abba
8
+ chatta
9
+ waste fellow
10
+ item
11
+ low class
12
+ picchi
13
+ erripappa
14
+ erri
15
+ mundamopi
16
+ daridrya
17
+ sani
18
+ paniki malina
19
+ thikka
20
+ worst fellow
21
+ bloody fool
22
+ useless fellow
23
+ lanja
24
+ lanjakodaka
25
+ lanja kodaka
26
+ munda
27
+ modda
28
+ dengu
29
+ dengey
30
+ dengutha
31
+ nee yabba
32
+ chavata
33
+ sannasi
34
+ luccha
35
+ howle
36
+ puku
37
+ puk
38
+ madda
39
+ lavada
40
+ kojja
41
+ hijra
42
+ bocchu
43
+ ne yamma
44
+ ne ayya
45
+ boku
46
+ badcow
47
+ erri puku
48
+ pichi lanjakodaka
49
+ bokulo
50
+ gudda
51
+ musali
52
+ ni bondha
53
+ ni abba
54
+ chettana kodaka
55
+ durmargudu
56
+ nee ayya
57
+ chatta na kodaka
58
+ pichi pulka
59
+ erri pushpam
60
+ waste gadu
61
+ ne kamma
62
+ waste na kodaka
63
+ poramboku
64
+ siggu leni
65
+ lajja
66
+ yerri
67
+ bewarsi
68
+ bewars
69
+ pakodi
70
+ pulka
71
+ buffoon
72
+ scoundrel
73
+ rascal
74
+ idiot
75
+ stupid
76
+ loser
77
+ loafer
78
+ rowdy
79
+ 420
80
+ dongana kodaka
81
+ nee fuku
82
+ konda erri
83
+ pook
84
+ pooku
85
+ moddalo
86
+ lavadalo
87
+ sulli
88
+ sulliga
89
+ labor na kodaka
90
+ chapri
91
+ chapri gadu
92
+ errihook
93
+ hook gadu
94
+ bhadcow
95
+ bhadkaw
96
+ howla
97
+ jaffa
98
+ gajulatho
99
+ kojja na kodaka
100
+ shikandi
101
+ fake gadu
102
+ fraud gadu
103
+ dunnapothu
104
+ gaadida
105
+ gadida
106
+ buffalo
107
+ monkey
108
+ kothi
109
+ kothi vedhava
110
+ sollu
111
+ solu
112
+ sollu gadu
113
+ vadhura
114
+ odiyamma
115
+ yadava
116
+ yedava
117
+ tudumu
118
+ waste body
119
+ chetha
120
+ chetta
121
+ pendu
122
+ trash
123
+ garbage
124
+ dirty fellow
125
+ nasty
126
+ cheap fellow
127
+ low class fellow
128
+ third class
129
+ 3rd class
130
+ 4th class
131
+ mentalo
132
+ psycho
133
+ sadist
134
+ thupuk
135
+ worst gadu
136
+ pichi na kodaka
137
+ erri na kodaka
138
+ donga na kodaka
139
+ donga munda
140
+ ranku munda
141
+ bazaru munda
142
+ bazar dana
143
+ road meeda tige dana
144
+ tirugubothu
145
+ thirugubothu
146
+ taagubothu
147
+ tagubothu
148
+ jule
149
+ julayi
150
+ avalanja
151
+ addamina
152
+ addamina panulu
153
+ neeku enduku ra
154
+ neeku enduku
155
+ musukoni kurcho
156
+ musuko
157
+ shut up
158
+ close your mouth
159
+ noru muyyi
160
+ noru musuko
161
+ notlo
162
+ guddalo
163
+ basthi
164
+ slum
165
+ slum fellow
166
+ local gadu
167
+ uncivilized
168
+ barbarian
169
+ brute
170
+ savage
171
+ rakshasuda
172
+ rakshasi
173
+ witch
174
+ bitch
175
+ slut
176
+ whore
177
+ prostitute
178
+ bastard
179
+ asshole
180
+ fucker
181
+ motherfucker
182
+ sister fucker
183
+ brother fucker
184
+ father fucker
185
+ dick
186
+ cock
187
+ pussy
188
+ cunt
189
+ tits
190
+ boobs
191
+ nipple
192
+ penis
193
+ vagina
194
+ fuck
195
+ fucking
196
+ fucked
197
+ screwed
198
+ shagged
199
+ humped
200
+ creampie
201
+ sperm
202
+ semen
203
+ jizz
204
+ spunk
205
+ squirt
206
+ horny
207
+ rape
208
+ molest
209
+ harass
210
+ assault
211
+ abuse
212
+ violate
213
+ degrade
214
+ humiliate
215
+ suicide
216
+ kys
217
+ choke
218
+ strangle
219
+ suffocate
220
+ slap
221
+ spit
222
+ shit
223
+ filth
224
+ grime
225
+ muck
226
+ slime
227
+ scum
228
+ vermin
229
+ pest
230
+ parasite
231
+ leech
232
+ maggot
233
+ jackass
234
+ mule
235
+ ox
236
+ bull
237
+ vulture
238
+ snake
239
+ lizard
240
+ basthi gadu
241
+ basthi fellow
242
+ road fellow
243
+ road gadu
244
+ street fellow
245
+ pavement fellow
246
+ footpath gadu
247
+ signal gadu
248
+ traffic fellow
249
+ auto gadu
250
+ riksha gadu
251
+ coolie
252
+ coolie gadu
253
+ labor gadu
254
+ sweeper gadu
255
+ garbage gadu
256
+ dustbin gadu
257
+ toilet gadu
258
+ gutter gadu
259
+ drain gadu
260
+ sewer gadu
261
+ manhole gadu
262
+ nee intlo
263
+ nee inti valu
264
+ nee family
265
+ nee parents
266
+ nee father
267
+ nee mother
268
+ nee sister
269
+ nee brother
270
+ nee wife
271
+ nee husband
272
+ gudda lo
273
+ puku lo
274
+ modda lo
275
+ lavada lo
276
+ bokka lo
277
+ notlo petti
278
+ guddalo petti
279
+ pukulo petti
280
+ moddalo petti
281
+ dengutanu
282
+ dengesta
283
+ dengali
284
+ dengichuko
285
+ dengey ra
286
+ dengey le
287
+ dengipotha
288
+ dengipoya
289
+ dengipoyina
290
+ denginchukunna
291
+ dengichukuntunna
292
+ dengutunna
293
+ dengutunnanu
294
+ dengutunnav
295
+ dengutunnadu
296
+ dengutunnadi
297
+ dengutunnam
298
+ dengutunnaru
299
+ dengutunnai
300
+ denganu
301
+ dengav
302
+ dengadu
303
+ dengindi
304
+ dengam
305
+ dengaru
306
+ dengai
307
+ dengina
308
+ denginav
309
+ denginadu
310
+ denginadi
311
+ denginam
312
+ denginaru
313
+ denginai
314
+ dengesanu
315
+ dengesav
316
+ dengesadu
317
+ dengesadi
318
+ dengesam
319
+ dengesaru
320
+ dengesai
321
+ dengestav
322
+ dengestadu
323
+ dengestadi
324
+ dengestam
325
+ dengestaru
326
+ dengestai
327
+ dengutav
328
+ dengutadu
329
+ dengutadi
330
+ dengutam
331
+ dengutaru
332
+ dengutai
333
+ pukulo dengutanu
334
+ guddalo dengutanu
335
+ notlo dengutanu
336
+ bokkalo dengutanu
337
+ lavadalo dengutanu
338
+ moddalo dengutanu
339
+ puku dengutanu
340
+ gudda dengutanu
341
+ bokka dengutanu
342
+ lavada dengutanu
343
+ modda dengutanu
344
+ noru dengutanu
345
+ puku ra
346
+ gudda ra
347
+ bokka ra
348
+ lavada ra
349
+ modda ra
350
+ noru ra
351
+ puku le
352
+ gudda le
353
+ bokka le
354
+ lavada le
355
+ modda le
356
+ noru le
357
+ puku lanja
358
+ gudda lanja
359
+ bokka lanja
360
+ lavada lanja
361
+ modda lanja
362
+ noru lanja
363
+ puku kodaka
364
+ gudda kodaka
365
+ bokka kodaka
366
+ lavada kodaka
367
+ modda kodaka
368
+ noru kodaka
369
+ puku munda
370
+ gudda munda
371
+ bokka munda
372
+ lavada munda
373
+ modda munda
374
+ noru munda
375
+ puku donga
376
+ gudda donga
377
+ bokka donga
378
+ lavada donga
379
+ modda donga
380
+ noru donga
381
+ puku erri
382
+ gudda erri
383
+ bokka erri
384
+ lavada erri
385
+ modda erri
386
+ noru erri
387
+ puku picchi
388
+ gudda picchi
389
+ bokka picchi
390
+ lavada picchi
391
+ modda picchi
392
+ noru picchi
393
+ puku waste
394
+ gudda waste
395
+ bokka waste
396
+ lavada waste
397
+ modda waste
398
+ noru waste
399
+ pukulo petti dengutanu
400
+ guddalo petti dengutanu
401
+ notlo petti dengutanu
402
+ bokkalo petti dengutanu
403
+ lavadalo petti dengutanu
404
+ moddalo petti dengutanu
405
+ you are stupid
406
+ you are an idiot
407
+ you're so dumb
408
+ what a loser
409
+ i will find you
410
+ you deserve to die
411
+ i hate you
412
+ you're disgusting
413
+ nobody likes you
414
+ you're pathetic
415
+ get lost
416
+ nobody asked
417
+ you're worthless
418
+ you're trash
419
+ kill yourself
420
+ you're ugly
421
+ you're annoying
422
+ go to hell
423
+ stupid ga unnav
424
+ this is garbage
425
+ nobody asked for your opinion
export_badwords.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import pandas as pd
4
+ from pathlib import Path
5
+
6
def export_badwords_to_excel(output_filename="custom_badwords_dataset.xlsx"):
    """Collect every offensive term under ``data/`` and export it to Excel.

    Terms are read from the plain word list, the base64-encoded word list
    and the emoji blacklist. Duplicates are removed while keeping the order
    of first appearance, so repeated runs produce the same workbook.

    Args:
        output_filename: Name of the workbook created inside ``data/``.

    Returns:
        None. Nothing is written when no terms are found.
    """
    data_dir = Path("data")

    def _entries(lines, skip_comments):
        """Yield stripped, non-empty lines, optionally dropping '#' comments."""
        for line in lines:
            entry = line.strip()
            if not entry:
                continue
            if skip_comments and entry.startswith("#"):
                continue
            yield entry

    toxic_words = []

    # 1. Load regular badwords ('#' lines are comments, as in the migration tool)
    p1 = data_dir / "telugu_badwords.txt"
    if p1.exists():
        with open(p1, "r", encoding="utf-8") as f:
            toxic_words.extend(_entries(f, skip_comments=True))

    # 2. Load secure base64 badwords (stored entries are kept verbatim)
    p2 = data_dir / "secure_words.bin"
    if p2.exists():
        with open(p2, "rb") as f:
            decoded = base64.b64decode(f.read()).decode("utf-8")
        toxic_words.extend(_entries(decoded.splitlines(), skip_comments=False))

    # 3. Load bad emojis
    p3 = data_dir / "bad_emojis.txt"
    if p3.exists():
        with open(p3, "r", encoding="utf-8") as f:
            toxic_words.extend(_entries(f, skip_comments=True))

    # Remove duplicates; dict.fromkeys keeps first-seen order, whereas a set
    # would order the rows differently from run to run.
    toxic_words = list(dict.fromkeys(toxic_words))
    print(f"Total unique offensive terms gathered: {len(toxic_words)}")

    if not toxic_words:
        print("No words found to export.")
        return

    # Every gathered term is exported with the single label 'toxic'.
    df = pd.DataFrame({
        'text': toxic_words,
        'label': 'toxic'
    })

    # Save to Excel
    output_path = data_dir / output_filename
    df.to_excel(output_path, index=False)
    print(f"Successfully exported {len(toxic_words)} words to {output_path}")
48
+
49
+ if __name__ == "__main__":
50
+ export_badwords_to_excel()
inspect_data.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+
4
# Write a short summary of the training workbook to inspect_out.txt:
# columns, shape, label distribution, first rows and missing-value counts.
with open('inspect_out.txt', 'w', encoding='utf-8') as f:
    f.write("Loading dataset...\n")
    try:
        df = pd.read_excel('data/training_data_telugu-hate.xlsx')
        f.write("Columns: " + str(df.columns.tolist()) + "\n")
        f.write("Shape: " + str(df.shape) + "\n")
        # Match the label column case-insensitively; an exact lowercase
        # 'label' test misses a column that is spelled 'Label'.
        label_col = next(
            (c for c in df.columns if str(c).strip().lower() == 'label'),
            None,
        )
        if label_col is not None:
            f.write(f"Value Counts for '{label_col}':\n" + str(df[label_col].value_counts()) + "\n")
        f.write("\nFirst 5 rows:\n")
        f.write(str(df.head()) + "\n")

        # Look for missing values
        f.write("\nMissing Values:\n" + str(df.isnull().sum()) + "\n")
    except Exception as e:
        # The error is recorded in the report file instead of being raised.
        f.write("Error: " + str(e) + "\n")
kaggle_training_v3.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ KAGGLE MODEL V3: Aiming for 90%+ Accuracy without Overfitting
3
+ Optimizations:
4
+ 1. Increased Dataset Size: More diverse templates and safe phrases for data augmentation.
5
+ 2. Data Text Cleaning: Removed URLs, extra spaces, and user mentions to reduce noise.
6
+ 3. Class Balancing: Automatically oversamples the minority class to perfectly balance the dataset.
7
+ 4. Overfitting Prevention: Added Label Smoothing, Cosine Learning Rate Scheduler,
8
+ Warmup steps, and appropriate Weight Decay.
9
+ 5. Model: Using 'google/muril-base-cased' which is highly optimized for Indian languages
10
+ including Telugu, better for code-mixed text. Added custom dropout to config.
11
+ """
12
+
13
+ import os
14
+ import sys
15
+ import json
16
+ import base64
17
+ import random
18
+ import re
19
+ from pathlib import Path
20
+
21
# Switch stdout to UTF-8 so the non-ASCII text printed by this script does
# not fail on consoles with a narrower default encoding. Best effort only:
# streams without `reconfigure` are left as they are.
try:
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

print("DEBUG: Kaggle V3 Training Script started", flush=True)

# ── Paths ────────────────────────────────────────────────────────────────────
KAGGLE_INPUT = Path("/kaggle/input")
KAGGLE_OUTPUT = Path("/kaggle/working")

DATA_DIR = None
print(f"DEBUG: Checking for data in {KAGGLE_INPUT}...", flush=True)

if KAGGLE_INPUT.is_dir():
    # Candidates are sorted so the same directory is chosen on every run
    # when several datasets are attached.
    # First choice: a direct child directory holding a *training_data* file.
    DATA_DIR = next(
        (
            p for p in sorted(KAGGLE_INPUT.glob("*"))
            if p.is_dir() and any(p.glob("*training_data*"))
        ),
        None,
    )
    # Second choice: the parent of any *training_data* entry at any depth.
    if DATA_DIR is None:
        DATA_DIR = next(
            (p.parent for p in sorted(KAGGLE_INPUT.rglob("*training_data*"))),
            None,
        )

# Fallback: the conventional dataset folder name.
if DATA_DIR is None:
    DATA_DIR = KAGGLE_INPUT / "comment-guard-data"

print(f"DEBUG: Using data directory {DATA_DIR}", flush=True)

OUTPUT_DIR = KAGGLE_OUTPUT / "model_output_v3"
51
+
52
+ # ── Dependencies ─────────────────────────────────────────────────────────────
53
+ try:
54
+ import torch
55
+ import transformers
56
+ from transformers import (
57
+ AutoTokenizer,
58
+ AutoModelForSequenceClassification,
59
+ AutoConfig,
60
+ TrainingArguments,
61
+ Trainer,
62
+ EarlyStoppingCallback
63
+ )
64
+ import pandas as pd
65
+ import openpyxl
66
+ import sklearn
67
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
68
+ import numpy as np
69
+ from torch.utils.data import Dataset as TorchDataset
70
+ from sklearn.model_selection import train_test_split
71
+ except ImportError:
72
+ print("⚠ Please run: !pip install transformers torch scikit-learn accelerate openpyxl pandas -q")
73
+ sys.exit(1)
74
+
75
+ # ── Config ────────────────────────────────────────────────────────────────────
76
+ BASE_MODEL = "google/muril-base-cased" # Great for Telugu/Code-mixed
77
+ MAX_LENGTH = 128
78
+ EPOCHS = 10 # High max epochs, relying on early stopping
79
+ LEARNING_RATE = 2e-5
80
+ WEIGHT_DECAY = 0.05
81
+ LABEL_SMOOTHING = 0.1 # Helps prevent overfitting by softening labels
82
+ WARMUP_RATIO = 0.1 # Gradual learning rate increase
83
+
84
+ # ── Functions ────────────────────────────────────────────────────────────────
85
+
86
def clean_text(text):
    """Normalise one comment for training.

    The value is converted to ``str`` and lower-cased; URLs (``http...``),
    ``@mentions`` and ``#hashtags`` are removed; runs of whitespace collapse
    to one space and the result is stripped.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        The cleaned string.
    """
    cleaned = str(text).lower()
    # Applied in this order: URLs, then mentions, then hashtags.
    for noise_pattern in (r'http\S+', r'@\w+', r'#\w+'):
        cleaned = re.sub(noise_pattern, '', cleaned)
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()
93
+
94
def is_code_mixed(text):
    """Return True when *text* contains at least one ASCII Latin letter.

    Despite the name, the only condition checked is the presence of a
    letter in A-Z or a-z; empty or whitespace-only text yields False.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        bool
    """
    # Test for letters explicitly. A plain 'A'..'z' code-point range would
    # also admit the punctuation between 'Z' and 'a' ([ \ ] ^ _ `), letting
    # letter-free text such as "^_^" through.
    return any(c.isascii() and c.isalpha() for c in str(text))
101
+
102
def _label_to_int(raw_label, hate_labels):
    """Map one raw label cell to 1 (hate) or 0 (anything else)."""
    token = str(raw_label).strip().lower()
    # A numeric label column that contained blanks is read as float, so the
    # label 1 arrives as "1.0"; fold whole numbers back to "1" / "0".
    try:
        number = float(token)
    except ValueError:
        number = None
    if number is not None and number.is_integer():
        token = str(int(number))
    return 1 if token in hate_labels else 0


def load_data(files):
    """Load labelled comments from CSV/Excel files into one DataFrame.

    For every sheet, the first column whose name looks like a text column
    and the first whose name looks like a label column are selected. Rows
    with missing values are dropped, the text is passed through
    ``clean_text``, and only rows accepted by ``is_code_mixed`` are kept.

    Args:
        files: Iterable of paths (``str`` or ``Path``). A ``.csv`` suffix
            selects the CSV reader; every other file is opened as an Excel
            workbook and all of its sheets are read.

    Returns:
        DataFrame with columns ``text``, ``label`` (the original label
        value) and ``label_int`` (1 for hate-like labels, otherwise 0). It
        is empty when nothing could be loaded. A file that fails to load is
        reported and skipped.
    """
    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}
    frames = []
    TEXT_NAMES = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}

    for excel_file in files:
        try:
            source = Path(excel_file)
            if source.suffix.lower() == '.csv':
                sheets = [pd.read_csv(source)]
            else:
                # The context manager closes the workbook handle after parsing.
                with pd.ExcelFile(source) as xl:
                    sheets = [xl.parse(sheet) for sheet in xl.sheet_names]

            for df in sheets:
                text_col = next((c for c in df.columns if str(c).lower() in TEXT_NAMES or any(t in str(c).lower() for t in ['text', 'comment', 'sentence'])), None)
                label_col = next((c for c in df.columns if str(c).lower() in LABEL_NAMES or any(t in str(c).lower() for t in ['label', 'categor', 'class'])), None)
                if text_col is None or label_col is None:
                    continue

                sub = df[[text_col, label_col]].copy()
                sub.columns = ['text', 'label']
                sub = sub.dropna()
                if sub.empty:
                    continue
                sub['text'] = sub['text'].apply(clean_text)
                sub['label_int'] = sub['label'].apply(lambda value: _label_to_int(value, hate_labels_set))
                keep = sub['text'].apply(is_code_mixed).astype(bool)
                frames.append(sub[keep].reset_index(drop=True))
        except Exception as e:
            # Best effort: report the file and continue with the others.
            print(f"Error loading {excel_file}: {e}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=['text', 'label', 'label_int'])
134
+
135
def load_badwords_augmented():
    """Build synthetic toxic/safe training rows from the bad-word lists.

    Reads, when present under ``DATA_DIR``:
      * ``telugu_badwords.txt``  - one entry per line (UTF-8),
      * ``secure_words.bin``     - base64-encoded UTF-8 text, one entry per line,
      * ``bad_emojis.txt``       - one entry per line, lines starting with ``#`` ignored.

    For every distinct entry, 4 toxic sentences are generated from randomly
    sampled templates (``label_int`` = 1) together with 4 randomly chosen safe
    phrases (``label_int`` = 0).

    Returns:
        A DataFrame with columns ``text`` and ``label_int``, or an empty
        ``pd.DataFrame()`` when no entries were found.

    Side effect: re-seeds the global ``random`` module with 42.
    """
    toxic_words = []
    plain_path = DATA_DIR / "telugu_badwords.txt"
    secure_path = DATA_DIR / "secure_words.bin"
    emoji_path = DATA_DIR / "bad_emojis.txt"

    if plain_path.exists():
        with open(plain_path, "r", encoding="utf-8") as f:
            toxic_words.extend(line.strip() for line in f if line.strip())
    if secure_path.exists():
        with open(secure_path, "rb") as f:
            decoded = base64.b64decode(f.read()).decode("utf-8")
        toxic_words.extend(line.strip() for line in decoded.splitlines() if line.strip())
    if emoji_path.exists():
        with open(emoji_path, "r", encoding="utf-8") as f:
            toxic_words.extend(
                line.strip() for line in f
                if line.strip() and not line.strip().startswith("#")
            )

    if not toxic_words:
        return pd.DataFrame()

    random.seed(42)
    # Increased variety
    toxic_templates = [
        "{word}", "you are a {word}", "{word} ga unnav", "enti ra {word}",
        "nuvvu {word}", "{word} fellow", "worst {word}", "rey {word}",
        "ni yamma {word} nayala", "nuvvu pedda {word}", "chi {word} badava",
        "endira ee {word} panulu", "tuppas {word} mokam", "nee lanti {word} inka evaru leru"
    ]

    safe_phrases = [
        "bagundi bro", "keep it up", "manchi video", "super explanation", "thanks for sharing",
        "helpful information", "nice edit", "waiting for next video", "super ga undi",
        "love from ap", "good job", "congratulations brother", "beautiful video", "awesome music",
        "next video eppudu?", "very interesting topic", "I learned a lot today", "nice talk",
        "informative content", "meeru chala baga chepparu", "meeru chala handsome", "super anna",
        "daily chustanu mee videos", "proud of you", "all the best for your future", "fantastic editing",
        "thank you so much", "very nice presentation", "please upload more", "hello everyone",
        "good morning brother", "have a great day ahead", "chala upayoga padindi", "excellent work"
    ]

    templates_per_word = min(4, len(toxic_templates))
    rows = []
    # sorted(): a plain set of strings iterates in a different order on every
    # interpreter start (hash randomisation), which made the seeded sampling
    # below non-reproducible between runs.
    for word in sorted(set(toxic_words)):
        # Generate 4 toxic examples per word
        for template in random.sample(toxic_templates, templates_per_word):
            rows.append({'text': template.format(word=word), 'label_int': 1})
        # Generate 4 safe examples to match
        for _ in range(4):
            rows.append({'text': random.choice(safe_phrases), 'label_int': 0})

    return pd.DataFrame(rows)
178
+
179
+ # ── Main Execution ───────────────────────────────────────────────────────────
180
+
181
if not DATA_DIR.exists():
    print(f"✗ ERROR: DATA_DIR {DATA_DIR} not found. Ensure dataset is added to notebook.")
    sys.exit(1)

# Collect every training file, then append the synthetic bad-word rows.
train_files = [f for f in DATA_DIR.iterdir() if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]
all_data = load_data(train_files)
aug_data = load_badwords_augmented()
if not aug_data.empty:
    all_data = pd.concat([all_data, aug_data], ignore_index=True)

all_data = all_data.drop_duplicates(subset='text').reset_index(drop=True)

# Train/Test Split
# The split is taken BEFORE oversampling. Oversampling first duplicates minority
# rows, and those copies then land on both sides of the split, so the model is
# evaluated on sentences it was trained on and the reported metrics are inflated.
train_df, test_df = train_test_split(all_data, test_size=0.10, random_state=42, stratify=all_data['label_int'])

# V3: DYNAMIC OVERSAMPLING & BALANCING (training split only; test_df keeps the
# natural class distribution)
counts = train_df['label_int'].value_counts()
if len(counts) == 2:
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()
    majority_count = counts[majority_class]
    minority_count = counts[minority_class]

    if minority_count < majority_count:
        df_majority = train_df[train_df['label_int'] == majority_class]
        df_minority = train_df[train_df['label_int'] == minority_class]

        # Oversample minority (sampling with replacement up to the majority size)
        df_minority_over = df_minority.sample(majority_count, replace=True, random_state=42)
        train_df = pd.concat([df_majority, df_minority_over], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"DEBUG: Oversampled class {minority_class} to {majority_count}. Total rows symmetrically balanced: {len(train_df)}")
212
+
213
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
214
+
215
+ # Incorporating Dropout into config to prevent overfitting
216
+ config = AutoConfig.from_pretrained(BASE_MODEL, num_labels=2, problem_type="single_label_classification")
217
+ config.hidden_dropout_prob = 0.2
218
+ config.attention_probs_dropout_prob = 0.2
219
+
220
+ model = AutoModelForSequenceClassification.from_pretrained(
221
+ BASE_MODEL,
222
+ config=config,
223
+ ignore_mismatched_sizes=True
224
+ )
225
+
226
class CommentDataset(TorchDataset):
    """Torch dataset that tokenizes all texts up front and serves one sample per index.

    Each item is a dict holding one row of every tensor returned by the
    module-level ``tokenizer`` plus a ``labels`` entry (``torch.long`` scalar).
    """

    def __init__(self, texts, labels):
        # Raw texts are kept alongside the encodings.
        self.texts = texts
        self.labels = labels
        # The whole list is tokenized in one call: truncated to MAX_LENGTH and
        # padded, returned as PyTorch tensors.
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
236
+
237
+ train_dataset = CommentDataset(train_df['text'].tolist(), train_df['label_int'].tolist())
238
+ test_dataset = CommentDataset(test_df['text'].tolist(), test_df['label_int'].tolist())
239
+
240
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy, F1, precision and recall.

    Predictions are the arg-max over the last axis of the logits. F1, precision
    and recall are computed with ``zero_division=0``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, preds)}
    for metric_name, scorer in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ):
        metrics[metric_name] = scorer(labels, preds, zero_division=0)
    return metrics
249
+
250
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
251
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
252
+
253
+ training_args = TrainingArguments(
254
+ output_dir=str(OUTPUT_DIR),
255
+ num_train_epochs=EPOCHS,
256
+ per_device_train_batch_size=16 if device == 'cuda' else 8,
257
+ per_device_eval_batch_size=32 if device == 'cuda' else 8,
258
+ learning_rate=LEARNING_RATE,
259
+ weight_decay=WEIGHT_DECAY,
260
+ warmup_ratio=WARMUP_RATIO,
261
+ lr_scheduler_type='cosine', # Cosine learning rate scheduler helps avoid overfitting and local minima
262
+ label_smoothing_factor=LABEL_SMOOTHING, # Distributes a bit of probability mass to other classes, reducing overconfidence
263
+ eval_strategy="epoch",
264
+ save_strategy="no", # CHANGED: Don't save checkpoints to prevent KAGGLE STORAGE OVERFLOW
265
+ load_best_model_at_end=False, # CHANGED: Must be false if we aren't saving checkpoints
266
+ metric_for_best_model="f1",
267
+ report_to="none",
268
+ fp16=(device == 'cuda'),
269
+ logging_steps=50,
270
+ )
271
+
272
+ trainer = Trainer(
273
+ model=model,
274
+ args=training_args,
275
+ train_dataset=train_dataset,
276
+ eval_dataset=test_dataset,
277
+ compute_metrics=compute_metrics,
278
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
279
+ )
280
+
281
+ print(f"Starting V3 training on {device}...")
282
+ trainer.train()
283
+
284
+ # Evaluate & Print Results
285
+ print("\n📊 EVALUATING MODEL V3...")
286
+ results = trainer.evaluate()
287
+ print(f"\n{'='*50}\n🏆 V3 FINAL ACCURACY: {results.get('eval_accuracy', 0)*100:.2f}%\n{'='*50}")
288
+
289
+ # --- CRITICAL KAGGLE STORAGE FIX ---
290
+ # Free up disk space before saving by clearing the HuggingFace cache and previous runs
291
+ print("\n🧹 Clearing disk space...")
292
+ import shutil
293
+ import gc
294
+
295
+ # 1. Clear large dataframes and run garbage collection
296
+ del all_data, train_df, test_df, train_dataset, test_dataset
297
+ gc.collect()
298
+
299
+ # 2. Clear known cache directories
300
for cache_path in [".cache/huggingface", ".cache/torch"]:
    cache_dir = Path.home() / cache_path
    if cache_dir.exists():
        try:
            shutil.rmtree(cache_dir)
            print(f"✅ Cleared {cache_dir}")
        except Exception as e:
            # Best effort: keep going, but say why the space was not freed so a
            # later out-of-disk error while saving can be traced back to it.
            print(f"⚠ Could not clear {cache_dir}: {e}")

# 3. Aggressively delete OLD model outputs in /kaggle/working to free up 100s of MBs
for old_dir in ["model_output", "model_output_v2", "wandb"]:
    old_path = KAGGLE_OUTPUT / old_dir
    if old_path.exists():
        try:
            shutil.rmtree(old_path)
            print(f"✅ Deleted old directory: {old_path}")
        except Exception as e:
            # Best effort, same as above: report and continue.
            print(f"⚠ Could not delete {old_path}: {e}")
318
+
319
+ # Save
320
+ try:
321
+ trainer.save_model(str(OUTPUT_DIR))
322
+ tokenizer.save_pretrained(str(OUTPUT_DIR))
323
+ with open(OUTPUT_DIR / "eval_results.json", 'w') as f: json.dump(results, f, indent=2)
324
+ print(f"✅ Model saved successfully to: {OUTPUT_DIR}")
325
+ except OSError as e:
326
+ print(f"\n❌ FATAL SAVING ERROR: {e}")
327
+ print("Kaggle ran out of disk space again! Try restarting your session or using a smaller BASE_MODEL.")
main.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from transformers import pipeline
5
+ from better_profanity import profanity
6
+ from typing import List, Dict
7
+ import re
8
+
9
+ # Mild/acceptable words that better_profanity should NOT flag.
10
+ # Using the library's built-in whitelist_words param is the most reliable fix.
11
+ MILD_WORDS_WHITELIST = [
12
+ "damn", "hell", "crap", "dang", "heck", "shoot", "frick", "freaking",
13
+ "sucks", "suck", "bloody", "piss", "pissed",
14
+ ]
15
+
16
+ # Initialize profanity filter with whitelisted mild words so they never trigger
17
+ profanity.load_censor_words(whitelist_words=MILD_WORDS_WHITELIST)
18
+
19
+ # Keep a set for the manual cleanup fallback (covers multi-word phrases)
20
+ PROFANITY_WHITELIST = set(MILD_WORDS_WHITELIST) | {"keep it up", "great post"}
21
+
22
+ # Pre-compiled regex patterns for profanity whitelist
23
+ PROFANITY_WHITELIST_PATTERNS = {word: re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE) for word in PROFANITY_WHITELIST}
24
+
25
def is_whitelisted(text: str) -> bool:
    """Return True if no profanity remains once whitelisted words/phrases are removed.

    The text is lower-cased, every pattern in ``PROFANITY_WHITELIST_PATTERNS``
    is blanked out, and the remainder is re-checked with the profanity filter.
    """
    remainder = text.lower()
    for allowed_pattern in PROFANITY_WHITELIST_PATTERNS.values():
        remainder = allowed_pattern.sub("", remainder)
    still_flagged = profanity.contains_profanity(remainder)
    return not still_flagged
31
+
32
+ # Keyword-based insult/threat detector to catch what the ML model misses.
33
+ # Unicode apostrophe class ['‘’] handles both ASCII (') and curly (’) apostrophes.
34
+ INSULT_KEYWORDS = [
35
+ # --- English insults / threats ---
36
+ r"\byou['‘’]?re so dumb\b",
37
+ r"\bwhat a loser\b",
38
+ r"\bi will find you\b",
39
+ r"\byou deserve to die\b",
40
+ r"\bi hate you\b",
41
+ r"\byou['‘’]?re disgusting\b",
42
+ r"\bnobody likes you\b",
43
+ r"\byou['‘’]?re pathetic\b",
44
+ r"\bget lost\b",
45
+ r"\bnobody asked\b",
46
+ r"\byou['‘’]?re worthless\b",
47
+ r"\byou['‘’]?re trash\b",
48
+ r"\bkill yourself\b",
49
+ r"\bgo kill yourself\b",
50
+ r"\byou['‘’]?re ugly\b",
51
+ r"\bshut up\b",
52
+ r"\byou['‘’]?re annoying\b",
53
+ r"\bgo to hell\b",
54
+ r"\bstupid ga\b",
55
+ r"\bwaste fellow\b",
56
+ r"\byou['‘’]?re an idiot\b",
57
+ r"\bthis is garbage\b",
58
+ r"\byou are stupid\b",
59
+ r"\byou are an idiot\b",
60
+ r"\byou['‘’]?re dumb\b",
61
+ r"\bstupid idiot\b",
62
+ r"\bbloody fool\b",
63
+ # --- Telugu-English compound insults: [insult word] + gadu/fellow/vaadu ---
64
+ r"\b(?:buffalo|monkey|mental|psycho|cheap|nasty|dirty|useless|worst|scoundrel)"
65
+ r"\s+(?:gadu|fellow|vaadu|ra)\b",
66
+ r"\b(?:rascal|buffoon|loafer|fraud|basthi|chapri|local|rowdy|420|kothi|waste)"
67
+ r"\s+(?:gadu|fellow|vaadu|ra)\b",
68
+ r"\b(?:third\s+class|low\s+class|third-class|low-class)\s+(?:gadu|fellow|vaadu)\b",
69
+ r"\b(?:buffalo|monkey|mental|psycho|cheap|nasty|dirty|useless|worst|scoundrel|rascal|buffoon|loafer|fraud)\s+fellow\b",
70
+ # --- Telugu standalone insult suffixes ---
71
+ r"\bkothi\s+vedhava\b",
72
+ ]
73
+ INSULT_PATTERN = re.compile("|".join(INSULT_KEYWORDS), re.IGNORECASE | re.UNICODE)
74
+
75
def contains_insult_keyword(text: str) -> bool:
    """Return True when any pattern compiled into ``INSULT_PATTERN`` occurs in *text*."""
    # re.search yields a match object or None; a match object is always truthy.
    return INSULT_PATTERN.search(text) is not None
78
+
79
+ # Load Custom Telugu-English Bad Words (Secure)
80
+ import base64
81
+ import os
82
+
83
+ try:
84
+ secure_file_path = "data/secure_words.bin"
85
+ if os.path.exists(secure_file_path):
86
+ with open(secure_file_path, "rb") as f:
87
+ encoded_data = f.read()
88
+ decoded_data = base64.b64decode(encoded_data).decode("utf-8")
89
+ custom_words = [line.strip() for line in decoded_data.splitlines() if line.strip()]
90
+ profanity.add_censor_words(custom_words)
91
+ print(f"Loaded {len(custom_words)} custom bad words from secure storage.")
92
+ else:
93
+ print("Warning: Secure bad words file not found.")
94
+ except Exception as e:
95
+ print(f"Warning: Could not load custom bad words: {e}")
96
+
97
+ # Load Offensive Emojis
98
+ offensive_emojis = set()
99
+ try:
100
+ emoji_file_path = "data/bad_emojis.txt"
101
+ if os.path.exists(emoji_file_path):
102
+ with open(emoji_file_path, "r", encoding="utf-8") as f:
103
+ for line in f:
104
+ line = line.strip()
105
+ if line and not line.startswith("#"):
106
+ offensive_emojis.add(line)
107
+ print(f"Loaded {len(offensive_emojis)} offensive emojis.")
108
+ else:
109
+ print("Warning: Offensive emojis file not found.")
110
+ except Exception as e:
111
+ print(f"Warning: Could not load offensive emojis: {e}")
112
+
113
def contains_offensive_emoji(text: str) -> bool:
    """Return True if any entry of ``offensive_emojis`` appears as a substring of *text*."""
    return any(symbol in text for symbol in offensive_emojis)
119
+
120
+
121
+ app = FastAPI(title="AI Comment Moderation API")
122
+
123
+ app.add_middleware(
124
+ CORSMiddleware,
125
+ allow_origins=["*"],
126
+ allow_credentials=True,
127
+ allow_methods=["*"],
128
+ allow_headers=["*"],
129
+ )
130
+
131
+ # Initialize the toxicity classification pipeline
132
+ # We use 'original' to keep the original distilbert-base-uncased-finetuned-sst-2-english if we wanted simple sentiment
133
+ # However, for toxicity detection in Telugu-English code-mixed content, MuRIL (Multilingual
134
+ # Representations for Indian Languages) BERT is preferred over standard DistilBERT or toxic-bert.
135
+ # MuRIL is specifically trained on Indian languages and handles code-switching much better.
136
+ # Current production model: google/muril-base-cased (fine-tuned)
137
+ import torch
138
+
139
# Optimizations to prevent PyTorch from lagging the entire OS when running on CPU.
# The thread cap used to be read from `config.get("cpu_threads", 4)`, but no
# `config` object exists in this module: the resulting NameError was swallowed
# by the handler below, so the cap was never applied.
CPU_THREAD_LIMIT = 4

device = -1  # pipeline convention: -1 = CPU, 0 = first CUDA device
try:
    if torch.cuda.is_available():
        device = 0  # Use GPU
        print("✓ CUDA GPU detected, running models on GPU for faster inference.")
    else:
        # Limit to 4 threads rather than maxing out the CPU
        torch.set_num_threads(CPU_THREAD_LIMIT)
        print(f"✓ CPU detected, limited PyTorch to {torch.get_num_threads()} threads to prevent system lag.")
except Exception as e:
    # Fall back to CPU, but report the reason instead of hiding it.
    device = -1
    print(f"Warning: could not configure the PyTorch device, falling back to CPU: {e}")
151
+
152
+ try:
153
+ # Use fine-tuned model if available (produced by train_model.py)
154
+ fine_tuned_path = os.path.join(os.path.dirname(__file__), "model_output")
155
+ if os.path.exists(fine_tuned_path) and os.path.exists(os.path.join(fine_tuned_path, "config.json")):
156
+ print(f"✓ Loading fine-tuned model from: {fine_tuned_path}")
157
+ classifier = pipeline("text-classification", model=fine_tuned_path, top_k=None, device=device)
158
+ else:
159
+ print("Loading default model: google/muril-base-cased (Fallback)")
160
+ print("Note: MuRIL is highly recommended for Telugu-English code-mixed content.")
161
+ classifier = pipeline("text-classification", model="google/muril-base-cased", top_k=None, device=device)
162
+ except Exception as e:
163
+ print(f"Error loading model: {e}")
164
+ classifier = None
165
+
166
+
167
+ class CommentRequest(BaseModel):
168
+ text: str
169
+ strictness: str = "high" # "high" (Celeb) or "low" (Friend)
170
+
171
+ class Score(BaseModel):
172
+ label: str
173
+ score: float
174
+
175
+ class AnalysisResponse(BaseModel):
176
+ text: str
177
+ results: List[Score]
178
+ is_toxic: bool
179
+
180
@app.get("/")
def read_root():
    """Root endpoint: report that the API process is running."""
    status = {"message": "AI Comment Moderation API is running"}
    return status
183
+
184
@app.post("/analyze", response_model=AnalysisResponse)
def analyze_comment(request: CommentRequest):
    """Classify a comment as toxic or not.

    Checks run in this order and the first hit decides the verdict:
      1.  profanity filter (ignoring whitelisted mild words/phrases),
      1b. insult/threat keyword patterns,
      2.  offensive emoji list,
      3.  texts shorter than 5 characters are accepted without the ML model,
      4.  ML classifier: toxic if any label in ``TOXIC_LABELS`` scores above
          the threshold (0.4 when ``strictness == "high"``, otherwise 0.7).

    Rule-based hits are reported as a single ``Score`` with score 1.0 and the
    label ``profanity_strict``, ``insult_keyword`` or ``offensive_emoji``.
    When the classifier is not loaded the comment is reported as non-toxic
    with no scores.

    Raises:
        HTTPException(400): the text is empty after stripping whitespace.
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    def _rule_hit(label: str) -> AnalysisResponse:
        """Build the response for a rule-based (non-ML) toxic verdict."""
        return AnalysisResponse(
            text=request.text,
            results=[Score(label=label, score=1.0)],
            is_toxic=True
        )

    # 1. Strict "Bad Word" Check (Rule-based)
    # MILD_WORDS_WHITELIST is already removed from the profanity library's censor list.
    # is_whitelisted() additionally blanks the multi-word safe phrases and
    # re-checks, so only profanity that survives both passes is flagged here.
    if profanity.contains_profanity(text) and not is_whitelisted(text):
        return _rule_hit("profanity_strict")

    # 1b. Keyword-based insult/threat detector (catches ML model blind spots)
    if contains_insult_keyword(text):
        return _rule_hit("insult_keyword")

    # 2. Offensive Emoji Check
    if contains_offensive_emoji(text):
        return _rule_hit("offensive_emoji")

    # 3. Short Text Heuristic
    if len(text) < 5:
        return AnalysisResponse(
            text=request.text,
            results=[],
            is_toxic=False
        )

    # 4. ML Model Check (Context-based)
    if not classifier:
        print("Classifier not loaded, skipping ML check.")
        return AnalysisResponse(text=request.text, results=[], is_toxic=False)

    # truncation=True: without it a comment longer than the model's maximum
    # sequence length raises inside the model and the request fails with a 500.
    results = classifier(text, truncation=True)
    scores = results[0]

    # Define Threshold based on Strictness
    # High (Celeb) = 0.4 (Strict)
    # Low (Friend) = 0.7 (Balanced)
    threshold = 0.4 if request.strictness == "high" else 0.7

    # Labels that indicate toxicity. Ignores 'LABEL_0', 'non-toxic', 'neutral', etc.
    TOXIC_LABELS = {"toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate", "LABEL_1"}

    formatted_scores = [Score(label=item['label'], score=item['score']) for item in scores]

    # Only mark as toxic if the label is in our toxic set AND exceeds threshold
    is_toxic = any(
        entry.label in TOXIC_LABELS and entry.score > threshold
        for entry in formatted_scores
    )

    return AnalysisResponse(
        text=request.text,
        results=formatted_scores,
        is_toxic=is_toxic
    )
265
+
266
@app.post("/submit")
def submit_comment(request: CommentRequest):
    """Mock "post comment" endpoint that re-checks toxicity server-side.

    This is a mock endpoint. In a real app, this would save to DB.

    The comment is run through ``analyze_comment`` so that a client cannot
    bypass moderation by skipping ``/analyze``. The previous implementation
    rejected a comment when *any* classifier label scored above 0.5. Because
    the classifier returns a score for every label, including the non-toxic
    one, a confidently clean comment was rejected as well. It also skipped the
    profanity, insult-keyword and emoji rules.

    Raises:
        HTTPException(500): the ML model is not loaded.
        HTTPException(400): the text is empty, or the comment is toxic.
    """
    if not classifier:
        raise HTTPException(status_code=500, detail="Model not loaded")

    # Same pipeline (rules + ML, honouring request.strictness) as /analyze.
    analysis = analyze_comment(request)

    if analysis.is_toxic:
        raise HTTPException(status_code=400, detail="Comment rejected due to toxicity.")

    return {"message": "Comment posted successfully", "text": request.text}
280
+
281
+ if __name__ == "__main__":
282
+ import uvicorn
283
+ import os
284
+
285
+ # Check for SSL certificates in data directory or root
286
+ key_file = "data/key.pem" if os.path.exists("data/key.pem") else "key.pem"
287
+ cert_file = "data/cert.pem" if os.path.exists("data/cert.pem") else "cert.pem"
288
+
289
+ if os.path.exists(key_file) and os.path.exists(cert_file):
290
+ print(f"Starting server with SSL/HTTPS enabled using {cert_file} and {key_file}...")
291
+ uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True, ssl_keyfile=key_file, ssl_certfile=cert_file)
292
+ else:
293
+ print("SSL certificates not found. Starting server in HTTP mode.")
294
+ uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
merge_datasets.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+
4
def merge_datasets():
    """Append the custom bad-words sheet to the main training workbook.

    Reads ``data/custom_badwords_dataset.xlsx`` and
    ``data/training_data_telugu-hate.xlsx``, renames the custom sheet's
    ``text``/``label`` columns to the names detected in the main sheet,
    concatenates both, drops rows with a duplicate text, writes the original
    main sheet to ``data/training_data_telugu-hate_backup2.xlsx`` and then
    overwrites the main workbook with the merged data.

    Returns None. If either input file is missing, an error is printed and
    nothing is written.
    """
    data_dir = Path("data")
    custom_words_file = data_dir / "custom_badwords_dataset.xlsx"
    main_dataset_file = data_dir / "training_data_telugu-hate.xlsx"

    # Custom file is checked first, then the main dataset.
    for required_file in (custom_words_file, main_dataset_file):
        if not required_file.exists():
            print(f"Error: {required_file} not found.")
            return

    def _first_match(columns, candidates, fallback):
        """Return the first column whose lower-cased name is a candidate, else *fallback*."""
        for column in columns:
            if str(column).lower() in candidates:
                return column
        return fallback

    # Load both datasets
    print("Loading data...")
    custom_df = pd.read_excel(custom_words_file)
    main_df = pd.read_excel(main_dataset_file)

    print(f"Original main dataset size: {len(main_df)}")
    print(f"Custom badwords size: {len(custom_df)}")

    # Identify column names in main_dataset (usually text/comment and label/category)
    text_col_main = _first_match(main_df.columns, ['text', 'comment', 'comments', 'sentence'], 'text')
    label_col_main = _first_match(main_df.columns, ['label', 'labels', 'category', 'class'], 'label')

    print(f"Identified columns in main dataset -> Text: '{text_col_main}', Label: '{label_col_main}'")

    # Rename custom dataset columns to match main dataset
    column_mapping = {'text': text_col_main, 'label': label_col_main}
    custom_df = custom_df.rename(columns=column_mapping)

    # Combine the dataframes, then remove rows whose text already occurred
    combined = pd.concat([main_df, custom_df], ignore_index=True)
    merged_df = combined.drop_duplicates(subset=[text_col_main]).reset_index(drop=True)

    print(f"New merged dataset size: {len(merged_df)}")

    # Make a backup of the original just in case we need it
    backup_path = data_dir / "training_data_telugu-hate_backup2.xlsx"
    main_df.to_excel(backup_path, index=False)
    print(f"Saved backup of original to {backup_path}")

    # Overwrite the main dataset
    merged_df.to_excel(main_dataset_file, index=False)
    print(f"Successfully merged and saved updated dataset to {main_dataset_file}")
51
+
52
+ if __name__ == "__main__":
53
+ merge_datasets()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ transformers
4
+ torch
5
+ pydantic
6
+ better-profanity
7
+ tf-keras
8
+ scikit-learn
9
+ requests
10
+ datasets
11
+ accelerate
train_model.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fine-tune MuRIL (google/muril-base-cased) on the HOLD-Telugu (Dravidian CodeMix) dataset.
3
+ (MuRIL handles Telugu significantly better than standard toxic-bert)
4
+ SETUP:
5
+ 1. Place the downloaded dataset in: backend/data/ (the file name must contain 'training_data'; .xlsx, .xls and .csv files are picked up)
6
+ 2. Install deps: pip install transformers torch scikit-learn accelerate openpyxl pandas
7
+
8
+ USAGE:
9
+ cd backend
10
+ python train_model.py
11
+
12
+ OUTPUT:
13
+ Fine-tuned model saved to: backend/model_output/
14
+ The backend auto-loads this model on next restart.
15
+ """
16
+
17
+ import os
18
+ import sys
19
+ import json
20
+ from pathlib import Path
21
+
22
+ # Force unbuffered output
23
+ sys.stdout.reconfigure(encoding='utf-8')
24
+
25
+ print("DEBUG: Script started", flush=True)
26
+
27
+ # ── Install dependencies if needed ───────────────────────────────────────────
28
+ print("DEBUG: Importing dependencies...", flush=True)
29
# Import every heavy dependency one by one, printing progress after each, so a
# missing or broken package is easy to spot in the log. Any failure aborts the
# script with exit code 1.
try:
    import torch
    # torch.version is a module; the version string lives in torch.__version__.
    print(f"DEBUG: Torch imported (v{torch.__version__})", flush=True)

    # Import transformers early
    import transformers
    print(f"DEBUG: transformers imported (v{transformers.__version__})", flush=True)

    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback
    )
    print("DEBUG: HuggingFace classes imported", flush=True)

    import pandas as pd
    print(f"DEBUG: pandas imported (v{pd.__version__})", flush=True)

    import openpyxl
    print("DEBUG: openpyxl imported", flush=True)

    import sklearn
    print(f"DEBUG: sklearn imported (v{sklearn.__version__})", flush=True)
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    print("DEBUG: sklearn metrics imported", flush=True)

    import numpy as np
    print(f"DEBUG: numpy imported (v{np.__version__})", flush=True)

    from torch.utils.data import Dataset as TorchDataset
    print("DEBUG: TorchDataset imported", flush=True)

except ImportError as e:
    print(f"DEBUG: ImportError: {e}", flush=True)
    sys.exit(1)
except Exception as e:
    print(f"DEBUG: Exception during import: {e}", flush=True)
    sys.exit(1)
69
+
70
+ # ── Paths ─────────────────────────────────────────────────────────────────────
71
+ BASE_DIR = Path(__file__).parent
72
+ DATA_DIR = BASE_DIR / "data"
73
+ OUTPUT_DIR = BASE_DIR / "model_output"
74
+
75
+ # ── Config ────────────────────────────────────────────────────────────────────
76
+ BASE_MODEL = "google/muril-base-cased" # MuRIL (Multilingual BERT) for Indian languages
77
+ # BASE_MODEL = "unitary/toxic-bert" # Fallback to general toxic-bert if needed
78
+ MAX_LENGTH = 128 # Longer context = better understanding of comments
79
+ EPOCHS = 8 # More epochs with early stopping patience=2
80
+ LEARNING_RATE = 3e-5 # Slightly higher LR for faster convergence
81
+ # TEST_SPLIT = 0.15 # Not needed if we use explicit files
82
+
83
+ # ── Find Excel files ─────────────────────────────────────────────────────
84
+ print(f"DEBUG: Searching for data in {DATA_DIR}", flush=True)
85
+ all_files = list(DATA_DIR.iterdir())
86
+ print(f"DEBUG: Found files: {[f.name for f in all_files]}", flush=True)
87
+
88
+ train_files = [f for f in all_files if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]
89
+
90
+ if not train_files:
91
+ print("✗ No training file found (looking for 'training_data*.xlsx')")
92
+ sys.exit(1)
93
+ else:
94
+ print(f"✓ Training files: {[f.name for f in train_files]}")
95
+ print("ℹ Test set will be a stratified 20% split from training data (same distribution)")
96
+
97
+
98
+ # ── Helper to load data ──────────────────────────────────────────────────────
99
+
100
def is_code_mixed(text):
    """Return True if *text* looks like Telugu-English code-mixed content.

    A text is kept when it contains at least one ASCII letter (A-Z / a-z) and
    at most 80% of its non-whitespace characters are in the Telugu script
    block (U+0C00-U+0C7F). Empty or whitespace-only input returns False.

    The previous Latin test ``'\\u0041' <= c <= '\\u007A'`` also matched the six
    punctuation characters between ``Z`` and ``a`` (``[ \\ ] ^ _ ` ``), so a
    row containing e.g. only an underscore counted as having English text.
    """
    visible = [c for c in str(text) if c.strip()]  # non-whitespace characters
    if not visible:
        return False
    if not any(c.isascii() and c.isalpha() for c in visible):
        return False
    telugu = sum(1 for c in visible if '\u0C00' <= c <= '\u0C7F')
    # Skip rows that are (almost) purely Telugu script.
    return telugu / len(visible) <= 0.8
118
+
119
def load_data(files):
    """Load the training files into one frame with ``text``/``label``/``label_int``.

    Args:
        files: iterable of path objects exposing ``.name`` and ``.suffix``.
            ``.csv`` files are read with ``pd.read_csv``; anything else is
            opened as an Excel workbook and every sheet is parsed.

    Returns:
        A DataFrame with columns ``text``, ``label`` (stripped, lower-cased
        string) and ``label_int`` (1 when the label is in the hate/offensive
        set, else 0). Rows with a missing text or label and rows that fail
        ``is_code_mixed`` are dropped. An empty frame with those columns is
        returned when nothing could be loaded.

    Progress, skipped sheets and per-file read errors are printed; a failing
    file does not stop the remaining files from being loaded.
    """
    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}

    TEXT_NAMES = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}

    def _match_column(columns, exact_names, fragments):
        """Return the first column whose lower-cased name matches, else None."""
        for col in columns:
            name = str(col).lower()
            if name in exact_names or any(frag in name for frag in fragments):
                return col
        return None

    def _normalise_label(value):
        """Return the label as a stripped, lower-cased string."""
        # A numeric label column that contained blanks is read as float64, so 1
        # becomes 1.0 and would stringify to '1.0', which is not in
        # hate_labels_set. Collapse integral floats back to ints first.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip().lower()

    frames = []
    for excel_file in files:
        print(f" Loading: {excel_file.name}", flush=True)
        try:
            # Support both Excel and CSV files
            if excel_file.suffix == '.csv':
                sheets_data = [('csv', pd.read_csv(excel_file))]
            else:
                # Context manager closes the workbook handle after parsing.
                with pd.ExcelFile(excel_file) as xl:
                    sheets_data = [(sheet, xl.parse(sheet)) for sheet in xl.sheet_names]

            for sheet, df in sheets_data:
                # Column matching. (The old check that rejected serial-number
                # names such as 's.no'/'id' was unreachable: none of those
                # names can satisfy the text-column match, so it was removed.)
                text_col = _match_column(df.columns, TEXT_NAMES, ['text', 'comment', 'sentence'])
                label_col = _match_column(df.columns, LABEL_NAMES, ['label', 'categor', 'class'])

                if text_col is None or label_col is None:
                    print(f" ⚠ Sheet '{sheet}': Skipped (cols={list(df.columns)})", flush=True)
                    continue

                sub = df[[text_col, label_col]].copy()
                sub.columns = ['text', 'label']
                sub = sub.dropna()
                sub['label'] = sub['label'].apply(_normalise_label)
                sub['label_int'] = sub['label'].apply(lambda x: 1 if x in hate_labels_set else 0)

                # ── Filter: keep only Telugu-English code-mixed rows ──────
                before = len(sub)
                sub = sub[sub['text'].apply(is_code_mixed)].reset_index(drop=True)
                after = len(sub)
                print(f" ✓ Sheet '{sheet}': {after} code-mixed rows kept (filtered out {before - after} pure Telugu rows)", flush=True)

                frames.append(sub)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f" ✗ Error reading {excel_file.name}: {e}", flush=True)

    if not frames:
        return pd.DataFrame(columns=['text', 'label', 'label_int'])

    combined = pd.concat(frames, ignore_index=True)
    return combined
175
+
176
+
177
+ # ── Load Bad Words / Emojis as Additional Training Data ──────────────────────
178
def load_badwords_as_training_data():
    """Build synthetic training rows from the bad-word and bad-emoji lists.

    Up to three optional files under ``DATA_DIR`` are read:

    * ``telugu_badwords.txt`` - UTF-8 text, one word per line.
    * ``secure_words.bin`` - base64-encoded UTF-8 text, one word per line;
      words that were already collected are skipped.
    * ``bad_emojis.txt`` - UTF-8 text, one entry per line; blank lines and
      lines starting with ``#`` are ignored.

    Each collected entry is expanded into three toxic sentences using
    randomly chosen templates, and the same number of safe rows is produced
    by cycling through a fixed list of harmless phrases.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text``, ``label`` and
        ``label_int`` (1 for ``'hate'``, 0 for ``'not-hate'``). The frame is
        empty when none of the files yields an entry.
    """
    import base64
    import random

    # A private, seeded generator gives the same template choices as
    # ``random.seed(42)`` did, without reseeding the process-wide generator
    # that other code may rely on.
    rng = random.Random(42)

    toxic_words = []

    # 1. Load telugu_badwords.txt
    badwords_path = DATA_DIR / "telugu_badwords.txt"
    if badwords_path.exists():
        with open(badwords_path, "r", encoding="utf-8") as f:
            toxic_words.extend(
                word for word in (line.strip() for line in f) if word
            )
        print(f" ✓ Loaded {len(toxic_words)} words from telugu_badwords.txt", flush=True)

    # 2. Load secure_words.bin (base64 encoded)
    secure_path = DATA_DIR / "secure_words.bin"
    secure_count = 0
    if secure_path.exists():
        with open(secure_path, "rb") as f:
            encoded_data = f.read()
        decoded_data = base64.b64decode(encoded_data).decode("utf-8")
        # Set-based membership keeps the duplicate check O(1) per word while
        # the list preserves the original insertion order.
        seen = set(toxic_words)
        for line in decoded_data.splitlines():
            word = line.strip()
            if word and word not in seen:
                seen.add(word)
                toxic_words.append(word)
                secure_count += 1
        print(f" ✓ Loaded {secure_count} additional words from secure_words.bin", flush=True)

    # 3. Load bad_emojis.txt (entries are appended without a duplicate check)
    emoji_path = DATA_DIR / "bad_emojis.txt"
    emoji_count = 0
    if emoji_path.exists():
        with open(emoji_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith("#"):
                    toxic_words.append(line)
                    emoji_count += 1
        print(f" ✓ Loaded {emoji_count} offensive emojis from bad_emojis.txt", flush=True)

    if not toxic_words:
        return pd.DataFrame(columns=['text', 'label', 'label_int'])

    # Create toxic training examples with natural sentence patterns
    toxic_templates = [
        "{word}",
        "you are a {word}",
        "{word} ga unnav",
        "enti ra {word}",
        "orey {word}",
        "nuvvu {word}",
        "{word} fellow",
        "this {word}",
    ]

    # Three distinct random templates per word give varied examples.
    templates_per_word = min(3, len(toxic_templates))
    toxic_rows = [
        {'text': template.format(word=word), 'label': 'hate', 'label_int': 1}
        for word in toxic_words
        for template in rng.sample(toxic_templates, templates_per_word)
    ]

    # Generate matching SAFE examples to keep the dataset balanced
    safe_phrases = [
        "good morning everyone", "nice video", "great content bro",
        "keep it up", "super ga undi", "chala bagundi",
        "love this", "awesome work", "thank you for sharing",
        "very helpful", "bagundi", "nice one", "well done",
        "interesting topic", "manchi video", "super explanation",
        "thanks for this", "really useful", "good job",
        "happy birthday", "congratulations", "best wishes",
        "nice song", "beautiful", "amazing performance",
        "very informative", "subscribed", "waiting for next video",
        "loved it", "manchi content", "edo oka roju",
        "nenu chala happy", "meeru bagunnara", "thanks anna",
        "thanks akka", "super bro", "nice edit",
        "first comment", "who is watching in 2024",
        "please make more videos", "this helped me a lot",
        "I learned something new", "great tutorial", "perfect",
    ]

    # One safe row per toxic row, cycling through the phrase list.
    # NOTE: the phrases repeat, so de-duplicating the result on 'text' keeps
    # at most len(safe_phrases) safe rows.
    safe_rows = [
        {
            'text': safe_phrases[i % len(safe_phrases)],
            'label': 'not-hate',
            'label_int': 0,
        }
        for i in range(len(toxic_rows))
    ]

    all_rows = toxic_rows + safe_rows
    print(f" ✓ Generated {len(toxic_rows)} toxic + {len(safe_rows)} safe training examples from bad words/emojis", flush=True)
    return pd.DataFrame(all_rows)
281
+
282
+
283
+ # ── Load and Split ───────────────────────────────────────────────────────────
284
print("\nLoading training data...", flush=True)
all_data = load_data(train_files)
if all_data.empty:
    # Nothing to train on: abort with a non-zero exit status.
    print("✗ Error: No usable data found.", flush=True)
    sys.exit(1)

# Append the synthetic rows built from the bad-word / emoji lists, if any.
print("\nLoading bad words/emojis as training data...", flush=True)
badwords_data = load_badwords_as_training_data()
if not badwords_data.empty:
    all_data = pd.concat([all_data, badwords_data], ignore_index=True)
    print(f" Combined dataset size: {len(all_data)}", flush=True)

# Keep only the first occurrence of every distinct text.
len_before = len(all_data)
all_data = all_data.drop_duplicates(subset='text')
print(f" Deduplicated: {len_before} -> {len(all_data)}")

# ── Stratified 90/10 split (more training data = higher accuracy) ─────────────
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    all_data,
    test_size=0.10,
    random_state=42,
    stratify=all_data['label_int'],
)

print(f"\nFinal Split: Train={len(train_df)} | Test={len(test_df)}")
print(f"Class Dist (Train): {train_df['label_int'].value_counts().to_dict()}")
print(f"Class Dist (Test): {test_df['label_int'].value_counts().to_dict()}")

# Plain Python lists are what the tokenizer and the dataset wrapper consume.
train_texts, train_labels = train_df['text'].tolist(), train_df['label_int'].tolist()
test_texts, test_labels = test_df['text'].tolist(), test_df['label_int'].tolist()
316
+
317
+ # ── Load tokenizer & model ────────────────────────────────────────────────────
318
print(f"\nLoading model: {BASE_MODEL}", flush=True)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Binary classification head on top of the base checkpoint.
model_kwargs = {
    "num_labels": 2,
    "ignore_mismatched_sizes": True,
    # Forces CrossEntropyLoss (fixes transformers v5 bug)
    "problem_type": "single_label_classification",
}
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, **model_kwargs)
print("✓ Model loaded", flush=True)
328
+
329
+ # ── Dataset ───────────────────────────────────────────────────────────────────
330
class CommentDataset(TorchDataset):
    """Torch dataset pairing pre-tokenized comment texts with integer labels.

    All texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` (truncated to ``MAX_LENGTH`` and padded); items are then
    served by indexing into the resulting tensors.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` up front and keep ``labels`` alongside them."""
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields for example ``idx`` plus its ``labels`` tensor."""
        sample = {}
        for field_name, tensor in self.encodings.items():
            sample[field_name] = tensor[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
344
+
345
print("Tokenizing datasets...", flush=True)
# Build the train split first, then the test split.
train_dataset, test_dataset = (
    CommentDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)
348
+
349
+ # ── Metrics ───────────────────────────────────────────────────────────────────
350
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``; the predicted class is the arg-max over the last logits
        axis, and the last three metrics use ``zero_division=0``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}
    for metric_name, metric_fn in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ):
        metrics[metric_name] = metric_fn(labels, predicted, zero_division=0)
    return metrics
359
+
360
+ # ── Training ──────────────────────────────────────────────────────────────────
361
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nTraining on: {device.upper()}", flush=True)

# parents=True so a nested output path does not fail on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
batch_size = 16 if device == 'cuda' else 8  # Smaller batch = better generalization on small datasets
eval_batch_size = 64  # No gradients during eval → can use larger batch

# 10% warmup steps.
# Ceiling division: the final, partially filled batch of an epoch is still an
# optimizer step, so floor division would undercount the schedule length.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(total_steps * 0.1)

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    learning_rate=LEARNING_RATE,
    warmup_steps=warmup_steps,
    weight_decay=0.05,  # Stronger regularization to prevent overfitting
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=25,
    report_to="none",
    fp16=(device == 'cuda'),
)

# NOTE(review): the same test split drives early stopping, best-checkpoint
# selection and the final report below, so the reported scores are not from
# data unseen during model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Stop early before overfitting
)

print("Starting training...", flush=True)
trainer.train()

# ── Final evaluation ──────────────────────────────────────────────────────────
print("\nEvaluating on test set...", flush=True)
results = trainer.evaluate()
print(f"\n{'='*60}")
print("FINAL RESULTS:")
print(f" Accuracy: {results.get('eval_accuracy', 0)*100:.2f}%")
print(f" F1 Score: {results.get('eval_f1', 0):.4f}")
print(f" Precision: {results.get('eval_precision', 0):.4f}")
print(f" Recall: {results.get('eval_recall', 0):.4f}")
print(f"{'='*60}")

# ── Save ──────────────────────────────────────────────────────────────────────
trainer.save_model(str(OUTPUT_DIR))
tokenizer.save_pretrained(str(OUTPUT_DIR))
# Explicit encoding keeps the file identical across platforms.
with open(OUTPUT_DIR / "eval_results.json", 'w', encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"\n✅ Done! Model saved to: {OUTPUT_DIR}", flush=True)
verify_model.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MODEL VERIFICATION SCRIPT
3
+ Use this to test your trained model locally on your PC.
4
+ """
5
+
6
+ import os
7
+ from transformers import pipeline
8
+
9
def test_model():
    """Interactively classify typed comments with the locally saved model.

    Loads a ``text-classification`` pipeline from ``./model_output`` on the
    CPU, then prompts for comments in a loop and prints the predicted class
    and confidence for each one. The loop ends when the user types ``quit``
    or closes/interrupts the input (Ctrl-D / Ctrl-C).

    Returns:
        None. When the model folder is missing or the pipeline cannot be
        loaded, an error message is printed and the function returns early.
    """
    # 1. Path to your model folder
    # Change this to 'model_output_v2' if testing the new version
    model_path = "./model_output"

    if not os.path.exists(model_path):
        print(f"❌ Error: Model folder '{model_path}' not found.")
        print("Please ensure you have moved your Kaggle/Colab output into the 'backend' folder.")
        return

    print("🔄 Loading model (this may take a few seconds)...")
    try:
        # Load the toxicity classifier
        classifier = pipeline(
            "text-classification",
            model=model_path,
            tokenizer=model_path,
            device=-1  # Use -1 for CPU, 0 for first GPU
        )
        print("✅ Model loaded successfully!\n")
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        return

    print("Enter 'quit' to exit.")
    while True:
        try:
            text = input("\n📝 Enter a comment to test: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of crashing.
            print()
            break

        stripped = text.strip()
        if stripped.lower() == 'quit':
            break

        if not stripped:
            continue

        # Get prediction; truncation keeps over-long comments within the
        # model's maximum input length.
        result = classifier(text, truncation=True)[0]

        label = result['label']
        score = result['score']

        # Map labels to human-readable text
        # LABEL_1 is usually Toxic, LABEL_0 is Safe
        is_toxic = "TOXIC 🔴" if label == "LABEL_1" else "SAFE 🟢"

        print("-" * 30)
        print(f"Result: {is_toxic}")
        print(f"Confidence: {score*100:.2f}%")
        print("-" * 30)


if __name__ == "__main__":
    test_model()