Spaces:

Thilak118
/

TamilCommentsToxicityDetection

Sleeping

App Files Community

Thilak118 committed on Dec 28, 2025

Commit

022dc69

verified ·

1 Parent(s): 47ad443

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -93

app.py CHANGED Viewed

@@ -1,75 +1,40 @@
 import gradio as gr
 import torch
 import re
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from deep_translator import GoogleTranslator
-# -----------------------------
-# Load model & tokenizer
-# -----------------------------
-MODEL_NAME = "Thilak118/indic-bert-toxicity-classifier_tamil"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-model.eval()
-# -----------------------------
-# Translator (English → Tamil)
-# -----------------------------
-translator = GoogleTranslator(source="auto", target="ta")
-# -----------------------------
-# Utility functions
-# -----------------------------
-def is_tamil_text(text):
-    return bool(re.search(r"[\u0B80-\u0BFF]", text))
 def clean_text(text):
-    text = re.sub(r"[^\u0B80-\u0BFF\s.,!?]", "", text)
-    text = re.sub(r"\s+", " ", text).strip()
     return text
-def transliterate_to_tamil(text):
-    try:
-        return translator.translate(text)
-    except Exception:
-        return "❌ Transliteration failed"
-# -----------------------------
-# Prediction function
-# -----------------------------
-def predict_toxicity(user_input):
-    if not user_input or not user_input.strip():
-        return "❌ Please enter some text"
-    # Step 1: Convert to Tamil if needed
-    if is_tamil_text(user_input):
-        tamil_text = user_input
-    else:
-        tamil_text = transliterate_to_tamil(user_input)
-        if "failed" in tamil_text.lower():
-            return tamil_text
-    # Step 2: Clean Tamil text
-    cleaned_text = clean_text(tamil_text)
-    if not cleaned_text:
-        return "❌ Invalid Tamil text after cleaning"
-    # Step 3: Tokenize
     inputs = tokenizer(
         cleaned_text,
         return_tensors="pt",
         padding=True,
         truncation=True,
         max_length=128
-    )
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Step 4: Inference
     with torch.no_grad():
         outputs = model(**inputs)
@@ -80,54 +45,28 @@ def predict_toxicity(user_input):
     label = "Toxic" if prediction == 0 else "Non-Toxic"
     return (
-        f"📝 Tamil Text: {cleaned_text}\n"
-        f"🔍 Prediction: {label}\n"
-        f"📊 Confidence: {confidence:.2f}%"
     )
-# -----------------------------
-# Gradio UI
-# -----------------------------
-with gr.Blocks(title="Tamil Toxicity Classifier") as app:
     gr.Markdown(
         """
-        ## Tamil Text Toxicity Classifier 🇮🇳
-        Enter **Tamil text** or **English transliteration**
-        (example: `nee romba mosamaanavan` → நீ ரொம்ப மோசமானவன்)
         """
     )
-    with gr.Row():
-        input_text = gr.Textbox(
-            label="Input Text",
-            placeholder="e.g., nee romba mosamaanavan",
-            lines=2
-        )
-        preview_text = gr.Textbox(
-            label="Tamil Preview",
-            interactive=False,
-            lines=2
-        )
-    preview_btn = gr.Button("Preview Transliteration")
-    predict_btn = gr.Button("Predict Toxicity")
-    output = gr.Textbox(
-        label="Result",
-        interactive=False,
-        lines=5
-    )
-    preview_btn.click(
-        fn=transliterate_to_tamil,
-        inputs=input_text,
-        outputs=preview_text
-    )
-    predict_btn.click(
-        fn=predict_toxicity,
-        inputs=input_text,
-        outputs=output
-    )
-app.launch()

 import gradio as gr
 import torch
 import re
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from indic_transliteration.sanscript import transliterate, ITRANS, TAMIL
+MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_tamil"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+model.eval()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+def transliterate_to_tamil(text):
+    if text and text.strip():
+        return transliterate(text, ITRANS, TAMIL)
+    return ""
 def clean_text(text):
+    text = re.sub(r'[^\u0B80-\u0BFF\s.,!?]', '', text)
+    text = re.sub(r'\s+', ' ', text).strip()
     return text
+def predict_toxicity(input_text):
+    ta_text = transliterate_to_tamil(input_text)
+    cleaned_text = clean_text(ta_text)
     inputs = tokenizer(
         cleaned_text,
         return_tensors="pt",
         padding=True,
         truncation=True,
         max_length=128
+    ).to(device)
     with torch.no_grad():
         outputs = model(**inputs)
     label = "Toxic" if prediction == 0 else "Non-Toxic"
     return (
+        f"Tamil Text: {cleaned_text}\n"
+        f"Prediction: {label}\n"
+        f"Confidence: {confidence:.2f}%"
     )
+with gr.Blocks(title="Tamil Toxicity Classifier") as demo:
     gr.Markdown(
         """
+        # Tamil Text Toxicity Classifier 🇮🇳
+        Enter **English transliteration**
+        Example: `nee romba mosam`
         """
     )
+    input_text = gr.Textbox(label="Enter Text (English)", lines=2)
+    preview = gr.Textbox(label="Tamil Text", interactive=False)
+    output = gr.Textbox(label="Prediction", lines=4)
+    preview_btn = gr.Button("Preview Tamil Text")
+    predict_btn = gr.Button("Predict Toxicity")
+    preview_btn.click(transliterate_to_tamil, input_text, preview)
+    predict_btn.click(predict_toxicity, input_text, output)
+demo.launch()