Thilak118's picture
Create app.py
3648b06 verified
import logging
import re

import gradio as gr
import torch
from deep_translator import GoogleTranslator
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# -------------------- Load Kannada Model & Tokenizer --------------------
# Identifier handed to both `from_pretrained` calls, so the classifier and
# its tokenizer are loaded from the same place.
MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_kannada"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model.to(device)
model.eval()  # the app only runs inference, so put the module in eval mode

# -------------------- Translator (English → Kannada) --------------------
# NOTE(review): this is configured as an English -> Kannada *translator*,
# while the UI text describes romanised-Kannada *transliteration* — confirm
# that this is the intended behaviour.
translator = GoogleTranslator(source="en", target="kn")
# -------------------- Clean Kannada Text --------------------
def clean_text(text):
    """Reduce *text* to Kannada script, whitespace and basic punctuation.

    The input is first converted with ``str()``. Every character outside the
    Kannada Unicode block (U+0C80-U+0CFF) is removed, except whitespace and
    the punctuation marks ``.``, ``,``, ``!`` and ``?``. Runs of whitespace
    are then collapsed to a single space and the ends are trimmed.

    Args:
        text: Value to clean; any object accepted by ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    kannada_only = re.sub(r"[^\u0C80-\u0CFF\s.,!?]", "", str(text))
    single_spaced = re.sub(r"\s+", " ", kannada_only)
    return single_spaced.strip()
# -------------------- Transliterate / Translate to Kannada --------------------
def transliterate_to_kannada(text):
    """Convert user input to Kannada using the module-level ``translator``.

    Args:
        text: Raw text from the UI; may be ``None``, empty or blank.

    Returns:
        The translator's output as a string; ``""`` when the input is
        missing/blank or the translator produces no result; or the sentinel
        ``"Translation failed"`` when the translator raises.
    """
    if not text or not text.strip():
        return ""
    try:
        translated = translator.translate(text)
    except Exception:
        # Deliberate best-effort at the UI boundary: the caller still gets
        # the sentinel string, but the failure is logged with its traceback
        # instead of being silently discarded.
        logging.getLogger(__name__).exception("Translation to Kannada failed")
        return "Translation failed"
    # Normalise a missing result to "" so callers always receive a str.
    return translated or ""
# -------------------- Toxicity Prediction --------------------
def predict_toxicity(english_text):
    """Translate the input to Kannada, classify it and format a report.

    Steps: convert the input with ``transliterate_to_kannada``, filter it
    with ``clean_text``, tokenize (truncated to 128 tokens), run the model
    without gradient tracking, and report the arg-max class together with
    its softmax probability.

    Args:
        english_text: Text entered in the UI.

    Returns:
        A multi-line string. On success it contains the cleaned Kannada
        text, the predicted label and the confidence as a percentage. When
        translation fails, or nothing is left to classify after cleaning,
        it contains a ``Prediction: Failed`` line instead.
    """
    # Guard against a missing result so the checks below always see a str.
    kannada_text = transliterate_to_kannada(english_text) or ""

    # Match the exact failure sentinel; a substring test on "failed" would
    # also fire on legitimate translator output that contains that word.
    if kannada_text == "Translation failed":
        return f"Transliterated Kannada Text: {kannada_text}\nPrediction: Failed"

    cleaned_text = clean_text(kannada_text)
    if not cleaned_text:
        # Nothing survived cleaning (blank input, or output with no Kannada
        # characters). Classifying an empty string would yield a
        # meaningless but confident-looking prediction, so stop here.
        return (
            f"Transliterated Kannada Text: {cleaned_text}\n"
            "Prediction: Failed (no Kannada text to classify)"
        )

    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    prediction = torch.argmax(logits, dim=1).item()
    # A single text was tokenized, so row 0 holds its class probabilities.
    probs = torch.softmax(logits, dim=1)[0]
    confidence = probs[prediction].item() * 100

    # NOTE(review): class index 0 is reported as "Toxic" — confirm this
    # against the label mapping the checkpoint was trained with.
    label = "Toxic" if prediction == 0 else "Non-Toxic"
    return (
        f"Transliterated Kannada Text: {cleaned_text}\n"
        f"Prediction: {label}\n"
        f"Confidence: {confidence:.2f}%"
    )
# -------------------- Gradio UI --------------------
# Layout: an intro banner, a row holding the input box and the read-only
# Kannada preview, a row with the two action buttons, and an output box.
with gr.Blocks(title="Kannada Text Toxicity Classifier (IndicBERT)") as demo:
    gr.Markdown(
        "\n"
        "# Kannada Text Toxicity Classifier (IndicBERT + Transliteration)\n"
        "Enter Kannada text written in **English letters**\n"
        "Example:\n"
        "**`ninage ashtondu seen illa le`**\n"
        "The app converts it into **Kannada script** and predicts toxicity.\n"
    )

    with gr.Row():
        # Left column: what the user types.
        with gr.Column():
            input_text = gr.Textbox(
                lines=2,
                label="Enter Kannada Text (English Transliteration)",
                placeholder="e.g., ninage ashtondu seen illa le",
            )
        # Right column: read-only preview of the converted text.
        with gr.Column():
            transliterated_text = gr.Textbox(
                lines=2,
                label="Kannada Script (Preview)",
                interactive=False,
            )

    with gr.Row():
        preview_btn = gr.Button("Preview Transliteration")
        predict_btn = gr.Button("Predict Toxicity")

    output_text = gr.Textbox(
        lines=5,
        label="Prediction Output",
        interactive=False,
    )

    # Both buttons read the same input box; each writes to its own output.
    preview_btn.click(
        transliterate_to_kannada,
        inputs=input_text,
        outputs=transliterated_text,
    )
    predict_btn.click(
        predict_toxicity,
        inputs=input_text,
        outputs=output_text,
    )

# -------------------- Launch App --------------------
demo.launch()