"""Gradio app that classifies the toxicity of Kannada text.

The user types text in Latin letters; it is sent through Google Translate
(English -> Kannada), stripped down to Kannada script plus basic punctuation,
and classified by a fine-tuned sequence-classification model.
"""
import logging
import re

import gradio as gr
import torch
from deep_translator import GoogleTranslator
from transformers import AutoModelForSequenceClassification, AutoTokenizer

logger = logging.getLogger(__name__)

# -------------------- Load Kannada Model & Tokenizer --------------------
MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_kannada"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: disables dropout and similar training behavior

# Maximum number of tokens passed to the model; longer inputs are truncated.
MAX_TOKENS = 128

# -------------------- Translator (English → Kannada) --------------------
translator = GoogleTranslator(source="en", target="kn")

# Sentinel returned by transliterate_to_kannada() when translation fails.
# It is also shown to the user in the preview box, so the text is kept stable.
TRANSLATION_FAILED = "Translation failed"

# -------------------- Clean Kannada Text --------------------
# Everything except the Kannada Unicode block (U+0C80-U+0CFF), whitespace
# and the punctuation . , ! ? is removed.
_NON_KANNADA_RE = re.compile(r"[^\u0C80-\u0CFF\s.,!?]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Return *text* reduced to Kannada script and basic punctuation.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The text with every character outside the Kannada block, whitespace
        and ``. , ! ?`` removed, whitespace runs collapsed to a single space
        and leading/trailing whitespace stripped.
    """
    kannada_only = _NON_KANNADA_RE.sub("", str(text))
    return _WHITESPACE_RE.sub(" ", kannada_only).strip()


# -------------------- Transliterate / Translate to Kannada --------------------
def transliterate_to_kannada(text):
    """Convert Latin-script input to Kannada script via the translator.

    NOTE(review): this calls an English -> Kannada *translation* service, not
    a dedicated transliterator, so romanized Kannada may be translated as if
    it were English. Confirm this matches the intended behavior.

    Args:
        text: The user's input string (may be empty or ``None``).

    Returns:
        The translated text, ``""`` for empty/blank input, or
        ``TRANSLATION_FAILED`` if the translator raised or returned a
        non-string result.
    """
    if not text or not text.strip():
        return ""
    try:
        translated = translator.translate(text)
    except Exception:
        # Best-effort UI helper: report failure to the user instead of
        # crashing the app, but keep the traceback in the logs.
        logger.exception("Translation to Kannada failed")
        return TRANSLATION_FAILED
    if not isinstance(translated, str):
        # Defensive: callers expect a string they can display and clean.
        logger.error("Translator returned a non-string result: %r", translated)
        return TRANSLATION_FAILED
    return translated


# -------------------- Toxicity Prediction --------------------
def predict_toxicity(english_text):
    """Translate the input to Kannada and classify it as toxic or not.

    Args:
        english_text: Text typed by the user in Latin letters.

    Returns:
        A multi-line, human-readable report containing the cleaned Kannada
        text, the predicted label and the confidence percentage, or a
        failure message when translation fails or nothing remains to
        classify after cleaning.
    """
    kannada_text = transliterate_to_kannada(english_text)
    # Compare against the sentinel exactly; a substring check would misfire
    # on legitimate translations that happen to contain the word "failed".
    if kannada_text == TRANSLATION_FAILED:
        return f"Transliterated Kannada Text: {kannada_text}\nPrediction: Failed"

    cleaned_text = clean_text(kannada_text)
    if not cleaned_text:
        # Running the model on an empty string would yield a meaningless
        # prediction based only on the tokenizer's special tokens.
        return (
            "Transliterated Kannada Text: \n"
            "Prediction: Failed (no Kannada text to classify)"
        )

    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_TOKENS,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Single input, so take row 0 of the (batch, classes) probabilities.
    probs = torch.softmax(logits, dim=1)[0]
    prediction = int(torch.argmax(probs).item())
    confidence = probs[prediction].item() * 100

    # NOTE(review): assumes class index 0 means "Toxic" - verify against the
    # model's id2label mapping.
    label = "Toxic" if prediction == 0 else "Non-Toxic"

    return (
        f"Transliterated Kannada Text: {cleaned_text}\n"
        f"Prediction: {label}\n"
        f"Confidence: {confidence:.2f}%"
    )


# -------------------- Gradio UI --------------------
with gr.Blocks(title="Kannada Text Toxicity Classifier (IndicBERT)") as demo:
    gr.Markdown(
        """
        # Kannada Text Toxicity Classifier (IndicBERT + Transliteration)

        Enter Kannada text written in **English letters**

        Example: **`ninage ashtondu seen illa le`**

        The app converts it into **Kannada script** and predicts toxicity.
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Enter Kannada Text (English Transliteration)",
                placeholder="e.g., ninage ashtondu seen illa le",
                lines=2,
            )
        with gr.Column():
            transliterated_text = gr.Textbox(
                label="Kannada Script (Preview)",
                interactive=False,
                lines=2,
            )

    with gr.Row():
        preview_btn = gr.Button("Preview Transliteration")
        predict_btn = gr.Button("Predict Toxicity")

    output_text = gr.Textbox(
        label="Prediction Output",
        interactive=False,
        lines=5,
    )

    preview_btn.click(
        fn=transliterate_to_kannada,
        inputs=input_text,
        outputs=transliterated_text,
    )
    predict_btn.click(
        fn=predict_toxicity,
        inputs=input_text,
        outputs=output_text,
    )

# -------------------- Launch App --------------------
if __name__ == "__main__":
    # Guarded so importing this module (e.g. for tests) does not start a server.
    demo.launch()