| import re |
| import torch |
| import gradio as gr |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| from deep_translator import GoogleTranslator |
|
|
| |
# Checkpoint identifier handed to ``from_pretrained`` for both the model and
# the tokenizer.
MODEL_PATH = "Thilak118/indic-bert-toxicity-classifier_kannada"

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and classifier once at import time; the model is moved to the
# selected device and switched to evaluation mode for inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)
model.eval()

# Shared English -> Kannada translator instance used by the helpers below.
translator = GoogleTranslator(source="en", target="kn")
|
|
| |
def clean_text(text):
    """Keep only Kannada-block characters, whitespace and basic punctuation.

    The input is converted with ``str()`` first. Every character outside
    U+0C80-U+0CFF, whitespace and ``. , ! ?`` is removed, runs of whitespace
    are collapsed to a single space, and the result is stripped.

    Args:
        text: Value to clean; any object is accepted.

    Returns:
        The cleaned string (possibly empty).
    """
    kannada_only = re.sub(r"[^\u0C80-\u0CFF\s.,!?]", "", str(text))
    collapsed = re.sub(r"\s+", " ", kannada_only)
    return collapsed.strip()
|
|
| |
def transliterate_to_kannada(text):
    """Convert *text* to Kannada script using the module-level ``translator``.

    NOTE(review): ``translator`` is configured as an English -> Kannada
    translation service; confirm it handles romanized Kannada as intended.

    Args:
        text: Input string. ``None``, empty and whitespace-only values mean
            there is nothing to convert.

    Returns:
        The string returned by ``translator.translate``; ``""`` when there is
        nothing to convert; or the sentinel ``"Translation failed"`` when the
        call raises or does not produce a string.
    """
    import logging  # local import so the block does not depend on file-level imports

    if not text or not text.strip():
        return ""

    log = logging.getLogger(__name__)
    try:
        result = translator.translate(text)
    except Exception:
        # Deliberately best-effort: the UI shows the sentinel instead of
        # crashing, but the cause is logged so failures are diagnosable.
        log.exception("Kannada translation failed")
        return "Translation failed"

    # Callers treat the return value as a string; a missing result is reported
    # as a failure rather than passed on.
    if not isinstance(result, str):
        log.warning("Translator returned a non-string result: %r", result)
        return "Translation failed"
    return result
|
|
| |
def predict_toxicity(english_text):
    """Convert the input to Kannada script and classify its toxicity.

    Pipeline: ``transliterate_to_kannada`` -> ``clean_text`` -> tokenizer ->
    model -> softmax over the logits.

    Args:
        english_text: Text entered by the user.

    Returns:
        A multi-line report string. On success it contains the cleaned
        Kannada text, the predicted label and the confidence in percent.
        When the conversion fails, or nothing remains to classify after
        cleaning, it contains a ``Prediction: Failed`` line instead.
    """
    kannada_text = transliterate_to_kannada(english_text)

    # Match the exact failure sentinel instead of searching for the substring
    # "failed", and treat a missing result as a failure instead of crashing.
    if kannada_text is None:
        kannada_text = "Translation failed"
    if kannada_text == "Translation failed":
        return f"Transliterated Kannada Text: {kannada_text}\nPrediction: Failed"

    cleaned_text = clean_text(kannada_text)

    # Empty input, or output without any Kannada characters, would otherwise
    # be scored as an empty string and yield a meaningless label/confidence.
    if not cleaned_text:
        return (
            f"Transliterated Kannada Text: {cleaned_text}\n"
            "Prediction: Failed (no Kannada text to classify)"
        )

    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    prediction = torch.argmax(logits, dim=1).item()
    probs = torch.softmax(logits, dim=1)[0]
    confidence = probs[prediction].item() * 100

    # NOTE(review): class index 0 is reported as "Toxic"; confirm this matches
    # the label mapping the checkpoint was trained with.
    label = "Toxic" if prediction == 0 else "Non-Toxic"

    return (
        f"Transliterated Kannada Text: {cleaned_text}\n"
        f"Prediction: {label}\n"
        f"Confidence: {confidence:.2f}%"
    )
|
|
| |
# Gradio UI: an input box and a read-only preview box in one row, two action
# buttons, and a read-only box for the prediction report.
with gr.Blocks(title="Kannada Text Toxicity Classifier (IndicBERT)") as demo:
    gr.Markdown(
        """
        # Kannada Text Toxicity Classifier (IndicBERT + Transliteration)

        Enter Kannada text written in **English letters**
        Example:
        **`ninage ashtondu seen illa le`**

        The app converts it into **Kannada script** and predicts toxicity.
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                lines=2,
                label="Enter Kannada Text (English Transliteration)",
                placeholder="e.g., ninage ashtondu seen illa le",
            )
        with gr.Column():
            transliterated_text = gr.Textbox(
                lines=2,
                label="Kannada Script (Preview)",
                interactive=False,
            )

    with gr.Row():
        preview_btn = gr.Button("Preview Transliteration")
        predict_btn = gr.Button("Predict Toxicity")

    output_text = gr.Textbox(
        lines=5,
        label="Prediction Output",
        interactive=False,
    )

    # Both buttons read the same input box; each writes to its own output box.
    preview_btn.click(transliterate_to_kannada, input_text, transliterated_text)
    predict_btn.click(predict_toxicity, input_text, output_text)

# Start the web server for the app.
demo.launch()
|
|