# Thilak118 - "Update app.py" (commit a336fb7, verified)
import gradio as gr
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re
from deep_translator import GoogleTranslator
# Inference device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classifier and tokenizer are loaded from the same checkpoint name. The model
# is moved to the inference device and switched to eval mode.
model_name = "Thilak118/indic-bert-toxicity-classifier"
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# English -> Telugu translator. The source language is pinned to 'en' rather
# than left on automatic detection, because the expected input is Telugu
# typed in Latin letters (English transliteration).
translator = GoogleTranslator(source="en", target="te")
def clean_text(text):
    """
    Reduce text to Telugu script, whitespace and basic punctuation.

    Every character outside the Telugu Unicode block (U+0C00-U+0C7F) that is
    not whitespace or one of ``. , ! ?`` is removed. Each run of whitespace is
    then replaced by a single space and the result is stripped at both ends.
    """
    # First pass: drop everything that is not Telugu, whitespace or . , ! ?
    telugu_only = re.compile(r'[^\u0C00-\u0C7F\s.,!?]').sub('', text)
    # Second pass: normalise whitespace runs and trim the ends.
    return re.compile(r'\s+').sub(' ', telugu_only).strip()
def is_telugu_text(text):
    """Return True when text has at least one character in U+0C00-U+0C7F (Telugu)."""
    first_telugu_char = re.search(r'[\u0C00-\u0C7F]', text)
    return first_telugu_char is not None
def transliterate_to_telugu(text):
    """
    Convert text to Telugu script with the module-level translator.

    Returns whatever ``translator.translate(text)`` produces. If that call
    raises, the exception is not propagated; instead a string of the form
    ``"Error in transliteration: <message>"`` is returned so callers (and the
    UI preview box) always receive text.
    """
    try:
        telugu = translator.translate(text)
    except Exception as exc:
        # Report the failure in-band; callers detect it by this prefix.
        return f"Error in transliteration: {str(exc)}"
    return telugu
def predict_toxicity(user_input):
    """
    Classify a piece of text as Toxic or Non-Toxic and format the result.

    Input that already contains Telugu script is used as-is; any other input
    is first converted with transliterate_to_telugu(). The text is then passed
    through clean_text(), tokenized (truncated to 128 tokens) and scored with
    the module-level model.

    Args:
        user_input: Text from the UI, in Telugu script or Latin transliteration.

    Returns:
        A display string. On success it holds the cleaned Telugu text, the
        predicted label and the confidence as a percentage. On failure it is a
        message beginning with "Error". Exceptions are caught and reported in
        the returned string rather than raised.
    """
    # Reject missing or blank input up front: there is nothing to translate or
    # classify, and scoring an empty string would yield a meaningless result.
    if not isinstance(user_input, str) or not user_input.strip():
        return "Error: Please enter some text to classify."

    try:
        if is_telugu_text(user_input):
            telugu_text = user_input
        else:
            # Non-Telugu input is converted to Telugu script first.
            telugu_text = transliterate_to_telugu(user_input)
            # transliterate_to_telugu reports failures in-band; pass them on.
            if "Error in transliteration" in telugu_text:
                return telugu_text

        # Strip everything except Telugu characters, spaces and . , ! ?
        cleaned = clean_text(telugu_text)

        # If no Telugu characters survive cleaning (e.g. the conversion
        # returned Latin text), the model would only see blanks/punctuation.
        if not is_telugu_text(cleaned):
            return "Error: No Telugu text found after conversion and cleaning; nothing to classify."

        # Tokenize and move the tensors to the same device as the model.
        inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs)

        prediction = torch.argmax(outputs.logits, dim=1).item()
        probs = torch.softmax(outputs.logits, dim=1)[0]
        # Highest class probability, expressed as a percentage.
        confidence = probs.max().item() * 100

        # NOTE(review): class index 0 is assumed to mean Toxic and any other
        # index Non-Toxic; confirm against the model's configured label mapping.
        label = "Toxic" if prediction == 0 else "Non-Toxic"

        return f"Transliterated Telugu Text: {cleaned}\nPrediction: {label}\nConfidence: {confidence:.2f}%"
    except Exception as e:
        # UI boundary: surface any failure as text instead of crashing the app.
        return f"Error: {str(e)}"
# Gradio UI: an input box next to a read-only preview of the converted text,
# two buttons (preview / predict) and a box for the prediction result.
with gr.Blocks() as interface:
    # The Markdown text is kept flush-left so the string content is unchanged.
    gr.Markdown(
        """
Telugu Text Toxicity Classifier
Enter Telugu text, typically in **English transliteration** (e.g., 'neeku' for నీకు).
The application will first attempt to convert it to the Telugu script, clean it, and then
predict if the resulting Telugu text is **Toxic** or **Non-Toxic**.
"""
    )

    # Input and preview sit side by side in one row.
    with gr.Row():
        english_input = gr.Textbox(
            lines=2,
            label="Enter Telugu Text (in English Transliteration or Telugu Script)",
            placeholder="e.g., chala baagundhi or చాలా బాగుంది",
        )
        # Output-only field, so user editing is disabled.
        telugu_preview = gr.Textbox(
            lines=2,
            label="Transliterated Telugu Text (Preview)",
            interactive=False,
        )

    preview_button = gr.Button("Preview Transliteration")
    predict_button = gr.Button("Predict Toxicity")
    output = gr.Textbox(label="Prediction Output", lines=5)

    # Wire each button to its handler: (function, input component, output component).
    preview_button.click(transliterate_to_telugu, english_input, telugu_preview)
    predict_button.click(predict_toxicity, english_input, output)

# Start the web app.
interface.launch()