Spaces:

Thilak118
/

teluguCommentToxicityDetection

Sleeping

App Files Files Community

teluguCommentToxicityDetection / app.py

Thilak118

Update app.py

a336fb7 verified 6 months ago

raw

history blame contribute delete

4.45 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForSequenceClassification, AutoTokenizer
	import re
	from deep_translator import GoogleTranslator

	# Checkpoint identifier passed to both from_pretrained() calls below.
	model_name = "Thilak118/indic-bert-toxicity-classifier"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSequenceClassification.from_pretrained(model_name)

	# Run on the GPU when one is available, otherwise on the CPU; eval() puts
	# the model in inference mode.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	model = model.to(device)
	model.eval()

	# English -> Telugu translator shared by transliterate_to_telugu().
	# The source language is pinned to 'en' instead of being auto-detected
	# because the expected input is Telugu typed with Latin letters.
	translator = GoogleTranslator(source="en", target="te")

	def clean_text(text):
	    """
	    Reduce *text* to Telugu characters, whitespace and basic punctuation.

	    Everything outside the Telugu Unicode block (U+0C00-U+0C7F), whitespace
	    and the characters ``. , ! ?`` is removed. Runs of whitespace are then
	    collapsed to a single space and the ends are trimmed.
	    """
	    telugu_only = re.sub(r'[^\u0C00-\u0C7F\s.,!?]', '', text)
	    # split() with no argument breaks on any whitespace run and drops empty
	    # leading/trailing pieces, so joining with one space collapses and trims.
	    return ' '.join(telugu_only.split())

	def is_telugu_text(text):
	    """Return True if *text* has at least one character in U+0C00-U+0C7F."""
	    first_telugu_char = re.search(r'[\u0C00-\u0C7F]', text)
	    return first_telugu_char is not None

	def transliterate_to_telugu(text):
	    """
	    Convert *text* to Telugu script using the module-level ``translator``.

	    Returns whatever ``translator.translate`` returns. If that call raises,
	    the exception is not propagated; a string of the form
	    ``"Error in transliteration: <message>"`` is returned instead, which
	    callers can detect by its prefix.
	    """
	    try:
	        telugu = translator.translate(text)
	    except Exception as exc:
	        # Report the failure as text so the UI can display it directly.
	        return f"Error in transliteration: {exc!s}"
	    return telugu

	def predict_toxicity(user_input):
	    """
	    Classify a comment as Toxic or Non-Toxic and return a text report.

	    Input that contains no Telugu-script characters is first passed through
	    transliterate_to_telugu(). The text is then reduced by clean_text(),
	    tokenized (truncated to 128 tokens) and scored by the model.

	    Args:
	        user_input: Text from the UI, in Telugu script or Latin letters.

	    Returns:
	        A three-line report (cleaned text, label, confidence percentage), or
	        a string starting with "Error" when the input is blank, conversion
	        fails, nothing is left after cleaning, or any step raises.
	    """
	    # Reject missing or blank input up front; otherwise the model would be
	    # asked to score an empty string and report a meaningless confidence.
	    if not isinstance(user_input, str) or not user_input.strip():
	        return "Error: Please enter some text to classify."

	    try:
	        if is_telugu_text(user_input):
	            telugu_text = user_input
	        else:
	            telugu_text = transliterate_to_telugu(user_input)
	            # transliterate_to_telugu() reports failures as a message
	            # string rather than raising; pass that message through.
	            if "Error in transliteration" in telugu_text:
	                return telugu_text

	        cleaned = clean_text(telugu_text)
	        # Cleaning strips every non-Telugu character, so the result can be
	        # empty (e.g. conversion returned no Telugu script at all).
	        if not cleaned:
	            return "Error: No Telugu text left to classify after cleaning."

	        inputs = tokenizer(
	            cleaned,
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	            max_length=128,
	        )
	        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

	        with torch.no_grad():
	            outputs = model(**inputs)

	        prediction = torch.argmax(outputs.logits, dim=1).item()
	        probs = torch.softmax(outputs.logits, dim=1)[0]
	        confidence = probs.max().item() * 100
	        # Assumes class index 0 means Toxic and 1 means Non-Toxic;
	        # TODO confirm against the checkpoint's label mapping.
	        label = "Toxic" if prediction == 0 else "Non-Toxic"

	        return f"Transliterated Telugu Text: {cleaned}\nPrediction: {label}\nConfidence: {confidence:.2f}%"
	    except Exception as e:
	        # UI boundary: surface any failure as text instead of crashing.
	        return f"Error: {str(e)}"

	# Gradio UI: one input box, a read-only preview box, two buttons and an
	# output box, wired to transliterate_to_telugu() and predict_toxicity().
	with gr.Blocks() as interface:
	    gr.Markdown(
	        """
	Telugu Text Toxicity Classifier
	Enter Telugu text, typically in English transliteration (e.g., 'neeku' for నీకు).
	The application will first attempt to convert it to the Telugu script, clean it, and then
	predict if the resulting Telugu text is Toxic or Non-Toxic.


	"""
	    )

	    # Input and preview sit side by side in a single row.
	    with gr.Row():
	        english_input = gr.Textbox(
	            label="Enter Telugu Text (in English Transliteration or Telugu Script)",
	            placeholder="e.g., chala baagundhi or చాలా బాగుంది",
	            lines=2,
	        )
	        # Read-only: this box only displays the converted text.
	        telugu_preview = gr.Textbox(
	            label="Transliterated Telugu Text (Preview)",
	            interactive=False,
	            lines=2,
	        )

	    preview_button = gr.Button("Preview Transliteration")
	    predict_button = gr.Button("Predict Toxicity")
	    output = gr.Textbox(label="Prediction Output", lines=5)

	    # Both buttons read the same input box; each writes to its own target.
	    preview_button.click(fn=transliterate_to_telugu, inputs=english_input, outputs=telugu_preview)
	    predict_button.click(fn=predict_toxicity, inputs=english_input, outputs=output)


	interface.launch()