import gradio as gr
import torch
import librosa
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from preprocess import clean_arabic_text
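# NOTE: clean_arabic_text comes from the local preprocess.py in this repo;
# it is assumed to normalize the Arabic text (strip noise/diacritics) before
# it reaches the classifier.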
# --- 1. CONFIGURATION ---
CUSTOM_MODEL_ID = "rana811/final_arabic_model"
BACKUP_MODEL = "hossam87/bert-base-arabic-hate-speech"
print(f"🌍 Connecting to Hugging Face Hub: {CUSTOM_MODEL_ID}...")
# --- 2. LOAD MODELS ---
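# Both models are fetched from the Hugging Face Hub on first run and cached
# locally by transformers, so later startups reuse the cached weights.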
try:
    # A. Load Whisper for speech-to-text (whisper-small is multilingual, so
    # Arabic audio is transcribed without an explicit language flag)
    asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # B. Load the fine-tuned Arabic toxicity classifier
    tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(CUSTOM_MODEL_ID)
    model.eval()
    MODEL_TO_USE = CUSTOM_MODEL_ID
    print("✅ Successfully loaded rana811/final_arabic_model!")
except Exception as e:
    print(f"❌ Error loading custom model: {e}")
    print(f"⚠️ Switching to backup: {BACKUP_MODEL}")
    # Fall back to the generic model if something goes wrong
    tokenizer = AutoTokenizer.from_pretrained(BACKUP_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(BACKUP_MODEL)
    MODEL_TO_USE = BACKUP_MODEL
# --- 3. HELPER FUNCTIONS ---
def get_acoustic_excitement(audio_path):
    """Estimate how agitated the speaker sounds, as a score in [0, 1].

    Combines three prosodic cues: loudness (RMS energy), voice sharpness
    (zero-crossing rate), and speaking tempo.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    rms = np.mean(librosa.feature.rms(y=y))
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    # librosa >= 0.10 exposes tempo estimation as librosa.feature.tempo
    # (older versions had it as librosa.beat.tempo)
    tempo_arr = librosa.feature.tempo(y=y, sr=sr)
    tempo = tempo_arr[0] if len(tempo_arr) > 0 else 110

    # Normalize each cue into [0, 1] before the weighted sum
    norm_rms = min(rms * 10, 1.0)
    norm_zcr = min(zcr * 10, 1.0)
    norm_tempo = min(max((tempo - 60) / 100, 0.0), 1.0)  # clamp so slow speech stays at 0

    excitement = (0.5 * norm_rms) + (0.3 * norm_tempo) + (0.2 * norm_zcr)
    return excitement
def predict_text(text):
    """Run the toxicity classifier and return class probabilities as a list."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    return probs.tolist()
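# Assumed label order (from the model's training setup): index 0 = safe,
# 1 = warning, 2 = toxic for the 3-class custom model; the 2-class backup
# model only yields [safe, toxic].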
def multimodal_fusion(text, audio_excitement):
    """Combine the text probabilities with the acoustic excitement score.

    The text model gives the base verdict; the voice tone can soften or
    escalate it.
    """
    probs = predict_text(text)

    if len(probs) == 3:
        prob_safe, prob_warning, prob_toxic = probs
    else:
        prob_safe = probs[0]
        prob_toxic = probs[1]
        prob_warning = 0.0

    final_label = "Unknown"
    confidence = 0.0

    if prob_toxic > 0.5:
        final_label = "TOXIC ❌"
        confidence = prob_toxic
        # A calm delivery softens toxic text to a warning
        if audio_excitement < 0.3:
            final_label = "NEEDS WARNING ⚠️ (Toxic Text, Calm Voice)"
    elif prob_warning > prob_toxic and prob_warning > prob_safe:
        final_label = "NEEDS WARNING ⚠️"
        confidence = prob_warning
        # An agitated tone escalates a borderline text to toxic
        if audio_excitement > 0.7:
            final_label = "TOXIC ❌ (Escalated by Tone)"
    else:
        final_label = "SAFE ✅"
        confidence = prob_safe
        # Even safe words get flagged if the speaker is yelling
        if audio_excitement > 0.85:
            final_label = "NEEDS WARNING ⚠️ (Aggressive Yelling)"

    return final_label, confidence
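# Example: probs [0.2, 0.1, 0.7] with excitement 0.2 means the text is toxic
# but the delivery is calm, so the verdict is softened to
# "NEEDS WARNING ⚠️ (Toxic Text, Calm Voice)".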
# --- 4. MAIN APP FUNCTION ---
def process_input(audio_path, text_input):
    # CASE 1: BOTH inputs are present (the "Scenario Tester").
    # We use the TYPED text for meaning, and the AUDIO for tone.
    if audio_path is not None and text_input is not None and len(text_input.strip()) > 0:
        # A. Use typed text directly (skip Whisper)
        clean_text = clean_arabic_text(text_input)
        # B. Analyze audio tone
        try:
            excitement = get_acoustic_excitement(audio_path)
            # C. Fuse them
            label, conf = multimodal_fusion(clean_text, excitement)
            return (
                f"Used Manual Text: {clean_text}",
                f"{excitement:.2f}",
                label,
                f"{conf:.2f}",
                "✅ FUSED: Manual Text + Audio Tone"
            )
        except Exception as e:
            return "Error with audio file", "0.00", "Error", "0.00", str(e)

    # CASE 2: Audio only (standard multimodal).
    # We use Whisper to get text, then fuse with the audio tone.
    elif audio_path is not None:
        try:
            # A. Transcribe (Whisper)
            transcription = asr_pipeline(audio_path)["text"]
            clean_transcript = clean_arabic_text(transcription)
            # B. Analyze audio tone
            excitement = get_acoustic_excitement(audio_path)
            # C. Fuse them
            label, conf = multimodal_fusion(clean_transcript, excitement)
            return (
                f"Transcribed: {clean_transcript}",
                f"{excitement:.2f}",
                label,
                f"{conf:.2f}",
                "✅ FUSED: Whisper ASR + Audio Tone"
            )
        except Exception as e:
            return f"Error: {str(e)}", "0.00", "Error", "0.00", "-"

    # CASE 3: Text only (unimodal).
    elif text_input and len(text_input.strip()) > 0:
        clean_text = clean_arabic_text(text_input)
        probs = predict_text(clean_text)
        labels = ["SAFE ✅", "NEEDS WARNING ⚠️", "TOXIC ❌"]
        if len(probs) == 2:  # Handle the 2-class backup model
            labels = ["SAFE ✅", "TOXIC ❌"]
        max_idx = np.argmax(probs)
        return clean_text, "N/A", labels[max_idx], f"{probs[max_idx]:.2f}", "⚠️ Text Only (No Tone Analysis)"

    return "Please upload audio or enter text.", "-", "-", "-", "-"
# --- 5. GRADIO UI (UNIFIED) ---
with gr.Blocks(title="Arabic Multimodal Hate Detector") as demo:
    gr.Markdown("## 🛑 Arabic Toxicity & Hate Speech Detection (Multimodal)")
    gr.Markdown(f"**Current Model:** `{MODEL_TO_USE}`")
    gr.Markdown("ℹ️ **How to use:** Upload audio to analyze speech. Optionally, type text *while* uploading audio to test specific 'Text + Tone' scenarios.")

    with gr.Row():
        # LEFT COLUMN: INPUTS
        with gr.Column():
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="1. Audio Input (Tone/Prosody)")
            text_in = gr.Textbox(label="2. Manual Text Override (Optional)", placeholder="Type here to override Whisper transcription...")
            submit_btn = gr.Button("Analyze Multimodal", variant="primary")

        # RIGHT COLUMN: OUTPUTS
        with gr.Column():
            status_box = gr.Textbox(label="Processing Mode")
            out_transcription = gr.Textbox(label="Text Content Used")
            with gr.Row():
                out_excitement = gr.Textbox(label="Acoustic Excitement (0-1)")
                out_conf = gr.Textbox(label="Confidence")
            out_label = gr.Textbox(label="FINAL CLASSIFICATION", scale=2)

    # Click event
    submit_btn.click(
        process_input,
        inputs=[audio_in, text_in],
        outputs=[out_transcription, out_excitement, out_label, out_conf, status_box]
    )
demo.launch()