import gradio as gr
import torch
import librosa
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from preprocess import clean_arabic_text
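# NOTE: clean_arabic_text comes from the local preprocess.py in this repo;
# it is assumed to normalize the Arabic text (strip noise/diacritics) before
# it reaches the classifier.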
# --- 1. CONFIGURATION ---
CUSTOM_MODEL_ID = "rana811/final_arabic_model"
BACKUP_MODEL = "hossam87/bert-base-arabic-hate-speech"
print(f"🌍 Connecting to Hugging Face Hub: {CUSTOM_MODEL_ID}...")
# --- 2. LOAD MODELS ---
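# Both models are fetched from the Hugging Face Hub on first run and cached
# locally by transformers, so later startups reuse the cached weights.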
try:
    # A. Load Whisper for speech-to-text (whisper-small is multilingual, so
    # Arabic audio is transcribed without an explicit language flag)
    asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # B. Load the fine-tuned Arabic toxicity classifier
    tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(CUSTOM_MODEL_ID)
    model.eval()
    MODEL_TO_USE = CUSTOM_MODEL_ID
    print("✅ Successfully loaded rana811/final_arabic_model!")
except Exception as e:
    print(f"❌ Error loading custom model: {e}")
    print(f"⚠️ Switching to backup: {BACKUP_MODEL}")
    # Fall back to the generic model if something goes wrong
    tokenizer = AutoTokenizer.from_pretrained(BACKUP_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(BACKUP_MODEL)
    MODEL_TO_USE = BACKUP_MODEL
# --- 3. HELPER FUNCTIONS ---
def get_acoustic_excitement(audio_path):
    """Estimate how agitated the speaker sounds, as a score in [0, 1].

    Combines three prosodic cues: loudness (RMS energy), voice sharpness
    (zero-crossing rate), and speaking tempo.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    rms = np.mean(librosa.feature.rms(y=y))
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    # librosa >= 0.10 exposes tempo estimation as librosa.feature.tempo
    # (older versions had it as librosa.beat.tempo)
    tempo_arr = librosa.feature.tempo(y=y, sr=sr)
    tempo = tempo_arr[0] if len(tempo_arr) > 0 else 110

    # Normalize each cue into [0, 1] before the weighted sum
    norm_rms = min(rms * 10, 1.0)
    norm_zcr = min(zcr * 10, 1.0)
    norm_tempo = min(max((tempo - 60) / 100, 0.0), 1.0)  # clamp so slow speech stays at 0

    excitement = (0.5 * norm_rms) + (0.3 * norm_tempo) + (0.2 * norm_zcr)
    return excitement
def predict_text(text):
    """Run the toxicity classifier and return class probabilities as a list."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    return probs.tolist()
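# Assumed label order (from the model's training setup): index 0 = safe,
# 1 = warning, 2 = toxic for the 3-class custom model; the 2-class backup
# model only yields [safe, toxic].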
def multimodal_fusion(text, audio_excitement):
    """Combine the text probabilities with the acoustic excitement score.

    The text model gives the base verdict; the voice tone can soften or
    escalate it.
    """
    probs = predict_text(text)

    if len(probs) == 3:
        prob_safe, prob_warning, prob_toxic = probs
    else:
        prob_safe = probs[0]
        prob_toxic = probs[1]
        prob_warning = 0.0

    final_label = "Unknown"
    confidence = 0.0

    if prob_toxic > 0.5:
        final_label = "TOXIC ❌"
        confidence = prob_toxic
        # A calm delivery softens toxic text to a warning
        if audio_excitement < 0.3:
            final_label = "NEEDS WARNING ⚠️ (Toxic Text, Calm Voice)"
    elif prob_warning > prob_toxic and prob_warning > prob_safe:
        final_label = "NEEDS WARNING ⚠️"
        confidence = prob_warning
        # An agitated tone escalates a borderline text to toxic
        if audio_excitement > 0.7:
            final_label = "TOXIC ❌ (Escalated by Tone)"
    else:
        final_label = "SAFE ✅"
        confidence = prob_safe
        # Even safe words get flagged if the speaker is yelling
        if audio_excitement > 0.85:
            final_label = "NEEDS WARNING ⚠️ (Aggressive Yelling)"

    return final_label, confidence
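# Example: probs [0.2, 0.1, 0.7] with excitement 0.2 means the text is toxic
# but the delivery is calm, so the verdict is softened to
# "NEEDS WARNING ⚠️ (Toxic Text, Calm Voice)".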
# --- 4. MAIN APP FUNCTION ---
def process_input(audio_path, text_input):
    # CASE 1: BOTH inputs are present (the "Scenario Tester").
    # We use the TYPED text for meaning, and the AUDIO for tone.
    if audio_path is not None and text_input is not None and len(text_input.strip()) > 0:
        # A. Use typed text directly (skip Whisper)
        clean_text = clean_arabic_text(text_input)
        # B. Analyze audio tone
        try:
            excitement = get_acoustic_excitement(audio_path)
            # C. Fuse them
            label, conf = multimodal_fusion(clean_text, excitement)
            return (
                f"Used Manual Text: {clean_text}",
                f"{excitement:.2f}",
                label,
                f"{conf:.2f}",
                "✅ FUSED: Manual Text + Audio Tone"
            )
        except Exception as e:
            return "Error with audio file", "0.00", "Error", "0.00", str(e)

    # CASE 2: Audio only (standard multimodal).
    # We use Whisper to get text, then fuse with the audio tone.
    elif audio_path is not None:
        try:
            # A. Transcribe (Whisper)
            transcription = asr_pipeline(audio_path)["text"]
            clean_transcript = clean_arabic_text(transcription)
            # B. Analyze audio tone
            excitement = get_acoustic_excitement(audio_path)
            # C. Fuse them
            label, conf = multimodal_fusion(clean_transcript, excitement)
            return (
                f"Transcribed: {clean_transcript}",
                f"{excitement:.2f}",
                label,
                f"{conf:.2f}",
                "✅ FUSED: Whisper ASR + Audio Tone"
            )
        except Exception as e:
            return f"Error: {str(e)}", "0.00", "Error", "0.00", "-"

    # CASE 3: Text only (unimodal).
    elif text_input and len(text_input.strip()) > 0:
        clean_text = clean_arabic_text(text_input)
        probs = predict_text(clean_text)
        labels = ["SAFE ✅", "NEEDS WARNING ⚠️", "TOXIC ❌"]
        if len(probs) == 2:  # Handle the 2-class backup model
            labels = ["SAFE ✅", "TOXIC ❌"]
        max_idx = np.argmax(probs)
        return clean_text, "N/A", labels[max_idx], f"{probs[max_idx]:.2f}", "⚠️ Text Only (No Tone Analysis)"

    return "Please upload audio or enter text.", "-", "-", "-", "-"
# --- 5. GRADIO UI (UNIFIED) ---
with gr.Blocks(title="Arabic Multimodal Hate Detector") as demo:
    gr.Markdown("## 🛑 Arabic Toxicity & Hate Speech Detection (Multimodal)")
    gr.Markdown(f"**Current Model:** `{MODEL_TO_USE}`")
    gr.Markdown("ℹ️ **How to use:** Upload audio to analyze speech. Optionally, type text *while* uploading audio to test specific 'Text + Tone' scenarios.")

    with gr.Row():
        # LEFT COLUMN: INPUTS
        with gr.Column():
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="1. Audio Input (Tone/Prosody)")
            text_in = gr.Textbox(label="2. Manual Text Override (Optional)", placeholder="Type here to override Whisper transcription...")
            submit_btn = gr.Button("Analyze Multimodal", variant="primary")

        # RIGHT COLUMN: OUTPUTS
        with gr.Column():
            status_box = gr.Textbox(label="Processing Mode")
            out_transcription = gr.Textbox(label="Text Content Used")
            with gr.Row():
                out_excitement = gr.Textbox(label="Acoustic Excitement (0-1)")
                out_conf = gr.Textbox(label="Confidence")
            out_label = gr.Textbox(label="FINAL CLASSIFICATION", scale=2)

    # Click event
    submit_btn.click(
        process_input,
        inputs=[audio_in, text_in],
        outputs=[out_transcription, out_excitement, out_label, out_conf, status_box]
    )
demo.launch()