import gradio as gr
import torch
import librosa
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from preprocess import clean_arabic_text
# --- 1. CONFIGURATION ---
CUSTOM_MODEL_ID = "rana811/final_arabic_model"
BACKUP_MODEL = "hossam87/bert-base-arabic-hate-speech"

print(f"🔄 Connecting to Hugging Face Hub: {CUSTOM_MODEL_ID}...")
# --- 2. LOAD MODELS ---
# Whisper handles speech-to-text. It is loaded outside the try/except so that
# asr_pipeline exists even if the custom classifier below fails to load.
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")

try:
    # Custom fine-tuned Arabic toxicity classifier
    tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(CUSTOM_MODEL_ID)
    model.eval()
    MODEL_TO_USE = CUSTOM_MODEL_ID
    print("✅ Successfully loaded rana811/final_arabic_model!")
except Exception as e:
    print(f"❌ Error loading custom model: {e}")
    print(f"⚠️ Switching to backup: {BACKUP_MODEL}")
    # Fall back to the generic model if something goes wrong
    tokenizer = AutoTokenizer.from_pretrained(BACKUP_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(BACKUP_MODEL)
    model.eval()
    MODEL_TO_USE = BACKUP_MODEL
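
# NOTE (assumption, inferred from the code below): the custom model is expected
# to emit three logits ordered [safe, warning, toxic] and the backup model two
# ordered [safe, toxic]; multimodal_fusion() and process_input() both rely on
# that index convention.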
# --- 3. HELPER FUNCTIONS ---
def get_acoustic_excitement(audio_path):
    """Estimate vocal arousal on a 0-1 scale from loudness, noisiness, and tempo."""
    y, sr = librosa.load(audio_path, sr=16000)
    rms = np.mean(librosa.feature.rms(y=y))               # loudness
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))  # noisiness / harshness
    tempo_arr = librosa.feature.tempo(y=y, sr=sr)
    tempo = tempo_arr[0] if len(tempo_arr) > 0 else 110   # fallback tempo (BPM)
    # Normalize each cue into [0, 1]; clamp tempo so slow speech cannot go negative
    norm_rms = min(rms * 10, 1.0)
    norm_zcr = min(zcr * 10, 1.0)
    norm_tempo = min(max((tempo - 60) / 100, 0.0), 1.0)
    # Weighted combination of the three cues
    excitement = (0.5 * norm_rms) + (0.3 * norm_tempo) + (0.2 * norm_zcr)
    return excitement
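
# Worked example, for intuition: a loud, fairly fast, harsh clip with
# norm_rms=0.8, norm_tempo=0.5 and norm_zcr=0.4 scores
#   0.5*0.8 + 0.3*0.5 + 0.2*0.4 = 0.40 + 0.15 + 0.08 = 0.63,
# which falls between the "calm" (<0.3) and "escalation" (>0.7) thresholds
# used by multimodal_fusion() below.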
def predict_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    return probs.tolist()
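
# Example (hypothetical values): predict_text(some_text) -> [0.91, 0.06, 0.03],
# a softmax distribution over the model's labels that sums to 1, in the index
# order noted above.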
def multimodal_fusion(text, audio_excitement):
    probs = predict_text(text)
    if len(probs) == 3:
        prob_safe, prob_warning, prob_toxic = probs
    else:
        # Backup model is binary (safe/toxic): no "warning" class
        prob_safe = probs[0]
        prob_toxic = probs[1]
        prob_warning = 0.0

    final_label = "Unknown"
    confidence = 0.0

    if prob_toxic > 0.5:
        final_label = "TOXIC ❌"
        confidence = prob_toxic
        if audio_excitement < 0.3:
            final_label = "NEEDS WARNING ⚠️ (Toxic Text, Calm Voice)"
    elif prob_warning > prob_toxic and prob_warning > prob_safe:
        final_label = "NEEDS WARNING ⚠️"
        confidence = prob_warning
        if audio_excitement > 0.7:
            final_label = "TOXIC ❌ (Escalated by Tone)"
    else:
        final_label = "SAFE ✅"
        confidence = prob_safe
        if audio_excitement > 0.85:
            final_label = "NEEDS WARNING ⚠️ (Aggressive Yelling)"

    return final_label, confidence
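
# Fusion rules at a glance (a summary of the branches above):
#   toxic text (>0.5)        + calm voice    (<0.3)  -> downgraded to WARNING
#   warning text (top class) + excited voice (>0.7)  -> escalated to TOXIC
#   safe text (default)      + extreme voice (>0.85) -> flagged as WARNING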
# --- 4. MAIN APP FUNCTION ---
def process_input(audio_path, text_input):
    # CASE 1: BOTH inputs are present (the "Scenario Tester").
    # Use the TYPED text for meaning and the AUDIO for tone.
    if audio_path is not None and text_input is not None and len(text_input.strip()) > 0:
        # A. Use typed text directly (skip Whisper)
        clean_text = clean_arabic_text(text_input)
        # B. Analyze audio tone
        try:
            excitement = get_acoustic_excitement(audio_path)
            # C. Fuse them
            label, conf = multimodal_fusion(clean_text, excitement)
            return (
                f"Used Manual Text: {clean_text}",
                f"{excitement:.2f}",
                label,
                f"{conf:.2f}",
                "✅ FUSED: Manual Text + Audio Tone"
            )
        except Exception as e:
            return "Error with audio file", "0.00", "Error", "0.00", str(e)

    # CASE 2: Audio only (standard multimodal).
    # Use Whisper to get text, then fuse with audio tone.
    elif audio_path is not None:
        try:
            # A. Transcribe (Whisper)
            transcription = asr_pipeline(audio_path)["text"]
            clean_transcript = clean_arabic_text(transcription)
            # B. Analyze audio tone
            excitement = get_acoustic_excitement(audio_path)
            # C. Fuse them
            label, conf = multimodal_fusion(clean_transcript, excitement)
            return (
                f"Transcribed: {clean_transcript}",
                f"{excitement:.2f}",
                label,
                f"{conf:.2f}",
                "✅ FUSED: Whisper ASR + Audio Tone"
            )
        except Exception as e:
            return f"Error: {str(e)}", "0.00", "Error", "0.00", "-"

    # CASE 3: Text only (unimodal).
    elif text_input and len(text_input.strip()) > 0:
        clean_text = clean_arabic_text(text_input)
        probs = predict_text(clean_text)
        labels = ["SAFE ✅", "NEEDS WARNING ⚠️", "TOXIC ❌"]
        if len(probs) == 2:
            labels = ["SAFE ✅", "TOXIC ❌"]  # handle binary backup model
        max_idx = np.argmax(probs)
        return clean_text, "N/A", labels[max_idx], f"{probs[max_idx]:.2f}", "⚠️ Text Only (No Tone Analysis)"

    return "Please upload audio or enter text.", "-", "-", "-", "-"
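
# Return contract (all three cases above): a 5-tuple of strings,
#   (text_used, excitement_score, final_label, confidence, processing_mode),
# which must match the five output widgets wired up in the Gradio UI below.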
# --- 5. GRADIO UI (UNIFIED) ---
with gr.Blocks(title="Arabic Multimodal Hate Detector") as demo:
    gr.Markdown("## 🛡️ Arabic Toxicity & Hate Speech Detection (Multimodal)")
    gr.Markdown(f"**Current Model:** `{MODEL_TO_USE}`")
    gr.Markdown("ℹ️ **How to use:** Upload audio to analyze speech. Optionally, type text *while* uploading audio to test specific 'Text + Tone' scenarios.")

    with gr.Row():
        # LEFT COLUMN: INPUTS
        with gr.Column():
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="1. Audio Input (Tone/Prosody)")
            text_in = gr.Textbox(label="2. Manual Text Override (Optional)", placeholder="Type here to override Whisper transcription...")
            submit_btn = gr.Button("Analyze Multimodal", variant="primary")

        # RIGHT COLUMN: OUTPUTS
        with gr.Column():
            status_box = gr.Textbox(label="Processing Mode")
            out_transcription = gr.Textbox(label="Text Content Used")
            with gr.Row():
                out_excitement = gr.Textbox(label="Acoustic Excitement (0-1)")
                out_conf = gr.Textbox(label="Confidence")
            out_label = gr.Textbox(label="FINAL CLASSIFICATION", scale=2)

    # Click event
    submit_btn.click(
        process_input,
        inputs=[audio_in, text_in],
        outputs=[out_transcription, out_excitement, out_label, out_conf, status_box]
    )

demo.launch()
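
# On Hugging Face Spaces the bare launch() above is sufficient; for local
# debugging, Gradio also accepts options such as demo.launch(share=True) or
# demo.launch(server_name="0.0.0.0") if needed.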