import os
import random
import re

import gradio as gr
import soundfile as sf
from datasets import load_dataset
from gradio_client import Client
from transformers import pipeline

from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META

# 1. Initialize local Whisper (baseline ASR; the tiny checkpoint keeps the demo light)
whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# 2. Set up the private backend connection (correction logic stays hidden)
HF_TOKEN = os.getenv("HF_TOKEN")
PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"  # Update with your private Space name


def normalize_text(text):
    """Normalize for comparison: strip punctuation, lowercase, trim whitespace."""
    return re.sub(r"[^\w\s]", "", text).lower().strip()


def get_sample(speaker_id):
    """Stream a random test sample for the chosen speaker from HF Datasets."""
    try:
        if "UA" in speaker_id:
            # UA-Speech profile maps to speaker F02 in the severity-high subset
            path = "ngdiana/uaspeech_severity_high"
            actual_spk = "F02"
        else:
            path = "unsw-cse/torgo"
            actual_spk = speaker_id

        # Stream the dataset to avoid huge downloads
        ds = load_dataset(path, split="test", streaming=True)

        # Keep only utterances from the chosen speaker
        speaker_ds = ds.filter(lambda x: x["speaker_id"] == actual_spk)

        # Buffer a small batch and pick one sample at random
        samples = list(speaker_ds.take(20))
        sample = random.choice(samples)

        audio_path = "sample_audio.wav"
        sf.write(audio_path, sample["audio"]["array"], sample["audio"]["sampling_rate"])
        return audio_path, sample["text"], SPEAKER_META[speaker_id]
    except Exception as e:
        return None, f"Error accessing dataset: {e}", None


def run_correction(audio_path, gt_text):
    if audio_path is None:
        return "No audio input", "", ""

    # A. Local Whisper inference
    w_raw = whisper_asr(audio_path)["text"]
    w_norm = normalize_text(w_raw)

    # B. Call the private backend for the 5K and 10K results
    try:
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
        # The private app receives the audio plus the normalized Whisper
        # transcript and returns (5k_pred, 10k_pred). Note: with
        # gradio_client >= 1.0 the audio argument may need to be wrapped
        # in gradio_client.handle_file(audio_path).
        res_5k, res_10k = client.predict(audio_path, w_norm, api_name="/predict_dsr_dual")
    except Exception as e:
        res_5k, res_10k = "Backend Connection Required", f"Details: {e}"

    return w_raw, res_5k, res_10k
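
# ---------------------------------------------------------------------------
# For reference: a minimal sketch of what the private backend behind
# "/predict_dsr_dual" could look like. This is an assumption for illustration
# only; the real st192011/Torgo-DSR-Private Space is not published here, and
# the correction logic below is a placeholder. The only contract this file
# relies on is (audio filepath, normalized Whisper text) in, two strings out.
#
#     import gradio as gr
#
#     def predict_dsr_dual(audio_path, whisper_norm):
#         # Placeholder: the real Space would run the 5K and 10K
#         # correction models on the audio + Whisper hypothesis here.
#         return f"5k::{whisper_norm}", f"10k::{whisper_norm}"
#
#     with gr.Blocks() as backend:
#         audio_in = gr.Audio(type="filepath")
#         text_in = gr.Textbox()
#         out_a = gr.Textbox()
#         out_b = gr.Textbox()
#         gr.Button("Run").click(
#             predict_dsr_dual,
#             inputs=[audio_in, text_in],
#             outputs=[out_a, out_b],
#             api_name="predict_dsr_dual",  # reachable as "/predict_dsr_dual"
#         )
#
#     backend.launch()
# ---------------------------------------------------------------------------
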
# UI Layout
with gr.Blocks(theme=gr.themes.Default(), title="Torgo DSR Lab") as demo:
    gr.Markdown("# ⚗️ Torgo DSR Lab")
    gr.Markdown("### Neural Reconstruction and ASR Correction for Torgo and UA-Speech")

    with gr.Tab("🔬 Laboratory"):
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("#### 1. Dataset Explorer")
                spk_input = gr.Dropdown(list(SPEAKER_META.keys()), label="Select Speaker Profile")
                load_btn = gr.Button("🎲 Load Random Dataset Sample")
                gr.Markdown("---")
                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
            with gr.Column(scale=2):
                gr.Markdown("#### 2. Metadata & Ground Truth")
                gt_box = gr.Textbox(label="Ground Truth (Human Label)", interactive=False)
                meta_box = gr.JSON(label="Speaker Characteristics")
                gr.Markdown("#### 3. Comparison Results")
                w_out = gr.Textbox(label="Whisper Tiny Baseline (Raw Transcript)")
                with gr.Row():
                    out_5k = gr.Textbox(label="5K Pure Model (Acoustic Focus)")
                    out_10k = gr.Textbox(label="10K Triple-Mix Model (Linguistic Focus)")
                run_btn = gr.Button("🚀 Run Correction Layer", variant="primary")

    with gr.Tab("📊 Research Statistics"):
        gr.Markdown("# 🔬 Evaluation Metrics")
        gr.Markdown(
            "**Metric:** Exact Match Accuracy, computed by comparing the "
            "**normalized prediction** (lowercase, no punctuation) against the "
            "**normalized ground truth**."
        )
        gr.Markdown("### 1. In-Domain Torgo Breakdown (By Speaker)")
        gr.DataFrame(get_indomain_breakdown())
        gr.Markdown("### 2. Experimental Milestone Summary")
        gr.Markdown("_Note: The 10K model was used to test generalization via leave-one-speaker-out (LOSO) evaluation on the unseen speaker F01._")
        gr.DataFrame(get_experimental_summary())

    # Event Logic
    load_btn.click(get_sample, inputs=spk_input, outputs=[audio_input, gt_box, meta_box])
    run_btn.click(run_correction, inputs=[audio_input, gt_box], outputs=[w_out, out_5k, out_10k])

demo.launch()
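
# Usage note (assumptions: this file is saved as app.py in the public Space,
# with stats_data.py alongside it providing get_indomain_breakdown,
# get_experimental_summary, and SPEAKER_META):
#   - Set HF_TOKEN (e.g. as a Space secret) so gradio_client can authenticate
#     against the private backend Space.
#   - Run locally with: python app.py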