Spaces:

st192011
/

Torgo-DSR-Lab

Runtime error

App Files Files Community

st192011 committed 1 day ago

Commit

005401d

verified ·

1 Parent(s): 1368417

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -190

app.py CHANGED Viewed

@@ -1,242 +1,203 @@
 import gradio as gr
-from gradio_client import Client
 import os
 import io
 import re
 import random
 import librosa
 import soundfile as sf
-import torch
-import numpy as np
 from transformers import pipeline
 from datasets import load_dataset, Audio
-import tempfile
-# ==========================================
-# 1. SETUP & AUTHENTICATION
-# ==========================================
-# HF Token for accessing Gated Datasets and Private Space
-HF_TOKEN = os.getenv("HF_TOKEN")
-# Private Backend Configuration
-PRIVATE_SPACE_URL = "st192011/Torgo-DSR-Private"
-print(f"Connecting to Private Backend at {PRIVATE_SPACE_URL}...")
-try:
-    backend_client = Client(PRIVATE_SPACE_URL, hf_token=HF_TOKEN)
-    print("✅ Successfully connected to Private Backend.")
-except Exception as e:
-    print(f"⚠️ Warning: Could not connect to backend. Error: {e}")
-    backend_client = None
-# ==========================================
-# 2. WHISPER TINY (Strict Colab Settings)
-# ==========================================
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Loading Whisper Tiny on {device}...")
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
-    device=device,
     generate_kwargs={
         "language": "en",
-        "task": "transcribe",
-        "repetition_penalty": 3.0,
-        "max_new_tokens": 64
     }
 )
-# ==========================================
-# 3. METADATA & DATA LOGIC
-# ==========================================
-SPEAKER_META = {
-    "F01": {"Gender": "Female", "Severity": "Severe", "Dataset": "Torgo"},
-    "F03": {"Gender": "Female", "Severity": "Mild", "Dataset": "Torgo"},
-    "F04": {"Gender": "Female", "Severity": "Mild", "Dataset": "Torgo"},
-    "M01": {"Gender": "Male", "Severity": "Moderate", "Dataset": "Torgo"},
-    "M02": {"Gender": "Male", "Severity": "Mild", "Dataset": "Torgo"},
-    "M03": {"Gender": "Male", "Severity": "Mild", "Dataset": "Torgo"},
-    "M04": {"Gender": "Male", "Severity": "Moderate", "Dataset": "Torgo"},
-    "M05": {"Gender": "Male", "Severity": "Severe", "Dataset": "Torgo"},
-    "F02 (UA)": {"Gender": "Female", "Severity": "Severe", "Dataset": "UA-Speech"}
-}
 def get_sample_logic(speaker_id):
-    """
-    Exact logic from Colab:
-    - Uses abnerh/TORGO-database for Torgo
-    - Uses resproj007/uaspeech_female for UA
-    - Uses librosa + io.BytesIO for decoding
-    """
-    print(f"Attempting to load sample for: {speaker_id}")
     try:
         if "UA" in speaker_id:
-            # UA-Speech logic: Direct pull
-            ds = load_dataset("resproj007/uaspeech_female", split="train", streaming=True, token=HF_TOKEN)
-            ds = ds.cast_column("audio", Audio(decode=False))
-            # F02 is the only speaker here, skip random amount for variety
-            iterator = iter(ds.skip(random.randint(0, 50)))
-            sample = next(iterator)
         else:
-            # Torgo logic: abnerh dataset with filtering
-            ds = load_dataset("abnerh/TORGO-database", split="train", streaming=True, token=HF_TOKEN)
-            ds = ds.cast_column("audio", Audio(decode=False))
-            # Filter by speaker ID
             def filter_spk(x):
-                # Try to get speaker_id from metadata, fall back to filename parsing
                 sid = str(x.get('speaker_id', '')).upper()
                 if not sid or sid == "NONE":
-                    path = x.get('audio', {}).get('path', '')
-                    sid = os.path.basename(path).split('_')[0].upper()
                 return sid == speaker_id
-            speaker_ds = ds.filter(filter_spk)
-            # Shuffle buffer to get random samples
-            iterator = iter(speaker_ds.shuffle(buffer_size=10))
-            sample = next(iterator)
-        # Metadata extraction
-        gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
-        # --- Manual Byte Decoding (Colab Logic) ---
         audio_bytes = sample['audio']['bytes']
-        # Load directly into librosa using BytesIO
-        audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
-        # Save to temp file for Gradio/Backend
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            sf.write(tmp.name, audio_data, sample_rate)
-            temp_path = tmp.name
-        return temp_path, gt_text.lower().strip(), SPEAKER_META.get(speaker_id, {})
-    except StopIteration:
-        return None, "Error: Could not find any samples for this speaker in the stream.", {}
     except Exception as e:
-        return None, f"Dataset Error: {str(e)}", {}
-def run_whisper_step(audio_path):
-    """
-    Step 2: Baseline ASR
-    """
-    if not audio_path:
-        return "No audio loaded", ""
-    try:
-        result = whisper_asr(audio_path)
-        raw_w = result["text"]
-        # Normalized Baseline (No punctuation, lowercase)
-        norm_w = re.sub(r'[^\w\s]', '', raw_w).lower().strip()
-        return raw_w, norm_w
-    except Exception as e:
-        return f"Whisper Error: {e}", "Error"
-def run_model_step(audio_path):
-    """
-    Step 3: Private Backend Reconstruction
-    """
-    if not audio_path:
-        return "No audio loaded", "Step 1 incomplete"
-    if not backend_client:
-        return None, "⚠️ Backend Disconnected. Check Private Space."
     try:
-        print("Sending audio to Private Backend...")
-        # Calls the /predict_dsr endpoint in the private space
-        # Expecting returns: [Audio Path, Transcription String]
-        result = backend_client.predict(
-            audio_path,
-            api_name="/predict_dsr"
-        )
-        reconstructed_audio = result[0]
-        dsr_text = result[1]
-        return reconstructed_audio, dsr_text
     except Exception as e:
-        return None, f"Backend Prediction Error: {str(e)}"
-# ==========================================
-# 4. GRADIO UI
-# ==========================================
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
-    gr.Markdown(
-        """
-        # ⚗️ Torgo DSR Lab
-        **Integrated Research Interface** | *Syncs with Colab Logic*
-        """
-    )
-    # State to hold the current audio path across steps
-    current_audio_state = gr.State("")
-    with gr.Row():
-        # --- COLUMN 1: LOAD ---
-        with gr.Column(scale=1):
-            gr.Markdown("### Step 1: Load Sample")
-            speaker_input = gr.Dropdown(
-                choices=sorted(list(SPEAKER_META.keys())),
-                label="Speaker ID",
-                value="F01"
-            )
-            load_btn = gr.Button("🎲 Load Data", variant="secondary")
-            # Displays
-            input_audio_display = gr.Audio(label="Input Audio", type="filepath", interactive=False)
-            gt_box = gr.Textbox(label="Ground Truth", interactive=False)
-            meta_display = gr.JSON(label="Speaker Meta")
-        # --- COLUMN 2: BASELINE ---
-        with gr.Column(scale=1):
-            gr.Markdown("### Step 2: ASR Baseline")
-            whisper_btn = gr.Button("Run Whisper Tiny")
-            w_raw = gr.Textbox(label="Whisper Raw")
-            w_norm = gr.Textbox(label="Whisper Normalized (WER Check)")
-        # --- COLUMN 3: RECONSTRUCTION ---
-        with gr.Column(scale=1):
-            gr.Markdown("### Step 3: Neural Reconstruction")
-            model_btn = gr.Button("🚀 Run 10K Triple-Mix Model", variant="primary")
-            output_audio_display = gr.Audio(label="Reconstructed Audio", type="filepath", interactive=False)
-            final_out_text = gr.Textbox(label="DSR Transcription")
-    # ==========================================
-    # EVENT HANDLERS
-    # ==========================================
-    # Step 1: Load
-    def on_load(speaker_id):
-        path, text, meta = get_sample_logic(speaker_id)
-        if path:
-            return path, path, text, meta # Update State, Audio Player, Text, JSON
-        else:
-            return None, None, text, meta # Handle errors passed in 'text' variable
     load_btn.click(
-        fn=on_load,
-        inputs=[speaker_input],
-        outputs=[current_audio_state, input_audio_display, gt_box, meta_display]
     )
-    # Step 2: Whisper
     whisper_btn.click(
-        fn=run_whisper_step,
-        inputs=[current_audio_state],
         outputs=[w_raw, w_norm]
     )
-    # Step 3: Backend Model
     model_btn.click(
-        fn=run_model_step,
-        inputs=[current_audio_state],
-        outputs=[output_audio_display, final_out_text]
     )
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import os
 import io
 import re
 import random
 import librosa
 import soundfile as sf
+import pandas as pd
 from transformers import pipeline
 from datasets import load_dataset, Audio
+from gradio_client import Client
+from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
+# 1. Initialize Baseline ASR (Strict English, Repetition Penalty 3.0)
+# The pipeline is built at module import, so the same object is reused by
+# process_audio_step_1 on every call.
+print("Initializing Whisper Tiny Baseline...")
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
     generate_kwargs={
         "language": "en",
+        "task": "transcribe",
+        "repetition_penalty": 3.0
     }
 )
+# Read from the environment; None when HF_TOKEN is not set.
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Space identifier handed to gradio_client.Client in process_audio_step_2.
+PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
+def normalize_text(text):
+    """Lowercase ``text`` and strip punctuation and surrounding whitespace.
+
+    Every character that is neither a word character nor whitespace is
+    removed. Falsy input (``None`` or ``""``) yields ``""``.
+    """
+    if text:
+        without_punct = re.sub(r'[^\w\s]', '', text)
+        return without_punct.lower().strip()
+    return ""
+def format_audio(audio_path):
+    """Resample ``audio_path`` to 16 kHz and write it to a fresh WAV file.
+
+    Args:
+        audio_path: Path of the audio file to load with ``librosa``.
+
+    Returns:
+        Path of the newly written WAV file.
+    """
+    import tempfile
+
+    y, sr = librosa.load(audio_path, sr=16000)
+    # Use a unique file per call: a fixed name in the working directory lets
+    # concurrent requests overwrite each other's audio.
+    # delete=False keeps the file on disk because the caller reads it by path
+    # after this function returns.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        out_path = tmp.name
+    sf.write(out_path, y, sr)
+    return out_path
+# --- Logic: Data Loading ---
 def get_sample_logic(speaker_id):
+    """Fetch one utterance for ``speaker_id`` and stage it as a 16 kHz WAV file.
+
+    Speaker ids containing "UA" are read from ``resproj007/uaspeech_female``;
+    all other ids are streamed from ``abnerh/TORGO-database`` and filtered by
+    speaker.
+
+    Returns:
+        ``(wav_path, ground_truth, speaker_meta)`` on success, or
+        ``(None, error_message, {})`` on failure, so the three outputs wired
+        to the load button always receive a value.
+    """
+    import tempfile
+
     try:
         if "UA" in speaker_id:
+            # UA-Speech Access (Direct pull for F02)
+            dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
+            dataset = dataset.cast_column("audio", Audio(decode=False))
+            # UA is small, skip slightly for variety
+            sample = next(iter(dataset.skip(random.randint(0, 30))))
+            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
         else:
+            # Torgo Access (Manual filtering as per Colab fix)
+            dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
+            dataset = dataset.cast_column("audio", Audio(decode=False))
             def filter_spk(x):
                 sid = str(x.get('speaker_id', '')).upper()
                 if not sid or sid == "NONE":
+                    # Fall back to the file-name prefix; tolerate a missing
+                    # or None path instead of raising inside the filter.
+                    path = (x.get('audio') or {}).get('path') or ''
+                    sid = os.path.basename(path).split('_')[0].upper()
                 return sid == speaker_id
+            speaker_ds = dataset.filter(filter_spk)
+            sample = next(iter(speaker_ds.shuffle(buffer_size=10)))
+            gt_text = sample.get('transcription') or sample.get('text')
+        # Decode Bytes manually to bypass torchcodec errors
         audio_bytes = sample['audio']['bytes']
+        audio_data, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000)
+        # Unique file per call so concurrent sessions do not overwrite each
+        # other's sample; kept on disk because later steps read it by path.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            temp_path = tmp.name
+        sf.write(temp_path, audio_data, sr)
+        # A sample without any transcript field must not crash on .lower().
+        gt_clean = (gt_text or "Unknown").lower().strip()
+        # Unknown speaker ids get empty metadata instead of a KeyError.
+        return temp_path, gt_clean, SPEAKER_META.get(speaker_id, {})
+    except StopIteration:
+        # str(StopIteration()) is empty, so report the exhausted stream explicitly.
+        return None, "Error: Could not find any samples for this speaker in the stream.", {}
     except Exception as e:
+        return None, f"Dataset Error: {e}", {}
+# --- Logic: Model Processing ---
+def process_audio_step_1(audio_path):
+    """Run the Whisper baseline on ``audio_path``.
+
+    Returns:
+        ``(raw_transcript, normalized_transcript)``. When no audio is
+        selected the pair is ``("No audio", "")``. When decoding or
+        recognition fails the first element carries the error text and the
+        second is ``""``, which process_audio_step_2 rejects as incomplete
+        input.
+    """
+    if not audio_path:
+        return "No audio", ""
+    try:
+        # Pre-process audio format
+        formatted_path = format_audio(audio_path)
+        # Run Whisper
+        result = whisper_asr(formatted_path)
+    except Exception as e:
+        # Report the failure in the UI, matching the other handlers, rather
+        # than letting the exception escape the click callback.
+        return f"Whisper Error: {e}", ""
+    raw_w = result["text"]
+    norm_w = normalize_text(raw_w)
+    return raw_w, norm_w
+def process_audio_step_2(audio_path, norm_whisper):
+    """Forward the audio and the normalized Whisper text to the private backend.
+
+    Returns the backend's ``/predict_dsr`` result, a fixed message when either
+    input is missing, or an error string when the call fails.
+    """
+    if audio_path and norm_whisper:
+        try:
+            wav_file = format_audio(audio_path)
+            backend = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
+            return backend.predict(wav_file, norm_whisper, api_name="/predict_dsr")
+        except Exception as err:
+            return f"Backend Connection Required. Details: {err}"
+    return "Incomplete input from previous steps."
+# --- UI Construction ---
+# Two tabs: "Laboratory" (load audio, run Whisper, run the backend model) and
+# "Research Statistics" (static text plus two tables built by stats_data helpers).
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
+    gr.Markdown("# ⚗️ Torgo DSR Lab")
+    gr.Markdown("Neural Reconstruction Layer for Torgo (In-domain/LOSO) and UA-Speech (Zero-Shot).")
+    # Hidden state to store the path of the currently active audio
+    active_audio_path = gr.State("")
+    with gr.Tab("🔬 Laboratory"):
+        with gr.Row():
+            # LEFT COLUMN: Data Input
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown("### Channel A: Research Datasets")
+                    speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Select Speaker Profile", value="F01")
+                    load_btn = gr.Button("Load Sample from Dataset")
+                    gt_box = gr.Textbox(label="Ground Truth (Reference)", interactive=False)
+                    meta_display = gr.JSON(label="Speaker Metadata")
+                gr.Markdown("---")
+                with gr.Group():
+                    gr.Markdown("### Channel B: Personal Input")
+                    user_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or Upload Audio")
+                    user_load_btn = gr.Button("Use This Audio")
+            # RIGHT COLUMN: Transcripts
+            with gr.Column(scale=2):
+                gr.Markdown("### Analysis & Reconstruction")
+                with gr.Group():
+                    gr.Markdown("#### Step 1: ASR Baseline")
+                    whisper_btn = gr.Button("Run Whisper Tiny")
+                    w_raw = gr.Textbox(label="Whisper Raw Transcript")
+                    w_norm = gr.Textbox(label="Whisper Normalized (Input for Model)")
+                gr.Markdown("---")
+                with gr.Group():
+                    gr.Markdown("#### Step 2: Neural Reconstruction")
+                    model_btn = gr.Button("Run Our Correction Model", variant="primary")
+                    final_out = gr.Textbox(label="DSR Lab Prediction (5K Model)")
+    with gr.Tab("📊 Research Statistics"):
+        gr.Markdown("# 🔬 Performance Evaluation")
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("""
+                ### 📏 Metric: Exact Match Accuracy
+                Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
+                """)
+            with gr.Column():
+                gr.Markdown("""
+                ### 🧪 Model Definitions
+                * **5K Pure Model:** Trained on real-world Torgo articulatory distortions. Optimized for phonetic fidelity.
+                * **10K Triple-Mix Model:** Includes synthetic data and anchors; utilized for generalization (LOSO) testing.
+                """)
+        gr.Markdown("---")
+        gr.Markdown("## 1. Torgo In-Domain Analysis (By Speaker)")
+        gr.DataFrame(get_indomain_breakdown())
+        gr.Markdown("## 2. Experimental Milestone Summary")
+        gr.DataFrame(get_experimental_summary())
+        gr.Markdown("""
+        ### 🔍 Key Discovery: The Acoustic Floor
+        Our research found that the **5K Pure Model** achieved higher accuracy in both in-domain and zero-shot tasks. This suggests an **'Acoustic Floor'** exists where real-world phonetic distortions are more valuable for model grounding than synthetic linguistic diversity.
+        """)
+    # --- Event Handlers ---
+    # Dataset Channel: Load -> Update State -> Update UI Text/Meta
+    # get_sample_logic returns a 3-tuple in the same order as the outputs below.
     load_btn.click(
+        get_sample_logic,
+        inputs=speaker_input,
+        outputs=[active_audio_path, gt_box, meta_display]
+    )
+    # Personal Channel: Use Audio -> Update State -> Clear GT
+    # The lambda forwards the recorded/uploaded file path unchanged and fills
+    # the other two outputs with fixed placeholder values.
+    user_load_btn.click(
+        lambda x: (x, "User Provided Audio", {"Dataset": "Custom", "Severity": "Unknown"}),
+        inputs=user_audio,
+        outputs=[active_audio_path, gt_box, meta_display]
     )
+    # Step 1: Whisper (Uses State)
     whisper_btn.click(
+        process_audio_step_1,
+        inputs=active_audio_path,
         outputs=[w_raw, w_norm]
     )
+    # Step 2: Model (Uses State + Whisper result)
+    # The current w_norm textbox value is passed along; process_audio_step_2
+    # returns an "incomplete input" message when it is empty.
     model_btn.click(
+        process_audio_step_2,
+        inputs=[active_audio_path, w_norm],
+        outputs=final_out
     )
+# NOTE(review): launch() runs unconditionally when this module is executed or
+# imported; there is no `if __name__ == "__main__":` guard here.
+demo.launch()