Spaces:

st192011
/

Torgo-DSR-Lab

Running

App Files Files Community

st192011 committed 9 days ago

Commit

4eb1313

verified ·

1 Parent(s): f310c2d

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -87

app.py CHANGED Viewed

@@ -1,137 +1,127 @@
 import gradio as gr
 import os
-import random
-import soundfile as sf
-import re
 import io
 import librosa
-import torch
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
-# 1. Initialize Local Whisper Tiny (Baseline)
-# CPU friendly, fast inference
-whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
-# 2. Private Backend Config
 HF_TOKEN = os.getenv("HF_TOKEN")
-PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
-def get_sample(speaker_id):
-    """Integrated loading logic from your research code."""
     try:
-        if speaker_id == "F02":
-            # UA-Speech loading logic
-            dataset = load_dataset("resproj007/uaspeech_female", split="test", streaming=True)
-            # F02 is usually the primary speaker in this slice
-            sample = next(iter(dataset.shuffle(buffer_size=20)))
-            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence', 'Unknown')
-            audio_data = sample['audio']['array']
-            sample_rate = sample['audio']['sampling_rate']
         else:
-            # Torgo loading logic
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
-            # Cast for manual decoding as per your training script
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            # Filter by speaker
-            speaker_ds = dataset.filter(lambda x: str(x.get('speaker_id', '')).upper() == speaker_id)
-            sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
-            # Extract ground truth
-            gt_text = sample.get('transcription') or sample.get('text', 'Unknown')
-            # Decode Audio bytes
-            audio_bytes = sample['audio']['bytes']
-            audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
-        # Save to temporary file for Gradio and Whisper
-        temp_path = "temp_sample.wav"
         sf.write(temp_path, audio_data, sample_rate)
         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
     except Exception as e:
-        return None, f"Error accessing dataset: {e}", None
-def run_correction(audio_path, gt_text):
-    if audio_path is None:
-        return "No audio provided", "", "Please load a sample or record audio."
-    # A. Local Whisper Inference
-    try:
-        w_res = whisper_asr(audio_path)
-        w_raw = w_res["text"]
-        w_norm = re.sub(r'[^\w\s]', '', w_raw).lower().strip()
-    except Exception as e:
-        return f"Whisper Error: {e}", "", ""
-    # B. Call Private Backend
-    # This sends the audio and the whisper transcript to your private Gemma model
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        # Note: Your private backend should expect (audio_file, whisper_text)
-        res_5k, res_10k = client.predict(audio_path, w_norm, api_name="/predict_dsr_dual")
     except Exception as e:
-        res_5k = "Backend Offline"
-        res_10k = "Please ensure the Private Space is running."
-    return w_raw, res_5k, res_10k
-# UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
-    gr.Markdown("### Neural Reconstruction Layer for Torgo and UA-Speech Zero-Shot")
-    with gr.Tab("🔬 Interactive Lab"):
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("#### 1. Select and Load Sample")
-                spk_input = gr.Dropdown(list(SPEAKER_META.keys()), label="Speaker ID", value="F01")
-                load_btn = gr.Button("🎲 Get Random Sample", variant="secondary")
-                gr.Markdown("---")
-                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
             with gr.Column(scale=2):
-                gr.Markdown("#### 2. Metadata & Comparison")
-                with gr.Row():
-                    gt_box = gr.Textbox(label="Ground Truth", interactive=False)
-                    meta_box = gr.JSON(label="Speaker Meta")
-                w_out = gr.Textbox(label="Whisper Tiny Baseline (Raw Transcript)")
-                with gr.Row():
-                    out_5k = gr.Textbox(label="5K Pure Model Prediction")
-                    out_10k = gr.Textbox(label="10K Triple-Mix Prediction")
-        run_btn = gr.Button("🚀 Run ASR & Reconstruction", variant="primary")
     with gr.Tab("📊 Research Statistics"):
         gr.Markdown("# 🔬 Performance Evaluation")
         with gr.Row():
             with gr.Column():
                 gr.Markdown("""
                 ### 📏 Metric: Exact Match Accuracy
-                Accuracy is calculated as the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.
                 """)
             with gr.Column():
                 gr.Markdown("""
                 ### 🧪 Model Definitions
-                * **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
-                * **10K Triple-Mix Model:** Includes phonetic anchors and synthetic data. Used for Generalization (LOSO) testing.
                 """)
-        gr.Markdown("## 1. Torgo In-Domain Breakdown (By Speaker)")
         gr.DataFrame(get_indomain_breakdown())
-        gr.Markdown("## 2. Experimental Condition Summary")
         gr.DataFrame(get_experimental_summary())
-    # Event Handlers
-    load_btn.click(get_sample, inputs=spk_input, outputs=[audio_input, gt_box, meta_box])
-    run_btn.click(run_correction, inputs=[audio_input, gt_box], outputs=[w_out, out_5k, out_10k])
 demo.launch()

 import gradio as gr
 import os
 import io
+import re
+import random
 import librosa
+import soundfile as sf
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
+# 1. Initialize Baseline ASR: Whisper Tiny pipeline built once at import time, generation forced to English transcription
+whisper_asr = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-tiny",
+    generate_kwargs={"language": "en", "task": "transcribe"}
+)
+# 2. Configuration from Space Secrets (read from environment variables; os.getenv yields None when a variable is unset)
 HF_TOKEN = os.getenv("HF_TOKEN")
+PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")  # passed to gradio_client.Client in run_model_step
+def normalize_text(text):
+    if not text: return ""
+    return re.sub(r'[^\w\s]', '', text).lower().strip()
+def get_sample_logic(speaker_id):
+    """Bypasses internal decoders to ensure data access works for both datasets."""
     try:
+        if speaker_id == "F02 (UA)":
+            dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
+            dataset = dataset.cast_column("audio", Audio(decode=False))
+            speaker_ds = dataset.filter(lambda x: x["speaker_id"] == "F02")
         else:
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
+            def filter_spk(x):
+                sid = str(x.get('speaker_id', '')).upper()
+                if not sid or sid == "NONE":
+                    sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
+                return sid == speaker_id
+            speaker_ds = dataset.filter(filter_spk)
+        sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
+        gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
+        audio_bytes = sample['audio']['bytes']
+        audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
+        temp_path = "current_sample.wav"
         sf.write(temp_path, audio_data, sample_rate)
         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
     except Exception as e:
+        return None, f"Dataset Error: {e}", {}
+def run_whisper_step(audio_path):
+    if not audio_path: return "No audio loaded", ""
+    result = whisper_asr(audio_path)
+    raw_w = result["text"]
+    norm_w = normalize_text(raw_w)
+    return raw_w, norm_w
+def run_model_step(audio_path, norm_whisper):
+    if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
+        # Private app expects audio and normalized whisper
+        # Adjust api_name to match your private space definition
+        prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
+        return prediction
     except Exception as e:
+        return f"Backend Error: {e}. Ensure Private Space is running."
+# UI Layout: a "Laboratory" tab with three button-driven steps and a "Research Statistics" tab with static tables
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
+    gr.Markdown("Stepwise evaluation of standard ASR vs. Neural Reconstruction Layer.")
+    current_audio_path = gr.State("")  # path of the loaded sample WAV; written by the load step, read by the other two
+    with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("### Step 1: Load Data")
+                speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")  # choices are the SPEAKER_META keys
+                load_btn = gr.Button("Load Data")
+                meta_display = gr.JSON(label="Speaker Meta")
+                gt_box = gr.Textbox(label="Ground Truth")  # also receives the error message when loading fails
             with gr.Column(scale=2):
+                gr.Markdown("### Step 2: ASR Baseline")
+                whisper_btn = gr.Button("Run Whisper Tiny")
+                w_raw = gr.Textbox(label="Whisper Raw")
+                w_norm = gr.Textbox(label="Whisper Normalized")  # its current value is an input of the model step
+                gr.Markdown("---")
+                gr.Markdown("### Step 3: Neural Reconstruction")
+                model_btn = gr.Button("Run Our Model", variant="primary")
+                final_out = gr.Textbox(label="DSR Lab Prediction")
     with gr.Tab("📊 Research Statistics"):
         gr.Markdown("# 🔬 Performance Evaluation")
         with gr.Row():
             with gr.Column():
                 gr.Markdown("""
                 ### 📏 Metric: Exact Match Accuracy
+                Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
                 """)
             with gr.Column():
                 gr.Markdown("""
                 ### 🧪 Model Definitions
+                * **5K Pure Model:** Trained on real-world Torgo distortions. Optimized for articulatory accuracy.
+                * **10K Triple-Mix Model:** Includes synthetic data and anchors; tested on unseen speakers (LOSO).
                 """)
+        gr.Markdown("---")
+        gr.Markdown("## 1. Torgo In-Domain Analysis (By Speaker)")
         gr.DataFrame(get_indomain_breakdown())
+        gr.Markdown("## 2. Experimental Milestone Summary")
         gr.DataFrame(get_experimental_summary())
+    # Connectivity: one click handler per step; the audio path travels between steps via current_audio_path
+    load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
+    whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
+    model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)
 demo.launch()