Spaces:
Running
Update app.py
app.py CHANGED
@@ -3,108 +3,134 @@ import os
 import random
 import soundfile as sf
 import re
+import io
+import librosa
+import torch
 from transformers import pipeline
-from datasets import load_dataset
+from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
 
-# 1. Initialize Local Whisper (Baseline)
+# 1. Initialize Local Whisper Tiny (Baseline)
+# CPU-friendly, fast inference
 whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 
-# 2.
+# 2. Private Backend Config
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
-
-def normalize_text(text):
-    """Simple normalization for comparison: lowercase and strip punctuation."""
-    return re.sub(r'[^\w\s]', '', text).lower().strip()
 
 def get_sample(speaker_id):
+    """Integrated loading logic from your research code."""
     try:
+        if speaker_id == "F02":
+            # UA-Speech loading logic
+            dataset = load_dataset("resproj007/uaspeech_female", split="test", streaming=True)
+            # F02 is usually the primary speaker in this slice
+            sample = next(iter(dataset.shuffle(buffer_size=20)))
+            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence', 'Unknown')
+            audio_data = sample['audio']['array']
+            sample_rate = sample['audio']['sampling_rate']
         else:
-            speaker_ds = ds.filter(lambda x: x["speaker_id"] == actual_spk)
-            sample = random.choice(samples)
-            audio_path = "sample_audio.wav"
-            sf.write(audio_path, sample["audio"]["array"], sample["audio"]["sampling_rate"])
-            return audio_path, sample["text"], SPEAKER_META[speaker_id]
+            # Torgo loading logic
+            dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
+            # Cast for manual decoding as per your training script
+            dataset = dataset.cast_column("audio", Audio(decode=False))
+
+            # Filter by speaker
+            speaker_ds = dataset.filter(lambda x: str(x.get('speaker_id', '')).upper() == speaker_id)
+            sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
+
+            # Extract ground truth
+            gt_text = sample.get('transcription') or sample.get('text', 'Unknown')
+
+            # Decode audio bytes
+            audio_bytes = sample['audio']['bytes']
+            audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
 
+        # Save to a temporary file for Gradio and Whisper
+        temp_path = "temp_sample.wav"
+        sf.write(temp_path, audio_data, sample_rate)
+
+        return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
+
     except Exception as e:
         return None, f"Error accessing dataset: {e}", None
 
 def run_correction(audio_path, gt_text):
+    if audio_path is None:
+        return "No audio provided", "", "Please load a sample or record audio."
 
     # A. Local Whisper Inference
+    try:
+        w_res = whisper_asr(audio_path)
+        w_raw = w_res["text"]
+        w_norm = re.sub(r'[^\w\s]', '', w_raw).lower().strip()
+    except Exception as e:
+        return f"Whisper Error: {e}", "", ""
 
+    # B. Call Private Backend
+    # This sends the audio and the Whisper transcript to your private Gemma model
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
+        # Note: your private backend should expect (audio_file, whisper_text)
         res_5k, res_10k = client.predict(audio_path, w_norm, api_name="/predict_dsr_dual")
     except Exception as e:
+        res_5k = "Backend Offline"
+        res_10k = "Please ensure the Private Space is running."
 
     return w_raw, res_5k, res_10k
 
-# UI
+# UI Construction
+with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚕️ Torgo DSR Lab")
+    gr.Markdown("### Neural Reconstruction Layer for Torgo and UA-Speech Zero-Shot")
 
+    with gr.Tab("🔬 Interactive Lab"):
         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("#### 1. Select and Load Sample")
+                spk_input = gr.Dropdown(list(SPEAKER_META.keys()), label="Speaker ID", value="F01")
+                load_btn = gr.Button("🎲 Get Random Sample", variant="secondary")
                 gr.Markdown("---")
+                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
 
             with gr.Column(scale=2):
+                gr.Markdown("#### 2. Metadata & Comparison")
+                with gr.Row():
+                    gt_box = gr.Textbox(label="Ground Truth", interactive=False)
+                    meta_box = gr.JSON(label="Speaker Meta")
 
-                gr.Markdown("#### 3. Comparison Results")
                 w_out = gr.Textbox(label="Whisper Tiny Baseline (Raw Transcript)")
                 with gr.Row():
+                    out_5k = gr.Textbox(label="5K Pure Model Prediction")
+                    out_10k = gr.Textbox(label="10K Triple-Mix Prediction")
+
+                run_btn = gr.Button("🚀 Run ASR & Reconstruction", variant="primary")
 
with gr.Tab("π Research Statistics"):
|
| 111 |
+
gr.Markdown("# π¬ Performance Evaluation")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
with gr.Row():
|
| 114 |
+
with gr.Column():
|
| 115 |
+
gr.Markdown("""
|
| 116 |
+
### π Metric: Exact Match Accuracy
|
| 117 |
+
Accuracy is calculated as the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.
|
| 118 |
+
""")
|
| 119 |
+
|
| 120 |
+
with gr.Column():
|
| 121 |
+
gr.Markdown("""
|
| 122 |
+
### π§ͺ Model Definitions
|
| 123 |
+
* **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
|
| 124 |
+
* **10K Triple-Mix Model:** Includes phonetic anchors and synthetic data. Used for Generalization (LOSO) testing.
|
| 125 |
+
""")
|
| 126 |
+
|
| 127 |
+
gr.Markdown("## 1. Torgo In-Domain Breakdown (By Speaker)")
|
| 128 |
gr.DataFrame(get_indomain_breakdown())
|
| 129 |
|
| 130 |
+
gr.Markdown("## 2. Experimental Condition Summary")
|
|
|
|
| 131 |
gr.DataFrame(get_experimental_summary())
|
| 132 |
|
| 133 |
+
-    # Event
+    # Event Handlers
     load_btn.click(get_sample, inputs=spk_input, outputs=[audio_input, gt_box, meta_box])
     run_btn.click(run_correction, inputs=[audio_input, gt_box], outputs=[w_out, out_5k, out_10k])
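
The frontend above defines only the client side of the backend call. For reference, here is a minimal sketch of the Gradio app the private Space would have to expose for `client.predict(audio_path, w_norm, api_name="/predict_dsr_dual")` to succeed. Only the two-input/two-output signature and the endpoint name are implied by this commit; the handler body and component names are placeholders, not the real implementation.

```python
# Hypothetical sketch of the private backend Space (st192011/Torgo-DSR-Private).
# Everything except the signature and api_name is an assumption.
import gradio as gr

def predict_dsr_dual(audio_path, whisper_text):
    # The real Space would run its 5K Pure and 10K Triple-Mix checkpoints here.
    res_5k = f"[5K placeholder] {whisper_text}"
    res_10k = f"[10K placeholder] {whisper_text}"
    return res_5k, res_10k

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath")
    text_in = gr.Textbox()
    out_a, out_b = gr.Textbox(), gr.Textbox()
    # The gradio_client resolves this listener as "/predict_dsr_dual"
    gr.Button("Run").click(predict_dsr_dual, [audio_in, text_in], [out_a, out_b],
                           api_name="predict_dsr_dual")

demo.launch()
```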
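Both the removed `normalize_text` helper and the inlined regex in `run_correction` implement the normalization behind the exact-match metric. A small worked example of what that comparison tolerates:

```python
import re

def normalize(text):
    # Same steps as app.py: strip punctuation, lowercase, trim whitespace.
    return re.sub(r'[^\w\s]', '', text).lower().strip()

pred  = normalize("It's COLD outside! ")  # -> "its cold outside"
truth = normalize("its cold outside")     # -> "its cold outside"
print(pred == truth)                      # True -> scored as an exact match
```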
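`app.py` also imports from a sibling `stats_data.py` that is not part of this diff. The sketch below is inferred purely from usage: `SPEAKER_META` keys feed the Speaker ID dropdown and its values a `gr.JSON` panel, while the two functions feed `gr.DataFrame` tables. All field and column names are assumptions, and the real accuracy numbers are deliberately omitted.

```python
# Hypothetical stats_data.py, reconstructed from how app.py uses it.
import pandas as pd

# "F01" (dropdown default) and "F02" (UA-Speech zero-shot path) appear in
# app.py; the remaining speakers and all metadata fields are assumptions.
SPEAKER_META = {
    "F01": {"dataset": "TORGO", "note": "unseen speaker used for LOSO testing"},
    "F02": {"dataset": "UA-Speech", "note": "zero-shot evaluation"},
}

def get_indomain_breakdown():
    # Per-speaker table for "1. Torgo In-Domain Breakdown (By Speaker)".
    # Schema only; the actual module carries the measured values.
    return pd.DataFrame(columns=["Speaker", "Samples", "5K Pure", "10K Triple-Mix"])

def get_experimental_summary():
    # Table for "2. Experimental Condition Summary".
    return pd.DataFrame(columns=["Condition", "Model", "Exact Match Accuracy"])
```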