Spaces:

st192011
/

Torgo-DSR-Lab

Sleeping

App Files Files Community

st192011 committed on 17 days ago

Commit

034cefa

verified ·

1 Parent(s): b3a0889

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -49

app.py CHANGED Viewed

@@ -5,103 +5,96 @@ import re
 import random
 import librosa
 import soundfile as sf
-import pandas as pd
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
-# 1. Configuration & Indices
-TORGO_INDICES = {'FC01': 0, 'FC02': 302, 'FC03': 2489, 'MC02': 4411, 'MC01': 5534, 'MC03': 7689, 'MC04': 9358, 'M05': 10978, 'M02': 11565, 'M04': 12337, 'M01': 13003, 'F01': 13746, 'M03': 13982, 'F04': 14792, 'F03': 15465}
-HF_TOKEN = os.getenv("HF_TOKEN")
-PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
-# 2. Local Whisper Baseline
 print("Loading Whisper Tiny...")
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
-    generate_kwargs={
-        "language": "en",
-        "task": "transcribe",
-        "repetition_penalty": 3.0,
-        "max_new_tokens": 64
-    }
 )
-def normalize_text(text):
     if not text: return ""
     return re.sub(r'[^\w\s]', '', text).lower().strip()
-# --- Logic Functions ---
 def get_sample_logic(speaker_id):
-    """Bypasses internal decoders for both Torgo and UA to avoid environment errors."""
     try:
-        if speaker_id == "F02":
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            # UA dataset is usually smaller; iterate to find variety or use F02 specifically
-            sample = next(iter(dataset.shuffle(buffer_size=50)))
         else:
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            start_idx = TORGO_INDICES.get(speaker_id, 0)
-            # Jump directly to speaker start + random offset within speaker range
-            sample = next(iter(dataset.skip(start_idx + random.randint(0, 15))))
-        # Process Ground Truth
-        gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
-        # Manual Decode via Librosa to ensure stability on CPU tier
         audio_bytes = sample['audio']['bytes']
-        audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
-        temp_path = "current_sample.wav"
-        sf.write(temp_path, audio_data, sample_rate)
-        return temp_path, gt_text.lower().strip(), SPEAKER_META.get(speaker_id, {})
     except Exception as e:
-        return None, f"Dataset Access Error: {e}", {}
 def run_whisper_step(audio_path):
     if not audio_path: return "No audio loaded", ""
     result = whisper_asr(audio_path)
     raw_w = result["text"]
-    norm_w = normalize_text(raw_w)
     return raw_w, norm_w
 def run_model_step(audio_path, norm_whisper):
-    if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        # Calls private app for Gemma 3 5K Model prediction
         prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
         return f"Backend Offline. Research Details: {e}"
-# --- UI Layout ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
-    gr.Markdown("Reconstruction and Correction layer for severe dysarthric speech.")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("### Step 1: Select Speaker")
-                # Removed 'FC' control speakers from dropdown as requested
-                dysarthric_speakers = ["F01", "F03", "F04", "M01", "M02", "M03", "M04", "M05", "F02"]
-                speaker_input = gr.Dropdown(sorted(dysarthric_speakers), label="Speaker ID", value="F01")
                 load_btn = gr.Button("Load Data")
                 meta_display = gr.JSON(label="Speaker Meta")
                 gt_box = gr.Textbox(label="Ground Truth")
             with gr.Column(scale=2):
                 gr.Markdown("### Step 2: ASR Baseline")
@@ -121,24 +114,29 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
             with gr.Column():
                 gr.Markdown("""
                 ### 📏 Metric: Exact Match Accuracy
-                Accuracy is calculated as the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.
                 """)
             with gr.Column():
                 gr.Markdown("""
                 ### 🧪 Model Definitions
                 * **5K Pure Model:** Trained on real articulatory distortions. Optimized for phonetic fidelity.
-                * **10K Triple-Mix Model:** Includes anchors and synthetic data. Used to test **generalization (LOSO)** on unseen speakers.
                 """)
-        gr.Markdown("## 1. Torgo In-Domain Breakdown (By Speaker)")
         gr.DataFrame(get_indomain_breakdown())
         gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
     # Event Mapping
-    load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
     whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
     model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)

 import random
 import librosa
 import soundfile as sf
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
# Backend configuration: the access token is read from the environment and
# the URL names the private Space queried by run_model_step.
HF_TOKEN = os.environ.get("HF_TOKEN")
PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"

# 1. Baseline ASR: Whisper Tiny pinned to English transcription with a
# repetition penalty of 3.0.
_WHISPER_GENERATE_KWARGS = {
    "language": "en",
    "task": "transcribe",
    "repetition_penalty": 3.0,
}

print("Loading Whisper Tiny...")
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs=_WHISPER_GENERATE_KWARGS,
)
def normalize(text):
    """Return *text* lowercased, without punctuation, with single spaces.

    Every character that is neither a word character nor whitespace is
    removed, the result is lowercased, and runs of whitespace are collapsed
    to one space (leading/trailing whitespace is dropped). Collapsing matters
    for exact-match comparison: removing an isolated punctuation mark such as
    the dash in "a - b" would otherwise leave a double space behind.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    without_punct = re.sub(r'[^\w\s]', '', text)
    # str.split() with no argument splits on any whitespace run and discards
    # empty edge fields, so join() both strips and collapses in one step.
    return " ".join(without_punct.lower().split())
+# --- Logic: Data Loading ---
 def get_sample_logic(speaker_id):
     """Load one utterance for *speaker_id* and write it out as a 16 kHz WAV.

     Speaker ids containing "UA" are served from the UA-Speech dataset; all
     other ids are served from TORGO, filtered down to the requested speaker.

     Returns:
         A ``(wav_path, ground_truth, speaker_meta)`` tuple. On any failure
         the tuple is ``(None, "Dataset Error: ...", {})`` so the UI can show
         the message instead of crashing.
     """
     # Local import keeps this block self-contained; tempfile is stdlib.
     import tempfile

     def speaker_of(example):
         # Prefer the explicit speaker_id column; str(None).upper() is "NONE",
         # so that value is treated as missing too. Fall back to the prefix of
         # the audio file name (the text before the first underscore).
         sid = str(example.get('speaker_id', '')).upper()
         if not sid or sid == "NONE":
             audio_path = (example.get('audio') or {}).get('path') or ''
             sid = os.path.basename(audio_path).split('_')[0].upper()
         return sid

     try:
         if "UA" in speaker_id:
             # UA-Speech access: skip a random number of rows for variety.
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
             sample = next(iter(dataset.skip(random.randint(0, 30))), None)
             text_keys = ('text', 'transcription', 'sentence')
         else:
             # TORGO access: filter the stream down to the requested speaker.
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
             speaker_ds = dataset.filter(lambda ex: speaker_of(ex) == speaker_id)
             sample = next(iter(speaker_ds.shuffle(buffer_size=10)), None)
             text_keys = ('transcription', 'text')

         if sample is None:
             # An exhausted stream would otherwise raise StopIteration, whose
             # message is empty and tells the user nothing.
             return None, f"Dataset Error: no sample found for speaker {speaker_id}", {}

         # First non-empty transcript field wins; may legitimately be absent.
         gt_text = next((sample.get(k) for k in text_keys if sample.get(k)), None)

         # Decode the raw bytes manually (the dataset decoder is disabled via
         # Audio(decode=False) to bypass torchcodec errors).
         audio_bytes = sample['audio']['bytes']
         audio_data, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000)

         # A unique file per call: a fixed name would be overwritten when two
         # sessions load samples at the same time. The file is intentionally
         # left on disk because its path is handed back to the UI.
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
             temp_path = tmp.name
         sf.write(temp_path, audio_data, sr)

         # .get() so a speaker missing from SPEAKER_META does not discard an
         # otherwise successfully loaded sample.
         return temp_path, (gt_text or "").lower().strip(), SPEAKER_META.get(speaker_id, {})
     except Exception as e:
         return None, f"Dataset Error: {e}", {}
+# --- Logic: Model Steps ---
 def run_whisper_step(audio_path):
     """Transcribe *audio_path* with the baseline Whisper pipeline.

     Returns a ``(raw_text, normalized_text)`` pair. When no audio path is
     set, returns the placeholder pair ``("No audio loaded", "")``.
     """
     if audio_path:
         transcript = whisper_asr(audio_path)["text"]
         return transcript, normalize(transcript)
     return "No audio loaded", ""
 def run_model_step(audio_path, norm_whisper):
     """Ask the private backend for a prediction on the loaded sample.

     Both the audio path and the normalized Whisper text are required; if
     either is missing a hint string is returned instead. Any exception from
     the client is reported as a "Backend Offline" message rather than raised.
     """
     if not (audio_path and norm_whisper):
         return "Complete Steps 1 & 2 first."
     try:
         # The private Space hosts the 5K Gemma model behind /predict_dsr.
         backend = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
         return backend.predict(audio_path, norm_whisper, api_name="/predict_dsr")
     except Exception as err:
         return f"Backend Offline. Research Details: {err}"
+# --- UI Construction ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
+    gr.Markdown("Neural Reconstruction for Severe Dysarthria benchmarked on Torgo and UA-Speech.")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("### Step 1: Load Sample")
+                speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
                 load_btn = gr.Button("Load Data")
                 meta_display = gr.JSON(label="Speaker Meta")
                 gt_box = gr.Textbox(label="Ground Truth")
+                # Added visible audio for user verification
+                audio_preview = gr.Audio(label="Audio Preview", type="filepath")
             with gr.Column(scale=2):
                 gr.Markdown("### Step 2: ASR Baseline")
             with gr.Column():
                 gr.Markdown("""
                 ### 📏 Metric: Exact Match Accuracy
+                Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) matches the **ground truth**.
                 """)
             with gr.Column():
                 gr.Markdown("""
                 ### 🧪 Model Definitions
                 * **5K Pure Model:** Trained on real articulatory distortions. Optimized for phonetic fidelity.
+                * **10K Triple-Mix Model:** Includes synthetic data and anchors; utilized for generalization testing.
                 """)
+        gr.Markdown("## 1. Torgo In-Domain Analysis")
         gr.DataFrame(get_indomain_breakdown())
         gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
     # Event Mapping
+    load_btn.click(
+        get_sample_logic,
+        inputs=speaker_input,
+        outputs=[current_audio_path, gt_box, meta_display]
+    ).then(lambda x: x, inputs=current_audio_path, outputs=audio_preview)
     whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
     model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)