Spaces:

st192011
/

Torgo-DSR-Lab

Running

App Files Files Community

st192011 committed on 9 days ago

Commit

08dd52c

verified ·

1 Parent(s): b160197

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -63

app.py CHANGED Viewed

@@ -10,89 +10,81 @@ from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
# 1. Baseline ASR: Whisper-tiny with generation constraints.
# The 64-token cap prevents runaway "L-O-O-O" style loops and the repetition
# penalty discourages token looping.
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs=dict(
        language="en",
        task="transcribe",
        max_new_tokens=64,
        repetition_penalty=1.5,
    ),
)

# Token and address used when connecting to the private backend; both are
# read from the environment and are None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
PRIVATE_BACKEND_URL = os.environ.get("PRIVATE_BACKEND_URL")
def normalize_text(text):
    """Return *text* lower-cased with punctuation removed and whitespace tidied.

    Every character that is neither a word character nor whitespace is
    dropped. Runs of whitespace are then collapsed to single spaces, so
    deleting a free-standing symbol (the dash in ``"well - okay"``) does not
    leave a doubled space behind in the normalized transcript.

    Args:
        text: Raw transcript; ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    without_punct = re.sub(r'[^\w\s]', '', text)
    # str.split() with no argument also trims both ends.
    return " ".join(without_punct.lower().split())
 def get_sample_logic(speaker_id):
     try:
-        # PATH A: UA-SPEECH (Strictly following your provided running block)
         if speaker_id == "F02 (UA)":
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
-            # Shuffle helps pick a different word each time
-            sample = next(iter(dataset.shuffle(buffer_size=100)))
-            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence', 'Unknown')
-            audio_data = sample['audio']['array']
-            sample_rate = sample['audio']['sampling_rate']
-        # PATH B: TORGO (Optimized for speed)
         else:
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            # Speed Hack: Shuffle the stream buffer to find the speaker faster
-            # This avoids starting from speaker MC01 every time
-            shuffled_ds = dataset.shuffle(buffer_size=1000)
-            # Find first match in shuffled stream
-            found_sample = None
-            for item in shuffled_ds:
-                sid = str(item.get('speaker_id', '')).upper()
                 if not sid or sid == "NONE":
-                    sid = os.path.basename(item['audio']['path']).split('_')[0].upper()
-                if sid == speaker_id:
-                    found_sample = item
-                    break
-            if not found_sample:
-                return None, "Speaker search timeout. Try again.", {}
-            gt_text = found_sample.get('transcription') or found_sample.get('text', 'Unknown')
-            audio_bytes = found_sample['audio']['bytes']
-            audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
-        temp_path = "current_sample.wav"
-        sf.write(temp_path, audio_data, sample_rate)
         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
     except Exception as e:
-        return None, f"Dataset Error: {e}", {}
def run_whisper_step(audio_path):
    """Transcribe *audio_path* with the baseline Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        ``(raw_text, normalized_text)``, or ``("No audio loaded", "")`` when
        no path is supplied.
    """
    if audio_path:
        transcript = whisper_asr(audio_path)["text"]
        return transcript, normalize_text(transcript)
    return "No audio loaded", ""
def run_model_step(audio_path, norm_whisper):
    """Send the audio and its normalized Whisper transcript to the private backend.

    Args:
        audio_path: Path of the audio file to correct.
        norm_whisper: Normalized baseline transcript for the same audio.

    Returns:
        The backend's prediction, ``"Incomplete steps"`` when either input is
        missing, or a fixed "Backend Offline" message when the call fails.
    """
    import logging

    if not audio_path or not norm_whisper:
        return "Incomplete steps"
    try:
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
        # Original author's note: the private app uses repetition_penalty=3.0.
        return client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
    except Exception:
        # The user-facing message stays generic; the real cause is recorded
        # in the log instead of being thrown away.
        logging.getLogger(__name__).exception("Private backend call failed")
        return "Backend Offline. Research Model requires Private Space access."
-# UI
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
     current_audio_path = gr.State("")
@@ -100,11 +92,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("### Step 1: Load Sample")
-                speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
                 load_btn = gr.Button("Load Data")
-                meta_display = gr.JSON(label="Speaker Meta")
                 gt_box = gr.Textbox(label="Ground Truth")
             with gr.Column(scale=2):
                 gr.Markdown("### Step 2: ASR Baseline")
@@ -129,17 +122,20 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
                 gr.Markdown("""
                 ### 🧪 Model Definitions
                 * **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
-                * **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on **unseen speakers (LOSO)** to prove generalization.
                 """)
-        gr.Markdown("---")
-        gr.Markdown("## 1. Torgo In-Domain Analysis")
         gr.DataFrame(get_indomain_breakdown())
         gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
-    load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
-    whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
-    model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)
 demo.launch()

 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
# 1. Local Whisper baseline: forced to English transcription, with a high
# repetition penalty and a 64-token generation cap.
print("Initializing ASR Baseline...")
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs=dict(
        language="en",
        task="transcribe",
        repetition_penalty=3.0,
        max_new_tokens=64,
    ),
)

# 2. Private backend: access token from the environment (None when unset),
# backend identifier fixed in code.
HF_TOKEN = os.environ.get("HF_TOKEN")
PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
def normalize(text):
    """Return *text* lower-cased with punctuation removed and whitespace tidied.

    Every character that is neither a word character nor whitespace is
    dropped. Runs of whitespace are then collapsed to single spaces, so
    deleting a free-standing symbol (the dash in ``"well - okay"``) does not
    leave a doubled space behind in the normalized transcript.

    Args:
        text: Raw transcript; ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    without_punct = re.sub(r'[^\w\s]', '', text)
    # str.split() with no argument also trims both ends.
    return " ".join(without_punct.lower().split())
 def get_sample_logic(speaker_id):
+    """Optimized data loader: Skips normal control speakers to find targets faster."""
     try:
         if speaker_id == "F02 (UA)":
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
+            dataset = dataset.cast_column("audio", Audio(decode=False))
+            # F02 is the primary dysarthric speaker in this split
+            speaker_ds = dataset.filter(lambda x: x["speaker_id"] == "F02")
         else:
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
+            # Skip logic: ignore samples with 'control' status to speed up stream
+            def is_target_dysarthric(x):
+                sid = str(x.get('speaker_id', '')).upper()
                 if not sid or sid == "NONE":
+                    sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
+                status = str(x.get('speech_status', '')).lower()
+                return sid == speaker_id and "control" not in status
+            speaker_ds = dataset.filter(is_target_dysarthric)
+        # Get sample and decode
+        sample = next(iter(speaker_ds.shuffle(buffer_size=10)))
+        gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
+        audio_bytes = sample['audio']['bytes']
+        audio_data, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000)
+        temp_path = "sample.wav"
+        sf.write(temp_path, audio_data, sr)
         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
     except Exception as e:
+        return None, f"Loading error: {e}", {}
def run_lab(audio_path):
    """Run the Whisper baseline and then the private model on *audio_path*.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        ``(raw_whisper_text, normalized_whisper_text, prediction)``. With no
        audio path the result is ``("", "", "Error: No Audio")``. When the
        backend call fails, ``prediction`` carries the error text instead.
    """
    if not audio_path:
        return "", "", "Error: No Audio"

    # Baseline transcript and its normalized form.
    baseline_text = whisper_asr(audio_path)["text"]
    cleaned_text = normalize(baseline_text)

    # Private model call; the code assumes the backend returns the 5K
    # prediction string.
    try:
        backend = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
        prediction = backend.predict(audio_path, cleaned_text, api_name="/predict_dsr")
    except Exception as e:
        prediction = f"Backend offline or Error: {e}"

    return baseline_text, cleaned_text, prediction
+# UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("### Step 1: Data Selection")
+                spk_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
                 load_btn = gr.Button("Load Data")
+                meta_json = gr.JSON(label="Speaker Metadata")
                 gt_box = gr.Textbox(label="Ground Truth")
+                audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
             with gr.Column(scale=2):
                 gr.Markdown("### Step 2: ASR Baseline")
                 gr.Markdown("""
                 ### 🧪 Model Definitions
                 * **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
+                * **10K Triple-Mix Model:** Includes phonetic anchors and synthetic data. Utilized to test **generalization (LOSO)** on unseen speakers.
                 """)
+        gr.Markdown("## 1. Torgo In-Domain Breakdown (By Speaker)")
         gr.DataFrame(get_indomain_breakdown())
         gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
+    # Connection logic
+    load_btn.click(get_sample_logic, inputs=spk_input, outputs=[current_audio_path, gt_box, meta_json]).then(
+        lambda x: x, inputs=current_audio_path, outputs=audio_input
+    )
+    whisper_btn.click(run_whisper_step if 'run_whisper_step' in globals() else run_lab, inputs=current_audio_path, outputs=[w_raw, w_norm, final_out])
+    model_btn.click(run_lab, inputs=current_audio_path, outputs=[w_raw, w_norm, final_out])
 demo.launch()