Update app.py
app.py CHANGED
@@ -5,132 +5,134 @@ import re
 import random
 import librosa
 import soundfile as sf
-import pandas as pd
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META

-# 1.
-TORGO_INDICES = {'FC01': 0, 'FC02': 302, 'FC03': 2489, 'MC02': 4411, 'MC01': 5534, 'MC03': 7689, 'MC04': 9358, 'M05': 10978, 'M02': 11565, 'M04': 12337, 'M01': 13003, 'F01': 13746, 'M03': 13982, 'F04': 14792, 'F03': 15465}
-HF_TOKEN = os.getenv("HF_TOKEN")
-PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
-
-# 2. Local Whisper Baseline (Strict English, Repetition Penalty 3.0)
+# 1. Setup Local Whisper Baseline (English, Strict Generation)
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
-    generate_kwargs={"language": "en", "task": "transcribe", "repetition_penalty": 3.0
+    generate_kwargs={"language": "en", "task": "transcribe", "repetition_penalty": 3.0}
 )

+HF_TOKEN = os.getenv("HF_TOKEN")
+PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
+
 def normalize_text(text):
     if not text: return ""
     return re.sub(r'[^\w\s]', '', text).lower().strip()

-def standardize_audio(input_path):
-    """Ensures audio is 16kHz, Mono, and compatible with all models."""
-    if not input_path: return None
-    audio, sr = librosa.load(input_path, sr=16000, mono=True)
-    out_path = "processed_audio.wav"
-    sf.write(out_path, audio, 16000)
-    return out_path
-
-# --- Logic: Data Loading ---
-def get_sample_logic(speaker_id):
-    try:
-        if speaker_id == "F02 (UA)":
-            dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
-            dataset = dataset.cast_column("audio", Audio(decode=False))
-            sample = next(iter(dataset.skip(random.randint(0, 30))))
-            gt_text = sample.get('text') or sample.get('transcription') or "Unknown"
-        else:
-            dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
-            dataset = dataset.cast_column("audio", Audio(decode=False))
-            start_idx = TORGO_INDICES.get(speaker_id, 0)
-            sample = next(iter(dataset.skip(start_idx + random.randint(0, 15))))
-            gt_text = sample.get('transcription') or sample.get('text') or "Unknown"
+# --- Shared Processing Logic ---

-
-
-
-
-
-
-
-        # We return the path to the gr.Audio component (which stores it in State)
-        return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
-    except Exception as e:
-        return None, f"Dataset Error: {e}", {}
+def process_audio_file(audio_path):
+    """Ensures any input audio is formatted correctly for ASR systems (16kHz Mono)."""
+    y, sr = librosa.load(audio_path, sr=16000)
+    fixed_path = "processed_audio.wav"
+    sf.write(fixed_path, y, sr)
+    return fixed_path

-
-def run_whisper_step(audio_path):
+def run_whisper_logic(audio_path):
     if not audio_path: return "No audio loaded", ""
-
-    clean_audio = standardize_audio(audio_path)
-    result = whisper_asr(clean_audio)
+    formatted_path = process_audio_file(audio_path)
+    result = whisper_asr(formatted_path)
     raw_w = result["text"]
     norm_w = normalize_text(raw_w)
     return raw_w, norm_w

-def
-    if not audio_path or not norm_whisper: return "
-    # Standardize format before sending to Private Backend
-    clean_audio = standardize_audio(audio_path)
+def run_reconstruction_logic(audio_path, norm_whisper):
+    if not audio_path or not norm_whisper: return "Run Whisper step first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-
+        # Private backend handles Wav2Vec, Allosaurus, and Gemma 3 arbitration
+        prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
-        return f"Backend Offline.
+        return f"Backend Offline. Error: {e}"
+
+# --- Channel 1: Dataset Loader ---
+def get_dataset_sample(speaker_id):
+    try:
+        if speaker_id == "F02":
+            ds = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
+            ds = ds.cast_column("audio", Audio(decode=False))
+            sample = next(iter(ds.skip(random.randint(0, 50))))
+            gt_text = sample.get('text') or sample.get('transcription') or "Unknown"
+        else:
+            ds = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
+            ds = ds.cast_column("audio", Audio(decode=False))
+            indices = {'M05': 10978, 'M02': 11565, 'M04': 12337, 'M01': 13003, 'F01': 13746, 'M03': 13982, 'F04': 14792, 'F03': 15465}
+            start_idx = indices.get(speaker_id, 0)
+            sample = next(iter(ds.skip(start_idx + random.randint(0, 10))))
+            gt_text = sample.get('transcription') or sample.get('text') or "Unknown"
+
+        audio_data, sr = librosa.load(io.BytesIO(sample['audio']['bytes']), sr=16000)
+        temp_path = f"sample_{speaker_id}.wav"
+        sf.write(temp_path, audio_data, sr)
+        return temp_path, gt_text.lower().strip(), SPEAKER_META.get(speaker_id, {})
+    except Exception as e:
+        return None, f"Dataset Error: {e}", {}

-# --- UI
+# --- UI Layout ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
-    gr.Markdown("
+    gr.Markdown("ASR Correction and Reconstruction Layer for Torgo and UA-Speech.")

-
+    # States for audio paths
+    lab_audio_state = gr.State("")
+    user_audio_state = gr.State("")
+
+    with gr.Tab("🔬 Research Samples"):
+        gr.Markdown("Select clinical samples from the Torgo or UA-Speech datasets.")
         with gr.Row():
             with gr.Column(scale=1):
-                gr.
-
-
+                speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
+                load_btn = gr.Button("Load Sample Data")
+                meta_display = gr.JSON(label="Sample Metadata")
+                gt_box = gr.Textbox(label="Ground Truth")
+
+            with gr.Column(scale=2):
+                whisper_btn_lab = gr.Button("1. Generate Whisper Baseline")
+                w_raw_lab = gr.Textbox(label="Whisper Raw")
+                w_norm_lab = gr.Textbox(label="Whisper Normalized")

                 gr.Markdown("---")
-
-
-
-                meta_display = gr.JSON(label="Speaker Metadata")
-                gt_box = gr.Textbox(label="Ground Truth (if from dataset)")
+                model_btn_lab = gr.Button("2. Run Neural Reconstruction", variant="primary")
+                final_out_lab = gr.Textbox(label="DSR Lab Prediction")

+    with gr.Tab("🎤 Personal Test"):
+        gr.Markdown("Record or upload your own audio to test the reconstruction layer.")
+        with gr.Row():
+            with gr.Column(scale=1):
+                user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="User Audio")
+                process_user_btn = gr.Button("Prepare Audio")
+
             with gr.Column(scale=2):
-                gr.
-
-
-                w_norm = gr.Textbox(label="Whisper Normalized")
+                whisper_btn_user = gr.Button("1. Generate Whisper Baseline")
+                w_raw_user = gr.Textbox(label="Whisper Raw")
+                w_norm_user = gr.Textbox(label="Whisper Normalized")

                 gr.Markdown("---")
-                gr.
-
-                final_out = gr.Textbox(label="DSR Lab Prediction")
+                model_btn_user = gr.Button("2. Run Neural Reconstruction", variant="primary")
+                final_out_user = gr.Textbox(label="DSR Lab Prediction")

     with gr.Tab("📊 Research Statistics"):
-        gr.Markdown("# 🔬
-
-
-        gr.Markdown("### 📏 Metric: Exact Match Accuracy")
-        gr.Markdown("Accuracy is calculated on normalized text (lowercase, no punctuation).")
-        with gr.Column():
-            gr.Markdown("### 🧪 Model Definitions")
-            gr.Markdown("* **5K Pure Model:** Real data focus. \n* **10K Triple-Mix Model:** LOSO Generalization focus.")
-
-        gr.Markdown("## 1. Torgo In-Domain Analysis")
+        gr.Markdown("# 🔬 Scientific Evaluation")
+        gr.Markdown("**Metric:** Exact Match Accuracy on normalized text (lowercase, no punctuation).")
+        gr.Markdown("## 1. Torgo In-Domain Breakdown")
         gr.DataFrame(get_indomain_breakdown())
-        gr.Markdown("## 2. Experimental Summary")
+        gr.Markdown("## 2. Experimental Milestone Summary")
         gr.DataFrame(get_experimental_summary())

-    #
-    load_btn.click(
-
-
+    # --- Events: Research Tab ---
+    load_btn.click(get_dataset_sample, inputs=speaker_input, outputs=[lab_audio_state, gt_box, meta_display])
+    whisper_btn_lab.click(run_whisper_logic, inputs=lab_audio_state, outputs=[w_raw_lab, w_norm_lab])
+    model_btn_lab.click(run_reconstruction_logic, inputs=[lab_audio_state, w_norm_lab], outputs=final_out_lab)
+
+    # --- Events: Personal Tab ---
+    process_user_btn.click(lambda x: x, inputs=user_audio_input, outputs=user_audio_state)
+    whisper_btn_user.click(run_whisper_logic, inputs=user_audio_state, outputs=[w_raw_user, w_norm_user])
+    model_btn_user.click(run_reconstruction_logic, inputs=[user_audio_state, w_norm_user], outputs=final_out_user)

 demo.launch()
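A minimal sketch of the two-step flow this commit wires into the UI, runnable outside Gradio. It is not part of the commit: the input file my_clip.wav is a placeholder, and an HF_TOKEN environment variable with access to the private backend is assumed; the model name, endpoint, and normalization mirror app.py above.

import os
import re

import librosa
import soundfile as sf
from gradio_client import Client
from transformers import pipeline

# Same Whisper baseline configuration as app.py
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={"language": "en", "task": "transcribe", "repetition_penalty": 3.0},
)

def normalize_text(text):
    return re.sub(r"[^\w\s]", "", text or "").lower().strip()

# Step 0: force 16 kHz mono, as process_audio_file() does
y, _ = librosa.load("my_clip.wav", sr=16000)  # placeholder input path
sf.write("processed_audio.wav", y, 16000)

# Step 1: local Whisper baseline, normalized
norm_whisper = normalize_text(whisper_asr("processed_audio.wav")["text"])

# Step 2: send audio plus the baseline to the private Space for reconstruction
client = Client("st192011/Torgo-DSR-Private", hf_token=os.getenv("HF_TOKEN"))
print(client.predict("processed_audio.wav", norm_whisper, api_name="/predict_dsr"))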