Spaces:

st192011
/

Torgo-DSR-Lab

Sleeping

App Files Files Community

st192011 committed 13 days ago

Commit

b160197

verified ·

1 Parent(s): 6cf37ae

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -42

app.py CHANGED Viewed

@@ -5,70 +5,71 @@ import re
 import random
 import librosa
 import soundfile as sf
-import torch
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
-# 1. Initialize Whisper Tiny (Forced to English with strict output control)
-# max_new_tokens=64 and repetition_penalty=3.0 prevent the "L-O-O-O" infinite loops
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
     generate_kwargs={
         "language": "en",
-        "task": "transcribe",
-        "repetition_penalty": 3.0,
         "max_new_tokens": 64,
-        "no_repeat_ngram_size": 3
     }
 )
-# 2. Secret Configuration from Space Settings
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
 def normalize_text(text):
     if not text: return ""
-    # Remove special chars and lowercase
     return re.sub(r'[^\w\s]', '', text).lower().strip()
-# --- Data Loading Logic ---
 def get_sample_logic(speaker_id):
     try:
         if speaker_id == "F02 (UA)":
-            # 1. UA-Speech access (Using the running code you provided)
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
-            # Since this repo is specifically for UA female, we pull the sample directly
-            sample = next(iter(dataset.shuffle(buffer_size=50)))
-            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
             audio_data = sample['audio']['array']
             sample_rate = sample['audio']['sampling_rate']
         else:
-            # 2. Torgo access (Using your training logic)
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            # Use path-parsing to find specific speaker IDs in Torgo
-            def filter_spk(x):
-                sid = str(x.get('speaker_id', '')).upper()
-                if not sid or sid == "NONE":
-                    sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
-                return sid == speaker_id
-            speaker_ds = dataset.filter(filter_spk)
-            sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
-            gt_text = sample.get('transcription') or sample.get('text')
-            audio_bytes = sample['audio']['bytes']
             audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
         temp_path = "current_sample.wav"
         sf.write(temp_path, audio_data, sample_rate)
         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
     except Exception as e:
@@ -76,33 +77,30 @@ def get_sample_logic(speaker_id):
 def run_whisper_step(audio_path):
     if not audio_path: return "No audio loaded", ""
-    # Baseline with loop-prevention
     result = whisper_asr(audio_path)
     raw_w = result["text"]
     norm_w = normalize_text(raw_w)
     return raw_w, norm_w
 def run_model_step(audio_path, norm_whisper):
-    if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        # Private app expects audio and normalized whisper
         prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
-        return f"Backend Offline. Details: {e}"
-# UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
-    gr.Markdown("Reconstruction Layer for Torgo and UA-Speech")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("### Step 1: Load Data")
                 speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
                 load_btn = gr.Button("Load Data")
                 meta_display = gr.JSON(label="Speaker Meta")
@@ -123,20 +121,23 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
         gr.Markdown("# 🔬 Performance Evaluation")
         with gr.Row():
             with gr.Column():
-                gr.Markdown("### 📏 Metric: Exact Match Accuracy")
-                gr.Markdown("Accuracy is calculated by comparing the **normalized prediction** against the **normalized ground truth**.")
             with gr.Column():
-                gr.Markdown("### 🧪 Model Definitions")
-                gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")
-                gr.Markdown("* **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on unseen speakers (LOSO).")
         gr.Markdown("---")
-        gr.Markdown("## 1. Torgo In-Domain Breakdown")
         gr.DataFrame(get_indomain_breakdown())
         gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
-    # Connectivity
     load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
     whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
     model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)

 import random
 import librosa
 import soundfile as sf
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
+# 1. Initialize Baseline ASR with Generation Constraints
+# Set max_new_tokens to 64 to prevent infinite "L-O-O-O" loops
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
     generate_kwargs={
         "language": "en",
+        "task": "transcribe",
         "max_new_tokens": 64,
+        "repetition_penalty": 1.5 # Discourages token looping
     }
 )
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
 def normalize_text(text):
     """Return *text* lowercased, with punctuation removed and edges trimmed.

     Every character that is neither a word character (``\\w``: letters,
     digits, underscore) nor whitespace is dropped. Falsy input (``None``
     or an empty string) yields an empty string.
     """
     if text:
         without_punct = re.sub(r'[^\w\s]', '', text)
         return without_punct.lower().strip()
     return ""
 def get_sample_logic(speaker_id):
     try:
+        # PATH A: UA-SPEECH (Strictly following your provided running block)
         if speaker_id == "F02 (UA)":
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
+            # Shuffle helps pick a different word each time
+            sample = next(iter(dataset.shuffle(buffer_size=100)))
+            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence', 'Unknown')
             audio_data = sample['audio']['array']
             sample_rate = sample['audio']['sampling_rate']
+        # PATH B: TORGO (Optimized for speed)
         else:
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
+            # Speed Hack: Shuffle the stream buffer to find the speaker faster
+            # This avoids starting from speaker MC01 every time
+            shuffled_ds = dataset.shuffle(buffer_size=1000)
+            # Find first match in shuffled stream
+            found_sample = None
+            for item in shuffled_ds:
+                sid = str(item.get('speaker_id', '')).upper()
+                if not sid or sid == "NONE":
+                    sid = os.path.basename(item['audio']['path']).split('_')[0].upper()
+                if sid == speaker_id:
+                    found_sample = item
+                    break
+            if not found_sample:
+                return None, "Speaker search timeout. Try again.", {}
+            gt_text = found_sample.get('transcription') or found_sample.get('text', 'Unknown')
+            audio_bytes = found_sample['audio']['bytes']
             audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
         temp_path = "current_sample.wav"
         sf.write(temp_path, audio_data, sample_rate)
         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
     except Exception as e:
 def run_whisper_step(audio_path):
     """Transcribe *audio_path* with the baseline Whisper pipeline.

     Returns a ``(raw_text, normalized_text)`` pair. When no audio path is
     supplied, returns the placeholder pair ``("No audio loaded", "")``
     without invoking the pipeline.
     """
     if audio_path:
         # The pipeline result is indexed by "text" for the transcription.
         transcript = whisper_asr(audio_path)["text"]
         return transcript, normalize_text(transcript)
     return "No audio loaded", ""
 def run_model_step(audio_path, norm_whisper):
     """Send the audio and normalized Whisper text to the private DSR backend.

     Parameters:
         audio_path: path of the audio sample to submit.
         norm_whisper: normalized baseline transcription for that sample.

     Returns:
         The backend's prediction, ``"Incomplete steps"`` when either input
         is missing, or a fixed "Backend Offline" message when the call
         fails for any reason.
     """
     import logging
     if not audio_path or not norm_whisper:
         return "Incomplete steps"
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
         # Author's note: the private app applies repetition_penalty=3.0 on
         # its side (not verifiable from this file).
         return client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
     except Exception:
         # Keep the user-facing message generic (no backend details shown in
         # the UI), but record the traceback server-side so failures can be
         # diagnosed instead of being silently discarded.
         logging.getLogger(__name__).exception("Private DSR backend call failed")
         return "Backend Offline. Research Model requires Private Space access."
+# UI
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("### Step 1: Load Sample")
                 speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
                 load_btn = gr.Button("Load Data")
                 meta_display = gr.JSON(label="Speaker Meta")
         gr.Markdown("# 🔬 Performance Evaluation")
         with gr.Row():
             with gr.Column():
+                gr.Markdown("""
+                ### 📏 Metric: Exact Match Accuracy
+                Accuracy is calculated as the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.
+                """)
             with gr.Column():
+                gr.Markdown("""
+                ### 🧪 Model Definitions
+                * **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
+                * **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on **unseen speakers (LOSO)** to prove generalization.
+                """)
         gr.Markdown("---")
+        gr.Markdown("## 1. Torgo In-Domain Analysis")
         gr.DataFrame(get_indomain_breakdown())
         gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
     load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
     whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
     model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)