Update app.py
app.py
CHANGED
@@ -5,19 +5,20 @@ import re
 import random
 import librosa
 import soundfile as sf
+import torch
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
 
-# 1. Initialize
+# 1. Initialize Whisper Tiny (Forced to English)
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
     generate_kwargs={"language": "en", "task": "transcribe"}
 )
 
-# 2. Configuration
+# 2. Secret Configuration
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
 
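Note: this hunk only adds the `torch` import and renames two comments; the pipeline itself is unchanged. A minimal smoke test for that pipeline, assuming any local WAV file (the `sample.wav` path below is a placeholder, not part of this commit):

```python
from transformers import pipeline

# Same configuration as app.py: whisper-tiny, forced to English transcription.
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={"language": "en", "task": "transcribe"},
)

# The ASR pipeline accepts a file path (or a numpy array / raw bytes)
# and returns a dict with the transcription under "text".
result = whisper_asr("sample.wav")  # placeholder path
print(result["text"])
```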
@@ -26,27 +27,34 @@ def normalize_text(text):
     return re.sub(r'[^\w\s]', '', text).lower().strip()
 
 def get_sample_logic(speaker_id):
-    """Bypasses internal decoders
+    """Bypasses internal decoders for stability and handles schema differences."""
     try:
-        if
+        if "UA" in speaker_id:
+            # UA-Speech loading (As per your working Colab code)
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-
+            # For UA Female shard, we pick a random sample directly
+            sample = next(iter(dataset.shuffle(buffer_size=50)))
+            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
         else:
+            # Torgo loading (Using path-parsing for Speaker IDs)
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
+
             def filter_spk(x):
                 sid = str(x.get('speaker_id', '')).upper()
                 if not sid or sid == "NONE":
                     sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
                 return sid == speaker_id
+
             speaker_ds = dataset.filter(filter_spk)
+            sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
+            gt_text = sample.get('transcription') or sample.get('text')
 
-
-        gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
-
+        # Manual Decode via librosa (Bypasses torchcodec requirement)
         audio_bytes = sample['audio']['bytes']
         audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
+
         temp_path = "current_sample.wav"
         sf.write(temp_path, audio_data, sample_rate)
 
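The key change in this hunk is picking the sample inside each branch and decoding manually. A minimal sketch of the decode-bypass pattern on its own, assuming the dataset schema shown in the diff (an `audio` column that yields raw `bytes` once `decode=False` is set):

```python
import io

import librosa
import soundfile as sf
from datasets import Audio, load_dataset

# Stream the dataset and turn off automatic audio decoding, so each example's
# "audio" field is a plain dict holding the raw file bytes and original path.
ds = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
ds = ds.cast_column("audio", Audio(decode=False))

sample = next(iter(ds))                 # first example off the stream
audio_bytes = sample["audio"]["bytes"]  # undecoded file contents

# Decode manually with librosa, resampling to the 16 kHz Whisper expects,
# then write a temp WAV exactly as get_sample_logic does.
audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
sf.write("current_sample.wav", audio_data, sample_rate)
```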
@@ -65,18 +73,15 @@ def run_model_step(audio_path, norm_whisper):
     if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        #
-        # Adjust api_name to match your private space definition
+        # Call private Gemma model (Backend uses repetition_penalty=3.0)
         prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
-        return f"Backend Error: {e}.
+        return f"Backend Error: {e}. Check if Private Space is Awake."
 
-# UI
+# UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
-    gr.Markdown("Stepwise evaluation of standard ASR vs. Neural Reconstruction Layer.")
-
     current_audio_path = gr.State("")
 
     with gr.Tab("🔬 Laboratory"):
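The backend call here is the standard `gradio_client` pattern. A minimal sketch of the same call in isolation, assuming the private Space exposes a `/predict_dsr` endpoint taking an audio path and a transcript in that order (endpoint name and argument order come from the diff; both environment variables must be set):

```python
import os

from gradio_client import Client

# PRIVATE_BACKEND_URL is the private Space's repo id or URL; HF_TOKEN
# authenticates against it. Both are read from Space secrets.
client = Client(os.environ["PRIVATE_BACKEND_URL"], hf_token=os.environ["HF_TOKEN"])

# Positional arguments map onto the endpoint's inputs in order.
prediction = client.predict(
    "current_sample.wav",         # audio path produced by get_sample_logic
    "normalized whisper output",  # placeholder transcript
    api_name="/predict_dsr",
)
print(prediction)
```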
@@ -103,20 +108,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
         gr.Markdown("# 🔬 Performance Evaluation")
         with gr.Row():
             with gr.Column():
-                gr.Markdown("""
-
-                Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
-                """)
+                gr.Markdown("### 📏 Metric: Exact Match Accuracy")
+                gr.Markdown("Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.")
             with gr.Column():
-                gr.Markdown("""
-
-                * **
-
-                """)
+                gr.Markdown("### 🧪 Model Definitions")
+                gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")
+                gr.Markdown("* **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on unseen speakers (LOSO).")
+
         gr.Markdown("---")
-        gr.Markdown("## 1. Torgo In-Domain
+        gr.Markdown("## 1. Torgo In-Domain Breakdown")
         gr.DataFrame(get_indomain_breakdown())
-        gr.Markdown("## 2. Experimental
+        gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
 
     # Connectivity
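The metric described in the new Markdown is plain exact match after normalization. A minimal sketch of how it could be computed, reusing the `normalize_text` rule from app.py (the `exact_match_accuracy` helper below is illustrative, not part of the commit):

```python
import re

def normalize_text(text):
    # Same rule as app.py: strip punctuation, lowercase, trim whitespace.
    return re.sub(r'[^\w\s]', '', text).lower().strip()

def exact_match_accuracy(predictions, references):
    """Percentage of samples whose normalized prediction exactly matches
    the normalized ground truth."""
    hits = sum(
        normalize_text(p) == normalize_text(r)
        for p, r in zip(predictions, references)
    )
    return 100.0 * hits / len(references)

# One of the two pairs matches after normalization -> 50.0
print(exact_match_accuracy(["The Cat!", "dog"], ["the cat", "bird"]))
```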