Spaces:

st192011
/

Torgo-DSR-Lab

Running

App Files Files Community

st192011 committed 19 days ago

Commit

d5b3a6f

verified ·

1 Parent(s): 483c30e

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -15

app.py CHANGED Viewed

@@ -11,33 +11,39 @@ from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
-# 1. Initialize Whisper Tiny (Forced to English)
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
-    generate_kwargs={"language": "en", "task": "transcribe"}
 )
-# 2. Secret Configuration
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
 def normalize_text(text):
     if not text: return ""
     return re.sub(r'[^\w\s]', '', text).lower().strip()
 def get_sample_logic(speaker_id):
-    """Bypasses internal decoders for stability and handles schema differences."""
     try:
         if "UA" in speaker_id:
-            # UA-Speech loading (As per your working Colab code)
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            # For UA Female shard, we pick a random sample directly
-            sample = next(iter(dataset.shuffle(buffer_size=50)))
-            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
         else:
-            # Torgo loading (Using path-parsing for Speaker IDs)
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
@@ -48,10 +54,11 @@ def get_sample_logic(speaker_id):
                 return sid == speaker_id
             speaker_ds = dataset.filter(filter_spk)
-            sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
-            gt_text = sample.get('transcription') or sample.get('text')
-        # Manual Decode via librosa (Bypasses torchcodec requirement)
         audio_bytes = sample['audio']['bytes']
         audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
@@ -64,6 +71,7 @@ def get_sample_logic(speaker_id):
 def run_whisper_step(audio_path):
     if not audio_path: return "No audio loaded", ""
     result = whisper_asr(audio_path)
     raw_w = result["text"]
     norm_w = normalize_text(raw_w)
@@ -73,15 +81,17 @@ def run_model_step(audio_path, norm_whisper):
     if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        # Call private Gemma model (Backend uses repetition_penalty=3.0)
         prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
-        return f"Backend Error: {e}. Check if Private Space is Awake."
 # UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
@@ -109,7 +119,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
         with gr.Row():
             with gr.Column():
                 gr.Markdown("### 📏 Metric: Exact Match Accuracy")
-                gr.Markdown("Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.")
             with gr.Column():
                 gr.Markdown("### 🧪 Model Definitions")
                 gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")

 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
+# 1. Initialize Whisper Tiny (Forced to English with strict output control)
+# max_new_tokens=64, repetition_penalty=3.0 and no_repeat_ngram_size=3 prevent the "L-O-O-O" infinite loops
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
+    generate_kwargs={
+        "language": "en",
+        "task": "transcribe",
+        "repetition_penalty": 3.0,
+        "max_new_tokens": 64,
+        "no_repeat_ngram_size": 3
+    }
 )
+# 2. Secret Configuration from Space Settings
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
 def normalize_text(text):
     if not text: return ""
+    # Remove special chars and lowercase
     return re.sub(r'[^\w\s]', '', text).lower().strip()
 def get_sample_logic(speaker_id):
+    """Bypasses internal decoders for stability and handles dataset differences."""
     try:
         if "UA" in speaker_id:
+            # UA-Speech loading (Speaker F02)
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
+            speaker_ds = dataset.filter(lambda x: x["speaker_id"] == "F02")
         else:
+            # Torgo loading (Using path-parsing for IDs)
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
                 return sid == speaker_id
             speaker_ds = dataset.filter(filter_spk)
+        # Get sample and decode manually
+        sample = next(iter(speaker_ds.shuffle(buffer_size=50)))
+        gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
         audio_bytes = sample['audio']['bytes']
         audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
 def run_whisper_step(audio_path):
     if not audio_path: return "No audio loaded", ""
+    # Baseline with loop-prevention
     result = whisper_asr(audio_path)
     raw_w = result["text"]
     norm_w = normalize_text(raw_w)
     if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
+        # Private app expects audio and normalized whisper
         prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
+        return f"Backend Offline. Details: {e}"
 # UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
+    gr.Markdown("Reconstruction Layer for Torgo and UA-Speech")
     current_audio_path = gr.State("")
     with gr.Tab("🔬 Laboratory"):
         with gr.Row():
             with gr.Column():
                 gr.Markdown("### 📏 Metric: Exact Match Accuracy")
+                gr.Markdown("Accuracy is calculated by comparing the **normalized prediction** against the **normalized ground truth**.")
             with gr.Column():
                 gr.Markdown("### 🧪 Model Definitions")
                 gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")