st192011 committed
Commit 483c30e · verified · 1 Parent(s): 4eb1313

Update app.py

Files changed (1):
  1. app.py  +27 -25
app.py CHANGED
@@ -5,19 +5,20 @@ import re
 import random
 import librosa
 import soundfile as sf
+import torch
 from transformers import pipeline
 from datasets import load_dataset, Audio
 from gradio_client import Client
 from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
 
-# 1. Initialize Baseline ASR (Forced to English)
+# 1. Initialize Whisper Tiny (Forced to English)
 whisper_asr = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny",
     generate_kwargs={"language": "en", "task": "transcribe"}
 )
 
-# 2. Configuration from Space Secrets
+# 2. Secret Configuration
 HF_TOKEN = os.getenv("HF_TOKEN")
 PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
 
@@ -26,27 +27,34 @@ def normalize_text(text):
     return re.sub(r'[^\w\s]', '', text).lower().strip()
 
 def get_sample_logic(speaker_id):
-    """Bypasses internal decoders to ensure data access works for both datasets."""
+    """Bypasses internal decoders for stability and handles schema differences."""
     try:
-        if speaker_id == "F02 (UA)":
+        if "UA" in speaker_id:
+            # UA-Speech loading (as per the working Colab code)
             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
-            speaker_ds = dataset.filter(lambda x: x["speaker_id"] == "F02")
+            # For the UA female shard, pick a random sample directly
+            sample = next(iter(dataset.shuffle(buffer_size=50)))
+            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
         else:
+            # Torgo loading (uses path parsing for speaker IDs)
             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
             dataset = dataset.cast_column("audio", Audio(decode=False))
+
             def filter_spk(x):
                 sid = str(x.get('speaker_id', '')).upper()
                 if not sid or sid == "NONE":
                     sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
                 return sid == speaker_id
+
             speaker_ds = dataset.filter(filter_spk)
+            sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
+            gt_text = sample.get('transcription') or sample.get('text')
 
-        sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
-        gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
-
+        # Manual decode via librosa (bypasses the torchcodec requirement)
         audio_bytes = sample['audio']['bytes']
         audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
+
         temp_path = "current_sample.wav"
         sf.write(temp_path, audio_data, sample_rate)
 
@@ -65,18 +73,15 @@ def run_model_step(audio_path, norm_whisper):
     if not audio_path or not norm_whisper: return "Load data and run Whisper first."
     try:
         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
-        # Private app expects audio and normalized whisper
-        # Adjust api_name to match your private space definition
+        # Call the private Gemma backend (it applies repetition_penalty=3.0)
         prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
         return prediction
     except Exception as e:
-        return f"Backend Error: {e}. Ensure Private Space is running."
+        return f"Backend Error: {e}. Check if the Private Space is awake."
 
-# UI Layout
+# UI Construction
 with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
     gr.Markdown("# ⚗️ Torgo DSR Lab")
-    gr.Markdown("Stepwise evaluation of standard ASR vs. Neural Reconstruction Layer.")
-
     current_audio_path = gr.State("")
 
     with gr.Tab("🔬 Laboratory"):
@@ -103,20 +108,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
         gr.Markdown("# 🔬 Performance Evaluation")
         with gr.Row():
             with gr.Column():
-                gr.Markdown("""
-                ### 📏 Metric: Exact Match Accuracy
-                Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
-                """)
+                gr.Markdown("### 📏 Metric: Exact Match Accuracy")
+                gr.Markdown("Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.")
             with gr.Column():
-                gr.Markdown("""
-                ### 🧪 Model Definitions
-                * **5K Pure Model:** Trained on real-world Torgo distortions. Optimized for articulatory accuracy.
-                * **10K Triple-Mix Model:** Includes synthetic data and anchors; tested on unseen speakers (LOSO).
-                """)
+                gr.Markdown("### 🧪 Model Definitions")
+                gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")
+                gr.Markdown("* **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on unseen speakers (LOSO).")
+
         gr.Markdown("---")
-        gr.Markdown("## 1. Torgo In-Domain Analysis (By Speaker)")
+        gr.Markdown("## 1. Torgo In-Domain Breakdown")
         gr.DataFrame(get_indomain_breakdown())
-        gr.Markdown("## 2. Experimental Milestone Summary")
+        gr.Markdown("## 2. Experimental Summary")
         gr.DataFrame(get_experimental_summary())
 
         # Connectivity
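
For context, the streaming-access pattern that the new get_sample_logic() relies on can be exercised in isolation. Below is a minimal sketch, assuming the abnerh/TORGO-database schema implied by the diff (a speaker_id column plus undecoded audio bytes/path); the helper name fetch_one_sample is hypothetical:

import io
import os

import librosa
import soundfile as sf
from datasets import Audio, load_dataset

def fetch_one_sample(speaker_id, out_path="sample.wav"):
    # Stream so the full dataset is never downloaded, and disable the
    # built-in audio decoder so no torchcodec/ffmpeg backend is needed.
    ds = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
    ds = ds.cast_column("audio", Audio(decode=False))

    def is_speaker(x):
        sid = str(x.get("speaker_id", "")).upper()
        if not sid or sid == "NONE":
            # Fall back to parsing the ID out of the file name, e.g. "F01_....wav"
            sid = os.path.basename(x["audio"]["path"]).split("_")[0].upper()
        return sid == speaker_id

    # Shuffle a small buffer and take the first hit for this speaker.
    sample = next(iter(ds.filter(is_speaker).shuffle(buffer_size=20)))

    # Decode the raw bytes manually with librosa, resampling to 16 kHz.
    audio, sr = librosa.load(io.BytesIO(sample["audio"]["bytes"]), sr=16000)
    sf.write(out_path, audio, sr)
    return out_path, sample.get("transcription") or sample.get("text")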
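
The "Exact Match Accuracy" metric described in the evaluation tab reduces to a normalize-and-compare loop. A sketch reusing the normalize_text() logic from app.py; exact_match_accuracy is an illustrative helper, not part of the app:

import re

def normalize_text(text):
    # Mirrors app.py: strip punctuation, lowercase, trim whitespace.
    return re.sub(r'[^\w\s]', '', text).lower().strip()

def exact_match_accuracy(predictions, references):
    # Percentage of pairs whose normalized forms match exactly.
    hits = sum(normalize_text(p) == normalize_text(r)
               for p, r in zip(predictions, references))
    return 100.0 * hits / len(references)

# exact_match_accuracy(["The Cow."], ["the cow"]) -> 100.0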
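
Finally, the frontend-to-backend handoff in run_model_step() follows the standard gradio_client pattern. A sketch assuming the private Space registers an endpoint as api_name="/predict_dsr" taking (audio_path, normalized_text), as the diff indicates; the literal argument values are placeholders:

import os

from gradio_client import Client

client = Client(os.getenv("PRIVATE_BACKEND_URL"), hf_token=os.getenv("HF_TOKEN"))
prediction = client.predict(
    "current_sample.wav",   # 16 kHz wav written by get_sample_logic()
    "the cow",              # normalized Whisper hypothesis
    api_name="/predict_dsr",
)
print(prediction)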