st192011 committed on
Commit
b160197
·
verified ·
1 Parent(s): 6cf37ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -42
app.py CHANGED
@@ -5,70 +5,71 @@ import re
5
  import random
6
  import librosa
7
  import soundfile as sf
8
- import torch
9
  from transformers import pipeline
10
  from datasets import load_dataset, Audio
11
  from gradio_client import Client
12
  from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
13
 
14
# 1. Initialize Whisper Tiny (forced to English with strict output control).
# Built once at import time and reused by every transcription request.
# Per the original author's note, max_new_tokens=64 and
# repetition_penalty=3.0 prevent the "L-O-O-O" infinite loops.
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={
        "language": "en",
        "task": "transcribe",
        "repetition_penalty": 3.0,
        "max_new_tokens": 64,
        "no_repeat_ngram_size": 3
    }
)

# 2. Secret configuration, read from environment variables (the original
# comment says these come from the Space settings). Either value is None
# when its variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
31
 
32
def normalize_text(text):
    """Normalize a transcript for exact-match comparison.

    Lowercases the text, removes every character that is not a letter,
    digit, or whitespace (underscores included), and collapses each run of
    whitespace into a single space.

    Args:
        text: Raw transcript; ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # The regex class \w also matches "_", so the underscore is listed
    # explicitly in order to be stripped like any other punctuation mark.
    stripped = re.sub(r"[^\w\s]|_", "", text).lower()
    # Deleting a free-standing symbol ("a - b") leaves a double space that
    # would break exact matching; split/join collapses it and trims the ends.
    return " ".join(stripped.split())
36
 
37
- # --- Data Loading Logic ---
38
  def get_sample_logic(speaker_id):
39
  try:
 
40
  if speaker_id == "F02 (UA)":
41
- # 1. UA-Speech access (Using the running code you provided)
42
  dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
43
- # Since this repo is specifically for UA female, we pull the sample directly
44
- sample = next(iter(dataset.shuffle(buffer_size=50)))
45
-
46
- gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
47
  audio_data = sample['audio']['array']
48
  sample_rate = sample['audio']['sampling_rate']
49
 
 
50
  else:
51
- # 2. Torgo access (Using your training logic)
52
  dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
53
  dataset = dataset.cast_column("audio", Audio(decode=False))
54
 
55
- # Use path-parsing to find specific speaker IDs in Torgo
56
- def filter_spk(x):
57
- sid = str(x.get('speaker_id', '')).upper()
58
- if not sid or sid == "NONE":
59
- sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
60
- return sid == speaker_id
61
 
62
- speaker_ds = dataset.filter(filter_spk)
63
- sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
 
 
 
 
 
 
 
 
64
 
65
- gt_text = sample.get('transcription') or sample.get('text')
66
- audio_bytes = sample['audio']['bytes']
 
 
 
67
  audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
68
 
69
  temp_path = "current_sample.wav"
70
  sf.write(temp_path, audio_data, sample_rate)
71
-
72
  return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
73
 
74
  except Exception as e:
@@ -76,33 +77,30 @@ def get_sample_logic(speaker_id):
76
 
77
def run_whisper_step(audio_path):
    """Transcribe the loaded sample with the baseline Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe; falsy when no
            sample has been loaded yet.

    Returns:
        A ``(raw_text, normalized_text)`` tuple. When no audio is loaded or
        transcription fails, the first element is a status message and the
        second is ``""``.
    """
    if not audio_path:
        return "No audio loaded", ""
    try:
        result = whisper_asr(audio_path)
        raw_w = result["text"]
    except Exception as exc:
        # Report the failure as text, the way run_model_step does, instead
        # of letting the exception escape into the UI. The empty normalized
        # value makes run_model_step's prerequisite guard reject the sample.
        return f"Whisper transcription failed: {exc}", ""
    return raw_w, normalize_text(raw_w)
84
 
85
def run_model_step(audio_path, norm_whisper):
    """Send the sample and its normalized Whisper text to the private backend.

    Args:
        audio_path: Path of the loaded audio sample.
        norm_whisper: Normalized Whisper transcript for that sample.

    Returns:
        The backend's prediction, or a status message when a prerequisite
        is missing or the backend call fails.
    """
    import logging

    if not audio_path or not norm_whisper:
        return "Load data and run Whisper first."
    try:
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
        # The private app expects the audio and the normalized Whisper text.
        return client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
    except Exception:
        # Keep the exception text out of the public UI: it may reveal the
        # private backend URL or other deployment details (assumption, not
        # verifiable from this file). The full traceback goes to the log.
        logging.getLogger(__name__).exception("Private backend call failed")
        return "Backend Offline. Details were written to the server log."
94
 
95
- # UI Construction
96
  with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
97
  gr.Markdown("# ⚗️ Torgo DSR Lab")
98
- gr.Markdown("Reconstruction Layer for Torgo and UA-Speech")
99
-
100
  current_audio_path = gr.State("")
101
 
102
  with gr.Tab("🔬 Laboratory"):
103
  with gr.Row():
104
  with gr.Column(scale=1):
105
- gr.Markdown("### Step 1: Load Data")
106
  speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
107
  load_btn = gr.Button("Load Data")
108
  meta_display = gr.JSON(label="Speaker Meta")
@@ -123,20 +121,23 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
123
  gr.Markdown("# 🔬 Performance Evaluation")
124
  with gr.Row():
125
  with gr.Column():
126
- gr.Markdown("### 📏 Metric: Exact Match Accuracy")
127
- gr.Markdown("Accuracy is calculated by comparing the **normalized prediction** against the **normalized ground truth**.")
 
 
128
  with gr.Column():
129
- gr.Markdown("### 🧪 Model Definitions")
130
- gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")
131
- gr.Markdown("* **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on unseen speakers (LOSO).")
132
-
 
 
133
  gr.Markdown("---")
134
- gr.Markdown("## 1. Torgo In-Domain Breakdown")
135
  gr.DataFrame(get_indomain_breakdown())
136
  gr.Markdown("## 2. Experimental Summary")
137
  gr.DataFrame(get_experimental_summary())
138
 
139
- # Connectivity
140
  load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
141
  whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
142
  model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)
 
5
  import random
6
  import librosa
7
  import soundfile as sf
 
8
  from transformers import pipeline
9
  from datasets import load_dataset, Audio
10
  from gradio_client import Client
11
  from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
12
 
13
# 1. Initialize the baseline ASR pipeline with generation constraints.
# Built once at import time and reused by every transcription request.
# Per the original author's note, max_new_tokens is set to 64 to prevent
# infinite "L-O-O-O" loops.
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={
        "language": "en",
        "task": "transcribe",
        "max_new_tokens": 64,
        "repetition_penalty": 1.5  # Discourages token looping
    }
)

# Credentials and backend location are read from environment variables;
# either value is None when its variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
28
 
29
def normalize_text(text):
    """Normalize a transcript for exact-match comparison.

    Lowercases the text, removes every character that is not a letter,
    digit, or whitespace (underscores included), and collapses each run of
    whitespace into a single space.

    Args:
        text: Raw transcript; ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # The regex class \w also matches "_", so the underscore is listed
    # explicitly in order to be stripped like any other punctuation mark.
    stripped = re.sub(r"[^\w\s]|_", "", text).lower()
    # Deleting a free-standing symbol ("a - b") leaves a double space that
    # would break exact matching; split/join collapses it and trims the ends.
    return " ".join(stripped.split())
32
 
 
33
  def get_sample_logic(speaker_id):
34
  try:
35
+ # PATH A: UA-SPEECH (Strictly following your provided running block)
36
  if speaker_id == "F02 (UA)":
 
37
  dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
38
+ # Shuffle helps pick a different word each time
39
+ sample = next(iter(dataset.shuffle(buffer_size=100)))
40
+ gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence', 'Unknown')
 
41
  audio_data = sample['audio']['array']
42
  sample_rate = sample['audio']['sampling_rate']
43
 
44
+ # PATH B: TORGO (Optimized for speed)
45
  else:
 
46
  dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
47
  dataset = dataset.cast_column("audio", Audio(decode=False))
48
 
49
+ # Speed Hack: Shuffle the stream buffer to find the speaker faster
50
+ # This avoids starting from speaker MC01 every time
51
+ shuffled_ds = dataset.shuffle(buffer_size=1000)
 
 
 
52
 
53
+ # Find first match in shuffled stream
54
+ found_sample = None
55
+ for item in shuffled_ds:
56
+ sid = str(item.get('speaker_id', '')).upper()
57
+ if not sid or sid == "NONE":
58
+ sid = os.path.basename(item['audio']['path']).split('_')[0].upper()
59
+
60
+ if sid == speaker_id:
61
+ found_sample = item
62
+ break
63
 
64
+ if not found_sample:
65
+ return None, "Speaker search timeout. Try again.", {}
66
+
67
+ gt_text = found_sample.get('transcription') or found_sample.get('text', 'Unknown')
68
+ audio_bytes = found_sample['audio']['bytes']
69
  audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
70
 
71
  temp_path = "current_sample.wav"
72
  sf.write(temp_path, audio_data, sample_rate)
 
73
  return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
74
 
75
  except Exception as e:
 
77
 
78
def run_whisper_step(audio_path):
    """Transcribe the loaded sample with the baseline Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe; falsy when no
            sample has been loaded yet.

    Returns:
        A ``(raw_text, normalized_text)`` tuple. When no audio is loaded or
        transcription fails, the first element is a status message and the
        second is ``""``.
    """
    if not audio_path:
        return "No audio loaded", ""
    try:
        result = whisper_asr(audio_path)
        raw_w = result["text"]
    except Exception as exc:
        # Report the failure as text, the way run_model_step does, instead
        # of letting the exception escape into the UI. The empty normalized
        # value makes run_model_step's prerequisite guard reject the sample.
        return f"Whisper transcription failed: {exc}", ""
    return raw_w, normalize_text(raw_w)
84
 
85
def run_model_step(audio_path, norm_whisper):
    """Send the sample and its normalized Whisper text to the private backend.

    Args:
        audio_path: Path of the loaded audio sample.
        norm_whisper: Normalized Whisper transcript for that sample.

    Returns:
        The backend's prediction, or a status message when a prerequisite
        is missing or the backend call fails.
    """
    import logging

    if not audio_path or not norm_whisper:
        return "Incomplete steps"
    try:
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
        # NOTE(review): the original comment says the private app applies
        # repetition_penalty=3.0; that cannot be confirmed from this file.
        return client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
    except Exception:
        # The user-facing message is deliberately generic, so record the
        # real cause server-side rather than discarding it.
        logging.getLogger(__name__).exception("Private backend call failed")
        return "Backend Offline. Research Model requires Private Space access."
94
 
95
+ # UI
96
  with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
97
  gr.Markdown("# ⚗️ Torgo DSR Lab")
 
 
98
  current_audio_path = gr.State("")
99
 
100
  with gr.Tab("🔬 Laboratory"):
101
  with gr.Row():
102
  with gr.Column(scale=1):
103
+ gr.Markdown("### Step 1: Load Sample")
104
  speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
105
  load_btn = gr.Button("Load Data")
106
  meta_display = gr.JSON(label="Speaker Meta")
 
121
  gr.Markdown("# 🔬 Performance Evaluation")
122
  with gr.Row():
123
  with gr.Column():
124
+ gr.Markdown("""
125
+ ### 📏 Metric: Exact Match Accuracy
126
+ Accuracy is calculated as the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.
127
+ """)
128
  with gr.Column():
129
+ gr.Markdown("""
130
+ ### 🧪 Model Definitions
131
+ * **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
132
+ * **10K Triple-Mix Model:** Includes synthetic data and anchors. Tested on **unseen speakers (LOSO)** to prove generalization.
133
+ """)
134
+
135
  gr.Markdown("---")
136
+ gr.Markdown("## 1. Torgo In-Domain Analysis")
137
  gr.DataFrame(get_indomain_breakdown())
138
  gr.Markdown("## 2. Experimental Summary")
139
  gr.DataFrame(get_experimental_summary())
140
 
 
141
  load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
142
  whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
143
  model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)