st192011 committed on
Commit
005401d
·
verified ·
1 Parent(s): 1368417

Update app.py

Files changed (1)
  1. app.py +151 -190
app.py CHANGED
@@ -1,242 +1,203 @@
import gradio as gr
- from gradio_client import Client
import os
import io
import re
import random
import librosa
import soundfile as sf
- import torch
- import numpy as np
from transformers import pipeline
from datasets import load_dataset, Audio
- import tempfile
-
- # ==========================================
- # 1. SETUP & AUTHENTICATION
- # ==========================================
-
- # HF Token for accessing Gated Datasets and Private Space
- HF_TOKEN = os.getenv("HF_TOKEN")
-
- # Private Backend Configuration
- PRIVATE_SPACE_URL = "st192011/Torgo-DSR-Private"
-
- print(f"Connecting to Private Backend at {PRIVATE_SPACE_URL}...")
- try:
-     backend_client = Client(PRIVATE_SPACE_URL, hf_token=HF_TOKEN)
-     print("✅ Successfully connected to Private Backend.")
- except Exception as e:
-     print(f"⚠️ Warning: Could not connect to backend. Error: {e}")
-     backend_client = None
-
- # ==========================================
- # 2. WHISPER TINY (Strict Colab Settings)
- # ==========================================
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Loading Whisper Tiny on {device}...")

whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
-     device=device,
    generate_kwargs={
        "language": "en",
-         "task": "transcribe",
-         "repetition_penalty": 3.0,
-         "max_new_tokens": 64
    }
)

- # ==========================================
- # 3. METADATA & DATA LOGIC
- # ==========================================
-
- SPEAKER_META = {
-     "F01": {"Gender": "Female", "Severity": "Severe", "Dataset": "Torgo"},
-     "F03": {"Gender": "Female", "Severity": "Mild", "Dataset": "Torgo"},
-     "F04": {"Gender": "Female", "Severity": "Mild", "Dataset": "Torgo"},
-     "M01": {"Gender": "Male", "Severity": "Moderate", "Dataset": "Torgo"},
-     "M02": {"Gender": "Male", "Severity": "Mild", "Dataset": "Torgo"},
-     "M03": {"Gender": "Male", "Severity": "Mild", "Dataset": "Torgo"},
-     "M04": {"Gender": "Male", "Severity": "Moderate", "Dataset": "Torgo"},
-     "M05": {"Gender": "Male", "Severity": "Severe", "Dataset": "Torgo"},
-     "F02 (UA)": {"Gender": "Female", "Severity": "Severe", "Dataset": "UA-Speech"}
- }

def get_sample_logic(speaker_id):
-     """
-     Exact logic from Colab:
-     - Uses abnerh/TORGO-database for Torgo
-     - Uses resproj007/uaspeech_female for UA
-     - Uses librosa + io.BytesIO for decoding
-     """
-     print(f"Attempting to load sample for: {speaker_id}")
    try:
        if "UA" in speaker_id:
-             # UA-Speech logic: Direct pull
-             ds = load_dataset("resproj007/uaspeech_female", split="train", streaming=True, token=HF_TOKEN)
-             ds = ds.cast_column("audio", Audio(decode=False))
-             # F02 is the only speaker here, skip random amount for variety
-             iterator = iter(ds.skip(random.randint(0, 50)))
-             sample = next(iterator)
        else:
-             # Torgo logic: abnerh dataset with filtering
-             ds = load_dataset("abnerh/TORGO-database", split="train", streaming=True, token=HF_TOKEN)
-             ds = ds.cast_column("audio", Audio(decode=False))

-             # Filter by speaker ID
            def filter_spk(x):
-                 # Try to get speaker_id from metadata, fall back to filename parsing
                sid = str(x.get('speaker_id', '')).upper()
                if not sid or sid == "NONE":
-                     path = x.get('audio', {}).get('path', '')
-                     sid = os.path.basename(path).split('_')[0].upper()
                return sid == speaker_id

-             speaker_ds = ds.filter(filter_spk)
-             # Shuffle buffer to get random samples
-             iterator = iter(speaker_ds.shuffle(buffer_size=10))
-             sample = next(iterator)

-         # Metadata extraction
-         gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
-
-         # --- Manual Byte Decoding (Colab Logic) ---
        audio_bytes = sample['audio']['bytes']
-         # Load directly into librosa using BytesIO
-         audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
-
-         # Save to temp file for Gradio/Backend
-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-             sf.write(tmp.name, audio_data, sample_rate)
-             temp_path = tmp.name

-         return temp_path, gt_text.lower().strip(), SPEAKER_META.get(speaker_id, {})
-
-     except StopIteration:
-         return None, "Error: Could not find any samples for this speaker in the stream.", {}
    except Exception as e:
-         return None, f"Dataset Error: {str(e)}", {}
-
- def run_whisper_step(audio_path):
-     """
-     Step 2: Baseline ASR
-     """
-     if not audio_path:
-         return "No audio loaded", ""

-     try:
-         result = whisper_asr(audio_path)
-         raw_w = result["text"]
-         # Normalized Baseline (No punctuation, lowercase)
-         norm_w = re.sub(r'[^\w\s]', '', raw_w).lower().strip()
-         return raw_w, norm_w
-     except Exception as e:
-         return f"Whisper Error: {e}", "Error"
-
- def run_model_step(audio_path):
-     """
-     Step 3: Private Backend Reconstruction
-     """
-     if not audio_path:
-         return "No audio loaded", "Step 1 incomplete"

-     if not backend_client:
-         return None, "⚠️ Backend Disconnected. Check Private Space."
-
    try:
-         print("Sending audio to Private Backend...")
-         # Calls the /predict_dsr endpoint in the private space
-         # Expecting returns: [Audio Path, Transcription String]
-         result = backend_client.predict(
-             audio_path,
-             api_name="/predict_dsr"
-         )
-         reconstructed_audio = result[0]
-         dsr_text = result[1]
-         return reconstructed_audio, dsr_text
    except Exception as e:
-         return None, f"Backend Prediction Error: {str(e)}"
-
- # ==========================================
- # 4. GRADIO UI
- # ==========================================

with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
-     gr.Markdown(
-         """
-         # ⚗️ Torgo DSR Lab
-         **Integrated Research Interface** | *Syncs with Colab Logic*
-         """
-     )

-     # State to hold the current audio path across steps
-     current_audio_state = gr.State("")
-
-     with gr.Row():
-         # --- COLUMN 1: LOAD ---
-         with gr.Column(scale=1):
-             gr.Markdown("### Step 1: Load Sample")
-             speaker_input = gr.Dropdown(
-                 choices=sorted(list(SPEAKER_META.keys())),
-                 label="Speaker ID",
-                 value="F01"
-             )
-             load_btn = gr.Button("🎲 Load Data", variant="secondary")
-
-             # Displays
-             input_audio_display = gr.Audio(label="Input Audio", type="filepath", interactive=False)
-             gt_box = gr.Textbox(label="Ground Truth", interactive=False)
-             meta_display = gr.JSON(label="Speaker Meta")
-
-         # --- COLUMN 2: BASELINE ---
-         with gr.Column(scale=1):
-             gr.Markdown("### Step 2: ASR Baseline")
-             whisper_btn = gr.Button("Run Whisper Tiny")
-
-             w_raw = gr.Textbox(label="Whisper Raw")
-             w_norm = gr.Textbox(label="Whisper Normalized (WER Check)")
-
-         # --- COLUMN 3: RECONSTRUCTION ---
-         with gr.Column(scale=1):
-             gr.Markdown("### Step 3: Neural Reconstruction")
-             model_btn = gr.Button("🚀 Run 10K Triple-Mix Model", variant="primary")

-             output_audio_display = gr.Audio(label="Reconstructed Audio", type="filepath", interactive=False)
-             final_out_text = gr.Textbox(label="DSR Transcription")
-
-     # ==========================================
-     # EVENT HANDLERS
-     # ==========================================
-
-     # Step 1: Load
-     def on_load(speaker_id):
-         path, text, meta = get_sample_logic(speaker_id)
-         if path:
-             return path, path, text, meta  # Update State, Audio Player, Text, JSON
-         else:
-             return None, None, text, meta  # Handle errors passed in 'text' variable

    load_btn.click(
-         fn=on_load,
-         inputs=[speaker_input],
-         outputs=[current_audio_state, input_audio_display, gt_box, meta_display]
    )

-     # Step 2: Whisper
    whisper_btn.click(
-         fn=run_whisper_step,
-         inputs=[current_audio_state],
        outputs=[w_raw, w_norm]
    )

-     # Step 3: Backend Model
    model_btn.click(
-         fn=run_model_step,
-         inputs=[current_audio_state],
-         outputs=[output_audio_display, final_out_text]
    )

- if __name__ == "__main__":
-     demo.launch()
import gradio as gr
import os
import io
import re
import random
import librosa
import soundfile as sf
+ import pandas as pd
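+ # pandas is used by the stats tables rendered in the Research Statistics tab.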
 
from transformers import pipeline
from datasets import load_dataset, Audio
+ from gradio_client import Client
+ from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META

+ # 1. Initialize Baseline ASR (Strict English, Repetition Penalty 3.0)
+ print("Initializing Whisper Tiny Baseline...")
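+ # repetition_penalty=3.0 damps the repeated-token loops small Whisper checkpoints
+ # can produce on dysarthric speech; language/task are pinned to English transcription.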
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={
        "language": "en",
+         "task": "transcribe",
+         "repetition_penalty": 3.0
    }
)

+ HF_TOKEN = os.getenv("HF_TOKEN")
+ PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
+
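+ # Normalization used for the exact-match metric: strip punctuation, lowercase, trim.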
+ def normalize_text(text):
+     if not text: return ""
+     return re.sub(r'[^\w\s]', '', text).lower().strip()

+ def format_audio(audio_path):
+     """Ensures audio is 16kHz mono to match ASR training conditions."""
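+     # NOTE: fixed output filename in the working directory; concurrent sessions
+     # will overwrite each other's file.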
+     y, sr = librosa.load(audio_path, sr=16000)
+     out_path = "formatted_input.wav"
+     sf.write(out_path, y, sr)
+     return out_path
+
+ # --- Logic: Data Loading ---
def get_sample_logic(speaker_id):
    try:
        if "UA" in speaker_id:
+             # UA-Speech Access (Direct pull for F02)
+             dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
+             dataset = dataset.cast_column("audio", Audio(decode=False))
+             # UA is small, skip slightly for variety
+             sample = next(iter(dataset.skip(random.randint(0, 30))))
+             gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
        else:
+             # Torgo Access (Manual filtering as per Colab fix)
+             dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
+             dataset = dataset.cast_column("audio", Audio(decode=False))

            def filter_spk(x):
                sid = str(x.get('speaker_id', '')).upper()
                if not sid or sid == "NONE":
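+                     # Fallback: derive the ID from the filename's speaker prefix.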
+                     sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
                return sid == speaker_id

+             speaker_ds = dataset.filter(filter_spk)
+             sample = next(iter(speaker_ds.shuffle(buffer_size=10)))
+             gt_text = sample.get('transcription') or sample.get('text')

+         # Decode Bytes manually to bypass torchcodec errors
        audio_bytes = sample['audio']['bytes']
+         audio_data, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000)
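+         # Write a 16 kHz WAV to a fixed path so Gradio and the backend get a plain file.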
+         temp_path = "dataset_sample.wav"
+         sf.write(temp_path, audio_data, sr)

+         return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]

    except Exception as e:
+         return None, f"Dataset Error: {e}", {}
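+         # The error string lands in the Ground Truth box; the state path stays None,
+         # so the later steps refuse to run.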
+
+ # --- Logic: Model Processing ---
+ def process_audio_step_1(audio_path):
+     """Runs Whisper Baseline and returns normalized text."""
+     if not audio_path: return "No audio", ""

+     # Pre-process audio format
+     formatted_path = format_audio(audio_path)
+
+     # Run Whisper
+     result = whisper_asr(formatted_path)
+     raw_w = result["text"]
+     norm_w = normalize_text(raw_w)
+     return raw_w, norm_w
+
+ def process_audio_step_2(audio_path, norm_whisper):
+     """Sends audio + normalized whisper to the Private Model."""
+     if not audio_path or not norm_whisper: return "Incomplete input from previous steps."

    try:
+         formatted_path = format_audio(audio_path)
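+         # Connect per request rather than holding a module-level client open.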
+         client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
+         prediction = client.predict(formatted_path, norm_whisper, api_name="/predict_dsr")
+         return prediction
    except Exception as e:
+         return f"Backend Connection Required. Details: {e}"

+ # --- UI Construction ---
with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
+     gr.Markdown("# ⚗️ Torgo DSR Lab")
+     gr.Markdown("Neural Reconstruction Layer for Torgo (In-domain/LOSO) and UA-Speech (Zero-Shot).")

+     # Hidden state to store the path of the currently active audio
+     active_audio_path = gr.State("")
+
+     with gr.Tab("🔬 Laboratory"):
+         with gr.Row():
+             # LEFT COLUMN: Data Input
+             with gr.Column(scale=1):
+                 with gr.Group():
+                     gr.Markdown("### Channel A: Research Datasets")
+                     speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Select Speaker Profile", value="F01")
+                     load_btn = gr.Button("Load Sample from Dataset")
+                     gt_box = gr.Textbox(label="Ground Truth (Reference)", interactive=False)
+                     meta_display = gr.JSON(label="Speaker Metadata")
+
+                 gr.Markdown("---")
+
+                 with gr.Group():
+                     gr.Markdown("### Channel B: Personal Input")
+                     user_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or Upload Audio")
+                     user_load_btn = gr.Button("Use This Audio")
+
+             # RIGHT COLUMN: Transcripts
+             with gr.Column(scale=2):
+                 gr.Markdown("### Analysis & Reconstruction")
+
+                 with gr.Group():
+                     gr.Markdown("#### Step 1: ASR Baseline")
+                     whisper_btn = gr.Button("Run Whisper Tiny")
+                     w_raw = gr.Textbox(label="Whisper Raw Transcript")
+                     w_norm = gr.Textbox(label="Whisper Normalized (Input for Model)")
+
+                 gr.Markdown("---")
+
+                 with gr.Group():
+                     gr.Markdown("#### Step 2: Neural Reconstruction")
+                     model_btn = gr.Button("Run Our Correction Model", variant="primary")
+                     final_out = gr.Textbox(label="DSR Lab Prediction (5K Model)")
+
+     with gr.Tab("📊 Research Statistics"):
+         gr.Markdown("# 🔬 Performance Evaluation")
+
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown("""
+                 ### 📏 Metric: Exact Match Accuracy
+                 Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
+                 """)

+             with gr.Column():
+                 gr.Markdown("""
+                 ### 🧪 Model Definitions
+                 * **5K Pure Model:** Trained on real-world Torgo articulatory distortions. Optimized for phonetic fidelity.
+                 * **10K Triple-Mix Model:** Includes synthetic data and anchors; used for generalization (LOSO) testing.
+                 """)
+
+         gr.Markdown("---")
+         gr.Markdown("## 1. Torgo In-Domain Analysis (By Speaker)")
+         gr.DataFrame(get_indomain_breakdown())
+
+         gr.Markdown("## 2. Experimental Milestone Summary")
+         gr.DataFrame(get_experimental_summary())
+
+         gr.Markdown("""
+         ### 🔍 Key Discovery: The Acoustic Floor
+         Our research found that the **5K Pure Model** achieved higher accuracy in both in-domain and zero-shot tasks. This suggests an **'Acoustic Floor'**: real-world phonetic distortions are more valuable for model grounding than synthetic linguistic diversity.
+         """)

+     # --- Event Handlers ---
+
+     # Dataset Channel: Load -> Update State -> Update UI Text/Meta
    load_btn.click(
+         get_sample_logic,
+         inputs=speaker_input,
+         outputs=[active_audio_path, gt_box, meta_display]
+     )
+
+     # Personal Channel: Use Audio -> Update State -> Mark GT as user-provided
+     user_load_btn.click(
+         lambda x: (x, "User Provided Audio", {"Dataset": "Custom", "Severity": "Unknown"}),
+         inputs=user_audio,
+         outputs=[active_audio_path, gt_box, meta_display]
    )
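
+     # Channel B supplies no reference text, so exact-match scoring does not apply
+     # to user-provided audio.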

+     # Step 1: Whisper (Uses State)
    whisper_btn.click(
+         process_audio_step_1,
+         inputs=active_audio_path,
        outputs=[w_raw, w_norm]
    )

+     # Step 2: Model (Uses State + Whisper result)
    model_btn.click(
+         process_audio_step_2,
+         inputs=[active_audio_path, w_norm],
+         outputs=final_out
    )

+ demo.launch()