st192011 commited on
Commit
4eb1313
·
verified ·
1 Parent(s): f310c2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -87
app.py CHANGED
@@ -1,137 +1,127 @@
1
  import gradio as gr
2
  import os
3
- import random
4
- import soundfile as sf
5
- import re
6
  import io
 
 
7
  import librosa
8
- import torch
9
  from transformers import pipeline
10
  from datasets import load_dataset, Audio
11
  from gradio_client import Client
12
  from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
13
 
14
- # 1. Initialize Local Whisper Tiny (Baseline)
15
- # CPU friendly, fast inference
16
- whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 
 
 
17
 
18
- # 2. Private Backend Config
19
  HF_TOKEN = os.getenv("HF_TOKEN")
20
- PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
21
 
22
- def get_sample(speaker_id):
23
- """Integrated loading logic from your research code."""
 
 
 
 
24
  try:
25
- if speaker_id == "F02":
26
- # UA-Speech loading logic
27
- dataset = load_dataset("resproj007/uaspeech_female", split="test", streaming=True)
28
- # F02 is usually the primary speaker in this slice
29
- sample = next(iter(dataset.shuffle(buffer_size=20)))
30
- gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence', 'Unknown')
31
- audio_data = sample['audio']['array']
32
- sample_rate = sample['audio']['sampling_rate']
33
  else:
34
- # Torgo loading logic
35
  dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
36
- # Cast for manual decoding as per your training script
37
  dataset = dataset.cast_column("audio", Audio(decode=False))
38
-
39
- # Filter by speaker
40
- speaker_ds = dataset.filter(lambda x: str(x.get('speaker_id', '')).upper() == speaker_id)
41
- sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
42
-
43
- # Extract ground truth
44
- gt_text = sample.get('transcription') or sample.get('text', 'Unknown')
45
-
46
- # Decode Audio bytes
47
- audio_bytes = sample['audio']['bytes']
48
- audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
49
 
50
- # Save to temporary file for Gradio and Whisper
51
- temp_path = "temp_sample.wav"
 
 
 
 
52
  sf.write(temp_path, audio_data, sample_rate)
53
 
54
  return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
55
-
56
  except Exception as e:
57
- return None, f"Error accessing dataset: {e}", None
58
 
59
- def run_correction(audio_path, gt_text):
60
- if audio_path is None:
61
- return "No audio provided", "", "Please load a sample or record audio."
62
-
63
- # A. Local Whisper Inference
64
- try:
65
- w_res = whisper_asr(audio_path)
66
- w_raw = w_res["text"]
67
- w_norm = re.sub(r'[^\w\s]', '', w_raw).lower().strip()
68
- except Exception as e:
69
- return f"Whisper Error: {e}", "", ""
70
-
71
- # B. Call Private Backend
72
- # This sends the audio and the whisper transcript to your private Gemma model
73
  try:
74
  client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
75
- # Note: Your private backend should expect (audio_file, whisper_text)
76
- res_5k, res_10k = client.predict(audio_path, w_norm, api_name="/predict_dsr_dual")
 
 
77
  except Exception as e:
78
- res_5k = "Backend Offline"
79
- res_10k = "Please ensure the Private Space is running."
80
-
81
- return w_raw, res_5k, res_10k
82
 
83
- # UI Construction
84
  with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
85
  gr.Markdown("# ⚗️ Torgo DSR Lab")
86
- gr.Markdown("### Neural Reconstruction Layer for Torgo and UA-Speech Zero-Shot")
87
 
88
- with gr.Tab("🔬 Interactive Lab"):
 
 
89
  with gr.Row():
90
  with gr.Column(scale=1):
91
- gr.Markdown("#### 1. Select and Load Sample")
92
- spk_input = gr.Dropdown(list(SPEAKER_META.keys()), label="Speaker ID", value="F01")
93
- load_btn = gr.Button("🎲 Get Random Sample", variant="secondary")
94
- gr.Markdown("---")
95
- audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
96
-
97
  with gr.Column(scale=2):
98
- gr.Markdown("#### 2. Metadata & Comparison")
99
- with gr.Row():
100
- gt_box = gr.Textbox(label="Ground Truth", interactive=False)
101
- meta_box = gr.JSON(label="Speaker Meta")
102
 
103
- w_out = gr.Textbox(label="Whisper Tiny Baseline (Raw Transcript)")
104
- with gr.Row():
105
- out_5k = gr.Textbox(label="5K Pure Model Prediction")
106
- out_10k = gr.Textbox(label="10K Triple-Mix Prediction")
107
-
108
- run_btn = gr.Button("🚀 Run ASR & Reconstruction", variant="primary")
109
 
110
  with gr.Tab("📊 Research Statistics"):
111
  gr.Markdown("# 🔬 Performance Evaluation")
112
-
113
  with gr.Row():
114
  with gr.Column():
115
  gr.Markdown("""
116
  ### 📏 Metric: Exact Match Accuracy
117
- Accuracy is calculated as the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.
118
  """)
119
-
120
  with gr.Column():
121
  gr.Markdown("""
122
  ### 🧪 Model Definitions
123
- * **5K Pure Model:** Trained on 5,000 real Torgo samples. Optimized for articulatory fidelity.
124
- * **10K Triple-Mix Model:** Includes phonetic anchors and synthetic data. Used for Generalization (LOSO) testing.
125
  """)
126
-
127
- gr.Markdown("## 1. Torgo In-Domain Breakdown (By Speaker)")
128
  gr.DataFrame(get_indomain_breakdown())
129
-
130
- gr.Markdown("## 2. Experimental Condition Summary")
131
  gr.DataFrame(get_experimental_summary())
132
 
133
- # Event Handlers
134
- load_btn.click(get_sample, inputs=spk_input, outputs=[audio_input, gt_box, meta_box])
135
- run_btn.click(run_correction, inputs=[audio_input, gt_box], outputs=[w_out, out_5k, out_10k])
 
136
 
137
  demo.launch()
 
1
  import gradio as gr
2
  import os
 
 
 
3
  import io
4
+ import re
5
+ import random
6
  import librosa
7
+ import soundfile as sf
8
  from transformers import pipeline
9
  from datasets import load_dataset, Audio
10
  from gradio_client import Client
11
  from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
12
 
13
+ # 1. Initialize Baseline ASR (Forced to English)
14
+ whisper_asr = pipeline(
15
+ "automatic-speech-recognition",
16
+ model="openai/whisper-tiny",
17
+ generate_kwargs={"language": "en", "task": "transcribe"}
18
+ )
19
 
20
+ # 2. Configuration from Space Secrets
21
  HF_TOKEN = os.getenv("HF_TOKEN")
22
+ PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
23
 
24
+ def normalize_text(text):
25
+ if not text: return ""
26
+ return re.sub(r'[^\w\s]', '', text).lower().strip()
27
+
28
+ def get_sample_logic(speaker_id):
29
+ """Bypasses internal decoders to ensure data access works for both datasets."""
30
  try:
31
+ if speaker_id == "F02 (UA)":
32
+ dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
33
+ dataset = dataset.cast_column("audio", Audio(decode=False))
34
+ speaker_ds = dataset.filter(lambda x: x["speaker_id"] == "F02")
 
 
 
 
35
  else:
 
36
  dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
 
37
  dataset = dataset.cast_column("audio", Audio(decode=False))
38
+ def filter_spk(x):
39
+ sid = str(x.get('speaker_id', '')).upper()
40
+ if not sid or sid == "NONE":
41
+ sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
42
+ return sid == speaker_id
43
+ speaker_ds = dataset.filter(filter_spk)
 
 
 
 
 
44
 
45
+ sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
46
+ gt_text = sample.get('transcription') or sample.get('text') or sample.get('sentence') or "Unknown"
47
+
48
+ audio_bytes = sample['audio']['bytes']
49
+ audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
50
+ temp_path = "current_sample.wav"
51
  sf.write(temp_path, audio_data, sample_rate)
52
 
53
  return temp_path, gt_text.lower().strip(), SPEAKER_META[speaker_id]
 
54
  except Exception as e:
55
+ return None, f"Dataset Error: {e}", {}
56
 
57
+ def run_whisper_step(audio_path):
58
+ if not audio_path: return "No audio loaded", ""
59
+ result = whisper_asr(audio_path)
60
+ raw_w = result["text"]
61
+ norm_w = normalize_text(raw_w)
62
+ return raw_w, norm_w
63
+
64
+ def run_model_step(audio_path, norm_whisper):
65
+ if not audio_path or not norm_whisper: return "Load data and run Whisper first."
 
 
 
 
 
66
  try:
67
  client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
68
+ # Private app expects audio and normalized whisper
69
+ # Adjust api_name to match your private space definition
70
+ prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
71
+ return prediction
72
  except Exception as e:
73
+ return f"Backend Error: {e}. Ensure Private Space is running."
 
 
 
74
 
75
+ # UI Layout
76
  with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
77
  gr.Markdown("# ⚗️ Torgo DSR Lab")
78
+ gr.Markdown("Stepwise evaluation of standard ASR vs. Neural Reconstruction Layer.")
79
 
80
+ current_audio_path = gr.State("")
81
+
82
+ with gr.Tab("🔬 Laboratory"):
83
  with gr.Row():
84
  with gr.Column(scale=1):
85
+ gr.Markdown("### Step 1: Load Data")
86
+ speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Speaker ID", value="F01")
87
+ load_btn = gr.Button("Load Data")
88
+ meta_display = gr.JSON(label="Speaker Meta")
89
+ gt_box = gr.Textbox(label="Ground Truth")
90
+
91
  with gr.Column(scale=2):
92
+ gr.Markdown("### Step 2: ASR Baseline")
93
+ whisper_btn = gr.Button("Run Whisper Tiny")
94
+ w_raw = gr.Textbox(label="Whisper Raw")
95
+ w_norm = gr.Textbox(label="Whisper Normalized")
96
 
97
+ gr.Markdown("---")
98
+ gr.Markdown("### Step 3: Neural Reconstruction")
99
+ model_btn = gr.Button("Run Our Model", variant="primary")
100
+ final_out = gr.Textbox(label="DSR Lab Prediction")
 
 
101
 
102
  with gr.Tab("📊 Research Statistics"):
103
  gr.Markdown("# 🔬 Performance Evaluation")
 
104
  with gr.Row():
105
  with gr.Column():
106
  gr.Markdown("""
107
  ### 📏 Metric: Exact Match Accuracy
108
+ Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
109
  """)
 
110
  with gr.Column():
111
  gr.Markdown("""
112
  ### 🧪 Model Definitions
113
+ * **5K Pure Model:** Trained on real-world Torgo distortions. Optimized for articulatory accuracy.
114
+ * **10K Triple-Mix Model:** Includes synthetic data and anchors; tested on unseen speakers (LOSO).
115
  """)
116
+ gr.Markdown("---")
117
+ gr.Markdown("## 1. Torgo In-Domain Analysis (By Speaker)")
118
  gr.DataFrame(get_indomain_breakdown())
119
+ gr.Markdown("## 2. Experimental Milestone Summary")
 
120
  gr.DataFrame(get_experimental_summary())
121
 
122
+ # Connectivity
123
+ load_btn.click(get_sample_logic, inputs=speaker_input, outputs=[current_audio_path, gt_box, meta_display])
124
+ whisper_btn.click(run_whisper_step, inputs=current_audio_path, outputs=[w_raw, w_norm])
125
+ model_btn.click(run_model_step, inputs=[current_audio_path, w_norm], outputs=final_out)
126
 
127
  demo.launch()