AvtnshM committed · Commit 20a9846 · verified · 1 Parent(s): cb5ddd8
Files changed (1)
  1. app.py +101 -40
app.py CHANGED
@@ -57,7 +57,6 @@ def load_model_and_processor(model_name):
     except Exception as e:
         return None, None, f"Error loading model: {str(e)}"
 
-
 # Compute metrics (WER, CER, RTF)
 def compute_metrics(reference, hypothesis, audio_duration, total_time):
     if not reference or not hypothesis:
@@ -72,13 +71,15 @@ def compute_metrics(reference, hypothesis, audio_duration, total_time):
     except Exception:
         return None, None, None, None
 
-
 # Main transcription function
 def transcribe_audio(audio_file, selected_models, reference_text=""):
     if not audio_file:
-        return "Please upload an audio file."
+        return "Please upload an audio file.", []
+
+    if not selected_models:
+        return "Please select at least one model.", []
 
-    results = []
+    table_data = []
     try:
         # Load and preprocess audio once
         audio, sr = librosa.load(audio_file, sr=16000)
@@ -87,7 +88,14 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
         for model_name in selected_models:
             model, processor, model_type = load_model_and_processor(model_name)
             if isinstance(model_type, str) and model_type.startswith("Error"):
-                results.append(f"{model_name}: {model_type}")
+                table_data.append([
+                    model_name,
+                    f"Error: {model_type}",
+                    "-",
+                    "-",
+                    "-",
+                    "-"
+                ])
                 continue
 
             inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
@@ -107,48 +115,101 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
             total_time = time.time() - start_time
 
             # Compute metrics
-            wer_score, cer_score, rtf, total_time_tracked = "", "", "", ""
+            wer_score, cer_score, rtf = "-", "-", "-"
             if reference_text and transcription:
-                wer_score, cer_score, rtf, total_time_tracked = compute_metrics(
+                wer_val, cer_val, rtf_val, _ = compute_metrics(
                     reference_text, transcription, audio_duration, total_time
                 )
-                wer_score = round(wer_score, 3) if wer_score is not None else ""
-                cer_score = round(cer_score, 3) if cer_score is not None else ""
-                rtf = round(rtf, 3) if rtf is not None else ""
-                total_time_tracked = round(total_time_tracked, 2) if total_time_tracked is not None else ""
-
-            result = (
-                f"### {model_name}\n"
-                f"- **Transcription:** {transcription}\n"
-                f"- **WER:** {wer_score}\n"
-                f"- **CER:** {cer_score}\n"
-                f"- **RTF:** {rtf}\n"
-                f"- **Time Taken (s):** {total_time_tracked}\n"
-            )
-            results.append(result)
-
-        return "\n\n".join(results)
+                wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
+                cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"
+                rtf = f"{rtf_val:.3f}" if rtf_val is not None else "-"
+
+            # Add row to table
+            table_data.append([
+                model_name,
+                transcription,
+                wer_score,
+                cer_score,
+                rtf,
+                f"{total_time:.2f}s"
+            ])
+
+        # Create summary text
+        summary = f"**Audio Duration:** {audio_duration:.2f}s\n"
+        summary += f"**Models Tested:** {len(selected_models)}\n"
+        if reference_text:
+            summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
+
+        return summary, table_data
     except Exception as e:
-        return f"Error during transcription: {str(e)}"
+        return f"Error during transcription: {str(e)}", []
 
-
-# Gradio interface
+# Create Gradio interface with blocks for better control
 def create_interface():
     model_choices = list(MODEL_CONFIGS.keys())
-    return gr.Interface(
-        fn=transcribe_audio,
-        inputs=[
-            gr.Audio(type="filepath", label="Upload Audio File (16kHz recommended)"),
-            gr.CheckboxGroup(choices=model_choices, label="Select Models", value=model_choices),
-            gr.Textbox(label="Reference Text (Optional for WER/CER)", placeholder="Enter or paste ground truth text here", lines=8, interactive=True),
-        ],
-        outputs=gr.Markdown(label="Results"),
-        title="Multilingual Speech-to-Text Benchmark",
-        description="Upload an audio file, select one or more models, and optionally provide reference text. The app benchmarks WER, CER, RTF, and Time Taken for each model.",
-        allow_flagging="never",
-    )
-
+
+    with gr.Blocks(title="Multilingual Speech-to-Text Benchmark") as iface:
+        gr.Markdown("""
+        # Multilingual Speech-to-Text Benchmark
+        Upload an audio file, select one or more models, and optionally provide reference text.
+        The app benchmarks WER, CER, RTF, and Time Taken for each model.
+        """)
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                audio_input = gr.Audio(
+                    label="Upload Audio File (16kHz recommended)",
+                    type="filepath"
+                )
+                model_selection = gr.CheckboxGroup(
+                    choices=model_choices,
+                    label="Select Models",
+                    value=[model_choices[0]],  # Default to first model
+                    interactive=True
+                )
+                reference_input = gr.Textbox(
+                    label="Reference Text (Optional for WER/CER)",
+                    placeholder="Enter or paste ground truth text here",
+                    lines=8,
+                    interactive=True,
+                    max_lines=20
+                )
+                submit_btn = gr.Button("Transcribe", variant="primary", size="lg")
+
+            with gr.Column(scale=2):
+                summary_output = gr.Markdown(label="Summary", value="Upload an audio file and select models to begin...")
+
+                results_table = gr.Dataframe(
+                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time Taken"],
+                    datatype=["str", "str", "str", "str", "str", "str"],
+                    label="Results Comparison",
+                    interactive=False,
+                    wrap=True,
+                    column_widths=[150, 400, 80, 80, 80, 100]
+                )
+
+        # Connect the function
+        submit_btn.click(
+            fn=transcribe_audio,
+            inputs=[audio_input, model_selection, reference_input],
+            outputs=[summary_output, results_table]
+        )
+
+        # Also allow triggering on Enter in reference text
+        reference_input.submit(
+            fn=transcribe_audio,
+            inputs=[audio_input, model_selection, reference_input],
+            outputs=[summary_output, results_table]
+        )
+
+    return iface
 
 if __name__ == "__main__":
     iface = create_interface()
-    iface.launch()
+    iface.launch(
+        share=False,
+        debug=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )
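`MODEL_CONFIGS` and the body of `load_model_and_processor` sit outside the hunks above, so only their error branch is visible. A minimal sketch of the shape this diff relies on, with a placeholder model entry (the app's real keys and model IDs are not part of this commit):

```python
# Hypothetical sketch; shapes inferred from the call sites in the diff above.
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

MODEL_CONFIGS = {
    # Placeholder entry, not the app's actual configuration.
    "whisper-small": {"model_id": "openai/whisper-small", "type": "seq2seq"},
}

def load_model_and_processor(model_name):
    try:
        cfg = MODEL_CONFIGS[model_name]
        processor = AutoProcessor.from_pretrained(cfg["model_id"])
        model = AutoModelForSpeechSeq2Seq.from_pretrained(cfg["model_id"])
        return model, processor, cfg["type"]
    except Exception as e:
        # Matches the convention checked in transcribe_audio: a third
        # element starting with "Error" signals a failed load.
        return None, None, f"Error loading model: {str(e)}"
```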
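The body of `compute_metrics` is likewise collapsed; only its guard and `except` branches appear as context. A sketch consistent with those branches and with the four-tuple unpacked at the call site, assuming the `jiwer` package for WER/CER (the app may use a different library):

```python
# Sketch inferred from the visible branches and the call
# site: wer_val, cer_val, rtf_val, _ = compute_metrics(...)
import jiwer

def compute_metrics(reference, hypothesis, audio_duration, total_time):
    if not reference or not hypothesis:
        return None, None, None, None
    try:
        wer_score = jiwer.wer(reference, hypothesis)  # word error rate
        cer_score = jiwer.cer(reference, hypothesis)  # character error rate
        # Real-time factor: processing time relative to audio length.
        rtf = total_time / audio_duration if audio_duration else None
        return wer_score, cer_score, rtf, total_time
    except Exception:
        return None, None, None, None
```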
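New lines 102–114, the per-model inference between the `processor(...)` call and the visible `total_time` line, are also collapsed out of the hunk. A sketch of what typically fills that gap for a Whisper-style seq2seq model; the `input_features` key and the absence of `model_type` branching are assumptions, not the commit's code:

```python
# Assumed shape of the elided inference step.
import time
import torch

start_time = time.time()
with torch.no_grad():
    # Whisper-style processors emit "input_features"; a wav2vec2-style
    # model would take "input_values" and use argmax + CTC decoding instead.
    generated_ids = model.generate(inputs["input_features"])
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
total_time = time.time() - start_time  # the context line visible at new line 115
```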
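Because the handler now returns a `(summary_markdown, table_rows)` pair wired to the two output components, it can be exercised without the UI. A hypothetical smoke test (the audio path and model key are placeholders):

```python
summary, rows = transcribe_audio(
    "sample.wav",            # placeholder audio path
    ["whisper-small"],       # placeholder MODEL_CONFIGS key
    reference_text="hello world",
)
print(summary)
for row in rows:
    print(row)  # [Model, Transcription, WER, CER, RTF, Time Taken]
```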