fischerman committed on
Commit
94bdf88
·
verified ·
1 Parent(s): 609956c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -195
app.py CHANGED
@@ -15,71 +15,55 @@ from jiwer import wer, cer
15
  import time
16
 
17
  # Language configurations
 
18
  LANGUAGE_CONFIGS = {
19
  "Hindi (हिंदी)": {
20
  "code": "hi",
21
  "script": "Devanagari",
22
- "models": ["AudioX-North", "IndicConformer", "MMS"]
23
  },
24
  "Gujarati (ગુજરાતી)": {
25
- "code": "gu",
26
  "script": "Gujarati",
27
- "models": ["AudioX-North", "IndicConformer", "MMS"]
28
  },
29
  "Marathi (मराठी)": {
30
  "code": "mr",
31
- "script": "Devanagari",
32
- "models": ["AudioX-North", "IndicConformer", "MMS"]
33
  },
34
  "Tamil (தமிழ்)": {
35
  "code": "ta",
36
  "script": "Tamil",
37
- "models": ["AudioX-South", "IndicConformer", "MMS"]
38
  },
39
  "Telugu (తెలుగు)": {
40
  "code": "te",
41
  "script": "Telugu",
42
- "models": ["AudioX-South", "IndicConformer", "MMS"]
43
  },
44
  "Kannada (ಕನ್ನಡ)": {
45
  "code": "kn",
46
  "script": "Kannada",
47
- "models": ["AudioX-South", "IndicConformer", "MMS"]
48
  },
49
  "Malayalam (മലയാളം)": {
50
  "code": "ml",
51
  "script": "Malayalam",
52
- "models": ["AudioX-South", "IndicConformer", "MMS"]
53
  }
54
  }
55
 
56
  # Model configurations
 
57
  MODEL_CONFIGS = {
58
- "AudioX-North": {
59
- "repo": "jiviai/audioX-north-v1",
60
- "model_type": "whisper",
61
- "description": "Supports Hindi, Gujarati, Marathi",
62
- "languages": ["hi", "gu", "mr"]
63
- },
64
- "AudioX-South": {
65
- "repo": "jiviai/audioX-south-v1",
66
- "model_type": "whisper",
67
- "description": "Supports Tamil, Telugu, Kannada, Malayalam",
68
- "languages": ["ta", "te", "kn", "ml"]
69
- },
70
  "IndicConformer": {
71
  "repo": "ai4bharat/indic-conformer-600m-multilingual",
72
  "model_type": "ctc_rnnt",
73
  "description": "Supports 22 Indian languages",
74
  "trust_remote_code": True,
75
  "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"]
76
- },
77
- "MMS": {
78
- "repo": "facebook/mms-1b-all",
79
- "model_type": "ctc",
80
- "description": "Supports 1,400+ languages",
81
- "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
82
- },
83
  }
84
 
85
  # Load model and processor
@@ -87,14 +71,12 @@ def load_model_and_processor(model_name):
87
  config = MODEL_CONFIGS[model_name]
88
  repo = config["repo"]
89
  model_type = config["model_type"]
90
- trust_remote_code = config.get("trust_remote_code", False)
91
-
92
  try:
93
  if model_name == "IndicConformer":
94
  print(f"Loading {model_name}...")
95
  try:
96
  model = AutoModel.from_pretrained(
97
- repo,
98
  trust_remote_code=True,
99
  torch_dtype=torch.float32,
100
  low_cpu_mem_usage=True
@@ -104,19 +86,6 @@ def load_model_and_processor(model_name):
104
  model = AutoModel.from_pretrained(repo, trust_remote_code=True)
105
  processor = None
106
  return model, processor, model_type
107
-
108
- elif model_name in ["AudioX-North", "AudioX-South"]:
109
- # Use Whisper processor and model for AudioX variants
110
- processor = WhisperProcessor.from_pretrained(repo)
111
- model = WhisperForConditionalGeneration.from_pretrained(repo)
112
- model.config.forced_decoder_ids = None
113
- return model, processor, model_type
114
-
115
- elif model_name == "MMS":
116
- model = AutoModelForCTC.from_pretrained(repo)
117
- processor = AutoProcessor.from_pretrained(repo)
118
- return model, processor, model_type
119
-
120
  except Exception as e:
121
  return None, None, f"Error loading model: {str(e)}"
122
 
@@ -138,120 +107,94 @@ def compute_metrics(reference, hypothesis, audio_duration, total_time):
138
  def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
139
  if not audio_file:
140
  return "Please upload an audio file.", [], ""
141
-
142
  if not selected_models:
143
  return "Please select at least one model.", [], ""
144
-
145
  if not selected_language:
146
  return "Please select a language.", [], ""
147
 
148
  # Get language info
149
  lang_info = LANGUAGE_CONFIGS[selected_language]
150
  lang_code = lang_info["code"]
151
-
152
  table_data = []
153
  try:
154
  # Load and preprocess audio once
155
  audio, sr = librosa.load(audio_file, sr=16000)
156
  audio_duration = len(audio) / sr
157
-
158
- for model_name in selected_models:
159
- # Check if model supports the selected language
160
- if model_name.replace("AudioX-", "AudioX-") not in lang_info["models"]:
161
- table_data.append([
162
- model_name,
163
- f"Language {selected_language} not supported by this model",
164
- "-", "-", "-", "-"
165
- ])
166
- continue
167
-
168
- model, processor, model_type = load_model_and_processor(model_name)
169
- if isinstance(model_type, str) and model_type.startswith("Error"):
170
- table_data.append([
171
- model_name,
172
- f"Error: {model_type}",
173
- "-", "-", "-", "-"
174
- ])
175
- continue
176
-
177
- start_time = time.time()
178
 
179
- try:
180
- if model_name == "IndicConformer":
181
- # AI4Bharat specific processing
182
- wav = torch.from_numpy(audio).unsqueeze(0)
183
- if torch.max(torch.abs(wav)) > 0:
184
- wav = wav / torch.max(torch.abs(wav))
185
-
186
- with torch.no_grad():
187
- transcription = model(wav, lang_code, "rnnt")
188
- if isinstance(transcription, list):
189
- transcription = transcription[0] if transcription else ""
190
- transcription = str(transcription).strip()
191
-
192
- elif model_name in ["AudioX-North", "AudioX-South"]:
193
- # AudioX Whisper-based processing
194
- if sr != 16000:
195
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
196
-
197
- input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
198
-
199
- with torch.no_grad():
200
- predicted_ids = model.generate(
201
- input_features,
202
- task="transcribe",
203
- language=lang_code
204
- )
205
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
206
-
207
- else: # MMS
208
- # Standard CTC processing for MMS
209
- inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
210
-
211
- with torch.no_grad():
212
- input_values = inputs["input_values"]
213
- logits = model(input_values).logits
214
- predicted_ids = torch.argmax(logits, dim=-1)
215
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
216
-
217
- except Exception as e:
218
- transcription = f"Processing error: {str(e)}"
219
-
220
- total_time = time.time() - start_time
221
-
222
- # Compute metrics
223
- wer_score, cer_score, rtf = "-", "-", "-"
224
- if reference_text and transcription and not transcription.startswith("Processing error"):
225
- wer_val, cer_val, rtf_val, _ = compute_metrics(
226
- reference_text, transcription, audio_duration, total_time
227
- )
228
- wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
229
- cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"
230
- rtf = f"{rtf_val:.3f}" if rtf_val is not None else "-"
231
-
232
- # Add row to table
233
  table_data.append([
234
  model_name,
235
- transcription,
236
- wer_score,
237
- cer_score,
238
- rtf,
239
- f"{total_time:.2f}s"
240
  ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  # Create summary text
243
  summary = f"**Language:** {selected_language} ({lang_code})\n"
244
  summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
245
- summary += f"**Models Tested:** {len(selected_models)}\n"
246
  if reference_text:
247
  summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
248
-
249
  # Create copyable text output
250
  copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*55 + "\n\n"
251
  copyable_text += f"Language: {selected_language} ({lang_code})\n"
252
  copyable_text += f"Script: {lang_info['script']}\n"
253
  copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
254
- copyable_text += f"Models Tested: {len(selected_models)}\n"
255
  if reference_text:
256
  copyable_text += f"Reference Text: {reference_text}\n"
257
  copyable_text += "\n" + "-"*55 + "\n\n"
@@ -264,8 +207,9 @@ def transcribe_audio(audio_file, selected_language, selected_models, reference_t
264
  copyable_text += f"RTF: {row[4]}\n"
265
  copyable_text += f"Time Taken: {row[5]}\n"
266
  copyable_text += "\n" + "-"*35 + "\n\n"
267
-
268
  return summary, table_data, copyable_text
 
269
  except Exception as e:
270
  error_msg = f"Error during transcription: {str(e)}"
271
  return error_msg, [], error_msg
@@ -273,19 +217,17 @@ def transcribe_audio(audio_file, selected_language, selected_models, reference_t
273
  # Create Gradio interface
274
  def create_interface():
275
  language_choices = list(LANGUAGE_CONFIGS.keys())
276
-
277
  with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
278
  .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
279
  .copy-area { font-family: monospace; font-size: 12px; }
280
  """) as iface:
281
  gr.Markdown("""
282
  # 🌐 Multilingual Speech-to-Text Benchmark
283
-
284
- Compare ASR models across **7 Indian Languages** with comprehensive metrics.
285
-
286
- **Supported Languages:** Hindi, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam
287
  """)
288
-
289
  with gr.Row():
290
  with gr.Column(scale=1):
291
  # Language selection
@@ -295,35 +237,35 @@ def create_interface():
295
  value=language_choices[0],
296
  interactive=True
297
  )
298
-
299
  audio_input = gr.Audio(
300
- label="📹 Upload Audio File (16kHz recommended)",
301
  type="filepath"
302
  )
303
-
304
- # Dynamic model selection based on language
305
  model_selection = gr.CheckboxGroup(
306
- choices=["AudioX-North", "IndicConformer", "MMS"],
307
  label="🤖 Select Models",
308
- value=["AudioX-North", "IndicConformer"],
309
- interactive=True
310
  )
311
-
312
  reference_input = gr.Textbox(
313
  label="📄 Reference Text (optional, paste supported)",
314
  placeholder="Paste reference transcription here...",
315
  lines=4,
316
  interactive=True
317
  )
318
-
319
  submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
320
-
321
  with gr.Column(scale=2):
322
  summary_output = gr.Markdown(
323
- label="📊 Summary",
324
  value="Select language, upload audio file and choose models to begin..."
325
  )
326
-
327
  results_table = gr.Dataframe(
328
  headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
329
  datatype=["str", "str", "str", "str", "str", "str"],
@@ -332,7 +274,7 @@ def create_interface():
332
  wrap=True,
333
  column_widths=[120, 350, 60, 60, 60, 80]
334
  )
335
-
336
  # Copyable results section
337
  with gr.Group():
338
  gr.Markdown("### 📋 Export Results")
@@ -346,70 +288,40 @@ def create_interface():
346
  placeholder="Benchmark results will appear here..."
347
  )
348
 
349
- # Update model choices based on language selection
350
- def update_model_choices(selected_language):
351
- if not selected_language:
352
- return gr.CheckboxGroup(choices=[], value=[])
353
-
354
- lang_info = LANGUAGE_CONFIGS[selected_language]
355
- available_models = lang_info["models"]
356
-
357
- # Map display names
358
- model_map = {
359
- "AudioX-North": "AudioX-North",
360
- "AudioX-South": "AudioX-South",
361
- "IndicConformer": "IndicConformer",
362
- "MMS": "MMS"
363
- }
364
-
365
- available_choices = [model_map[model] for model in available_models if model in model_map]
366
- default_selection = available_choices[:2] if len(available_choices) >= 2 else available_choices
367
-
368
- return gr.CheckboxGroup(choices=available_choices, value=default_selection)
369
-
370
- # Connect language selection to model updates
371
- language_selection.change(
372
- fn=update_model_choices,
373
- inputs=[language_selection],
374
- outputs=[model_selection]
375
- )
376
-
377
  # Connect the main function
378
  submit_btn.click(
379
  fn=transcribe_audio,
380
  inputs=[audio_input, language_selection, model_selection, reference_input],
381
  outputs=[summary_output, results_table, copyable_output]
382
  )
383
-
384
  reference_input.submit(
385
  fn=transcribe_audio,
386
  inputs=[audio_input, language_selection, model_selection, reference_input],
387
  outputs=[summary_output, results_table, copyable_output]
388
  )
389
-
390
  # Language information display
391
  gr.Markdown("""
392
  ---
393
  ### 📤 Language & Model Support Matrix
394
 
395
- | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
396
- |----------|---------|-------------|-------------|---------------|-----|
397
- | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ |
398
- | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ |
399
- | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ |
400
- | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ |
401
- | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ |
402
- | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ |
403
- | Malayalam | Malayalam | ❌ | ✅ | ✅ | ✅ |
404
 
405
  ### 💡 Tips:
406
- - **Models auto-filter** based on selected language
407
- - **Reference Text**: Enable WER/CER calculation by providing ground truth
408
- - **Copy Results**: Export formatted results using the copy button
409
- - **Best Performance**: Use AudioX models for their specialized languages
410
  """)
411
-
412
- return iface
413
 
414
  if __name__ == "__main__":
415
  iface = create_interface()
 
15
  import time
16
 
17
# Language configurations
# Simplified to only include IndicConformer. Each entry maps a display
# name (with the language's native-script spelling) to:
#   code   - ISO 639-1 language code passed to the ASR model
#   script - writing system, shown in the benchmark report
#   models - models that support this language (now always IndicConformer)
LANGUAGE_CONFIGS = {
    "Hindi (हिंदी)": {
        "code": "hi",
        "script": "Devanagari",
        "models": ["IndicConformer"],
    },
    "Gujarati (ગુજરાતી)": {
        "code": "gu",
        "script": "Gujarati",
        "models": ["IndicConformer"],
    },
    "Marathi (मराठी)": {
        "code": "mr",
        "script": "Devanagari",
        "models": ["IndicConformer"],
    },
    "Tamil (தமிழ்)": {
        "code": "ta",
        "script": "Tamil",
        "models": ["IndicConformer"],
    },
    "Telugu (తెలుగు)": {
        "code": "te",
        "script": "Telugu",
        "models": ["IndicConformer"],
    },
    "Kannada (ಕನ್ನಡ)": {
        "code": "kn",
        "script": "Kannada",
        "models": ["IndicConformer"],
    },
    "Malayalam (മലയാളം)": {
        "code": "ml",
        "script": "Malayalam",
        "models": ["IndicConformer"],
    },
}
56
 
57
# Model configurations
# Simplified to only include IndicConformer. Each entry describes how to
# load one ASR model:
#   repo              - Hugging Face Hub repository id
#   model_type        - decoding style expected by the transcription code
#   description       - human-readable capability summary for the UI
#   trust_remote_code - the repo ships custom modeling code on the Hub,
#                       so from_pretrained must opt in to executing it
#   languages         - ISO 639-1 codes the model supports
MODEL_CONFIGS = {
    "IndicConformer": {
        "repo": "ai4bharat/indic-conformer-600m-multilingual",
        "model_type": "ctc_rnnt",
        "description": "Supports 22 Indian languages",
        "trust_remote_code": True,
        "languages": [
            "hi", "gu", "mr", "ta", "te", "kn", "ml",
            "bn", "pa", "or", "as", "ur",
        ],
    },
}
68
 
69
  # Load model and processor
 
71
  config = MODEL_CONFIGS[model_name]
72
  repo = config["repo"]
73
  model_type = config["model_type"]
 
 
74
  try:
75
  if model_name == "IndicConformer":
76
  print(f"Loading {model_name}...")
77
  try:
78
  model = AutoModel.from_pretrained(
79
+ repo,
80
  trust_remote_code=True,
81
  torch_dtype=torch.float32,
82
  low_cpu_mem_usage=True
 
86
  model = AutoModel.from_pretrained(repo, trust_remote_code=True)
87
  processor = None
88
  return model, processor, model_type
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  except Exception as e:
90
  return None, None, f"Error loading model: {str(e)}"
91
 
 
107
  def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
108
  if not audio_file:
109
  return "Please upload an audio file.", [], ""
 
110
  if not selected_models:
111
  return "Please select at least one model.", [], ""
 
112
  if not selected_language:
113
  return "Please select a language.", [], ""
114
 
115
  # Get language info
116
  lang_info = LANGUAGE_CONFIGS[selected_language]
117
  lang_code = lang_info["code"]
118
+
119
  table_data = []
120
  try:
121
  # Load and preprocess audio once
122
  audio, sr = librosa.load(audio_file, sr=16000)
123
  audio_duration = len(audio) / sr
124
+
125
+ # We only use one model now: IndicConformer
126
+ model_name = "IndicConformer"
127
+
128
+ # Check if model supports the selected language
129
+ if model_name not in lang_info["models"]:
130
+ table_data.append([
131
+ model_name,
132
+ f"Language {selected_language} not supported by this model",
133
+ "-", "-", "-", "-"
134
+ ])
135
+ # This part will not be reached due to simplified UI, but kept for robustness
 
 
 
 
 
 
 
 
 
136
 
137
+ model, processor, model_type = load_model_and_processor(model_name)
138
+ if isinstance(model_type, str) and model_type.startswith("Error"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  table_data.append([
140
  model_name,
141
+ f"Error: {model_type}",
142
+ "-", "-", "-", "-"
 
 
 
143
  ])
144
+ return "Error loading model.", [], "" # Exit on model error
145
+
146
+ start_time = time.time()
147
+
148
+ try:
149
+ # AI4Bharat specific processing for IndicConformer
150
+ wav = torch.from_numpy(audio).unsqueeze(0)
151
+ if torch.max(torch.abs(wav)) > 0:
152
+ wav = wav / torch.max(torch.abs(wav))
153
+
154
+ with torch.no_grad():
155
+ transcription = model(wav, lang_code, "rnnt")
156
+ if isinstance(transcription, list):
157
+ transcription = transcription[0] if transcription else ""
158
+ transcription = str(transcription).strip()
159
 
160
+ except Exception as e:
161
+ transcription = f"Processing error: {str(e)}"
162
+
163
+ total_time = time.time() - start_time
164
+
165
+ # Compute metrics
166
+ wer_score, cer_score, rtf = "-", "-", "-"
167
+ if reference_text and transcription and not transcription.startswith("Processing error"):
168
+ wer_val, cer_val, rtf_val, _ = compute_metrics(
169
+ reference_text, transcription, audio_duration, total_time
170
+ )
171
+ wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
172
+ cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"
173
+ rtf = f"{rtf_val:.3f}" if rtf_val is not None else "-"
174
+
175
+ # Add row to table
176
+ table_data.append([
177
+ model_name,
178
+ transcription,
179
+ wer_score,
180
+ cer_score,
181
+ rtf,
182
+ f"{total_time:.2f}s"
183
+ ])
184
+
185
  # Create summary text
186
  summary = f"**Language:** {selected_language} ({lang_code})\n"
187
  summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
188
+ summary += f"**Model Tested:** {model_name}\n"
189
  if reference_text:
190
  summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
191
+
192
  # Create copyable text output
193
  copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*55 + "\n\n"
194
  copyable_text += f"Language: {selected_language} ({lang_code})\n"
195
  copyable_text += f"Script: {lang_info['script']}\n"
196
  copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
197
+ copyable_text += f"Model Tested: {model_name}\n"
198
  if reference_text:
199
  copyable_text += f"Reference Text: {reference_text}\n"
200
  copyable_text += "\n" + "-"*55 + "\n\n"
 
207
  copyable_text += f"RTF: {row[4]}\n"
208
  copyable_text += f"Time Taken: {row[5]}\n"
209
  copyable_text += "\n" + "-"*35 + "\n\n"
210
+
211
  return summary, table_data, copyable_text
212
+
213
  except Exception as e:
214
  error_msg = f"Error during transcription: {str(e)}"
215
  return error_msg, [], error_msg
 
217
  # Create Gradio interface
218
  def create_interface():
219
  language_choices = list(LANGUAGE_CONFIGS.keys())
220
+
221
  with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
222
  .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
223
  .copy-area { font-family: monospace; font-size: 12px; }
224
  """) as iface:
225
  gr.Markdown("""
226
  # 🌐 Multilingual Speech-to-Text Benchmark
227
+
228
+ Using only the **IndicConformer** model for 22 Indian languages.
 
 
229
  """)
230
+
231
  with gr.Row():
232
  with gr.Column(scale=1):
233
  # Language selection
 
237
  value=language_choices[0],
238
  interactive=True
239
  )
240
+
241
  audio_input = gr.Audio(
242
+ label="📹 Upload Audio File (16kHz recommended)",
243
  type="filepath"
244
  )
245
+
246
+ # Model selection is now a fixed checkbox
247
  model_selection = gr.CheckboxGroup(
248
+ choices=["IndicConformer"],
249
  label="🤖 Select Models",
250
+ value=["IndicConformer"],
251
+ interactive=False # Disabled as only one model is used
252
  )
253
+
254
  reference_input = gr.Textbox(
255
  label="📄 Reference Text (optional, paste supported)",
256
  placeholder="Paste reference transcription here...",
257
  lines=4,
258
  interactive=True
259
  )
260
+
261
  submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
262
+
263
  with gr.Column(scale=2):
264
  summary_output = gr.Markdown(
265
+ label="📊 Summary",
266
  value="Select language, upload audio file and choose models to begin..."
267
  )
268
+
269
  results_table = gr.Dataframe(
270
  headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
271
  datatype=["str", "str", "str", "str", "str", "str"],
 
274
  wrap=True,
275
  column_widths=[120, 350, 60, 60, 60, 80]
276
  )
277
+
278
  # Copyable results section
279
  with gr.Group():
280
  gr.Markdown("### 📋 Export Results")
 
288
  placeholder="Benchmark results will appear here..."
289
  )
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  # Connect the main function
292
  submit_btn.click(
293
  fn=transcribe_audio,
294
  inputs=[audio_input, language_selection, model_selection, reference_input],
295
  outputs=[summary_output, results_table, copyable_output]
296
  )
297
+
298
  reference_input.submit(
299
  fn=transcribe_audio,
300
  inputs=[audio_input, language_selection, model_selection, reference_input],
301
  outputs=[summary_output, results_table, copyable_output]
302
  )
303
+
304
  # Language information display
305
  gr.Markdown("""
306
  ---
307
  ### 📤 Language & Model Support Matrix
308
 
309
+ | Language | Script | IndicConformer |
310
+ |----------|---------|---------------|
311
+ | Hindi | Devanagari | ✅ |
312
+ | Gujarati | Gujarati | ✅ |
313
+ | Marathi | Devanagari | ✅ |
314
+ | Tamil | Tamil | ✅ |
315
+ | Telugu | Telugu | ✅ |
316
+ | Kannada | Kannada | ✅ |
317
+ | Malayalam | Malayalam | ✅ |
318
 
319
  ### 💡 Tips:
320
+ - **Model is fixed** to IndicConformer for this app.
321
+ - **Reference Text**: Enable WER/CER calculation by providing ground truth.
322
+ - **Copy Results**: Export formatted results using the copy button.
 
323
  """)
324
+ return iface
 
325
 
326
  if __name__ == "__main__":
327
  iface = create_interface()