AvtnshM committed on
Commit
d282f41
·
verified ·
1 Parent(s): eaef30e
Files changed (1) hide show
  1. app.py +56 -15
app.py CHANGED
@@ -8,6 +8,7 @@ from transformers import (
8
  AutoModel,
9
  WhisperProcessor,
10
  WhisperForConditionalGeneration,
 
11
  )
12
  import librosa
13
  import numpy as np
@@ -19,32 +20,32 @@ LANGUAGE_CONFIGS = {
19
  "Hindi (हिंदी)": {
20
  "code": "hi",
21
  "script": "Devanagari",
22
- "models": ["AudioX-North", "IndicConformer", "MMS"]
23
  },
24
  "Gujarati (ગુજરાતી)": {
25
  "code": "gu",
26
  "script": "Gujarati",
27
- "models": ["AudioX-North", "IndicConformer", "MMS"]
28
  },
29
  "Marathi (मराठी)": {
30
  "code": "mr",
31
  "script": "Devanagari",
32
- "models": ["AudioX-North", "IndicConformer", "MMS"]
33
  },
34
  "Tamil (தமிழ்)": {
35
  "code": "ta",
36
  "script": "Tamil",
37
- "models": ["AudioX-South", "IndicConformer", "MMS"]
38
  },
39
  "Telugu (తెలుగు)": {
40
  "code": "te",
41
  "script": "Telugu",
42
- "models": ["AudioX-South", "IndicConformer", "MMS"]
43
  },
44
  "Kannada (ಕನ್ನಡ)": {
45
  "code": "kn",
46
  "script": "Kannada",
47
- "models": ["AudioX-South", "IndicConformer", "MMS"]
48
  }
49
  }
50
 
@@ -75,6 +76,13 @@ MODEL_CONFIGS = {
75
  "description": "Supports 1,400+ languages",
76
  "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
77
  },
 
 
 
 
 
 
 
78
  }
79
 
80
  # Load model and processor
@@ -111,6 +119,17 @@ def load_model_and_processor(model_name):
111
  model = AutoModelForCTC.from_pretrained(repo)
112
  processor = AutoProcessor.from_pretrained(repo)
113
  return model, processor, model_type
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  except Exception as e:
116
  return None, None, f"Error loading model: {str(e)}"
@@ -199,6 +218,27 @@ def transcribe_audio(audio_file, selected_language, selected_models, reference_t
199
  )
200
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  else: # MMS
203
  # Standard CTC processing for MMS
204
  inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
@@ -298,7 +338,7 @@ def create_interface():
298
 
299
  # Dynamic model selection based on language
300
  model_selection = gr.CheckboxGroup(
301
- choices=["AudioX-North", "IndicConformer", "MMS"],
302
  label="🤖 Select Models",
303
  value=["AudioX-North", "IndicConformer"],
304
  interactive=True
@@ -387,20 +427,21 @@ def create_interface():
387
  ---
388
  ### 🔤 Language & Model Support Matrix
389
 
390
- | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
391
- |----------|---------|-------------|-------------|---------------|-----|
392
- | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ |
393
- | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ |
394
- | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ |
395
- | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ |
396
- | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ |
397
- | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ |
398
 
399
  ### 💡 Tips:
400
  - **Models auto-filter** based on selected language
401
  - **Reference Text**: Enable WER/CER calculation by providing ground truth
402
  - **Copy Results**: Export formatted results using the copy button
403
  - **Best Performance**: Use AudioX models for their specialized languages
 
404
  """)
405
 
406
  return iface
 
8
  AutoModel,
9
  WhisperProcessor,
10
  WhisperForConditionalGeneration,
11
+ pipeline,
12
  )
13
  import librosa
14
  import numpy as np
 
20
  "Hindi (हिंदी)": {
21
  "code": "hi",
22
  "script": "Devanagari",
23
+ "models": ["AudioX-North", "IndicConformer", "MMS", "Shuka"]
24
  },
25
  "Gujarati (ગુજરાતી)": {
26
  "code": "gu",
27
  "script": "Gujarati",
28
+ "models": ["AudioX-North", "IndicConformer", "MMS", "Shuka"]
29
  },
30
  "Marathi (मराठी)": {
31
  "code": "mr",
32
  "script": "Devanagari",
33
+ "models": ["AudioX-North", "IndicConformer", "MMS", "Shuka"]
34
  },
35
  "Tamil (தமிழ்)": {
36
  "code": "ta",
37
  "script": "Tamil",
38
+ "models": ["AudioX-South", "IndicConformer", "MMS", "Shuka"]
39
  },
40
  "Telugu (తెలుగు)": {
41
  "code": "te",
42
  "script": "Telugu",
43
+ "models": ["AudioX-South", "IndicConformer", "MMS", "Shuka"]
44
  },
45
  "Kannada (ಕನ್ನಡ)": {
46
  "code": "kn",
47
  "script": "Kannada",
48
+ "models": ["AudioX-South", "IndicConformer", "MMS", "Shuka"]
49
  }
50
  }
51
 
 
76
  "description": "Supports 1,400+ languages",
77
  "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
78
  },
79
+ "Shuka": {
80
+ "repo": "sarvamai/shuka_v1",
81
+ "model_type": "audio_llm",
82
+ "description": "Audio-LLM for Indic languages (transcription mode)",
83
+ "trust_remote_code": True,
84
+ "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur", "en"]
85
+ },
86
  }
87
 
88
  # Load model and processor
 
119
  model = AutoModelForCTC.from_pretrained(repo)
120
  processor = AutoProcessor.from_pretrained(repo)
121
  return model, processor, model_type
122
+
123
+ elif model_name == "Shuka":
124
+ # Load Shuka using pipeline for easier handling
125
+ print(f"Loading {model_name}... (this may take a few minutes)")
126
+ pipe = pipeline(
127
+ model=repo,
128
+ trust_remote_code=True,
129
+ device=0 if torch.cuda.is_available() else -1,
130
+ torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
131
+ )
132
+ return pipe, None, model_type
133
 
134
  except Exception as e:
135
  return None, None, f"Error loading model: {str(e)}"
 
218
  )
219
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
220
 
221
+ elif model_name == "Shuka":
222
+ # Shuka Audio-LLM processing in transcription mode
223
+ turns = [
224
+ {'role': 'system', 'content': 'You are a precise transcription assistant. Transcribe the audio exactly as spoken, maintaining original language and format. Do not translate, summarize, or add explanations - only provide the exact spoken text.'},
225
+ {'role': 'user', 'content': '<|audio|>'}
226
+ ]
227
+
228
+ result = model({
229
+ 'audio': audio,
230
+ 'turns': turns,
231
+ 'sampling_rate': 16000
232
+ }, max_new_tokens=512)
233
+
234
+ # Extract transcription from result
235
+ if isinstance(result, list) and len(result) > 0:
236
+ transcription = result[0].get('generated_text', '').strip()
237
+ elif isinstance(result, dict):
238
+ transcription = result.get('generated_text', '').strip()
239
+ else:
240
+ transcription = str(result).strip()
241
+
242
  else: # MMS
243
  # Standard CTC processing for MMS
244
  inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
 
338
 
339
  # Dynamic model selection based on language
340
  model_selection = gr.CheckboxGroup(
341
+ choices=["AudioX-North", "IndicConformer", "MMS", "Shuka"],
342
  label="🤖 Select Models",
343
  value=["AudioX-North", "IndicConformer"],
344
  interactive=True
 
427
  ---
428
  ### 🔤 Language & Model Support Matrix
429
 
430
+ | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS | Shuka |
431
+ |----------|---------|-------------|-------------|---------------|-----|--------|
432
+ | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ | ✅ |
433
+ | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ | ✅ |
434
+ | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ | ✅ |
435
+ | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ | ✅ |
436
+ | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ | ✅ |
437
+ | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ | ✅ |
438
 
439
  ### 💡 Tips:
440
  - **Models auto-filter** based on selected language
441
  - **Reference Text**: Enable WER/CER calculation by providing ground truth
442
  - **Copy Results**: Export formatted results using the copy button
443
  - **Best Performance**: Use AudioX models for their specialized languages
444
+ - **Shuka Model**: Audio-LLM in transcription mode (may take longer to load)
445
  """)
446
 
447
  return iface