AvtnshM committed on
Commit
3019682
·
verified ·
1 Parent(s): bfd49db

Reverting back to V22

Browse files
Files changed (1) hide show
  1. app.py +15 -56
app.py CHANGED
@@ -8,7 +8,6 @@ from transformers import (
8
  AutoModel,
9
  WhisperProcessor,
10
  WhisperForConditionalGeneration,
11
- pipeline,
12
  )
13
  import librosa
14
  import numpy as np
@@ -20,32 +19,32 @@ LANGUAGE_CONFIGS = {
20
  "Hindi (हिंदी)": {
21
  "code": "hi",
22
  "script": "Devanagari",
23
- "models": ["AudioX-North", "IndicConformer", "MMS", "Shuka"]
24
  },
25
  "Gujarati (ગુજરાતી)": {
26
  "code": "gu",
27
  "script": "Gujarati",
28
- "models": ["AudioX-North", "IndicConformer", "MMS", "Shuka"]
29
  },
30
  "Marathi (मराठी)": {
31
  "code": "mr",
32
  "script": "Devanagari",
33
- "models": ["AudioX-North", "IndicConformer", "MMS", "Shuka"]
34
  },
35
  "Tamil (தமிழ்)": {
36
  "code": "ta",
37
  "script": "Tamil",
38
- "models": ["AudioX-South", "IndicConformer", "MMS", "Shuka"]
39
  },
40
  "Telugu (తెలుగు)": {
41
  "code": "te",
42
  "script": "Telugu",
43
- "models": ["AudioX-South", "IndicConformer", "MMS", "Shuka"]
44
  },
45
  "Kannada (ಕನ್ನಡ)": {
46
  "code": "kn",
47
  "script": "Kannada",
48
- "models": ["AudioX-South", "IndicConformer", "MMS", "Shuka"]
49
  }
50
  }
51
 
@@ -76,13 +75,6 @@ MODEL_CONFIGS = {
76
  "description": "Supports 1,400+ languages",
77
  "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
78
  },
79
- "Shuka": {
80
- "repo": "sarvamai/shuka_v1",
81
- "model_type": "audio_llm",
82
- "description": "Audio-LLM for Indic languages (transcription mode)",
83
- "trust_remote_code": True,
84
- "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur", "en"]
85
- },
86
  }
87
 
88
  # Load model and processor
@@ -119,17 +111,6 @@ def load_model_and_processor(model_name):
119
  model = AutoModelForCTC.from_pretrained(repo)
120
  processor = AutoProcessor.from_pretrained(repo)
121
  return model, processor, model_type
122
-
123
- elif model_name == "Shuka":
124
- # Load Shuka using pipeline for easier handling
125
- print(f"Loading {model_name}... (this may take a few minutes)")
126
- pipe = pipeline(
127
- model=repo,
128
- trust_remote_code=True,
129
- device=0 if torch.cuda.is_available() else -1,
130
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
131
- )
132
- return pipe, None, model_type
133
 
134
  except Exception as e:
135
  return None, None, f"Error loading model: {str(e)}"
@@ -218,27 +199,6 @@ def transcribe_audio(audio_file, selected_language, selected_models, reference_t
218
  )
219
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
220
 
221
- elif model_name == "Shuka":
222
- # Shuka Audio-LLM processing in transcription mode
223
- turns = [
224
- {'role': 'system', 'content': 'You are a precise transcription assistant. Transcribe the audio exactly as spoken, maintaining original language and format. Do not translate, summarize, or add explanations - only provide the exact spoken text.'},
225
- {'role': 'user', 'content': '<|audio|>'}
226
- ]
227
-
228
- result = model({
229
- 'audio': audio,
230
- 'turns': turns,
231
- 'sampling_rate': 16000
232
- }, max_new_tokens=512)
233
-
234
- # Extract transcription from result
235
- if isinstance(result, list) and len(result) > 0:
236
- transcription = result[0].get('generated_text', '').strip()
237
- elif isinstance(result, dict):
238
- transcription = result.get('generated_text', '').strip()
239
- else:
240
- transcription = str(result).strip()
241
-
242
  else: # MMS
243
  # Standard CTC processing for MMS
244
  inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
@@ -338,7 +298,7 @@ def create_interface():
338
 
339
  # Dynamic model selection based on language
340
  model_selection = gr.CheckboxGroup(
341
- choices=["AudioX-North", "IndicConformer", "MMS", "Shuka"],
342
  label="🤖 Select Models",
343
  value=["AudioX-North", "IndicConformer"],
344
  interactive=True
@@ -427,21 +387,20 @@ def create_interface():
427
  ---
428
  ### 🔤 Language & Model Support Matrix
429
 
430
- | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS | Shuka |
431
- |----------|---------|-------------|-------------|---------------|-----|--------|
432
- | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ | ✅ |
433
- | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ | ✅ |
434
- | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ | ✅ |
435
- | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ | ✅ |
436
- | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ | ✅ |
437
- | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ | ✅ |
438
 
439
  ### 💡 Tips:
440
  - **Models auto-filter** based on selected language
441
  - **Reference Text**: Enable WER/CER calculation by providing ground truth
442
  - **Copy Results**: Export formatted results using the copy button
443
  - **Best Performance**: Use AudioX models for their specialized languages
444
- - **Shuka Model**: Audio-LLM in transcription mode (may take longer to load)
445
  """)
446
 
447
  return iface
 
8
  AutoModel,
9
  WhisperProcessor,
10
  WhisperForConditionalGeneration,
 
11
  )
12
  import librosa
13
  import numpy as np
 
19
  "Hindi (हिंदी)": {
20
  "code": "hi",
21
  "script": "Devanagari",
22
+ "models": ["AudioX-North", "IndicConformer", "MMS"]
23
  },
24
  "Gujarati (ગુજરાતી)": {
25
  "code": "gu",
26
  "script": "Gujarati",
27
+ "models": ["AudioX-North", "IndicConformer", "MMS"]
28
  },
29
  "Marathi (मराठी)": {
30
  "code": "mr",
31
  "script": "Devanagari",
32
+ "models": ["AudioX-North", "IndicConformer", "MMS"]
33
  },
34
  "Tamil (தமிழ்)": {
35
  "code": "ta",
36
  "script": "Tamil",
37
+ "models": ["AudioX-South", "IndicConformer", "MMS"]
38
  },
39
  "Telugu (తెలుగు)": {
40
  "code": "te",
41
  "script": "Telugu",
42
+ "models": ["AudioX-South", "IndicConformer", "MMS"]
43
  },
44
  "Kannada (ಕನ್ನಡ)": {
45
  "code": "kn",
46
  "script": "Kannada",
47
+ "models": ["AudioX-South", "IndicConformer", "MMS"]
48
  }
49
  }
50
 
 
75
  "description": "Supports 1,400+ languages",
76
  "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
77
  },
 
 
 
 
 
 
 
78
  }
79
 
80
  # Load model and processor
 
111
  model = AutoModelForCTC.from_pretrained(repo)
112
  processor = AutoProcessor.from_pretrained(repo)
113
  return model, processor, model_type
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  except Exception as e:
116
  return None, None, f"Error loading model: {str(e)}"
 
199
  )
200
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  else: # MMS
203
  # Standard CTC processing for MMS
204
  inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
 
298
 
299
  # Dynamic model selection based on language
300
  model_selection = gr.CheckboxGroup(
301
+ choices=["AudioX-North", "IndicConformer", "MMS"],
302
  label="🤖 Select Models",
303
  value=["AudioX-North", "IndicConformer"],
304
  interactive=True
 
387
  ---
388
  ### 🔤 Language & Model Support Matrix
389
 
390
+ | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
391
+ |----------|---------|-------------|-------------|---------------|-----|
392
+ | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ |
393
+ | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ |
394
+ | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ |
395
+ | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ |
396
+ | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ |
397
+ | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ |
398
 
399
  ### 💡 Tips:
400
  - **Models auto-filter** based on selected language
401
  - **Reference Text**: Enable WER/CER calculation by providing ground truth
402
  - **Copy Results**: Export formatted results using the copy button
403
  - **Best Performance**: Use AudioX models for their specialized languages
 
404
  """)
405
 
406
  return iface