Noumida committed on
Commit
3627a6f
·
verified ·
1 Parent(s): 354de8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -41
app.py CHANGED
@@ -3,26 +3,29 @@ import torch
3
  import torchaudio
4
  import gradio as gr
5
  import spaces
6
- from transformers import AutoModel, Wav2Vec2Processor, Wav2Vec2ForCTC
 
7
 
8
  DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
 
11
  # --- Model Loading ---
 
 
12
 
13
- # ASR Model (IndicConformer) - This is a custom model, so AutoModel is appropriate
14
  print("Loading ASR model (IndicConformer)...")
15
  asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
16
  asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
17
  asr_model.eval()
18
  print("✅ ASR Model loaded.")
19
 
20
- # Language Identification (LID) Model - Standard Wav2Vec2 architecture
21
- print("\nLoading Language ID model (MMS-LID)...")
22
- lid_model_id = "facebook/wav2vec2-base-960h-lid" # Using the official LID fine-tune
23
- # The processor bundles the feature extractor and the tokenizer/decoder for this model
24
- lid_processor = Wav2Vec2Processor.from_pretrained(lid_model_id)
25
- lid_model = Wav2Vec2ForCTC.from_pretrained(lid_model_id).to(device)
26
  lid_model.eval()
27
  print("✅ Language ID Model loaded.")
28
 
@@ -30,36 +33,16 @@ print("✅ Language ID Model loaded.")
30
  # --- Language Mappings ---
31
  # Maps the LID model's output code to the ASR model's code
32
  LID_TO_ASR_LANG_MAP = {
33
- "asm": "as", "ben": "bn", "brx": "br", "doi": "doi", "guj": "gu", "hin": "hi",
34
- "kan": "kn", "kas": "ks", "kok": "kok", "mai": "mai", "mal": "ml", "mni": "mni",
35
- "mar": "mr", "nep": "ne", "ori": "or", "pan": "pa", "san": "sa", "sat": "sat",
36
- "snd": "sd", "tam": "ta", "tel": "te", "urd": "ur",
37
- # Adding English as the LID model supports it
38
- "eng": "en"
39
  }
40
 
41
  # Maps the ASR model's code back to a full name for display
42
- ASR_CODE_TO_NAME = {
43
- "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati",
44
- "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili",
45
- "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia",
46
- "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil",
47
- "te": "Telugu", "ur": "Urdu", "en": "English"
48
- }
49
-
50
-
51
- # --- Core Logic Functions ---
52
-
53
- def identify_language(waveform_16k):
54
- """Identifies the language from an audio waveform using the LID model."""
55
- input_values = lid_processor(waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").input_values
56
- with torch.no_grad():
57
- logits = lid_model(input_values.to(device)).logits
58
-
59
- predicted_ids = torch.argmax(logits, dim=-1)
60
- # The 'decode' function for this specific LID model gives the language code
61
- language_code = lid_processor.decode(predicted_ids)
62
- return language_code.strip()
63
 
64
 
65
  @spaces.GPU
@@ -68,6 +51,7 @@ def transcribe_audio_with_lid(audio_path):
68
  return "Please provide an audio file.", "", ""
69
 
70
  try:
 
71
  waveform, sr = torchaudio.load(audio_path)
72
  waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
73
  except Exception as e:
@@ -75,14 +59,18 @@ def transcribe_audio_with_lid(audio_path):
75
 
76
  try:
77
  # 1. --- Language Identification ---
78
- # The LID model's output is a simple language code (e.g., "hin" for Hindi)
79
- detected_lid_code = identify_language(waveform_16k)
 
 
 
 
 
80
 
81
  # 2. --- Map to ASR Language Code ---
82
- # Note: We are simplifying the mapping as the new LID model gives simpler codes
83
- asr_lang_code = detected_lid_code.lower()
84
 
85
- if asr_lang_code not in ASR_CODE_TO_NAME:
86
  detected_lang_str = f"Detected '{detected_lid_code}', which is not supported by the ASR model."
87
  return detected_lang_str, "N/A", "N/A"
88
 
@@ -101,7 +89,7 @@ def transcribe_audio_with_lid(audio_path):
101
  # --- Gradio UI ---
102
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
103
  gr.Markdown(f"## {DESCRIPTION}")
104
- gr.Markdown("Upload or record audio in any of the supported languages. The app will automatically detect the language and provide the transcription.")
105
 
106
  with gr.Row():
107
  with gr.Column(scale=1):
 
3
  import torchaudio
4
  import gradio as gr
5
  import spaces
6
+ # Import the correct AutoModel class for the task
7
+ from transformers import AutoModel, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
8
 
9
  DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
  # --- Model Loading ---
13
+ # NOTE: If running on a Space with a HF_TOKEN secret,
14
+ # transformers will automatically use it for gated models.
15
 
16
+ # ASR Model (IndicConformer)
17
  print("Loading ASR model (IndicConformer)...")
18
  asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
19
  asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
20
  asr_model.eval()
21
  print("✅ ASR Model loaded.")
22
 
23
+ # Language Identification (LID) Model
24
+ print("\nLoading Language ID model (MMS-LID-1024)...")
25
+ lid_model_id = "facebook/mms-lid-1024"
26
+ lid_processor = Wav2Vec2FeatureExtractor.from_pretrained(lid_model_id)
27
+ # Load the model with its audio classification head to get logits
28
+ lid_model = AutoModelForAudioClassification.from_pretrained(lid_model_id).to(device) # <-- THIS LINE IS UPDATED
29
  lid_model.eval()
30
  print("✅ Language ID Model loaded.")
31
 
 
33
  # --- Language Mappings ---
34
  # Maps the LID model's output code to the ASR model's code
35
  LID_TO_ASR_LANG_MAP = {
36
+ "asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
37
+ "guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
38
+ "kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
39
+ "mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
40
+ "pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
41
+ "tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
42
  }
43
 
44
  # Maps the ASR model's code back to a full name for display
45
+ ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  @spaces.GPU
 
51
  return "Please provide an audio file.", "", ""
52
 
53
  try:
54
+ # Load and preprocess audio
55
  waveform, sr = torchaudio.load(audio_path)
56
  waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
57
  except Exception as e:
 
59
 
60
  try:
61
  # 1. --- Language Identification ---
62
+ inputs = lid_processor(waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
63
+ with torch.no_grad():
64
+ outputs = lid_model(**inputs)
65
+
66
+ # This will now work because the output object has the .logits attribute
67
+ predicted_lid_id = outputs.logits.argmax(-1).item()
68
+ detected_lid_code = lid_model.config.id2label[predicted_lid_id]
69
 
70
  # 2. --- Map to ASR Language Code ---
71
+ asr_lang_code = LID_TO_ASR_LANG_MAP.get(detected_lid_code)
 
72
 
73
+ if not asr_lang_code:
74
  detected_lang_str = f"Detected '{detected_lid_code}', which is not supported by the ASR model."
75
  return detected_lang_str, "N/A", "N/A"
76
 
 
89
  # --- Gradio UI ---
90
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
91
  gr.Markdown(f"## {DESCRIPTION}")
92
+ gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription.")
93
 
94
  with gr.Row():
95
  with gr.Column(scale=1):