ASR_New

Runtime error

App Files Community

Noumida committed on Aug 11, 2025

Commit

391a015

verified ·

1 Parent(s): ff42fba

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -48

app.py CHANGED Viewed

@@ -3,25 +3,38 @@ import torch
 import torchaudio
 import gradio as gr
 import spaces
-from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
-import re
 DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading ASR model (IndicConformer)...")
 asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
 asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
 asr_model.eval()
-print(" ASR Model loaded.")
-print("\nLoading Language ID model (MMS-LID)...")
-lid_model_id = "facebook/mms-lid"
 lid_processor = AutoProcessor.from_pretrained(lid_model_id)
 lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
 lid_model.eval()
-print(" Language ID Model loaded.")
 # --- Language Mappings ---
@@ -37,45 +50,8 @@ LID_TO_ASR_LANG_MAP = {
 # Maps the ASR model's code back to a full name for display
 ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
-import torch
-import torchaudio
-import gradio as gr
-import spaces
-from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
-import re
-DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# --- ASR Model (The one we used before) ---
-print("Loading ASR model (IndicConformer)...")
-asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
-asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
-asr_model.eval()
-print(" ASR Model loaded.")
-# --- Language Identification (LID) Model ---
-print("\nLoading Language ID model (MMS-LID)...")
-lid_model_id = "facebook/mms-lid"
-lid_processor = AutoProcessor.from_pretrained(lid_model_id)
-lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
-lid_model.eval()
-print(" Language ID Model loaded.")
-# --- Language Mappings ---
-# Maps the LID model's output code to the ASR model's code
-LID_TO_ASR_LANG_MAP = {
-    "asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
-    "guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
-    "kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
-    "mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
-    "pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
-    "tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
-}
-# Maps the ASR model's code back to a full name for display
-ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
 @spaces.GPU
 def transcribe_audio_with_lid(audio_path):
     if not audio_path:
@@ -84,7 +60,6 @@ def transcribe_audio_with_lid(audio_path):
     try:
         # Load and preprocess audio
         waveform, sr = torchaudio.load(audio_path)
-        # Resample for both models
         waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
     except Exception as e:
         return f"Error loading audio: {e}", "", ""
@@ -95,9 +70,7 @@ def transcribe_audio_with_lid(audio_path):
         with torch.no_grad():
             outputs = lid_model(**inputs)
-        # Get the top predicted language ID from the LID model
         predicted_lid_id = outputs.logits.argmax(-1).item()
-        # The model.config.id2label gives us the language code like "hin_Deva"
         detected_lid_code = lid_model.config.id2label[predicted_lid_id]
         # 2. --- Map to ASR Language Code ---
@@ -111,7 +84,6 @@ def transcribe_audio_with_lid(audio_path):
         # 3. --- Transcription using the detected language ---
         with torch.no_grad():
-            # Use the ASR model with the correctly identified language code
             transcription_ctc = asr_model(waveform_16k.to(device), asr_lang_code, "ctc")
             transcription_rnnt = asr_model(waveform_16k.to(device), asr_lang_code, "rnnt")
@@ -120,7 +92,7 @@ def transcribe_audio_with_lid(audio_path):
     return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
-# Gradio UI (no major changes needed here)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"## {DESCRIPTION}")
     gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription.")

 import torchaudio
 import gradio as gr
 import spaces
+from transformers import AutoModel, AutoProcessor
+from huggingface_hub import login
+from google.colab import userdata # Or use os.environ if not in Colab
 DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- Authentication Step ---
+try:
+    # Fetches the token from secrets (e.g., in Colab or Hugging Face Spaces)
+    HF_TOKEN = userdata.get('HF_TOKEN')
+    login(token=HF_TOKEN)
+    print("✅ Successfully logged into Hugging Face Hub.")
+except Exception as e:
+    print(f"⚠️ Could not log into Hugging Face Hub. Please ensure HF_TOKEN is set correctly. Error: {e}")
+# --- Model Loading ---
+# ASR Model (IndicConformer)
 print("Loading ASR model (IndicConformer)...")
 asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
 asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
 asr_model.eval()
+print("✅ ASR Model loaded.")
+# Language Identification (LID) Model - Updated to the user-specified version
+print("\nLoading Language ID model (MMS-LID-1024)...")
+lid_model_id = "facebook/mms-lid-1024" # <-- THIS LINE HAS BEEN UPDATED
 lid_processor = AutoProcessor.from_pretrained(lid_model_id)
 lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
 lid_model.eval()
+print("✅ Language ID Model loaded.")
 # --- Language Mappings ---
 # Maps the ASR model's code back to a full name for display
 ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
 @spaces.GPU
 def transcribe_audio_with_lid(audio_path):
     if not audio_path:
     try:
         # Load and preprocess audio
         waveform, sr = torchaudio.load(audio_path)
         waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
     except Exception as e:
         return f"Error loading audio: {e}", "", ""
         with torch.no_grad():
             outputs = lid_model(**inputs)
         predicted_lid_id = outputs.logits.argmax(-1).item()
         detected_lid_code = lid_model.config.id2label[predicted_lid_id]
         # 2. --- Map to ASR Language Code ---
         # 3. --- Transcription using the detected language ---
         with torch.no_grad():
             transcription_ctc = asr_model(waveform_16k.to(device), asr_lang_code, "ctc")
             transcription_rnnt = asr_model(waveform_16k.to(device), asr_lang_code, "rnnt")
     return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
+# --- Gradio UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"## {DESCRIPTION}")
     gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription.")