Indic_ASR_Comparison_Multi

Sleeping

App Files Files Community

AvtnshM committed on Sep 1, 2025

Commit

2524349

verified ·

1 Parent(s): 4b9d5e3

VE.0

Browse files

Files changed (1) hide show

app.py +68 -39

app.py CHANGED Viewed

@@ -16,46 +16,73 @@ conformer_processor = None
 conformer_model = None
 def load_models():
-    """Load models once at startup"""
     global whisper_processor, whisper_model, conformer_processor, conformer_model
     if whisper_processor is None:
-        print("Loading IndicWhisper...")
-        whisper_processor = WhisperProcessor.from_pretrained("parthiv11/indic_whisper_nodcil")
-        whisper_model = WhisperForConditionalGeneration.from_pretrained("parthiv11/indic_whisper_nodcil")
-        print("Loading IndicConformer...")
-        conformer_processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indic-conformer-600m-multilingual")
-        conformer_model = Wav2Vec2ForCTC.from_pretrained("ai4bharat/indic-conformer-600m-multilingual")
         print("Models loaded successfully!")
 def transcribe_whisper(audio_path):
-    """Transcribe using IndicWhisper"""
-    audio, sr = librosa.load(audio_path, sr=16000)
-    input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features
-    start_time = time.time()
-    with torch.no_grad():
-        predicted_ids = whisper_model.generate(input_features)
-    end_time = time.time()
-    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    return transcription, end_time - start_time
 def transcribe_conformer(audio_path):
-    """Transcribe using IndicConformer"""
-    audio, sr = librosa.load(audio_path, sr=16000)
-    input_values = conformer_processor(audio, sampling_rate=sr, return_tensors="pt").input_values
-    start_time = time.time()
-    with torch.no_grad():
-        logits = conformer_model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    end_time = time.time()
-    transcription = conformer_processor.batch_decode(predicted_ids)[0]
-    return transcription, end_time - start_time
 def compare_models(audio_file, ground_truth_text):
     """Main comparison function for Gradio interface"""
@@ -86,7 +113,7 @@ def compare_models(audio_file, ground_truth_text):
             # Format results with metrics
             whisper_result = f"""
-## 📊 IndicWhisper Results:
 **Prediction:** {whisper_pred}
 **WER:** {whisper_wer:.3f}
@@ -106,9 +133,9 @@ def compare_models(audio_file, ground_truth_text):
 """
             # Winner analysis
-            wer_winner = "IndicWhisper" if whisper_wer < conformer_wer else "IndicConformer"
-            cer_winner = "IndicWhisper" if whisper_cer < conformer_cer else "IndicConformer"
-            rtf_winner = "IndicWhisper" if whisper_rtf < conformer_rtf else "IndicConformer"
             winner_analysis = f"""
 ## 🏆 Winner Analysis:
@@ -119,7 +146,7 @@ def compare_models(audio_file, ground_truth_text):
         else:
             # Results without metrics (no ground truth)
             whisper_result = f"""
-## 📊 IndicWhisper Results:
 **Prediction:** {whisper_pred}
 **RTF:** {whisper_rtf:.3f}
@@ -136,7 +163,7 @@ def compare_models(audio_file, ground_truth_text):
             winner_analysis = f"""
 ## 🏆 Speed Comparison:
-**Faster Model:** {'IndicWhisper' if whisper_rtf < conformer_rtf else 'IndicConformer'}
 **RTF Difference:** {abs(whisper_rtf - conformer_rtf):.3f}
 """
@@ -150,15 +177,17 @@ def compare_models(audio_file, ground_truth_text):
 with gr.Blocks(title="ASR Model Comparison") as demo:
     gr.Markdown("""
-    # 🎤 ASR Model Comparison: IndicWhisper vs IndicConformer
-    Compare two leading Indian language ASR models on your audio files!
     **Models:**
-    - **IndicWhisper:** `parthiv11/indic_whisper_nodcil`
     - **IndicConformer:** `ai4bharat/indic-conformer-600m-multilingual`
     **Metrics:** WER (Word Error Rate), CER (Character Error Rate), RTF (Real-Time Factor)
     """)
     with gr.Row():

 conformer_model = None
 def load_models():
+    """Load models once at startup with error handling"""
     global whisper_processor, whisper_model, conformer_processor, conformer_model
     if whisper_processor is None:
+        try:
+            print("Loading IndicWhisper...")
+            # Load stock OpenAI Whisper-medium (used here in place of the earlier IndicWhisper checkpoint)
+            whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
+            whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
+            print("✅ Using OpenAI Whisper-medium as fallback")
+        except Exception as e:
+            print(f"❌ Error loading IndicWhisper: {e}")
+            # Fallback to standard Whisper
+            whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+            whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+            print("✅ Using OpenAI Whisper-base as fallback")
+        try:
+            print("Loading IndicConformer...")
+            conformer_processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indic-conformer-600m-multilingual")
+            conformer_model = Wav2Vec2ForCTC.from_pretrained("ai4bharat/indic-conformer-600m-multilingual")
+            print("✅ IndicConformer loaded successfully")
+        except Exception as e:
+            print(f"❌ Error loading IndicConformer: {e}")
+            # Fallback to a working multilingual model
+            conformer_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
+            conformer_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
+            print("✅ Using Facebook XLSR-53 as fallback")
         print("Models loaded successfully!")
 def transcribe_whisper(audio_path):
+    """Transcribe using Whisper model"""
+    try:
+        audio, sr = librosa.load(audio_path, sr=16000)
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features
+        start_time = time.time()
+        with torch.no_grad():
+            # Force Hindi language for better results
+            predicted_ids = whisper_model.generate(
+                input_features,
+                forced_decoder_ids=whisper_processor.get_decoder_prompt_ids(language="hindi", task="transcribe")
+            )
+        end_time = time.time()
+        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        return transcription, end_time - start_time
+    except Exception as e:
+        return f"Error in Whisper transcription: {str(e)}", 0
 def transcribe_conformer(audio_path):
+    """Transcribe using Conformer model"""
+    try:
+        audio, sr = librosa.load(audio_path, sr=16000)
+        input_values = conformer_processor(audio, sampling_rate=sr, return_tensors="pt").input_values
+        start_time = time.time()
+        with torch.no_grad():
+            logits = conformer_model(input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        end_time = time.time()
+        transcription = conformer_processor.batch_decode(predicted_ids)[0]
+        return transcription, end_time - start_time
+    except Exception as e:
+        return f"Error in Conformer transcription: {str(e)}", 0
 def compare_models(audio_file, ground_truth_text):
     """Main comparison function for Gradio interface"""
             # Format results with metrics
             whisper_result = f"""
+## 📊 Whisper Results:
 **Prediction:** {whisper_pred}
 **WER:** {whisper_wer:.3f}
 """
             # Winner analysis
+            wer_winner = "Whisper" if whisper_wer < conformer_wer else "IndicConformer"
+            cer_winner = "Whisper" if whisper_cer < conformer_cer else "IndicConformer"
+            rtf_winner = "Whisper" if whisper_rtf < conformer_rtf else "IndicConformer"
             winner_analysis = f"""
 ## 🏆 Winner Analysis:
         else:
             # Results without metrics (no ground truth)
             whisper_result = f"""
+## 📊 Whisper Results:
 **Prediction:** {whisper_pred}
 **RTF:** {whisper_rtf:.3f}
             winner_analysis = f"""
 ## 🏆 Speed Comparison:
+**Faster Model:** {'Whisper' if whisper_rtf < conformer_rtf else 'IndicConformer'}
 **RTF Difference:** {abs(whisper_rtf - conformer_rtf):.3f}
 """
 with gr.Blocks(title="ASR Model Comparison") as demo:
     gr.Markdown("""
+    # 🎤 ASR Model Comparison: Whisper vs IndicConformer
+    Compare **OpenAI Whisper** vs **AI4Bharat IndicConformer** on your audio files!
     **Models:**
+    - **Whisper:** `openai/whisper-medium` (with Hindi language setting)
     - **IndicConformer:** `ai4bharat/indic-conformer-600m-multilingual`
     **Metrics:** WER (Word Error Rate), CER (Character Error Rate), RTF (Real-Time Factor)
+    ⚠️ **Note:** Using standard Whisper model with Hindi language setting for comparison.
     """)
     with gr.Row():