Indic_ASR_Comparison_Multi

Sleeping

App Files Files Community

fischerman committed on Sep 26, 2025

Commit

94bdf88

verified ·

1 Parent(s): 609956c

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -195

app.py CHANGED Viewed

@@ -15,71 +15,55 @@ from jiwer import wer, cer
 import time
 # Language configurations
 LANGUAGE_CONFIGS = {
     "Hindi (हिंदी)": {
         "code": "hi",
         "script": "Devanagari",
-        "models": ["AudioX-North", "IndicConformer", "MMS"]
     },
     "Gujarati (ગુજરાતી)": {
-        "code": "gu",
         "script": "Gujarati",
-        "models": ["AudioX-North", "IndicConformer", "MMS"]
     },
     "Marathi (मराठी)": {
         "code": "mr",
-        "script": "Devanagari",
-        "models": ["AudioX-North", "IndicConformer", "MMS"]
     },
     "Tamil (தமிழ்)": {
         "code": "ta",
         "script": "Tamil",
-        "models": ["AudioX-South", "IndicConformer", "MMS"]
     },
     "Telugu (తెలుగు)": {
         "code": "te",
         "script": "Telugu",
-        "models": ["AudioX-South", "IndicConformer", "MMS"]
     },
     "Kannada (ಕನ್ನಡ)": {
         "code": "kn",
         "script": "Kannada",
-        "models": ["AudioX-South", "IndicConformer", "MMS"]
     },
     "Malayalam (മലയാളം)": {
         "code": "ml",
         "script": "Malayalam",
-        "models": ["AudioX-South", "IndicConformer", "MMS"]
     }
 }
 # Model configurations
 MODEL_CONFIGS = {
-    "AudioX-North": {
-        "repo": "jiviai/audioX-north-v1",
-        "model_type": "whisper",
-        "description": "Supports Hindi, Gujarati, Marathi",
-        "languages": ["hi", "gu", "mr"]
-    },
-    "AudioX-South": {
-        "repo": "jiviai/audioX-south-v1",
-        "model_type": "whisper",
-        "description": "Supports Tamil, Telugu, Kannada, Malayalam",
-        "languages": ["ta", "te", "kn", "ml"]
-    },
     "IndicConformer": {
         "repo": "ai4bharat/indic-conformer-600m-multilingual",
         "model_type": "ctc_rnnt",
         "description": "Supports 22 Indian languages",
         "trust_remote_code": True,
         "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"]
-    },
-    "MMS": {
-        "repo": "facebook/mms-1b-all",
-        "model_type": "ctc",
-        "description": "Supports 1,400+ languages",
-        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
-    },
 }
 # Load model and processor
@@ -87,14 +71,12 @@ def load_model_and_processor(model_name):
     config = MODEL_CONFIGS[model_name]
     repo = config["repo"]
     model_type = config["model_type"]
-    trust_remote_code = config.get("trust_remote_code", False)
     try:
         if model_name == "IndicConformer":
             print(f"Loading {model_name}...")
             try:
                 model = AutoModel.from_pretrained(
-                    repo,
                     trust_remote_code=True,
                     torch_dtype=torch.float32,
                     low_cpu_mem_usage=True
@@ -104,19 +86,6 @@ def load_model_and_processor(model_name):
                 model = AutoModel.from_pretrained(repo, trust_remote_code=True)
             processor = None
             return model, processor, model_type
-        elif model_name in ["AudioX-North", "AudioX-South"]:
-            # Use Whisper processor and model for AudioX variants
-            processor = WhisperProcessor.from_pretrained(repo)
-            model = WhisperForConditionalGeneration.from_pretrained(repo)
-            model.config.forced_decoder_ids = None
-            return model, processor, model_type
-        elif model_name == "MMS":
-            model = AutoModelForCTC.from_pretrained(repo)
-            processor = AutoProcessor.from_pretrained(repo)
-            return model, processor, model_type
     except Exception as e:
         return None, None, f"Error loading model: {str(e)}"
@@ -138,120 +107,94 @@ def compute_metrics(reference, hypothesis, audio_duration, total_time):
 def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
     if not audio_file:
         return "Please upload an audio file.", [], ""
     if not selected_models:
         return "Please select at least one model.", [], ""
     if not selected_language:
         return "Please select a language.", [], ""
     # Get language info
     lang_info = LANGUAGE_CONFIGS[selected_language]
     lang_code = lang_info["code"]
     table_data = []
     try:
         # Load and preprocess audio once
         audio, sr = librosa.load(audio_file, sr=16000)
         audio_duration = len(audio) / sr
-        for model_name in selected_models:
-            # Check if model supports the selected language
-            if model_name.replace("AudioX-", "AudioX-") not in lang_info["models"]:
-                table_data.append([
-                    model_name,
-                    f"Language {selected_language} not supported by this model",
-                    "-", "-", "-", "-"
-                ])
-                continue
-            model, processor, model_type = load_model_and_processor(model_name)
-            if isinstance(model_type, str) and model_type.startswith("Error"):
-                table_data.append([
-                    model_name,
-                    f"Error: {model_type}",
-                    "-", "-", "-", "-"
-                ])
-                continue
-            start_time = time.time()
-            try:
-                if model_name == "IndicConformer":
-                    # AI4Bharat specific processing
-                    wav = torch.from_numpy(audio).unsqueeze(0)
-                    if torch.max(torch.abs(wav)) > 0:
-                        wav = wav / torch.max(torch.abs(wav))
-                    with torch.no_grad():
-                        transcription = model(wav, lang_code, "rnnt")
-                        if isinstance(transcription, list):
-                            transcription = transcription[0] if transcription else ""
-                        transcription = str(transcription).strip()
-                elif model_name in ["AudioX-North", "AudioX-South"]:
-                    # AudioX Whisper-based processing
-                    if sr != 16000:
-                        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-                    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
-                    with torch.no_grad():
-                        predicted_ids = model.generate(
-                            input_features,
-                            task="transcribe",
-                            language=lang_code
-                        )
-                        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-                else:  # MMS
-                    # Standard CTC processing for MMS
-                    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
-                    with torch.no_grad():
-                        input_values = inputs["input_values"]
-                        logits = model(input_values).logits
-                        predicted_ids = torch.argmax(logits, dim=-1)
-                        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-            except Exception as e:
-                transcription = f"Processing error: {str(e)}"
-            total_time = time.time() - start_time
-            # Compute metrics
-            wer_score, cer_score, rtf = "-", "-", "-"
-            if reference_text and transcription and not transcription.startswith("Processing error"):
-                wer_val, cer_val, rtf_val, _ = compute_metrics(
-                    reference_text, transcription, audio_duration, total_time
-                )
-                wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
-                cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"
-                rtf = f"{rtf_val:.3f}" if rtf_val is not None else "-"
-            # Add row to table
             table_data.append([
                 model_name,
-                transcription,
-                wer_score,
-                cer_score,
-                rtf,
-                f"{total_time:.2f}s"
             ])
         # Create summary text
         summary = f"**Language:** {selected_language} ({lang_code})\n"
         summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
-        summary += f"**Models Tested:** {len(selected_models)}\n"
         if reference_text:
             summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
         # Create copyable text output
         copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*55 + "\n\n"
         copyable_text += f"Language: {selected_language} ({lang_code})\n"
         copyable_text += f"Script: {lang_info['script']}\n"
         copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
-        copyable_text += f"Models Tested: {len(selected_models)}\n"
         if reference_text:
             copyable_text += f"Reference Text: {reference_text}\n"
         copyable_text += "\n" + "-"*55 + "\n\n"
@@ -264,8 +207,9 @@ def transcribe_audio(audio_file, selected_language, selected_models, reference_t
             copyable_text += f"RTF: {row[4]}\n"
             copyable_text += f"Time Taken: {row[5]}\n"
             copyable_text += "\n" + "-"*35 + "\n\n"
         return summary, table_data, copyable_text
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
         return error_msg, [], error_msg
@@ -273,19 +217,17 @@ def transcribe_audio(audio_file, selected_language, selected_models, reference_t
 # Create Gradio interface
 def create_interface():
     language_choices = list(LANGUAGE_CONFIGS.keys())
     with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
         .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
         .copy-area { font-family: monospace; font-size: 12px; }
     """) as iface:
         gr.Markdown("""
         # 🌐 Multilingual Speech-to-Text Benchmark
-        Compare ASR models across **7 Indian Languages** with comprehensive metrics.
-        **Supported Languages:** Hindi, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Language selection
@@ -295,35 +237,35 @@ def create_interface():
                     value=language_choices[0],
                     interactive=True
                 )
                 audio_input = gr.Audio(
-                    label="📹 Upload Audio File (16kHz recommended)",
                     type="filepath"
                 )
-                # Dynamic model selection based on language
                 model_selection = gr.CheckboxGroup(
-                    choices=["AudioX-North", "IndicConformer", "MMS"],
                     label="🤖 Select Models",
-                    value=["AudioX-North", "IndicConformer"],
-                    interactive=True
                 )
                 reference_input = gr.Textbox(
                     label="📄 Reference Text (optional, paste supported)",
                     placeholder="Paste reference transcription here...",
                     lines=4,
                     interactive=True
                 )
                 submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
             with gr.Column(scale=2):
                 summary_output = gr.Markdown(
-                    label="📊 Summary",
                     value="Select language, upload audio file and choose models to begin..."
                 )
                 results_table = gr.Dataframe(
                     headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
                     datatype=["str", "str", "str", "str", "str", "str"],
@@ -332,7 +274,7 @@ def create_interface():
                     wrap=True,
                     column_widths=[120, 350, 60, 60, 60, 80]
                 )
                 # Copyable results section
                 with gr.Group():
                     gr.Markdown("### 📋 Export Results")
@@ -346,70 +288,40 @@ def create_interface():
                         placeholder="Benchmark results will appear here..."
                     )
-        # Update model choices based on language selection
-        def update_model_choices(selected_language):
-            if not selected_language:
-                return gr.CheckboxGroup(choices=[], value=[])
-            lang_info = LANGUAGE_CONFIGS[selected_language]
-            available_models = lang_info["models"]
-            # Map display names
-            model_map = {
-                "AudioX-North": "AudioX-North",
-                "AudioX-South": "AudioX-South",
-                "IndicConformer": "IndicConformer",
-                "MMS": "MMS"
-            }
-            available_choices = [model_map[model] for model in available_models if model in model_map]
-            default_selection = available_choices[:2] if len(available_choices) >= 2 else available_choices
-            return gr.CheckboxGroup(choices=available_choices, value=default_selection)
-        # Connect language selection to model updates
-        language_selection.change(
-            fn=update_model_choices,
-            inputs=[language_selection],
-            outputs=[model_selection]
-        )
         # Connect the main function
         submit_btn.click(
             fn=transcribe_audio,
             inputs=[audio_input, language_selection, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
         reference_input.submit(
             fn=transcribe_audio,
             inputs=[audio_input, language_selection, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
         # Language information display
         gr.Markdown("""
         ---
         ### 📤 Language & Model Support Matrix
-        | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
-        |----------|---------|-------------|-------------|---------------|-----|
-        | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ |
-        | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ |
-        | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ |
-        | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ |
-        | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ |
-        | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ |
-        | Malayalam | Malayalam | ❌ | ✅ | ✅ | ✅ |
         ### 💡 Tips:
-        - **Models auto-filter** based on selected language
-        - **Reference Text**: Enable WER/CER calculation by providing ground truth
-        - **Copy Results**: Export formatted results using the copy button
-        - **Best Performance**: Use AudioX models for their specialized languages
         """)
-    return iface
 if __name__ == "__main__":
     iface = create_interface()

 import time
 # Language configurations
+# Simplified to only include IndicConformer
 LANGUAGE_CONFIGS = {
     "Hindi (हिंदी)": {
         "code": "hi",
         "script": "Devanagari",
+        "models": ["IndicConformer"]
     },
     "Gujarati (ગુજરાતી)": {
+        "code": "gu",
         "script": "Gujarati",
+        "models": ["IndicConformer"]
     },
     "Marathi (मराठी)": {
         "code": "mr",
+        "script": "Devanagari",
+        "models": ["IndicConformer"]
     },
     "Tamil (தமிழ்)": {
         "code": "ta",
         "script": "Tamil",
+        "models": ["IndicConformer"]
     },
     "Telugu (తెలుగు)": {
         "code": "te",
         "script": "Telugu",
+        "models": ["IndicConformer"]
     },
     "Kannada (ಕನ್ನಡ)": {
         "code": "kn",
         "script": "Kannada",
+        "models": ["IndicConformer"]
     },
     "Malayalam (മലയാളം)": {
         "code": "ml",
         "script": "Malayalam",
+        "models": ["IndicConformer"]
     }
 }
 # Model configurations
+# Simplified to only include IndicConformer
 MODEL_CONFIGS = {
     "IndicConformer": {
         "repo": "ai4bharat/indic-conformer-600m-multilingual",
         "model_type": "ctc_rnnt",
         "description": "Supports 22 Indian languages",
         "trust_remote_code": True,
         "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"]
+    }
 }
 # Load model and processor
     config = MODEL_CONFIGS[model_name]
     repo = config["repo"]
     model_type = config["model_type"]
     try:
         if model_name == "IndicConformer":
             print(f"Loading {model_name}...")
             try:
                 model = AutoModel.from_pretrained(
+                    repo,
                     trust_remote_code=True,
                     torch_dtype=torch.float32,
                     low_cpu_mem_usage=True
                 model = AutoModel.from_pretrained(repo, trust_remote_code=True)
             processor = None
             return model, processor, model_type
     except Exception as e:
         return None, None, f"Error loading model: {str(e)}"
 def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
     if not audio_file:
         return "Please upload an audio file.", [], ""
     if not selected_models:
         return "Please select at least one model.", [], ""
     if not selected_language:
         return "Please select a language.", [], ""
     # Get language info
     lang_info = LANGUAGE_CONFIGS[selected_language]
     lang_code = lang_info["code"]
     table_data = []
     try:
         # Load and preprocess audio once
         audio, sr = librosa.load(audio_file, sr=16000)
         audio_duration = len(audio) / sr
+        # We only use one model now: IndicConformer
+        model_name = "IndicConformer"
+        # Check if model supports the selected language
+        if model_name not in lang_info["models"]:
+            table_data.append([
+                model_name,
+                f"Language {selected_language} not supported by this model",
+                "-", "-", "-", "-"
+            ])
+            # This part will not be reached due to simplified UI, but kept for robustness
+        model, processor, model_type = load_model_and_processor(model_name)
+        if isinstance(model_type, str) and model_type.startswith("Error"):
             table_data.append([
                 model_name,
+                f"Error: {model_type}",
+                "-", "-", "-", "-"
             ])
+            return "Error loading model.", [], "" # Exit on model error
+        start_time = time.time()
+        try:
+            # AI4Bharat specific processing for IndicConformer
+            wav = torch.from_numpy(audio).unsqueeze(0)
+            if torch.max(torch.abs(wav)) > 0:
+                wav = wav / torch.max(torch.abs(wav))
+            with torch.no_grad():
+                transcription = model(wav, lang_code, "rnnt")
+                if isinstance(transcription, list):
+                    transcription = transcription[0] if transcription else ""
+                transcription = str(transcription).strip()
+        except Exception as e:
+            transcription = f"Processing error: {str(e)}"
+        total_time = time.time() - start_time
+        # Compute metrics
+        wer_score, cer_score, rtf = "-", "-", "-"
+        if reference_text and transcription and not transcription.startswith("Processing error"):
+            wer_val, cer_val, rtf_val, _ = compute_metrics(
+                reference_text, transcription, audio_duration, total_time
+            )
+            wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
+            cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"
+            rtf = f"{rtf_val:.3f}" if rtf_val is not None else "-"
+        # Add row to table
+        table_data.append([
+            model_name,
+            transcription,
+            wer_score,
+            cer_score,
+            rtf,
+            f"{total_time:.2f}s"
+        ])
         # Create summary text
         summary = f"**Language:** {selected_language} ({lang_code})\n"
         summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
+        summary += f"**Model Tested:** {model_name}\n"
         if reference_text:
             summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
         # Create copyable text output
         copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*55 + "\n\n"
         copyable_text += f"Language: {selected_language} ({lang_code})\n"
         copyable_text += f"Script: {lang_info['script']}\n"
         copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
+        copyable_text += f"Model Tested: {model_name}\n"
         if reference_text:
             copyable_text += f"Reference Text: {reference_text}\n"
         copyable_text += "\n" + "-"*55 + "\n\n"
             copyable_text += f"RTF: {row[4]}\n"
             copyable_text += f"Time Taken: {row[5]}\n"
             copyable_text += "\n" + "-"*35 + "\n\n"
         return summary, table_data, copyable_text
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
         return error_msg, [], error_msg
 # Create Gradio interface
 def create_interface():
     language_choices = list(LANGUAGE_CONFIGS.keys())
     with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
         .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
         .copy-area { font-family: monospace; font-size: 12px; }
     """) as iface:
         gr.Markdown("""
         # 🌐 Multilingual Speech-to-Text Benchmark
+        Using only the **IndicConformer** model for 22 Indian languages.
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Language selection
                     value=language_choices[0],
                     interactive=True
                 )
                 audio_input = gr.Audio(
+                    label="📹 Upload Audio File (16kHz recommended)",
                     type="filepath"
                 )
+                # Model selection is now a fixed checkbox
                 model_selection = gr.CheckboxGroup(
+                    choices=["IndicConformer"],
                     label="🤖 Select Models",
+                    value=["IndicConformer"],
+                    interactive=False  # Disabled as only one model is used
                 )
                 reference_input = gr.Textbox(
                     label="📄 Reference Text (optional, paste supported)",
                     placeholder="Paste reference transcription here...",
                     lines=4,
                     interactive=True
                 )
                 submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
             with gr.Column(scale=2):
                 summary_output = gr.Markdown(
+                    label="📊 Summary",
                     value="Select language, upload audio file and choose models to begin..."
                 )
                 results_table = gr.Dataframe(
                     headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
                     datatype=["str", "str", "str", "str", "str", "str"],
                     wrap=True,
                     column_widths=[120, 350, 60, 60, 60, 80]
                 )
                 # Copyable results section
                 with gr.Group():
                     gr.Markdown("### 📋 Export Results")
                         placeholder="Benchmark results will appear here..."
                     )
         # Connect the main function
         submit_btn.click(
             fn=transcribe_audio,
             inputs=[audio_input, language_selection, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
         reference_input.submit(
             fn=transcribe_audio,
             inputs=[audio_input, language_selection, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
         # Language information display
         gr.Markdown("""
         ---
         ### 📤 Language & Model Support Matrix
+        | Language | Script | IndicConformer |
+        |----------|---------|---------------|
+        | Hindi | Devanagari | ✅ |
+        | Gujarati | Gujarati | ✅ |
+        | Marathi | Devanagari | ✅ |
+        | Tamil | Tamil | ✅ |
+        | Telugu | Telugu | ✅ |
+        | Kannada | Kannada | ✅ |
+        | Malayalam | Malayalam | ✅ |
         ### 💡 Tips:
+        - **Model is fixed** to IndicConformer for this app.
+        - **Reference Text**: Enable WER/CER calculation by providing ground truth.
+        - **Copy Results**: Export formatted results using the copy button.
         """)
+        return iface
 if __name__ == "__main__":
     iface = create_interface()