Spaces:

ginic
/

wav2ipa

Running

App Files Files Community

Error Handling + Joe's Suggestion + FileNaming

by parthbhangla - opened Jun 26, 2025

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+67

-41

Files changed (1) hide show

app.py +67 -41

app.py CHANGED Viewed

@@ -47,31 +47,24 @@ def load_model_and_predict(
     audio_in: str,
     model_state: dict,
 ):
-    if audio_in is None:
-        return (
-            "",
-            model_state,
-            gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
-        )
-    if model_state["model_name"] != model_name:
-        model_state = {
-            "loaded_model": pipeline(
-                task="automatic-speech-recognition", model=model_name
-            ),
-            "model_name": model_name,
-        }
-    prediction = model_state["loaded_model"](audio_in)["text"]
-    return (
-        prediction,
-        model_state,
-        gr.Textbox(
-            label=TEXTGRID_NAME_INPUT_LABEL,
-            interactive=True,
-            value=Path(audio_in).with_suffix(".TextGrid").name,
-        ),
-    )
 def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
@@ -144,6 +137,34 @@ def extract_tier_names(textgrid_file):
         return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
     except Exception as e:
         return gr.update(choices=[], value=None)
 def launch_demo():
@@ -154,10 +175,6 @@ def launch_demo():
         "model_name": DEFAULT_MODEL,
     }
-    # Helper function - enables the interval transcribe button
-    def enable_interval_transcribe_btn(audio, textgrid):
-        return gr.update(interactive=(audio is not None and textgrid is not None))
     with gr.Blocks() as demo:
         gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
         This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
@@ -172,7 +189,7 @@ def launch_demo():
         # Dropdown for transcription type selection
         transcription_type = gr.Dropdown(
-            choices=["Full Audio", "Interval"],
             label="Transcription Type",
             value=None,
             interactive=True,
@@ -187,7 +204,6 @@ def launch_demo():
             full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
             full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
-            full_textgrid_filename = gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False)
             full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
             full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
@@ -209,7 +225,7 @@ def launch_demo():
         transcription_type.change(
             fn=lambda t: (
                 gr.update(visible=t == "Full Audio"),
-                gr.update(visible=t == "Interval"),
             ),
             inputs=transcription_type,
             outputs=[full_audio_section, interval_section],
@@ -226,7 +242,7 @@ def launch_demo():
         full_transcribe_btn.click(
             fn=load_model_and_predict,
             inputs=[model_name, full_audio, model_state],
-            outputs=[full_prediction, model_state, full_textgrid_filename],
         )
         full_prediction.change(
@@ -236,25 +252,29 @@ def launch_demo():
         )
         full_textgrid_contents.change(
-            fn=get_interactive_download_button,
-            inputs=[full_textgrid_contents, full_textgrid_filename],
             outputs=[full_download_btn],
         )
         full_reset_btn.click(
             fn=lambda: (None, "", "", "", gr.update(interactive=False)),
-            outputs=[full_audio, full_prediction, full_textgrid_filename, full_textgrid_contents, full_download_btn],
         )
         # Enable interval transcribe button only when both files are uploaded
         interval_audio.change(
-            fn=enable_interval_transcribe_btn,
             inputs=[interval_audio, interval_textgrid_file],
             outputs=[interval_transcribe_btn],
         )
         interval_textgrid_file.change(
-            fn=enable_interval_transcribe_btn,
             inputs=[interval_audio, interval_textgrid_file],
             outputs=[interval_transcribe_btn],
         )
@@ -273,8 +293,14 @@ def launch_demo():
         )
         interval_result.change(
-            fn=lambda tg_text: gr.update(value=write_textgrid(tg_text, "interval_output.TextGrid"), interactive=True),
-            inputs=[interval_result],
             outputs=[interval_download_btn],
         )
@@ -286,4 +312,4 @@ def launch_demo():
     demo.launch(max_file_size="100mb")
 if __name__ == "__main__":
-    launch_demo()

     audio_in: str,
     model_state: dict,
 ):
+    try:
+        if audio_in is None:
+            return (
+                "",
+                model_state,
+                gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
+            )
+        if model_state["model_name"] != model_name:
+            model_state = {
+                "loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
+                "model_name": model_name,
+            }
+        prediction = model_state["loaded_model"](audio_in)["text"]
+        return prediction, model_state
+    except Exception as e:
+        raise gr.Error(f"Failed to load model: {str(e)}")
 def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
         return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
     except Exception as e:
         return gr.update(choices=[], value=None)
+def validate_textgrid_for_intervals(audio_path, textgrid_file):
+    """Decide whether the interval transcribe button may be enabled.
+
+    Args:
+        audio_path: Path of the uploaded audio; falsy when nothing is uploaded.
+        textgrid_file: Uploaded TextGrid file object; its ``.name`` is passed
+            to ``tgt.io.read_textgrid``. Falsy when nothing is uploaded.
+
+    Returns:
+        ``gr.update(interactive=False)`` while either upload is missing,
+        otherwise ``gr.update(interactive=True)``.
+
+    Raises:
+        gr.Error: If either file cannot be read, or if the latest tier end
+            time in the TextGrid is greater than the audio duration.
+    """
+    # Nothing to validate until both uploads are present; keep the button off.
+    if not audio_path or not textgrid_file:
+        return gr.update(interactive=False)
+
+    # Only the file reads are wrapped. The duration-mismatch error raised
+    # below must reach the user with its own message rather than being caught
+    # here and re-labelled as an unreadable file.
+    try:
+        audio_duration = librosa.get_duration(path=audio_path)
+        tg = tgt.io.read_textgrid(textgrid_file.name)
+        tg_end_time = max(tier.end_time for tier in tg.tiers)
+    except Exception as e:
+        raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}") from e
+
+    if tg_end_time > audio_duration:
+        raise gr.Error(
+            f"TextGrid ends at {tg_end_time:.2f}s but audio is only {audio_duration:.2f}s. "
+            "Please upload matching files."
+        )
+
+    # Differences of at most `epsilon` seconds are accepted silently; a larger
+    # gap (TextGrid ending before the audio does) only produces a warning.
+    epsilon = 0.01
+    if abs(tg_end_time - audio_duration) > epsilon:
+        gr.Warning(
+            f"TextGrid ends at {tg_end_time:.2f}s but audio is {audio_duration:.2f}s. "
+            "Only the annotated portion will be transcribed."
+        )
+    return gr.update(interactive=True)
 def launch_demo():
         "model_name": DEFAULT_MODEL,
     }
     with gr.Blocks() as demo:
         gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
         This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
         # Dropdown for transcription type selection
         transcription_type = gr.Dropdown(
+            choices=["Full Audio", "TextGrid Interval"],
             label="Transcription Type",
             value=None,
             interactive=True,
             full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
             full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
             full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
             full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
         transcription_type.change(
             fn=lambda t: (
                 gr.update(visible=t == "Full Audio"),
+                gr.update(visible=t == "TextGrid Interval"),
             ),
             inputs=transcription_type,
             outputs=[full_audio_section, interval_section],
         full_transcribe_btn.click(
             fn=load_model_and_predict,
             inputs=[model_name, full_audio, model_state],
+            outputs=[full_prediction, model_state],
         )
         full_prediction.change(
         )
         full_textgrid_contents.change(
+            fn=lambda tg_text, audio_path: get_interactive_download_button(
+                tg_text,
+                Path(audio_path).with_suffix(".TextGrid").name if audio_path else "output.TextGrid"
+            ),
+            inputs=[full_textgrid_contents, full_audio],
             outputs=[full_download_btn],
         )
         full_reset_btn.click(
             fn=lambda: (None, "", "", "", gr.update(interactive=False)),
+            outputs=[full_audio, full_prediction, full_textgrid_contents, full_download_btn],
         )
         # Enable interval transcribe button only when both files are uploaded
         interval_audio.change(
+            fn=validate_textgrid_for_intervals,
             inputs=[interval_audio, interval_textgrid_file],
             outputs=[interval_transcribe_btn],
         )
         interval_textgrid_file.change(
+            fn=validate_textgrid_for_intervals,
             inputs=[interval_audio, interval_textgrid_file],
             outputs=[interval_transcribe_btn],
         )
         )
         interval_result.change(
+            fn=lambda tg_text, audio_path: gr.update(
+                value=write_textgrid(
+                    tg_text,
+                    Path(audio_path).with_suffix(".TextGrid").name
+                ),
+                interactive=True,
+            ),
+            inputs=[interval_result, interval_audio],
             outputs=[interval_download_btn],
         )
     demo.launch(max_file_size="100mb")
 if __name__ == "__main__":
+    launch_demo()