Spaces:

ginic
/

wav2ipa

Running

App Files Community

TextGrid Interval Support + UI Changes + Dependency Update

by parthbhangla - opened Jun 24, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+152

-37

Files changed (2) hide show

app.py +150 -36
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,18 +1,21 @@
 from pathlib import Path
 import tempfile
 import gradio as gr
 import librosa
 import tgt.core
 import tgt.io3
 from transformers import pipeline
 TEXTGRID_DIR = tempfile.mkdtemp()
 DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
 TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
 TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"
 VALID_MODELS = [
     "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
     "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
@@ -105,6 +108,44 @@ def get_interactive_download_button(textgrid_contents, textgrid_filename):
     )
 def launch_demo():
     initial_model = {
         "loaded_model": pipeline(
@@ -113,63 +154,136 @@ def launch_demo():
         "model_name": DEFAULT_MODEL,
     }
     with gr.Blocks() as demo:
-        gr.Markdown(
-            """# Automatic International Phonetic Alphabet Transcription
-            This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""",
-        )
         model_name = gr.Dropdown(
             VALID_MODELS,
             value=DEFAULT_MODEL,
             label="IPA transcription ASR model",
             info="Select the model to use for prediction.",
         )
-        audio_in = gr.Audio(type="filepath", show_download_button=True)
         model_state = gr.State(value=initial_model)
-        prediction = gr.Textbox(label="Predicted IPA transcription")
-        gr.Markdown("""## TextGrid File Options
-                    Change these inputs if you'd like to customize and download the transcription in [TextGrid format](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) for Praat.
-                    """)
-        textgrid_tier = gr.Textbox(
-            label="TextGrid Tier Name", value="transcription", interactive=True
-        )
-        textgrid_filename = gr.Textbox(
-            label=TEXTGRID_NAME_INPUT_LABEL, interactive=False
-        )
-        textgrid_contents = gr.Textbox(
-            label="TextGrid Contents",
-            value=get_textgrid_contents,
-            inputs=[audio_in, textgrid_tier, prediction],
         )
-        download_btn = gr.DownloadButton(
-            label=TEXTGRID_DOWNLOAD_TEXT,
-            interactive=False,  # Don't allow download button to be active until an upload happened
-            variant="primary",
         )
-        # Update prediction if model or audio changes
-        gr.on(
-            triggers=[audio_in.input, model_name.change],
             fn=load_model_and_predict,
-            inputs=[model_name, audio_in, model_state],
-            outputs=[prediction, model_state, textgrid_filename],
         )
-        # Download button becomes interactive if user updates audio or textgrid params
-        gr.on(
-            triggers=[textgrid_contents.change, textgrid_filename.change],
             fn=get_interactive_download_button,
-            inputs=[textgrid_contents, textgrid_filename],
-            outputs=[download_btn],
         )
-    demo.launch(max_file_size="100mb")
 if __name__ == "__main__":
     launch_demo()

+# Imports
 from pathlib import Path
 import tempfile
+import os
 import gradio as gr
 import librosa
 import tgt.core
 import tgt.io3
+import soundfile as sf
 from transformers import pipeline
+# Constants
 TEXTGRID_DIR = tempfile.mkdtemp()
 DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
 TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
 TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"
+# Selection of models
 VALID_MODELS = [
     "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
     "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
     )
+def transcribe_intervals(audio_in, textgrid_path, source_tier, target_tier, model_state):
+    if audio_in is None or textgrid_path is None:
+        return "Missing audio or TextGrid input file."
+    tg=tgt.io.read_textgrid(textgrid_path.name)
+    tier = tg.get_tier_by_name(source_tier)
+    ipa_tier = tgt.core.IntervalTier(name=target_tier)
+    for interval in tier.intervals:
+        if not interval.text.strip(): # Skip empty text intervals
+            continue
+        start, end = interval.start_time, interval.end_time
+        try:
+            y, sr = librosa.load(audio_in, sr=None, offset=start, duration=end-start)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+                sf.write(temp_audio.name, y, sr)
+                prediction = model_state["loaded_model"](temp_audio.name)["text"]
+                ipa_tier.add_annotation(tgt.core.Interval(start, end, prediction))
+                os.remove(temp_audio.name)
+        except Exception as e:
+            ipa_tier.add_annotation(tgt.core.Interval(start, end, f"[Error]: {str(e)}"))
+    tg.add_tier(ipa_tier)
+    tgt_str = tgt.io3.export_to_long_textgrid(tg)
+    return tgt_str
+def extract_tier_names(textgrid_file):
+    try:
+        tg = tgt.io.read_textgrid(textgrid_file.name)
+        tier_names = [tier.name for tier in tg.tiers]
+        return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
+    except Exception as e:
+        return gr.update(choices=[], value=None)
 def launch_demo():
     initial_model = {
         "loaded_model": pipeline(
         "model_name": DEFAULT_MODEL,
     }
+    # Helper function - enables the interval transcribe button
+    def enable_interval_transcribe_btn(audio, textgrid):
+        return gr.update(interactive=(audio is not None and textgrid is not None))
     with gr.Blocks() as demo:
+        gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
+        This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
+        # Dropdown for model selection
         model_name = gr.Dropdown(
             VALID_MODELS,
             value=DEFAULT_MODEL,
             label="IPA transcription ASR model",
             info="Select the model to use for prediction.",
         )
+        # Dropdown for transcription type selection
+        transcription_type = gr.Dropdown(
+            choices=["Full Audio", "Interval"],
+            label="Transcription Type",
+            value=None,
+            interactive=True,
+        )
         model_state = gr.State(value=initial_model)
+        # Full audio transcription section
+        with gr.Column(visible=False) as full_audio_section:
+            full_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
+            full_transcribe_btn = gr.Button("Transcribe Full Audio", interactive=False, variant="primary")
+            full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
+            full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
+            full_textgrid_filename = gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False)
+            full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
+            full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
+            full_reset_btn = gr.Button("Reset", variant="secondary")
+        # Interval transcription section
+        with gr.Column(visible=False) as interval_section:
+            interval_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
+            interval_textgrid_file = gr.File(file_types=[".TextGrid"], label="Upload TextGrid File")
+            tier_names = gr.Dropdown(label="Source Tier (existing)", choices=[], interactive=True)
+            target_tier = gr.Textbox(label="Target Tier (new)", value="IPATier", placeholder="e.g. IPATier")
+            interval_transcribe_btn = gr.Button("Transcribe Intervals", interactive=False, variant="primary")
+            interval_result = gr.Textbox(label="IPA Interval Transcription", show_copy_button=True, interactive=False)
+            interval_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
+            interval_reset_btn = gr.Button("Reset", variant="secondary")
+        # Section visibility toggle
+        transcription_type.change(
+            fn=lambda t: (
+                gr.update(visible=t == "Full Audio"),
+                gr.update(visible=t == "Interval"),
+            ),
+            inputs=transcription_type,
+            outputs=[full_audio_section, interval_section],
         )
+        # Enable full transcribe button after audio uploaded
+        full_audio.change(
+            fn=lambda audio: gr.update(interactive=audio is not None),
+            inputs=full_audio,
+            outputs=full_transcribe_btn,
         )
+        # Full transcription logic
+        full_transcribe_btn.click(
             fn=load_model_and_predict,
+            inputs=[model_name, full_audio, model_state],
+            outputs=[full_prediction, model_state, full_textgrid_filename],
+        )
+        full_prediction.change(
+            fn=get_textgrid_contents,
+            inputs=[full_audio, full_textgrid_tier, full_prediction],
+            outputs=[full_textgrid_contents],
         )
+        full_textgrid_contents.change(
             fn=get_interactive_download_button,
+            inputs=[full_textgrid_contents, full_textgrid_filename],
+            outputs=[full_download_btn],
         )
+        full_reset_btn.click(
+            fn=lambda: (None, "", "", "", gr.update(interactive=False)),
+            outputs=[full_audio, full_prediction, full_textgrid_filename, full_textgrid_contents, full_download_btn],
+        )
+        # Enable interval transcribe button only when both files are uploaded
+        interval_audio.change(
+            fn=enable_interval_transcribe_btn,
+            inputs=[interval_audio, interval_textgrid_file],
+            outputs=[interval_transcribe_btn],
+        )
+        interval_textgrid_file.change(
+            fn=enable_interval_transcribe_btn,
+            inputs=[interval_audio, interval_textgrid_file],
+            outputs=[interval_transcribe_btn],
+        )
+        # Interval logic
+        interval_textgrid_file.change(
+            fn=extract_tier_names,
+            inputs=[interval_textgrid_file],
+            outputs=[tier_names],
+        )
+        interval_transcribe_btn.click(
+            fn=transcribe_intervals,
+            inputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, model_state],
+            outputs=[interval_result],
+        )
+        interval_result.change(
+            fn=lambda tg_text: gr.update(value=write_textgrid(tg_text, "interval_output.TextGrid"), interactive=True),
+            inputs=[interval_result],
+            outputs=[interval_download_btn],
+        )
+        interval_reset_btn.click(
+            fn=lambda: (None, None, gr.update(choices=[]), "IPATier", "", gr.update(interactive=False)),
+            outputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, interval_result, interval_download_btn],
+        )
+    demo.launch(max_file_size="100mb")
 if __name__ == "__main__":
     launch_demo()

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 ffmpeg
 librosa
 tgt
-transformers[torch]

 ffmpeg
 librosa
 tgt
+transformers[torch]
+soundfile