Spaces:

ginic
/

wav2ipa

Running

App Files Files Community

parthbhangla committed on Jun 20, 2025

Commit

e2b0a4b

verified ·

1 Parent(s): ef1761e

Added TextGrid Interval Support

Browse files

Here are screenshots of the changes:

![Screenshot 2025-06-20 at 11.10.12 AM.png](https://cdn-uploads.huggingface.co/production/uploads/68304f0b3842b6d373054246/mIOFt1mhHnVuoFEaX4-zj.png)
![Screenshot 2025-06-20 at 11.10.29 AM.png](https://cdn-uploads.huggingface.co/production/uploads/68304f0b3842b6d373054246/Afw4-76y9oyRSGeGkri0_.png)

Here's a sample output:
```
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0.0
xmax = 26.74608
tiers? <exists>
size = 2
item []:
item [1]:
class = "IntervalTier"
name = "sentence"
xmin = 0.0
xmax = 26.74608
intervals: size = 10
intervals [1]:
xmin = 0.0
xmax = 0.7995000400000001
text = ""
intervals [2]:
xmin = 0.7995000400000001
xmax = 3.4185402
text = "and seemed to work pretty well that way too"
intervals [3]:
xmin = 3.4185402
xmax = 4.7975004
text = ""
intervals [4]:
xmin = 4.7975004
xmax = 12.9231205
text = "also of course we would usually fill the gas tank on sunday if we had been doing any driving through the week"
intervals [5]:
xmin = 12.9231205
xmax = 13.4625
text = ""
intervals [6]:
xmin = 13.4625
xmax = 16.130500340179836
text = "and my first recollections"
intervals [7]:
xmin = 16.130500340179836
xmax = 16.421021
text = ""
intervals [8]:
xmin = 16.421021
xmax = 21.62864
text = "that there were no filling stations at all any anywhere that i can remember"
intervals [9]:
xmin = 21.62864
xmax = 22.84768
text = ""
intervals [10]:
xmin = 22.84768
xmax = 26.74608
text = "so dad when he first got his car"
item [2]:
class = "IntervalTier"
name = "IPATier"
xmin = 0.0
xmax = 26.74608
intervals: size = 10
intervals [1]:
xmin = 0.0
xmax = 0.7995000400000001
text = ""
intervals [2]:
xmin = 0.7995000400000001
xmax = 3.4185402
text = "ʌnsimdwʌkpɹɪɾiwɛlðæʔweɪtu"
intervals [3]:
xmin = 3.4185402
xmax = 4.7975004
text = ""
intervals [4]:
xmin = 4.7975004
xmax = 12.9231205
text = "ɔlsoʊkɔʌwiwʊdjuʒlifɪlʌɡæʃtæŋkɑnsʌndiɪfɪfwihɛdbɪnduɪŋɛnidɹaɪvɪŋθɹuðʌwik"
intervals [5]:
xmin = 12.9231205
xmax = 13.4625
text = ""
intervals [6]:
xmin = 13.4625
xmax = 16.130500340179836
text = "ændmaɪfɹ̩stɹɛkʌlɛkʃʌnz"
intervals [7]:
xmin = 16.130500340179836
xmax = 16.421021
text = ""
intervals [8]:
xmin = 16.421021
xmax = 21.62864
text = "ɑðɛtɛɹwɹ̩ʌnoʊfɪlɪŋsteɪʃʌnzɾɔɛniɛñiweɪðɛɾaɪkɪnɹmɛmbʌ"
intervals [9]:
xmin = 21.62864
xmax = 22.84768
text = ""
intervals [10]:
xmin = 22.84768
xmax = 26.74608
text = "soʊdɛdwɛɾ̃ifɹ̩stɡɑtɪzkʌ"
```

The files I used to test it out:

<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/68304f0b3842b6d373054246/MaEGwMyzK8JNI0BH8eBE_.wav"></audio>
```
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0
xmax = 26.74608
tiers? <exists>
size = 1
item []:
item [1]:
class = "IntervalTier"
name = "sentence"
xmin = 0
xmax = 26.74608
intervals: size = 10
intervals [1]:
xmin = 0
xmax = 0.7995000400000001
text = ""
intervals [2]:
xmin = 0.7995000400000001
xmax = 3.4185402
text = "and seemed to work pretty well that way too"
intervals [3]:
xmin = 3.4185402
xmax = 4.7975004
text = ""
intervals [4]:
xmin = 4.7975004
xmax = 12.9231205
text = "also of course we would usually fill the gas tank on sunday if we had been doing any driving through the week"
intervals [5]:
xmin = 12.9231205
xmax = 13.4625
text = ""
intervals [6]:
xmin = 13.4625
xmax = 16.130500340179836
text = "and my first recollections"
intervals [7]:
xmin = 16.130500340179836
xmax = 16.421021
text = ""
intervals [8]:
xmin = 16.421021
xmax = 21.62864
text = "that there were no filling stations at all any anywhere that i can remember"
intervals [9]:
xmin = 21.62864
xmax = 22.84768
text = ""
intervals [10]:
xmin = 22.84768
xmax = 26.74608
text = "so dad when he first got his car"
```

Files changed (1) hide show

app.py +87 -1

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
 from pathlib import Path
 import tempfile
 import gradio as gr
 import librosa
 import tgt.core
 import tgt.io3
 from transformers import pipeline
 TEXTGRID_DIR = tempfile.mkdtemp()
@@ -105,6 +107,45 @@ def get_interactive_download_button(textgrid_contents, textgrid_filename):
     )
 def launch_demo():
     initial_model = {
         "loaded_model": pipeline(
@@ -125,9 +166,15 @@ def launch_demo():
             info="Select the model to use for prediction.",
         )
         audio_in = gr.Audio(type="filepath", show_download_button=True)
         model_state = gr.State(value=initial_model)
-        prediction = gr.Textbox(label="Predicted IPA transcription")
         gr.Markdown("""## TextGrid File Options
                     Change these inputs if you'd like to customize and download the transcription in [TextGrid format](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) for Praat.
@@ -152,6 +199,14 @@ def launch_demo():
             variant="primary",
         )
         # Update prediction if model or audio changes
         gr.on(
             triggers=[audio_in.input, model_name.change],
@@ -160,6 +215,13 @@ def launch_demo():
             outputs=[prediction, model_state, textgrid_filename],
         )
         # Download button becomes interactive if user updates audio or textgrid params
         gr.on(
             triggers=[textgrid_contents.change, textgrid_filename.change],
@@ -168,6 +230,30 @@ def launch_demo():
             outputs=[download_btn],
         )
     demo.launch(max_file_size="100mb")

 from pathlib import Path
 import tempfile
+import os
 import gradio as gr
 import librosa
 import tgt.core
 import tgt.io3
+import soundfile as sf
 from transformers import pipeline
 TEXTGRID_DIR = tempfile.mkdtemp()
     )
def transcribe_intervals(audio_in, textgrid_path, source_tier, target_tier, model_state):
    """Transcribe every non-empty interval of a TextGrid tier into a new tier.

    For each interval of ``source_tier`` whose text is not blank, the matching
    slice of audio is loaded, written to a temporary WAV file, and passed to
    the loaded model. The predicted text is stored in a new interval tier
    named ``target_tier`` that is appended to the TextGrid.

    Args:
        audio_in: Path of the audio file, as accepted by ``librosa.load``.
        textgrid_path: Uploaded TextGrid file object; its ``.name`` attribute
            is read as the path of the file on disk.
        source_tier: Name of the existing tier whose intervals are transcribed.
        target_tier: Name given to the newly created tier.
        model_state: Mapping whose ``"loaded_model"`` entry is called with an
            audio file path and returns a mapping with a ``"text"`` key.

    Returns:
        A 2-tuple, one value per output component this handler is wired to.
        On success both items are the long-format TextGrid text. When an
        input is missing or the source tier does not exist, the first item is
        a no-op ``gr.update()`` (so the previous result is left untouched) and
        the second item is a human-readable message.
    """
    if audio_in is None or textgrid_path is None:
        # Always return one value per output; a bare string here would leave
        # the second output component without a value.
        return gr.update(), "Missing audio or TextGrid input file."

    tg = tgt.io.read_textgrid(textgrid_path.name)
    if source_tier not in [tier.name for tier in tg.tiers]:
        # Covers both an unselected dropdown (None) and a stale tier name.
        return gr.update(), f"Source tier not found in TextGrid: {source_tier!r}"

    tier = tg.get_tier_by_name(source_tier)
    ipa_tier = tgt.core.IntervalTier(name=target_tier)

    for interval in tier.intervals:
        if not interval.text.strip():  # Skip empty text intervals
            continue
        start, end = interval.start_time, interval.end_time
        try:
            y, sr = librosa.load(audio_in, sr=None, offset=start, duration=end - start)
            # Reserve a path, then close the handle before anything else
            # writes to or reads from the file by name.
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            temp_audio.close()
            try:
                sf.write(temp_audio.name, y, sr)
                prediction = model_state["loaded_model"](temp_audio.name)["text"]
            finally:
                # delete=False means nothing else cleans this up; remove it
                # even when writing or inference fails.
                os.remove(temp_audio.name)
            ipa_tier.add_annotation(tgt.core.Interval(start, end, prediction))
        except Exception as e:
            # Deliberate best effort: one failing interval is recorded in the
            # output tier instead of aborting the whole transcription.
            ipa_tier.add_annotation(tgt.core.Interval(start, end, f"[Error]: {str(e)}"))

    tg.add_tier(ipa_tier)
    tgt_str = tgt.io3.export_to_long_textgrid(tg)
    return tgt_str, tgt_str
def extract_tier_names(textgrid_file):
    """Build a dropdown update listing the tier names of an uploaded TextGrid.

    Args:
        textgrid_file: Uploaded file object; its ``.name`` attribute is read
            as the path of the TextGrid on disk.

    Returns:
        A ``gr.update`` whose choices are the tier names and whose value is
        the first tier name. If the file cannot be read for any reason, the
        choices are empty and the value is ``None``.
    """
    try:
        grid = tgt.io.read_textgrid(textgrid_file.name)
        names = [t.name for t in grid.tiers]
    except Exception:
        # Any failure to read the file falls back to an empty dropdown.
        names = []

    selected = names[0] if names else None
    return gr.update(choices=names, value=selected)
 def launch_demo():
     initial_model = {
         "loaded_model": pipeline(
             info="Select the model to use for prediction.",
         )
         audio_in = gr.Audio(type="filepath", show_download_button=True)
+        textgrid_file = gr.File(file_types=[".TextGrid"], label="Upload TextGrid File")
         model_state = gr.State(value=initial_model)
+        tier_names = gr.Dropdown(label="Source Tier (existing)", choices=[], interactive=True)
+        target_tier = gr.Textbox(label="Target Tier (new)", value="IPATier", placeholder="e.g. IPATier")
+        run_btn = gr.Button("Transcribe Intervals")
+        prediction = gr.Textbox(label="Full Audio IPA transcription")
         gr.Markdown("""## TextGrid File Options
                     Change these inputs if you'd like to customize and download the transcription in [TextGrid format](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) for Praat.
             variant="primary",
         )
+        transcription_result = gr.Textbox(visible=False)
+        textgrid_preview = gr.Textbox(
+            label="Updated Interval Wise TextGrid Preview",
+            lines=20,
+            interactive=False,
+            show_copy_button=True
+        )
         # Update prediction if model or audio changes
         gr.on(
             triggers=[audio_in.input, model_name.change],
             outputs=[prediction, model_state, textgrid_filename],
         )
+        gr.on(
+            triggers=[audio_in.input, textgrid_tier.input, prediction.change],
+            fn=get_textgrid_contents,
+            inputs=[audio_in, textgrid_tier, prediction],
+            outputs=[textgrid_contents],
+        )
         # Download button becomes interactive if user updates audio or textgrid params
         gr.on(
             triggers=[textgrid_contents.change, textgrid_filename.change],
             outputs=[download_btn],
         )
+        textgrid_file.change(
+            fn=extract_tier_names,
+            inputs=[textgrid_file],
+            outputs=[tier_names],
+        )
+        run_btn.click(
+            fn=transcribe_intervals,
+            inputs=[audio_in, textgrid_file, tier_names, target_tier, model_state],
+            outputs=[transcription_result, textgrid_preview]
+        )
+        transcription_result.change(
+            fn=lambda tg_text: tg_text,
+            inputs=transcription_result,
+            outputs=textgrid_contents
+        )
+        transcription_result.change(
+            fn=lambda tg_text, filename: write_textgrid(tg_text, filename),
+            inputs=[transcription_result, textgrid_filename],
+            outputs=download_btn
+        )
     demo.launch(max_file_size="100mb")