Files changed (4) hide show
  1. Dockerfile +0 -1
  2. README.md +10 -8
  3. app.py +289 -0
  4. requirements.txt +5 -0
Dockerfile DELETED
@@ -1 +0,0 @@
1
- FROM umasscds/autoipaalign:latest
 
 
README.md CHANGED
@@ -1,14 +1,16 @@
1
  ---
2
- title: Wav2IPA
3
- emoji: 💬
4
  colorFrom: red
5
- colorTo: purple
6
- sdk: docker
7
- app_port: 7860
 
 
8
  license: mit
9
  ---
10
- # About
11
- This interactive demo allows you to play with audio to International Phonetic Alphabet transcription models trained in the [Wav2IPA](https://github.com/ginic/multipa](https://github.com/ginic/wav2ipa) project and deployed in [AutoIPAAlign](https://github.com/ginic/AutoIPAAlign).
12
 
13
  # Local Testing
14
- To use these audio models locally, you should install Docker and use [the autoipaalign image](https://hub.docker.com/repository/docker/umasscds/autoipaalign/general). Alternatively, check out the [AutoIPAAlign](https://github.com/ginic/AutoIPAAign) package that space is based on.
 
1
  ---
2
+ title: Multipa Audio To Ipa
3
+ emoji: 🐨
4
  colorFrom: red
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.29.0
8
+ app_file: app.py
9
+ pinned: false
10
  license: mit
11
  ---
12
+ # About
13
+ This interactive demo lets you experiment with audio-to-IPA (International Phonetic Alphabet) transcription models trained in the [multIPA](https://github.com/ginic/multipa) project.
14
 
15
  # Local Testing
16
+ To use the audio models locally, you must manually install ffmpeg and ffprobe; see [this discussion](https://discuss.huggingface.co/t/audio-classification-pipeline-valueerror-ffmpeg-was-not-found-but-is-required-to-load-audio-files-from-filename/16137/8).
app.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Imports
2
+ from pathlib import Path
3
+ import tempfile
4
+ import os
5
+ import gradio as gr
6
+ import librosa
7
+ import tgt.core
8
+ import tgt.io3
9
+ import soundfile as sf
10
+ from transformers import pipeline
11
+
12
# Constants

# Model that is loaded at start-up and pre-selected in the model dropdown.
DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"

# User-facing labels shared by several widgets.
TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"

# Scratch directory, created once at import time, that holds the TextGrid
# files offered for download.
TEXTGRID_DIR = tempfile.mkdtemp()

# Selection of models offered in the model dropdown.
VALID_MODELS = [
    # ctaguchi models
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
    # ginic: data seed variants
    "ginic/data_seed_bs64_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    # ginic: gender split variants
    "ginic/gender_split_30_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    # ginic: speaker age variants
    "ginic/vary_individuals_old_only_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_3_wav2vec2-large-xlsr-53-buckeye-ipa",
]
43
+
44
+
45
def load_model_and_predict(
    model_name: str,
    audio_in: str,
    model_state: dict,
):
    """Transcribe an audio file, loading the requested model first if needed.

    Args:
        model_name: Identifier of the model selected in the UI.
        audio_in: Path to the audio file, or None when no audio is present.
        model_state: Dict with keys "loaded_model" (the ASR pipeline) and
            "model_name" (the identifier it was built from).

    Returns:
        A 3-tuple of (predicted text, model state, TextGrid file-name
        textbox). Without audio the text is empty, the state is returned
        unchanged and the textbox is non-interactive.
    """
    if audio_in is None:
        locked_name_box = gr.Textbox(
            label=TEXTGRID_NAME_INPUT_LABEL, interactive=False
        )
        return "", model_state, locked_name_box

    # Only build a new pipeline when the selection differs from the cached one.
    if model_state["model_name"] != model_name:
        asr_pipeline = pipeline(
            task="automatic-speech-recognition", model=model_name
        )
        model_state = {
            "loaded_model": asr_pipeline,
            "model_name": model_name,
        }

    transcriber = model_state["loaded_model"]
    prediction = transcriber(audio_in)["text"]

    # Suggest a TextGrid file name derived from the audio file's base name.
    suggested_name = Path(audio_in).with_suffix(".TextGrid").name
    editable_name_box = gr.Textbox(
        label=TEXTGRID_NAME_INPUT_LABEL,
        interactive=True,
        value=suggested_name,
    )
    return prediction, model_state, editable_name_box
75
+
76
+
77
def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
    """Build long-format TextGrid text with one interval spanning the audio.

    Args:
        audio_in: Path to the audio file, or None.
        textgrid_tier_name: Name given to the single interval tier.
        transcription_prediction: Text placed in the interval, or None.

    Returns:
        The TextGrid as a string, or "" when the audio path or the
        transcription is None.
    """
    inputs_present = (
        audio_in is not None and transcription_prediction is not None
    )
    if not inputs_present:
        return ""

    total_duration = librosa.get_duration(path=audio_in)

    # One tier holding a single interval that covers the whole recording.
    tier = tgt.core.IntervalTier(
        start_time=0, end_time=total_duration, name=textgrid_tier_name
    )
    tier.add_annotation(
        tgt.core.Interval(0, total_duration, transcription_prediction)
    )

    grid = tgt.core.TextGrid()
    grid.add_tier(tier)
    return tgt.io3.export_to_long_textgrid(grid)
91
+
92
+
93
def write_textgrid(textgrid_contents, textgrid_filename):
    """Write TextGrid contents to a named file in the temporary directory.

    Only the final path component of ``textgrid_filename`` is used, so the
    file always lands directly inside ``TEXTGRID_DIR``.

    Args:
        textgrid_contents: Text to write; None is written as an empty file.
        textgrid_filename: Desired file name. When it is empty, None, or has
            no usable final component, "output.TextGrid" is used instead.

    Returns:
        The ``Path`` of the written file, suitable for download.
    """
    base_name = Path(textgrid_filename or "").name
    # An empty name would resolve to the directory itself and ".." to its
    # parent; neither can be written to as a file.
    if base_name in ("", ".", ".."):
        base_name = "output.TextGrid"

    textgrid_path = Path(TEXTGRID_DIR) / base_name
    # IPA transcriptions are non-ASCII, so do not rely on the locale default.
    textgrid_path.write_text(textgrid_contents or "", encoding="utf-8")
    return textgrid_path
100
+
101
+
102
def get_interactive_download_button(textgrid_contents, textgrid_filename):
    """Return the download button state matching the current TextGrid text.

    Args:
        textgrid_contents: TextGrid text to offer for download.
        textgrid_filename: File name to write the TextGrid under.

    Returns:
        A ``gr.DownloadButton``. With non-empty contents the text is written
        to disk and the button is enabled and pointed at that file. With
        empty contents (for example after the contents box is cleared) no
        file is written and the button is disabled.
    """
    if not textgrid_contents:
        # Nothing to download: do not write a file or enable the button.
        return gr.DownloadButton(
            label=TEXTGRID_DOWNLOAD_TEXT,
            variant="primary",
            interactive=False,
            value=None,
        )

    return gr.DownloadButton(
        label=TEXTGRID_DOWNLOAD_TEXT,
        variant="primary",
        interactive=True,
        value=write_textgrid(textgrid_contents, textgrid_filename),
    )
109
+
110
+
111
def transcribe_intervals(audio_in, textgrid_path, source_tier, target_tier, model_state):
    """Transcribe each non-empty interval of a TextGrid tier into a new tier.

    Args:
        audio_in: Path to the audio file, or None.
        textgrid_path: Uploaded TextGrid, either an object exposing the file
            path as ``.name`` or a plain path string; may be None.
        source_tier: Name of the existing tier whose intervals are used.
        target_tier: Name of the new tier that receives the predictions.
        model_state: Dict whose "loaded_model" entry is the ASR pipeline.

    Returns:
        The updated TextGrid in long format as a string, or a short message
        when the audio or TextGrid input is missing. An interval whose
        transcription fails is annotated with "[Error]: <message>" rather
        than aborting the run.
    """
    if audio_in is None or textgrid_path is None:
        return "Missing audio or TextGrid input file."

    tg = tgt.io.read_textgrid(getattr(textgrid_path, "name", textgrid_path))
    tier = tg.get_tier_by_name(source_tier)
    ipa_tier = tgt.core.IntervalTier(name=target_tier)
    transcriber = model_state["loaded_model"]

    for interval in tier.intervals:
        if not interval.text.strip():  # Skip empty text intervals
            continue

        start, end = interval.start_time, interval.end_time
        clip_path = None
        try:
            y, sr = librosa.load(audio_in, sr=None, offset=start, duration=end - start)
            # Reserve a unique file name and close the handle before writing,
            # so the clip is never written to a file that is still open.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                clip_path = temp_audio.name
            sf.write(clip_path, y, sr)
            prediction = transcriber(clip_path)["text"]
            ipa_tier.add_annotation(tgt.core.Interval(start, end, prediction))
        except Exception as e:
            # Best effort: record the failure in the tier and keep going.
            ipa_tier.add_annotation(tgt.core.Interval(start, end, f"[Error]: {str(e)}"))
        finally:
            # Remove the clip even when transcription failed.
            if clip_path is not None:
                try:
                    os.remove(clip_path)
                except OSError:
                    # Cleanup is best effort; a leftover clip must not
                    # abort the remaining intervals.
                    pass

    tg.add_tier(ipa_tier)
    return tgt.io3.export_to_long_textgrid(tg)
138
+
139
+
140
def extract_tier_names(textgrid_file):
    """Populate the source-tier dropdown from an uploaded TextGrid.

    Args:
        textgrid_file: Uploaded file object exposing its path as ``.name``,
            or None when no file is present.

    Returns:
        A ``gr.update`` whose choices are the tier names with the first one
        selected. If there is no file, or it cannot be read, the choices are
        emptied and the value is cleared.
    """
    no_tiers = gr.update(choices=[], value=None)
    if textgrid_file is None:
        return no_tiers

    try:
        grid = tgt.io.read_textgrid(textgrid_file.name)
        names = [tier.name for tier in grid.tiers]
    except Exception:
        # Any failure to read the file leaves the dropdown empty.
        return no_tiers

    first_choice = names[0] if names else None
    return gr.update(choices=names, value=first_choice)
147
+
148
+
149
def launch_demo():
    """Build the Gradio interface and launch it.

    The default model is loaded once up front and kept in a ``gr.State``.
    The page offers two modes chosen from a dropdown: transcription of a
    whole audio file, and transcription of each interval of an uploaded
    TextGrid tier. Both modes use the model picked in the model dropdown.
    """
    initial_model = {
        "loaded_model": pipeline(
            task="automatic-speech-recognition", model=DEFAULT_MODEL
        ),
        "model_name": DEFAULT_MODEL,
    }

    # Helper function - enables the interval transcribe button
    def enable_interval_transcribe_btn(audio, textgrid):
        return gr.update(interactive=(audio is not None and textgrid is not None))

    # Helper function - makes interval transcription honor the model dropdown
    # instead of whichever model is currently cached in the state.
    def transcribe_intervals_with_selected_model(
        selected_model, audio, textgrid, source_tier, new_tier, state
    ):
        if state["model_name"] != selected_model:
            state = {
                "loaded_model": pipeline(
                    task="automatic-speech-recognition", model=selected_model
                ),
                "model_name": selected_model,
            }
        result = transcribe_intervals(audio, textgrid, source_tier, new_tier, state)
        return result, state

    # Helper function - only offers a download when there is a result, so
    # clearing the result box (e.g. via Reset) leaves the button disabled.
    def update_interval_download_btn(tg_text):
        if not tg_text:
            return gr.update(interactive=False)
        return gr.update(
            value=write_textgrid(tg_text, "interval_output.TextGrid"),
            interactive=True,
        )

    with gr.Blocks() as demo:
        gr.Markdown(
            "# Automatic International Phonetic Alphabet Transcription\n"
            "This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model."
        )

        # Dropdown for model selection
        model_name = gr.Dropdown(
            VALID_MODELS,
            value=DEFAULT_MODEL,
            label="IPA transcription ASR model",
            info="Select the model to use for prediction.",
        )

        # Dropdown for transcription type selection
        transcription_type = gr.Dropdown(
            choices=["Full Audio", "Interval"],
            label="Transcription Type",
            value=None,
            interactive=True,
        )

        model_state = gr.State(value=initial_model)

        # Full audio transcription section
        with gr.Column(visible=False) as full_audio_section:
            full_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
            full_transcribe_btn = gr.Button("Transcribe Full Audio", interactive=False, variant="primary")
            full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)

            full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
            full_textgrid_filename = gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False)

            full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
            full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
            full_reset_btn = gr.Button("Reset", variant="secondary")

        # Interval transcription section
        with gr.Column(visible=False) as interval_section:
            interval_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
            interval_textgrid_file = gr.File(file_types=[".TextGrid"], label="Upload TextGrid File")
            tier_names = gr.Dropdown(label="Source Tier (existing)", choices=[], interactive=True)
            target_tier = gr.Textbox(label="Target Tier (new)", value="IPATier", placeholder="e.g. IPATier")

            interval_transcribe_btn = gr.Button("Transcribe Intervals", interactive=False, variant="primary")
            interval_result = gr.Textbox(label="IPA Interval Transcription", show_copy_button=True, interactive=False)
            interval_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
            interval_reset_btn = gr.Button("Reset", variant="secondary")

        # Section visibility toggle
        transcription_type.change(
            fn=lambda t: (
                gr.update(visible=t == "Full Audio"),
                gr.update(visible=t == "Interval"),
            ),
            inputs=transcription_type,
            outputs=[full_audio_section, interval_section],
        )

        # Enable full transcribe button after audio uploaded
        full_audio.change(
            fn=lambda audio: gr.update(interactive=audio is not None),
            inputs=full_audio,
            outputs=full_transcribe_btn,
        )

        # Full transcription logic
        full_transcribe_btn.click(
            fn=load_model_and_predict,
            inputs=[model_name, full_audio, model_state],
            outputs=[full_prediction, model_state, full_textgrid_filename],
        )

        full_prediction.change(
            fn=get_textgrid_contents,
            inputs=[full_audio, full_textgrid_tier, full_prediction],
            outputs=[full_textgrid_contents],
        )

        full_textgrid_contents.change(
            fn=get_interactive_download_button,
            inputs=[full_textgrid_contents, full_textgrid_filename],
            outputs=[full_download_btn],
        )

        full_reset_btn.click(
            fn=lambda: (None, "", "", "", gr.update(interactive=False)),
            outputs=[full_audio, full_prediction, full_textgrid_filename, full_textgrid_contents, full_download_btn],
        )

        # Enable interval transcribe button only when both files are uploaded
        interval_audio.change(
            fn=enable_interval_transcribe_btn,
            inputs=[interval_audio, interval_textgrid_file],
            outputs=[interval_transcribe_btn],
        )

        interval_textgrid_file.change(
            fn=enable_interval_transcribe_btn,
            inputs=[interval_audio, interval_textgrid_file],
            outputs=[interval_transcribe_btn],
        )

        # Interval logic
        interval_textgrid_file.change(
            fn=extract_tier_names,
            inputs=[interval_textgrid_file],
            outputs=[tier_names],
        )

        interval_transcribe_btn.click(
            fn=transcribe_intervals_with_selected_model,
            inputs=[model_name, interval_audio, interval_textgrid_file, tier_names, target_tier, model_state],
            outputs=[interval_result, model_state],
        )

        interval_result.change(
            fn=update_interval_download_btn,
            inputs=[interval_result],
            outputs=[interval_download_btn],
        )

        interval_reset_btn.click(
            fn=lambda: (None, None, gr.update(choices=[]), "IPATier", "", gr.update(interactive=False)),
            outputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, interval_result, interval_download_btn],
        )

    demo.launch(max_file_size="100mb")


if __name__ == "__main__":
    launch_demo()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ffmpeg
2
+ librosa
3
+ tgt
4
+ transformers[torch]
5
+ soundfile