Spaces:
Running
Running
Error Handling + Joe's Suggestion + FileNaming
#3
by parthbhangla - opened
app.py
CHANGED
|
@@ -47,31 +47,24 @@ def load_model_and_predict(
|
|
| 47 |
audio_in: str,
|
| 48 |
model_state: dict,
|
| 49 |
):
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
model_state =
|
| 59 |
-
|
| 60 |
-
task="automatic-speech-recognition", model=model_name
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
model_state,
|
| 69 |
-
gr.Textbox(
|
| 70 |
-
label=TEXTGRID_NAME_INPUT_LABEL,
|
| 71 |
-
interactive=True,
|
| 72 |
-
value=Path(audio_in).with_suffix(".TextGrid").name,
|
| 73 |
-
),
|
| 74 |
-
)
|
| 75 |
|
| 76 |
|
| 77 |
def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
|
|
@@ -144,6 +137,34 @@ def extract_tier_names(textgrid_file):
|
|
| 144 |
return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
|
| 145 |
except Exception as e:
|
| 146 |
return gr.update(choices=[], value=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def launch_demo():
|
|
@@ -154,10 +175,6 @@ def launch_demo():
|
|
| 154 |
"model_name": DEFAULT_MODEL,
|
| 155 |
}
|
| 156 |
|
| 157 |
-
# Helper function - enables the interval transcribe button
|
| 158 |
-
def enable_interval_transcribe_btn(audio, textgrid):
|
| 159 |
-
return gr.update(interactive=(audio is not None and textgrid is not None))
|
| 160 |
-
|
| 161 |
with gr.Blocks() as demo:
|
| 162 |
gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
|
| 163 |
This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
|
|
@@ -172,7 +189,7 @@ def launch_demo():
|
|
| 172 |
|
| 173 |
# Dropdown for transcription type selection
|
| 174 |
transcription_type = gr.Dropdown(
|
| 175 |
-
choices=["Full Audio", "Interval"],
|
| 176 |
label="Transcription Type",
|
| 177 |
value=None,
|
| 178 |
interactive=True,
|
|
@@ -187,7 +204,6 @@ def launch_demo():
|
|
| 187 |
full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
|
| 188 |
|
| 189 |
full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
|
| 190 |
-
full_textgrid_filename = gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False)
|
| 191 |
|
| 192 |
full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
|
| 193 |
full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
|
|
@@ -209,7 +225,7 @@ def launch_demo():
|
|
| 209 |
transcription_type.change(
|
| 210 |
fn=lambda t: (
|
| 211 |
gr.update(visible=t == "Full Audio"),
|
| 212 |
-
gr.update(visible=t == "Interval"),
|
| 213 |
),
|
| 214 |
inputs=transcription_type,
|
| 215 |
outputs=[full_audio_section, interval_section],
|
|
@@ -226,7 +242,7 @@ def launch_demo():
|
|
| 226 |
full_transcribe_btn.click(
|
| 227 |
fn=load_model_and_predict,
|
| 228 |
inputs=[model_name, full_audio, model_state],
|
| 229 |
-
outputs=[full_prediction, model_state
|
| 230 |
)
|
| 231 |
|
| 232 |
full_prediction.change(
|
|
@@ -236,25 +252,29 @@ def launch_demo():
|
|
| 236 |
)
|
| 237 |
|
| 238 |
full_textgrid_contents.change(
|
| 239 |
-
fn=
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
| 241 |
outputs=[full_download_btn],
|
| 242 |
)
|
| 243 |
|
|
|
|
| 244 |
full_reset_btn.click(
|
| 245 |
fn=lambda: (None, "", "", "", gr.update(interactive=False)),
|
| 246 |
-
outputs=[full_audio, full_prediction,
|
| 247 |
)
|
| 248 |
|
| 249 |
# Enable interval transcribe button only when both files are uploaded
|
| 250 |
interval_audio.change(
|
| 251 |
-
fn=
|
| 252 |
inputs=[interval_audio, interval_textgrid_file],
|
| 253 |
outputs=[interval_transcribe_btn],
|
| 254 |
)
|
| 255 |
|
| 256 |
interval_textgrid_file.change(
|
| 257 |
-
fn=
|
| 258 |
inputs=[interval_audio, interval_textgrid_file],
|
| 259 |
outputs=[interval_transcribe_btn],
|
| 260 |
)
|
|
@@ -273,8 +293,14 @@ def launch_demo():
|
|
| 273 |
)
|
| 274 |
|
| 275 |
interval_result.change(
|
| 276 |
-
fn=lambda tg_text: gr.update(
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
outputs=[interval_download_btn],
|
| 279 |
)
|
| 280 |
|
|
@@ -286,4 +312,4 @@ def launch_demo():
|
|
| 286 |
demo.launch(max_file_size="100mb")
|
| 287 |
|
| 288 |
if __name__ == "__main__":
|
| 289 |
-
launch_demo()
|
|
|
|
| 47 |
audio_in: str,
|
| 48 |
model_state: dict,
|
| 49 |
):
|
| 50 |
+
try:
|
| 51 |
+
if audio_in is None:
|
| 52 |
+
return (
|
| 53 |
+
"",
|
| 54 |
+
model_state,
|
| 55 |
+
gr.Textbox(label=TEXTGRID_NAME_INPUT_LABEL, interactive=False),
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
if model_state["model_name"] != model_name:
|
| 59 |
+
model_state = {
|
| 60 |
+
"loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
|
| 61 |
+
"model_name": model_name,
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
prediction = model_state["loaded_model"](audio_in)["text"]
|
| 65 |
+
return prediction, model_state
|
| 66 |
+
except Exception as e:
|
| 67 |
+
raise gr.Error(f"Failed to load model: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
|
| 70 |
def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
|
|
|
|
| 137 |
return gr.update(choices=tier_names, value=tier_names[0] if tier_names else None)
|
| 138 |
except Exception as e:
|
| 139 |
return gr.update(choices=[], value=None)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def validate_textgrid_for_intervals(audio_path, textgrid_file):
    """Decide whether the interval transcribe button may be enabled.

    Compares the latest tier end time in the uploaded TextGrid with the
    duration of the uploaded audio.

    Args:
        audio_path: Path to the uploaded audio file; falsy when no audio has
            been provided yet.
        textgrid_file: The uploaded TextGrid; falsy when none has been
            provided yet. Either an object exposing the file path as
            ``.name`` or the path itself is accepted.

    Returns:
        ``gr.update(interactive=False)`` while either input is missing,
        otherwise ``gr.update(interactive=True)``.

    Raises:
        gr.Error: If either file cannot be read, the TextGrid has no tiers,
            or the TextGrid ends later than the audio (beyond tolerance).
    """
    if not audio_path or not textgrid_file:
        return gr.update(interactive=False)

    # Tolerance (seconds) for rounding differences between the duration
    # reported for the audio and the end time stored in the TextGrid.
    epsilon = 0.01

    # Only the file reading/parsing lives in the ``try``: the deliberate
    # gr.Error raised further down must not be caught and re-wrapped as an
    # "Invalid TextGrid or audio file" error.
    try:
        audio_duration = librosa.get_duration(path=audio_path)
        # Fall back to the value itself when it has no ``.name`` attribute
        # (i.e. it is already a path).
        textgrid_path = getattr(textgrid_file, "name", textgrid_file)
        tg = tgt.io.read_textgrid(textgrid_path)
        # ``default=None`` avoids an opaque ValueError on a tier-less file.
        tg_end_time = max((tier.end_time for tier in tg.tiers), default=None)
    except Exception as e:
        raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}") from e

    if tg_end_time is None:
        raise gr.Error("Invalid TextGrid or audio file:\nTextGrid contains no tiers.")

    # Apply the same tolerance as the warning below, so a TextGrid that
    # overshoots the audio only by rounding noise is not rejected outright.
    # NOTE(review): raising leaves the transcribe button in its previous
    # state rather than disabling it -- confirm that is the desired UX.
    if tg_end_time - audio_duration > epsilon:
        raise gr.Error(
            f"TextGrid ends at {tg_end_time:.2f}s but audio is only {audio_duration:.2f}s. "
            "Please upload matching files."
        )

    # The audio extends past the last annotation: allowed, but tell the user
    # that the trailing audio will be ignored.
    if audio_duration - tg_end_time > epsilon:
        gr.Warning(
            f"TextGrid ends at {tg_end_time:.2f}s but audio is {audio_duration:.2f}s. "
            "Only the annotated portion will be transcribed."
        )

    return gr.update(interactive=True)
|
| 168 |
|
| 169 |
|
| 170 |
def launch_demo():
|
|
|
|
| 175 |
"model_name": DEFAULT_MODEL,
|
| 176 |
}
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
with gr.Blocks() as demo:
|
| 179 |
gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
|
| 180 |
This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")
|
|
|
|
| 189 |
|
| 190 |
# Dropdown for transcription type selection
|
| 191 |
transcription_type = gr.Dropdown(
|
| 192 |
+
choices=["Full Audio", "TextGrid Interval"],
|
| 193 |
label="Transcription Type",
|
| 194 |
value=None,
|
| 195 |
interactive=True,
|
|
|
|
| 204 |
full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
|
| 205 |
|
| 206 |
full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
|
|
|
|
| 207 |
|
| 208 |
full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
|
| 209 |
full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
|
|
|
|
| 225 |
transcription_type.change(
|
| 226 |
fn=lambda t: (
|
| 227 |
gr.update(visible=t == "Full Audio"),
|
| 228 |
+
gr.update(visible=t == "TextGrid Interval"),
|
| 229 |
),
|
| 230 |
inputs=transcription_type,
|
| 231 |
outputs=[full_audio_section, interval_section],
|
|
|
|
| 242 |
full_transcribe_btn.click(
|
| 243 |
fn=load_model_and_predict,
|
| 244 |
inputs=[model_name, full_audio, model_state],
|
| 245 |
+
outputs=[full_prediction, model_state],
|
| 246 |
)
|
| 247 |
|
| 248 |
full_prediction.change(
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
full_textgrid_contents.change(
|
| 255 |
+
fn=lambda tg_text, audio_path: get_interactive_download_button(
|
| 256 |
+
tg_text,
|
| 257 |
+
Path(audio_path).with_suffix(".TextGrid").name if audio_path else "output.TextGrid"
|
| 258 |
+
),
|
| 259 |
+
inputs=[full_textgrid_contents, full_audio],
|
| 260 |
outputs=[full_download_btn],
|
| 261 |
)
|
| 262 |
|
| 263 |
+
|
| 264 |
full_reset_btn.click(
|
| 265 |
fn=lambda: (None, "", "", "", gr.update(interactive=False)),
|
| 266 |
+
outputs=[full_audio, full_prediction, full_textgrid_contents, full_download_btn],
|
| 267 |
)
|
| 268 |
|
| 269 |
# Enable interval transcribe button only when both files are uploaded
|
| 270 |
interval_audio.change(
|
| 271 |
+
fn=validate_textgrid_for_intervals,
|
| 272 |
inputs=[interval_audio, interval_textgrid_file],
|
| 273 |
outputs=[interval_transcribe_btn],
|
| 274 |
)
|
| 275 |
|
| 276 |
interval_textgrid_file.change(
|
| 277 |
+
fn=validate_textgrid_for_intervals,
|
| 278 |
inputs=[interval_audio, interval_textgrid_file],
|
| 279 |
outputs=[interval_transcribe_btn],
|
| 280 |
)
|
|
|
|
| 293 |
)
|
| 294 |
|
| 295 |
interval_result.change(
|
| 296 |
+
fn=lambda tg_text, audio_path: gr.update(
|
| 297 |
+
value=write_textgrid(
|
| 298 |
+
tg_text,
|
| 299 |
+
Path(audio_path).with_suffix(".TextGrid").name
|
| 300 |
+
),
|
| 301 |
+
interactive=True,
|
| 302 |
+
),
|
| 303 |
+
inputs=[interval_result, interval_audio],
|
| 304 |
outputs=[interval_download_btn],
|
| 305 |
)
|
| 306 |
|
|
|
|
| 312 |
demo.launch(max_file_size="100mb")
|
| 313 |
|
| 314 |
if __name__ == "__main__":
|
| 315 |
+
launch_demo()
|