Spaces:
Running
Running
improve readability
Browse files
app.py
CHANGED
|
@@ -264,33 +264,30 @@ def swap_synth(model_name):
|
|
| 264 |
ENGINE.synth = VOCOS[model_name]
|
| 265 |
|
| 266 |
|
| 267 |
-
with gr.Blocks(title="Speech
|
| 268 |
-
|
| 269 |
with gr.Row():
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
"""
|
| 273 |
-
## 🎙️ Phonological Vector Demo
|
| 274 |
|
| 275 |
-
|
|
|
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
-
# ── Row 1: input audio ────────────────────────────────────────────────
|
| 284 |
-
audio_dropdown = gr.Dropdown(
|
| 285 |
-
choices=[w["text"] for w in EXAMPLE_WRD],
|
| 286 |
-
label="Choose a word to modify (or record your own below)",
|
| 287 |
-
value="Full sentence",
|
| 288 |
-
interactive=True,
|
| 289 |
-
)
|
| 290 |
-
|
| 291 |
with gr.Column(scale=1):
|
| 292 |
-
#
|
| 293 |
-
gr.Markdown("### Modification hyperparameters")
|
| 294 |
with gr.Row():
|
| 295 |
start_time = gr.Number(label="Start (s)", value=0.0, precision=3, scale=1, interactive=True)
|
| 296 |
stop_time = gr.Number(label="Stop (s)", value=1.0, precision=3, scale=1, interactive=True)
|
|
@@ -329,41 +326,49 @@ with gr.Blocks(title="Speech Modification Demo") as demo:
|
|
| 329 |
|
| 330 |
with gr.Row():
|
| 331 |
with gr.Column(scale=1):
|
| 332 |
-
gr.Markdown("### Input")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
audio_input = gr.Audio(
|
| 334 |
-
label="Input Audio",
|
| 335 |
type="numpy",
|
| 336 |
sources=["upload", "microphone"],
|
| 337 |
recording=True,
|
| 338 |
-
value=
|
| 339 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
|
|
|
|
|
|
|
|
|
|
| 341 |
trigger_source = gr.State(value=None)
|
|
|
|
| 342 |
audio_input.upload(fn=lambda: "upload", inputs=[], outputs=[trigger_source])
|
| 343 |
audio_input.stop_recording(fn=lambda: "record", inputs=[], outputs=[trigger_source])
|
| 344 |
-
audio_dropdown.change(fn=lambda x: x, inputs=[audio_dropdown], outputs=[trigger_source])
|
| 345 |
|
| 346 |
input_audio_plot = gr.Plot(
|
| 347 |
-
label="Input Spectrogram",
|
| 348 |
show_label=True,
|
| 349 |
-
value=plot_spectrogram_original(_read_audio(EXAMPLE_AUDIO), "Full sentence"),
|
| 350 |
elem_id="input-spectrogram-plot",
|
| 351 |
)
|
| 352 |
trigger_source.change(
|
| 353 |
fn=_read_partial_audio,
|
| 354 |
inputs=[audio_input, trigger_source],
|
| 355 |
outputs=audio_input,
|
| 356 |
-
)
|
| 357 |
-
trigger_source.change(
|
| 358 |
fn=plot_spectrogram_original,
|
| 359 |
inputs=[audio_input, trigger_source],
|
| 360 |
outputs=input_audio_plot,
|
| 361 |
)
|
| 362 |
|
| 363 |
with gr.Column(scale=1):
|
| 364 |
-
gr.Markdown("### Output")
|
| 365 |
-
|
| 366 |
-
output_audio_plot = gr.Plot(label="Output Spectrogram", show_label=True)
|
| 367 |
|
| 368 |
run_btn.click(
|
| 369 |
fn=run_speech_edit,
|
|
|
|
| 264 |
ENGINE.synth = VOCOS[model_name]
|
| 265 |
|
| 266 |
|
| 267 |
+
with gr.Blocks(title="Phonological Vector-based Speech Editing Demo") as demo:
|
|
|
|
| 268 |
with gr.Row():
|
| 269 |
+
gr.Markdown("""
|
| 270 |
+
## 🎙️ Phonological Vector-based Speech Editing Demo
|
|
|
|
|
|
|
| 271 |
|
| 272 |
+
Demonstration for the paper [[b]=[d]-[t]+[p]: Self-supervised Speech Models Discover Phonological Vector Arithmetic](https://arxiv.org/abs/2602.18899).
|
| 273 |
+
This demo reproduces Experiment 2: Scale of Phonological Vectors, illustrating the controllability of speech editing by phonological vectors.
|
| 274 |
|
| 275 |
+
**Upload, record, or use the example audio (or word). Then, inspect the spectrogram, select the time window, choose a phonological vector to apply, then hit Run.**
|
| 276 |
+
(For the example words, we added a 0.25 s margin before the start and after the end of the word.)""")
|
| 277 |
+
|
| 278 |
+
with gr.Row():
|
| 279 |
+
with gr.Column(scale=1):
|
| 280 |
+
gr.Markdown("""
|
| 281 |
+
### Hyperparameters
|
| 282 |
+
- **Start / Stop (s)**: Time range (in seconds) over which the phonological vector is applied. Use the input spectrogram to identify the target phone's boundaries.
|
| 283 |
+
- **Lambda**: Strength of the phonological vector. Positive values strengthen the selected feature; negative values strengthen the opposite feature.
|
| 284 |
+
- **Vocos training dataset**: Training corpus used for the vocoder (Vocos) that resynthesizes the modified representation back to audio.
|
| 285 |
+
- **Vector extraction method**: How phonological vectors are estimated from S3M representations. Different options correspond to different training datasets or different ways of calculating the vectors.
|
| 286 |
+
- **Phonological feature**: The phonological vector to add into the selected time window.
|
| 287 |
+
""")
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
with gr.Column(scale=1):
|
| 290 |
+
gr.Markdown("""### Hyperparameters""")
|
|
|
|
| 291 |
with gr.Row():
|
| 292 |
start_time = gr.Number(label="Start (s)", value=0.0, precision=3, scale=1, interactive=True)
|
| 293 |
stop_time = gr.Number(label="Stop (s)", value=1.0, precision=3, scale=1, interactive=True)
|
|
|
|
| 326 |
|
| 327 |
with gr.Row():
|
| 328 |
with gr.Column(scale=1):
|
| 329 |
+
gr.Markdown("### Input audio")
|
| 330 |
+
audio_dropdown = gr.Dropdown(
|
| 331 |
+
choices=[w["text"] for w in EXAMPLE_WRD],
|
| 332 |
+
label="Choose a word to modify (or record your own below)",
|
| 333 |
+
value=None,
|
| 334 |
+
interactive=True,
|
| 335 |
+
)
|
| 336 |
audio_input = gr.Audio(
|
|
|
|
| 337 |
type="numpy",
|
| 338 |
sources=["upload", "microphone"],
|
| 339 |
recording=True,
|
| 340 |
+
value=None,
|
| 341 |
)
|
| 342 |
+
with gr.Column(scale=1):
|
| 343 |
+
gr.Markdown("### Output audio")
|
| 344 |
+
audio_output = gr.Audio(type="numpy", interactive=False)
|
| 345 |
+
|
| 346 |
|
| 347 |
+
with gr.Row():
|
| 348 |
+
with gr.Column(scale=1):
|
| 349 |
+
gr.Markdown("### Input spectrogram")
|
| 350 |
trigger_source = gr.State(value=None)
|
| 351 |
+
audio_dropdown.change(fn=lambda x: x, inputs=[audio_dropdown], outputs=[trigger_source])
|
| 352 |
audio_input.upload(fn=lambda: "upload", inputs=[], outputs=[trigger_source])
|
| 353 |
audio_input.stop_recording(fn=lambda: "record", inputs=[], outputs=[trigger_source])
|
|
|
|
| 354 |
|
| 355 |
input_audio_plot = gr.Plot(
|
|
|
|
| 356 |
show_label=True,
|
|
|
|
| 357 |
elem_id="input-spectrogram-plot",
|
| 358 |
)
|
| 359 |
trigger_source.change(
|
| 360 |
fn=_read_partial_audio,
|
| 361 |
inputs=[audio_input, trigger_source],
|
| 362 |
outputs=audio_input,
|
| 363 |
+
).then(
|
|
|
|
| 364 |
fn=plot_spectrogram_original,
|
| 365 |
inputs=[audio_input, trigger_source],
|
| 366 |
outputs=input_audio_plot,
|
| 367 |
)
|
| 368 |
|
| 369 |
with gr.Column(scale=1):
|
| 370 |
+
gr.Markdown("### Output spectrogram")
|
| 371 |
+
output_audio_plot = gr.Plot(show_label=True)
|
|
|
|
| 372 |
|
| 373 |
run_btn.click(
|
| 374 |
fn=run_speech_edit,
|