juice500 commited on
Commit
2a05b7a
Β·
1 Parent(s): 22a75d9

improve readability

Browse files
Files changed (1) hide show
  1. app.py +38 -33
app.py CHANGED
@@ -264,33 +264,30 @@ def swap_synth(model_name):
264
  ENGINE.synth = VOCOS[model_name]
265
 
266
 
267
- with gr.Blocks(title="Speech Modification Demo") as demo:
268
-
269
  with gr.Row():
270
- with gr.Column(scale=1):
271
- gr.Markdown(
272
- """
273
- ## πŸŽ™οΈ Phonological Vector Demo
274
 
275
- Demonstration for the paper [[b]=[d]-[t]+[p]: Self-supervised Speech Models Discover Phonological Vector Arithmetic](https://arxiv.org/abs/2602.18899).
 
276
 
277
- Upload, record, or use the example audio.
278
- For the example words, we gave 0.25s margin to the start and end of the word.
279
- Inspect the spectrogram, select the time window, choose a phonological vector to apply, then hit **Run**.
280
- """
281
- )
 
 
 
 
 
 
 
 
282
 
283
- # ── Row 1: input audio ────────────────────────────────────────────────
284
- audio_dropdown = gr.Dropdown(
285
- choices=[w["text"] for w in EXAMPLE_WRD],
286
- label="Choose a word to modify (or record your own below)",
287
- value="Full sentence",
288
- interactive=True,
289
- )
290
-
291
  with gr.Column(scale=1):
292
- # ── Row 3: time selection + modification ──────────────────────────────
293
- gr.Markdown("### Modification hyperparameters")
294
  with gr.Row():
295
  start_time = gr.Number(label="Start (s)", value=0.0, precision=3, scale=1, interactive=True)
296
  stop_time = gr.Number(label="Stop (s)", value=1.0, precision=3, scale=1, interactive=True)
@@ -329,41 +326,49 @@ with gr.Blocks(title="Speech Modification Demo") as demo:
329
 
330
  with gr.Row():
331
  with gr.Column(scale=1):
332
- gr.Markdown("### Input")
 
 
 
 
 
 
333
  audio_input = gr.Audio(
334
- label="Input Audio",
335
  type="numpy",
336
  sources=["upload", "microphone"],
337
  recording=True,
338
- value=_read_audio(EXAMPLE_AUDIO),
339
  )
 
 
 
 
340
 
 
 
 
341
  trigger_source = gr.State(value=None)
 
342
  audio_input.upload(fn=lambda: "upload", inputs=[], outputs=[trigger_source])
343
  audio_input.stop_recording(fn=lambda: "record", inputs=[], outputs=[trigger_source])
344
- audio_dropdown.change(fn=lambda x: x, inputs=[audio_dropdown], outputs=[trigger_source])
345
 
346
  input_audio_plot = gr.Plot(
347
- label="Input Spectrogram",
348
  show_label=True,
349
- value=plot_spectrogram_original(_read_audio(EXAMPLE_AUDIO), "Full sentence"),
350
  elem_id="input-spectrogram-plot",
351
  )
352
  trigger_source.change(
353
  fn=_read_partial_audio,
354
  inputs=[audio_input, trigger_source],
355
  outputs=audio_input,
356
- )
357
- trigger_source.change(
358
  fn=plot_spectrogram_original,
359
  inputs=[audio_input, trigger_source],
360
  outputs=input_audio_plot,
361
  )
362
 
363
  with gr.Column(scale=1):
364
- gr.Markdown("### Output")
365
- audio_output = gr.Audio(label="Modified Audio", type="numpy", interactive=False)
366
- output_audio_plot = gr.Plot(label="Output Spectrogram", show_label=True)
367
 
368
  run_btn.click(
369
  fn=run_speech_edit,
 
264
  ENGINE.synth = VOCOS[model_name]
265
 
266
 
267
+ with gr.Blocks(title="Phonological Vector-based Speech Editing Demo") as demo:
 
268
  with gr.Row():
269
+ gr.Markdown("""
270
+ ## πŸŽ™οΈ Phonological Vector-based Speech Editing Demo
 
 
271
 
272
+ Demonstration for the paper [[b]=[d]-[t]+[p]: Self-supervised Speech Models Discover Phonological Vector Arithmetic](https://arxiv.org/abs/2602.18899).
273
+ This demo reproduces Experiment 2: Scale of Phonological Vectors, illustrating the controllability of speech editing by phonological vectors.
274
 
275
+ **Upload, record, or use the example audio (or word). Then, inspect the spectrogram, select the time window, choose a phonological vector to apply, then hit Run.**
276
+ (For the example words, we gave 0.25s margin to the start and end of the word.)""")
277
+
278
+ with gr.Row():
279
+ with gr.Column(scale=1):
280
+ gr.Markdown("""
281
+ ### Hyperparameters
282
+ - **Start / Stop (s)**: Time range (in seconds) over which the phonological vector is applied. Use the input spectrogram to identify the target phone's boundaries.
283
+ - **Lambda**: Strength of the phonological vector. Positive values strengthen the selected feature; negative values strengthen the opposite feature.
284
+ - **Vocos training dataset**: Training corpus used for the vocoder (Vocos) that resynthesizes the modified representation back to audio.
285
+ - **Vector extraction method**: How phonological vectors are estimated from S3M representations. Different options correspond to different training datasets or ways of calculating the vectors.
286
+ - **Phonological feature**: The phonological vector to add into the selected time window.
287
+ """)
288
 
 
 
 
 
 
 
 
 
289
  with gr.Column(scale=1):
290
+ gr.Markdown("""### Hyperparameters""")
 
291
  with gr.Row():
292
  start_time = gr.Number(label="Start (s)", value=0.0, precision=3, scale=1, interactive=True)
293
  stop_time = gr.Number(label="Stop (s)", value=1.0, precision=3, scale=1, interactive=True)
 
326
 
327
  with gr.Row():
328
  with gr.Column(scale=1):
329
+ gr.Markdown("### Input audio")
330
+ audio_dropdown = gr.Dropdown(
331
+ choices=[w["text"] for w in EXAMPLE_WRD],
332
+ label="Choose a word to modify (or record your own below)",
333
+ value=None,
334
+ interactive=True,
335
+ )
336
  audio_input = gr.Audio(
 
337
  type="numpy",
338
  sources=["upload", "microphone"],
339
  recording=True,
340
+ value=None,
341
  )
342
+ with gr.Column(scale=1):
343
+ gr.Markdown("### Output audio")
344
+ audio_output = gr.Audio(type="numpy", interactive=False)
345
+
346
 
347
+ with gr.Row():
348
+ with gr.Column(scale=1):
349
+ gr.Markdown("### Input spectrogram")
350
  trigger_source = gr.State(value=None)
351
+ audio_dropdown.change(fn=lambda x: x, inputs=[audio_dropdown], outputs=[trigger_source])
352
  audio_input.upload(fn=lambda: "upload", inputs=[], outputs=[trigger_source])
353
  audio_input.stop_recording(fn=lambda: "record", inputs=[], outputs=[trigger_source])
 
354
 
355
  input_audio_plot = gr.Plot(
 
356
  show_label=True,
 
357
  elem_id="input-spectrogram-plot",
358
  )
359
  trigger_source.change(
360
  fn=_read_partial_audio,
361
  inputs=[audio_input, trigger_source],
362
  outputs=audio_input,
363
+ ).then(
 
364
  fn=plot_spectrogram_original,
365
  inputs=[audio_input, trigger_source],
366
  outputs=input_audio_plot,
367
  )
368
 
369
  with gr.Column(scale=1):
370
+ gr.Markdown("### Output spectrogram")
371
+ output_audio_plot = gr.Plot(show_label=True)
 
372
 
373
  run_btn.click(
374
  fn=run_speech_edit,