Spaces:
Running
Running
Hmm...
Browse files
app.py
CHANGED
|
@@ -407,12 +407,22 @@ with gr.Blocks(
|
|
| 407 |
with gr.Tab("Roformer"):
|
| 408 |
with gr.Group():
|
| 409 |
with gr.Row():
|
| 410 |
-
roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()))
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
with gr.Row():
|
| 417 |
roformer_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 418 |
with gr.Row():
|
|
@@ -424,12 +434,22 @@ with gr.Blocks(
|
|
| 424 |
with gr.Tab("MDX23C"):
|
| 425 |
with gr.Group():
|
| 426 |
with gr.Row():
|
| 427 |
-
mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()))
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
with gr.Row():
|
| 434 |
mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 435 |
with gr.Row():
|
|
@@ -441,12 +461,23 @@ with gr.Blocks(
|
|
| 441 |
with gr.Tab("MDX-NET"):
|
| 442 |
with gr.Group():
|
| 443 |
with gr.Row():
|
| 444 |
-
mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()))
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
with gr.Row():
|
| 451 |
mdx_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 452 |
with gr.Row():
|
|
@@ -458,14 +489,27 @@ with gr.Blocks(
|
|
| 458 |
with gr.Tab("VR ARCH"):
|
| 459 |
with gr.Group():
|
| 460 |
with gr.Row():
|
| 461 |
-
vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()))
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
with gr.Row():
|
| 470 |
vr_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 471 |
with gr.Row():
|
|
@@ -477,12 +521,22 @@ with gr.Blocks(
|
|
| 477 |
with gr.Tab("Demucs"):
|
| 478 |
with gr.Group():
|
| 479 |
with gr.Row():
|
| 480 |
-
demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()))
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
with gr.Row():
|
| 487 |
demucs_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 488 |
with gr.Row():
|
|
@@ -498,17 +552,10 @@ with gr.Blocks(
|
|
| 498 |
demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
|
| 499 |
|
| 500 |
with gr.Tab("Settings"):
|
| 501 |
-
with gr.
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
|
| 506 |
-
output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.")
|
| 507 |
-
with gr.Row():
|
| 508 |
-
norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
| 509 |
-
amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
| 510 |
-
with gr.Row():
|
| 511 |
-
batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
| 512 |
|
| 513 |
with gr.Accordion("Rename Stems", open=False):
|
| 514 |
gr.Markdown(
|
|
@@ -569,10 +616,10 @@ with gr.Blocks(
|
|
| 569 |
roformer_pitch_shift,
|
| 570 |
model_file_dir,
|
| 571 |
output_dir,
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
vocals_stem,
|
| 577 |
instrumental_stem,
|
| 578 |
other_stem,
|
|
@@ -597,10 +644,10 @@ with gr.Blocks(
|
|
| 597 |
mdx23c_pitch_shift,
|
| 598 |
model_file_dir,
|
| 599 |
output_dir,
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
vocals_stem,
|
| 605 |
instrumental_stem,
|
| 606 |
other_stem,
|
|
@@ -625,10 +672,10 @@ with gr.Blocks(
|
|
| 625 |
mdx_denoise,
|
| 626 |
model_file_dir,
|
| 627 |
output_dir,
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
vocals_stem,
|
| 633 |
instrumental_stem,
|
| 634 |
other_stem,
|
|
@@ -655,10 +702,10 @@ with gr.Blocks(
|
|
| 655 |
vr_high_end_process,
|
| 656 |
model_file_dir,
|
| 657 |
output_dir,
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
vocals_stem,
|
| 663 |
instrumental_stem,
|
| 664 |
other_stem,
|
|
@@ -683,9 +730,9 @@ with gr.Blocks(
|
|
| 683 |
demucs_segments_enabled,
|
| 684 |
model_file_dir,
|
| 685 |
output_dir,
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
vocals_stem,
|
| 690 |
instrumental_stem,
|
| 691 |
other_stem,
|
|
|
|
| 407 |
with gr.Tab("Roformer"):
|
| 408 |
with gr.Group():
|
| 409 |
with gr.Row():
|
| 410 |
+
roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()), scale=3)
|
| 411 |
+
roformer_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
| 412 |
+
with gr.Accordion("Advanced settings", open=False):
|
| 413 |
+
with gr.Column():
|
| 414 |
+
with gr.Group():
|
| 415 |
+
roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
|
| 416 |
+
roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
|
| 417 |
+
with gr.Group():
|
| 418 |
+
with gr.Row():
|
| 419 |
+
roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
|
| 420 |
+
roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
|
| 421 |
+
with gr.Group():
|
| 422 |
+
with gr.Row():
|
| 423 |
+
roformer_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
| 424 |
+
roformer_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
| 425 |
+
roformer_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
| 426 |
with gr.Row():
|
| 427 |
roformer_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 428 |
with gr.Row():
|
|
|
|
| 434 |
with gr.Tab("MDX23C"):
|
| 435 |
with gr.Group():
|
| 436 |
with gr.Row():
|
| 437 |
+
mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()), scale=3)
|
| 438 |
+
mdx23c_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
| 439 |
+
with gr.Accordion("Advanced settings", open=False):
|
| 440 |
+
with gr.Column():
|
| 441 |
+
with gr.Group():
|
| 442 |
+
mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
|
| 443 |
+
mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
|
| 444 |
+
with gr.Group():
|
| 445 |
+
with gr.Row():
|
| 446 |
+
mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
|
| 447 |
+
mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
|
| 448 |
+
with gr.Group():
|
| 449 |
+
with gr.Row():
|
| 450 |
+
mdx23c_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
| 451 |
+
mdx23c_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
| 452 |
+
mdx23c_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
| 453 |
with gr.Row():
|
| 454 |
mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 455 |
with gr.Row():
|
|
|
|
| 461 |
with gr.Tab("MDX-NET"):
|
| 462 |
with gr.Group():
|
| 463 |
with gr.Row():
|
| 464 |
+
mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()), scale=3)
|
| 465 |
+
mdx_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
| 466 |
+
with gr.Accordion("Advanced settings", open=False):
|
| 467 |
+
with gr.Column():
|
| 468 |
+
with gr.Group():
|
| 469 |
+
with gr.Row():
|
| 470 |
+
mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
|
| 471 |
+
mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
|
| 472 |
+
with gr.Group():
|
| 473 |
+
with gr.Row():
|
| 474 |
+
mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
|
| 475 |
+
mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
|
| 476 |
+
with gr.Group():
|
| 477 |
+
with gr.Row():
|
| 478 |
+
mdx_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
| 479 |
+
mdx_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
| 480 |
+
mdx_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
| 481 |
with gr.Row():
|
| 482 |
mdx_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 483 |
with gr.Row():
|
|
|
|
| 489 |
with gr.Tab("VR ARCH"):
|
| 490 |
with gr.Group():
|
| 491 |
with gr.Row():
|
| 492 |
+
vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()), scale=3)
|
| 493 |
+
vr_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
| 494 |
+
with gr.Accordion("Advanced settings", open=False):
|
| 495 |
+
with gr.Column():
|
| 496 |
+
with gr.Group():
|
| 497 |
+
with gr.Row():
|
| 498 |
+
vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
|
| 499 |
+
vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Agression", info="Intensity of primary stem extraction.")
|
| 500 |
+
with gr.Group():
|
| 501 |
+
with gr.Column():
|
| 502 |
+
vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
|
| 503 |
+
vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
|
| 504 |
+
with gr.Group():
|
| 505 |
+
with gr.Row():
|
| 506 |
+
vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
|
| 507 |
+
vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
|
| 508 |
+
with gr.Group():
|
| 509 |
+
with gr.Row():
|
| 510 |
+
vr_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
| 511 |
+
vr_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
| 512 |
+
vr_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
| 513 |
with gr.Row():
|
| 514 |
vr_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 515 |
with gr.Row():
|
|
|
|
| 521 |
with gr.Tab("Demucs"):
|
| 522 |
with gr.Group():
|
| 523 |
with gr.Row():
|
| 524 |
+
demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()), scale=3)
|
| 525 |
+
demucs_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
| 526 |
+
with gr.Accordion("Advanced settings", open=False):
|
| 527 |
+
with gr.Column():
|
| 528 |
+
with gr.Group():
|
| 529 |
+
with gr.Row():
|
| 530 |
+
demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
|
| 531 |
+
demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
|
| 532 |
+
with gr.Group():
|
| 533 |
+
with gr.Row():
|
| 534 |
+
demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
|
| 535 |
+
demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
|
| 536 |
+
with gr.Group():
|
| 537 |
+
with gr.Row():
|
| 538 |
+
demucs_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
| 539 |
+
demucs_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
| 540 |
with gr.Row():
|
| 541 |
demucs_audio = gr.Audio(label="Input Audio", type="filepath")
|
| 542 |
with gr.Row():
|
|
|
|
| 552 |
demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
|
| 553 |
|
| 554 |
with gr.Tab("Settings"):
|
| 555 |
+
with gr.Group():
|
| 556 |
+
with gr.Row():
|
| 557 |
+
model_file_dir = gr.Textbox(value="/tmp/PolUVR-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/PolUVR-models/")
|
| 558 |
+
output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
| 560 |
with gr.Accordion("Rename Stems", open=False):
|
| 561 |
gr.Markdown(
|
|
|
|
| 616 |
roformer_pitch_shift,
|
| 617 |
model_file_dir,
|
| 618 |
output_dir,
|
| 619 |
+
roformer_output_format,
|
| 620 |
+
roformer_norm_threshold,
|
| 621 |
+
roformer_amp_threshold,
|
| 622 |
+
roformer_batch_size,
|
| 623 |
vocals_stem,
|
| 624 |
instrumental_stem,
|
| 625 |
other_stem,
|
|
|
|
| 644 |
mdx23c_pitch_shift,
|
| 645 |
model_file_dir,
|
| 646 |
output_dir,
|
| 647 |
+
mdx23c_output_format,
|
| 648 |
+
mdx23c_norm_threshold,
|
| 649 |
+
mdx23c_amp_threshold,
|
| 650 |
+
mdx23c_batch_size,
|
| 651 |
vocals_stem,
|
| 652 |
instrumental_stem,
|
| 653 |
other_stem,
|
|
|
|
| 672 |
mdx_denoise,
|
| 673 |
model_file_dir,
|
| 674 |
output_dir,
|
| 675 |
+
mdx_output_format,
|
| 676 |
+
mdx_norm_threshold,
|
| 677 |
+
mdx_amp_threshold,
|
| 678 |
+
mdx_batch_size,
|
| 679 |
vocals_stem,
|
| 680 |
instrumental_stem,
|
| 681 |
other_stem,
|
|
|
|
| 702 |
vr_high_end_process,
|
| 703 |
model_file_dir,
|
| 704 |
output_dir,
|
| 705 |
+
vr_output_format,
|
| 706 |
+
vr_norm_threshold,
|
| 707 |
+
vr_amp_threshold,
|
| 708 |
+
vr_batch_size,
|
| 709 |
vocals_stem,
|
| 710 |
instrumental_stem,
|
| 711 |
other_stem,
|
|
|
|
| 730 |
demucs_segments_enabled,
|
| 731 |
model_file_dir,
|
| 732 |
output_dir,
|
| 733 |
+
demucs_output_format,
|
| 734 |
+
demucs_norm_threshold,
|
| 735 |
+
demucs_amp_threshold,
|
| 736 |
vocals_stem,
|
| 737 |
instrumental_stem,
|
| 738 |
other_stem,
|