Spaces:

Politrees
/

audio-separator_UVR

Running

App Files Community

Politrees committed on Jan 11, 2025

Commit

69836a2

verified ·

1 Parent(s): 2b7c9e8

Hmm...

Browse files

Files changed (1) hide show

app.py +109 -62

app.py CHANGED Viewed

@@ -407,12 +407,22 @@ with gr.Blocks(
     with gr.Tab("Roformer"):
         with gr.Group():
             with gr.Row():
-                roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()))
-            with gr.Row():
-                roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
-                roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
-                roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
-                roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
         with gr.Row():
             roformer_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
@@ -424,12 +434,22 @@ with gr.Blocks(
     with gr.Tab("MDX23C"):
         with gr.Group():
             with gr.Row():
-                mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()))
-            with gr.Row():
-                mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
-                mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
-                mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
-                mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
         with gr.Row():
             mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
@@ -441,12 +461,23 @@ with gr.Blocks(
     with gr.Tab("MDX-NET"):
         with gr.Group():
             with gr.Row():
-                mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()))
-            with gr.Row():
-                mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
-                mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
-                mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
-                mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
         with gr.Row():
             mdx_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
@@ -458,14 +489,27 @@ with gr.Blocks(
     with gr.Tab("VR ARCH"):
         with gr.Group():
             with gr.Row():
-                vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()))
-            with gr.Row():
-                vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
-                vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Agression", info="Intensity of primary stem extraction.")
-                vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
-                vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
-                vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
-                vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
         with gr.Row():
             vr_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
@@ -477,12 +521,22 @@ with gr.Blocks(
     with gr.Tab("Demucs"):
         with gr.Group():
             with gr.Row():
-                demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()))
-            with gr.Row():
-                demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
-                demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
-                demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
-                demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
         with gr.Row():
             demucs_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
@@ -498,17 +552,10 @@ with gr.Blocks(
             demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
     with gr.Tab("Settings"):
-        with gr.Accordion("General settings", open=False):
-          with gr.Group():
-              model_file_dir = gr.Textbox(value="/tmp/PolUVR-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/PolUVR-models/")
-              with gr.Row():
-                  output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
-                  output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.")
-              with gr.Row():
-                  norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
-                  amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
-              with gr.Row():
-                  batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
         with gr.Accordion("Rename Stems", open=False):
             gr.Markdown(
@@ -569,10 +616,10 @@ with gr.Blocks(
             roformer_pitch_shift,
             model_file_dir,
             output_dir,
-            output_format,
-            norm_threshold,
-            amp_threshold,
-            batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
@@ -597,10 +644,10 @@ with gr.Blocks(
             mdx23c_pitch_shift,
             model_file_dir,
             output_dir,
-            output_format,
-            norm_threshold,
-            amp_threshold,
-            batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
@@ -625,10 +672,10 @@ with gr.Blocks(
             mdx_denoise,
             model_file_dir,
             output_dir,
-            output_format,
-            norm_threshold,
-            amp_threshold,
-            batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
@@ -655,10 +702,10 @@ with gr.Blocks(
             vr_high_end_process,
             model_file_dir,
             output_dir,
-            output_format,
-            norm_threshold,
-            amp_threshold,
-            batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
@@ -683,9 +730,9 @@ with gr.Blocks(
             demucs_segments_enabled,
             model_file_dir,
             output_dir,
-            output_format,
-            norm_threshold,
-            amp_threshold,
             vocals_stem,
             instrumental_stem,
             other_stem,

     with gr.Tab("Roformer"):
         with gr.Group():
             with gr.Row():
+                roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()), scale=3)
+                roformer_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
+            with gr.Accordion("Advanced settings", open=False):
+                with gr.Column():
+                    with gr.Group():
+                        roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
+                        roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
+                    with gr.Group():
+                        with gr.Row():
+                            roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
+                            roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
+                    with gr.Group():
+                        with gr.Row():
+                            roformer_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
+                            roformer_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
+                            roformer_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
         with gr.Row():
             roformer_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
     with gr.Tab("MDX23C"):
         with gr.Group():
             with gr.Row():
+                mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()), scale=3)
+                mdx23c_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
+            with gr.Accordion("Advanced settings", open=False):
+                with gr.Column():
+                    with gr.Group():
+                        mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
+                        mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
+                    with gr.Group():
+                        with gr.Row():
+                            mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
+                            mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
+                    with gr.Group():
+                        with gr.Row():
+                            mdx23c_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
+                            mdx23c_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
+                            mdx23c_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
         with gr.Row():
             mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
     with gr.Tab("MDX-NET"):
         with gr.Group():
             with gr.Row():
+                mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()), scale=3)
+                mdx_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
+            with gr.Accordion("Advanced settings", open=False):
+                with gr.Column():
+                    with gr.Group():
+                        with gr.Row():
+                            mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
+                            mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
+                    with gr.Group():
+                        with gr.Row():
+                            mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
+                            mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
+                    with gr.Group():
+                        with gr.Row():
+                            mdx_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
+                            mdx_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
+                            mdx_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
         with gr.Row():
             mdx_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
     with gr.Tab("VR ARCH"):
         with gr.Group():
             with gr.Row():
+                vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()), scale=3)
+                vr_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
+            with gr.Accordion("Advanced settings", open=False):
+                with gr.Column():
+                    with gr.Group():
+                        with gr.Row():
+                            vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
+                            vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Agression", info="Intensity of primary stem extraction.")
+                    with gr.Group():
+                        with gr.Column():
+                            vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
+                            vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
+                    with gr.Group():
+                        with gr.Row():
+                            vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
+                            vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
+                    with gr.Group():
+                        with gr.Row():
+                            vr_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
+                            vr_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
+                            vr_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
         with gr.Row():
             vr_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
     with gr.Tab("Demucs"):
         with gr.Group():
             with gr.Row():
+                demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()), scale=3)
+                demucs_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
+            with gr.Accordion("Advanced settings", open=False):
+                with gr.Column():
+                    with gr.Group():
+                        with gr.Row():
+                            demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
+                            demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
+                    with gr.Group():
+                        with gr.Row():
+                            demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
+                            demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
+                    with gr.Group():
+                        with gr.Row():
+                            demucs_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
+                            demucs_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
         with gr.Row():
             demucs_audio = gr.Audio(label="Input Audio", type="filepath")
         with gr.Row():
             demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
     with gr.Tab("Settings"):
+        with gr.Group():
+            with gr.Row():
+                model_file_dir = gr.Textbox(value="/tmp/PolUVR-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/PolUVR-models/")
+                output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
         with gr.Accordion("Rename Stems", open=False):
             gr.Markdown(
             roformer_pitch_shift,
             model_file_dir,
             output_dir,
+            roformer_output_format,
+            roformer_norm_threshold,
+            roformer_amp_threshold,
+            roformer_batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
             mdx23c_pitch_shift,
             model_file_dir,
             output_dir,
+            mdx23c_output_format,
+            mdx23c_norm_threshold,
+            mdx23c_amp_threshold,
+            mdx23c_batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
             mdx_denoise,
             model_file_dir,
             output_dir,
+            mdx_output_format,
+            mdx_norm_threshold,
+            mdx_amp_threshold,
+            mdx_batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
             vr_high_end_process,
             model_file_dir,
             output_dir,
+            vr_output_format,
+            vr_norm_threshold,
+            vr_amp_threshold,
+            vr_batch_size,
             vocals_stem,
             instrumental_stem,
             other_stem,
             demucs_segments_enabled,
             model_file_dir,
             output_dir,
+            demucs_output_format,
+            demucs_norm_threshold,
+            demucs_amp_threshold,
             vocals_stem,
             instrumental_stem,
             other_stem,