Spaces:
Running
Running
app.py
CHANGED
|
@@ -2666,335 +2666,6 @@ def _pair_audio_tracks_and_gain(
|
|
| 2666 |
if audio_track
|
| 2667 |
]
|
| 2668 |
|
| 2669 |
-
@cache
def _get_voice_converter() -> VoiceConverter:
    """
    Create the voice converter used for RVC inference.

    The function is wrapped in ``cache``, so the instance built on the
    first call is handed back on subsequent calls.

    Returns
    -------
    VoiceConverter
        A voice converter instance.

    """
    # Imported at call time instead of at module import time.
    from ultimate_rvc.rvc.infer.infer import VoiceConverter  # noqa: PLC0415

    converter = VoiceConverter()
    return converter
|
| 2683 |
-
|
| 2684 |
-
|
| 2685 |
-
def convert(
    audio_track: StrPath,
    directory: StrPath,
    model_name: str,
    n_octaves: int = 0,
    n_semitones: int = 0,
    f0_methods: Sequence[F0Method] | None = None,
    index_rate: float = 0.3,
    rms_mix_rate: float = 1.0,
    protect_rate: float = 0.33,
    hop_length: int = 128,
    split_audio: bool = False,
    autotune_audio: bool = False,
    autotune_strength: float = 1.0,
    clean_audio: bool = False,
    clean_strength: float = 0.7,
    embedder_model: EmbedderModel = EmbedderModel.CONTENTVEC,
    custom_embedder_model: str | None = None,
    sid: int = 0,
    content_type: RVCContentType = RVCContentType.AUDIO,
    make_directory: bool = False,
) -> Path:
    """
    Run RVC voice conversion on an audio track.

    The conversion settings are collected into a metadata dictionary
    that determines the output file names. When both the converted
    ``.wav`` file and its ``.json`` metadata file already exist for
    those settings, the conversion is skipped and the existing audio
    path is returned.

    Parameters
    ----------
    audio_track : StrPath
        The path to the audio track to convert.
    directory : StrPath
        The path to the directory where the converted audio track
        will be saved.
    model_name : str
        The name of the voice model to use for the conversion.
    n_octaves : int, default=0
        The number of octaves to pitch-shift the converted audio by.
        Combined with `n_semitones` as ``n_octaves * 12 + n_semitones``.
    n_semitones : int, default=0
        The number of semitones to pitch-shift the converted audio by.
    f0_methods : Sequence[F0Method], optional
        The methods to use for pitch extraction. If None or empty,
        rmvpe is used.
    index_rate : float, default=0.3
        The influence of the index file on the voice conversion.
    rms_mix_rate : float, default=1.0
        The blending rate of the volume envelope of the converted
        audio.
    protect_rate : float, default=0.33
        The protection rate for consonants and breathing sounds.
    hop_length : int, default=128
        The hop length to use for CREPE-based pitch extraction.
    split_audio : bool, default=False
        Whether to split the audio track into smaller segments before
        converting it.
    autotune_audio : bool, default=False
        Whether to apply autotune to the converted audio.
    autotune_strength : float, default=1.0
        The strength of the autotune to apply to the converted audio.
    clean_audio : bool, default=False
        Whether to clean the converted audio.
    clean_strength : float, default=0.7
        The intensity of the cleaning to apply to the converted audio.
    embedder_model : EmbedderModel, default=EmbedderModel.CONTENTVEC
        The model to use for generating speaker embeddings.
    custom_embedder_model : str, optional
        The name of a custom embedder model to use for generating
        speaker embeddings. Only validated when `embedder_model` is
        ``EmbedderModel.CUSTOM``.
    sid : int, default=0
        The speaker id to use for multi-speaker models.
    content_type : RVCContentType, default=RVCContentType.AUDIO
        The type of content to convert. Determines what is shown in
        display messages and saved file names.
    make_directory : bool, default=False
        Whether to create the directory where the converted audio
        track will be saved if it does not exist.

    Returns
    -------
    Path
        The path to the converted audio track.

    """
    # Choose the entities passed to the validators for this content type.
    if content_type == RVCContentType.VOCALS:
        track_entity, directory_entity = Entity.VOCALS_TRACK, Entity.SONG_DIR
    elif content_type == RVCContentType.VOICE:
        track_entity, directory_entity = Entity.VOICE_TRACK, Entity.DIRECTORY
    elif content_type == RVCContentType.SPEECH:
        track_entity, directory_entity = Entity.SPEECH_TRACK, Entity.DIRECTORY
    elif content_type == RVCContentType.AUDIO:
        track_entity, directory_entity = Entity.AUDIO_TRACK, Entity.DIRECTORY

    source_path = validate_audio_file_exists(audio_track, track_entity)
    if make_directory:
        Path(directory).mkdir(parents=True, exist_ok=True)
    output_dir = validate_audio_dir_exists(directory, directory_entity)
    validate_model(model_name, Entity.VOICE_MODEL)

    # The custom embedder is only looked up when it was actually selected.
    embedder_path = (
        validate_model(custom_embedder_model, Entity.CUSTOM_EMBEDDER_MODEL)
        if embedder_model == EmbedderModel.CUSTOM
        else None
    )

    source_path = wavify(
        source_path,
        output_dir,
        "20_Input",
        accepted_formats={AudioExt.M4A, AudioExt.AAC},
    )

    # Fold the octave shift into a single semitone count.
    total_semitones = n_octaves * 12 + n_semitones
    if f0_methods:
        pitch_methods = set(f0_methods)
    else:
        pitch_methods = {F0Method.RMVPE}

    source_metadata = FileMetaData(
        name=source_path.name,
        hash_id=get_file_hash(source_path),
    )
    metadata = RVCAudioMetaData(
        audio_track=source_metadata,
        model_name=model_name,
        n_semitones=total_semitones,
        f0_methods=sorted(pitch_methods),
        index_rate=index_rate,
        rms_mix_rate=rms_mix_rate,
        protect_rate=protect_rate,
        hop_length=hop_length,
        split_audio=split_audio,
        autotune_audio=autotune_audio,
        autotune_strength=autotune_strength,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        embedder_model=embedder_model,
        custom_embedder_model=custom_embedder_model,
        sid=sid,
    ).model_dump()

    # Audio file and metadata file share one settings-derived base name.
    base_name = f"21_{content_type.capitalize()}_Converted"
    converted_audio_path = get_unique_base_path(
        output_dir,
        base_name,
        metadata,
    ).with_suffix(".wav")
    converted_audio_json_path = get_unique_base_path(
        output_dir,
        base_name,
        metadata,
    ).with_suffix(".json")

    # Nothing to do when both files from an earlier run are present.
    if converted_audio_path.exists() and converted_audio_json_path.exists():
        return converted_audio_path

    rvc_model_path, rvc_index_path = _get_rvc_files(model_name)
    converter = _get_voice_converter()
    converter.convert_audio(
        audio_input_path=str(source_path),
        audio_output_path=str(converted_audio_path),
        model_path=str(rvc_model_path),
        index_path=str(rvc_index_path) if rvc_index_path else "",
        pitch=total_semitones,
        f0_methods=pitch_methods,
        index_rate=index_rate,
        volume_envelope=rms_mix_rate,
        protect=protect_rate,
        hop_length=hop_length,
        split_audio=split_audio,
        f0_autotune=autotune_audio,
        f0_autotune_strength=autotune_strength,
        embedder_model=embedder_model,
        embedder_model_custom=(
            None if embedder_path is None else str(embedder_path)
        ),
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        post_process=False,
        resample_sr=0,
        sid=sid,
    )
    json_dump(metadata, converted_audio_json_path)
    return converted_audio_path
|
| 2867 |
-
|
| 2868 |
-
|
| 2869 |
-
@cache
def _get_audio_separator(
    output_dir: StrPath = INTERMEDIATE_AUDIO_BASE_DIR,
    output_format: str = AudioExt.WAV,
    segment_size: int = SegmentSize.SEG_256,
    sample_rate: int = 44100,
) -> Separator:
    """
    Get an audio separator.

    The function is wrapped in ``cache``, so calling it again with the
    same arguments hands back the separator built on the first call.

    Parameters
    ----------
    output_dir : StrPath, default=INTERMEDIATE_AUDIO_BASE_DIR
        The directory to save the separated audio to.
    output_format : str, default=AudioExt.WAV
        The format to save the separated audio in.
    segment_size : int, default=SegmentSize.SEG_256
        The segment size to use for separation.
    sample_rate : int, default=44100
        The sample rate to use for separation.

    Returns
    -------
    Separator
        An audio separator.

    """
    # The docstring must be the first statement of the body; otherwise it
    # is a discarded string expression and ``__doc__`` stays ``None``.
    # NOTE(review): the ffmpeg paths are registered before audio_separator
    # is imported, presumably so ffmpeg is discoverable at import time —
    # keep this order.
    static_ffmpeg.add_paths(weak=True)
    from audio_separator.separator import Separator  # noqa: PLC0415

    return Separator(
        model_file_dir=SEPARATOR_MODELS_DIR,
        use_autocast=False,
        output_dir=output_dir,
        output_format=output_format,
        sample_rate=sample_rate,
        mdx_params={
            "hop_length": 1024,
            "segment_size": segment_size,
            "overlap": 0.25,
            "batch_size": 1,
            "enable_denoise": True,
        },
    )
|
| 2913 |
-
|
| 2914 |
-
|
| 2915 |
-
|
| 2916 |
-
@spaces.GPU(duration=zgpuduration)
def separate_audio(
    audio_track: StrPath,
    song_dir: StrPath,
    model_name: SeparationModel,
    segment_size: int,
) -> tuple[Path, Path]:
    """
    Split an audio track into a primary stem and a secondary stem.

    The output file names are derived from the input track, the model
    name and the segment size. When both stems and both of their
    ``.json`` metadata files already exist, separation is skipped and
    the existing stem paths are returned.

    Parameters
    ----------
    audio_track : StrPath
        The path to the audio track to separate.
    song_dir : StrPath
        The path to the song directory where the separated primary stem
        and secondary stem will be saved.
    model_name : SeparationModel
        The name of the model to use for audio separation.
    segment_size : int
        The segment size to use for audio separation.

    Returns
    -------
    primary_path : Path
        The path to the separated primary stem.
    secondary_path : Path
        The path to the separated secondary stem.

    """
    audio_path = validate_audio_file_exists(audio_track, Entity.AUDIO_TRACK)
    song_dir_path = validate_audio_dir_exists(song_dir, Entity.SONG_DIR)

    track_metadata = FileMetaData(
        name=audio_path.name,
        hash_id=get_file_hash(audio_path),
    )
    metadata = SeparatedAudioMetaData(
        audio_track=track_metadata,
        model_name=model_name,
        segment_size=segment_size,
    ).model_dump()

    # Order matters for the unpacking below: primary wav/json first,
    # then secondary wav/json.
    stem_files = []
    for stem_prefix in ("11_Stem_Primary", "11_Stem_Secondary"):
        for extension in (".wav", ".json"):
            base_path = get_unique_base_path(
                song_dir_path,
                stem_prefix,
                metadata,
            )
            stem_files.append(base_path.with_suffix(extension))

    primary_path, primary_json_path, secondary_path, secondary_json_path = (
        stem_files
    )

    already_separated = all(stem_file.exists() for stem_file in stem_files)
    if not already_separated:
        separator = _get_audio_separator(
            output_dir=song_dir_path,
            segment_size=segment_size,
        )
        separator.load_model(model_name)
        # Output names are given without extension.
        output_names = {
            separator.model_instance.primary_stem_name: str(
                primary_path.with_suffix(""),
            ),
            separator.model_instance.secondary_stem_name: str(
                secondary_path.with_suffix(""),
            ),
        }
        separator.separate(str(audio_path), custom_output_names=output_names)
        json_dump(metadata, primary_json_path)
        json_dump(metadata, secondary_json_path)

    return primary_path, secondary_path
|
| 2996 |
-
|
| 2997 |
-
|
| 2998 |
def run_pipeline2(
|
| 2999 |
source: str,
|
| 3000 |
model_name: str,
|
|
|
|
| 2666 |
if audio_track
|
| 2667 |
]
|
| 2668 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2669 |
def run_pipeline2(
|
| 2670 |
source: str,
|
| 2671 |
model_name: str,
|