Voice-Changer

Runtime error

App Files Files Community

Shanuka01 committed on Nov 14, 2023

Commit

5174d65

1 Parent(s): c3d2147

Update app_multi.py

Browse files

Files changed (1) hide show

app_multi.py +467 -2

app_multi.py CHANGED Viewed

@@ -72,7 +72,6 @@ app_css = '''
     max-height: 100px;
     float: right;
 }
 #model_info p {
     margin: unset;
 }
@@ -354,4 +353,470 @@ def youtube_downloader(
     quiet = "--quiet --no-warnings" if quiet else ""
     command = f"""
-        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501

     max-height: 100px;
     float: right;
 }
 #model_info p {
     margin: unset;
 }
     quiet = "--quiet --no-warnings" if quiet else ""
     command = f"""
+        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
+    """.strip()
+    attempts = 0
+    while True:
+        try:
+            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError:
+            attempts += 1
+            if attempts == num_attempts:
+                return None
+        else:
+            break
+    if output_path.exists():
+        return output_path
+    else:
+        return None
+def audio_separated(audio_input, progress=gr.Progress()):
+    # start progress
+    progress(progress=0, desc="Starting...")
+    time.sleep(0.1)
+    # check file input
+    if audio_input is None:
+        # show progress
+        for i in progress.tqdm(range(100), desc="Please wait..."):
+            time.sleep(0.01)
+        return (None, None, 'Please input audio.')
+    # create filename
+    filename = str(random.randint(10000,99999))+datetime.now().strftime("%d%m%Y%H%M%S")
+    # progress
+    progress(progress=0.10, desc="Please wait...")
+    # make dir output
+    os.makedirs("output", exist_ok=True)
+    # progress
+    progress(progress=0.20, desc="Please wait...")
+    # write
+    if high_quality:
+        write(filename+".wav", audio_input[0], audio_input[1])
+    else:
+        write(filename+".mp3", audio_input[0], audio_input[1])
+    # progress
+    progress(progress=0.50, desc="Please wait...")
+    # demucs process
+    if high_quality:
+        command_demucs = "python3 -m demucs --two-stems=vocals -d cpu "+filename+".wav -o output"
+    else:
+        command_demucs = "python3 -m demucs --two-stems=vocals --mp3 --mp3-bitrate 128 -d cpu "+filename+".mp3 -o output"
+    os.system(command_demucs)
+    # progress
+    progress(progress=0.70, desc="Please wait...")
+    # remove file audio
+    if high_quality:
+        command_delete = "rm -v ./"+filename+".wav"
+    else:
+        command_delete = "rm -v ./"+filename+".mp3"
+    os.system(command_delete)
+    # progress
+    progress(progress=0.80, desc="Please wait...")
+    # progress
+    for i in progress.tqdm(range(80,100), desc="Please wait..."):
+        time.sleep(0.1)
+    if high_quality:
+        return "./output/htdemucs/"+filename+"/vocals.wav","./output/htdemucs/"+filename+"/no_vocals.wav","Successfully..."
+    else:
+        return "./output/htdemucs/"+filename+"/vocals.mp3","./output/htdemucs/"+filename+"/no_vocals.mp3","Successfully..."
+# https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118  # noqa
+def vc_func(
+    input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
+    filter_radius, rms_mix_rate, resample_option
+):
+    if input_audio is None:
+        return (None, 'Please provide input audio.')
+    if model_index is None:
+        return (None, 'Please select a model.')
+    model = loaded_models[model_index]
+    # Reference: so-vits
+    (audio_samp, audio_npy) = input_audio
+    # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
+    # Can be change well, we will see
+    if (audio_npy.shape[0] / audio_samp) > 600 and in_hf_space:
+        return (None, 'Input audio is longer than 600 secs.')
+    # Bloody hell: https://stackoverflow.com/questions/26921836/
+    if audio_npy.dtype != np.float32:  # :thonk:
+        audio_npy = (
+            audio_npy / np.iinfo(audio_npy.dtype).max
+        ).astype(np.float32)
+    if len(audio_npy.shape) > 1:
+        audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))
+    if audio_samp != 16000:
+        audio_npy = librosa.resample(
+            audio_npy,
+            orig_sr=audio_samp,
+            target_sr=16000
+        )
+    pitch_int = int(pitch_adjust)
+    resample = (
+        0 if resample_option == 'Disable resampling'
+        else int(resample_option)
+    )
+    times = [0, 0, 0]
+    checksum = hashlib.sha512()
+    checksum.update(audio_npy.tobytes())
+    output_audio = model['vc'].pipeline(
+        hubert_model,
+        model['net_g'],
+        model['metadata'].get('speaker_id', 0),
+        audio_npy,
+        checksum.hexdigest(),
+        times,
+        pitch_int,
+        f0_method,
+        path.join('model', model['name'], model['metadata']['feat_index']),
+        feat_ratio,
+        model['if_f0'],
+        filter_radius,
+        model['target_sr'],
+        resample,
+        rms_mix_rate,
+        'v2'
+    )
+    out_sr = (
+        resample if resample >= 16000 and model['target_sr'] != resample
+        else model['target_sr']
+    )
+    print(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')
+    return ((out_sr, output_audio), 'Success')
+async def edge_tts_vc_func(
+    input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
+    filter_radius, rms_mix_rate, resample_option
+):
+    if input_text is None:
+        return (None, 'Please provide TTS text.')
+    if tts_speaker is None:
+        return (None, 'Please select TTS speaker.')
+    if model_index is None:
+        return (None, 'Please select a model.')
+    speaker = tts_speakers_list[tts_speaker]['ShortName']
+    (tts_np, tts_sr) = await util.call_edge_tts(speaker, input_text)
+    return vc_func(
+        (tts_sr, tts_np),
+        model_index,
+        pitch_adjust,
+        f0_method,
+        feat_ratio,
+        filter_radius,
+        rms_mix_rate,
+        resample_option
+    )
+def update_model_info(model_index):
+    if model_index is None:
+        return str(
+            '### Model info\n'
+            'Please select a model from dropdown above.'
+        )
+    model = loaded_models[model_index]
+    model_icon = model['metadata'].get('icon', '')
+    return str(
+        '### Model info\n'
+        '![model icon]({icon})'
+        '**{name}**\n\n'
+        'Author: {author}\n\n'
+        'Source: {source}\n\n'
+        '{note}'
+    ).format(
+        name=model['metadata'].get('name'),
+        author=model['metadata'].get('author', 'Anonymous'),
+        source=model['metadata'].get('source', 'Unknown'),
+        note=model['metadata'].get('note', ''),
+        icon=(
+            model_icon
+            if model_icon.startswith(('http://', 'https://'))
+            else '/file/model/%s/%s' % (model['name'], model_icon)
+        )
+    )
+def _example_vc(
+    input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
+    filter_radius, rms_mix_rate, resample_option
+):
+    (audio, message) = vc_func(
+        input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
+        filter_radius, rms_mix_rate, resample_option
+    )
+    return (
+        audio,
+        message,
+        update_model_info(model_index)
+    )
+async def _example_edge_tts(
+    input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
+    filter_radius, rms_mix_rate, resample_option
+):
+    (audio, message) = await edge_tts_vc_func(
+        input_text, model_index, tts_speaker, pitch_adjust, f0_method,
+        feat_ratio, filter_radius, rms_mix_rate, resample_option
+    )
+    return (
+        audio,
+        message,
+        update_model_info(model_index)
+    )
+with app:
+    gr.HTML("<center>"
+            "<h1>🥳🎶🎡 - AI歌手，RVC歌声转换 + AI变声</h1>"
+            "</center>")
+    gr.Markdown("### <center>🦄 - 能够自动提取视频中的声音，并去除背景音；Powered by [RVC-Project](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)</center>")
+    gr.Markdown("### <center>更多精彩应用，敬请关注[滔滔AI](http://www.talktalkai.com)；滔滔AI，为爱滔滔！💕</center>")
+    with gr.Tab("🤗 - B站视频提取声音"):
+        with gr.Row():
+            with gr.Column():
+                ydl_url_input  = gr.Textbox(label="B站视频网址(可直接填写相应的BV号)", value = "https://www.bilibili.com/video/BV...")
+                start = gr.Number(value=0, label="起始时间 (秒)")
+                end = gr.Number(value=15, label="结束时间 (秒)")
+                ydl_url_submit = gr.Button("提取声音文件吧", variant="primary")
+                as_audio_submit = gr.Button("去除背景音吧", variant="primary")
+            with gr.Column():
+                ydl_audio_output = gr.Audio(label="Audio from Bilibili")
+                as_audio_input  = ydl_audio_output
+                as_audio_vocals    = gr.Audio(label="歌曲人声部分")
+                as_audio_no_vocals = gr.Audio(label="Music only", type="filepath", visible=False)
+                as_audio_message   = gr.Textbox(label="Message", visible=False)
+    ydl_url_submit.click(fn=youtube_downloader, inputs=[ydl_url_input, start, end], outputs=[ydl_audio_output])
+    as_audio_submit.click(fn=audio_separated, inputs=[as_audio_input], outputs=[as_audio_vocals, as_audio_no_vocals, as_audio_message], show_progress=True, queue=True)
+    with gr.Row():
+        with gr.Column():
+            with gr.Tab('🎶 - 歌声转换'):
+                input_audio = as_audio_vocals
+                vc_convert_btn = gr.Button('进行歌声转换吧！', variant='primary')
+                full_song = gr.Button("加入歌曲伴奏吧！", variant="primary")
+                new_song = gr.Audio(label="AI歌手+伴奏", type="filepath")
+            with gr.Tab('🎙️ - 文本转语音'):
+                tts_input = gr.Textbox(
+                    label='请填写您想要转换的文本(中英皆��)',
+                    lines=3
+                )
+                tts_speaker = gr.Dropdown(
+                    [
+                        '%s (%s)' % (
+                            s['FriendlyName'],
+                            s['Gender']
+                        )
+                        for s in tts_speakers_list
+                    ],
+                    label='请选择一个相应语言的说话人',
+                    type='index'
+                )
+                tts_convert_btn = gr.Button('进行AI变声吧', variant='primary')
+            with gr.Tab("📺 - 音乐视频"):
+                with gr.Row():
+                    with gr.Column():
+                        inp1 = gr.Textbox(label="为视频配上精彩的文案吧(选填;英文)")
+                        inp2 = new_song
+                        inp3 = gr.Image(source='upload', type='filepath', label="上传一张背景图片吧")
+                        btn = gr.Button("生成您的专属音乐视频吧", variant="primary")
+                    with gr.Column():
+                        out1 = gr.Video(label='您的专属音乐视频')
+            btn.click(fn=infer, inputs=[inp1, inp2, inp3], outputs=[out1])
+            pitch_adjust = gr.Slider(
+                label='Pitch',
+                minimum=-24,
+                maximum=24,
+                step=1,
+                value=0
+            )
+            f0_method = gr.Radio(
+                label='f0 methods',
+                choices=['pm', 'rmvpe'],
+                value='rmvpe',
+                interactive=True
+            )
+            with gr.Accordion('更多设置', open=False):
+                feat_ratio = gr.Slider(
+                    label='Feature ratio',
+                    minimum=0,
+                    maximum=1,
+                    step=0.1,
+                    value=0.6
+                )
+                filter_radius = gr.Slider(
+                    label='Filter radius',
+                    minimum=0,
+                    maximum=7,
+                    step=1,
+                    value=3
+                )
+                rms_mix_rate = gr.Slider(
+                    label='Volume envelope mix rate',
+                    minimum=0,
+                    maximum=1,
+                    step=0.1,
+                    value=1
+                )
+                resample_rate = gr.Dropdown(
+                    [
+                        'Disable resampling',
+                        '16000',
+                        '22050',
+                        '44100',
+                        '48000'
+                    ],
+                    label='Resample rate',
+                    value='Disable resampling'
+                )
+        with gr.Column():
+            # Model select
+            model_index = gr.Dropdown(
+                [
+                    '%s - %s' % (
+                        m['metadata'].get('source', 'Unknown'),
+                        m['metadata'].get('name')
+                    )
+                    for m in loaded_models
+                ],
+                label='请选择您的AI歌手(必选)',
+                type='index'
+            )
+            # Model info
+            with gr.Box():
+                model_info = gr.Markdown(
+                    '### AI歌手信息\n'
+                    'Please select a model from dropdown above.',
+                    elem_id='model_info'
+                )
+            output_audio = gr.Audio(label='AI歌手(无伴奏)', type="filepath")
+            output_msg = gr.Textbox(label='Output message')
+    multi_examples = multi_cfg.get('examples')
+    if (
+        multi_examples and
+        multi_examples.get('vc') and multi_examples.get('tts_vc')
+    ):
+        with gr.Accordion('Sweet sweet examples', open=False):
+            with gr.Row():
+                # VC Example
+                if multi_examples.get('vc'):
+                    gr.Examples(
+                        label='Audio conversion examples',
+                        examples=multi_examples.get('vc'),
+                        inputs=[
+                            input_audio, model_index, pitch_adjust, f0_method,
+                            feat_ratio
+                        ],
+                        outputs=[output_audio, output_msg, model_info],
+                        fn=_example_vc,
+                        cache_examples=args.cache_examples,
+                        run_on_click=args.cache_examples
+                    )
+                # Edge TTS Example
+                if multi_examples.get('tts_vc'):
+                    gr.Examples(
+                        label='TTS conversion examples',
+                        examples=multi_examples.get('tts_vc'),
+                        inputs=[
+                            tts_input, model_index, tts_speaker, pitch_adjust,
+                            f0_method, feat_ratio
+                        ],
+                        outputs=[output_audio, output_msg, model_info],
+                        fn=_example_edge_tts,
+                        cache_examples=args.cache_examples,
+                        run_on_click=args.cache_examples
+                    )
+    vc_convert_btn.click(
+        vc_func,
+        [
+            input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
+            filter_radius, rms_mix_rate, resample_rate
+        ],
+        [output_audio, output_msg],
+        api_name='audio_conversion'
+    )
+    tts_convert_btn.click(
+        edge_tts_vc_func,
+        [
+            tts_input, model_index, tts_speaker, pitch_adjust, f0_method,
+            feat_ratio, filter_radius, rms_mix_rate, resample_rate
+        ],
+        [output_audio, output_msg],
+        api_name='tts_conversion'
+    )
+    full_song.click(fn=mix, inputs=[output_audio, as_audio_no_vocals], outputs=[new_song])
+    model_index.change(
+        update_model_info,
+        inputs=[model_index],
+        outputs=[model_info],
+        show_progress=False,
+        queue=False
+    )
+    gr.Markdown("### <center>注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。</center>")
+    gr.Markdown("### <center>🧸 - 如何使用此程序：填写视频网址和视频起止时间后，依次点击“提取声音文件吧”、“去除背景音吧”、“进行歌声转换吧！”、“加入歌曲伴奏吧！”四个按键即可。</center>")
+    gr.HTML('''
+        <div class="footer">
+                    <p>🌊🏞️🎶 - 江水东流急，滔滔无尽声。 明·顾璘
+                    </p>
+        </div>
+    ''')
+app.queue(
+    concurrency_count=1,
+    max_size=20,
+    api_open=args.api
+).launch(show_error=True)