Shanuka01 committed on
Commit
5174d65
·
1 Parent(s): c3d2147

Update app_multi.py

Browse files
Files changed (1) hide show
  1. app_multi.py +467 -2
app_multi.py CHANGED
@@ -72,7 +72,6 @@ app_css = '''
72
  max-height: 100px;
73
  float: right;
74
  }
75
-
76
  #model_info p {
77
  margin: unset;
78
  }
@@ -354,4 +353,470 @@ def youtube_downloader(
354
 
355
  quiet = "--quiet --no-warnings" if quiet else ""
356
  command = f"""
357
- yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}" # noqa: E501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  max-height: 100px;
73
  float: right;
74
  }
 
75
  #model_info p {
76
  margin: unset;
77
  }
 
353
 
354
  quiet = "--quiet --no-warnings" if quiet else ""
355
  command = f"""
356
+ yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}" # noqa: E501
357
+ """.strip()
358
+
359
+ attempts = 0
360
+ while True:
361
+ try:
362
+ _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
363
+ except subprocess.CalledProcessError:
364
+ attempts += 1
365
+ if attempts == num_attempts:
366
+ return None
367
+ else:
368
+ break
369
+
370
+ if output_path.exists():
371
+ return output_path
372
+ else:
373
+ return None
374
+
375
def audio_separated(audio_input, progress=gr.Progress()):
    """Split an audio clip into a vocal stem and an accompaniment stem.

    The clip is written to a temporary file in the working directory,
    processed by the ``demucs`` command-line module (two-stem mode, CPU),
    and the temporary file is removed afterwards.

    Args:
        audio_input: Two-item sequence whose items are passed, in order, as
            the second and third arguments of ``write`` (presumably
            ``(sample_rate, samples)`` from a ``gr.Audio`` component --
            TODO confirm against the caller). ``None`` means no audio.
        progress: Gradio progress tracker used to report coarse progress.

    Returns:
        A 3-tuple ``(vocals_path, no_vocals_path, message)``. Both paths are
        ``None`` when no audio was supplied or when the Demucs command
        exits with a non-zero status.

    Note:
        ``high_quality`` and ``write`` are read from module scope; they are
        not defined inside this function.
    """
    # start progress
    progress(progress=0, desc="Starting...")
    time.sleep(0.1)

    # check file input
    if audio_input is None:
        # show progress
        for _ in progress.tqdm(range(100), desc="Please wait..."):
            time.sleep(0.01)

        return (None, None, 'Please input audio.')

    # create filename: random 5-digit prefix + timestamp
    filename = (
        str(random.randint(10000, 99999))
        + datetime.now().strftime("%d%m%Y%H%M%S")
    )
    # Decide the container once instead of re-testing the flag at every step.
    extension = ".wav" if high_quality else ".mp3"
    source_file = filename + extension

    progress(progress=0.10, desc="Please wait...")

    # make dir output
    os.makedirs("output", exist_ok=True)

    progress(progress=0.20, desc="Please wait...")

    # write the uploaded audio to disk so the demucs CLI can read it
    write(source_file, audio_input[0], audio_input[1])

    progress(progress=0.50, desc="Please wait...")

    # demucs process
    if high_quality:
        command_demucs = (
            "python3 -m demucs --two-stems=vocals -d cpu "
            + source_file + " -o output"
        )
    else:
        command_demucs = (
            "python3 -m demucs --two-stems=vocals --mp3 --mp3-bitrate 128 "
            "-d cpu " + source_file + " -o output"
        )

    try:
        exit_status = os.system(command_demucs)
    finally:
        # remove file audio, whether or not the separation succeeded
        try:
            os.remove(source_file)
        except OSError:
            # Best-effort cleanup: a missing temp file must not mask the
            # real outcome of the separation step.
            pass

    if exit_status != 0:
        # Without this check the caller would receive paths to stem files
        # that were never produced.
        return (None, None, 'Audio separation failed.')

    progress(progress=0.80, desc="Please wait...")

    for _ in progress.tqdm(range(80, 100), desc="Please wait..."):
        time.sleep(0.1)

    stem_dir = "./output/htdemucs/" + filename
    return (
        stem_dir + "/vocals" + extension,
        stem_dir + "/no_vocals" + extension,
        "Successfully...",
    )
439
+
440
+
441
+ # https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118 # noqa
442
def vc_func(
    input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
    filter_radius, rms_mix_rate, resample_option
):
    """Convert a clip to the selected model's voice.

    Args:
        input_audio: ``(sample_rate, samples)`` pair, or ``None``. The
            samples array is transposed with ``(1, 0)`` before the mono
            down-mix, so multi-channel input is expected as
            ``(n_samples, n_channels)``.
        model_index: Index into the module-level ``loaded_models`` list, or
            ``None`` when nothing is selected.
        pitch_adjust: Pitch shift; truncated to ``int`` before use.
        f0_method: Pitch-extraction method name, forwarded to the pipeline.
        feat_ratio: Forwarded unchanged to the pipeline.
        filter_radius: Forwarded unchanged to the pipeline.
        rms_mix_rate: Forwarded unchanged to the pipeline.
        resample_option: ``'Disable resampling'`` or a sample rate given as
            a decimal string.

    Returns:
        ``((sample_rate, audio), 'Success')`` on success, otherwise
        ``(None, message)`` describing why nothing was converted.
    """
    if input_audio is None:
        return (None, 'Please provide input audio.')

    if model_index is None:
        return (None, 'Please select a model.')

    model = loaded_models[model_index]

    # Reference: so-vits
    (audio_samp, audio_npy) = input_audio

    # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
    # Duration cap is only enforced when `in_hf_space` is truthy.
    if (audio_npy.shape[0] / audio_samp) > 600 and in_hf_space:
        return (None, 'Input audio is longer than 600 secs.')

    # https://stackoverflow.com/questions/26921836/
    # Integer PCM is scaled by the dtype's maximum value. Other float
    # dtypes (e.g. float64) are only cast: np.iinfo() raises ValueError
    # for non-integer dtypes.
    if np.issubdtype(audio_npy.dtype, np.integer):
        audio_npy = (
            audio_npy / np.iinfo(audio_npy.dtype).max
        ).astype(np.float32)
    elif audio_npy.dtype != np.float32:
        audio_npy = audio_npy.astype(np.float32)

    if audio_npy.ndim > 1:
        audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))

    # The pipeline is always fed 16 kHz audio.
    if audio_samp != 16000:
        audio_npy = librosa.resample(
            audio_npy,
            orig_sr=audio_samp,
            target_sr=16000
        )

    pitch_int = int(pitch_adjust)

    resample = (
        0 if resample_option == 'Disable resampling'
        else int(resample_option)
    )

    # Passed to the pipeline and printed afterwards as npy/f0/infer timings.
    times = [0, 0, 0]

    checksum = hashlib.sha512()
    checksum.update(audio_npy.tobytes())

    output_audio = model['vc'].pipeline(
        hubert_model,
        model['net_g'],
        model['metadata'].get('speaker_id', 0),
        audio_npy,
        checksum.hexdigest(),
        times,
        pitch_int,
        f0_method,
        path.join('model', model['name'], model['metadata']['feat_index']),
        feat_ratio,
        model['if_f0'],
        filter_radius,
        model['target_sr'],
        resample,
        rms_mix_rate,
        'v2'
    )

    # Report the resample rate only when it is at least 16 kHz and differs
    # from the model's own rate; otherwise report the model's rate.
    out_sr = (
        resample if resample >= 16000 and model['target_sr'] != resample
        else model['target_sr']
    )

    print(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')
    return ((out_sr, output_audio), 'Success')
516
+
517
+
518
async def edge_tts_vc_func(
    input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
    filter_radius, rms_mix_rate, resample_option
):
    """Synthesize speech from text, then convert it with ``vc_func``.

    Args:
        input_text: Text to synthesize. ``None``, empty, or whitespace-only
            text is rejected before any TTS call is made.
        model_index: Index of the voice model, or ``None``.
        tts_speaker: Index into the module-level ``tts_speakers_list``, or
            ``None``.
        pitch_adjust, f0_method, feat_ratio, filter_radius, rms_mix_rate,
        resample_option: Forwarded unchanged to ``vc_func``.

    Returns:
        The ``vc_func`` result, or ``(None, message)`` when a required input
        is missing.
    """
    # An empty textbox is not None, so check for blank text explicitly.
    if input_text is None or not input_text.strip():
        return (None, 'Please provide TTS text.')

    if tts_speaker is None:
        return (None, 'Please select TTS speaker.')

    if model_index is None:
        return (None, 'Please select a model.')

    speaker = tts_speakers_list[tts_speaker]['ShortName']
    (tts_np, tts_sr) = await util.call_edge_tts(speaker, input_text)
    # vc_func expects (sample_rate, samples), the reverse of the TTS result.
    return vc_func(
        (tts_sr, tts_np),
        model_index,
        pitch_adjust,
        f0_method,
        feat_ratio,
        filter_radius,
        rms_mix_rate,
        resample_option
    )
543
+
544
+
545
def update_model_info(model_index):
    """Build the Markdown shown in the model-info panel.

    Args:
        model_index: Index into the module-level ``loaded_models`` list, or
            ``None`` when no model is selected.

    Returns:
        A Markdown string: a prompt to pick a model when ``model_index`` is
        ``None``, otherwise the model's icon (if it has one), name, author,
        source and note taken from its ``metadata`` mapping.
    """
    if model_index is None:
        return (
            '### Model info\n'
            'Please select a model from dropdown above.'
        )

    model = loaded_models[model_index]
    metadata = model['metadata']
    model_icon = metadata.get('icon', '')

    if not model_icon:
        # No icon configured: omit the image rather than emit a link to the
        # bare model directory, which renders as a broken image.
        icon_markdown = ''
    else:
        icon_url = (
            model_icon
            if model_icon.startswith(('http://', 'https://'))
            # Relative icon names are resolved under the model's own folder.
            else '/file/model/%s/%s' % (model['name'], model_icon)
        )
        icon_markdown = '![model icon](%s)' % icon_url

    return (
        '### Model info\n'
        '{icon}'
        '**{name}**\n\n'
        'Author: {author}\n\n'
        'Source: {source}\n\n'
        '{note}'
    ).format(
        icon=icon_markdown,
        name=metadata.get('name'),
        author=metadata.get('author', 'Anonymous'),
        source=metadata.get('source', 'Unknown'),
        note=metadata.get('note', ''),
    )
573
+
574
+
575
def _example_vc(
    input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
    filter_radius=3, rms_mix_rate=1, resample_option='Disable resampling'
):
    """Run an audio-conversion example and refresh the model-info panel.

    The examples widget supplies only the first five arguments, so the
    remaining three default to the initial values of the corresponding UI
    controls; without defaults the call fails with a ``TypeError``.

    Returns:
        ``(audio, message, model_info_markdown)``.
    """
    (audio, message) = vc_func(
        input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
        filter_radius, rms_mix_rate, resample_option
    )
    return (
        audio,
        message,
        update_model_info(model_index)
    )
588
+
589
+
590
async def _example_edge_tts(
    input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
    filter_radius=3, rms_mix_rate=1, resample_option='Disable resampling'
):
    """Run a TTS-conversion example and refresh the model-info panel.

    The examples widget supplies only the first six arguments, so the
    remaining three default to the initial values of the corresponding UI
    controls; without defaults the call fails with a ``TypeError``.

    Returns:
        ``(audio, message, model_info_markdown)``.
    """
    (audio, message) = await edge_tts_vc_func(
        input_text, model_index, tts_speaker, pitch_adjust, f0_method,
        feat_ratio, filter_radius, rms_mix_rate, resample_option
    )
    return (
        audio,
        message,
        update_model_info(model_index)
    )
603
+
604
+
605
# Build the Gradio UI. Every component and event handler below is registered
# on `app` by being created inside this context.
with app:
    gr.HTML("<center>"
            "<h1>🥳🎶🎡 - AI歌手,RVC歌声转换 + AI变声</h1>"
            "</center>")
    gr.Markdown("### <center>🦄 - 能够自动提取视频中的声音,并去除背景音;Powered by [RVC-Project](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)</center>")
    gr.Markdown("### <center>更多精彩应用,敬请关注[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕</center>")

    # Step 1: download a clip from a video URL and split vocals from music.
    with gr.Tab("🤗 - B站视频提取声音"):
        with gr.Row():
            with gr.Column():
                ydl_url_input = gr.Textbox(label="B站视频网址(可直接填写相应的BV号)", value = "https://www.bilibili.com/video/BV...")
                start = gr.Number(value=0, label="起始时间 (秒)")
                end = gr.Number(value=15, label="结束时间 (秒)")
                ydl_url_submit = gr.Button("提取声音文件吧", variant="primary")
                as_audio_submit = gr.Button("去除背景音吧", variant="primary")
            with gr.Column():
                ydl_audio_output = gr.Audio(label="Audio from Bilibili")
                # The downloaded clip doubles as the separator's input.
                as_audio_input = ydl_audio_output
                as_audio_vocals = gr.Audio(label="歌曲人声部分")
                as_audio_no_vocals = gr.Audio(label="Music only", type="filepath", visible=False)
                as_audio_message = gr.Textbox(label="Message", visible=False)

        ydl_url_submit.click(fn=youtube_downloader, inputs=[ydl_url_input, start, end], outputs=[ydl_audio_output])
        as_audio_submit.click(fn=audio_separated, inputs=[as_audio_input], outputs=[as_audio_vocals, as_audio_no_vocals, as_audio_message], show_progress=True, queue=True)

    # Step 2: voice conversion controls (left) and model/output (right).
    with gr.Row():
        with gr.Column():
            with gr.Tab('🎶 - 歌声转换'):
                # The separated vocal stem is the conversion input.
                input_audio = as_audio_vocals
                vc_convert_btn = gr.Button('进行歌声转换吧!', variant='primary')
                full_song = gr.Button("加入歌曲伴奏吧!", variant="primary")
                new_song = gr.Audio(label="AI歌手+伴奏", type="filepath")

            with gr.Tab('🎙️ - 文本转语音'):
                tts_input = gr.Textbox(
                    label='请填写您想要转换的文本(中英皆可)',
                    lines=3
                )
                tts_speaker = gr.Dropdown(
                    [
                        '%s (%s)' % (
                            s['FriendlyName'],
                            s['Gender']
                        )
                        for s in tts_speakers_list
                    ],
                    label='请选择一个相应语言的说话人',
                    type='index'
                )

                tts_convert_btn = gr.Button('进行AI变声吧', variant='primary')

            with gr.Tab("📺 - 音乐视频"):
                with gr.Row():
                    with gr.Column():
                        inp1 = gr.Textbox(label="为视频配上精彩的文案吧(选填;英文)")
                        # The mixed song is the video's audio track.
                        inp2 = new_song
                        inp3 = gr.Image(source='upload', type='filepath', label="上传一张背景图片吧")
                        btn = gr.Button("生成您的专属音乐视频吧", variant="primary")

                    with gr.Column():
                        out1 = gr.Video(label='您的专属音乐视频')
                btn.click(fn=infer, inputs=[inp1, inp2, inp3], outputs=[out1])

            pitch_adjust = gr.Slider(
                label='Pitch',
                minimum=-24,
                maximum=24,
                step=1,
                value=0
            )
            f0_method = gr.Radio(
                label='f0 methods',
                choices=['pm', 'rmvpe'],
                value='rmvpe',
                interactive=True
            )

            with gr.Accordion('更多设置', open=False):
                feat_ratio = gr.Slider(
                    label='Feature ratio',
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.6
                )
                filter_radius = gr.Slider(
                    label='Filter radius',
                    minimum=0,
                    maximum=7,
                    step=1,
                    value=3
                )
                rms_mix_rate = gr.Slider(
                    label='Volume envelope mix rate',
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=1
                )
                resample_rate = gr.Dropdown(
                    [
                        'Disable resampling',
                        '16000',
                        '22050',
                        '44100',
                        '48000'
                    ],
                    label='Resample rate',
                    value='Disable resampling'
                )

        with gr.Column():
            # Model select
            model_index = gr.Dropdown(
                [
                    '%s - %s' % (
                        m['metadata'].get('source', 'Unknown'),
                        m['metadata'].get('name')
                    )
                    for m in loaded_models
                ],
                label='请选择您的AI歌手(必选)',
                type='index'
            )

            # Model info
            with gr.Box():
                model_info = gr.Markdown(
                    '### AI歌手信息\n'
                    'Please select a model from dropdown above.',
                    elem_id='model_info'
                )

            output_audio = gr.Audio(label='AI歌手(无伴奏)', type="filepath")
            output_msg = gr.Textbox(label='Output message')

    # Optional example galleries, driven by the 'examples' config entry.
    # NOTE(review): the outer test requires BOTH example lists, which makes
    # the per-list checks below redundant -- confirm whether `or` was meant.
    multi_examples = multi_cfg.get('examples')
    if (
        multi_examples and
        multi_examples.get('vc') and multi_examples.get('tts_vc')
    ):
        with gr.Accordion('Sweet sweet examples', open=False):
            with gr.Row():
                # VC Example
                if multi_examples.get('vc'):
                    gr.Examples(
                        label='Audio conversion examples',
                        examples=multi_examples.get('vc'),
                        inputs=[
                            input_audio, model_index, pitch_adjust, f0_method,
                            feat_ratio
                        ],
                        outputs=[output_audio, output_msg, model_info],
                        fn=_example_vc,
                        cache_examples=args.cache_examples,
                        run_on_click=args.cache_examples
                    )

                # Edge TTS Example
                if multi_examples.get('tts_vc'):
                    gr.Examples(
                        label='TTS conversion examples',
                        examples=multi_examples.get('tts_vc'),
                        inputs=[
                            tts_input, model_index, tts_speaker, pitch_adjust,
                            f0_method, feat_ratio
                        ],
                        outputs=[output_audio, output_msg, model_info],
                        fn=_example_edge_tts,
                        cache_examples=args.cache_examples,
                        run_on_click=args.cache_examples
                    )

    # Event wiring for the conversion, mixing and model-info widgets.
    vc_convert_btn.click(
        vc_func,
        [
            input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
            filter_radius, rms_mix_rate, resample_rate
        ],
        [output_audio, output_msg],
        api_name='audio_conversion'
    )

    tts_convert_btn.click(
        edge_tts_vc_func,
        [
            tts_input, model_index, tts_speaker, pitch_adjust, f0_method,
            feat_ratio, filter_radius, rms_mix_rate, resample_rate
        ],
        [output_audio, output_msg],
        api_name='tts_conversion'
    )

    # Mix the converted vocals with the separated accompaniment.
    full_song.click(fn=mix, inputs=[output_audio, as_audio_no_vocals], outputs=[new_song])

    model_index.change(
        update_model_info,
        inputs=[model_index],
        outputs=[model_info],
        show_progress=False,
        queue=False
    )

    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
    gr.Markdown("### <center>🧸 - 如何使用此程序:填写视频网址和视频起止时间后,依次点击“提取声音文件吧”、“去除背景音吧”、“进行歌声转换吧!”、“加入歌曲伴奏吧!”四个按键即可。</center>")
    gr.HTML('''
    <div class="footer">
    <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
    </p>
    </div>
    ''')
817
+
818
# Enable the request queue with the settings below, then start the server
# with error messages shown to the user.
queue_settings = dict(
    concurrency_count=1,
    max_size=20,
    api_open=args.api,
)
queued_app = app.queue(**queue_settings)
queued_app.launch(show_error=True)