Commit 4f1e982
Parent(s): 8de3ef1
add models direct to space
- .gitattributes +2 -0
- app.py +167 -1
- pretrained_models/encodec_4cb2048_giga.th +3 -0
- pretrained_models/giga330M.pth +3 -0
.gitattributes CHANGED
@@ -1,2 +1,4 @@
 ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
 ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
+pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
+pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1502,6 +1502,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
+    print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
@@ -1545,6 +1546,136 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
 
     return [seg_save_fn_concat, seg_save_fn_gen]
 
+def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+              temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
+              sid,
+              f0_up_key,
+              f0_file,
+              f0_method,
+              file_index,
+              #file_index2,
+              # file_big_npy,
+              index_rate,
+              filter_radius,
+              resample_sr,
+              rms_mix_rate,
+              protect,
+              crepe_hop_length):
+
+    global voicecraft_model, voicecraft_config, phn2num
+
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    os.environ["USER"] = "USER"
+    # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
+    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    target_transcript = transcribed_text + target_transcript
+    print(target_transcript)
+    info = torchaudio.info(audio_fn)
+    audio_dur = info.num_frames / info.sample_rate
+
+    assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
+    prompt_end_frame = int(cut_off_sec * info.sample_rate)
+
+    if voicecraft_model is None:
+        load_voicecraft()
+
+    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+    text_tokenizer = TextTokenizer(backend="espeak")
+    audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
+
+
+    # # run the model to get the output
+    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
+                     'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
+                     "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
+    from lib.voicecraft.inference_tts_scale import inference_one_sample
+    concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
+                                                     audio_fn, target_transcript, config.device, decode_config,
+                                                     prompt_end_frame)
+
+    # save segments for comparison
+    concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
+    # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
+
+    output_dir = "./demo/generated_tts"
+    os.makedirs(output_dir, exist_ok=True)
+    seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
+    seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
+
+
+    torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
+    torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
+
+
+    global tgt_sr, net_g, vc, hubert_model, version
+
+    f0_up_key = int(f0_up_key)
+    try:
+        audio = gen_audio
+        audio_max = np.abs(audio).max() / 0.95
+        if audio_max > 1:
+            audio /= audio_max
+        times = [0, 0, 0]
+        if hubert_model == None:
+            load_hubert()
+        if_f0 = cpt.get("f0", 1)
+        file_index = (
+            (
+                file_index.strip(" ")
+                .strip('"')
+                .strip("\n")
+                .strip('"')
+                .strip(" ")
+                .replace("trained", "added")
+            )
+        )  # guard against novices mistyping the path; swap it automatically for them
+        # file_big_npy = (
+        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        # )
+        audio_opt = vc.pipeline(
+            hubert_model,
+            net_g,
+            sid,
+            audio,
+            seg_save_fn_gen,
+            times,
+            f0_up_key,
+            f0_method,
+            file_index,
+            # file_big_npy,
+            index_rate,
+            if_f0,
+            filter_radius,
+            tgt_sr,
+            resample_sr,
+            rms_mix_rate,
+            version,
+            protect,
+            crepe_hop_length,
+            f0_file=f0_file,
+        )
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            tgt_sr = resample_sr
+        index_info = (
+            "Using index:%s." % file_index
+            if os.path.exists(file_index)
+            else "Index not used."
+        )
+        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
+            index_info,
+            times[0],
+            times[1],
+            times[2],
+        ), (tgt_sr, audio_opt)
+    except:
+        info = traceback.format_exc()
+        print(info)
+        return info, (None, None)
+
+
+
+
 def upload_to_dataset(files, dir):
     if dir == '':
         dir = './dataset'
@@ -1678,6 +1809,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 output_audio_gen = gr.Audio(label="Output Audio generated")
 cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
 run_btn = gr.Button(value="run")
+run_btn_joint = gr.Button(value="run with RVC")
 target_transcript = gr.Textbox(label="target transcript")
 
 transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
@@ -1704,7 +1836,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 output_audio_con,
 output_audio_gen
 ])
-
+
 with gr.Column():
 vc_output2 = gr.Audio(
 label="Final Result! (Click on the three dots to download the audio)",
@@ -1864,6 +1996,40 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 ],
 [vc_output1, vc_output2],
 )
+
+run_btn_joint.click(
+    fn=run_joint,
+    inputs=[
+        seed,
+        stop_repitition,
+        sample_batch_size,
+        left_margin,
+        right_margin,
+        codecaudio_sr,
+        codec_sr,
+        top_k,
+        top_p,
+        temperature,
+        kvcache,
+        cutoff_value,
+        target_transcript,
+        silence_tokens,
+        transcribed_text,
+        spk_item,
+        vc_transform0,
+        f0_file,
+        f0method0,
+        file_index1,
+        # file_index2,
+        # file_big_npy1,
+        index_rate1,
+        filter_radius0,
+        resample_sr0,
+        rms_mix_rate0,
+        protect0,
+        crepe_hop_length
+    ],
+    outputs=[vc_output1, vc_output2])
 
 with gr.Accordion("Batch Conversion",open=False, visible=False):
 with gr.Row():
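The new run_joint function reads the codec checkpoint from ./pretrained_models/encodec_4cb2048_giga.th, and the commit also ships giga330M.pth alongside it, so the Space no longer has to download either file at startup. The snippet below is not part of the commit; it is a minimal smoke test, assuming the two checkpoints are ordinary torch-serialized files (as their use via torchaudio/VoiceCraft in app.py suggests), to confirm the bundled copies actually load rather than being unfetched LFS stubs.

# Minimal smoke test (not part of the commit): paths come from the diff above.
import os
import torch

checkpoints = [
    "./pretrained_models/giga330M.pth",
    "./pretrained_models/encodec_4cb2048_giga.th",
]

for path in checkpoints:
    size_mb = os.path.getsize(path) / 1e6
    obj = torch.load(path, map_location="cpu")  # CPU load so this runs without a GPU
    print(f"{path}: {size_mb:.0f} MB on disk, top-level object is {type(obj).__name__}")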
pretrained_models/encodec_4cb2048_giga.th ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378
+size 1167842971
pretrained_models/giga330M.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26
+size 1746844161
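Both added files are Git LFS pointers, matching the new .gitattributes rules: the repository stores only the oid (SHA-256 of the payload) and size shown above, while the real weights live in LFS storage. A small sketch, not part of the commit, that re-hashes the fetched checkpoints and checks them against that pointer metadata (useful for catching a checkout where the stubs were never replaced):

# Verify the LFS-fetched checkpoints against the pointer metadata recorded above.
import hashlib
import os

expected = {
    "./pretrained_models/encodec_4cb2048_giga.th": (
        "caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378", 1167842971),
    "./pretrained_models/giga330M.pth": (
        "35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26", 1746844161),
}

for path, (oid, size) in expected.items():
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    ok = h.hexdigest() == oid and os.path.getsize(path) == size
    print(f"{path}: {'OK' if ok else 'MISMATCH (file may still be an LFS pointer stub)'}")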