Commit 4f1e982
Parent(s): 8de3ef1
add models direct to space
- .gitattributes +2 -0
- app.py +167 -1
- pretrained_models/encodec_4cb2048_giga.th +3 -0
- pretrained_models/giga330M.pth +3 -0
.gitattributes CHANGED
@@ -1,2 +1,4 @@
 ilariasuitewallpaper.jpg filter=lfs diff=lfs merge=lfs -text
 ilariaaisuite.png filter=lfs diff=lfs merge=lfs -text
+pretrained_models/giga330M.pth filter=lfs diff=lfs merge=lfs -text
+pretrained_models/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1502,6 +1502,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
+    print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
@@ -1545,6 +1546,136 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
 
     return [seg_save_fn_concat, seg_save_fn_gen]
 
+def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+              temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
+              sid,
+              f0_up_key,
+              f0_file,
+              f0_method,
+              file_index,
+              #file_index2,
+              # file_big_npy,
+              index_rate,
+              filter_radius,
+              resample_sr,
+              rms_mix_rate,
+              protect,
+              crepe_hop_length):
+
+    global voicecraft_model, voicecraft_config, phn2num
+
+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    os.environ["USER"] = "USER"
+    # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
+    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    target_transcript = transcribed_text + target_transcript
+    print(target_transcript)
+    info = torchaudio.info(audio_fn)
+    audio_dur = info.num_frames / info.sample_rate
+
+    assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
+    prompt_end_frame = int(cut_off_sec * info.sample_rate)
+
+    if voicecraft_model is None:
+        load_voicecraft()
+
+    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+    text_tokenizer = TextTokenizer(backend="espeak")
+    audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
+
+
+    # # run the model to get the output
+    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
+                     'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
+                     "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
+    from lib.voicecraft.inference_tts_scale import inference_one_sample
+    concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
+                                                     audio_fn, target_transcript, config.device, decode_config,
+                                                     prompt_end_frame)
+
+    # save segments for comparison
+    concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
+    # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
+
+    output_dir = "./demo/generated_tts"
+    os.makedirs(output_dir, exist_ok=True)
+    seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
+    seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
+
+
+    torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
+    torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
+
+
+    global tgt_sr, net_g, vc, hubert_model, version
+
+    f0_up_key = int(f0_up_key)
+    try:
+        audio = gen_audio
+        audio_max = np.abs(audio).max() / 0.95
+        if audio_max > 1:
+            audio /= audio_max
+        times = [0, 0, 0]
+        if hubert_model == None:
+            load_hubert()
+        if_f0 = cpt.get("f0", 1)
+        file_index = (
+            (
+                file_index.strip(" ")
+                .strip('"')
+                .strip("\n")
+                .strip('"')
+                .strip(" ")
+                .replace("trained", "added")
+            )
+        )  # guard against novices mistyping the path; swap it automatically for them
+        # file_big_npy = (
+        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        # )
+        audio_opt = vc.pipeline(
+            hubert_model,
+            net_g,
+            sid,
+            audio,
+            seg_save_fn_gen,
+            times,
+            f0_up_key,
+            f0_method,
+            file_index,
+            # file_big_npy,
+            index_rate,
+            if_f0,
+            filter_radius,
+            tgt_sr,
+            resample_sr,
+            rms_mix_rate,
+            version,
+            protect,
+            crepe_hop_length,
+            f0_file=f0_file,
+        )
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            tgt_sr = resample_sr
+        index_info = (
+            "Using index:%s." % file_index
+            if os.path.exists(file_index)
+            else "Index not used."
+        )
+        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
+            index_info,
+            times[0],
+            times[1],
+            times[2],
+        ), (tgt_sr, audio_opt)
+    except:
+        info = traceback.format_exc()
+        print(info)
+        return info, (None, None)
+
+
+
+
 def upload_to_dataset(files, dir):
     if dir == '':
         dir = './dataset'
@@ -1678,6 +1809,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 output_audio_gen = gr.Audio(label="Output Audio generated")
 cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
 run_btn = gr.Button(value="run")
+run_btn_joint = gr.Button(value="run with RVC")
 target_transcript = gr.Textbox(label="target transcript")
 
 transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
@@ -1704,7 +1836,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 output_audio_con,
 output_audio_gen
 ])
-
+
 with gr.Column():
 vc_output2 = gr.Audio(
 label="Final Result! (Click on the three dots to download the audio)",
@@ -1864,6 +1996,40 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
 ],
 [vc_output1, vc_output2],
 )
+
+run_btn_joint.click(
+    fn=run_joint,
+    inputs=[
+        seed,
+        stop_repitition,
+        sample_batch_size,
+        left_margin,
+        right_margin,
+        codecaudio_sr,
+        codec_sr,
+        top_k,
+        top_p,
+        temperature,
+        kvcache,
+        cutoff_value,
+        target_transcript,
+        silence_tokens,
+        transcribed_text,
+        spk_item,
+        vc_transform0,
+        f0_file,
+        f0method0,
+        file_index1,
+        # file_index2,
+        # file_big_npy1,
+        index_rate1,
+        filter_radius0,
+        resample_sr0,
+        rms_mix_rate0,
+        protect0,
+        crepe_hop_length
+    ],
+    outputs=[vc_output1, vc_output2])
 
 with gr.Accordion("Batch Conversion",open=False, visible=False):
 with gr.Row():
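The new run_joint function reads the codec checkpoint from ./pretrained_models/encodec_4cb2048_giga.th, and the commit also ships giga330M.pth alongside it, so the Space no longer has to download either file at startup. The snippet below is not part of the commit; it is a minimal smoke test, assuming the two checkpoints are ordinary torch-serialized files (as their use via torchaudio/VoiceCraft in app.py suggests), to confirm the bundled copies actually load rather than being unfetched LFS stubs.

# Minimal smoke test (not part of the commit): paths come from the diff above.
import os
import torch

checkpoints = [
    "./pretrained_models/giga330M.pth",
    "./pretrained_models/encodec_4cb2048_giga.th",
]

for path in checkpoints:
    size_mb = os.path.getsize(path) / 1e6
    obj = torch.load(path, map_location="cpu")  # CPU load so this runs without a GPU
    print(f"{path}: {size_mb:.0f} MB on disk, top-level object is {type(obj).__name__}")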
pretrained_models/encodec_4cb2048_giga.th ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378
+size 1167842971
pretrained_models/giga330M.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26
+size 1746844161
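Both added files are Git LFS pointers, matching the new .gitattributes rules: the repository stores only the oid (SHA-256 of the payload) and size shown above, while the real weights live in LFS storage. A small sketch, not part of the commit, that re-hashes the fetched checkpoints and checks them against that pointer metadata (useful for catching a checkout where the stubs were never replaced):

# Verify the LFS-fetched checkpoints against the pointer metadata recorded above.
import hashlib
import os

expected = {
    "./pretrained_models/encodec_4cb2048_giga.th": (
        "caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378", 1167842971),
    "./pretrained_models/giga330M.pth": (
        "35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26", 1746844161),
}

for path, (oid, size) in expected.items():
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    ok = h.hexdigest() == oid and os.path.getsize(path) == size
    print(f"{path}: {'OK' if ok else 'MISMATCH (file may still be an LFS pointer stub)'}")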