Spaces:
Running
on
Zero
Running
on
Zero
Update gradio_mix.py
Browse files- gradio_mix.py +2 -64
gradio_mix.py
CHANGED
|
@@ -684,7 +684,6 @@ def transcribe(seed, audio_info):
|
|
| 684 |
|
| 685 |
return [
|
| 686 |
state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
|
| 687 |
-
# gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
|
| 688 |
gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
|
| 689 |
gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
|
| 690 |
state
|
|
@@ -708,7 +707,6 @@ def align(transcript, audio_info, state):
|
|
| 708 |
|
| 709 |
return [
|
| 710 |
state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
|
| 711 |
-
# gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
|
| 712 |
gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
|
| 713 |
gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
|
| 714 |
state
|
|
@@ -789,11 +787,6 @@ def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
|
|
| 789 |
except Exception as e:
|
| 790 |
logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
|
| 791 |
|
| 792 |
-
# if mode == "Rerun":
|
| 793 |
-
# colon_position = selected_sentence.find(':')
|
| 794 |
-
# selected_sentence_idx = int(selected_sentence[:colon_position])
|
| 795 |
-
# sentences = [selected_sentence[colon_position + 1:]]
|
| 796 |
-
|
| 797 |
# Choose base audio (denoised if duration matches)
|
| 798 |
audio_base = audio_info
|
| 799 |
audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
|
|
@@ -964,32 +957,14 @@ def update_bound_words(from_selected_word, to_selected_word, edit_word_mode):
|
|
| 964 |
|
| 965 |
smart_transcript_info = """
|
| 966 |
If enabled, the target transcript will be constructed for you:</br>
|
| 967 |
-
- In TTS and Long TTS mode just write the text you want to synthesize.</br>
|
| 968 |
- In Edit mode just write the text to replace selected editing segment.</br>
|
| 969 |
-
If disabled, you should write the target transcript yourself:</br>
|
| 970 |
-
- In TTS mode write prompt transcript followed by generation transcript.</br>
|
| 971 |
-
- In Long TTS select split by newline (<b>SENTENCE SPLIT WON'T WORK</b>) and start each line with a prompt transcript.</br>
|
| 972 |
-
- In Edit mode write full prompt</br>
|
| 973 |
"""
|
| 974 |
|
| 975 |
demo_original_transcript = ""
|
| 976 |
|
| 977 |
demo_text = {
|
| 978 |
-
"TTS": {
|
| 979 |
-
"smart": "take over the stage for half an hour,",
|
| 980 |
-
"regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour."
|
| 981 |
-
},
|
| 982 |
"Edit": {
|
| 983 |
-
"smart": "
|
| 984 |
-
"regular": "照片、医疗记录、神经重塑的易损性,这是某种数据库啊!还有PRELESS的脑部扫描、生物管型、神经重塑."
|
| 985 |
-
},
|
| 986 |
-
"Long TTS": {
|
| 987 |
-
"smart": "You can run the model on a big text!\n"
|
| 988 |
-
"Just write it line-by-line. Or sentence-by-sentence.\n"
|
| 989 |
-
"If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
|
| 990 |
-
"regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n"
|
| 991 |
-
"Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n"
|
| 992 |
-
"Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
|
| 993 |
}
|
| 994 |
}
|
| 995 |
|
|
@@ -1033,7 +1008,6 @@ def get_app():
|
|
| 1033 |
scale=3,
|
| 1034 |
)
|
| 1035 |
denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
|
| 1036 |
-
# whisper_backend_choice = gr.Radio(label="Whisper backend", value="", choices=["whisperX", "whisper"])
|
| 1037 |
whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
|
| 1038 |
align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
|
| 1039 |
|
|
@@ -1041,27 +1015,10 @@ def get_app():
|
|
| 1041 |
with gr.Column(scale=2):
|
| 1042 |
# Use a numpy waveform as default value to avoid Gradio's
|
| 1043 |
# InvalidPathError with local filesystem paths.
|
| 1044 |
-
_demo_value = None
|
| 1045 |
-
demo_candidates = [
|
| 1046 |
-
os.path.join(DEMO_PATH, "test.wav"),
|
| 1047 |
-
]
|
| 1048 |
-
for demo_path in demo_candidates:
|
| 1049 |
-
try:
|
| 1050 |
-
if not os.path.isfile(demo_path):
|
| 1051 |
-
continue
|
| 1052 |
-
_demo_wav, _demo_sr = torchaudio.load(demo_path)
|
| 1053 |
-
if _demo_wav.dim() > 1 and _demo_wav.shape[0] > 1:
|
| 1054 |
-
_demo_wav = _demo_wav.mean(dim=0, keepdim=True)
|
| 1055 |
-
_demo_value = (_demo_sr, _demo_wav.squeeze(0).numpy())
|
| 1056 |
-
break
|
| 1057 |
-
except Exception:
|
| 1058 |
-
continue
|
| 1059 |
-
|
| 1060 |
input_audio = gr.Audio(
|
| 1061 |
-
value=
|
| 1062 |
label="Input Audio",
|
| 1063 |
interactive=True,
|
| 1064 |
-
type="numpy",
|
| 1065 |
)
|
| 1066 |
|
| 1067 |
with gr.Row():
|
|
@@ -1095,18 +1052,6 @@ def get_app():
|
|
| 1095 |
edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
|
| 1096 |
info="What to do with first and last word", visible=False)
|
| 1097 |
|
| 1098 |
-
# with gr.Group(visible=False) as tts_mode_controls:
|
| 1099 |
-
# with gr.Row():
|
| 1100 |
-
# edit_from_word = gr.Dropdown(label="First word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
|
| 1101 |
-
# edit_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[18], interactive=True)
|
| 1102 |
-
# with gr.Row():
|
| 1103 |
-
# edit_start_time = gr.Slider(label="Prompt start time", minimum=0, maximum=7.614, step=0.001, value=4.022)
|
| 1104 |
-
# edit_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.614, step=0.001, value=5.768)
|
| 1105 |
-
# with gr.Row():
|
| 1106 |
-
# check_btn = gr.Button(value="Check prompt",scale=1)
|
| 1107 |
-
# edit_audio = gr.Audio(label="Prompt Audio", scale=3)
|
| 1108 |
-
|
| 1109 |
-
# with gr.Group() as edit_mode_controls:
|
| 1110 |
with gr.Row():
|
| 1111 |
edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
|
| 1112 |
edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
|
|
@@ -1223,10 +1168,6 @@ def get_app():
|
|
| 1223 |
inputs=[input_audio],
|
| 1224 |
outputs=[denoise_audio])
|
| 1225 |
|
| 1226 |
-
# mode.change(fn=change_mode,
|
| 1227 |
-
# inputs=[mode],
|
| 1228 |
-
# outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor])
|
| 1229 |
-
|
| 1230 |
check_btn.click(fn=get_edit_audio_part,
|
| 1231 |
inputs=[input_audio, edit_start_time, edit_end_time],
|
| 1232 |
outputs=[edit_audio])
|
|
@@ -1254,9 +1195,6 @@ def get_app():
|
|
| 1254 |
],
|
| 1255 |
outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
|
| 1256 |
|
| 1257 |
-
# prompt_to_word.change(fn=update_bound_word,
|
| 1258 |
-
# inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
|
| 1259 |
-
# outputs=[prompt_end_time])
|
| 1260 |
edit_from_word.change(fn=update_bound_word,
|
| 1261 |
inputs=[gr.State(True), edit_from_word, edit_word_mode],
|
| 1262 |
outputs=[edit_start_time])
|
|
|
|
| 684 |
|
| 685 |
return [
|
| 686 |
state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
|
|
|
|
| 687 |
gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
|
| 688 |
gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
|
| 689 |
state
|
|
|
|
| 707 |
|
| 708 |
return [
|
| 709 |
state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
|
|
|
|
| 710 |
gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
|
| 711 |
gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
|
| 712 |
state
|
|
|
|
| 787 |
except Exception as e:
|
| 788 |
logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
|
| 789 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 790 |
# Choose base audio (denoised if duration matches)
|
| 791 |
audio_base = audio_info
|
| 792 |
audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
|
|
|
|
| 957 |
|
| 958 |
smart_transcript_info = """
|
| 959 |
If enabled, the target transcript will be constructed for you:</br>
|
|
|
|
| 960 |
- In Edit mode just write the text to replace selected editing segment.</br>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
"""
|
| 962 |
|
| 963 |
demo_original_transcript = ""
|
| 964 |
|
| 965 |
demo_text = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
"Edit": {
|
| 967 |
+
"smart": "write new words here.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 968 |
}
|
| 969 |
}
|
| 970 |
|
|
|
|
| 1008 |
scale=3,
|
| 1009 |
)
|
| 1010 |
denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
|
|
|
|
| 1011 |
whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
|
| 1012 |
align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
|
| 1013 |
|
|
|
|
| 1015 |
with gr.Column(scale=2):
|
| 1016 |
# Use a numpy waveform as default value to avoid Gradio's
|
| 1017 |
# InvalidPathError with local filesystem paths.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1018 |
input_audio = gr.Audio(
|
| 1019 |
+
value=os.path.join(DEMO_PATH, "test.wav"),
|
| 1020 |
label="Input Audio",
|
| 1021 |
interactive=True,
|
|
|
|
| 1022 |
)
|
| 1023 |
|
| 1024 |
with gr.Row():
|
|
|
|
| 1052 |
edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
|
| 1053 |
info="What to do with first and last word", visible=False)
|
| 1054 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1055 |
with gr.Row():
|
| 1056 |
edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
|
| 1057 |
edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
|
|
|
|
| 1168 |
inputs=[input_audio],
|
| 1169 |
outputs=[denoise_audio])
|
| 1170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1171 |
check_btn.click(fn=get_edit_audio_part,
|
| 1172 |
inputs=[input_audio, edit_start_time, edit_end_time],
|
| 1173 |
outputs=[edit_audio])
|
|
|
|
| 1195 |
],
|
| 1196 |
outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
|
| 1197 |
|
|
|
|
|
|
|
|
|
|
| 1198 |
edit_from_word.change(fn=update_bound_word,
|
| 1199 |
inputs=[gr.State(True), edit_from_word, edit_word_mode],
|
| 1200 |
outputs=[edit_start_time])
|