Approximetal committed on
Commit
71e70ab
·
verified ·
1 Parent(s): b47a221

Update gradio_mix.py

Browse files
Files changed (1) hide show
  1. gradio_mix.py +2 -64
gradio_mix.py CHANGED
@@ -684,7 +684,6 @@ def transcribe(seed, audio_info):
684
 
685
  return [
686
  state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
687
- # gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
688
  gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
689
  gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
690
  state
@@ -708,7 +707,6 @@ def align(transcript, audio_info, state):
708
 
709
  return [
710
  state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
711
- # gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
712
  gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
713
  gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
714
  state
@@ -789,11 +787,6 @@ def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
789
  except Exception as e:
790
  logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
791
 
792
- # if mode == "Rerun":
793
- # colon_position = selected_sentence.find(':')
794
- # selected_sentence_idx = int(selected_sentence[:colon_position])
795
- # sentences = [selected_sentence[colon_position + 1:]]
796
-
797
  # Choose base audio (denoised if duration matches)
798
  audio_base = audio_info
799
  audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
@@ -964,32 +957,14 @@ def update_bound_words(from_selected_word, to_selected_word, edit_word_mode):
964
 
965
  smart_transcript_info = """
966
  If enabled, the target transcript will be constructed for you:</br>
967
- - In TTS and Long TTS mode just write the text you want to synthesize.</br>
968
  - In Edit mode just write the text to replace selected editing segment.</br>
969
- If disabled, you should write the target transcript yourself:</br>
970
- - In TTS mode write prompt transcript followed by generation transcript.</br>
971
- - In Long TTS select split by newline (<b>SENTENCE SPLIT WON'T WORK</b>) and start each line with a prompt transcript.</br>
972
- - In Edit mode write full prompt</br>
973
  """
974
 
975
  demo_original_transcript = ""
976
 
977
  demo_text = {
978
- "TTS": {
979
- "smart": "take over the stage for half an hour,",
980
- "regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour."
981
- },
982
  "Edit": {
983
- "smart": "Just write it line-by-line.",
984
- "regular": "照片、医疗记录、神经重塑的易损性,这是某种数据库啊!还有PRELESS的脑部扫描、生物管型、神经重塑."
985
- },
986
- "Long TTS": {
987
- "smart": "You can run the model on a big text!\n"
988
- "Just write it line-by-line. Or sentence-by-sentence.\n"
989
- "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
990
- "regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n"
991
- "Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n"
992
- "Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
993
  }
994
  }
995
 
@@ -1033,7 +1008,6 @@ def get_app():
1033
  scale=3,
1034
  )
1035
  denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
1036
- # whisper_backend_choice = gr.Radio(label="Whisper backend", value="", choices=["whisperX", "whisper"])
1037
  whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
1038
  align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
1039
 
@@ -1041,27 +1015,10 @@ def get_app():
1041
  with gr.Column(scale=2):
1042
  # Use a numpy waveform as default value to avoid Gradio's
1043
  # InvalidPathError with local filesystem paths.
1044
- _demo_value = None
1045
- demo_candidates = [
1046
- os.path.join(DEMO_PATH, "test.wav"),
1047
- ]
1048
- for demo_path in demo_candidates:
1049
- try:
1050
- if not os.path.isfile(demo_path):
1051
- continue
1052
- _demo_wav, _demo_sr = torchaudio.load(demo_path)
1053
- if _demo_wav.dim() > 1 and _demo_wav.shape[0] > 1:
1054
- _demo_wav = _demo_wav.mean(dim=0, keepdim=True)
1055
- _demo_value = (_demo_sr, _demo_wav.squeeze(0).numpy())
1056
- break
1057
- except Exception:
1058
- continue
1059
-
1060
  input_audio = gr.Audio(
1061
- value=_demo_value,
1062
  label="Input Audio",
1063
  interactive=True,
1064
- type="numpy",
1065
  )
1066
 
1067
  with gr.Row():
@@ -1095,18 +1052,6 @@ def get_app():
1095
  edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
1096
  info="What to do with first and last word", visible=False)
1097
 
1098
- # with gr.Group(visible=False) as tts_mode_controls:
1099
- # with gr.Row():
1100
- # edit_from_word = gr.Dropdown(label="First word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
1101
- # edit_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[18], interactive=True)
1102
- # with gr.Row():
1103
- # edit_start_time = gr.Slider(label="Prompt start time", minimum=0, maximum=7.614, step=0.001, value=4.022)
1104
- # edit_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.614, step=0.001, value=5.768)
1105
- # with gr.Row():
1106
- # check_btn = gr.Button(value="Check prompt",scale=1)
1107
- # edit_audio = gr.Audio(label="Prompt Audio", scale=3)
1108
-
1109
- # with gr.Group() as edit_mode_controls:
1110
  with gr.Row():
1111
  edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
1112
  edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
@@ -1223,10 +1168,6 @@ def get_app():
1223
  inputs=[input_audio],
1224
  outputs=[denoise_audio])
1225
 
1226
- # mode.change(fn=change_mode,
1227
- # inputs=[mode],
1228
- # outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor])
1229
-
1230
  check_btn.click(fn=get_edit_audio_part,
1231
  inputs=[input_audio, edit_start_time, edit_end_time],
1232
  outputs=[edit_audio])
@@ -1254,9 +1195,6 @@ def get_app():
1254
  ],
1255
  outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
1256
 
1257
- # prompt_to_word.change(fn=update_bound_word,
1258
- # inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
1259
- # outputs=[prompt_end_time])
1260
  edit_from_word.change(fn=update_bound_word,
1261
  inputs=[gr.State(True), edit_from_word, edit_word_mode],
1262
  outputs=[edit_start_time])
 
684
 
685
  return [
686
  state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
 
687
  gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
688
  gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
689
  state
 
707
 
708
  return [
709
  state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
 
710
  gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
711
  gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
712
  state
 
787
  except Exception as e:
788
  logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
789
 
 
 
 
 
 
790
  # Choose base audio (denoised if duration matches)
791
  audio_base = audio_info
792
  audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
 
957
 
958
  smart_transcript_info = """
959
  If enabled, the target transcript will be constructed for you:</br>
 
960
  - In Edit mode just write the text to replace selected editing segment.</br>
 
 
 
 
961
  """
962
 
963
  demo_original_transcript = ""
964
 
965
  demo_text = {
 
 
 
 
966
  "Edit": {
967
+ "smart": "write new words here.",
 
 
 
 
 
 
 
 
 
968
  }
969
  }
970
 
 
1008
  scale=3,
1009
  )
1010
  denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
 
1011
  whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
1012
  align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
1013
 
 
1015
  with gr.Column(scale=2):
1016
  # Use a numpy waveform as default value to avoid Gradio's
1017
  # InvalidPathError with local filesystem paths.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1018
  input_audio = gr.Audio(
1019
+ value=os.path.join(DEMO_PATH, "test.wav"),
1020
  label="Input Audio",
1021
  interactive=True,
 
1022
  )
1023
 
1024
  with gr.Row():
 
1052
  edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
1053
  info="What to do with first and last word", visible=False)
1054
 
 
 
 
 
 
 
 
 
 
 
 
 
1055
  with gr.Row():
1056
  edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
1057
  edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
 
1168
  inputs=[input_audio],
1169
  outputs=[denoise_audio])
1170
 
 
 
 
 
1171
  check_btn.click(fn=get_edit_audio_part,
1172
  inputs=[input_audio, edit_start_time, edit_end_time],
1173
  outputs=[edit_audio])
 
1195
  ],
1196
  outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
1197
 
 
 
 
1198
  edit_from_word.change(fn=update_bound_word,
1199
  inputs=[gr.State(True), edit_from_word, edit_word_mode],
1200
  outputs=[edit_start_time])