Spaces:

LEMAS-Project
/

LEMAS-Edit

Sleeping

App Files Files Community

Approximetal committed on Jan 5

Commit

71e70ab

verified ·

1 Parent(s): b47a221

Update gradio_mix.py

Browse files

Files changed (1) hide show

gradio_mix.py +2 -64

gradio_mix.py CHANGED Viewed

@@ -684,7 +684,6 @@ def transcribe(seed, audio_info):
     return [
         state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
-        # gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
         gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
         gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
         state
@@ -708,7 +707,6 @@ def align(transcript, audio_info, state):
     return [
         state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
-        # gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
         gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
         gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
         state
@@ -789,11 +787,6 @@ def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
         except Exception as e:
             logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
-    # if mode == "Rerun":
-    #     colon_position = selected_sentence.find(':')
-    #     selected_sentence_idx = int(selected_sentence[:colon_position])
-    #     sentences = [selected_sentence[colon_position + 1:]]
     # Choose base audio (denoised if duration matches)
     audio_base = audio_info
     audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
@@ -964,32 +957,14 @@ def update_bound_words(from_selected_word, to_selected_word, edit_word_mode):
 smart_transcript_info = """
 If enabled, the target transcript will be constructed for you:</br>
- - In TTS and Long TTS mode just write the text you want to synthesize.</br>
  - In Edit mode just write the text to replace selected editing segment.</br>
-If disabled, you should write the target transcript yourself:</br>
- - In TTS mode write prompt transcript followed by generation transcript.</br>
- - In Long TTS select split by newline (<b>SENTENCE SPLIT WON'T WORK</b>) and start each line with a prompt transcript.</br>
- - In Edit mode write full prompt</br>
 """
 demo_original_transcript = ""
 demo_text = {
-    "TTS": {
-        "smart": "take over the stage for half an hour,",
-        "regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour."
-    },
     "Edit": {
-        "smart": "Just write it line-by-line.",
-        "regular": "照片、医疗记录、神经重塑的易损性，这是某种数据库啊！还有PRELESS的脑部扫描、生物管型、神经重塑."
-    },
-    "Long TTS": {
-        "smart": "You can run the model on a big text!\n"
-                 "Just write it line-by-line. Or sentence-by-sentence.\n"
-                 "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
-        "regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n"
-                   "Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n"
-                   "Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
     }
 }
@@ -1033,7 +1008,6 @@ def get_app():
                             scale=3,
                         )
                         denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
-                        # whisper_backend_choice = gr.Radio(label="Whisper backend", value="", choices=["whisperX", "whisper"])
                         whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
                         align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
@@ -1041,27 +1015,10 @@ def get_app():
             with gr.Column(scale=2):
                 # Use a numpy waveform as default value to avoid Gradio's
                 # InvalidPathError with local filesystem paths.
-                _demo_value = None
-                demo_candidates = [
-                    os.path.join(DEMO_PATH, "test.wav"),
-                ]
-                for demo_path in demo_candidates:
-                    try:
-                        if not os.path.isfile(demo_path):
-                            continue
-                        _demo_wav, _demo_sr = torchaudio.load(demo_path)
-                        if _demo_wav.dim() > 1 and _demo_wav.shape[0] > 1:
-                            _demo_wav = _demo_wav.mean(dim=0, keepdim=True)
-                        _demo_value = (_demo_sr, _demo_wav.squeeze(0).numpy())
-                        break
-                    except Exception:
-                        continue
                 input_audio = gr.Audio(
-                    value=_demo_value,
                     label="Input Audio",
                     interactive=True,
-                    type="numpy",
                 )
                 with gr.Row():
@@ -1095,18 +1052,6 @@ def get_app():
                         edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
                                                 info="What to do with first and last word", visible=False)
-                    # with gr.Group(visible=False) as tts_mode_controls:
-                        # with gr.Row():
-                        #     edit_from_word = gr.Dropdown(label="First word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
-                        #     edit_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[18], interactive=True)
-                        # with gr.Row():
-                        #     edit_start_time = gr.Slider(label="Prompt start time", minimum=0, maximum=7.614, step=0.001, value=4.022)
-                        #     edit_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.614, step=0.001, value=5.768)
-                        # with gr.Row():
-                        #     check_btn = gr.Button(value="Check prompt",scale=1)
-                        #     edit_audio = gr.Audio(label="Prompt Audio", scale=3)
-                    # with gr.Group() as edit_mode_controls:
                     with gr.Row():
                         edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
                         edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
@@ -1223,10 +1168,6 @@ def get_app():
                         inputs=[input_audio],
                         outputs=[denoise_audio])
-        # mode.change(fn=change_mode,
-        #             inputs=[mode],
-        #             outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor])
         check_btn.click(fn=get_edit_audio_part,
                         inputs=[input_audio, edit_start_time, edit_end_time],
                         outputs=[edit_audio])
@@ -1254,9 +1195,6 @@ def get_app():
                         ],
                         outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
-        # prompt_to_word.change(fn=update_bound_word,
-        #                     inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
-        #                     outputs=[prompt_end_time])
         edit_from_word.change(fn=update_bound_word,
                             inputs=[gr.State(True), edit_from_word, edit_word_mode],
                             outputs=[edit_start_time])

     return [
         state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
         gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
         gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
         state
     return [
         state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
         gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True), # edit_from_word
         gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # edit_to_word
         state
         except Exception as e:
             logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
     # Choose base audio (denoised if duration matches)
     audio_base = audio_info
     audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
 smart_transcript_info = """
 If enabled, the target transcript will be constructed for you:</br>
  - In Edit mode just write the text to replace selected editing segment.</br>
 """
 demo_original_transcript = ""
 demo_text = {
     "Edit": {
+        "smart": "write new words here.",
     }
 }
                             scale=3,
                         )
                         denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
                         whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
                         align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
             with gr.Column(scale=2):
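                 # NOTE(review): the default value below is now a file path under
                 # DEMO_PATH rather than a numpy waveform. The numpy waveform was
                 # originally used to avoid Gradio's InvalidPathError with local
                 # filesystem paths -- confirm the path-based default does not
                 # reintroduce that error.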
                 input_audio = gr.Audio(
+                    value=os.path.join(DEMO_PATH, "test.wav"),
                     label="Input Audio",
                     interactive=True,
                 )
                 with gr.Row():
                         edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
                                                 info="What to do with first and last word", visible=False)
                     with gr.Row():
                         edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
                         edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
                         inputs=[input_audio],
                         outputs=[denoise_audio])
         check_btn.click(fn=get_edit_audio_part,
                         inputs=[input_audio, edit_start_time, edit_end_time],
                         outputs=[edit_audio])
                         ],
                         outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
         edit_from_word.change(fn=update_bound_word,
                             inputs=[gr.State(True), edit_from_word, edit_word_mode],
                             outputs=[edit_start_time])