Spaces:

LAP-DEV
/

Demo

Sleeping

App Files Community

LAP-DEV committed on Feb 17, 2025

Commit

59414b9

verified ·

1 Parent(s): 71df80d

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -39

app.py CHANGED Viewed

@@ -101,7 +101,6 @@ class App:
                 with gr.Row():
                     dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model", info="Larger models increase transcription quality, but reduce performance", interactive=True)
                     dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language", info="If the language is known upfront, always set it manually", interactive=True)
-                    #dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
                     dd_file_format = gr.Dropdown(choices=["TXT","SRT"], value="TXT", label="Output format", info="Output preview format", interactive=True, visible=False)
                 with gr.Row():
                     dd_translate_model = gr.Dropdown(choices=self.nllb_inf.available_models, value=nllb_params["model_size"],label="Model", info="Model used for translation", interactive=True)
@@ -110,9 +109,9 @@ class App:
                 with gr.Row():
                     cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
                 with gr.Row():
-                    cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate transcription to English", info="Translate using OpenAI Whisper's built-in module",interactive=True)
                 with gr.Row():
-                    cb_translate_output = gr.Checkbox(value=translation_params["translate_output"], label="Translate output to selected language", info="Translate using Facebook's NLLB",interactive=True)
         with gr.Accordion("Speaker diarization", open=False, visible=True):
             cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
@@ -122,42 +121,44 @@ class App:
                                                 value=self.whisper_inf.diarizer.get_device(),
                                                 interactive=True, visible=False)
-        with gr.Accordion("Voice Detection Filter (⚠ experimental)", open=False, visible=True):
-            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
-                                        interactive=True,
-                                        info="Enable to transcribe only detected voice parts")
-            sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
-                                     value=vad_params["threshold"],
-                                     info="Lower it to be more sensitive to small sounds")
-            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
-                                                  value=vad_params["min_speech_duration_ms"],
-                                                  info="Final speech chunks shorter than this time are thrown out")
-            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
-                                                 value=vad_params["max_speech_duration_s"],
-                                                 info="Maximum duration of speech chunks in seconds")
-            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
-                                                   value=vad_params["min_silence_duration_ms"],
-                                                   info="In the end of each speech chunk wait for this time"
-                                                        " before separating it")
-            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
-                                         info="Final speech chunks are padded by this time each side")
-        with gr.Accordion("Background Music Remover Filter (⚠ experimental)", open=False):
-            cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
-                                            info="Enable to remove background music by submodel before transcribing",
                                             interactive=True)
-            dd_uvr_device = gr.Dropdown(label="Device",
-                                            value=self.whisper_inf.music_separator.device,
-                                            choices=self.whisper_inf.music_separator.available_devices,
-                                            interactive=True, visible=False)
-            dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
-                                            choices=self.whisper_inf.music_separator.available_models)
-            nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0,
-                                            interactive=True, visible=False)
-            cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"],
-                                           interactive=True, visible=False)
-            cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",value=uvr_params["enable_offload"],
-                                            interactive=True, visible=False)
         with gr.Accordion("Advanced processing options", open=False, visible=False):
             nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
@@ -293,7 +294,7 @@ class App:
                         btn_reset.click(None,js="window.location.reload()")
                     with gr.Row():
                         with gr.Column(scale=4):
-                            tb_indicator = gr.Textbox(label="Output preview (Always review & verify the output generated by AI models)", show_copy_button=True, show_label=True)
                         with gr.Column(scale=1):
                             tb_info = gr.Textbox(label="Output info", interactive=False, show_copy_button=True)
                             files_subtitles = gr.Files(label="Output data", interactive=False, file_count="multiple")

                 with gr.Row():
                     dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model", info="Larger models increase transcription quality, but reduce performance", interactive=True)
                     dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language", info="If the language is known upfront, always set it manually", interactive=True)
                     dd_file_format = gr.Dropdown(choices=["TXT","SRT"], value="TXT", label="Output format", info="Output preview format", interactive=True, visible=False)
                 with gr.Row():
                     dd_translate_model = gr.Dropdown(choices=self.nllb_inf.available_models, value=nllb_params["model_size"],label="Model", info="Model used for translation", interactive=True)
                 with gr.Row():
                     cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
                 with gr.Row():
+                    cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English", info="Translate using OpenAI Whisper's built-in module",interactive=True)
                 with gr.Row():
+                    cb_translate_output = gr.Checkbox(value=translation_params["translate_output"], label="Translate to selected language", info="Translate using Facebook's NLLB",interactive=True)
         with gr.Accordion("Speaker diarization", open=False, visible=True):
             cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
                                                 value=self.whisper_inf.diarizer.get_device(),
                                                 interactive=True, visible=False)
+        with gr.Accordion("Preprocessing options (⚠ Beta)", open=False, visible=True):
+            with gr.Accordion("Voice Detection Filter", open=False, visible=True):
+                cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
+                                            info="Enable to transcribe only detected voice parts",
                                             interactive=True)
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+                                         value=vad_params["threshold"],
+                                         info="Lower it to be more sensitive to small sounds")
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+                                                      value=vad_params["min_speech_duration_ms"],
+                                                      info="Final speech chunks shorter than this time are thrown out")
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
+                                                     value=vad_params["max_speech_duration_s"],
+                                                     info="Maximum duration of speech chunks in seconds")
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+                                                       value=vad_params["min_silence_duration_ms"],
+                                                       info="In the end of each speech chunk wait for this time"
+                                                            " before separating it")
+                nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
+                                             info="Final speech chunks are padded by this time each side")
+            with gr.Accordion("Background Music Remover Filter", open=False):
+                cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
+                                                info="Enable to remove background music by submodel before transcribing",
+                                                interactive=True)
+                dd_uvr_device = gr.Dropdown(label="Device",
+                                                value=self.whisper_inf.music_separator.device,
+                                                choices=self.whisper_inf.music_separator.available_devices,
+                                                interactive=True, visible=False)
+                dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                                choices=self.whisper_inf.music_separator.available_models,
+                                                interactive=True)
+                nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0,
+                                                interactive=True, visible=False)
+                cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"],
+                                               interactive=True, visible=False)
+                cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",value=uvr_params["enable_offload"],
+                                                interactive=True, visible=False)
         with gr.Accordion("Advanced processing options", open=False, visible=False):
             nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
                         btn_reset.click(None,js="window.location.reload()")
                     with gr.Row():
                         with gr.Column(scale=4):
+                            tb_indicator = gr.Textbox(label="Output preview (Always review output generated by AI models)", show_copy_button=True, show_label=True)
                         with gr.Column(scale=1):
                             tb_info = gr.Textbox(label="Output info", interactive=False, show_copy_button=True)
                             files_subtitles = gr.Files(label="Output data", interactive=False, file_count="multiple")