Update app.py
Browse files
app.py
CHANGED
|
@@ -101,7 +101,6 @@ class App:
|
|
| 101 |
with gr.Row():
|
| 102 |
dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model", info="Larger models increase transcription quality, but reduce performance", interactive=True)
|
| 103 |
dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language", info="If the language is known upfront, always set it manually", interactive=True)
|
| 104 |
-
#dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
| 105 |
dd_file_format = gr.Dropdown(choices=["TXT","SRT"], value="TXT", label="Output format", info="Output preview format", interactive=True, visible=False)
|
| 106 |
with gr.Row():
|
| 107 |
dd_translate_model = gr.Dropdown(choices=self.nllb_inf.available_models, value=nllb_params["model_size"],label="Model", info="Model used for translation", interactive=True)
|
|
@@ -110,9 +109,9 @@ class App:
|
|
| 110 |
with gr.Row():
|
| 111 |
cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
|
| 112 |
with gr.Row():
|
| 113 |
-
cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate
|
| 114 |
with gr.Row():
|
| 115 |
-
cb_translate_output = gr.Checkbox(value=translation_params["translate_output"], label="Translate
|
| 116 |
|
| 117 |
with gr.Accordion("Speaker diarization", open=False, visible=True):
|
| 118 |
cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
|
|
@@ -122,42 +121,44 @@ class App:
|
|
| 122 |
value=self.whisper_inf.diarizer.get_device(),
|
| 123 |
interactive=True, visible=False)
|
| 124 |
|
| 125 |
-
with gr.Accordion("
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
|
| 130 |
-
value=vad_params["threshold"],
|
| 131 |
-
info="Lower it to be more sensitive to small sounds")
|
| 132 |
-
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
|
| 133 |
-
value=vad_params["min_speech_duration_ms"],
|
| 134 |
-
info="Final speech chunks shorter than this time are thrown out")
|
| 135 |
-
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
|
| 136 |
-
value=vad_params["max_speech_duration_s"],
|
| 137 |
-
info="Maximum duration of speech chunks in seconds")
|
| 138 |
-
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
|
| 139 |
-
value=vad_params["min_silence_duration_ms"],
|
| 140 |
-
info="In the end of each speech chunk wait for this time"
|
| 141 |
-
" before separating it")
|
| 142 |
-
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
|
| 143 |
-
info="Final speech chunks are padded by this time each side")
|
| 144 |
-
|
| 145 |
-
with gr.Accordion("Background Music Remover Filter (⚠ experimental)", open=False):
|
| 146 |
-
cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
|
| 147 |
-
info="Enable to remove background music by submodel before transcribing",
|
| 148 |
interactive=True)
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
with gr.Accordion("Advanced processing options", open=False, visible=False):
|
| 163 |
nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
|
|
@@ -293,7 +294,7 @@ class App:
|
|
| 293 |
btn_reset.click(None,js="window.location.reload()")
|
| 294 |
with gr.Row():
|
| 295 |
with gr.Column(scale=4):
|
| 296 |
-
tb_indicator = gr.Textbox(label="Output preview (Always review
|
| 297 |
with gr.Column(scale=1):
|
| 298 |
tb_info = gr.Textbox(label="Output info", interactive=False, show_copy_button=True)
|
| 299 |
files_subtitles = gr.Files(label="Output data", interactive=False, file_count="multiple")
|
|
|
|
| 101 |
with gr.Row():
|
| 102 |
dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model", info="Larger models increase transcription quality, but reduce performance", interactive=True)
|
| 103 |
dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language", info="If the language is known upfront, always set it manually", interactive=True)
|
|
|
|
| 104 |
dd_file_format = gr.Dropdown(choices=["TXT","SRT"], value="TXT", label="Output format", info="Output preview format", interactive=True, visible=False)
|
| 105 |
with gr.Row():
|
| 106 |
dd_translate_model = gr.Dropdown(choices=self.nllb_inf.available_models, value=nllb_params["model_size"],label="Model", info="Model used for translation", interactive=True)
|
|
|
|
| 109 |
with gr.Row():
|
| 110 |
cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
|
| 111 |
with gr.Row():
|
| 112 |
+
cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English", info="Translate using OpenAI Whisper's built-in module",interactive=True)
|
| 113 |
with gr.Row():
|
| 114 |
+
cb_translate_output = gr.Checkbox(value=translation_params["translate_output"], label="Translate to selected language", info="Translate using Facebook's NLLB",interactive=True)
|
| 115 |
|
| 116 |
with gr.Accordion("Speaker diarization", open=False, visible=True):
|
| 117 |
cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
|
|
|
|
| 121 |
value=self.whisper_inf.diarizer.get_device(),
|
| 122 |
interactive=True, visible=False)
|
| 123 |
|
| 124 |
+
with gr.Accordion("Preprocessing options (⚠ Beta)", open=False, visible=True):
|
| 125 |
+
with gr.Accordion("Voice Detection Filter", open=False, visible=True):
|
| 126 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
|
| 127 |
+
info="Enable to transcribe only detected voice parts",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
interactive=True)
|
| 129 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
|
| 130 |
+
value=vad_params["threshold"],
|
| 131 |
+
info="Lower it to be more sensitive to small sounds")
|
| 132 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
|
| 133 |
+
value=vad_params["min_speech_duration_ms"],
|
| 134 |
+
info="Final speech chunks shorter than this time are thrown out")
|
| 135 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
|
| 136 |
+
value=vad_params["max_speech_duration_s"],
|
| 137 |
+
info="Maximum duration of speech chunks in seconds")
|
| 138 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
|
| 139 |
+
value=vad_params["min_silence_duration_ms"],
|
| 140 |
+
info="In the end of each speech chunk wait for this time"
|
| 141 |
+
" before separating it")
|
| 142 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
|
| 143 |
+
info="Final speech chunks are padded by this time each side")
|
| 144 |
+
|
| 145 |
+
with gr.Accordion("Background Music Remover Filter", open=False):
|
| 146 |
+
cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
|
| 147 |
+
info="Enable to remove background music by submodel before transcribing",
|
| 148 |
+
interactive=True)
|
| 149 |
+
dd_uvr_device = gr.Dropdown(label="Device",
|
| 150 |
+
value=self.whisper_inf.music_separator.device,
|
| 151 |
+
choices=self.whisper_inf.music_separator.available_devices,
|
| 152 |
+
interactive=True, visible=False)
|
| 153 |
+
dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
|
| 154 |
+
choices=self.whisper_inf.music_separator.available_models,
|
| 155 |
+
interactive=True)
|
| 156 |
+
nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0,
|
| 157 |
+
interactive=True, visible=False)
|
| 158 |
+
cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"],
|
| 159 |
+
interactive=True, visible=False)
|
| 160 |
+
cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",value=uvr_params["enable_offload"],
|
| 161 |
+
interactive=True, visible=False)
|
| 162 |
|
| 163 |
with gr.Accordion("Advanced processing options", open=False, visible=False):
|
| 164 |
nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
|
|
|
|
| 294 |
btn_reset.click(None,js="window.location.reload()")
|
| 295 |
with gr.Row():
|
| 296 |
with gr.Column(scale=4):
|
| 297 |
+
tb_indicator = gr.Textbox(label="Output preview (Always review output generated by AI models)", show_copy_button=True, show_label=True)
|
| 298 |
with gr.Column(scale=1):
|
| 299 |
tb_info = gr.Textbox(label="Output info", interactive=False, show_copy_button=True)
|
| 300 |
files_subtitles = gr.Files(label="Output data", interactive=False, file_count="multiple")
|