Implement speaker diarization and transcription merging pipeline
- Add diarization functionality with configurable speaker count
- Create merge_transcription_and_diarization function to assign speakers to transcribed text
- Enhance app.py with debug mode and dynamic file loading
- Update UI to support advanced speaker configuration
- Modify logo display and add visibility controls for intermediate outputs
app.py
CHANGED
|
@@ -19,12 +19,14 @@ from ui_config import (
|
|
| 19 |
ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
|
| 20 |
BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
|
| 21 |
BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
|
| 22 |
-
html_social_media,
|
| 23 |
)
|
| 24 |
# from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
|
| 25 |
from slice_audio import slice_audio as slice_audio_main
|
| 26 |
from audio import get_audio_from_video
|
| 27 |
from transcribe import transcribe, get_language_dict
|
|
|
|
|
|
|
| 28 |
|
| 29 |
NUMBER = 100
|
| 30 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
@@ -56,13 +58,17 @@ YOUTUBE = "youtube"
|
|
| 56 |
TWITCH = "twitch"
|
| 57 |
ERROR = "error"
|
| 58 |
|
|
|
|
|
|
|
|
|
|
| 59 |
subtify_logo = Image.open("assets/subtify_logo-scaled.png")
|
| 60 |
subtify_logo_width, subtify_logo_height = subtify_logo.size
|
| 61 |
factor = 4
|
| 62 |
new_width = subtify_logo_width // factor
|
| 63 |
new_height = subtify_logo_height // factor
|
| 64 |
|
| 65 |
-
|
|
|
|
| 66 |
|
| 67 |
language_dict = union_language_dict()
|
| 68 |
|
|
@@ -118,8 +124,10 @@ def change_visibility_texboxes():
|
|
| 118 |
return (
|
| 119 |
gr.update(value="Done"), # auxiliar_block1
|
| 120 |
gr.update(visible=True), # get_audio_from_video_info
|
|
|
|
| 121 |
gr.update(visible=True), # video_sliced_progress_info
|
| 122 |
gr.update(visible=True), # video_transcribed_progress_info
|
|
|
|
| 123 |
gr.update(visible=True), # transcriptions_concatenated_progress_info
|
| 124 |
gr.update(visible=True), # video_translated_progress_info
|
| 125 |
gr.update(visible=True), # video_subtitled_progress_info
|
|
@@ -128,8 +136,17 @@ def change_visibility_texboxes():
|
|
| 128 |
def get_audio(video_path):
|
| 129 |
print('*'*NUMBER)
|
| 130 |
print(f"Getting audio from video {video_path}")
|
| 131 |
-
|
| 132 |
audios_folder = "audios"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
try:
|
| 134 |
audio_path = get_audio_from_video(video_path, audios_folder)
|
| 135 |
return [
|
|
@@ -162,18 +179,64 @@ def slice_audio(input_audio_path):
|
|
| 162 |
gr.update(value="Ok"), # video_sliced_progress_info
|
| 163 |
)
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
def trascribe_audio(input_audio_path, source_languaje):
|
| 166 |
print('*'*NUMBER)
|
| 167 |
print(f"Transcript {input_audio_path}")
|
| 168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
# Get language dict
|
| 170 |
language_dict = get_language_dict()
|
| 171 |
|
| 172 |
# Transcribe audio file
|
| 173 |
-
transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
return (
|
| 176 |
-
gr.
|
|
|
|
| 177 |
)
|
| 178 |
|
| 179 |
def concatenate_transcriptions():
|
|
@@ -276,6 +339,16 @@ def process_uploaded_video(video_path):
|
|
| 276 |
videos_folder = "videos"
|
| 277 |
if not os.path.exists(videos_folder):
|
| 278 |
os.makedirs(videos_folder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
# Copy uploaded video to videos folder
|
| 281 |
new_video_path = os.path.join(videos_folder, "download_video.mp4")
|
|
@@ -285,7 +358,103 @@ def process_uploaded_video(video_path):
|
|
| 285 |
return [
|
| 286 |
gr.update(label="Video uploaded"), # video_input
|
| 287 |
gr.update(visible=True), # config_block
|
| 288 |
-
gr.update(value=new_video_path) # original_video_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
]
|
| 290 |
|
| 291 |
@spaces.GPU
|
|
@@ -336,7 +505,7 @@ def subtify():
|
|
| 336 |
# Layout
|
| 337 |
gr.HTML(html_social_media)
|
| 338 |
gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
|
| 339 |
-
gr.HTML(
|
| 340 |
|
| 341 |
# Input block, where the user can upload a video and configure the subtify process
|
| 342 |
visible = False
|
|
@@ -357,21 +526,27 @@ def subtify():
|
|
| 357 |
with gr.Row():
|
| 358 |
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
|
| 359 |
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
|
| 360 |
-
with gr.Accordion("Advanced settings", open=
|
| 361 |
-
number_of_speakers = gr.
|
|
|
|
|
|
|
| 362 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
|
| 363 |
|
| 364 |
auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
|
| 365 |
with gr.Row():
|
| 366 |
get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
|
| 367 |
video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
|
|
|
|
|
|
|
| 368 |
transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
|
| 369 |
video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
|
| 370 |
video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
|
| 371 |
|
| 372 |
original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
|
| 373 |
original_video_path = gr.Textbox(label="Original video path", visible=visible)
|
| 374 |
-
|
|
|
|
|
|
|
| 375 |
original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
|
| 376 |
subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
|
| 377 |
auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
|
|
@@ -400,11 +575,11 @@ def subtify():
|
|
| 400 |
video_input.change(
|
| 401 |
fn=process_uploaded_video,
|
| 402 |
inputs=[video_input],
|
| 403 |
-
outputs=[video_input, config_block, original_video_path]
|
| 404 |
)
|
| 405 |
subtify_button.click(
|
| 406 |
fn=change_visibility_texboxes,
|
| 407 |
-
outputs=[auxiliar_block1, get_audio_from_video_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
|
| 408 |
)
|
| 409 |
auxiliar_block1.change(
|
| 410 |
fn=get_audio,
|
|
@@ -414,12 +589,17 @@ def subtify():
|
|
| 414 |
get_audio_from_video_info.change(
|
| 415 |
fn=trascribe_audio,
|
| 416 |
inputs=[original_audio_path, source_languaje],
|
| 417 |
-
outputs=[video_transcribed_progress_info]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
)
|
| 419 |
-
# video_transcribed_progress_info.change(
|
| 420 |
-
# fn=concatenate_transcriptions,
|
| 421 |
-
# outputs=[transcriptions_concatenated_progress_info, original_audio_transcribed_path]
|
| 422 |
-
# )
|
| 423 |
# transcriptions_concatenated_progress_info.change(
|
| 424 |
# fn=translate_transcription,
|
| 425 |
# inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
|
|
|
|
| 19 |
ERROR_BACKGROUND_COLOR, ERROR_TEXT_COLOR, ERROR_BORDER_COLOR,
|
| 20 |
BUTTON_SECONDARY_BACKGROUND_COLOR, BUTTON_SECONDARY_BORDER_COLOR,
|
| 21 |
BUTTON_SECONDARY_TEXT_COLOR, RED, GREEN, BLUE,
|
| 22 |
+
html_social_media, get_html_subtify_logo_big, get_html_subtify_logo_small, html_buy_me_a_coffe
|
| 23 |
)
|
| 24 |
# from url_manager import get_youtube_thumbnail, is_valid_youtube_url, is_valid_twitch_url, is_valid_url
|
| 25 |
from slice_audio import slice_audio as slice_audio_main
|
| 26 |
from audio import get_audio_from_video
|
| 27 |
from transcribe import transcribe, get_language_dict
|
| 28 |
+
from diarize_library import diarize_audio
|
| 29 |
+
import json
|
| 30 |
|
| 31 |
NUMBER = 100
|
| 32 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 58 |
TWITCH = "twitch"
|
| 59 |
ERROR = "error"
|
| 60 |
|
| 61 |
+
VIEW_OUTPUTS = True
|
| 62 |
+
DEBUG = True
|
| 63 |
+
|
| 64 |
subtify_logo = Image.open("assets/subtify_logo-scaled.png")
|
| 65 |
subtify_logo_width, subtify_logo_height = subtify_logo.size
|
| 66 |
factor = 4
|
| 67 |
new_width = subtify_logo_width // factor
|
| 68 |
new_height = subtify_logo_height // factor
|
| 69 |
|
| 70 |
+
html_subtify_logo_big = get_html_subtify_logo_big(new_width, new_height)
|
| 71 |
+
html_subtify_logo_small = get_html_subtify_logo_small(new_width, new_height)
|
| 72 |
|
| 73 |
language_dict = union_language_dict()
|
| 74 |
|
|
|
|
| 124 |
return (
|
| 125 |
gr.update(value="Done"), # auxiliar_block1
|
| 126 |
gr.update(visible=True), # get_audio_from_video_info
|
| 127 |
+
gr.update(visible=True), # merged_transcription
|
| 128 |
gr.update(visible=True), # video_sliced_progress_info
|
| 129 |
gr.update(visible=True), # video_transcribed_progress_info
|
| 130 |
+
gr.update(visible=True), # diarization_progress_info
|
| 131 |
gr.update(visible=True), # transcriptions_concatenated_progress_info
|
| 132 |
gr.update(visible=True), # video_translated_progress_info
|
| 133 |
gr.update(visible=True), # video_subtitled_progress_info
|
|
|
|
| 136 |
def get_audio(video_path):
|
| 137 |
print('*'*NUMBER)
|
| 138 |
print(f"Getting audio from video {video_path}")
|
| 139 |
+
|
| 140 |
audios_folder = "audios"
|
| 141 |
+
|
| 142 |
+
if DEBUG:
|
| 143 |
+
audio_file = f"{audios_folder}/download_audio.mp3"
|
| 144 |
+
if os.path.exists(audio_file):
|
| 145 |
+
return [
|
| 146 |
+
gr.update(value="Loaded"), # get_audio_from_video_info
|
| 147 |
+
gr.update(value=audio_file) # original_audio_path
|
| 148 |
+
]
|
| 149 |
+
|
| 150 |
try:
|
| 151 |
audio_path = get_audio_from_video(video_path, audios_folder)
|
| 152 |
return [
|
|
|
|
| 179 |
gr.update(value="Ok"), # video_sliced_progress_info
|
| 180 |
)
|
| 181 |
|
| 182 |
+
def diarize(input_audio_path, num_speakers, min_speakers, max_speakers):
    """Run speaker diarization on an audio file and persist the result as JSON.

    When DEBUG is enabled and ``diarization/diarization.json`` already exists,
    the file is returned as-is (raw JSON text) and the diarizer is not run.
    Otherwise ``diarize_audio`` is called and its result is saved to that file.

    Args:
        input_audio_path: Path of the audio file to diarize.
        num_speakers: Speaker count forwarded unchanged to ``diarize_audio``.
        min_speakers: Lower bound forwarded unchanged to ``diarize_audio``.
        max_speakers: Upper bound forwarded unchanged to ``diarize_audio``.

    Returns:
        list: Two ``gr.update`` objects: the progress status ("Loaded" or
        "Ok") and the diarization value (raw file text on the cached path,
        the object returned by ``diarize_audio`` otherwise).
    """
    print('*'*NUMBER)
    print(f"Diarize {input_audio_path}")

    # Diarization file
    diarization_file = "diarization/diarization.json"

    # In debug mode reuse a previous result so the model is not run again.
    if DEBUG and os.path.exists(diarization_file):
        with open(diarization_file, "r", encoding="utf-8") as f:
            diarization = f.read()
        return [
            gr.update(value="Loaded"),
            gr.update(value=diarization)
        ]

    # Diarize audio
    diarization = diarize_audio(input_audio_path, num_speakers, min_speakers, max_speakers, DEVICE)

    # Save diarization. The output folder may not exist on a fresh checkout,
    # and open(..., "w") does not create missing parent directories.
    os.makedirs(os.path.dirname(diarization_file), exist_ok=True)
    with open(diarization_file, "w", encoding="utf-8") as f:
        json.dump(diarization, f)

    return [
        gr.update(value="Ok"),
        gr.update(value=diarization)
    ]
|
| 209 |
+
|
| 210 |
def trascribe_audio(input_audio_path, source_languaje):
    """Transcribe an audio file and persist the transcription as JSON.

    When DEBUG is enabled and ``transcriptions/transcription_<language>.json``
    already exists, the parsed file is returned and the transcriber is not run.
    Otherwise ``transcribe`` is called and the dictionary it returns is saved
    to that file.

    Args:
        input_audio_path: Path of the audio file to transcribe.
        source_languaje: Key into the dictionary returned by
            ``get_language_dict()``; also used in the output file name.

    Returns:
        Two ``gr.update`` objects: the progress status ("Loaded" or "Ok") and
        the transcription dictionary.
    """
    print('*'*NUMBER)
    print(f"Transcript {input_audio_path}")

    # Transcription file
    transcription_file = f"transcriptions/transcription_{source_languaje}.json"

    # In debug mode reuse a previous result so the model is not run again.
    if DEBUG and os.path.exists(transcription_file):
        # Context manager so the handle is closed deterministically.
        with open(transcription_file, "r", encoding="utf-8") as f:
            transcription = json.load(f)
        return [
            gr.update(value="Loaded"),
            gr.update(value=transcription)
        ]

    # Get language dict
    language_dict = get_language_dict()

    # Transcribe audio file (the plain-text transcription is not used here)
    _, transcription_dict = transcribe(input_audio_path, language_dict[source_languaje]["transcriber"], DEVICE, CHUNK_SECONDS, CHUNK_OVERLAP_SECONDS)

    # Save transcription. The output folder may not exist on a fresh checkout,
    # and open(..., "w") does not create missing parent directories.
    os.makedirs(os.path.dirname(transcription_file), exist_ok=True)
    with open(transcription_file, "w", encoding="utf-8") as f:
        json.dump(transcription_dict, f)

    return (
        gr.update(value="Ok"),
        gr.update(value=transcription_dict)
    )
|
| 241 |
|
| 242 |
def concatenate_transcriptions():
|
|
|
|
| 339 |
videos_folder = "videos"
|
| 340 |
if not os.path.exists(videos_folder):
|
| 341 |
os.makedirs(videos_folder)
|
| 342 |
+
|
| 343 |
+
if DEBUG:
|
| 344 |
+
video_file = f"{videos_folder}/download_video.mp4"
|
| 345 |
+
if os.path.exists(video_file):
|
| 346 |
+
return [
|
| 347 |
+
gr.update(label="Video uploaded"), # video_input
|
| 348 |
+
gr.update(visible=True), # config_block
|
| 349 |
+
gr.update(value=video_file), # original_video_path
|
| 350 |
+
gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
|
| 351 |
+
]
|
| 352 |
|
| 353 |
# Copy uploaded video to videos folder
|
| 354 |
new_video_path = os.path.join(videos_folder, "download_video.mp4")
|
|
|
|
| 358 |
return [
|
| 359 |
gr.update(label="Video uploaded"), # video_input
|
| 360 |
gr.update(visible=True), # config_block
|
| 361 |
+
gr.update(value=new_video_path), # original_video_path
|
| 362 |
+
gr.update(value=html_subtify_logo_small) # html_subtify_logo_component
|
| 363 |
+
]
|
| 364 |
+
|
| 365 |
+
def _is_number(value):
    """Return True when *value* is an int or a float."""
    return isinstance(value, (int, float))


def _find_speaker(word_start, word_end, segments):
    """Return the speaker of the first segment that matches the word.

    A segment matches when it fully contains the word or when it overlaps
    more than 50% of the word's duration. Returns None when nothing matches.
    *segments* is an iterable of ``(start, end, speaker)`` tuples.
    """
    word_duration = word_end - word_start
    for segment_start, segment_end, speaker in segments:
        # Word entirely inside the segment (also covers zero-length words,
        # which the overlap rule below can never satisfy).
        if word_start >= segment_start and word_end <= segment_end:
            return speaker

        # Word mostly inside the segment (more than 50% of its duration).
        overlap_start = max(word_start, segment_start)
        overlap_end = min(word_end, segment_end)
        overlap_duration = max(0, overlap_end - overlap_start)
        if overlap_duration > word_duration * 0.5:
            return speaker
    return None


def merge_transcription_and_diarization(source_languaje="English"):
    """Combine the transcription and the diarization to tag each chunk with a speaker.

    Reads ``transcriptions/transcription_<source_languaje>.json`` and
    ``diarization/diarization.json``, assigns a speaker to every transcription
    chunk that has numeric ``start``/``end`` values, and writes the result to
    ``merged_transcription_diarization/merged.json``. When DEBUG is enabled
    and that merged file already exists, it is loaded and returned instead.

    Args:
        source_languaje: Language name used to locate the transcription file.
            Defaults to "English", the file this function originally read.

    Returns:
        list: Two ``gr.update`` objects: the progress status ("Loaded" or
        "Ok") and the merged transcription dictionary with keys ``text`` and
        ``chunks`` (each chunk has ``start``, ``end``, ``text``, ``speaker``).
    """
    print('*'*NUMBER)
    print("Merge transcription and diarization")

    # Defined unconditionally: it is needed for saving even when DEBUG is off.
    merged_transcription_path = "merged_transcription_diarization/merged.json"

    # isfile rather than exists: a directory at this path is not a cached result.
    if DEBUG and os.path.isfile(merged_transcription_path):
        with open(merged_transcription_path, 'r', encoding='utf-8') as f:
            merged_transcription = json.load(f)
        return [
            gr.update(value="Loaded"),
            gr.update(value=merged_transcription)
        ]

    transcription_path = f"transcriptions/transcription_{source_languaje}.json"
    diarization_path = "diarization/diarization.json"

    # Load the JSON files
    with open(transcription_path, 'r', encoding='utf-8') as f:
        transcription = json.load(f)
    with open(diarization_path, 'r', encoding='utf-8') as f:
        diarization = json.load(f)

    # Keep only segments with valid timestamps. Done once here instead of
    # re-validating every segment for every word.
    segments = [
        (float(segment['start']), float(segment['end']), segment.get('speaker'))
        for segment in diarization
        if _is_number(segment.get('start')) and _is_number(segment.get('end'))
    ]

    # Build the merged chunks, skipping chunks without valid timestamps
    merged_chunks = []
    for chunk in transcription.get('chunks', []):
        if not (_is_number(chunk.get('start')) and _is_number(chunk.get('end'))):
            continue

        word_start = float(chunk['start'])
        word_end = float(chunk['end'])
        speaker = _find_speaker(word_start, word_end, segments)

        merged_chunks.append({
            'start': word_start,
            'end': word_end,
            'text': chunk['text'],
            'speaker': speaker if speaker else 'UNKNOWN'
        })

    # Build the final dictionary
    merged_transcription = {
        'text': transcription.get('text', ''),
        'chunks': merged_chunks
    }

    # Create the parent directory (not the file path itself) if missing
    os.makedirs(os.path.dirname(merged_transcription_path), exist_ok=True)

    # Save the result
    with open(merged_transcription_path, 'w', encoding='utf-8') as f:
        json.dump(merged_transcription, f, ensure_ascii=False, indent=2)

    return [
        gr.update(value="Ok"),
        gr.update(value=merged_transcription)
    ]
|
| 459 |
|
| 460 |
@spaces.GPU
|
|
|
|
| 505 |
# Layout
|
| 506 |
gr.HTML(html_social_media)
|
| 507 |
gr.HTML("<h1 style='text-align: center;'>Subtify</h1>")
|
| 508 |
+
html_subtify_logo_component = gr.HTML(html_subtify_logo_big)
|
| 509 |
|
| 510 |
# Input block, where the user can upload a video and configure the subtify process
|
| 511 |
visible = False
|
|
|
|
| 526 |
with gr.Row():
|
| 527 |
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
|
| 528 |
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
|
| 529 |
+
with gr.Accordion("Advanced settings", open=True, visible=True) as Advanced_setings:
|
| 530 |
+
number_of_speakers = gr.Number(visible=True, label="Number of speakers", show_label=True, value=0, interactive=True, info="Number of speakers in the video, if you don't know, select 0")
|
| 531 |
+
min_speakers = gr.Number(visible=True, label="Min speakers", show_label=True, value=0, scale=0, interactive=True, info="Minimum number of speakers in the video")
|
| 532 |
+
max_speakers = gr.Number(visible=True, label="Max speakers", show_label=True, value=0, scale=0, interactive=True, info="Maximum number of speakers in the video")
|
| 533 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
|
| 534 |
|
| 535 |
auxiliar_block1 = gr.Textbox(placeholder="", interactive=False, visible=visible)
|
| 536 |
with gr.Row():
|
| 537 |
get_audio_from_video_info = gr.Textbox(placeholder="Waiting", label="Get audio from video info", elem_id="get_audio_from_video_info", interactive=False, visible=visible)
|
| 538 |
video_transcribed_progress_info = gr.Textbox(placeholder="Waiting", label="Transcribe progress info", elem_id="video_transcribed_progress_info", interactive=False, visible=visible)
|
| 539 |
+
diarization_progress_info = gr.Textbox(placeholder="Waiting", label="Diarize progress info", elem_id="diarization_progress_info", interactive=False, visible=visible)
|
| 540 |
+
merged_transcription_progress_info = gr.Textbox(placeholder="Waiting", label="Merge transcription and diarization progress info", elem_id="merged_transcription_progress_info", interactive=False, visible=visible)
|
| 541 |
transcriptions_concatenated_progress_info = gr.Textbox(placeholder="Waiting", label="Concatenate progress info", elem_id="transcriptions_concatenated_progress_info", interactive=False, visible=visible)
|
| 542 |
video_translated_progress_info = gr.Textbox(placeholder="Waiting", label="Translate progress info", elem_id="transcription_translated_progress_info", interactive=False, visible=visible)
|
| 543 |
video_subtitled_progress_info = gr.Textbox(placeholder="Waiting", label="Video subtitle progress info", elem_id="video_subtitled_progress_info", interactive=False, visible=visible)
|
| 544 |
|
| 545 |
original_audio_path = gr.Textbox(label="Original audio path", elem_id="original_audio_path", visible=visible)
|
| 546 |
original_video_path = gr.Textbox(label="Original video path", visible=visible)
|
| 547 |
+
transcription = gr.Textbox(label="transcription", elem_id="transcription", visible=VIEW_OUTPUTS)
|
| 548 |
+
diarization = gr.Textbox(label="diarization", elem_id="diarization", visible=VIEW_OUTPUTS)
|
| 549 |
+
merged_transcription = gr.Textbox(label="merged_transcription", elem_id="merged_transcription", visible=VIEW_OUTPUTS)
|
| 550 |
original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=visible)
|
| 551 |
subtitled_video = gr.Video(label="Subtitled video", elem_id="subtitled_video", visible=visible, interactive=visible)
|
| 552 |
auxiliar_block3 = gr.Textbox(placeholder="Waiting", label="Auxiliar block 3", elem_id="auxiliar_block3", interactive=False, visible=visible)
|
|
|
|
| 575 |
video_input.change(
|
| 576 |
fn=process_uploaded_video,
|
| 577 |
inputs=[video_input],
|
| 578 |
+
outputs=[video_input, config_block, original_video_path, html_subtify_logo_component]
|
| 579 |
)
|
| 580 |
subtify_button.click(
|
| 581 |
fn=change_visibility_texboxes,
|
| 582 |
+
outputs=[auxiliar_block1, get_audio_from_video_info, merged_transcription_progress_info, video_transcribed_progress_info, diarization_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
|
| 583 |
)
|
| 584 |
auxiliar_block1.change(
|
| 585 |
fn=get_audio,
|
|
|
|
| 589 |
get_audio_from_video_info.change(
|
| 590 |
fn=trascribe_audio,
|
| 591 |
inputs=[original_audio_path, source_languaje],
|
| 592 |
+
outputs=[video_transcribed_progress_info, transcription]
|
| 593 |
+
)
|
| 594 |
+
video_transcribed_progress_info.change(
|
| 595 |
+
fn=diarize,
|
| 596 |
+
inputs=[original_audio_path, number_of_speakers, min_speakers, max_speakers],
|
| 597 |
+
outputs=[diarization_progress_info, diarization]
|
| 598 |
+
)
|
| 599 |
+
diarization_progress_info.change(
|
| 600 |
+
fn=merge_transcription_and_diarization,
|
| 601 |
+
outputs=[merged_transcription_progress_info, merged_transcription]
|
| 602 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
# transcriptions_concatenated_progress_info.change(
|
| 604 |
# fn=translate_transcription,
|
| 605 |
# inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
|