Update subtitle.py
Browse files- subtitle.py +86 -27
subtitle.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
"""
|
| 2 |
A comprehensive toolkit for generating and translating subtitles from media files.
|
| 3 |
|
|
@@ -207,13 +208,13 @@ def whisper_subtitle(uploaded_file, source_language):
|
|
| 207 |
# 1. Configure device and model
|
| 208 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 209 |
compute_type = "float16" if torch.cuda.is_available() else "int8"
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2",device=device, compute_type=compute_type)
|
| 217 |
|
| 218 |
|
| 219 |
# 2. Process audio file
|
|
@@ -251,11 +252,11 @@ def whisper_subtitle(uploaded_file, source_language):
|
|
| 251 |
# 6. Generate all subtitle files
|
| 252 |
generate_srt_from_sentences(sentence_timestamps, srt_path=clean_srt_path)
|
| 253 |
word_level_srt(word_timestamps, srt_path=word_srt_path)
|
| 254 |
-
write_sentence_srt(
|
| 255 |
word_timestamps, output_file=shorts_srt_path, max_lines=1,
|
| 256 |
-
max_duration_s=2.0, max_chars_per_line=
|
| 257 |
)
|
| 258 |
-
write_sentence_srt(
|
| 259 |
word_timestamps, output_file=custom_srt_path, max_lines=2,
|
| 260 |
max_duration_s=7.0, max_chars_per_line=38
|
| 261 |
)
|
|
@@ -265,7 +266,7 @@ def whisper_subtitle(uploaded_file, source_language):
|
|
| 265 |
|
| 266 |
return (
|
| 267 |
clean_srt_path, custom_srt_path, word_srt_path, shorts_srt_path,
|
| 268 |
-
txt_path, transcript_text, detected_language
|
| 269 |
)
|
| 270 |
|
| 271 |
|
|
@@ -342,12 +343,13 @@ def merge_punctuation_glitches(subtitles):
|
|
| 342 |
|
| 343 |
return cleaned
|
| 344 |
|
|
|
|
| 345 |
def write_sentence_srt(
|
| 346 |
word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
|
| 347 |
max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
|
| 348 |
merge_pause_threshold=0.4
|
| 349 |
):
|
| 350 |
-
"""Creates professional-grade SRT files
|
| 351 |
if not word_level_timestamps:
|
| 352 |
return
|
| 353 |
|
|
@@ -356,14 +358,20 @@ def write_sentence_srt(
|
|
| 356 |
i = 0
|
| 357 |
while i < len(word_level_timestamps):
|
| 358 |
start_time = word_level_timestamps[i]["start"]
|
| 359 |
-
|
|
|
|
|
|
|
|
|
|
| 360 |
j = i
|
| 361 |
while j < len(word_level_timestamps):
|
| 362 |
entry = word_level_timestamps[j]
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
|
| 366 |
-
if (entry["end"] - start_time) > max_duration_s and
|
| 367 |
|
| 368 |
if j > i:
|
| 369 |
prev_entry = word_level_timestamps[j-1]
|
|
@@ -371,16 +379,24 @@ def write_sentence_srt(
|
|
| 371 |
if pause >= hard_pause_threshold: break
|
| 372 |
if prev_entry["word"].endswith(('.','!','?')): break
|
| 373 |
|
| 374 |
-
|
|
|
|
| 375 |
j += 1
|
| 376 |
|
| 377 |
-
if not
|
| 378 |
-
|
| 379 |
j = i + 1
|
| 380 |
|
| 381 |
-
text = " ".join(
|
| 382 |
end_time = word_level_timestamps[j - 1]["end"]
|
| 383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
i = j
|
| 385 |
|
| 386 |
# Phase 2: Post-process to merge single-word "orphan" subtitles
|
|
@@ -397,20 +413,61 @@ def write_sentence_srt(
|
|
| 397 |
if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
|
| 398 |
prev_sub["text"] = merged_text
|
| 399 |
prev_sub["end"] = current_sub["end"]
|
|
|
|
|
|
|
|
|
|
| 400 |
continue
|
| 401 |
|
| 402 |
final_subtitles.append(current_sub)
|
| 403 |
|
| 404 |
final_subtitles = merge_punctuation_glitches(final_subtitles)
|
| 405 |
-
|
| 406 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
with open(output_file, "w", encoding="utf-8") as f:
|
| 408 |
for idx, sub in enumerate(final_subtitles, start=1):
|
|
|
|
| 409 |
text = sub["text"].replace(" ,", ",").replace(" .", ".")
|
| 410 |
formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
|
|
|
|
|
|
|
|
|
|
| 411 |
f.write(f"{idx}\n")
|
| 412 |
-
f.write(f"{
|
| 413 |
f.write("\n".join(formatted_lines) + "\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
|
| 415 |
def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
|
| 416 |
"""Writes a dictionary of subtitles to a standard SRT file."""
|
|
@@ -486,14 +543,15 @@ def subtitle_maker(media_file, source_lang, target_lang):
|
|
| 486 |
Returns:
|
| 487 |
A tuple containing paths to all generated files and the transcript text.
|
| 488 |
"""
|
|
|
|
| 489 |
try:
|
| 490 |
(
|
| 491 |
default_srt, custom_srt, word_srt, shorts_srt,
|
| 492 |
-
txt_path, transcript, detected_lang
|
| 493 |
) = whisper_subtitle(media_file, source_lang)
|
| 494 |
except Exception as e:
|
| 495 |
print(f"❌ An error occurred during transcription: {e}")
|
| 496 |
-
return (None, None, None, None, None, None, f"Error: {e}")
|
| 497 |
|
| 498 |
translated_srt_path = None
|
| 499 |
if detected_lang and detected_lang != target_lang:
|
|
@@ -508,7 +566,7 @@ def subtitle_maker(media_file, source_lang, target_lang):
|
|
| 508 |
|
| 509 |
return (
|
| 510 |
default_srt, translated_srt_path, custom_srt, word_srt,
|
| 511 |
-
shorts_srt, txt_path, transcript
|
| 512 |
)
|
| 513 |
|
| 514 |
|
|
@@ -525,7 +583,7 @@ os.makedirs(TEMP_FOLDER, exist_ok=True)
|
|
| 525 |
# source_lang = "English"
|
| 526 |
# target_lang = "English"
|
| 527 |
|
| 528 |
-
#
|
| 529 |
# media_file, source_lang, target_lang
|
| 530 |
# )
|
| 531 |
# If source_lang and target_lang are the same, translation will be skipped.
|
|
@@ -538,6 +596,7 @@ os.makedirs(TEMP_FOLDER, exist_ok=True)
|
|
| 538 |
# word_srt -> Word-level timestamps (useful for creating YouTube Shorts/Reels)
|
| 539 |
# shorts_srt -> Optimized subtitles for vertical videos (displays 3–4 words at a time, with a maximum of 17 characters per segment)
|
| 540 |
# txt_path -> Full transcript as plain text (useful for video summarization or for asking questions about the video or audio data with other LLM tools)
|
|
|
|
| 541 |
# transcript -> Transcript text directly returned by the function, if you just need the transcript
|
| 542 |
|
| 543 |
# All functionality is contained in a single file, making it portable
|
|
|
|
| 1 |
+
# Code written by me, organized with the help of AI.
|
| 2 |
"""
|
| 3 |
A comprehensive toolkit for generating and translating subtitles from media files.
|
| 4 |
|
|
|
|
| 208 |
# 1. Configure device and model
|
| 209 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 210 |
compute_type = "float16" if torch.cuda.is_available() else "int8"
|
| 211 |
+
model_dir = download_model(
|
| 212 |
+
"deepdml/faster-whisper-large-v3-turbo-ct2",
|
| 213 |
+
download_folder="./",
|
| 214 |
+
redownload=False
|
| 215 |
+
)
|
| 216 |
+
model = WhisperModel(model_dir, device=device, compute_type=compute_type)
|
| 217 |
+
# model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2",device=device, compute_type=compute_type)
|
| 218 |
|
| 219 |
|
| 220 |
# 2. Process audio file
|
|
|
|
| 252 |
# 6. Generate all subtitle files
|
| 253 |
generate_srt_from_sentences(sentence_timestamps, srt_path=clean_srt_path)
|
| 254 |
word_level_srt(word_timestamps, srt_path=word_srt_path)
|
| 255 |
+
shorts_json=write_sentence_srt(
|
| 256 |
word_timestamps, output_file=shorts_srt_path, max_lines=1,
|
| 257 |
+
max_duration_s=2.0, max_chars_per_line=17
|
| 258 |
)
|
| 259 |
+
sentence_json=write_sentence_srt(
|
| 260 |
word_timestamps, output_file=custom_srt_path, max_lines=2,
|
| 261 |
max_duration_s=7.0, max_chars_per_line=38
|
| 262 |
)
|
|
|
|
| 266 |
|
| 267 |
return (
|
| 268 |
clean_srt_path, custom_srt_path, word_srt_path, shorts_srt_path,
|
| 269 |
+
txt_path, transcript_text, sentence_json,shorts_json,detected_language
|
| 270 |
)
|
| 271 |
|
| 272 |
|
|
|
|
| 343 |
|
| 344 |
return cleaned
|
| 345 |
|
| 346 |
+
import json
|
| 347 |
def write_sentence_srt(
|
| 348 |
word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
|
| 349 |
max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
|
| 350 |
merge_pause_threshold=0.4
|
| 351 |
):
|
| 352 |
+
"""Creates professional-grade SRT files and a corresponding timestamp.json file."""
|
| 353 |
if not word_level_timestamps:
|
| 354 |
return
|
| 355 |
|
|
|
|
| 358 |
i = 0
|
| 359 |
while i < len(word_level_timestamps):
|
| 360 |
start_time = word_level_timestamps[i]["start"]
|
| 361 |
+
|
| 362 |
+
# We'll now store the full word objects, not just the text
|
| 363 |
+
current_word_objects = []
|
| 364 |
+
|
| 365 |
j = i
|
| 366 |
while j < len(word_level_timestamps):
|
| 367 |
entry = word_level_timestamps[j]
|
| 368 |
+
|
| 369 |
+
# Create potential text from the word objects
|
| 370 |
+
potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
|
| 371 |
+
potential_text = " ".join(potential_words)
|
| 372 |
|
| 373 |
if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
|
| 374 |
+
if (entry["end"] - start_time) > max_duration_s and current_word_objects: break
|
| 375 |
|
| 376 |
if j > i:
|
| 377 |
prev_entry = word_level_timestamps[j-1]
|
|
|
|
| 379 |
if pause >= hard_pause_threshold: break
|
| 380 |
if prev_entry["word"].endswith(('.','!','?')): break
|
| 381 |
|
| 382 |
+
# Append the full word object
|
| 383 |
+
current_word_objects.append(entry)
|
| 384 |
j += 1
|
| 385 |
|
| 386 |
+
if not current_word_objects:
|
| 387 |
+
current_word_objects.append(word_level_timestamps[i])
|
| 388 |
j = i + 1
|
| 389 |
|
| 390 |
+
text = " ".join([w["word"] for w in current_word_objects])
|
| 391 |
end_time = word_level_timestamps[j - 1]["end"]
|
| 392 |
+
|
| 393 |
+
# Include the list of word objects in our draft subtitle
|
| 394 |
+
draft_subtitles.append({
|
| 395 |
+
"start": start_time,
|
| 396 |
+
"end": end_time,
|
| 397 |
+
"text": text,
|
| 398 |
+
"words": current_word_objects
|
| 399 |
+
})
|
| 400 |
i = j
|
| 401 |
|
| 402 |
# Phase 2: Post-process to merge single-word "orphan" subtitles
|
|
|
|
| 413 |
if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
|
| 414 |
prev_sub["text"] = merged_text
|
| 415 |
prev_sub["end"] = current_sub["end"]
|
| 416 |
+
|
| 417 |
+
# Merge the word-level data as well
|
| 418 |
+
prev_sub["words"].extend(current_sub["words"])
|
| 419 |
continue
|
| 420 |
|
| 421 |
final_subtitles.append(current_sub)
|
| 422 |
|
| 423 |
final_subtitles = merge_punctuation_glitches(final_subtitles)
|
| 424 |
+
print(final_subtitles)
|
| 425 |
+
# ==============================================================================
|
| 426 |
+
# NEW CODE BLOCK: Generate JSON data and write files
|
| 427 |
+
# ==============================================================================
|
| 428 |
+
|
| 429 |
+
# This dictionary will hold the data for our JSON file
|
| 430 |
+
timestamps_data = {}
|
| 431 |
+
|
| 432 |
+
# Phase 3: Write the final SRT file (and prepare JSON data)
|
| 433 |
with open(output_file, "w", encoding="utf-8") as f:
|
| 434 |
for idx, sub in enumerate(final_subtitles, start=1):
|
| 435 |
+
# --- SRT Writing (Unchanged) ---
|
| 436 |
text = sub["text"].replace(" ,", ",").replace(" .", ".")
|
| 437 |
formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
|
| 438 |
+
start_time_str = convert_time_to_srt_format(sub['start'])
|
| 439 |
+
end_time_str = convert_time_to_srt_format(sub['end'])
|
| 440 |
+
|
| 441 |
f.write(f"{idx}\n")
|
| 442 |
+
f.write(f"{start_time_str} --> {end_time_str}\n")
|
| 443 |
f.write("\n".join(formatted_lines) + "\n\n")
|
| 444 |
+
|
| 445 |
+
# --- JSON Data Population (New) ---
|
| 446 |
+
# Create the list of word dictionaries for the current subtitle
|
| 447 |
+
word_data = []
|
| 448 |
+
for word_obj in sub["words"]:
|
| 449 |
+
word_data.append({
|
| 450 |
+
"word": word_obj["word"],
|
| 451 |
+
"start": convert_time_to_srt_format(word_obj["start"]),
|
| 452 |
+
"end": convert_time_to_srt_format(word_obj["end"])
|
| 453 |
+
})
|
| 454 |
+
|
| 455 |
+
# Add the complete entry to our main dictionary
|
| 456 |
+
timestamps_data[str(idx)] = {
|
| 457 |
+
"text": "\n".join(formatted_lines),
|
| 458 |
+
"start": start_time_str,
|
| 459 |
+
"end": end_time_str,
|
| 460 |
+
"words": word_data
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
# Write the collected data to the JSON file
|
| 464 |
+
json_output_file = output_file.replace(".srt",".json")
|
| 465 |
+
with open(json_output_file, "w", encoding="utf-8") as f_json:
|
| 466 |
+
json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)
|
| 467 |
+
|
| 468 |
+
print(f"Successfully generated SRT file: {output_file}")
|
| 469 |
+
print(f"Successfully generated JSON file: {json_output_file}")
|
| 470 |
+
return json_output_file
|
| 471 |
|
| 472 |
def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
|
| 473 |
"""Writes a dictionary of subtitles to a standard SRT file."""
|
|
|
|
| 543 |
Returns:
|
| 544 |
A tuple containing paths to all generated files and the transcript text.
|
| 545 |
"""
|
| 546 |
+
|
| 547 |
try:
|
| 548 |
(
|
| 549 |
default_srt, custom_srt, word_srt, shorts_srt,
|
| 550 |
+
txt_path, transcript, sentence_json,word_json,detected_lang
|
| 551 |
) = whisper_subtitle(media_file, source_lang)
|
| 552 |
except Exception as e:
|
| 553 |
print(f"❌ An error occurred during transcription: {e}")
|
| 554 |
+
return (None, None, None, None, None, None,None,None, f"Error: {e}")
|
| 555 |
|
| 556 |
translated_srt_path = None
|
| 557 |
if detected_lang and detected_lang != target_lang:
|
|
|
|
| 566 |
|
| 567 |
return (
|
| 568 |
default_srt, translated_srt_path, custom_srt, word_srt,
|
| 569 |
+
shorts_srt, txt_path,sentence_json,word_json, transcript
|
| 570 |
)
|
| 571 |
|
| 572 |
|
|
|
|
| 583 |
# source_lang = "English"
|
| 584 |
# target_lang = "English"
|
| 585 |
|
| 586 |
+
# default_srt, translated_srt_path, custom_srt, word_srt, shorts_srt, txt_path,sentence_json,word_json, transcript= subtitle_maker(
|
| 587 |
# media_file, source_lang, target_lang
|
| 588 |
# )
|
| 589 |
# If source_lang and target_lang are the same, translation will be skipped.
|
|
|
|
| 596 |
# word_srt -> Word-level timestamps (useful for creating YouTube Shorts/Reels)
|
| 597 |
# shorts_srt -> Optimized subtitles for vertical videos (displays 3–4 words at a time, with a maximum of 17 characters per segment)
|
| 598 |
# txt_path -> Full transcript as plain text (useful for video summarization or for asking questions about the video or audio data with other LLM tools)
|
| 599 |
+
# sentence_json, word_json -> Paths to the JSON timestamp files written alongside custom_srt and shorts_srt (useful for generating an .ass file later)
|
| 600 |
# transcript -> Transcript text directly returned by the function, if you just need the transcript
|
| 601 |
|
| 602 |
# All functionality is contained in a single file, making it portable
|