Update app.py

app.py CHANGED
@@ -518,14 +518,34 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
 
     return original_segments
 
+WHISPERX_TO_PADDLEOCR_LANG = {
+    "zh": "ch",      # Chinese
+    "en": "en",      # English
+    "fr": "fr",      # French
+    "de": "german",  # German
+    "ja": "japan",   # Japanese
+    "ko": "korean",  # Korean
+    "ru": "russian", # Russian
+    "it": "italian", # Italian
+    "es": "spanish", # Spanish
+    # Add more mappings as needed
+}
+
 ocr_model = None
 ocr_lock = threading.Lock()
 
-def init_ocr_model():
+def init_ocr_model(source_lang):
+    """
+    Initializes the PaddleOCR model using the mapped language.
+    """
     global ocr_model
     with ocr_lock:
-        if ocr_model is None:
-            ocr_model = PaddleOCR(...)
+        if ocr_model is not None:
+            return  # already initialized
+
+        paddle_lang = WHISPERX_TO_PADDLEOCR_LANG.get(source_lang, "en")
+        logger.info(f"🔤 Initializing OCR model for source language: {source_lang} → PaddleOCR lang: {paddle_lang}")
+        ocr_model = PaddleOCR(use_angle_cls=True, lang=paddle_lang)
 
 def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
     """
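A note on the new initialization path: `ocr_model` is a module-level singleton guarded by `ocr_lock`, so the OCR language is fixed by whichever call reaches `init_ocr_model` first; later calls return the cached model even if they pass a different `source_lang`. A minimal, self-contained sketch of that pattern (`FakeOCR` is a hypothetical stand-in for `PaddleOCR`, so the sketch runs without the paddleocr dependency):

import threading

WHISPERX_TO_PADDLEOCR_LANG = {"zh": "ch", "en": "en", "de": "german"}

ocr_model = None
ocr_lock = threading.Lock()

class FakeOCR:
    def __init__(self, lang):
        self.lang = lang

def init_ocr_model(source_lang):
    global ocr_model
    with ocr_lock:
        if ocr_model is not None:
            return  # cached: a different source_lang here is ignored
        ocr_model = FakeOCR(WHISPERX_TO_PADDLEOCR_LANG.get(source_lang, "en"))

init_ocr_model("de")
init_ocr_model("zh")   # no-op: the model was already built for German
print(ocr_model.lang)  # -> german

Within a single run this is safe because every worker receives the same detected source language; a process that handled videos in several languages would need to reset the singleton or key it by language.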
@@ -579,10 +599,10 @@ def find_best_subtitle_region(frame, ocr_model, region_height_ratio=0.35, num_strips=5, min_conf=0.5):
     fallback_y = height - int(height * 0.2)
     return frame[fallback_y:, :], (fallback_y, height)
 
-def ocr_frame_worker(args, min_confidence=0.7):
+def ocr_frame_worker(args, source_language, min_confidence=0.7):
     frame_idx, frame_time, frame = args
 
-    init_ocr_model()  # Load model in thread-safe way
+    init_ocr_model(source_language)  # Load model in thread-safe way
 
     if frame is None or frame.size == 0 or not isinstance(frame, np.ndarray):
         return {"time": frame_time, "text": ""}
@@ -607,7 +627,7 @@ def frame_is_in_audio_segments(frame_time, audio_segments, tolerance=0.2):
         return True
     return False
 
-def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=0.2, num_workers=4):
+def extract_ocr_subtitles_parallel(video_path, transcription_json, source_language, interval_sec=0.2, num_workers=4):
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
@@ -626,7 +646,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=0.2, num_workers=4):
     ocr_results = []
     ocr_failures = 0  # Count OCR failures
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-        futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
+        futures = [executor.submit(ocr_frame_worker, frame, source_language) for frame in frames]
 
         for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
             try:
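The worker fan-out above relies on `concurrent.futures`: positional arguments after the callable in `executor.submit(fn, *args)` are forwarded to the callable, which is how each worker now receives the detected language. A small runnable sketch with a hypothetical `worker` standing in for `ocr_frame_worker`:

import concurrent.futures

def worker(args, source_language, min_confidence=0.7):
    # Same shape as ocr_frame_worker: an (idx, time, frame) tuple plus the language.
    frame_idx, frame_time, frame = args
    return {"time": frame_time, "lang": source_language, "text": ""}

frames = [(0, 0.0, "frame-a"), (1, 0.5, "frame-b")]
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(worker, frame, "zh") for frame in frames]
    for f in concurrent.futures.as_completed(futures):
        print(f.result())  # e.g. {'time': 0.0, 'lang': 'zh', 'text': ''}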
@@ -653,6 +673,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
             sim = fuzz.ratio(current["text"], text)
             if sim >= text_similarity_threshold:
                 current["end"] = time
+                current["text"] = text
                 logger.debug(f"MERGED: Current end extended to {time:.2f}s for text: '{current['text'][:50]}...' (Similarity: {sim})")
             else:
                 logger.debug(f"NOT MERGING (Similarity: {sim} < Threshold: {text_similarity_threshold}):")
@@ -660,8 +681,7 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
                 logger.debug(f"  New segment: {time:.2f}s: '{text[:50]}...'")
                 collapsed.append(current)
                 current = {"start": time, "end": time, "text": text}
-
-    collapsed.append(current)
+    collapsed.append(current)
 
     logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
     for idx, seg in enumerate(collapsed):
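The two collapse hunks make a subtle behavioral change: a merged segment now keeps the latest OCR reading (`current["text"] = text`) instead of the first one, so a clean later frame can replace a noisy earlier read of the same subtitle. A self-contained sketch of the collapse loop under that rule (`rapidfuzz` is assumed here for `fuzz.ratio`; the diff only shows the call):

from rapidfuzz import fuzz

def collapse(ocr_json, text_similarity_threshold=90):
    collapsed, current = [], None
    for entry in ocr_json:
        time, text = entry["time"], entry["text"]
        if current is None:
            current = {"start": time, "end": time, "text": text}
        elif fuzz.ratio(current["text"], text) >= text_similarity_threshold:
            current["end"] = time
            current["text"] = text  # keep the most recent reading
        else:
            collapsed.append(current)
            current = {"start": time, "end": time, "text": text}
    if current is not None:
        collapsed.append(current)
    return collapsed

print(collapse([
    {"time": 0.0, "text": "HELL0 WORLD"},  # noisy first read (0 vs O)
    {"time": 0.5, "text": "HELLO WORLD"},  # merges; text upgraded
    {"time": 1.0, "text": "NEXT LINE"},    # new segment
]))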
@@ -800,7 +820,8 @@ def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
         curr["end"] = round(curr["start"] + 0.3, 3)
 
     return merged_ocr_json
-def post_edit_transcribed_segments(transcription_json, video_path,
+
+def post_edit_transcribed_segments(transcription_json, video_path, source_language,
                                    interval_sec=0.5,
                                    text_similarity_threshold=80,
                                    time_tolerance=1.0,
@@ -812,8 +833,9 @@ def post_edit_transcribed_segments(transcription_json, video_path,
 
     # Step 1: Extract OCR subtitles (only near audio segments)
     ocr_json = extract_ocr_subtitles_parallel(
-        video_path,
+        video_path,
         transcription_json,
+        source_language,
         interval_sec=interval_sec,
         num_workers=num_workers
     )
@@ -1132,7 +1154,7 @@ def upload_and_manage(file, target_language, process_mode):
         transcription_json, source_language = transcribe_video_with_speakers(file.name)
         logger.info(f"Transcription completed. Detected source language: {source_language}")
 
-        transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)
+        transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)
         # Step 2: Translate the transcription
         logger.info(f"Translating transcription from {source_language} to {target_language}...")
         translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)