Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -547,7 +547,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 547 |
num_workers=4):
|
| 548 |
"""
|
| 549 |
Given WhisperX transcription (transcription_json) and video,
|
| 550 |
-
use OCR subtitles to post-correct and
|
| 551 |
"""
|
| 552 |
|
| 553 |
# Step 1: Extract OCR subtitles
|
|
@@ -556,18 +556,19 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 556 |
# Step 2: Collapse repetitive OCR
|
| 557 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
| 558 |
|
| 559 |
-
# Step 3:
|
| 560 |
merged_segments = []
|
|
|
|
| 561 |
|
| 562 |
-
for entry in transcription_json:
|
| 563 |
start = entry.get("start", 0)
|
| 564 |
end = entry.get("end", 0)
|
| 565 |
base_text = entry.get("text", "")
|
| 566 |
|
| 567 |
-
|
| 568 |
best_score = -1
|
| 569 |
|
| 570 |
-
for ocr in collapsed_ocr:
|
| 571 |
# Check time overlap
|
| 572 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
| 573 |
if not time_overlap:
|
|
@@ -577,22 +578,41 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 577 |
sim = fuzz.ratio(ocr["text"], base_text)
|
| 578 |
if sim > best_score:
|
| 579 |
best_score = sim
|
| 580 |
-
|
| 581 |
|
| 582 |
-
#
|
| 583 |
updated_entry = entry.copy()
|
| 584 |
-
if
|
| 585 |
-
updated_entry["text"] =
|
| 586 |
updated_entry["ocr_matched"] = True
|
| 587 |
updated_entry["ocr_similarity"] = best_score
|
|
|
|
| 588 |
else:
|
| 589 |
updated_entry["ocr_matched"] = False
|
| 590 |
updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
|
| 591 |
|
| 592 |
merged_segments.append(updated_entry)
|
| 593 |
|
| 594 |
-
|
| 595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 598 |
logger.debug(f"Processing entry {i}: {entry}")
|
|
|
|
| 547 |
num_workers=4):
|
| 548 |
"""
|
| 549 |
Given WhisperX transcription (transcription_json) and video,
|
| 550 |
+
use OCR subtitles to post-correct and safely insert missing captions.
|
| 551 |
"""
|
| 552 |
|
| 553 |
# Step 1: Extract OCR subtitles
|
|
|
|
| 556 |
# Step 2: Collapse repetitive OCR
|
| 557 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
| 558 |
|
| 559 |
+
# Step 3: Refine existing WhisperX segments (Phase 1)
|
| 560 |
merged_segments = []
|
| 561 |
+
used_ocr_indices = set()
|
| 562 |
|
| 563 |
+
for entry_idx, entry in enumerate(transcription_json):
|
| 564 |
start = entry.get("start", 0)
|
| 565 |
end = entry.get("end", 0)
|
| 566 |
base_text = entry.get("text", "")
|
| 567 |
|
| 568 |
+
best_match_idx = None
|
| 569 |
best_score = -1
|
| 570 |
|
| 571 |
+
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
| 572 |
# Check time overlap
|
| 573 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
| 574 |
if not time_overlap:
|
|
|
|
| 578 |
sim = fuzz.ratio(ocr["text"], base_text)
|
| 579 |
if sim > best_score:
|
| 580 |
best_score = sim
|
| 581 |
+
best_match_idx = ocr_idx
|
| 582 |
|
| 583 |
+
# Update WhisperX segment if matched
|
| 584 |
updated_entry = entry.copy()
|
| 585 |
+
if best_match_idx is not None and best_score >= text_similarity_threshold:
|
| 586 |
+
updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
|
| 587 |
updated_entry["ocr_matched"] = True
|
| 588 |
updated_entry["ocr_similarity"] = best_score
|
| 589 |
+
used_ocr_indices.add(best_match_idx)
|
| 590 |
else:
|
| 591 |
updated_entry["ocr_matched"] = False
|
| 592 |
updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
|
| 593 |
|
| 594 |
merged_segments.append(updated_entry)
|
| 595 |
|
| 596 |
+
# Step 4: Insert unused OCR segments (Phase 2)
|
| 597 |
+
inserted_segments = []
|
| 598 |
+
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
| 599 |
+
if ocr_idx not in used_ocr_indices:
|
| 600 |
+
inserted_segment = {
|
| 601 |
+
"start": ocr["start"],
|
| 602 |
+
"end": ocr["end"],
|
| 603 |
+
"text": ocr["text"],
|
| 604 |
+
"ocr_only": True
|
| 605 |
+
}
|
| 606 |
+
inserted_segments.append(inserted_segment)
|
| 607 |
+
|
| 608 |
+
# Step 5: Combine and sort
|
| 609 |
+
final_segments = merged_segments + inserted_segments
|
| 610 |
+
final_segments = sorted(final_segments, key=lambda x: x["start"])
|
| 611 |
+
|
| 612 |
+
print(f"✅ Post-editing completed: {len(final_segments)} total segments "
|
| 613 |
+
f"({len(inserted_segments)} OCR-only inserted)")
|
| 614 |
+
|
| 615 |
+
return final_segments
|
| 616 |
|
| 617 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 618 |
logger.debug(f"Processing entry {i}: {entry}")
|