Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -601,6 +601,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 601 |
interval_sec=interval_sec,
|
| 602 |
num_workers=num_workers
|
| 603 |
)
|
|
|
|
| 604 |
# Step 2: Collapse repetitive OCR
|
| 605 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
| 606 |
|
|
@@ -617,18 +618,15 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 617 |
best_score = -1
|
| 618 |
|
| 619 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
| 620 |
-
# Check time overlap
|
| 621 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
| 622 |
if not time_overlap:
|
| 623 |
continue
|
| 624 |
|
| 625 |
-
# Text similarity
|
| 626 |
sim = fuzz.ratio(ocr["text"], base_text)
|
| 627 |
if sim > best_score:
|
| 628 |
best_score = sim
|
| 629 |
best_match_idx = ocr_idx
|
| 630 |
|
| 631 |
-
# Update WhisperX segment if matched
|
| 632 |
updated_entry = entry.copy()
|
| 633 |
if best_match_idx is not None and best_score >= text_similarity_threshold:
|
| 634 |
updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
|
|
@@ -645,11 +643,23 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 645 |
inserted_segments = []
|
| 646 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
| 647 |
if ocr_idx not in used_ocr_indices:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
inserted_segment = {
|
| 649 |
"start": ocr["start"],
|
| 650 |
"end": ocr["end"],
|
| 651 |
"text": ocr["text"],
|
| 652 |
-
"
|
| 653 |
}
|
| 654 |
inserted_segments.append(inserted_segment)
|
| 655 |
|
|
@@ -658,10 +668,11 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
| 658 |
final_segments = sorted(final_segments, key=lambda x: x["start"])
|
| 659 |
|
| 660 |
print(f"✅ Post-editing completed: {len(final_segments)} total segments "
|
| 661 |
-
f"({len(inserted_segments)} OCR-
|
| 662 |
|
| 663 |
return final_segments
|
| 664 |
|
|
|
|
| 665 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 666 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 667 |
error_message = None
|
|
|
|
| 601 |
interval_sec=interval_sec,
|
| 602 |
num_workers=num_workers
|
| 603 |
)
|
| 604 |
+
|
| 605 |
# Step 2: Collapse repetitive OCR
|
| 606 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
| 607 |
|
|
|
|
| 618 |
best_score = -1
|
| 619 |
|
| 620 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
|
|
|
| 621 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
| 622 |
if not time_overlap:
|
| 623 |
continue
|
| 624 |
|
|
|
|
| 625 |
sim = fuzz.ratio(ocr["text"], base_text)
|
| 626 |
if sim > best_score:
|
| 627 |
best_score = sim
|
| 628 |
best_match_idx = ocr_idx
|
| 629 |
|
|
|
|
| 630 |
updated_entry = entry.copy()
|
| 631 |
if best_match_idx is not None and best_score >= text_similarity_threshold:
|
| 632 |
updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
|
|
|
|
| 643 |
inserted_segments = []
|
| 644 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
| 645 |
if ocr_idx not in used_ocr_indices:
|
| 646 |
+
# Try to assign the speaker based on nearby merged segments
|
| 647 |
+
nearby_speakers = []
|
| 648 |
+
for seg in merged_segments:
|
| 649 |
+
if abs(seg["start"] - ocr["start"]) <= 2.0 or abs(seg["end"] - ocr["end"]) <= 2.0:
|
| 650 |
+
if "speaker" in seg:
|
| 651 |
+
nearby_speakers.append(seg["speaker"])
|
| 652 |
+
|
| 653 |
+
if nearby_speakers:
|
| 654 |
+
assigned_speaker = nearby_speakers[0] # Take the first nearby speaker
|
| 655 |
+
else:
|
| 656 |
+
assigned_speaker = "SPEAKER_00"
|
| 657 |
+
|
| 658 |
inserted_segment = {
|
| 659 |
"start": ocr["start"],
|
| 660 |
"end": ocr["end"],
|
| 661 |
"text": ocr["text"],
|
| 662 |
+
"speaker": assigned_speaker
|
| 663 |
}
|
| 664 |
inserted_segments.append(inserted_segment)
|
| 665 |
|
|
|
|
| 668 |
final_segments = sorted(final_segments, key=lambda x: x["start"])
|
| 669 |
|
| 670 |
print(f"✅ Post-editing completed: {len(final_segments)} total segments "
|
| 671 |
+
f"({len(inserted_segments)} OCR-inserted segments)")
|
| 672 |
|
| 673 |
return final_segments
|
| 674 |
|
| 675 |
+
|
| 676 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 677 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 678 |
error_message = None
|