Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -668,13 +668,19 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
|
|
| 668 |
logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
|
| 669 |
return collapsed
|
| 670 |
|
| 671 |
-
def merge_speaker_and_time_from_whisperx(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
merged = []
|
| 673 |
used_whisperx = set()
|
|
|
|
| 674 |
|
|
|
|
| 675 |
for ocr in ocr_json:
|
| 676 |
-
ocr_start = ocr["start"]
|
| 677 |
-
ocr_end = ocr["end"]
|
| 678 |
ocr_text = ocr["text"]
|
| 679 |
|
| 680 |
best_match = None
|
|
@@ -685,11 +691,9 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
|
|
| 685 |
wx_start, wx_end = wx["start"], wx["end"]
|
| 686 |
wx_text = wx["text"]
|
| 687 |
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
|
| 692 |
-
if time_center_diff > 3:
|
| 693 |
continue
|
| 694 |
|
| 695 |
sim = fuzz.ratio(ocr_text, wx_text)
|
|
@@ -698,23 +702,83 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
|
|
| 698 |
best_match = wx
|
| 699 |
best_idx = idx
|
| 700 |
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
new_entry["ocr_similarity"] = None
|
| 714 |
|
| 715 |
-
merged.append(new_entry)
|
| 716 |
return merged
|
| 717 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
|
| 719 |
"""
|
| 720 |
Realign OCR segments to avoid overlaps using midpoint-based adjustment.
|
|
|
|
| 668 |
logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
|
| 669 |
return collapsed
|
| 670 |
|
| 671 |
def merge_speaker_and_time_from_whisperx(
    ocr_json,
    whisperx_json,
    replace_threshold=90,
    time_tolerance=1.0
):
    """Merge OCR subtitle segments with WhisperX ASR segments.

    For each OCR segment, look for a time-overlapping WhisperX segment with
    sufficiently similar text; when found, the WhisperX segment's text is
    replaced by the (presumably cleaner) OCR text while keeping the WhisperX
    timing/speaker fields.  OCR segments with no temporal counterpart are kept
    as new "UNKNOWN"-speaker entries; untouched WhisperX segments pass through
    unchanged.  The result is sorted by start time.

    Args:
        ocr_json: list of dicts with at least "start", "end", "text".
        whisperx_json: list of dicts with at least "start", "end", "text"
            (and typically "speaker").
        replace_threshold: minimum fuzz.ratio score (0-100) required to
            replace a WhisperX segment's text with the OCR text.
        time_tolerance: seconds of slack used both for the overlap test and
            for the time-coverage test.

    Returns:
        A new list of segment dicts; inputs are not mutated (deepcopy).
    """
    merged = []
    # One flag per WhisperX segment: True once its text has been replaced.
    whisperx_used_flags = [False] * len(whisperx_json)

    # Step 1: attempt to match each OCR entry to a WhisperX entry.
    for ocr in ocr_json:
        ocr_start, ocr_end = ocr["start"], ocr["end"]
        ocr_text = ocr["text"]

        best_match = None
        best_score = -1
        best_idx = None

        for idx, wx in enumerate(whisperx_json):
            # Skip segments already claimed by an earlier OCR entry so the
            # same WhisperX segment is never duplicated in the output.
            if whisperx_used_flags[idx]:
                continue

            wx_start, wx_end = wx["start"], wx["end"]
            wx_text = wx["text"]

            # Time-overlap gate, padded by time_tolerance on both sides.
            overlap = not (ocr_end < wx_start - time_tolerance
                           or ocr_start > wx_end + time_tolerance)
            if not overlap:
                continue

            sim = fuzz.ratio(ocr_text, wx_text)
            if sim > best_score:
                best_score = sim
                best_match = wx
                best_idx = idx

        if best_match is not None and best_score >= replace_threshold:
            # Replace WhisperX segment text with the higher-quality OCR text,
            # keeping the WhisperX timing/speaker metadata.
            new_segment = copy.deepcopy(best_match)
            new_segment["text"] = ocr_text
            new_segment["ocr_replaced"] = True
            new_segment["ocr_similarity"] = best_score
            whisperx_used_flags[best_idx] = True
            merged.append(new_segment)
        else:
            # No replacement: keep the OCR entry only if its time span is not
            # already covered by some WhisperX segment (center-to-center test).
            covered = any(
                abs((ocr_start + ocr_end) / 2 - (wx["start"] + wx["end"]) / 2) < time_tolerance
                for wx in whisperx_json
            )
            if not covered:
                new_segment = copy.deepcopy(ocr)
                new_segment["ocr_added"] = True
                new_segment["speaker"] = "UNKNOWN"
                merged.append(new_segment)

    # Step 2: add WhisperX segments that were never replaced.
    for idx, wx in enumerate(whisperx_json):
        if not whisperx_used_flags[idx]:
            merged.append(wx)

    # Step 3: chronological order for downstream consumers.
    merged.sort(key=lambda x: x["start"])

    return merged
|
| 734 |
|
| 735 |
+
# def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
|
| 736 |
+
# merged = []
|
| 737 |
+
# used_whisperx = set()
|
| 738 |
+
|
| 739 |
+
# for ocr in ocr_json:
|
| 740 |
+
# ocr_start = ocr["start"]
|
| 741 |
+
# ocr_end = ocr["end"]
|
| 742 |
+
# ocr_text = ocr["text"]
|
| 743 |
+
|
| 744 |
+
# best_match = None
|
| 745 |
+
# best_score = -1
|
| 746 |
+
# best_idx = None
|
| 747 |
+
|
| 748 |
+
# for idx, wx in enumerate(whisperx_json):
|
| 749 |
+
# wx_start, wx_end = wx["start"], wx["end"]
|
| 750 |
+
# wx_text = wx["text"]
|
| 751 |
+
|
| 752 |
+
# if idx in used_whisperx:
|
| 753 |
+
# continue # Already matched
|
| 754 |
+
|
| 755 |
+
# time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
|
| 756 |
+
# if time_center_diff > 3:
|
| 757 |
+
# continue
|
| 758 |
+
|
| 759 |
+
# sim = fuzz.ratio(ocr_text, wx_text)
|
| 760 |
+
# if sim > best_score:
|
| 761 |
+
# best_score = sim
|
| 762 |
+
# best_match = wx
|
| 763 |
+
# best_idx = idx
|
| 764 |
+
|
| 765 |
+
# new_entry = copy.deepcopy(ocr)
|
| 766 |
+
# if best_match:
|
| 767 |
+
# new_entry["speaker"] = best_match.get("speaker", "UNKNOWN")
|
| 768 |
+
# new_entry["ocr_similarity"] = best_score
|
| 769 |
+
|
| 770 |
+
# if best_score >= replace_threshold:
|
| 771 |
+
# new_entry["start"] = best_match["start"]
|
| 772 |
+
# new_entry["end"] = best_match["end"]
|
| 773 |
+
# used_whisperx.add(best_idx) # Mark used
|
| 774 |
+
|
| 775 |
+
# else:
|
| 776 |
+
# new_entry["speaker"] = "UNKNOWN"
|
| 777 |
+
# new_entry["ocr_similarity"] = None
|
| 778 |
+
|
| 779 |
+
# merged.append(new_entry)
|
| 780 |
+
# return merged
|
| 781 |
+
|
| 782 |
def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
|
| 783 |
"""
|
| 784 |
Realign OCR segments to avoid overlaps using midpoint-based adjustment.
|