Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +83 -11
working_yolo_pipeline.py
CHANGED
|
@@ -667,22 +667,87 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
|
|
| 667 |
|
| 668 |
|
| 669 |
|
| 670 |
-
def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
|
| 671 |
-
|
| 672 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
# ==============================================================================
|
| 674 |
-
# --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
|
| 675 |
# ==============================================================================
|
| 676 |
print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
|
|
|
|
| 677 |
debug_count = 0
|
| 678 |
for item in raw_word_data:
|
| 679 |
if debug_count >= 50: break
|
| 680 |
-
|
| 681 |
word_text = item[4]
|
| 682 |
|
| 683 |
-
#
|
| 684 |
-
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
debug_count += 1
|
| 687 |
print("----------------------------------------------------------------------\n")
|
| 688 |
# ==============================================================================
|
|
@@ -691,21 +756,28 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
|
|
| 691 |
DEFAULT_CONFIDENCE = 99.0
|
| 692 |
|
| 693 |
for x1, y1, x2, y2, word, *rest in raw_word_data:
|
| 694 |
-
# --- FIX:
|
| 695 |
-
|
| 696 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
|
| 698 |
x1_pix = int(x1 * scale_factor)
|
| 699 |
y1_pix = int(y1 * scale_factor)
|
| 700 |
x2_pix = int(x2 * scale_factor)
|
| 701 |
y2_pix = int(y2 * scale_factor)
|
|
|
|
| 702 |
converted_ocr_output.append({
|
| 703 |
'type': 'text',
|
| 704 |
-
'word': cleaned_word,
|
| 705 |
'confidence': DEFAULT_CONFIDENCE,
|
| 706 |
'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
|
| 707 |
'y0': y1_pix, 'x0': x1_pix
|
| 708 |
})
|
|
|
|
| 709 |
return converted_ocr_output
|
| 710 |
|
| 711 |
|
|
|
|
| 667 |
|
| 668 |
|
| 669 |
|
| 670 |
+
# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
|
| 671 |
+
# raw_word_data = fitz_page.get_text("words")
|
| 672 |
|
| 673 |
+
# # ==============================================================================
|
| 674 |
+
# # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
|
| 675 |
+
# # ==============================================================================
|
| 676 |
+
# print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
|
| 677 |
+
# debug_count = 0
|
| 678 |
+
# for item in raw_word_data:
|
| 679 |
+
# if debug_count >= 50: break
|
| 680 |
+
# # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
| 681 |
+
# word_text = item[4]
|
| 682 |
+
|
| 683 |
+
# # Generate unicode hex codes for every character in the word
|
| 684 |
+
# unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
|
| 685 |
+
# print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
|
| 686 |
+
# debug_count += 1
|
| 687 |
+
# print("----------------------------------------------------------------------\n")
|
| 688 |
+
# # ==============================================================================
|
| 689 |
+
|
| 690 |
+
# converted_ocr_output = []
|
| 691 |
+
# DEFAULT_CONFIDENCE = 99.0
|
| 692 |
+
|
| 693 |
+
# for x1, y1, x2, y2, word, *rest in raw_word_data:
|
| 694 |
+
# # --- FIX: SANITIZE TEXT HERE ---
|
| 695 |
+
# cleaned_word = sanitize_text(word)
|
| 696 |
+
# if not cleaned_word.strip(): continue
|
| 697 |
+
|
| 698 |
+
# x1_pix = int(x1 * scale_factor)
|
| 699 |
+
# y1_pix = int(y1 * scale_factor)
|
| 700 |
+
# x2_pix = int(x2 * scale_factor)
|
| 701 |
+
# y2_pix = int(y2 * scale_factor)
|
| 702 |
+
# converted_ocr_output.append({
|
| 703 |
+
# 'type': 'text',
|
| 704 |
+
# 'word': cleaned_word, # Use the sanitized word
|
| 705 |
+
# 'confidence': DEFAULT_CONFIDENCE,
|
| 706 |
+
# 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
|
| 707 |
+
# 'y0': y1_pix, 'x0': x1_pix
|
| 708 |
+
# })
|
| 709 |
+
# return converted_ocr_output
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
    """Extract native (embedded) text words from a PDF page via PyMuPDF and
    convert them into the OCR-style dict format used downstream.

    Args:
        fitz_page: PyMuPDF (fitz) Page object; only ``get_text("words")`` and
            ``number`` are used, so any object with that interface works.
        scale_factor: Multiplier mapping PDF point coordinates to pixel
            coordinates of the rasterized page (default 2.0).

    Returns:
        A list of dicts, one per non-empty word, each shaped like an OCR hit:
        ``{'type', 'word', 'confidence', 'bbox', 'y0', 'x0'}``.
        Returns ``[]`` if PyMuPDF extraction fails outright.
    """
    # 1. Get raw data.  Each item is a tuple:
    #    (x0, y0, x1, y1, word, block_no, line_no, word_no)
    try:
        raw_word_data = fitz_page.get_text("words")
    except Exception as e:
        print(f" ❌ PyMuPDF extraction failed completely: {e}")
        return []

    # ==============================================================================
    # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS (SAFE PRINT) ---
    # ==============================================================================
    print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")

    for debug_count, item in enumerate(raw_word_data):
        if debug_count >= 50:
            break
        word_text = item[4]

        # --- SAFE PRINTING LOGIC ---
        # Encode/decode with errors='ignore' strips lone surrogates so the
        # print statement itself cannot raise UnicodeEncodeError and crash
        # the script (the original failure mode this block works around).
        safe_text = word_text.encode('utf-8', 'ignore').decode('utf-8')

        # Hex code points for diagnosing odd glyphs.
        # FIX: was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        try:
            unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
        except Exception:
            unicode_points = ["ERROR"]

        print(f" Word {debug_count}: '{safe_text}' -> Codes: {unicode_points}")
    print("----------------------------------------------------------------------\n")
    # ==============================================================================

    converted_ocr_output = []
    DEFAULT_CONFIDENCE = 99.0  # native text is exact, so report maximum confidence

    for x1, y1, x2, y2, word, *rest in raw_word_data:
        # --- FIX: ROBUST SANITIZATION ---
        # Encoding to UTF-8 with errors='ignore' drops un-encodable code
        # points (e.g. lone surrogates) before they reach downstream
        # consumers; then strip whitespace and skip words that vanish.
        cleaned_word = word.encode('utf-8', 'ignore').decode('utf-8').strip()
        if not cleaned_word:
            continue

        # Scale PDF point coordinates up to raster pixel coordinates.
        x1_pix = int(x1 * scale_factor)
        y1_pix = int(y1 * scale_factor)
        x2_pix = int(x2 * scale_factor)
        y2_pix = int(y2 * scale_factor)

        converted_ocr_output.append({
            'type': 'text',
            'word': cleaned_word,
            'confidence': DEFAULT_CONFIDENCE,
            'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
            'y0': y1_pix, 'x0': x1_pix
        })

    return converted_ocr_output