Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +83 -11
working_yolo_pipeline.py
CHANGED
|
@@ -667,22 +667,87 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
|
|
| 667 |
|
| 668 |
|
| 669 |
|
| 670 |
-
def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
|
| 671 |
-
|
| 672 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
# ==============================================================================
|
| 674 |
-
# --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
|
| 675 |
# ==============================================================================
|
| 676 |
print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
|
|
|
|
| 677 |
debug_count = 0
|
| 678 |
for item in raw_word_data:
|
| 679 |
if debug_count >= 50: break
|
| 680 |
-
|
| 681 |
word_text = item[4]
|
| 682 |
|
| 683 |
-
#
|
| 684 |
-
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
debug_count += 1
|
| 687 |
print("----------------------------------------------------------------------\n")
|
| 688 |
# ==============================================================================
|
|
@@ -691,21 +756,28 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
|
|
| 691 |
DEFAULT_CONFIDENCE = 99.0
|
| 692 |
|
| 693 |
for x1, y1, x2, y2, word, *rest in raw_word_data:
|
| 694 |
-
# --- FIX:
|
| 695 |
-
|
| 696 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
|
| 698 |
x1_pix = int(x1 * scale_factor)
|
| 699 |
y1_pix = int(y1 * scale_factor)
|
| 700 |
x2_pix = int(x2 * scale_factor)
|
| 701 |
y2_pix = int(y2 * scale_factor)
|
|
|
|
| 702 |
converted_ocr_output.append({
|
| 703 |
'type': 'text',
|
| 704 |
-
'word': cleaned_word,
|
| 705 |
'confidence': DEFAULT_CONFIDENCE,
|
| 706 |
'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
|
| 707 |
'y0': y1_pix, 'x0': x1_pix
|
| 708 |
})
|
|
|
|
| 709 |
return converted_ocr_output
|
| 710 |
|
| 711 |
|
|
|
|
| 667 |
|
| 668 |
|
| 669 |
|
| 670 |
+
# def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
|
| 671 |
+
# raw_word_data = fitz_page.get_text("words")
|
| 672 |
|
| 673 |
+
# # ==============================================================================
|
| 674 |
+
# # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
|
| 675 |
+
# # ==============================================================================
|
| 676 |
+
# print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
|
| 677 |
+
# debug_count = 0
|
| 678 |
+
# for item in raw_word_data:
|
| 679 |
+
# if debug_count >= 50: break
|
| 680 |
+
# # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
| 681 |
+
# word_text = item[4]
|
| 682 |
+
|
| 683 |
+
# # Generate unicode hex codes for every character in the word
|
| 684 |
+
# unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
|
| 685 |
+
# print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
|
| 686 |
+
# debug_count += 1
|
| 687 |
+
# print("----------------------------------------------------------------------\n")
|
| 688 |
+
# # ==============================================================================
|
| 689 |
+
|
| 690 |
+
# converted_ocr_output = []
|
| 691 |
+
# DEFAULT_CONFIDENCE = 99.0
|
| 692 |
+
|
| 693 |
+
# for x1, y1, x2, y2, word, *rest in raw_word_data:
|
| 694 |
+
# # --- FIX: SANITIZE TEXT HERE ---
|
| 695 |
+
# cleaned_word = sanitize_text(word)
|
| 696 |
+
# if not cleaned_word.strip(): continue
|
| 697 |
+
|
| 698 |
+
# x1_pix = int(x1 * scale_factor)
|
| 699 |
+
# y1_pix = int(y1 * scale_factor)
|
| 700 |
+
# x2_pix = int(x2 * scale_factor)
|
| 701 |
+
# y2_pix = int(y2 * scale_factor)
|
| 702 |
+
# converted_ocr_output.append({
|
| 703 |
+
# 'type': 'text',
|
| 704 |
+
# 'word': cleaned_word, # Use the sanitized word
|
| 705 |
+
# 'confidence': DEFAULT_CONFIDENCE,
|
| 706 |
+
# 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
|
| 707 |
+
# 'y0': y1_pix, 'x0': x1_pix
|
| 708 |
+
# })
|
| 709 |
+
# return converted_ocr_output
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
    """Extract native (embedded) text words from a PDF page via PyMuPDF and
    convert them into the OCR-style dict format used downstream.

    Args:
        fitz_page: PyMuPDF (fitz) Page object; only ``get_text("words")`` and
            ``number`` are used, so any object with that interface works.
        scale_factor: Multiplier mapping PDF point coordinates to pixel
            coordinates of the rasterized page (default 2.0).

    Returns:
        A list of dicts, one per non-empty word, each shaped like an OCR hit:
        ``{'type', 'word', 'confidence', 'bbox', 'y0', 'x0'}``.
        Returns ``[]`` if PyMuPDF extraction fails outright.
    """
    # 1. Get raw data.  Each item is a tuple:
    #    (x0, y0, x1, y1, word, block_no, line_no, word_no)
    try:
        raw_word_data = fitz_page.get_text("words")
    except Exception as e:
        print(f" ❌ PyMuPDF extraction failed completely: {e}")
        return []

    # ==============================================================================
    # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS (SAFE PRINT) ---
    # ==============================================================================
    print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")

    for debug_count, item in enumerate(raw_word_data):
        if debug_count >= 50:
            break
        word_text = item[4]

        # --- SAFE PRINTING LOGIC ---
        # Encode/decode with errors='ignore' strips lone surrogates so the
        # print statement itself cannot raise UnicodeEncodeError and crash
        # the script (the original failure mode this block works around).
        safe_text = word_text.encode('utf-8', 'ignore').decode('utf-8')

        # Hex code points for diagnosing odd glyphs.
        # FIX: was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        try:
            unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
        except Exception:
            unicode_points = ["ERROR"]

        print(f" Word {debug_count}: '{safe_text}' -> Codes: {unicode_points}")
    print("----------------------------------------------------------------------\n")
    # ==============================================================================

    converted_ocr_output = []
    DEFAULT_CONFIDENCE = 99.0  # native text is exact, so report maximum confidence

    for x1, y1, x2, y2, word, *rest in raw_word_data:
        # --- FIX: ROBUST SANITIZATION ---
        # Encoding to UTF-8 with errors='ignore' drops un-encodable code
        # points (e.g. lone surrogates) before they reach downstream
        # consumers; then strip whitespace and skip words that vanish.
        cleaned_word = word.encode('utf-8', 'ignore').decode('utf-8').strip()
        if not cleaned_word:
            continue

        # Scale PDF point coordinates up to raster pixel coordinates.
        x1_pix = int(x1 * scale_factor)
        y1_pix = int(y1 * scale_factor)
        x2_pix = int(x2 * scale_factor)
        y2_pix = int(y2 * scale_factor)

        converted_ocr_output.append({
            'type': 'text',
            'word': cleaned_word,
            'confidence': DEFAULT_CONFIDENCE,
            'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
            'y0': y1_pix, 'x0': x1_pix
        })

    return converted_ocr_output