Spaces:

heerjtdev
/

layout_latex

Sleeping

heerjtdev committed on Dec 5, 2025

Commit

389d827

verified ·

1 Parent(s): 06e1970

Update working_yolo_pipeline.py

Files changed (1) hide show

working_yolo_pipeline.py CHANGED Viewed

@@ -75,18 +75,18 @@ except Exception as e:
 from typing import Optional
-# def sanitize_text(text: Optional[str]) -> str:
-#     """Removes surrogate characters and other invalid code points that cause UTF-8 encoding errors."""
-#     if not isinstance(text, str) or text is None:
-#         return ""
-#     # Matches all surrogates (\ud800-\udfff) and common non-characters (\ufffe, \uffff).
-#     # This specifically removes '\udefd' which is causing your error.
-#     surrogates_and_nonchars = re.compile(r'[\ud800-\udfff\ufffe\uffff]')
-#     # Replace the invalid characters with a standard space.
-#     # We strip afterward in the calling function.
-#     return surrogates_and_nonchars.sub(' ', text)
@@ -587,8 +587,8 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
     for x1, y1, x2, y2, word, *rest in raw_word_data:
         # --- FIX: SANITIZE TEXT HERE ---
-        # cleaned_word = sanitize_text(word)
-        # if not cleaned_word.strip(): continue
         x1_pix = int(x1 * scale_factor)
         y1_pix = int(y1 * scale_factor)

 from typing import Optional
+# Surrogates (U+D800-U+DFFF) plus the non-characters U+FFFE and U+FFFF.
+# Compiled once at import time because sanitize_text() runs once per word.
+_INVALID_CODE_POINTS = re.compile(r'[\ud800-\udfff\ufffe\uffff]')
+
+
+def sanitize_text(text: Optional[str]) -> str:
+    """Replace surrogates and the non-characters U+FFFE/U+FFFF with spaces.
+
+    Lone surrogate code points (for example '\\udefd') cannot be encoded as
+    UTF-8, so every matching code point is replaced by a single space.
+
+    Args:
+        text: The string to clean. ``None`` and non-string values are accepted.
+
+    Returns:
+        The cleaned string, or ``""`` when ``text`` is not a ``str``. The
+        result is not stripped; callers are expected to strip it themselves.
+    """
+    # isinstance() already rejects None, so no separate None check is needed.
+    if not isinstance(text, str):
+        return ""
+    return _INVALID_CODE_POINTS.sub(' ', text)
     for x1, y1, x2, y2, word, *rest in raw_word_data:
         # --- FIX: SANITIZE TEXT HERE ---
+        cleaned_word = sanitize_text(word)
+        if not cleaned_word.strip(): continue
         x1_pix = int(x1 * scale_factor)
         y1_pix = int(y1 * scale_factor)