Spaces:
Sleeping
Sleeping
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +12 -12
working_yolo_pipeline.py
CHANGED
|
@@ -75,18 +75,18 @@ except Exception as e:
|
|
| 75 |
|
| 76 |
from typing import Optional
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
-
#
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
|
| 87 |
-
#
|
| 88 |
-
#
|
| 89 |
-
|
| 90 |
|
| 91 |
|
| 92 |
|
|
@@ -587,8 +587,8 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
|
|
| 587 |
|
| 588 |
for x1, y1, x2, y2, word, *rest in raw_word_data:
|
| 589 |
# --- FIX: SANITIZE TEXT HERE ---
|
| 590 |
-
|
| 591 |
-
|
| 592 |
|
| 593 |
x1_pix = int(x1 * scale_factor)
|
| 594 |
y1_pix = int(y1 * scale_factor)
|
|
|
|
| 75 |
|
| 76 |
from typing import Optional
|
| 77 |
|
| 78 |
+
# Compiled once at import time rather than on every call: sanitize_text() is
# invoked for each word produced by the word-extraction loop.
# Matches every surrogate code point (U+D800-U+DFFF) plus the two
# non-characters U+FFFE and U+FFFF.
_SURROGATES_AND_NONCHARS_RE = re.compile(r'[\ud800-\udfff\ufffe\uffff]')


def sanitize_text(text: Optional[str]) -> str:
    """Replace surrogates and the non-characters U+FFFE/U+FFFF with spaces.

    Lone surrogate code points (for example ``'\\udefd'``) cannot be encoded
    as UTF-8, so they are neutralised here before the text is used further.

    Args:
        text: The string to clean. ``None`` and any non-``str`` value are
            accepted and treated as "no text".

    Returns:
        ``text`` with each matched code point replaced by a single space, or
        ``""`` when ``text`` is not a string. The result is NOT stripped;
        callers are expected to strip it themselves.
    """
    # isinstance() already rejects None, so no separate None check is needed.
    if not isinstance(text, str):
        return ""

    # One space per offending character keeps neighbouring characters from
    # being glued together.
    return _SURROGATES_AND_NONCHARS_RE.sub(' ', text)
|
| 90 |
|
| 91 |
|
| 92 |
|
|
|
|
| 587 |
|
| 588 |
for x1, y1, x2, y2, word, *rest in raw_word_data:
|
| 589 |
# --- FIX: SANITIZE TEXT HERE ---
|
| 590 |
+
cleaned_word = sanitize_text(word)
|
| 591 |
+
if not cleaned_word.strip(): continue
|
| 592 |
|
| 593 |
x1_pix = int(x1 * scale_factor)
|
| 594 |
y1_pix = int(y1 * scale_factor)
|