Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files - working_yolo_pipeline.py (+4 -3)
working_yolo_pipeline.py
CHANGED
|
@@ -759,10 +759,11 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
|
|
| 759 |
# --- FIX: ROBUST SANITIZATION ---
|
| 760 |
# 1. Encode to UTF-8 ignoring errors (strips surrogates)
|
| 761 |
# 2. Decode back to string
|
| 762 |
-
cleaned_word_bytes = word.encode('utf-8', 'ignore')
|
| 763 |
-
cleaned_word = cleaned_word_bytes.decode('utf-8')
|
|
|
|
| 764 |
|
| 765 |
-
cleaned_word = cleaned_word.strip()
|
| 766 |
if not cleaned_word: continue
|
| 767 |
|
| 768 |
x1_pix = int(x1 * scale_factor)
|
|
|
|
| 759 |
# --- FIX: ROBUST SANITIZATION ---
|
| 760 |
# 1. Encode to UTF-8 ignoring errors (strips surrogates)
|
| 761 |
# 2. Decode back to string
|
| 762 |
+
# cleaned_word_bytes = word.encode('utf-8', 'ignore')
|
| 763 |
+
# cleaned_word = cleaned_word_bytes.decode('utf-8')
|
| 764 |
+
cleaned_word = word.encode('utf-8', 'ignore').decode('utf-8').strip()
|
| 765 |
|
| 766 |
+
# cleaned_word = cleaned_word.strip()
|
| 767 |
if not cleaned_word: continue
|
| 768 |
|
| 769 |
x1_pix = int(x1 * scale_factor)
|