heerjtdev committed on
Commit
389d827
·
verified ·
1 Parent(s): 06e1970

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +12 -12
working_yolo_pipeline.py CHANGED
@@ -75,18 +75,18 @@ except Exception as e:
75
 
76
  from typing import Optional
77
 
78
- # def sanitize_text(text: Optional[str]) -> str:
79
- # """Removes surrogate characters and other invalid code points that cause UTF-8 encoding errors."""
80
- # if not isinstance(text, str) or text is None:
81
- # return ""
82
 
83
- # # Matches all surrogates (\ud800-\udfff) and common non-characters (\ufffe, \uffff).
84
- # # This specifically removes '\udefd' which is causing your error.
85
- # surrogates_and_nonchars = re.compile(r'[\ud800-\udfff\ufffe\uffff]')
86
 
87
- # # Replace the invalid characters with a standard space.
88
- # # We strip afterward in the calling function.
89
- # return surrogates_and_nonchars.sub(' ', text)
90
 
91
 
92
 
@@ -587,8 +587,8 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
587
 
588
  for x1, y1, x2, y2, word, *rest in raw_word_data:
589
  # --- FIX: SANITIZE TEXT HERE ---
590
- # cleaned_word = sanitize_text(word)
591
- # if not cleaned_word.strip(): continue
592
 
593
  x1_pix = int(x1 * scale_factor)
594
  y1_pix = int(y1 * scale_factor)
 
75
 
76
  from typing import Optional
77
 
78
def sanitize_text(text: Optional[str]) -> str:
    """Replace surrogate and non-character code points in *text* with spaces.

    Every code point in the surrogate range (U+D800-U+DFFF), as well as
    U+FFFE and U+FFFF, is swapped for a single space character. Lone
    surrogates such as U+DEFD cannot be encoded as UTF-8, so they have to
    be removed before the text is serialized.

    Any argument that is not a ``str`` (``None`` included) produces an
    empty string. The result is not stripped here; trimming is left to
    the caller.
    """
    # isinstance() is already False for None, so one guard covers both cases.
    if isinstance(text, str):
        return re.sub(r'[\ud800-\udfff\ufffe\uffff]', ' ', text)
    return ""
90
 
91
 
92
 
 
587
 
588
  for x1, y1, x2, y2, word, *rest in raw_word_data:
589
  # --- FIX: SANITIZE TEXT HERE ---
590
+ cleaned_word = sanitize_text(word)
591
+ if not cleaned_word.strip(): continue
592
 
593
  x1_pix = int(x1 * scale_factor)
594
  y1_pix = int(y1 * scale_factor)