heerjtdev committed on
Commit
fdafee6
·
verified ·
1 Parent(s): 2e5b054

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +83 -11
working_yolo_pipeline.py CHANGED
@@ -667,22 +667,87 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
667
 
668
 
669
 
670
- def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
671
- raw_word_data = fitz_page.get_text("words")
672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
673
  # ==============================================================================
674
- # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
675
  # ==============================================================================
676
  print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
 
677
  debug_count = 0
678
  for item in raw_word_data:
679
  if debug_count >= 50: break
680
- # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
681
  word_text = item[4]
682
 
683
- # Generate unicode hex codes for every character in the word
684
- unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
685
- print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
 
 
 
 
 
 
 
 
 
686
  debug_count += 1
687
  print("----------------------------------------------------------------------\n")
688
  # ==============================================================================
@@ -691,21 +756,28 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
691
  DEFAULT_CONFIDENCE = 99.0
692
 
693
  for x1, y1, x2, y2, word, *rest in raw_word_data:
694
- # --- FIX: SANITIZE TEXT HERE ---
695
- cleaned_word = sanitize_text(word)
696
- if not cleaned_word.strip(): continue
 
 
 
 
 
697
 
698
  x1_pix = int(x1 * scale_factor)
699
  y1_pix = int(y1 * scale_factor)
700
  x2_pix = int(x2 * scale_factor)
701
  y2_pix = int(y2 * scale_factor)
 
702
  converted_ocr_output.append({
703
  'type': 'text',
704
- 'word': cleaned_word, # Use the sanitized word
705
  'confidence': DEFAULT_CONFIDENCE,
706
  'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
707
  'y0': y1_pix, 'x0': x1_pix
708
  })
 
709
  return converted_ocr_output
710
 
711
 
 
667
 
668
 
669
 
670
+ # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
671
+ # raw_word_data = fitz_page.get_text("words")
672
 
673
+ # # ==============================================================================
674
+ # # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS ---
675
+ # # ==============================================================================
676
+ # print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
677
+ # debug_count = 0
678
+ # for item in raw_word_data:
679
+ # if debug_count >= 50: break
680
+ # # item format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
681
+ # word_text = item[4]
682
+
683
+ # # Generate unicode hex codes for every character in the word
684
+ # unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
685
+ # print(f" Word {debug_count}: '{word_text}' -> Codes: {unicode_points}")
686
+ # debug_count += 1
687
+ # print("----------------------------------------------------------------------\n")
688
+ # # ==============================================================================
689
+
690
+ # converted_ocr_output = []
691
+ # DEFAULT_CONFIDENCE = 99.0
692
+
693
+ # for x1, y1, x2, y2, word, *rest in raw_word_data:
694
+ # # --- FIX: SANITIZE TEXT HERE ---
695
+ # cleaned_word = sanitize_text(word)
696
+ # if not cleaned_word.strip(): continue
697
+
698
+ # x1_pix = int(x1 * scale_factor)
699
+ # y1_pix = int(y1 * scale_factor)
700
+ # x2_pix = int(x2 * scale_factor)
701
+ # y2_pix = int(y2 * scale_factor)
702
+ # converted_ocr_output.append({
703
+ # 'type': 'text',
704
+ # 'word': cleaned_word, # Use the sanitized word
705
+ # 'confidence': DEFAULT_CONFIDENCE,
706
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
707
+ # 'y0': y1_pix, 'x0': x1_pix
708
+ # })
709
+ # return converted_ocr_output
710
+
711
+
712
+
713
+
714
+
715
+
716
+
717
+
718
+
719
+
720
def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
    """Extract native (embedded) text words from a PyMuPDF page and convert
    them to the pipeline's OCR-style output records.

    Args:
        fitz_page: A fitz (PyMuPDF) page object. Only ``get_text("words")``
            and ``number`` are used, so any duck-typed page works.
        scale_factor: Multiplier mapping PDF point coordinates to pixel
            coordinates of the rendered page image (default 2.0).

    Returns:
        list[dict]: One dict per non-empty word with keys ``'type'``,
        ``'word'``, ``'confidence'``, ``'bbox'`` ([x1, y1, x2, y2] in
        pixels), ``'y0'`` and ``'x0'``. Empty list if extraction fails.
    """
    # 1. Get raw data; get_text("words") yields tuples of
    #    (x0, y0, x1, y1, word, block_no, line_no, word_no).
    try:
        raw_word_data = fitz_page.get_text("words")
    except Exception as e:
        print(f" ❌ PyMuPDF extraction failed completely: {e}")
        return []

    # ==========================================================================
    # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS (SAFE PRINT) ---
    # ==========================================================================
    print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")

    for debug_count, item in enumerate(raw_word_data):
        if debug_count >= 50:
            break

        word_text = item[4]

        # --- SAFE PRINTING LOGIC ---
        # encode/decode with errors='ignore' drops lone surrogates, so the
        # print itself cannot raise the UnicodeEncodeError that previously
        # crashed the script.
        safe_text = word_text.encode('utf-8', 'ignore').decode('utf-8')

        # ord() cannot fail on characters of a str, but guard narrowly
        # (instead of the original bare `except:`) in case item[4] is not
        # a str in some malformed PDF.
        try:
            unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
        except (TypeError, ValueError):
            unicode_points = ["ERROR"]

        print(f" Word {debug_count}: '{safe_text}' -> Codes: {unicode_points}")
    print("----------------------------------------------------------------------\n")
    # ==========================================================================

    converted_ocr_output = []
    # Native text is trusted; there is no real OCR confidence to report.
    DEFAULT_CONFIDENCE = 99.0

    for x1, y1, x2, y2, word, *rest in raw_word_data:
        # --- ROBUST SANITIZATION ---
        # Encoding with errors='ignore' strips lone surrogates (the cause
        # of the earlier UnicodeEncodeError); then trim whitespace and skip
        # words that are empty after cleaning.
        cleaned_word = word.encode('utf-8', 'ignore').decode('utf-8').strip()
        if not cleaned_word:
            continue

        # Scale PDF point coordinates up to pixel coordinates of the image.
        x1_pix = int(x1 * scale_factor)
        y1_pix = int(y1 * scale_factor)
        x2_pix = int(x2 * scale_factor)
        y2_pix = int(y2 * scale_factor)

        converted_ocr_output.append({
            'type': 'text',
            'word': cleaned_word,
            'confidence': DEFAULT_CONFIDENCE,
            'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
            'y0': y1_pix, 'x0': x1_pix
        })

    return converted_ocr_output
782
 
783