heerjtdev commited on
Commit
589a4ac
·
verified ·
1 Parent(s): 12283fc

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +319 -12
working_yolo_pipeline.py CHANGED
@@ -73,6 +73,23 @@ except Exception as e:
73
 
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  def get_latex_from_base64(base64_string: str) -> str:
78
  """
@@ -559,20 +576,45 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
559
  return img
560
 
561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
563
  raw_word_data = fitz_page.get_text("words")
564
  converted_ocr_output = []
565
  DEFAULT_CONFIDENCE = 99.0
566
 
567
  for x1, y1, x2, y2, word, *rest in raw_word_data:
568
- if not word.strip(): continue
 
 
 
569
  x1_pix = int(x1 * scale_factor)
570
  y1_pix = int(y1 * scale_factor)
571
  x2_pix = int(x2 * scale_factor)
572
  y2_pix = int(y2 * scale_factor)
573
  converted_ocr_output.append({
574
  'type': 'text',
575
- 'word': word,
576
  'confidence': DEFAULT_CONFIDENCE,
577
  'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
578
  'y0': y1_pix, 'x0': x1_pix
@@ -580,6 +622,272 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
580
  return converted_ocr_output
581
 
582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
  def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
584
  page_num: int, fitz_page: fitz.Page,
585
  pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
@@ -701,6 +1009,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
701
 
702
  try:
703
  # Try getting native text first
 
704
  raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
705
  except Exception as e:
706
  print(f" ❌ Native text extraction failed: {e}")
@@ -728,7 +1037,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
728
  # === START OF OPTIMIZED OCR BLOCK ===
729
  try:
730
  # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
731
- # We do this specifically for OCR accuracy, separate from the pipeline image
732
  ocr_zoom = 4.0
733
  pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
734
 
@@ -741,12 +1049,9 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
741
  img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
742
 
743
  # 2. Preprocess (Binarization)
744
- # Ensure 'preprocess_image_for_ocr' is defined at top of file!
745
  processed_img = preprocess_image_for_ocr(img_ocr_np)
746
 
747
  # 3. Run Tesseract with Optimized Configuration
748
- # --oem 3: Default LSTM engine
749
- # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
750
  custom_config = r'--oem 3 --psm 6'
751
 
752
  hocr_data = pytesseract.image_to_data(
@@ -756,11 +1061,13 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
756
  )
757
 
758
  for i in range(len(hocr_data['level'])):
759
- text = hocr_data['text'][i].strip()
760
- if text and hocr_data['conf'][i] > -1:
 
 
 
 
761
  # 4. Coordinate Mapping
762
- # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
763
- # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
764
  scale_adjustment = scale_factor / ocr_zoom
765
 
766
  x1 = int(hocr_data['left'][i] * scale_adjustment)
@@ -772,7 +1079,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
772
 
773
  raw_ocr_output.append({
774
  'type': 'text',
775
- 'word': text,
776
  'confidence': float(hocr_data['conf'][i]),
777
  'bbox': [x1, y1, x2, y2],
778
  'y0': y1,
@@ -781,7 +1088,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
781
  except Exception as e:
782
  print(f" ❌ Tesseract OCR Error: {e}")
783
  # === END OF OPTIMIZED OCR BLOCK ===
784
-
785
  # ====================================================================
786
  # --- STEP 6: OCR CLEANING AND MERGING ---
787
  # ====================================================================
 
73
 
74
 
75
 
76
+ from typing import Optional
77
+
78
# Matches every UTF-16 surrogate code point (U+D800-U+DFFF) plus the
# non-characters U+FFFE and U+FFFF. Lone surrogates cannot be encoded as
# UTF-8, so text containing them raises UnicodeEncodeError when written out.
# Compiled once at import time because sanitize_text runs for every word.
_INVALID_CODEPOINTS_RE = re.compile(r'[\ud800-\udfff\ufffe\uffff]')


def sanitize_text(text: Optional[str]) -> str:
    """Replace surrogates and non-characters in *text* with spaces.

    Args:
        text: The string to clean. ``None`` or any non-string value is
            accepted and treated as empty.

    Returns:
        ``""`` when *text* is not a ``str``; otherwise *text* with each
        surrogate (U+D800-U+DFFF), U+FFFE and U+FFFF replaced by a single
        space. The result is NOT stripped, so a replaced character at either
        end leaves leading/trailing whitespace -- callers should ``.strip()``
        the result if they need a clean token.
    """
    # isinstance already rejects None, so no separate None check is needed.
    if not isinstance(text, str):
        return ""

    return _INVALID_CODEPOINTS_RE.sub(' ', text)
90
+
91
+
92
+
93
 
94
  def get_latex_from_base64(base64_string: str) -> str:
95
  """
 
576
  return img
577
 
578
 
579
+ # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
580
+ # raw_word_data = fitz_page.get_text("words")
581
+ # converted_ocr_output = []
582
+ # DEFAULT_CONFIDENCE = 99.0
583
+
584
+ # for x1, y1, x2, y2, word, *rest in raw_word_data:
585
+ # if not word.strip(): continue
586
+ # x1_pix = int(x1 * scale_factor)
587
+ # y1_pix = int(y1 * scale_factor)
588
+ # x2_pix = int(x2 * scale_factor)
589
+ # y2_pix = int(y2 * scale_factor)
590
+ # converted_ocr_output.append({
591
+ # 'type': 'text',
592
+ # 'word': word,
593
+ # 'confidence': DEFAULT_CONFIDENCE,
594
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
595
+ # 'y0': y1_pix, 'x0': x1_pix
596
+ # })
597
+ # return converted_ocr_output
598
+
599
+
600
+
601
def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
    """Convert a page's native (embedded) text words into OCR-style word dicts.

    Args:
        fitz_page: Page object whose ``get_text("words")`` yields sequences
            that unpack as ``(x1, y1, x2, y2, word, ...)``.
        scale_factor: Multiplier applied to every coordinate before it is
            truncated to ``int``.

    Returns:
        A list of dicts with keys ``'type'`` (always ``'text'``), ``'word'``,
        ``'confidence'`` (a fixed 99.0), ``'bbox'`` (``[x1, y1, x2, y2]``,
        scaled) and ``'y0'`` / ``'x0'`` (the scaled top-left corner). Words
        that are empty after sanitizing and stripping are omitted.
    """
    DEFAULT_CONFIDENCE = 99.0
    converted_ocr_output = []

    for x1, y1, x2, y2, word, *_ in fitz_page.get_text("words"):
        # sanitize_text swaps invalid code points for spaces and leaves the
        # stripping to the caller; strip here so the stored word carries no
        # stray padding (matching what the Tesseract fallback path stores).
        cleaned_word = sanitize_text(word).strip()
        if not cleaned_word:
            continue

        x1_pix = int(x1 * scale_factor)
        y1_pix = int(y1 * scale_factor)
        x2_pix = int(x2 * scale_factor)
        y2_pix = int(y2 * scale_factor)
        converted_ocr_output.append({
            'type': 'text',
            'word': cleaned_word,
            'confidence': DEFAULT_CONFIDENCE,
            'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
            'y0': y1_pix, 'x0': x1_pix
        })
    return converted_ocr_output
623
 
624
 
625
+
626
+
627
+
628
+
629
+ # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
630
+ # page_num: int, fitz_page: fitz.Page,
631
+ # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
632
+ # """
633
+ # OPTIMIZED FLOW:
634
+ # 1. Run YOLO to find Equations/Tables.
635
+ # 2. Mask raw text with YOLO boxes.
636
+ # 3. Run Column Detection on the MASKED data.
637
+ # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
638
+ # """
639
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
640
+
641
+ # start_time_total = time.time()
642
+
643
+ # if original_img is None:
644
+ # print(f" ❌ Invalid image for page {page_num}.")
645
+ # return None, None
646
+
647
+ # # ====================================================================
648
+ # # --- STEP 1: YOLO DETECTION ---
649
+ # # ====================================================================
650
+ # start_time_yolo = time.time()
651
+ # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
652
+
653
+ # relevant_detections = []
654
+ # if results and results[0].boxes:
655
+ # for box in results[0].boxes:
656
+ # class_id = int(box.cls[0])
657
+ # class_name = model.names[class_id]
658
+ # if class_name in TARGET_CLASSES:
659
+ # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
660
+ # relevant_detections.append(
661
+ # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
662
+ # )
663
+
664
+ # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
665
+ # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
666
+
667
+ # # ====================================================================
668
+ # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
669
+ # # ====================================================================
670
+ # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
671
+ # raw_words_for_layout = get_word_data_for_detection(
672
+ # fitz_page, pdf_path, page_num,
673
+ # top_margin_percent=0.10, bottom_margin_percent=0.10
674
+ # )
675
+
676
+ # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
677
+
678
+ # # ====================================================================
679
+ # # --- STEP 3: COLUMN DETECTION ---
680
+ # # ====================================================================
681
+ # page_width_pdf = fitz_page.rect.width
682
+ # page_height_pdf = fitz_page.rect.height
683
+
684
+ # column_detection_params = {
685
+ # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
686
+ # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
687
+ # }
688
+
689
+ # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
690
+
691
+ # page_separator_x = None
692
+ # if separators:
693
+ # central_min = page_width_pdf * 0.35
694
+ # central_max = page_width_pdf * 0.65
695
+ # central_separators = [s for s in separators if central_min <= s <= central_max]
696
+
697
+ # if central_separators:
698
+ # center_x = page_width_pdf / 2
699
+ # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
700
+ # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
701
+ # else:
702
+ # print(" ⚠️ Gutter found off-center. Ignoring.")
703
+ # else:
704
+ # print(" -> Single Column Layout Confirmed.")
705
+
706
+ # # ====================================================================
707
+ # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
708
+ # # ====================================================================
709
+ # start_time_components = time.time()
710
+ # component_metadata = []
711
+ # fig_count_page = 0
712
+ # eq_count_page = 0
713
+
714
+ # for detection in merged_detections:
715
+ # x1, y1, x2, y2 = detection['coords']
716
+ # class_name = detection['class']
717
+
718
+ # if class_name == 'figure':
719
+ # GLOBAL_FIGURE_COUNT += 1
720
+ # counter = GLOBAL_FIGURE_COUNT
721
+ # component_word = f"FIGURE{counter}"
722
+ # fig_count_page += 1
723
+ # elif class_name == 'equation':
724
+ # GLOBAL_EQUATION_COUNT += 1
725
+ # counter = GLOBAL_EQUATION_COUNT
726
+ # component_word = f"EQUATION{counter}"
727
+ # eq_count_page += 1
728
+ # else:
729
+ # continue
730
+
731
+ # component_crop = original_img[y1:y2, x1:x2]
732
+ # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
733
+ # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
734
+
735
+ # y_midpoint = (y1 + y2) // 2
736
+ # component_metadata.append({
737
+ # 'type': class_name, 'word': component_word,
738
+ # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
739
+ # 'y0': int(y_midpoint), 'x0': int(x1)
740
+ # })
741
+
742
+ # # ====================================================================
743
+ # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
744
+ # # ====================================================================
745
+ # raw_ocr_output = []
746
+ # scale_factor = 2.0 # Pipeline standard scale
747
+
748
+ # try:
749
+ # # Try getting native text first
750
+ # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
751
+ # except Exception as e:
752
+ # print(f" ❌ Native text extraction failed: {e}")
753
+
754
+ # # If native text is missing, fall back to OCR
755
+ # if not raw_ocr_output:
756
+ # if _ocr_cache.has_ocr(pdf_path, page_num):
757
+ # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
758
+ # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
759
+ # for word_tuple in cached_word_data:
760
+ # word_text, x1, y1, x2, y2 = word_tuple
761
+
762
+ # # Scale from PDF points to Pipeline Pixels (2.0)
763
+ # x1_pix = int(x1 * scale_factor)
764
+ # y1_pix = int(y1 * scale_factor)
765
+ # x2_pix = int(x2 * scale_factor)
766
+ # y2_pix = int(y2 * scale_factor)
767
+
768
+ # raw_ocr_output.append({
769
+ # 'type': 'text', 'word': word_text, 'confidence': 95.0,
770
+ # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
771
+ # 'y0': y1_pix, 'x0': x1_pix
772
+ # })
773
+ # else:
774
+ # # === START OF OPTIMIZED OCR BLOCK ===
775
+ # try:
776
+ # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
777
+ # # We do this specifically for OCR accuracy, separate from the pipeline image
778
+ # ocr_zoom = 4.0
779
+ # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
780
+
781
+ # # Convert PyMuPDF Pixmap to OpenCV format
782
+ # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
783
+ # pix_ocr.n)
784
+ # if pix_ocr.n == 3:
785
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
786
+ # elif pix_ocr.n == 4:
787
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
788
+
789
+ # # 2. Preprocess (Binarization)
790
+ # # Ensure 'preprocess_image_for_ocr' is defined at top of file!
791
+ # processed_img = preprocess_image_for_ocr(img_ocr_np)
792
+
793
+ # # 3. Run Tesseract with Optimized Configuration
794
+ # # --oem 3: Default LSTM engine
795
+ # # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
796
+ # custom_config = r'--oem 3 --psm 6'
797
+
798
+ # hocr_data = pytesseract.image_to_data(
799
+ # processed_img,
800
+ # output_type=pytesseract.Output.DICT,
801
+ # config=custom_config
802
+ # )
803
+
804
+ # for i in range(len(hocr_data['level'])):
805
+ # text = hocr_data['text'][i].strip()
806
+ # if text and hocr_data['conf'][i] > -1:
807
+ # # 4. Coordinate Mapping
808
+ # # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
809
+ # # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
810
+ # scale_adjustment = scale_factor / ocr_zoom
811
+
812
+ # x1 = int(hocr_data['left'][i] * scale_adjustment)
813
+ # y1 = int(hocr_data['top'][i] * scale_adjustment)
814
+ # w = int(hocr_data['width'][i] * scale_adjustment)
815
+ # h = int(hocr_data['height'][i] * scale_adjustment)
816
+ # x2 = x1 + w
817
+ # y2 = y1 + h
818
+
819
+ # raw_ocr_output.append({
820
+ # 'type': 'text',
821
+ # 'word': text,
822
+ # 'confidence': float(hocr_data['conf'][i]),
823
+ # 'bbox': [x1, y1, x2, y2],
824
+ # 'y0': y1,
825
+ # 'x0': x1
826
+ # })
827
+ # except Exception as e:
828
+ # print(f" ❌ Tesseract OCR Error: {e}")
829
+ # # === END OF OPTIMIZED OCR BLOCK ===
830
+
831
+ # # ====================================================================
832
+ # # --- STEP 6: OCR CLEANING AND MERGING ---
833
+ # # ====================================================================
834
+ # items_to_sort = []
835
+
836
+ # for ocr_word in raw_ocr_output:
837
+ # is_suppressed = False
838
+ # for component in component_metadata:
839
+ # # Do not include words that are inside figure/equation boxes
840
+ # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
841
+ # if ioa > IOA_SUPPRESSION_THRESHOLD:
842
+ # is_suppressed = True
843
+ # break
844
+ # if not is_suppressed:
845
+ # items_to_sort.append(ocr_word)
846
+
847
+ # # Add figures/equations back into the flow as "words"
848
+ # items_to_sort.extend(component_metadata)
849
+
850
+ # # ====================================================================
851
+ # # --- STEP 7: LINE-BASED SORTING ---
852
+ # # ====================================================================
853
+ # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
854
+ # lines = []
855
+
856
+ # for item in items_to_sort:
857
+ # placed = False
858
+ # for line in lines:
859
+ # y_ref = min(it['y0'] for it in line)
860
+ # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
861
+ # line.append(item)
862
+ # placed = True
863
+ # break
864
+ # if not placed and item['type'] in ['equation', 'figure']:
865
+ # for line in lines:
866
+ # y_ref = min(it['y0'] for it in line)
867
+ # if abs(y_ref - item['y0']) < 20:
868
+ # line.append(item)
869
+ # placed = True
870
+ # break
871
+ # if not placed:
872
+ # lines.append([item])
873
+
874
+ # for line in lines:
875
+ # line.sort(key=lambda x: x['x0'])
876
+
877
+ # final_output = []
878
+ # for line in lines:
879
+ # for item in line:
880
+ # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
881
+ # if 'tag' in item: data_item['tag'] = item['tag']
882
+ # final_output.append(data_item)
883
+
884
+ # return final_output, page_separator_x
885
+
886
+
887
+
888
+
889
+
890
+
891
  def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
892
  page_num: int, fitz_page: fitz.Page,
893
  pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
 
1009
 
1010
  try:
1011
  # Try getting native text first
1012
+ # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1013
  raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1014
  except Exception as e:
1015
  print(f" ❌ Native text extraction failed: {e}")
 
1037
  # === START OF OPTIMIZED OCR BLOCK ===
1038
  try:
1039
  # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
 
1040
  ocr_zoom = 4.0
1041
  pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1042
 
 
1049
  img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1050
 
1051
  # 2. Preprocess (Binarization)
 
1052
  processed_img = preprocess_image_for_ocr(img_ocr_np)
1053
 
1054
  # 3. Run Tesseract with Optimized Configuration
 
 
1055
  custom_config = r'--oem 3 --psm 6'
1056
 
1057
  hocr_data = pytesseract.image_to_data(
 
1061
  )
1062
 
1063
  for i in range(len(hocr_data['level'])):
1064
+ text = hocr_data['text'][i] # Retrieve raw Tesseract text
1065
+
1066
+ # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1067
+ cleaned_text = sanitize_text(text).strip()
1068
+
1069
+ if cleaned_text and hocr_data['conf'][i] > -1:
1070
  # 4. Coordinate Mapping
 
 
1071
  scale_adjustment = scale_factor / ocr_zoom
1072
 
1073
  x1 = int(hocr_data['left'][i] * scale_adjustment)
 
1079
 
1080
  raw_ocr_output.append({
1081
  'type': 'text',
1082
+ 'word': cleaned_text, # Use the sanitized word
1083
  'confidence': float(hocr_data['conf'][i]),
1084
  'bbox': [x1, y1, x2, y2],
1085
  'y0': y1,
 
1088
  except Exception as e:
1089
  print(f" ❌ Tesseract OCR Error: {e}")
1090
  # === END OF OPTIMIZED OCR BLOCK ===
1091
+
1092
  # ====================================================================
1093
  # --- STEP 6: OCR CLEANING AND MERGING ---
1094
  # ====================================================================