heerjtdev committed on
Commit
7249be5
·
verified ·
1 Parent(s): bc2e64c

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +215 -72
working_yolo_pipeline.py CHANGED
@@ -17,6 +17,30 @@ torch.load = patched_torch_load
17
 
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  import json
21
  import argparse
22
  import os
@@ -511,10 +535,71 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
511
 
512
  return sorted(final_separators)
513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
516
  top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
517
- """Extract word data with OCR caching to avoid redundant Tesseract runs."""
518
  word_data = page.get_text("words")
519
 
520
  if len(word_data) > 0:
@@ -524,45 +609,40 @@ def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
524
  word_data = _ocr_cache.get_ocr(pdf_path, page_num)
525
  else:
526
  try:
527
- # --- OPTIMIZATION START ---
528
- # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
529
  zoom_level = 4.0
530
  pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
531
-
532
- # 2. Convert directly to OpenCV format (Faster than PIL)
533
  img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
534
- if pix.n == 3:
535
- img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
536
- elif pix.n == 4:
537
- img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
538
-
539
- # 3. Apply Preprocessing (Thresholding)
540
- processed_img = preprocess_image_for_ocr(img_np)
541
-
542
- # 4. Optimized Tesseract Config
543
- # --psm 6: Assume a single uniform block of text (Great for columns/questions)
544
- # --oem 3: Default engine (LSTM)
545
- custom_config = r'--oem 3 --psm 6'
546
 
547
- data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
548
- config=custom_config)
 
 
549
 
550
  full_word_data = []
551
- for i in range(len(data['level'])):
552
- text = data['text'][i].strip()
553
- if text:
554
- # Scale coordinates back to PDF points
555
- x1 = data['left'][i] / zoom_level
556
- y1 = data['top'][i] / zoom_level
557
- x2 = (data['left'][i] + data['width'][i]) / zoom_level
558
- y2 = (data['top'][i] + data['height'][i]) / zoom_level
559
- full_word_data.append((text, x1, y1, x2, y2))
 
 
 
 
 
560
 
561
  word_data = full_word_data
562
  _ocr_cache.set_ocr(pdf_path, page_num, word_data)
563
- # --- OPTIMIZATION END ---
564
  except Exception as e:
565
- print(f" ❌ OCR Error in detection phase: {e}")
566
  return []
567
 
568
  # Apply margin filtering
@@ -572,6 +652,17 @@ def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
572
  return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
573
 
574
 
 
 
 
 
 
 
 
 
 
 
 
575
  def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
576
  img_data = pix.samples
577
  img = np.frombuffer(img_data, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
@@ -992,58 +1083,110 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
992
  })
993
  else:
994
  # === START OF OPTIMIZED OCR BLOCK ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
995
  try:
996
- # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
997
  ocr_zoom = 4.0
998
  pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
999
 
1000
- # Convert PyMuPDF Pixmap to OpenCV format
1001
- img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1002
- pix_ocr.n)
 
1003
  if pix_ocr.n == 3:
1004
  img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1005
  elif pix_ocr.n == 4:
1006
  img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1007
 
1008
- # 2. Preprocess (Binarization)
1009
- processed_img = preprocess_image_for_ocr(img_ocr_np)
1010
-
1011
- # 3. Run Tesseract with Optimized Configuration
1012
- custom_config = r'--oem 3 --psm 6'
1013
-
1014
- hocr_data = pytesseract.image_to_data(
1015
- processed_img,
1016
- output_type=pytesseract.Output.DICT,
1017
- config=custom_config
1018
- )
1019
-
1020
- for i in range(len(hocr_data['level'])):
1021
- text = hocr_data['text'][i] # Retrieve raw Tesseract text
1022
-
1023
- # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1024
- cleaned_text = sanitize_text(text).strip()
1025
-
1026
- if cleaned_text and hocr_data['conf'][i] > -1:
1027
- # 4. Coordinate Mapping
1028
- scale_adjustment = scale_factor / ocr_zoom
1029
-
1030
- x1 = int(hocr_data['left'][i] * scale_adjustment)
1031
- y1 = int(hocr_data['top'][i] * scale_adjustment)
1032
- w = int(hocr_data['width'][i] * scale_adjustment)
1033
- h = int(hocr_data['height'][i] * scale_adjustment)
1034
- x2 = x1 + w
1035
- y2 = y1 + h
1036
-
1037
- raw_ocr_output.append({
1038
- 'type': 'text',
1039
- 'word': cleaned_text, # Use the sanitized word
1040
- 'confidence': float(hocr_data['conf'][i]),
1041
- 'bbox': [x1, y1, x2, y2],
1042
- 'y0': y1,
1043
- 'x0': x1
1044
- })
1045
  except Exception as e:
1046
- print(f" ❌ Tesseract OCR Error: {e}")
 
 
1047
  # === END OF OPTIMIZED OCR BLOCK ===
1048
 
1049
  # ====================================================================
 
17
 
18
 
19
 
20
+
21
+
22
+ #==================================================================================
23
+ #RAPID OCR
24
+ #==================================================================================
25
+
26
+ from rapidocr import RapidOCR, OCRVersion
27
+
28
+ # Initialize RapidOCR (v5 is generally the most accurate current version)
29
+ # We use return_word_box=True to get word-level precision similar to Tesseract's image_to_data
30
+ ocr_engine = RapidOCR(params={
31
+ "Det.ocr_version": OCRVersion.PPOCRV5,
32
+ "Rec.ocr_version": OCRVersion.PPOCRV5,
33
+ "return_word_box": True
34
+ })
35
+
36
+
37
+
38
+ #==================================================================================
39
+ #RAPID OCR
40
+ #==================================================================================
41
+
42
+
43
+
44
  import json
45
  import argparse
46
  import os
 
535
 
536
  return sorted(final_separators)
537
 
538
+ #======================================================================================================================================
539
+ # def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
540
+ # top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
541
+ # """Extract word data with OCR caching to avoid redundant Tesseract runs."""
542
+ # word_data = page.get_text("words")
543
+
544
+ # if len(word_data) > 0:
545
+ # word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
546
+ # else:
547
+ # if _ocr_cache.has_ocr(pdf_path, page_num):
548
+ # word_data = _ocr_cache.get_ocr(pdf_path, page_num)
549
+ # else:
550
+ # try:
551
+ # # --- OPTIMIZATION START ---
552
+ # # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
553
+ # zoom_level = 4.0
554
+ # pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
555
+
556
+ # # 2. Convert directly to OpenCV format (Faster than PIL)
557
+ # img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
558
+ # if pix.n == 3:
559
+ # img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
560
+ # elif pix.n == 4:
561
+ # img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
562
+
563
+ # # 3. Apply Preprocessing (Thresholding)
564
+ # processed_img = preprocess_image_for_ocr(img_np)
565
+
566
+ # # 4. Optimized Tesseract Config
567
+ # # --psm 6: Assume a single uniform block of text (Great for columns/questions)
568
+ # # --oem 3: Default engine (LSTM)
569
+ # custom_config = r'--oem 3 --psm 6'
570
+
571
+ # data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
572
+ # config=custom_config)
573
+
574
+ # full_word_data = []
575
+ # for i in range(len(data['level'])):
576
+ # text = data['text'][i].strip()
577
+ # if text:
578
+ # # Scale coordinates back to PDF points
579
+ # x1 = data['left'][i] / zoom_level
580
+ # y1 = data['top'][i] / zoom_level
581
+ # x2 = (data['left'][i] + data['width'][i]) / zoom_level
582
+ # y2 = (data['top'][i] + data['height'][i]) / zoom_level
583
+ # full_word_data.append((text, x1, y1, x2, y2))
584
+
585
+ # word_data = full_word_data
586
+ # _ocr_cache.set_ocr(pdf_path, page_num, word_data)
587
+ # # --- OPTIMIZATION END ---
588
+ # except Exception as e:
589
+ # print(f" ❌ OCR Error in detection phase: {e}")
590
+ # return []
591
+
592
+ # # Apply margin filtering
593
+ # page_height = page.rect.height
594
+ # y_min = page_height * top_margin_percent
595
+ # y_max = page_height * (1 - bottom_margin_percent)
596
+ # return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
597
+
598
+ #============================================================================================================
599
+
600
 
601
  def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
602
  top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
 
603
  word_data = page.get_text("words")
604
 
605
  if len(word_data) > 0:
 
609
  word_data = _ocr_cache.get_ocr(pdf_path, page_num)
610
  else:
611
  try:
612
+ # 1. Render at Higher Resolution
 
613
  zoom_level = 4.0
614
  pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
 
 
615
  img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
616
+
617
+ # Convert to BGR for RapidOCR
618
+ if pix.n == 3: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
619
+ elif pix.n == 4: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
 
 
 
 
 
 
 
 
620
 
621
+ # 2. Run RapidOCR
622
+ # Assumed RapidOCR result format: [[box, text, score], ...] (NOTE(review): confirm against the return type of the installed rapidocr version)
623
+ # where box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
624
+ results, _ = ocr_engine(img_np)
625
 
626
  full_word_data = []
627
+ if results:
628
+ for box, text, score in results:
629
+ text = text.strip()
630
+ if text:
631
+ # 3. Convert Polygon to BBox and Scale back to PDF points
632
+ xs = [p[0] for p in box]
633
+ ys = [p[1] for p in box]
634
+
635
+ x1 = min(xs) / zoom_level
636
+ y1 = min(ys) / zoom_level
637
+ x2 = max(xs) / zoom_level
638
+ y2 = max(ys) / zoom_level
639
+
640
+ full_word_data.append((text, x1, y1, x2, y2))
641
 
642
  word_data = full_word_data
643
  _ocr_cache.set_ocr(pdf_path, page_num, word_data)
 
644
  except Exception as e:
645
+ print(f" ❌ RapidOCR Error in detection phase: {e}")
646
  return []
647
 
648
  # Apply margin filtering
 
652
  return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
653
 
654
 
655
+
656
+ #=========================================================================================================================================
657
+ #=============================================================================================================================================
658
+
659
+
660
+
661
+
662
+
663
+
664
+
665
+
666
  def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
667
  img_data = pix.samples
668
  img = np.frombuffer(img_data, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
 
1083
  })
1084
  else:
1085
  # === START OF OPTIMIZED OCR BLOCK ===
1086
+ # try:
1087
+ # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1088
+ # ocr_zoom = 4.0
1089
+ # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1090
+
1091
+ # # Convert PyMuPDF Pixmap to OpenCV format
1092
+ # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1093
+ # pix_ocr.n)
1094
+ # if pix_ocr.n == 3:
1095
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1096
+ # elif pix_ocr.n == 4:
1097
+ # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1098
+
1099
+ # # 2. Preprocess (Binarization)
1100
+ # processed_img = preprocess_image_for_ocr(img_ocr_np)
1101
+
1102
+ # # 3. Run Tesseract with Optimized Configuration
1103
+ # custom_config = r'--oem 3 --psm 6'
1104
+
1105
+ # hocr_data = pytesseract.image_to_data(
1106
+ # processed_img,
1107
+ # output_type=pytesseract.Output.DICT,
1108
+ # config=custom_config
1109
+ # )
1110
+
1111
+ # for i in range(len(hocr_data['level'])):
1112
+ # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1113
+
1114
+ # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1115
+ # cleaned_text = sanitize_text(text).strip()
1116
+
1117
+ # if cleaned_text and hocr_data['conf'][i] > -1:
1118
+ # # 4. Coordinate Mapping
1119
+ # scale_adjustment = scale_factor / ocr_zoom
1120
+
1121
+ # x1 = int(hocr_data['left'][i] * scale_adjustment)
1122
+ # y1 = int(hocr_data['top'][i] * scale_adjustment)
1123
+ # w = int(hocr_data['width'][i] * scale_adjustment)
1124
+ # h = int(hocr_data['height'][i] * scale_adjustment)
1125
+ # x2 = x1 + w
1126
+ # y2 = y1 + h
1127
+
1128
+ # raw_ocr_output.append({
1129
+ # 'type': 'text',
1130
+ # 'word': cleaned_text, # Use the sanitized word
1131
+ # 'confidence': float(hocr_data['conf'][i]),
1132
+ # 'bbox': [x1, y1, x2, y2],
1133
+ # 'y0': y1,
1134
+ # 'x0': x1
1135
+ # })
1136
+ # except Exception as e:
1137
+
1138
+ # print(f" ❌ Tesseract OCR Error: {e}")
1139
+ #=============================================================================================================================================================
1140
+ #=============================================================================================================================================================
1141
+ else:
1142
+ # === START OF RAPIDOCR BLOCK ===
1143
  try:
1144
+ # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
1145
  ocr_zoom = 4.0
1146
  pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1147
 
1148
+ # Convert PyMuPDF Pixmap to OpenCV format (BGR)
1149
+ img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(
1150
+ pix_ocr.height, pix_ocr.width, pix_ocr.n
1151
+ )
1152
  if pix_ocr.n == 3:
1153
  img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1154
  elif pix_ocr.n == 4:
1155
  img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1156
 
1157
+ # 2. Run RapidOCR (Models handle preprocessing internally)
1158
+ results, _ = ocr_engine(img_ocr_np)
1159
+
1160
+ if results:
1161
+ # Calculate scaling from OCR image (4.0) to your pipeline standard (scale_factor=2.0)
1162
+ scale_adjustment = scale_factor / ocr_zoom
1163
+
1164
+ for box, text, score in results:
1165
+ # Sanitize and clean text
1166
+ cleaned_text = sanitize_text(text).strip()
1167
+
1168
+ if cleaned_text:
1169
+ # 3. Coordinate Mapping (Convert 4-point polygon to x1, y1, x2, y2)
1170
+ xs = [p[0] for p in box]
1171
+ ys = [p[1] for p in box]
1172
+
1173
+ x1 = int(min(xs) * scale_adjustment)
1174
+ y1 = int(min(ys) * scale_adjustment)
1175
+ x2 = int(max(xs) * scale_adjustment)
1176
+ y2 = int(max(ys) * scale_adjustment)
1177
+
1178
+ raw_ocr_output.append({
1179
+ 'type': 'text',
1180
+ 'word': cleaned_text,
1181
+ 'confidence': float(score) * 100, # Converting 0-1.0 to 0-100 scale
1182
+ 'bbox': [x1, y1, x2, y2],
1183
+ 'y0': y1,
1184
+ 'x0': x1
1185
+ })
 
 
 
 
 
 
 
 
1186
  except Exception as e:
1187
+ print(f" ❌ RapidOCR Fallback Error: {e}")
1188
+ # === END OF RAPIDOCR BLOCK ====================================================================================================================================
1189
+ #===========================================================================================================================================================================
1190
  # === END OF OPTIMIZED OCR BLOCK ===
1191
 
1192
  # ====================================================================