heerjtdev commited on
Commit
27670a9
·
verified ·
1 Parent(s): 94d6916

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +592 -816
working_yolo_pipeline.py CHANGED
@@ -23,8 +23,8 @@ import re
23
 
24
  import torch.nn as nn
25
  from TorchCRF import CRF
26
- from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
27
- # from transformers import LayoutLMv3Tokenizer, LayoutLMv3Model, LayoutLMv3Config
28
  from typing import List, Dict, Any, Optional, Union, Tuple
29
  from ultralytics import YOLO
30
  import glob
@@ -75,51 +75,18 @@ except Exception as e:
75
 
76
  from typing import Optional
77
 
78
- # def sanitize_text(text: Optional[str]) -> str:
79
- # """Removes surrogate characters and other invalid code points that cause UTF-8 encoding errors."""
80
- # if not isinstance(text, str) or text is None:
81
- # return ""
82
-
83
- # # Matches all surrogates (\ud800-\udfff) and common non-characters (\ufffe, \uffff).
84
- # # This specifically removes '\udefd' which is causing your error.
85
- # surrogates_and_nonchars = re.compile(r'[\ud800-\udfff\ufffe\uffff]')
86
-
87
- # # Replace the invalid characters with a standard space.
88
- # # We strip afterward in the calling function.
89
- # return surrogates_and_nonchars.sub(' ', text)
90
-
91
-
92
-
93
-
94
- # Robust sanitize_text: removes surrogates/non-characters, normalizes line breaks,
95
- # collapses multiple spaces, and removes remaining invalid bytes via utf-8 ignore.
96
- import unicodedata
97
-
98
  def sanitize_text(text: Optional[str]) -> str:
 
99
  if not isinstance(text, str) or text is None:
100
  return ""
101
-
102
- # 1) Normalize common unicode forms (NFKC keeps compatibility forms reasonable)
103
- try:
104
- text = unicodedata.normalize("NFC", text)
105
- except Exception:
106
- pass
107
-
108
- # 2) Remove surrogate codepoint range and common non-characters
109
- # \ud800-\udfff are surrogate halves; also remove \ufffe and \uffff
110
- text = re.sub(r'[\uD800-\uDFFF\uFFFE\uFFFF]', ' ', text)
111
-
112
- # 3) Remove other control chars except common whitespace (newline/tab)
113
- text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
114
-
115
- # 4) Normalize newlines to single space, collapse repeated whitespace
116
- text = re.sub(r'[\r\n]+', ' ', text)
117
- text = re.sub(r'\s+', ' ', text).strip()
118
-
119
- # 5) Final safety: encode/decode ignoring errors (this strips any remaining bad bytes)
120
- cleaned = text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
121
- return cleaned
122
-
123
 
124
 
125
 
@@ -750,105 +717,74 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
750
 
751
 
752
 
753
- # def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
754
- # # 1. Get raw data
755
- # try:
756
- # raw_word_data = fitz_page.get_text("words")
757
- # except Exception as e:
758
- # print(f" ❌ PyMuPDF extraction failed completely: {e}")
759
- # return []
760
 
761
- # # ==============================================================================
762
- # # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS (SAFE PRINT) ---
763
- # # ==============================================================================
764
- # print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
765
 
766
- # debug_count = 0
767
- # for item in raw_word_data:
768
- # if debug_count >= 150: break
769
 
770
- # word_text = item[4]
771
 
772
- # # --- SAFE PRINTING LOGIC ---
773
- # # We encode/decode to ignore surrogates just for the print statement
774
- # # This prevents the "UnicodeEncodeError" that was crashing your script
775
- # safe_text = word_text.encode('utf-8', 'ignore').decode('utf-8')
776
 
777
- # # Get hex codes (handling potential errors in 'ord')
778
- # try:
779
- # unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
780
- # except:
781
- # unicode_points = ["ERROR"]
782
 
783
- # print(f" Word {debug_count}: '{safe_text}' -> Codes: {unicode_points}")
784
- # debug_count += 1
785
- # print("----------------------------------------------------------------------\n")
786
- # # ==============================================================================
787
-
788
- # converted_ocr_output = []
789
- # DEFAULT_CONFIDENCE = 99.0
790
-
791
- # for x1, y1, x2, y2, word, *rest in raw_word_data:
792
- # # --- FIX: ROBUST SANITIZATION ---
793
- # # 1. Encode to UTF-8 ignoring errors (strips surrogates)
794
- # # 2. Decode back to string
795
- # cleaned_word_bytes = word.strip()
796
- # # cleaned_word_bytes = word.encode('utf-8', 'ignore')
797
- # # cleaned_word = cleaned_word_bytes.decode('utf-8')
798
- # # cleaned_word = word.encode('utf-8', 'ignore').decode('utf-8').strip()
799
-
800
- # # cleaned_word = cleaned_word.strip()
801
- # if not cleaned_word: continue
802
-
803
- # x1_pix = int(x1 * scale_factor)
804
- # y1_pix = int(y1 * scale_factor)
805
- # x2_pix = int(x2 * scale_factor)
806
- # y2_pix = int(y2 * scale_factor)
807
-
808
- # converted_ocr_output.append({
809
- # 'type': 'text',
810
- # 'word': cleaned_word,
811
- # 'confidence': DEFAULT_CONFIDENCE,
812
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
813
- # 'y0': y1_pix, 'x0': x1_pix
814
- # })
815
-
816
- # return converted_ocr_output
817
 
818
-
819
-
820
-
821
-
822
-
823
-
824
- def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
825
- raw_word_data = fitz_page.get_text("words")
826
  converted_ocr_output = []
827
  DEFAULT_CONFIDENCE = 99.0
828
 
829
  for x1, y1, x2, y2, word, *rest in raw_word_data:
830
- if not word.strip(): continue
 
 
 
 
 
 
 
 
 
831
  x1_pix = int(x1 * scale_factor)
832
  y1_pix = int(y1 * scale_factor)
833
  x2_pix = int(x2 * scale_factor)
834
  y2_pix = int(y2 * scale_factor)
 
835
  converted_ocr_output.append({
836
  'type': 'text',
837
- 'word': word,
838
  'confidence': DEFAULT_CONFIDENCE,
839
  'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
840
  'y0': y1_pix, 'x0': x1_pix
841
  })
 
842
  return converted_ocr_output
843
 
844
 
845
 
846
 
847
 
848
-
849
-
850
-
851
-
852
  #===================================================================================================
853
  #===================================================================================================
854
  #===================================================================================================
@@ -1038,375 +974,111 @@ def post_process_json_with_inference(json_data, classifier):
1038
 
1039
 
1040
 
1041
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1042
- # page_num: int, fitz_page: fitz.Page,
1043
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1044
- # """
1045
- # OPTIMIZED FLOW:
1046
- # 1. Run YOLO to find Equations/Tables.
1047
- # 2. Mask raw text with YOLO boxes.
1048
- # 3. Run Column Detection on the MASKED data.
1049
- # 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1050
- # """
1051
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1052
-
1053
- # start_time_total = time.time()
1054
 
1055
- # if original_img is None:
1056
- # print(f" ❌ Invalid image for page {page_num}.")
1057
- # return None, None
1058
 
1059
- # # ====================================================================
1060
- # # --- STEP 1: YOLO DETECTION ---
1061
- # # ====================================================================
1062
- # start_time_yolo = time.time()
1063
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1064
 
1065
- # relevant_detections = []
1066
- # if results and results[0].boxes:
1067
- # for box in results[0].boxes:
1068
- # class_id = int(box.cls[0])
1069
- # class_name = model.names[class_id]
1070
- # if class_name in TARGET_CLASSES:
1071
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1072
- # relevant_detections.append(
1073
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1074
- # )
1075
 
1076
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1077
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
 
 
 
 
 
 
 
1078
 
1079
- # # ====================================================================
1080
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1081
- # # ====================================================================
1082
- # # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1083
- # raw_words_for_layout = get_word_data_for_detection(
1084
- # fitz_page, pdf_path, page_num,
1085
- # top_margin_percent=0.10, bottom_margin_percent=0.10
1086
- # )
1087
 
1088
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 
 
 
 
 
 
 
1089
 
1090
- # # ====================================================================
1091
- # # --- STEP 3: COLUMN DETECTION ---
1092
- # # ====================================================================
1093
- # page_width_pdf = fitz_page.rect.width
1094
- # page_height_pdf = fitz_page.rect.height
1095
 
1096
- # column_detection_params = {
1097
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1098
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1099
- # }
 
1100
 
1101
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
 
 
 
1102
 
1103
- # page_separator_x = None
1104
- # if separators:
1105
- # central_min = page_width_pdf * 0.35
1106
- # central_max = page_width_pdf * 0.65
1107
- # central_separators = [s for s in separators if central_min <= s <= central_max]
1108
 
1109
- # if central_separators:
1110
- # center_x = page_width_pdf / 2
1111
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1112
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1113
- # else:
1114
- # print(" ⚠️ Gutter found off-center. Ignoring.")
1115
- # else:
1116
- # print(" -> Single Column Layout Confirmed.")
1117
 
1118
- # # ====================================================================
1119
- # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1120
- # # ====================================================================
1121
- # start_time_components = time.time()
1122
- # component_metadata = []
1123
- # fig_count_page = 0
1124
- # eq_count_page = 0
 
1125
 
1126
- # for detection in merged_detections:
1127
- # x1, y1, x2, y2 = detection['coords']
1128
- # class_name = detection['class']
 
 
 
 
1129
 
1130
- # if class_name == 'figure':
1131
- # GLOBAL_FIGURE_COUNT += 1
1132
- # counter = GLOBAL_FIGURE_COUNT
1133
- # component_word = f"FIGURE{counter}"
1134
- # fig_count_page += 1
1135
- # elif class_name == 'equation':
1136
- # GLOBAL_EQUATION_COUNT += 1
1137
- # counter = GLOBAL_EQUATION_COUNT
1138
- # component_word = f"EQUATION{counter}"
1139
- # eq_count_page += 1
1140
- # else:
1141
- # continue
1142
 
1143
- # component_crop = original_img[y1:y2, x1:x2]
1144
- # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1145
- # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
 
 
 
 
 
 
 
 
 
1146
 
1147
- # y_midpoint = (y1 + y2) // 2
1148
- # component_metadata.append({
1149
- # 'type': class_name, 'word': component_word,
1150
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
1151
- # 'y0': int(y_midpoint), 'x0': int(x1)
1152
- # })
1153
-
1154
- # # ====================================================================
1155
- # # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1156
- # # ====================================================================
1157
- # raw_ocr_output = []
1158
- # scale_factor = 2.0 # Pipeline standard scale
1159
-
1160
- # try:
1161
- # # Try getting native text first
1162
- # # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1163
- # raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1164
- # except Exception as e:
1165
- # print(f" ❌ Native text extraction failed: {e}")
1166
-
1167
- # # If native text is missing, fall back to OCR
1168
- # if not raw_ocr_output:
1169
- # if _ocr_cache.has_ocr(pdf_path, page_num):
1170
- # print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
1171
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1172
- # for word_tuple in cached_word_data:
1173
- # word_text, x1, y1, x2, y2 = word_tuple
1174
-
1175
- # # Scale from PDF points to Pipeline Pixels (2.0)
1176
- # x1_pix = int(x1 * scale_factor)
1177
- # y1_pix = int(y1 * scale_factor)
1178
- # x2_pix = int(x2 * scale_factor)
1179
- # y2_pix = int(y2 * scale_factor)
1180
-
1181
- # raw_ocr_output.append({
1182
- # 'type': 'text', 'word': word_text, 'confidence': 95.0,
1183
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
1184
- # 'y0': y1_pix, 'x0': x1_pix
1185
- # })
1186
- # else:
1187
- # # === START OF OPTIMIZED OCR BLOCK ===
1188
- # try:
1189
- # # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1190
- # ocr_zoom = 4.0
1191
- # pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1192
-
1193
- # # Convert PyMuPDF Pixmap to OpenCV format
1194
- # img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1195
- # pix_ocr.n)
1196
- # if pix_ocr.n == 3:
1197
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1198
- # elif pix_ocr.n == 4:
1199
- # img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1200
-
1201
- # # 2. Preprocess (Binarization)
1202
- # processed_img = preprocess_image_for_ocr(img_ocr_np)
1203
-
1204
- # # 3. Run Tesseract with Optimized Configuration
1205
- # custom_config = r'--oem 3 --psm 6'
1206
-
1207
- # hocr_data = pytesseract.image_to_data(
1208
- # processed_img,
1209
- # output_type=pytesseract.Output.DICT,
1210
- # config=custom_config
1211
- # )
1212
-
1213
- # for i in range(len(hocr_data['level'])):
1214
- # text = hocr_data['text'][i] # Retrieve raw Tesseract text
1215
-
1216
- # # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1217
- # cleaned_text = sanitize_text(text).strip()
1218
-
1219
- # if cleaned_text and hocr_data['conf'][i] > -1:
1220
- # # 4. Coordinate Mapping
1221
- # scale_adjustment = scale_factor / ocr_zoom
1222
-
1223
- # x1 = int(hocr_data['left'][i] * scale_adjustment)
1224
- # y1 = int(hocr_data['top'][i] * scale_adjustment)
1225
- # w = int(hocr_data['width'][i] * scale_adjustment)
1226
- # h = int(hocr_data['height'][i] * scale_adjustment)
1227
- # x2 = x1 + w
1228
- # y2 = y1 + h
1229
-
1230
- # raw_ocr_output.append({
1231
- # 'type': 'text',
1232
- # 'word': cleaned_text, # Use the sanitized word
1233
- # 'confidence': float(hocr_data['conf'][i]),
1234
- # 'bbox': [x1, y1, x2, y2],
1235
- # 'y0': y1,
1236
- # 'x0': x1
1237
- # })
1238
- # except Exception as e:
1239
- # print(f" ❌ Tesseract OCR Error: {e}")
1240
- # # === END OF OPTIMIZED OCR BLOCK ===
1241
-
1242
- # # ====================================================================
1243
- # # --- STEP 6: OCR CLEANING AND MERGING ---
1244
- # # ====================================================================
1245
- # items_to_sort = []
1246
-
1247
- # for ocr_word in raw_ocr_output:
1248
- # is_suppressed = False
1249
- # for component in component_metadata:
1250
- # # Do not include words that are inside figure/equation boxes
1251
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1252
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1253
- # is_suppressed = True
1254
- # break
1255
- # if not is_suppressed:
1256
- # items_to_sort.append(ocr_word)
1257
-
1258
- # # Add figures/equations back into the flow as "words"
1259
- # items_to_sort.extend(component_metadata)
1260
-
1261
- # # ====================================================================
1262
- # # --- STEP 7: LINE-BASED SORTING ---
1263
- # # ====================================================================
1264
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1265
- # lines = []
1266
-
1267
- # for item in items_to_sort:
1268
- # placed = False
1269
- # for line in lines:
1270
- # y_ref = min(it['y0'] for it in line)
1271
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1272
- # line.append(item)
1273
- # placed = True
1274
- # break
1275
- # if not placed and item['type'] in ['equation', 'figure']:
1276
- # for line in lines:
1277
- # y_ref = min(it['y0'] for it in line)
1278
- # if abs(y_ref - item['y0']) < 20:
1279
- # line.append(item)
1280
- # placed = True
1281
- # break
1282
- # if not placed:
1283
- # lines.append([item])
1284
-
1285
- # for line in lines:
1286
- # line.sort(key=lambda x: x['x0'])
1287
-
1288
- # final_output = []
1289
- # for line in lines:
1290
- # for item in line:
1291
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1292
- # if 'tag' in item: data_item['tag'] = item['tag']
1293
- # final_output.append(data_item)
1294
-
1295
- # return final_output, page_separator_x
1296
-
1297
-
1298
-
1299
-
1300
-
1301
-
1302
-
1303
-
1304
-
1305
- def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1306
- page_num: int, fitz_page: fitz.Page,
1307
- pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
1308
- """
1309
- OPTIMIZED FLOW:
1310
- 1. Run YOLO to find Equations/Tables.
1311
- 2. Mask raw text with YOLO boxes.
1312
- 3. Run Column Detection on the MASKED data.
1313
- 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
1314
- """
1315
- global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1316
-
1317
- start_time_total = time.time()
1318
-
1319
- if original_img is None:
1320
- print(f" ❌ Invalid image for page {page_num}.")
1321
- return None, None
1322
-
1323
- # ====================================================================
1324
- # --- STEP 1: YOLO DETECTION ---
1325
- # ====================================================================
1326
- start_time_yolo = time.time()
1327
- results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
1328
-
1329
- relevant_detections = []
1330
- if results and results[0].boxes:
1331
- for box in results[0].boxes:
1332
- class_id = int(box.cls[0])
1333
- class_name = model.names[class_id]
1334
- if class_name in TARGET_CLASSES:
1335
- x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1336
- relevant_detections.append(
1337
- {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1338
- )
1339
-
1340
- merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1341
- print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
1342
-
1343
- # ====================================================================
1344
- # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1345
- # ====================================================================
1346
- # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1347
- raw_words_for_layout = get_word_data_for_detection(
1348
- fitz_page, pdf_path, page_num,
1349
- top_margin_percent=0.10, bottom_margin_percent=0.10
1350
- )
1351
-
1352
- masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
1353
-
1354
- # ====================================================================
1355
- # --- STEP 3: COLUMN DETECTION ---
1356
- # ====================================================================
1357
- page_width_pdf = fitz_page.rect.width
1358
- page_height_pdf = fitz_page.rect.height
1359
-
1360
- column_detection_params = {
1361
- 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1362
- 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1363
- }
1364
-
1365
- separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
1366
-
1367
- page_separator_x = None
1368
- if separators:
1369
- central_min = page_width_pdf * 0.35
1370
- central_max = page_width_pdf * 0.65
1371
- central_separators = [s for s in separators if central_min <= s <= central_max]
1372
-
1373
- if central_separators:
1374
- center_x = page_width_pdf / 2
1375
- page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1376
- print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1377
- else:
1378
- print(" ⚠️ Gutter found off-center. Ignoring.")
1379
- else:
1380
- print(" -> Single Column Layout Confirmed.")
1381
-
1382
- # ====================================================================
1383
- # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1384
- # ====================================================================
1385
- start_time_components = time.time()
1386
- component_metadata = []
1387
- fig_count_page = 0
1388
- eq_count_page = 0
1389
-
1390
- for detection in merged_detections:
1391
- x1, y1, x2, y2 = detection['coords']
1392
- class_name = detection['class']
1393
-
1394
- if class_name == 'figure':
1395
- GLOBAL_FIGURE_COUNT += 1
1396
- counter = GLOBAL_FIGURE_COUNT
1397
- component_word = f"FIGURE{counter}"
1398
- fig_count_page += 1
1399
- elif class_name == 'equation':
1400
- GLOBAL_EQUATION_COUNT += 1
1401
- counter = GLOBAL_EQUATION_COUNT
1402
- component_word = f"EQUATION{counter}"
1403
- eq_count_page += 1
1404
- else:
1405
- continue
1406
-
1407
- component_crop = original_img[y1:y2, x1:x2]
1408
- component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1409
- cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
1410
 
1411
  y_midpoint = (y1 + y2) // 2
1412
  component_metadata.append({
@@ -1419,10 +1091,11 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1419
  # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1420
  # ====================================================================
1421
  raw_ocr_output = []
1422
- scale_factor = 2.0 # Pipeline standard scale
1423
 
1424
  try:
1425
  # Try getting native text first
 
1426
  raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1427
  except Exception as e:
1428
  print(f" ❌ Native text extraction failed: {e}")
@@ -1434,13 +1107,13 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1434
  cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1435
  for word_tuple in cached_word_data:
1436
  word_text, x1, y1, x2, y2 = word_tuple
1437
-
1438
  # Scale from PDF points to Pipeline Pixels (2.0)
1439
  x1_pix = int(x1 * scale_factor)
1440
  y1_pix = int(y1 * scale_factor)
1441
  x2_pix = int(x2 * scale_factor)
1442
  y2_pix = int(y2 * scale_factor)
1443
-
1444
  raw_ocr_output.append({
1445
  'type': 'text', 'word': word_text, 'confidence': 95.0,
1446
  'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
@@ -1450,63 +1123,63 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1450
  # === START OF OPTIMIZED OCR BLOCK ===
1451
  try:
1452
  # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
1453
- # We do this specifically for OCR accuracy, separate from the pipeline image
1454
  ocr_zoom = 4.0
1455
  pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1456
-
1457
  # Convert PyMuPDF Pixmap to OpenCV format
1458
- img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
1459
- if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1460
- elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
 
 
 
1461
 
1462
  # 2. Preprocess (Binarization)
1463
- # Ensure 'preprocess_image_for_ocr' is defined at top of file!
1464
  processed_img = preprocess_image_for_ocr(img_ocr_np)
1465
-
1466
  # 3. Run Tesseract with Optimized Configuration
1467
- # --oem 3: Default LSTM engine
1468
- # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
1469
  custom_config = r'--oem 3 --psm 6'
1470
-
1471
  hocr_data = pytesseract.image_to_data(
1472
- processed_img,
1473
- output_type=pytesseract.Output.DICT,
1474
  config=custom_config
1475
  )
1476
-
1477
  for i in range(len(hocr_data['level'])):
1478
- text = hocr_data['text'][i].strip()
1479
- if text and hocr_data['conf'][i] > -1:
1480
-
 
 
 
1481
  # 4. Coordinate Mapping
1482
- # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
1483
- # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
1484
- scale_adjustment = scale_factor / ocr_zoom
1485
-
1486
  x1 = int(hocr_data['left'][i] * scale_adjustment)
1487
  y1 = int(hocr_data['top'][i] * scale_adjustment)
1488
  w = int(hocr_data['width'][i] * scale_adjustment)
1489
  h = int(hocr_data['height'][i] * scale_adjustment)
1490
  x2 = x1 + w
1491
  y2 = y1 + h
1492
-
1493
  raw_ocr_output.append({
1494
- 'type': 'text',
1495
- 'word': text,
1496
  'confidence': float(hocr_data['conf'][i]),
1497
- 'bbox': [x1, y1, x2, y2],
1498
- 'y0': y1,
1499
  'x0': x1
1500
  })
1501
  except Exception as e:
1502
  print(f" ❌ Tesseract OCR Error: {e}")
1503
  # === END OF OPTIMIZED OCR BLOCK ===
1504
-
1505
  # ====================================================================
1506
  # --- STEP 6: OCR CLEANING AND MERGING ---
1507
  # ====================================================================
1508
  items_to_sort = []
1509
-
1510
  for ocr_word in raw_ocr_output:
1511
  is_suppressed = False
1512
  for component in component_metadata:
@@ -1570,7 +1243,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1570
 
1571
 
1572
 
1573
-
1574
  # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1575
  # page_num: int, fitz_page: fitz.Page,
1576
  # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
@@ -1881,112 +1553,425 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
1881
  total_pages_processed = 0
1882
  mat = fitz.Matrix(2.0, 2.0)
1883
 
1884
- print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1885
 
1886
- for page_num_0_based in range(doc.page_count):
1887
- page_num = page_num_0_based + 1
1888
- print(f" -> Processing Page {page_num}/{doc.page_count}...")
1889
 
1890
- fitz_page = doc.load_page(page_num_0_based)
 
1891
 
1892
- try:
1893
- pix = fitz_page.get_pixmap(matrix=mat)
1894
- original_img = pixmap_to_numpy(pix)
1895
- except Exception as e:
1896
- print(f" ❌ Error converting page {page_num} to image: {e}")
1897
- continue
1898
 
1899
- final_output, page_separator_x = preprocess_and_ocr_page(
1900
- original_img,
1901
- model,
1902
- pdf_path,
1903
- page_num,
1904
- fitz_page,
1905
- pdf_name
1906
- )
1907
 
1908
- if final_output is not None:
1909
- page_data = {
1910
- "page_number": page_num,
1911
- "data": final_output,
1912
- "column_separator_x": page_separator_x
1913
- }
1914
- all_pages_data.append(page_data)
1915
- total_pages_processed += 1
1916
- else:
1917
- print(f" ❌ Skipped page {page_num} due to processing error.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1918
 
1919
- doc.close()
 
 
 
 
 
 
1920
 
1921
- if all_pages_data:
1922
- try:
1923
- with open(preprocessed_json_path, 'w') as f:
1924
- json.dump(all_pages_data, f, indent=4)
1925
- print(f"\n ✅ Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
1926
- except Exception as e:
1927
- print(f"❌ ERROR saving combined JSON output: {e}")
1928
- return None
1929
- else:
1930
- print("❌ WARNING: No page data generated. Halting pipeline.")
1931
- return None
1932
 
1933
- print("\n" + "=" * 80)
1934
- print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
1935
- print("=" * 80)
1936
 
1937
- return preprocessed_json_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1938
 
 
 
 
 
 
 
1939
 
1940
- # ============================================================================
1941
- # --- PHASE 2: LAYOUTLMV3 INFERENCE FUNCTIONS ---
1942
- # ============================================================================
 
 
1943
 
1944
- # class LayoutLMv3ForTokenClassification(nn.Module):
1945
- # def __init__(self, num_labels: int = NUM_LABELS):
1946
- # super().__init__()
1947
- # self.num_labels = num_labels
1948
- # config = LayoutLMv3Config.from_pretrained("microsoft/layoutlmv3-base", num_labels=num_labels)
1949
- # self.layoutlmv3 = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base", config=config)
1950
- # self.classifier = nn.Linear(config.hidden_size, num_labels)
1951
- # self.crf = CRF(num_labels)
1952
- # self.init_weights()
1953
-
1954
- # def init_weights(self):
1955
- # nn.init.xavier_uniform_(self.classifier.weight)
1956
- # if self.classifier.bias is not None: nn.init.zeros_(self.classifier.bias)
1957
-
1958
- # def forward(self, input_ids: torch.Tensor, bbox: torch.Tensor, attention_mask: torch.Tensor,
1959
- # labels: Optional[torch.Tensor] = None):
1960
- # outputs = self.layoutlmv3(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, return_dict=True)
1961
- # sequence_output = outputs.last_hidden_state
1962
- # emissions = self.classifier(sequence_output)
1963
- # mask = attention_mask.bool()
1964
- # if labels is not None:
1965
- # loss = -self.crf(emissions, labels, mask=mask).mean()
1966
- # return loss
1967
- # else:
1968
- # return self.crf.viterbi_decode(emissions, mask=mask)
1969
-
1970
-
1971
- # def _merge_integrity(all_token_data: List[Dict[str, Any]],
1972
- # column_separator_x: Optional[int]) -> List[List[Dict[str, Any]]]:
1973
- # """Splits the token data objects into column chunks based on a separator."""
1974
- # if column_separator_x is None:
1975
- # print(" -> No column separator. Treating as one chunk.")
1976
- # return [all_token_data]
1977
-
1978
- # left_column_tokens, right_column_tokens = [], []
1979
- # for token_data in all_token_data:
1980
- # bbox_raw = token_data['bbox_raw_pdf_space']
1981
- # center_x = (bbox_raw[0] + bbox_raw[2]) / 2
1982
- # if center_x < column_separator_x:
1983
- # left_column_tokens.append(token_data)
1984
- # else:
1985
- # right_column_tokens.append(token_data)
1986
 
1987
- # chunks = [c for c in [left_column_tokens, right_column_tokens] if c]
1988
- # print(f" -> Data split into {len(chunks)} column chunk(s) using separator X={column_separator_x}.")
1989
- # return chunks
1990
 
1991
 
1992
 
@@ -2067,6 +2052,20 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2067
  # "item_original_data": item
2068
  # })
2069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2070
  # if not all_token_data:
2071
  # continue
2072
 
@@ -2144,19 +2143,12 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2144
  # model_outputs = model(input_ids, bbox, attention_mask)
2145
 
2146
  # # --- Robust extraction: support several forward return types ---
2147
- # # We'll try (in order):
2148
- # # 1) model_outputs is (emissions_tensor, viterbi_list) -> use emissions for logits, keep decoded
2149
- # # 2) model_outputs has .logits attribute (HF ModelOutput)
2150
- # # 3) model_outputs is tuple/list containing a logits tensor
2151
- # # 4) model_outputs is a tensor (assume logits)
2152
- # # 5) model_outputs is a list-of-lists of ints (viterbi decoded) -> use that directly (no logits)
2153
  # logits_tensor = None
2154
  # decoded_labels_list = None
2155
 
2156
  # # case 1: tuple/list with (emissions, viterbi)
2157
  # if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
2158
  # a, b = model_outputs
2159
- # # a might be tensor (emissions), b might be viterbi list
2160
  # if isinstance(a, torch.Tensor):
2161
  # logits_tensor = a
2162
  # if isinstance(b, list):
@@ -2171,15 +2163,12 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2171
  # found_tensor = None
2172
  # for item in model_outputs:
2173
  # if isinstance(item, torch.Tensor):
2174
- # # prefer 3D (batch, seq, labels)
2175
  # if item.dim() == 3:
2176
  # logits_tensor = item
2177
  # break
2178
  # if found_tensor is None:
2179
  # found_tensor = item
2180
  # if logits_tensor is None and found_tensor is not None:
2181
- # # found_tensor may be (batch, seq, hidden) or (seq, hidden); we avoid guessing.
2182
- # # Keep found_tensor only if it matches num_labels dimension
2183
  # if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
2184
  # logits_tensor = found_tensor
2185
  # elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
@@ -2191,12 +2180,10 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2191
 
2192
  # # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
2193
  # if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
2194
- # # assume model_outputs is already viterbi decoded: List[List[int]] with batch dim first
2195
  # decoded_labels_list = model_outputs
2196
 
2197
  # # If neither logits nor decoded exist, that's fatal
2198
  # if logits_tensor is None and decoded_labels_list is None:
2199
- # # helpful debug info
2200
  # try:
2201
  # elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
2202
  # except Exception:
@@ -2205,32 +2192,25 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2205
 
2206
  # # If we have logits_tensor, normalize shape to [seq_len, num_labels]
2207
  # if logits_tensor is not None:
2208
- # # If shape is [B, L, C] with B==1, squeeze batch
2209
  # if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
2210
  # preds_tensor = logits_tensor.squeeze(0) # [L, C]
2211
  # else:
2212
  # preds_tensor = logits_tensor # possibly [L, C] already
2213
 
2214
- # # Safety: ensure we have at least seq_len x channels
2215
  # if preds_tensor.dim() != 2:
2216
- # # try to reshape or error
2217
  # raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
2218
- # # We'll use preds_tensor[token_idx] to argmax
2219
  # else:
2220
  # preds_tensor = None # no logits available
2221
 
2222
  # # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
2223
  # decoded_token_labels = None
2224
  # if decoded_labels_list is not None:
2225
- # # decoded_labels_list is batch-first; we used batch size 1
2226
- # # if multiple sequences returned, take first
2227
  # decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
2228
 
2229
  # # Now map token-level predictions -> word-level predictions using word_ids
2230
  # word_idx_to_pred_id = {}
2231
 
2232
  # if preds_tensor is not None:
2233
- # # We have logits. Use argmax of logits for each token id up to sequence_length
2234
  # for token_idx, word_idx in enumerate(word_ids):
2235
  # if token_idx >= sequence_length:
2236
  # break
@@ -2239,26 +2219,14 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2239
  # pred_id = torch.argmax(preds_tensor[token_idx]).item()
2240
  # word_idx_to_pred_id[word_idx] = pred_id
2241
  # else:
2242
- # # No logits, but we have decoded_token_labels from CRF (one label per token)
2243
- # # We'll align decoded_token_labels to token positions.
2244
  # if decoded_token_labels is None:
2245
- # # should not happen due to earlier checks
2246
  # raise RuntimeError("No logits and no decoded labels available for mapping.")
2247
- # # decoded_token_labels length may be equal to content_token_length (no special tokens)
2248
- # # or equal to sequence_length; try to align intelligently:
2249
- # # Prefer using decoded_token_labels aligned to the tokenizer tokens (starting at token 1 for CLS)
2250
- # # If decoded length == content_token_length, then manual_word_ids maps sub-token -> word idx for content tokens only.
2251
- # # We'll iterate tokens and pick label accordingly.
2252
- # # Build token_idx -> decoded_label mapping:
2253
- # # We'll assume decoded_token_labels correspond to content tokens (no CLS/SEP). If decoded length == sequence_length, then shift by 0.
2254
  # decoded_len = len(decoded_token_labels)
2255
- # # Heuristic: if decoded_len == content_token_length -> alignment starts at token_idx 1 (skip CLS)
2256
  # if decoded_len == content_token_length:
2257
  # decoded_start = 1
2258
  # elif decoded_len == sequence_length:
2259
  # decoded_start = 0
2260
  # else:
2261
- # # fallback: prefer decoded_start=1 (most common)
2262
  # decoded_start = 1
2263
 
2264
  # for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
@@ -2267,11 +2235,9 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2267
  # break
2268
  # if tok_idx >= sequence_length:
2269
  # break
2270
- # # map this token to a word index if present
2271
  # word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
2272
  # if word_idx is not None and word_idx < len(sub_words):
2273
  # if word_idx not in word_idx_to_pred_id:
2274
- # # label_id may already be an int
2275
  # word_idx_to_pred_id[word_idx] = int(label_id)
2276
 
2277
  # # Finally convert mapped word preds -> page_raw_predictions entries
@@ -2300,196 +2266,6 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
2300
  # return final_page_predictions
2301
 
2302
 
2303
- class LayoutLMv3ForTokenClassification(nn.Module):
2304
- def __init__(self, num_labels: int = NUM_LABELS):
2305
- super().__init__()
2306
- self.num_labels = num_labels
2307
- config = LayoutLMv3Config.from_pretrained("microsoft/layoutlmv3-base", num_labels=num_labels)
2308
- self.layoutlmv3 = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base", config=config)
2309
- self.classifier = nn.Linear(config.hidden_size, num_labels)
2310
- self.crf = CRF(num_labels)
2311
- self.init_weights()
2312
-
2313
- def init_weights(self):
2314
- nn.init.xavier_uniform_(self.classifier.weight)
2315
- if self.classifier.bias is not None: nn.init.zeros_(self.classifier.bias)
2316
-
2317
- def forward(self, input_ids: torch.Tensor, bbox: torch.Tensor, attention_mask: torch.Tensor, labels: Optional[torch.Tensor] = None):
2318
- outputs = self.layoutlmv3(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, return_dict=True)
2319
- sequence_output = outputs.last_hidden_state
2320
- emissions = self.classifier(sequence_output)
2321
- mask = attention_mask.bool()
2322
- if labels is not None:
2323
- loss = -self.crf(emissions, labels, mask=mask).mean()
2324
- return loss
2325
- else:
2326
- return self.crf.viterbi_decode(emissions, mask=mask)
2327
-
2328
- def _merge_integrity(all_token_data: List[Dict[str, Any]],
2329
- column_separator_x: Optional[int]) -> List[List[Dict[str, Any]]]:
2330
- """Splits the token data objects into column chunks based on a separator."""
2331
- if column_separator_x is None:
2332
- print(" -> No column separator. Treating as one chunk.")
2333
- return [all_token_data]
2334
-
2335
- left_column_tokens, right_column_tokens = [], []
2336
- for token_data in all_token_data:
2337
- bbox_raw = token_data['bbox_raw_pdf_space']
2338
- center_x = (bbox_raw[0] + bbox_raw[2]) / 2
2339
- if center_x < column_separator_x:
2340
- left_column_tokens.append(token_data)
2341
- else:
2342
- right_column_tokens.append(token_data)
2343
-
2344
- chunks = [c for c in [left_column_tokens, right_column_tokens] if c]
2345
- print(f" -> Data split into {len(chunks)} column chunk(s) using separator X={column_separator_x}.")
2346
- return chunks
2347
-
2348
- def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
2349
- preprocessed_json_path: str,
2350
- column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
2351
- print("\n" + "=" * 80)
2352
- print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
2353
- print("=" * 80)
2354
-
2355
- tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
2356
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2357
- print(f" -> Using device: {device}")
2358
-
2359
- try:
2360
- model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2361
- checkpoint = torch.load(model_path, map_location=device)
2362
- model_state = checkpoint.get('model_state_dict', checkpoint)
2363
- fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
2364
- model.load_state_dict(fixed_state_dict)
2365
- model.to(device)
2366
- model.eval()
2367
- print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
2368
- except Exception as e:
2369
- print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
2370
- return []
2371
-
2372
- try:
2373
- with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
2374
- preprocessed_data = json.load(f)
2375
- print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
2376
- except Exception:
2377
- print("❌ Error loading preprocessed JSON.")
2378
- return []
2379
-
2380
- try:
2381
- doc = fitz.open(pdf_path)
2382
- except Exception:
2383
- print("❌ Error loading PDF.")
2384
- return []
2385
-
2386
- final_page_predictions = []
2387
- CHUNK_SIZE = 500
2388
-
2389
- for page_data in preprocessed_data:
2390
- page_num_1_based = page_data['page_number']
2391
- page_num_0_based = page_num_1_based - 1
2392
- page_raw_predictions = []
2393
- print(f"\n *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")
2394
-
2395
- fitz_page = doc.load_page(page_num_0_based)
2396
- page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
2397
- print(f" -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")
2398
-
2399
- all_token_data = []
2400
- scale_factor = 2.0
2401
-
2402
- for item in page_data['data']:
2403
- raw_yolo_bbox = item['bbox']
2404
- bbox_pdf = [
2405
- int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
2406
- int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
2407
- ]
2408
- normalized_bbox = [
2409
- max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
2410
- max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
2411
- max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
2412
- max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
2413
- ]
2414
- all_token_data.append({
2415
- "word": item['word'],
2416
- "bbox_raw_pdf_space": bbox_pdf,
2417
- "bbox_normalized": normalized_bbox,
2418
- "item_original_data": item
2419
- })
2420
-
2421
- if not all_token_data: continue
2422
-
2423
- column_separator_x = page_data.get('column_separator_x', None)
2424
- if column_separator_x is not None:
2425
- print(f" -> Using SAVED column separator: X={column_separator_x}")
2426
- else:
2427
- print(" -> No column separator found. Assuming single chunk.")
2428
-
2429
- token_chunks = _merge_integrity(all_token_data, column_separator_x)
2430
- total_chunks = len(token_chunks)
2431
-
2432
- for chunk_idx, chunk_tokens in enumerate(token_chunks):
2433
- if not chunk_tokens: continue
2434
-
2435
- chunk_words = [t['word'] for t in chunk_tokens]
2436
- chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]
2437
-
2438
- total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
2439
- for i in range(0, len(chunk_words), CHUNK_SIZE):
2440
- sub_chunk_idx = i // CHUNK_SIZE + 1
2441
- sub_words = chunk_words[i:i + CHUNK_SIZE]
2442
- sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
2443
- sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]
2444
-
2445
- print(f" -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")
2446
-
2447
- encoded_input = tokenizer(
2448
- [sub_words], boxes=[sub_bboxes],is_split_into_words=True, truncation=True, padding="max_length",
2449
- max_length=512, return_tensors="pt"
2450
- )
2451
- input_ids = encoded_input['input_ids'].to(device)
2452
- bbox = encoded_input['bbox'].to(device)
2453
- attention_mask = encoded_input['attention_mask'].to(device)
2454
-
2455
- with torch.no_grad():
2456
- predictions_int_list = model(input_ids, bbox, attention_mask)
2457
-
2458
- if not predictions_int_list: continue
2459
- predictions_int = predictions_int_list[0]
2460
- word_ids = encoded_input.word_ids(batch_index=0)
2461
- word_idx_to_pred_id = {}
2462
-
2463
- for token_idx, word_idx in enumerate(word_ids):
2464
- if word_idx is not None and word_idx < len(sub_words):
2465
- if word_idx not in word_idx_to_pred_id:
2466
- word_idx_to_pred_id[word_idx] = predictions_int[token_idx]
2467
-
2468
- for current_word_idx in range(len(sub_words)):
2469
- pred_id_or_tensor = word_idx_to_pred_id.get(current_word_idx, 0)
2470
- pred_id = pred_id_or_tensor.item() if torch.is_tensor(pred_id_or_tensor) else pred_id_or_tensor
2471
- predicted_label = ID_TO_LABEL[pred_id]
2472
- original_token = sub_tokens_data[current_word_idx]
2473
- page_raw_predictions.append({
2474
- "word": original_token['word'],
2475
- "bbox": original_token['bbox_raw_pdf_space'],
2476
- "predicted_label": predicted_label,
2477
- "page_number": page_num_1_based
2478
- })
2479
-
2480
- if page_raw_predictions:
2481
- final_page_predictions.append({
2482
- "page_number": page_num_1_based,
2483
- "data": page_raw_predictions
2484
- })
2485
- print(f" *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")
2486
-
2487
- doc.close()
2488
- print("\n" + "=" * 80)
2489
- print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
2490
- print("=" * 80)
2491
- return final_page_predictions
2492
-
2493
 
2494
 
2495
 
 
23
 
24
  import torch.nn as nn
25
  from TorchCRF import CRF
26
+ # from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
27
+ from transformers import LayoutLMv3Tokenizer, LayoutLMv3Model, LayoutLMv3Config
28
  from typing import List, Dict, Any, Optional, Union, Tuple
29
  from ultralytics import YOLO
30
  import glob
 
75
 
76
  from typing import Optional
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def sanitize_text(text: Optional[str]) -> str:
79
+ """Removes surrogate characters and other invalid code points that cause UTF-8 encoding errors."""
80
  if not isinstance(text, str) or text is None:
81
  return ""
82
+
83
+ # Matches all surrogates (\ud800-\udfff) and common non-characters (\ufffe, \uffff).
84
+ # This specifically removes '\udefd' which is causing your error.
85
+ surrogates_and_nonchars = re.compile(r'[\ud800-\udfff\ufffe\uffff]')
86
+
87
+ # Replace the invalid characters with a standard space.
88
+ # We strip afterward in the calling function.
89
+ return surrogates_and_nonchars.sub(' ', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
 
 
717
 
718
 
719
 
720
+ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list:
721
+ # 1. Get raw data
722
+ try:
723
+ raw_word_data = fitz_page.get_text("words")
724
+ except Exception as e:
725
+ print(f" ❌ PyMuPDF extraction failed completely: {e}")
726
+ return []
727
 
728
+ # ==============================================================================
729
+ # --- DEBUGGING BLOCK: CHECK FIRST 50 NATIVE WORDS (SAFE PRINT) ---
730
+ # ==============================================================================
731
+ print(f"\n[DEBUG] Native Extraction (Page {fitz_page.number + 1}): Checking first 50 words...")
732
 
733
+ debug_count = 0
734
+ for item in raw_word_data:
735
+ if debug_count >= 50: break
736
 
737
+ word_text = item[4]
738
 
739
+ # --- SAFE PRINTING LOGIC ---
740
+ # We encode/decode to ignore surrogates just for the print statement
741
+ # This prevents the "UnicodeEncodeError" that was crashing your script
742
+ safe_text = word_text.encode('utf-8', 'ignore').decode('utf-8')
743
 
744
+ # Get hex codes (handling potential errors in 'ord')
745
+ try:
746
+ unicode_points = [f"\\u{ord(c):04x}" for c in word_text]
747
+ except:
748
+ unicode_points = ["ERROR"]
749
 
750
+ print(f" Word {debug_count}: '{safe_text}' -> Codes: {unicode_points}")
751
+ debug_count += 1
752
+ print("----------------------------------------------------------------------\n")
753
+ # ==============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
 
 
 
 
 
 
 
 
 
755
  converted_ocr_output = []
756
  DEFAULT_CONFIDENCE = 99.0
757
 
758
  for x1, y1, x2, y2, word, *rest in raw_word_data:
759
+ # --- FIX: ROBUST SANITIZATION ---
760
+ # 1. Encode to UTF-8 ignoring errors (strips surrogates)
761
+ # 2. Decode back to string
762
+ cleaned_word_bytes = word.encode('utf-8', 'ignore')
763
+ cleaned_word = cleaned_word_bytes.decode('utf-8')
764
+ cleaned_word = word.encode('utf-8', 'ignore').decode('utf-8').strip()
765
+
766
+ # cleaned_word = cleaned_word.strip()
767
+ if not cleaned_word: continue
768
+
769
  x1_pix = int(x1 * scale_factor)
770
  y1_pix = int(y1 * scale_factor)
771
  x2_pix = int(x2 * scale_factor)
772
  y2_pix = int(y2 * scale_factor)
773
+
774
  converted_ocr_output.append({
775
  'type': 'text',
776
+ 'word': cleaned_word,
777
  'confidence': DEFAULT_CONFIDENCE,
778
  'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
779
  'y0': y1_pix, 'x0': x1_pix
780
  })
781
+
782
  return converted_ocr_output
783
 
784
 
785
 
786
 
787
 
 
 
 
 
788
  #===================================================================================================
789
  #===================================================================================================
790
  #===================================================================================================
 
974
 
975
 
976
 
977
+ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
978
+ page_num: int, fitz_page: fitz.Page,
979
+ pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
980
+ """
981
+ OPTIMIZED FLOW:
982
+ 1. Run YOLO to find Equations/Tables.
983
+ 2. Mask raw text with YOLO boxes.
984
+ 3. Run Column Detection on the MASKED data.
985
+ 4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
986
+ """
987
+ global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 
 
988
 
989
+ start_time_total = time.time()
 
 
990
 
991
+ if original_img is None:
992
+ print(f" ❌ Invalid image for page {page_num}.")
993
+ return None, None
 
 
994
 
995
+ # ====================================================================
996
+ # --- STEP 1: YOLO DETECTION ---
997
+ # ====================================================================
998
+ start_time_yolo = time.time()
999
+ results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
 
 
 
 
 
1000
 
1001
+ relevant_detections = []
1002
+ if results and results[0].boxes:
1003
+ for box in results[0].boxes:
1004
+ class_id = int(box.cls[0])
1005
+ class_name = model.names[class_id]
1006
+ if class_name in TARGET_CLASSES:
1007
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
1008
+ relevant_detections.append(
1009
+ {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
1010
+ )
1011
 
1012
+ merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
1013
+ print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
 
 
 
 
 
1014
 
1015
+ # ====================================================================
1016
+ # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
1017
+ # ====================================================================
1018
+ # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
1019
+ raw_words_for_layout = get_word_data_for_detection(
1020
+ fitz_page, pdf_path, page_num,
1021
+ top_margin_percent=0.10, bottom_margin_percent=0.10
1022
+ )
1023
 
1024
+ masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 
 
 
 
1025
 
1026
+ # ====================================================================
1027
+ # --- STEP 3: COLUMN DETECTION ---
1028
+ # ====================================================================
1029
+ page_width_pdf = fitz_page.rect.width
1030
+ page_height_pdf = fitz_page.rect.height
1031
 
1032
+ column_detection_params = {
1033
+ 'cluster_bin_size': 2, 'cluster_smoothing': 2,
1034
+ 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
1035
+ }
1036
 
1037
+ separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
 
 
 
 
1038
 
1039
+ page_separator_x = None
1040
+ if separators:
1041
+ central_min = page_width_pdf * 0.35
1042
+ central_max = page_width_pdf * 0.65
1043
+ central_separators = [s for s in separators if central_min <= s <= central_max]
 
 
 
1044
 
1045
+ if central_separators:
1046
+ center_x = page_width_pdf / 2
1047
+ page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
1048
+ print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
1049
+ else:
1050
+ print(" ⚠️ Gutter found off-center. Ignoring.")
1051
+ else:
1052
+ print(" -> Single Column Layout Confirmed.")
1053
 
1054
+ # ====================================================================
1055
+ # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
1056
+ # ====================================================================
1057
+ start_time_components = time.time()
1058
+ component_metadata = []
1059
+ fig_count_page = 0
1060
+ eq_count_page = 0
1061
 
1062
+ for detection in merged_detections:
1063
+ x1, y1, x2, y2 = detection['coords']
1064
+ class_name = detection['class']
 
 
 
 
 
 
 
 
 
1065
 
1066
+ if class_name == 'figure':
1067
+ GLOBAL_FIGURE_COUNT += 1
1068
+ counter = GLOBAL_FIGURE_COUNT
1069
+ component_word = f"FIGURE{counter}"
1070
+ fig_count_page += 1
1071
+ elif class_name == 'equation':
1072
+ GLOBAL_EQUATION_COUNT += 1
1073
+ counter = GLOBAL_EQUATION_COUNT
1074
+ component_word = f"EQUATION{counter}"
1075
+ eq_count_page += 1
1076
+ else:
1077
+ continue
1078
 
1079
+ component_crop = original_img[y1:y2, x1:x2]
1080
+ component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
1081
+ cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1082
 
1083
  y_midpoint = (y1 + y2) // 2
1084
  component_metadata.append({
 
1091
  # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
1092
  # ====================================================================
1093
  raw_ocr_output = []
1094
+ scale_factor = 2.0 # Pipeline standard scale
1095
 
1096
  try:
1097
  # Try getting native text first
1098
+ # NOTE: extract_native_words_and_convert MUST ALSO BE UPDATED TO USE sanitize_text
1099
  raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
1100
  except Exception as e:
1101
  print(f" ❌ Native text extraction failed: {e}")
 
1107
  cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
1108
  for word_tuple in cached_word_data:
1109
  word_text, x1, y1, x2, y2 = word_tuple
1110
+
1111
  # Scale from PDF points to Pipeline Pixels (2.0)
1112
  x1_pix = int(x1 * scale_factor)
1113
  y1_pix = int(y1 * scale_factor)
1114
  x2_pix = int(x2 * scale_factor)
1115
  y2_pix = int(y2 * scale_factor)
1116
+
1117
  raw_ocr_output.append({
1118
  'type': 'text', 'word': word_text, 'confidence': 95.0,
1119
  'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
 
1123
  # === START OF OPTIMIZED OCR BLOCK ===
1124
  try:
1125
  # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
 
1126
  ocr_zoom = 4.0
1127
  pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
1128
+
1129
  # Convert PyMuPDF Pixmap to OpenCV format
1130
+ img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
1131
+ pix_ocr.n)
1132
+ if pix_ocr.n == 3:
1133
+ img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
1134
+ elif pix_ocr.n == 4:
1135
+ img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
1136
 
1137
  # 2. Preprocess (Binarization)
 
1138
  processed_img = preprocess_image_for_ocr(img_ocr_np)
1139
+
1140
  # 3. Run Tesseract with Optimized Configuration
 
 
1141
  custom_config = r'--oem 3 --psm 6'
1142
+
1143
  hocr_data = pytesseract.image_to_data(
1144
+ processed_img,
1145
+ output_type=pytesseract.Output.DICT,
1146
  config=custom_config
1147
  )
1148
+
1149
  for i in range(len(hocr_data['level'])):
1150
+ text = hocr_data['text'][i] # Retrieve raw Tesseract text
1151
+
1152
+ # --- FIX: SANITIZE TEXT AND THEN STRIP ---
1153
+ cleaned_text = sanitize_text(text).strip()
1154
+
1155
+ if cleaned_text and hocr_data['conf'][i] > -1:
1156
  # 4. Coordinate Mapping
1157
+ scale_adjustment = scale_factor / ocr_zoom
1158
+
 
 
1159
  x1 = int(hocr_data['left'][i] * scale_adjustment)
1160
  y1 = int(hocr_data['top'][i] * scale_adjustment)
1161
  w = int(hocr_data['width'][i] * scale_adjustment)
1162
  h = int(hocr_data['height'][i] * scale_adjustment)
1163
  x2 = x1 + w
1164
  y2 = y1 + h
1165
+
1166
  raw_ocr_output.append({
1167
+ 'type': 'text',
1168
+ 'word': cleaned_text, # Use the sanitized word
1169
  'confidence': float(hocr_data['conf'][i]),
1170
+ 'bbox': [x1, y1, x2, y2],
1171
+ 'y0': y1,
1172
  'x0': x1
1173
  })
1174
  except Exception as e:
1175
  print(f" ❌ Tesseract OCR Error: {e}")
1176
  # === END OF OPTIMIZED OCR BLOCK ===
1177
+
1178
  # ====================================================================
1179
  # --- STEP 6: OCR CLEANING AND MERGING ---
1180
  # ====================================================================
1181
  items_to_sort = []
1182
+
1183
  for ocr_word in raw_ocr_output:
1184
  is_suppressed = False
1185
  for component in component_metadata:
 
1243
 
1244
 
1245
 
 
1246
  # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1247
  # page_num: int, fitz_page: fitz.Page,
1248
  # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
 
1553
  total_pages_processed = 0
1554
  mat = fitz.Matrix(2.0, 2.0)
1555
 
1556
+ print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
1557
+
1558
+ for page_num_0_based in range(doc.page_count):
1559
+ page_num = page_num_0_based + 1
1560
+ print(f" -> Processing Page {page_num}/{doc.page_count}...")
1561
+
1562
+ fitz_page = doc.load_page(page_num_0_based)
1563
+
1564
+ try:
1565
+ pix = fitz_page.get_pixmap(matrix=mat)
1566
+ original_img = pixmap_to_numpy(pix)
1567
+ except Exception as e:
1568
+ print(f" ❌ Error converting page {page_num} to image: {e}")
1569
+ continue
1570
+
1571
+ final_output, page_separator_x = preprocess_and_ocr_page(
1572
+ original_img,
1573
+ model,
1574
+ pdf_path,
1575
+ page_num,
1576
+ fitz_page,
1577
+ pdf_name
1578
+ )
1579
+
1580
+ if final_output is not None:
1581
+ page_data = {
1582
+ "page_number": page_num,
1583
+ "data": final_output,
1584
+ "column_separator_x": page_separator_x
1585
+ }
1586
+ all_pages_data.append(page_data)
1587
+ total_pages_processed += 1
1588
+ else:
1589
+ print(f" ❌ Skipped page {page_num} due to processing error.")
1590
+
1591
+ doc.close()
1592
+
1593
+ if all_pages_data:
1594
+ try:
1595
+ with open(preprocessed_json_path, 'w') as f:
1596
+ json.dump(all_pages_data, f, indent=4)
1597
+ print(f"\n ✅ Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
1598
+ except Exception as e:
1599
+ print(f"❌ ERROR saving combined JSON output: {e}")
1600
+ return None
1601
+ else:
1602
+ print("❌ WARNING: No page data generated. Halting pipeline.")
1603
+ return None
1604
+
1605
+ print("\n" + "=" * 80)
1606
+ print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
1607
+ print("=" * 80)
1608
+
1609
+ return preprocessed_json_path
1610
+
1611
+
1612
+ # ============================================================================
1613
+ # --- PHASE 2: LAYOUTLMV3 INFERENCE FUNCTIONS ---
1614
+ # ============================================================================
1615
+
1616
+ class LayoutLMv3ForTokenClassification(nn.Module):
1617
+ def __init__(self, num_labels: int = NUM_LABELS):
1618
+ super().__init__()
1619
+ self.num_labels = num_labels
1620
+ config = LayoutLMv3Config.from_pretrained("microsoft/layoutlmv3-base", num_labels=num_labels)
1621
+ self.layoutlmv3 = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base", config=config)
1622
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
1623
+ self.crf = CRF(num_labels)
1624
+ self.init_weights()
1625
+
1626
+ def init_weights(self):
1627
+ nn.init.xavier_uniform_(self.classifier.weight)
1628
+ if self.classifier.bias is not None: nn.init.zeros_(self.classifier.bias)
1629
+
1630
+ def forward(self, input_ids: torch.Tensor, bbox: torch.Tensor, attention_mask: torch.Tensor,
1631
+ labels: Optional[torch.Tensor] = None):
1632
+ outputs = self.layoutlmv3(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, return_dict=True)
1633
+ sequence_output = outputs.last_hidden_state
1634
+ emissions = self.classifier(sequence_output)
1635
+ mask = attention_mask.bool()
1636
+ if labels is not None:
1637
+ loss = -self.crf(emissions, labels, mask=mask).mean()
1638
+ return loss
1639
+ else:
1640
+ return self.crf.viterbi_decode(emissions, mask=mask)
1641
+
1642
+
1643
+ def _merge_integrity(all_token_data: List[Dict[str, Any]],
1644
+ column_separator_x: Optional[int]) -> List[List[Dict[str, Any]]]:
1645
+ """Splits the token data objects into column chunks based on a separator."""
1646
+ if column_separator_x is None:
1647
+ print(" -> No column separator. Treating as one chunk.")
1648
+ return [all_token_data]
1649
+
1650
+ left_column_tokens, right_column_tokens = [], []
1651
+ for token_data in all_token_data:
1652
+ bbox_raw = token_data['bbox_raw_pdf_space']
1653
+ center_x = (bbox_raw[0] + bbox_raw[2]) / 2
1654
+ if center_x < column_separator_x:
1655
+ left_column_tokens.append(token_data)
1656
+ else:
1657
+ right_column_tokens.append(token_data)
1658
+
1659
+ chunks = [c for c in [left_column_tokens, right_column_tokens] if c]
1660
+ print(f" -> Data split into {len(chunks)} column chunk(s) using separator X={column_separator_x}.")
1661
+ return chunks
1662
+
1663
+
1664
+
1665
+
1666
+
1667
+
1668
def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
                                    preprocessed_json_path: str,
                                    column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
    """Run the LayoutLMv3+CRF model over preprocessed YOLO word tokens.

    Args:
        pdf_path: Source PDF; opened with fitz only to read page dimensions.
        model_path: Checkpoint file (either a raw state_dict or a dict with a
            'model_state_dict' key) saved with the legacy 'layoutlm.' prefix.
        preprocessed_json_path: JSON list of pages, each
            {'page_number': int (1-based), 'data': [{'word', 'bbox', ...}],
             'column_separator_x': optional int}. Bboxes are in image space at
            2x PDF-point scale (see `scale_factor`).
        column_detection_params: Unused; kept for backward-compatible interface.

    Returns:
        List of {'page_number', 'data': [{'word', 'bbox', 'predicted_label',
        'page_number'}]} entries, or [] on any fatal loading error.
    """
    print("\n" + "=" * 80)
    print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE (Raw Word Output) ---")
    print("=" * 80)

    # BUG FIX: this file imports LayoutLMv3TokenizerFast (the slow-tokenizer
    # import is commented out at the top of the file), but this call previously
    # referenced the undefined name `LayoutLMv3Tokenizer`, which raised a
    # NameError the moment inference started.
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" -> Using device: {device}")

    try:
        model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
        checkpoint = torch.load(model_path, map_location=device)
        model_state = checkpoint.get('model_state_dict', checkpoint)
        # Checkpoints were saved when the backbone attribute was 'layoutlm';
        # rename keys to match the current attribute name 'layoutlmv3'.
        fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
        model.load_state_dict(fixed_state_dict)
        model.to(device)
        model.eval()
        print(f"✅ LayoutLMv3 Model loaded successfully from {os.path.basename(model_path)}.")
    except Exception as e:
        print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
        return []

    try:
        with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
            preprocessed_data = json.load(f)
        print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
    except Exception:
        print("❌ Error loading preprocessed JSON.")
        return []

    try:
        doc = fitz.open(pdf_path)
    except Exception:
        print("❌ Error loading PDF.")
        return []

    final_page_predictions = []
    CHUNK_SIZE = 500  # words per model pass (before the tokenizer's 512-token truncation)

    for page_data in preprocessed_data:
        page_num_1_based = page_data['page_number']
        page_num_0_based = page_num_1_based - 1
        page_raw_predictions = []
        print(f"\n *** Processing Page {page_num_1_based} ({len(page_data['data'])} raw tokens) ***")

        fitz_page = doc.load_page(page_num_0_based)
        page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
        print(f" -> Page dimensions: {page_width:.0f}x{page_height:.0f} (PDF points).")

        all_token_data = []
        # YOLO boxes come from a 2x-rendered page image; divide to get PDF points.
        scale_factor = 2.0

        for item in page_data['data']:
            raw_yolo_bbox = item['bbox']
            bbox_pdf = [
                int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
                int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
            ]
            # LayoutLMv3 expects boxes normalized to a 0-1000 grid, clamped in range.
            normalized_bbox = [
                max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
                max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
                max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
                max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
            ]
            all_token_data.append({
                "word": item['word'],
                "bbox_raw_pdf_space": bbox_pdf,
                "bbox_normalized": normalized_bbox,
                "item_original_data": item
            })

        if not all_token_data:
            continue

        column_separator_x = page_data.get('column_separator_x', None)
        if column_separator_x is not None:
            print(f" -> Using SAVED column separator: X={column_separator_x}")
        else:
            print(" -> No column separator found. Assuming single chunk.")

        token_chunks = _merge_integrity(all_token_data, column_separator_x)
        total_chunks = len(token_chunks)

        for chunk_idx, chunk_tokens in enumerate(token_chunks):
            if not chunk_tokens: continue

            # 1. Sanitize: stringify and strip bytes that cannot round-trip UTF-8
            #    (lone surrogates etc. crash the tokenizer otherwise).
            chunk_words = [
                str(t['word']).encode('utf-8', errors='ignore').decode('utf-8')
                for t in chunk_tokens
            ]
            chunk_normalized_bboxes = [t['bbox_normalized'] for t in chunk_tokens]

            total_sub_chunks = (len(chunk_words) + CHUNK_SIZE - 1) // CHUNK_SIZE
            for i in range(0, len(chunk_words), CHUNK_SIZE):
                sub_chunk_idx = i // CHUNK_SIZE + 1
                sub_words = chunk_words[i:i + CHUNK_SIZE]
                sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
                sub_tokens_data = chunk_tokens[i:i + CHUNK_SIZE]

                print(f" -> Chunk {chunk_idx + 1}/{total_chunks}, Sub-chunk {sub_chunk_idx}/{total_sub_chunks}: {len(sub_words)} words. Running Inference...")

                # 2. Manual word_ids: one entry per sub-token, pointing at the
                #    originating word index. NOTE(review): a fast tokenizer also
                #    exposes encoded_input.word_ids(); the manual path is kept so
                #    the mapping is identical regardless of tokenizer backend.
                manual_word_ids = []
                for current_word_idx, word in enumerate(sub_words):
                    for _ in tokenizer.tokenize(word):
                        manual_word_ids.append(current_word_idx)

                encoded_input = tokenizer(
                    sub_words,
                    boxes=sub_bboxes,
                    truncation=True,
                    padding="max_length",
                    max_length=512,
                    is_split_into_words=True,
                    return_tensors="pt"
                )

                # Check for empty sequence
                if encoded_input['input_ids'].shape[0] == 0:
                    print(f" -> Warning: Sub-chunk {sub_chunk_idx} encoded to an empty sequence. Skipping.")
                    continue

                # 3. Rebuild the 512-slot word_ids array: [CLS]=None, content
                #    sub-tokens, [SEP]=None, padding=None.
                sequence_length = int(torch.sum(encoded_input['attention_mask']).item())
                content_token_length = max(0, sequence_length - 2)

                manual_word_ids = manual_word_ids[:content_token_length]

                final_word_ids = [None]  # CLS token (index 0)
                final_word_ids.extend(manual_word_ids)

                if sequence_length > 1:
                    final_word_ids.append(None)  # SEP token

                final_word_ids.extend([None] * (512 - len(final_word_ids)))
                word_ids = final_word_ids[:512]  # Final array for mapping

                # Inputs are already batched by the tokenizer as [1, 512]
                input_ids = encoded_input['input_ids'].to(device)
                bbox = encoded_input['bbox'].to(device)
                attention_mask = encoded_input['attention_mask'].to(device)

                with torch.no_grad():
                    model_outputs = model(input_ids, bbox, attention_mask)

                # --- Robust extraction: support several forward return types ---
                # Tried in order:
                #   1) (emissions_tensor, viterbi_list) pair
                #   2) HF ModelOutput with a .logits attribute
                #   3) tuple/list containing a logits tensor
                #   4) bare tensor (assumed logits)
                #   5) list-of-lists of ints (CRF viterbi decoding, no logits)
                logits_tensor = None
                decoded_labels_list = None

                # case 1: tuple/list with (emissions, viterbi)
                if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
                    a, b = model_outputs
                    if isinstance(a, torch.Tensor):
                        logits_tensor = a
                    if isinstance(b, list):
                        decoded_labels_list = b

                # case 2: HF ModelOutput with .logits
                if logits_tensor is None and hasattr(model_outputs, 'logits') and isinstance(model_outputs.logits, torch.Tensor):
                    logits_tensor = model_outputs.logits

                # case 3: tuple/list - search for a 3D tensor (B, L, C)
                if logits_tensor is None and isinstance(model_outputs, (tuple, list)):
                    found_tensor = None
                    for item in model_outputs:
                        if isinstance(item, torch.Tensor):
                            # prefer 3D (batch, seq, labels)
                            if item.dim() == 3:
                                logits_tensor = item
                                break
                            if found_tensor is None:
                                found_tensor = item
                    if logits_tensor is None and found_tensor is not None:
                        # Only accept the fallback tensor if its trailing dim
                        # matches the label count — avoids mistaking hidden states.
                        if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
                            logits_tensor = found_tensor
                        elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
                            logits_tensor = found_tensor.unsqueeze(0)

                # case 4: model_outputs directly a tensor
                if logits_tensor is None and isinstance(model_outputs, torch.Tensor):
                    logits_tensor = model_outputs

                # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
                if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
                    decoded_labels_list = model_outputs

                # If neither logits nor decoded exist, that's fatal
                if logits_tensor is None and decoded_labels_list is None:
                    try:
                        elem_shapes = [(type(x), getattr(x, 'shape', None)) for x in model_outputs] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
                    except Exception:
                        elem_shapes = str(type(model_outputs))
                    raise RuntimeError(f"Model output of type {type(model_outputs)} did not contain a valid logits tensor or decoded viterbi. Contents: {elem_shapes}")

                # Normalize logits to [seq_len, num_labels] when present.
                if logits_tensor is not None:
                    if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
                        preds_tensor = logits_tensor.squeeze(0)  # [L, C]
                    else:
                        preds_tensor = logits_tensor  # possibly [L, C] already

                    if preds_tensor.dim() != 2:
                        raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
                else:
                    preds_tensor = None  # no logits available

                # If decoded labels provided, take the first (batch size is 1).
                decoded_token_labels = None
                if decoded_labels_list is not None:
                    decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list

                # Map token-level predictions -> word-level predictions via word_ids,
                # keeping the FIRST sub-token's prediction for each word.
                word_idx_to_pred_id = {}

                if preds_tensor is not None:
                    # Argmax of logits for each token up to the real sequence length.
                    for token_idx, word_idx in enumerate(word_ids):
                        if token_idx >= sequence_length:
                            break
                        if word_idx is not None and word_idx < len(sub_words):
                            if word_idx not in word_idx_to_pred_id:
                                pred_id = torch.argmax(preds_tensor[token_idx]).item()
                                word_idx_to_pred_id[word_idx] = pred_id
                else:
                    # No logits; align the CRF-decoded labels to token positions.
                    if decoded_token_labels is None:
                        # should not happen due to earlier checks
                        raise RuntimeError("No logits and no decoded labels available for mapping.")
                    # Heuristic alignment: decoded labels may cover only content
                    # tokens (start at token 1, after CLS) or the full sequence.
                    decoded_len = len(decoded_token_labels)
                    if decoded_len == content_token_length:
                        decoded_start = 1
                    elif decoded_len == sequence_length:
                        decoded_start = 0
                    else:
                        # fallback: prefer decoded_start=1 (most common)
                        decoded_start = 1

                    for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
                        tok_idx = decoded_start + tok_idx_in_decoded
                        if tok_idx >= 512:
                            break
                        if tok_idx >= sequence_length:
                            break
                        word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
                        if word_idx is not None and word_idx < len(sub_words):
                            if word_idx not in word_idx_to_pred_id:
                                word_idx_to_pred_id[word_idx] = int(label_id)

                # Finally convert mapped word preds -> page_raw_predictions entries.
                # Words truncated away by the 512-token limit default to label 0.
                for current_word_idx in range(len(sub_words)):
                    pred_id = word_idx_to_pred_id.get(current_word_idx, 0)
                    predicted_label = ID_TO_LABEL[pred_id]
                    original_token = sub_tokens_data[current_word_idx]
                    page_raw_predictions.append({
                        "word": original_token['word'],
                        "bbox": original_token['bbox_raw_pdf_space'],
                        "predicted_label": predicted_label,
                        "page_number": page_num_1_based
                    })

        if page_raw_predictions:
            final_page_predictions.append({
                "page_number": page_num_1_based,
                "data": page_raw_predictions
            })
            print(f" *** Page {page_num_1_based} Finalized: {len(page_raw_predictions)} labeled words. ***")

    doc.close()
    print("\n" + "=" * 80)
    print("--- LAYOUTLMV3 INFERENCE COMPLETE ---")
    print("=" * 80)
    return final_page_predictions
1973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1974
 
 
 
 
1975
 
1976
 
1977
 
 
2052
  # "item_original_data": item
2053
  # })
2054
 
2055
+ # # ==============================================================================
2056
+ # # --- DEBUGGING BLOCK: CHECK FIRST 50 TOKENS BEFORE INFERENCE ---
2057
+ # # ==============================================================================
2058
+ # print(f"\n[DEBUG] LayoutLMv3 Input (Page {page_num_1_based}): Checking first 50 tokens...")
2059
+ # debug_count = 0
2060
+ # for t in all_token_data:
2061
+ # if debug_count >= 50: break
2062
+ # w = t['word']
2063
+ # unicode_points = [f"\\u{ord(c):04x}" for c in w]
2064
+ # print(f" Token {debug_count}: '{w}' -> Codes: {unicode_points}")
2065
+ # debug_count += 1
2066
+ # print("----------------------------------------------------------------------\n")
2067
+ # # ==============================================================================
2068
+
2069
  # if not all_token_data:
2070
  # continue
2071
 
 
2143
  # model_outputs = model(input_ids, bbox, attention_mask)
2144
 
2145
  # # --- Robust extraction: support several forward return types ---
 
 
 
 
 
 
2146
  # logits_tensor = None
2147
  # decoded_labels_list = None
2148
 
2149
  # # case 1: tuple/list with (emissions, viterbi)
2150
  # if isinstance(model_outputs, (tuple, list)) and len(model_outputs) == 2:
2151
  # a, b = model_outputs
 
2152
  # if isinstance(a, torch.Tensor):
2153
  # logits_tensor = a
2154
  # if isinstance(b, list):
 
2163
  # found_tensor = None
2164
  # for item in model_outputs:
2165
  # if isinstance(item, torch.Tensor):
 
2166
  # if item.dim() == 3:
2167
  # logits_tensor = item
2168
  # break
2169
  # if found_tensor is None:
2170
  # found_tensor = item
2171
  # if logits_tensor is None and found_tensor is not None:
 
 
2172
  # if found_tensor.dim() == 3 and found_tensor.shape[-1] == NUM_LABELS:
2173
  # logits_tensor = found_tensor
2174
  # elif found_tensor.dim() == 2 and found_tensor.shape[-1] == NUM_LABELS:
 
2180
 
2181
  # # case 5: model_outputs is a decoded viterbi list (common for CRF-only forward)
2182
  # if decoded_labels_list is None and isinstance(model_outputs, list) and model_outputs and isinstance(model_outputs[0], list):
 
2183
  # decoded_labels_list = model_outputs
2184
 
2185
  # # If neither logits nor decoded exist, that's fatal
2186
  # if logits_tensor is None and decoded_labels_list is None:
 
2187
  # try:
2188
  # elem_shapes = [ (type(x), getattr(x, 'shape', None)) for x in model_outputs ] if isinstance(model_outputs, (list, tuple)) else [(type(model_outputs), getattr(model_outputs, 'shape', None))]
2189
  # except Exception:
 
2192
 
2193
  # # If we have logits_tensor, normalize shape to [seq_len, num_labels]
2194
  # if logits_tensor is not None:
 
2195
  # if logits_tensor.dim() == 3 and logits_tensor.shape[0] == 1:
2196
  # preds_tensor = logits_tensor.squeeze(0) # [L, C]
2197
  # else:
2198
  # preds_tensor = logits_tensor # possibly [L, C] already
2199
 
 
2200
  # if preds_tensor.dim() != 2:
 
2201
  # raise RuntimeError(f"Unexpected logits tensor shape: {tuple(preds_tensor.shape)}")
 
2202
  # else:
2203
  # preds_tensor = None # no logits available
2204
 
2205
  # # If decoded labels provided, make a token-level list-of-ints aligned to tokenizer tokens
2206
  # decoded_token_labels = None
2207
  # if decoded_labels_list is not None:
 
 
2208
  # decoded_token_labels = decoded_labels_list[0] if isinstance(decoded_labels_list[0], list) else decoded_labels_list
2209
 
2210
  # # Now map token-level predictions -> word-level predictions using word_ids
2211
  # word_idx_to_pred_id = {}
2212
 
2213
  # if preds_tensor is not None:
 
2214
  # for token_idx, word_idx in enumerate(word_ids):
2215
  # if token_idx >= sequence_length:
2216
  # break
 
2219
  # pred_id = torch.argmax(preds_tensor[token_idx]).item()
2220
  # word_idx_to_pred_id[word_idx] = pred_id
2221
  # else:
 
 
2222
  # if decoded_token_labels is None:
 
2223
  # raise RuntimeError("No logits and no decoded labels available for mapping.")
 
 
 
 
 
 
 
2224
  # decoded_len = len(decoded_token_labels)
 
2225
  # if decoded_len == content_token_length:
2226
  # decoded_start = 1
2227
  # elif decoded_len == sequence_length:
2228
  # decoded_start = 0
2229
  # else:
 
2230
  # decoded_start = 1
2231
 
2232
  # for tok_idx_in_decoded, label_id in enumerate(decoded_token_labels):
 
2235
  # break
2236
  # if tok_idx >= sequence_length:
2237
  # break
 
2238
  # word_idx = word_ids[tok_idx] if tok_idx < len(word_ids) else None
2239
  # if word_idx is not None and word_idx < len(sub_words):
2240
  # if word_idx not in word_idx_to_pred_id:
 
2241
  # word_idx_to_pred_id[word_idx] = int(label_id)
2242
 
2243
  # # Finally convert mapped word preds -> page_raw_predictions entries
 
2266
  # return final_page_predictions
2267
 
2268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2269
 
2270
 
2271