Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +3 -667
working_yolo_pipeline.py
CHANGED
|
@@ -536,183 +536,6 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
|
|
| 536 |
return sorted(final_separators)
|
| 537 |
|
| 538 |
#======================================================================================================================================
|
| 539 |
-
# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
|
| 540 |
-
# top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
|
| 541 |
-
# """Extract word data with OCR caching to avoid redundant Tesseract runs."""
|
| 542 |
-
# word_data = page.get_text("words")
|
| 543 |
-
|
| 544 |
-
# if len(word_data) > 0:
|
| 545 |
-
# word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
|
| 546 |
-
# else:
|
| 547 |
-
# if _ocr_cache.has_ocr(pdf_path, page_num):
|
| 548 |
-
# word_data = _ocr_cache.get_ocr(pdf_path, page_num)
|
| 549 |
-
# else:
|
| 550 |
-
# try:
|
| 551 |
-
# # --- OPTIMIZATION START ---
|
| 552 |
-
# # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
|
| 553 |
-
# zoom_level = 4.0
|
| 554 |
-
# pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
|
| 555 |
-
|
| 556 |
-
# # 2. Convert directly to OpenCV format (Faster than PIL)
|
| 557 |
-
# img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
|
| 558 |
-
# if pix.n == 3:
|
| 559 |
-
# img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
|
| 560 |
-
# elif pix.n == 4:
|
| 561 |
-
# img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
|
| 562 |
-
|
| 563 |
-
# # 3. Apply Preprocessing (Thresholding)
|
| 564 |
-
# processed_img = preprocess_image_for_ocr(img_np)
|
| 565 |
-
|
| 566 |
-
# # 4. Optimized Tesseract Config
|
| 567 |
-
# # --psm 6: Assume a single uniform block of text (Great for columns/questions)
|
| 568 |
-
# # --oem 3: Default engine (LSTM)
|
| 569 |
-
# custom_config = r'--oem 3 --psm 6'
|
| 570 |
-
|
| 571 |
-
# data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
|
| 572 |
-
# config=custom_config)
|
| 573 |
-
|
| 574 |
-
# full_word_data = []
|
| 575 |
-
# for i in range(len(data['level'])):
|
| 576 |
-
# text = data['text'][i].strip()
|
| 577 |
-
# if text:
|
| 578 |
-
# # Scale coordinates back to PDF points
|
| 579 |
-
# x1 = data['left'][i] / zoom_level
|
| 580 |
-
# y1 = data['top'][i] / zoom_level
|
| 581 |
-
# x2 = (data['left'][i] + data['width'][i]) / zoom_level
|
| 582 |
-
# y2 = (data['top'][i] + data['height'][i]) / zoom_level
|
| 583 |
-
# full_word_data.append((text, x1, y1, x2, y2))
|
| 584 |
-
|
| 585 |
-
# word_data = full_word_data
|
| 586 |
-
# _ocr_cache.set_ocr(pdf_path, page_num, word_data)
|
| 587 |
-
# # --- OPTIMIZATION END ---
|
| 588 |
-
# except Exception as e:
|
| 589 |
-
# print(f" ❌ OCR Error in detection phase: {e}")
|
| 590 |
-
# return []
|
| 591 |
-
|
| 592 |
-
# # Apply margin filtering
|
| 593 |
-
# page_height = page.rect.height
|
| 594 |
-
# y_min = page_height * top_margin_percent
|
| 595 |
-
# y_max = page_height * (1 - bottom_margin_percent)
|
| 596 |
-
# return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
|
| 597 |
-
|
| 598 |
-
#============================================================================================================
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
|
| 602 |
-
# top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
|
| 603 |
-
# word_data = page.get_text("words")
|
| 604 |
-
|
| 605 |
-
# if len(word_data) > 0:
|
| 606 |
-
# word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
|
| 607 |
-
# else:
|
| 608 |
-
# if _ocr_cache.has_ocr(pdf_path, page_num):
|
| 609 |
-
# word_data = _ocr_cache.get_ocr(pdf_path, page_num)
|
| 610 |
-
# else:
|
| 611 |
-
# try:
|
| 612 |
-
# # 1. Render at Higher Resolution
|
| 613 |
-
# zoom_level = 4.0
|
| 614 |
-
# pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
|
| 615 |
-
# img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
|
| 616 |
-
|
| 617 |
-
# # Convert to BGR for RapidOCR
|
| 618 |
-
# if pix.n == 3: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
|
| 619 |
-
# elif pix.n == 4: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
|
| 620 |
-
|
| 621 |
-
# # 2. Run RapidOCR
|
| 622 |
-
# # RapidOCR returns: [[box, text, score], ...]
|
| 623 |
-
# # where box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
| 624 |
-
# results, _ = ocr_engine(img_np)
|
| 625 |
-
|
| 626 |
-
# full_word_data = []
|
| 627 |
-
# if results:
|
| 628 |
-
# for box, text, score in results:
|
| 629 |
-
# text = text.strip()
|
| 630 |
-
# if text:
|
| 631 |
-
# # 3. Convert Polygon to BBox and Scale back to PDF points
|
| 632 |
-
# xs = [p[0] for p in box]
|
| 633 |
-
# ys = [p[1] for p in box]
|
| 634 |
-
|
| 635 |
-
# x1 = min(xs) / zoom_level
|
| 636 |
-
# y1 = min(ys) / zoom_level
|
| 637 |
-
# x2 = max(xs) / zoom_level
|
| 638 |
-
# y2 = max(ys) / zoom_level
|
| 639 |
-
|
| 640 |
-
# full_word_data.append((text, x1, y1, x2, y2))
|
| 641 |
-
|
| 642 |
-
# word_data = full_word_data
|
| 643 |
-
# _ocr_cache.set_ocr(pdf_path, page_num, word_data)
|
| 644 |
-
# except Exception as e:
|
| 645 |
-
# print(f" ❌ RapidOCR Error in detection phase: {e}")
|
| 646 |
-
# return []
|
| 647 |
-
|
| 648 |
-
# # Apply margin filtering
|
| 649 |
-
# page_height = page.rect.height
|
| 650 |
-
# y_min = page_height * top_margin_percent
|
| 651 |
-
# y_max = page_height * (1 - bottom_margin_percent)
|
| 652 |
-
# return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
|
| 656 |
-
# top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
|
| 657 |
-
# word_data = page.get_text("words")
|
| 658 |
-
|
| 659 |
-
# if len(word_data) > 0:
|
| 660 |
-
# # Reformat standard PyMuPDF output to (text, x1, y1, x2, y2)
|
| 661 |
-
# word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
|
| 662 |
-
# else:
|
| 663 |
-
# if _ocr_cache.has_ocr(pdf_path, page_num):
|
| 664 |
-
# word_data = _ocr_cache.get_ocr(pdf_path, page_num)
|
| 665 |
-
# else:
|
| 666 |
-
# try:
|
| 667 |
-
# # 1. Render at Higher Resolution
|
| 668 |
-
# zoom_level = 4.0
|
| 669 |
-
# pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
|
| 670 |
-
# img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
|
| 671 |
-
|
| 672 |
-
# # Convert to BGR for RapidOCR
|
| 673 |
-
# if pix.n == 3:
|
| 674 |
-
# img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
|
| 675 |
-
# elif pix.n == 4:
|
| 676 |
-
# img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
|
| 677 |
-
|
| 678 |
-
# # 2. Run RapidOCR
|
| 679 |
-
# ocr_out = ocr_engine(img_np)
|
| 680 |
-
|
| 681 |
-
# full_word_data = []
|
| 682 |
-
|
| 683 |
-
# # CRITICAL FIX: Use 'is not None' to avoid NumPy truthiness ambiguity
|
| 684 |
-
# if ocr_out is not None and ocr_out.boxes is not None:
|
| 685 |
-
# # Use zip to iterate through boxes, text, and scores simultaneously
|
| 686 |
-
# for box, text, score in zip(ocr_out.boxes, ocr_out.txts, ocr_out.scores):
|
| 687 |
-
# text = str(text).strip()
|
| 688 |
-
# if text:
|
| 689 |
-
# # 3. Convert Polygon to BBox and Scale back to PDF points
|
| 690 |
-
# xs = [p[0] for p in box]
|
| 691 |
-
# ys = [p[1] for p in box]
|
| 692 |
-
|
| 693 |
-
# x1 = min(xs) / zoom_level
|
| 694 |
-
# y1 = min(ys) / zoom_level
|
| 695 |
-
# x2 = max(xs) / zoom_level
|
| 696 |
-
# y2 = max(ys) / zoom_level
|
| 697 |
-
|
| 698 |
-
# full_word_data.append((text, x1, y1, x2, y2))
|
| 699 |
-
|
| 700 |
-
# word_data = full_word_data
|
| 701 |
-
# _ocr_cache.set_ocr(pdf_path, page_num, word_data)
|
| 702 |
-
|
| 703 |
-
# except Exception as e:
|
| 704 |
-
# print(f" ❌ RapidOCR Error in detection phase: {e}")
|
| 705 |
-
# return []
|
| 706 |
-
|
| 707 |
-
# # Apply margin filtering
|
| 708 |
-
# page_height = page.rect.height
|
| 709 |
-
# y_min = page_height * top_margin_percent
|
| 710 |
-
# y_max = page_height * (1 - bottom_margin_percent)
|
| 711 |
-
|
| 712 |
-
# # Return filtered data where y-coordinates fall within the margins
|
| 713 |
-
# return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
|
| 714 |
-
|
| 715 |
-
|
| 716 |
|
| 717 |
|
| 718 |
def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
|
|
@@ -1129,19 +952,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
|
|
| 1129 |
merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
|
| 1130 |
print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
|
| 1131 |
|
| 1132 |
-
|
| 1133 |
-
# if results and results[0].boxes:
|
| 1134 |
-
# for box in results[0].boxes:
|
| 1135 |
-
# class_id = int(box.cls[0])
|
| 1136 |
-
# class_name = model.names[class_id]
|
| 1137 |
-
# if class_name in TARGET_CLASSES:
|
| 1138 |
-
# x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
|
| 1139 |
-
# relevant_detections.append(
|
| 1140 |
-
# {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
|
| 1141 |
-
# )
|
| 1142 |
-
|
| 1143 |
-
# merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
|
| 1144 |
-
# print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
|
| 1145 |
|
| 1146 |
# ====================================================================
|
| 1147 |
# --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
|
|
@@ -1252,108 +1063,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
|
|
| 1252 |
})
|
| 1253 |
else:
|
| 1254 |
# === START OF OPTIMIZED OCR BLOCK ===
|
| 1255 |
-
|
| 1256 |
-
# # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
|
| 1257 |
-
# ocr_zoom = 4.0
|
| 1258 |
-
# pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
|
| 1259 |
-
|
| 1260 |
-
# # Convert PyMuPDF Pixmap to OpenCV format
|
| 1261 |
-
# img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
|
| 1262 |
-
# pix_ocr.n)
|
| 1263 |
-
# if pix_ocr.n == 3:
|
| 1264 |
-
# img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
|
| 1265 |
-
# elif pix_ocr.n == 4:
|
| 1266 |
-
# img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
|
| 1267 |
-
|
| 1268 |
-
# # 2. Preprocess (Binarization)
|
| 1269 |
-
# processed_img = preprocess_image_for_ocr(img_ocr_np)
|
| 1270 |
-
|
| 1271 |
-
# # 3. Run Tesseract with Optimized Configuration
|
| 1272 |
-
# custom_config = r'--oem 3 --psm 6'
|
| 1273 |
-
|
| 1274 |
-
# hocr_data = pytesseract.image_to_data(
|
| 1275 |
-
# processed_img,
|
| 1276 |
-
# output_type=pytesseract.Output.DICT,
|
| 1277 |
-
# config=custom_config
|
| 1278 |
-
# )
|
| 1279 |
-
|
| 1280 |
-
# for i in range(len(hocr_data['level'])):
|
| 1281 |
-
# text = hocr_data['text'][i] # Retrieve raw Tesseract text
|
| 1282 |
-
|
| 1283 |
-
# # --- FIX: SANITIZE TEXT AND THEN STRIP ---
|
| 1284 |
-
# cleaned_text = sanitize_text(text).strip()
|
| 1285 |
-
|
| 1286 |
-
# if cleaned_text and hocr_data['conf'][i] > -1:
|
| 1287 |
-
# # 4. Coordinate Mapping
|
| 1288 |
-
# scale_adjustment = scale_factor / ocr_zoom
|
| 1289 |
-
|
| 1290 |
-
# x1 = int(hocr_data['left'][i] * scale_adjustment)
|
| 1291 |
-
# y1 = int(hocr_data['top'][i] * scale_adjustment)
|
| 1292 |
-
# w = int(hocr_data['width'][i] * scale_adjustment)
|
| 1293 |
-
# h = int(hocr_data['height'][i] * scale_adjustment)
|
| 1294 |
-
# x2 = x1 + w
|
| 1295 |
-
# y2 = y1 + h
|
| 1296 |
-
|
| 1297 |
-
# raw_ocr_output.append({
|
| 1298 |
-
# 'type': 'text',
|
| 1299 |
-
# 'word': cleaned_text, # Use the sanitized word
|
| 1300 |
-
# 'confidence': float(hocr_data['conf'][i]),
|
| 1301 |
-
# 'bbox': [x1, y1, x2, y2],
|
| 1302 |
-
# 'y0': y1,
|
| 1303 |
-
# 'x0': x1
|
| 1304 |
-
# })
|
| 1305 |
-
# except Exception as e:
|
| 1306 |
-
|
| 1307 |
-
# print(f" ❌ Tesseract OCR Error: {e}")
|
| 1308 |
#=============================================================================================================================================================
|
| 1309 |
#=============================================================================================================================================================
|
| 1310 |
-
|
| 1311 |
-
# # === START OF RAPIDOCR BLOCK ===
|
| 1312 |
-
# try:
|
| 1313 |
-
# # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
|
| 1314 |
-
# ocr_zoom = 4.0
|
| 1315 |
-
# pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
|
| 1316 |
-
|
| 1317 |
-
# # Convert PyMuPDF Pixmap to OpenCV format (BGR)
|
| 1318 |
-
# img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(
|
| 1319 |
-
# pix_ocr.height, pix_ocr.width, pix_ocr.n
|
| 1320 |
-
# )
|
| 1321 |
-
# if pix_ocr.n == 3:
|
| 1322 |
-
# img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
|
| 1323 |
-
# elif pix_ocr.n == 4:
|
| 1324 |
-
# img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
|
| 1325 |
-
|
| 1326 |
-
# # 2. Run RapidOCR (Models handle preprocessing internally)
|
| 1327 |
-
# results, _ = ocr_engine(img_ocr_np)
|
| 1328 |
-
|
| 1329 |
-
# if results:
|
| 1330 |
-
# # Calculate scaling from OCR image (4.0) to your pipeline standard (scale_factor=2.0)
|
| 1331 |
-
# scale_adjustment = scale_factor / ocr_zoom
|
| 1332 |
-
|
| 1333 |
-
# for box, text, score in results:
|
| 1334 |
-
# # Sanitize and clean text
|
| 1335 |
-
# cleaned_text = sanitize_text(text).strip()
|
| 1336 |
-
|
| 1337 |
-
# if cleaned_text:
|
| 1338 |
-
# # 3. Coordinate Mapping (Convert 4-point polygon to x1, y1, x2, y2)
|
| 1339 |
-
# xs = [p[0] for p in box]
|
| 1340 |
-
# ys = [p[1] for p in box]
|
| 1341 |
-
|
| 1342 |
-
# x1 = int(min(xs) * scale_adjustment)
|
| 1343 |
-
# y1 = int(min(ys) * scale_adjustment)
|
| 1344 |
-
# x2 = int(max(xs) * scale_adjustment)
|
| 1345 |
-
# y2 = int(max(ys) * scale_adjustment)
|
| 1346 |
-
|
| 1347 |
-
# raw_ocr_output.append({
|
| 1348 |
-
# 'type': 'text',
|
| 1349 |
-
# 'word': cleaned_text,
|
| 1350 |
-
# 'confidence': float(score) * 100, # Converting 0-1.0 to 0-100 scale
|
| 1351 |
-
# 'bbox': [x1, y1, x2, y2],
|
| 1352 |
-
# 'y0': y1,
|
| 1353 |
-
# 'x0': x1
|
| 1354 |
-
# })
|
| 1355 |
-
# except Exception as e:
|
| 1356 |
-
# print(f" ❌ RapidOCR Fallback Error: {e}")
|
| 1357 |
try:
|
| 1358 |
# 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
|
| 1359 |
ocr_zoom = 4.0
|
|
@@ -1926,163 +1639,6 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
|
|
| 1926 |
# ============================================================================
|
| 1927 |
|
| 1928 |
|
| 1929 |
-
# def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 1930 |
-
# print("\n" + "=" * 80)
|
| 1931 |
-
# print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
|
| 1932 |
-
# print("=" * 80)
|
| 1933 |
-
# try:
|
| 1934 |
-
# with open(input_path, 'r', encoding='utf-8') as f:
|
| 1935 |
-
# predictions_by_page = json.load(f)
|
| 1936 |
-
# except Exception as e:
|
| 1937 |
-
# print(f"❌ Error loading raw prediction file: {e}")
|
| 1938 |
-
# return None
|
| 1939 |
-
|
| 1940 |
-
# predictions = []
|
| 1941 |
-
# for page_item in predictions_by_page:
|
| 1942 |
-
# if isinstance(page_item, dict) and 'data' in page_item:
|
| 1943 |
-
# predictions.extend(page_item['data'])
|
| 1944 |
-
|
| 1945 |
-
# structured_data = []
|
| 1946 |
-
# current_item = None
|
| 1947 |
-
# current_option_key = None
|
| 1948 |
-
# current_passage_buffer = []
|
| 1949 |
-
# current_text_buffer = []
|
| 1950 |
-
# first_question_started = False
|
| 1951 |
-
# last_entity_type = None
|
| 1952 |
-
# just_finished_i_option = False
|
| 1953 |
-
# is_in_new_passage = False
|
| 1954 |
-
|
| 1955 |
-
# def finalize_passage_to_item(item, passage_buffer):
|
| 1956 |
-
# if passage_buffer:
|
| 1957 |
-
# passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
|
| 1958 |
-
# if item.get('passage'):
|
| 1959 |
-
# item['passage'] += ' ' + passage_text
|
| 1960 |
-
# else:
|
| 1961 |
-
# item['passage'] = passage_text
|
| 1962 |
-
# passage_buffer.clear()
|
| 1963 |
-
|
| 1964 |
-
# for item in predictions:
|
| 1965 |
-
# word = item['word']
|
| 1966 |
-
# label = item['predicted_label']
|
| 1967 |
-
# entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
|
| 1968 |
-
# current_text_buffer.append(word)
|
| 1969 |
-
# previous_entity_type = last_entity_type
|
| 1970 |
-
# is_passage_label = (entity_type == 'PASSAGE')
|
| 1971 |
-
|
| 1972 |
-
# if not first_question_started:
|
| 1973 |
-
# if label != 'B-QUESTION' and not is_passage_label:
|
| 1974 |
-
# just_finished_i_option = False
|
| 1975 |
-
# is_in_new_passage = False
|
| 1976 |
-
# continue
|
| 1977 |
-
# if is_passage_label:
|
| 1978 |
-
# current_passage_buffer.append(word)
|
| 1979 |
-
# last_entity_type = 'PASSAGE'
|
| 1980 |
-
# just_finished_i_option = False
|
| 1981 |
-
# is_in_new_passage = False
|
| 1982 |
-
# continue
|
| 1983 |
-
|
| 1984 |
-
# if label == 'B-QUESTION':
|
| 1985 |
-
# if not first_question_started:
|
| 1986 |
-
# header_text = ' '.join(current_text_buffer[:-1]).strip()
|
| 1987 |
-
# if header_text or current_passage_buffer:
|
| 1988 |
-
# metadata_item = {'type': 'METADATA', 'passage': ''}
|
| 1989 |
-
# finalize_passage_to_item(metadata_item, current_passage_buffer)
|
| 1990 |
-
# if header_text: metadata_item['text'] = header_text
|
| 1991 |
-
# structured_data.append(metadata_item)
|
| 1992 |
-
# first_question_started = True
|
| 1993 |
-
# current_text_buffer = [word]
|
| 1994 |
-
|
| 1995 |
-
# if current_item is not None:
|
| 1996 |
-
# finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1997 |
-
# current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
|
| 1998 |
-
# structured_data.append(current_item)
|
| 1999 |
-
# current_text_buffer = [word]
|
| 2000 |
-
|
| 2001 |
-
# current_item = {
|
| 2002 |
-
# 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
|
| 2003 |
-
# }
|
| 2004 |
-
# current_option_key = None
|
| 2005 |
-
# last_entity_type = 'QUESTION'
|
| 2006 |
-
# just_finished_i_option = False
|
| 2007 |
-
# is_in_new_passage = False
|
| 2008 |
-
# continue
|
| 2009 |
-
|
| 2010 |
-
# if current_item is not None:
|
| 2011 |
-
# if is_in_new_passage:
|
| 2012 |
-
# # 🔑 Robust Initialization and Appending for 'new_passage'
|
| 2013 |
-
# if 'new_passage' not in current_item:
|
| 2014 |
-
# current_item['new_passage'] = word
|
| 2015 |
-
# else:
|
| 2016 |
-
# current_item['new_passage'] += f' {word}'
|
| 2017 |
-
|
| 2018 |
-
# if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
|
| 2019 |
-
# is_in_new_passage = False
|
| 2020 |
-
# if label.startswith(('B-', 'I-')): last_entity_type = entity_type
|
| 2021 |
-
# continue
|
| 2022 |
-
# is_in_new_passage = False
|
| 2023 |
-
|
| 2024 |
-
# if label.startswith('B-'):
|
| 2025 |
-
# if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
|
| 2026 |
-
# finalize_passage_to_item(current_item, current_passage_buffer)
|
| 2027 |
-
# current_passage_buffer = []
|
| 2028 |
-
# last_entity_type = entity_type
|
| 2029 |
-
# if entity_type == 'PASSAGE':
|
| 2030 |
-
# if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 2031 |
-
# current_item['new_passage'] = word # Initialize the new passage start
|
| 2032 |
-
# is_in_new_passage = True
|
| 2033 |
-
# else:
|
| 2034 |
-
# current_passage_buffer.append(word)
|
| 2035 |
-
# elif entity_type == 'OPTION':
|
| 2036 |
-
# current_option_key = word
|
| 2037 |
-
# current_item['options'][current_option_key] = word
|
| 2038 |
-
# just_finished_i_option = False
|
| 2039 |
-
# elif entity_type == 'ANSWER':
|
| 2040 |
-
# current_item['answer'] = word
|
| 2041 |
-
# current_option_key = None
|
| 2042 |
-
# just_finished_i_option = False
|
| 2043 |
-
# elif entity_type == 'QUESTION':
|
| 2044 |
-
# current_item['question'] += f' {word}'
|
| 2045 |
-
# just_finished_i_option = False
|
| 2046 |
-
|
| 2047 |
-
# elif label.startswith('I-'):
|
| 2048 |
-
# if entity_type == 'QUESTION':
|
| 2049 |
-
# current_item['question'] += f' {word}'
|
| 2050 |
-
# elif entity_type == 'PASSAGE':
|
| 2051 |
-
# if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 2052 |
-
# current_item['new_passage'] = word # Initialize the new passage start
|
| 2053 |
-
# is_in_new_passage = True
|
| 2054 |
-
# else:
|
| 2055 |
-
# if not current_passage_buffer: last_entity_type = 'PASSAGE'
|
| 2056 |
-
# current_passage_buffer.append(word)
|
| 2057 |
-
# elif entity_type == 'OPTION' and current_option_key is not None:
|
| 2058 |
-
# current_item['options'][current_option_key] += f' {word}'
|
| 2059 |
-
# just_finished_i_option = True
|
| 2060 |
-
# elif entity_type == 'ANSWER':
|
| 2061 |
-
# current_item['answer'] += f' {word}'
|
| 2062 |
-
# just_finished_i_option = (entity_type == 'OPTION')
|
| 2063 |
-
|
| 2064 |
-
# elif label == 'O':
|
| 2065 |
-
# if last_entity_type == 'QUESTION':
|
| 2066 |
-
# current_item['question'] += f' {word}'
|
| 2067 |
-
# just_finished_i_option = False
|
| 2068 |
-
|
| 2069 |
-
# if current_item is not None:
|
| 2070 |
-
# finalize_passage_to_item(current_item, current_passage_buffer)
|
| 2071 |
-
# current_item['text'] = ' '.join(current_text_buffer).strip()
|
| 2072 |
-
# structured_data.append(current_item)
|
| 2073 |
-
|
| 2074 |
-
# for item in structured_data:
|
| 2075 |
-
# item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
|
| 2076 |
-
# if 'new_passage' in item:
|
| 2077 |
-
# item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
|
| 2078 |
-
|
| 2079 |
-
# try:
|
| 2080 |
-
# with open(output_path, 'w', encoding='utf-8') as f:
|
| 2081 |
-
# json.dump(structured_data, f, indent=2, ensure_ascii=False)
|
| 2082 |
-
# except Exception:
|
| 2083 |
-
# pass
|
| 2084 |
-
|
| 2085 |
-
# return structured_data
|
| 2086 |
|
| 2087 |
|
| 2088 |
|
|
@@ -2600,104 +2156,9 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2600 |
|
| 2601 |
|
| 2602 |
|
| 2603 |
-
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2604 |
-
# List[Dict[str, Any]]]:
|
| 2605 |
-
# def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2606 |
-
# if not os.path.exists(input_pdf_path): return None
|
| 2607 |
-
|
| 2608 |
-
# print("\n" + "#" * 80)
|
| 2609 |
-
# print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
| 2610 |
-
# print("#" * 80)
|
| 2611 |
-
|
| 2612 |
-
# pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
|
| 2613 |
-
# temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
|
| 2614 |
-
# os.makedirs(temp_pipeline_dir, exist_ok=True)
|
| 2615 |
-
|
| 2616 |
-
# preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
|
| 2617 |
-
# raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
|
| 2618 |
-
# structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
|
| 2619 |
-
|
| 2620 |
-
# final_result = None
|
| 2621 |
-
# try:
|
| 2622 |
-
# # Phase 1: Preprocessing with YOLO First + Masking
|
| 2623 |
-
# preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2624 |
-
# if not preprocessed_json_path_out: return None
|
| 2625 |
-
|
| 2626 |
-
# # Phase 2: Inference
|
| 2627 |
-
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2628 |
-
# input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
|
| 2629 |
-
# )
|
| 2630 |
-
# if not page_raw_predictions_list: return None
|
| 2631 |
-
|
| 2632 |
-
# # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
|
| 2633 |
-
# # Save raw predictions to the temporary file
|
| 2634 |
-
# with open(raw_output_path, 'w', encoding='utf-8') as f:
|
| 2635 |
-
# json.dump(page_raw_predictions_list, f, indent=4)
|
| 2636 |
-
|
| 2637 |
-
# # Explicitly copy/save the raw predictions to the user-specified debug path
|
| 2638 |
-
# # if raw_predictions_output_path:
|
| 2639 |
-
# # shutil.copy(raw_output_path, raw_predictions_output_path)
|
| 2640 |
-
# # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
|
| 2641 |
-
# # ----------------------------------------
|
| 2642 |
-
|
| 2643 |
-
# # Phase 3: Decoding
|
| 2644 |
-
# structured_data_list = convert_bio_to_structured_json_relaxed(
|
| 2645 |
-
# raw_output_path, structured_intermediate_output_path
|
| 2646 |
-
# )
|
| 2647 |
-
# if not structured_data_list: return None
|
| 2648 |
-
|
| 2649 |
-
|
| 2650 |
-
# structured_data_list = correct_misaligned_options(structured_data_list)
|
| 2651 |
-
# structured_data_list = process_context_linking(structured_data_list)
|
| 2652 |
-
|
| 2653 |
-
|
| 2654 |
-
# # Phase 4: Embedding / Equation to LaTeX Conversion
|
| 2655 |
-
# final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 2656 |
-
|
| 2657 |
|
| 2658 |
|
| 2659 |
|
| 2660 |
-
# #================================================================================
|
| 2661 |
-
# # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
|
| 2662 |
-
# #================================================================================
|
| 2663 |
-
|
| 2664 |
-
# print("\n" + "=" * 80)
|
| 2665 |
-
# print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
|
| 2666 |
-
# print("=" * 80)
|
| 2667 |
-
|
| 2668 |
-
# # 1. Initialize and Load the Classifier
|
| 2669 |
-
# classifier = HierarchicalClassifier()
|
| 2670 |
-
# if classifier.load_models():
|
| 2671 |
-
# # 2. Run Classification on the *Final* Result
|
| 2672 |
-
# # The function modifies the list in place and returns it
|
| 2673 |
-
# final_result = post_process_json_with_inference(
|
| 2674 |
-
# final_result, classifier
|
| 2675 |
-
# )
|
| 2676 |
-
# print("✅ Classification complete. Tags added to final output.")
|
| 2677 |
-
# else:
|
| 2678 |
-
# print("❌ Classification model loading failed. Outputting un-tagged data.")
|
| 2679 |
-
|
| 2680 |
-
# # ====================================================================
|
| 2681 |
-
|
| 2682 |
-
|
| 2683 |
-
# except Exception as e:
|
| 2684 |
-
# print(f"❌ FATAL ERROR: {e}")
|
| 2685 |
-
# import traceback
|
| 2686 |
-
# traceback.print_exc()
|
| 2687 |
-
# return None
|
| 2688 |
-
|
| 2689 |
-
# finally:
|
| 2690 |
-
# try:
|
| 2691 |
-
# for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
|
| 2692 |
-
# os.remove(f)
|
| 2693 |
-
# os.rmdir(temp_pipeline_dir)
|
| 2694 |
-
# except Exception:
|
| 2695 |
-
# pass
|
| 2696 |
-
|
| 2697 |
-
# print("\n" + "#" * 80)
|
| 2698 |
-
# print("### OPTIMIZED PIPELINE EXECUTION COMPLETE ###")
|
| 2699 |
-
# print("#" * 80)
|
| 2700 |
-
# return final_result
|
| 2701 |
|
| 2702 |
|
| 2703 |
|
|
@@ -2783,131 +2244,6 @@ import time
|
|
| 2783 |
import traceback
|
| 2784 |
import glob
|
| 2785 |
|
| 2786 |
-
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2787 |
-
# if not os.path.exists(input_pdf_path):
|
| 2788 |
-
# print(f"❌ ERROR: File not found: {input_pdf_path}")
|
| 2789 |
-
# return None
|
| 2790 |
-
|
| 2791 |
-
# print("\n" + "#" * 80)
|
| 2792 |
-
# print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
| 2793 |
-
# print(f"Input: {input_pdf_path}")
|
| 2794 |
-
# print("#" * 80)
|
| 2795 |
-
|
| 2796 |
-
# overall_start = time.time()
|
| 2797 |
-
# pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
|
| 2798 |
-
# temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
|
| 2799 |
-
# os.makedirs(temp_pipeline_dir, exist_ok=True)
|
| 2800 |
-
|
| 2801 |
-
# preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
|
| 2802 |
-
# raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
|
| 2803 |
-
|
| 2804 |
-
|
| 2805 |
-
# # If the user didn't provide a path, create one in the temp directory
|
| 2806 |
-
# if structured_intermediate_output_path is None:
|
| 2807 |
-
# structured_intermediate_output_path = os.path.join(
|
| 2808 |
-
# temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
|
| 2809 |
-
# )
|
| 2810 |
-
|
| 2811 |
-
|
| 2812 |
-
|
| 2813 |
-
# final_result = None
|
| 2814 |
-
# try:
|
| 2815 |
-
# # --- Phase 1: Preprocessing ---
|
| 2816 |
-
# print(f"\n[Step 1/5] Preprocessing (YOLO + Masking)...")
|
| 2817 |
-
# p1_start = time.time()
|
| 2818 |
-
# preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2819 |
-
# if not preprocessed_json_path_out:
|
| 2820 |
-
# print("❌ FAILED at Step 1: Preprocessing returned None.")
|
| 2821 |
-
# return None
|
| 2822 |
-
# print(f"✅ Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2823 |
-
|
| 2824 |
-
# # --- Phase 2: Inference ---
|
| 2825 |
-
# print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
|
| 2826 |
-
# p2_start = time.time()
|
| 2827 |
-
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2828 |
-
# input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
|
| 2829 |
-
# )
|
| 2830 |
-
# if not page_raw_predictions_list:
|
| 2831 |
-
# print("❌ FAILED at Step 2: Inference returned no data.")
|
| 2832 |
-
# return None
|
| 2833 |
-
|
| 2834 |
-
# # Save raw predictions for Step 3
|
| 2835 |
-
# with open(raw_output_path, 'w', encoding='utf-8') as f:
|
| 2836 |
-
# json.dump(page_raw_predictions_list, f, indent=4)
|
| 2837 |
-
# print(f"✅ Step 2 Complete ({time.time() - p2_start:.2f}s)")
|
| 2838 |
-
|
| 2839 |
-
# # --- Phase 3: Decoding ---
|
| 2840 |
-
# print(f"\n[Step 3/5] Decoding (BIO to Structured JSON)...")
|
| 2841 |
-
# p3_start = time.time()
|
| 2842 |
-
# structured_data_list = convert_bio_to_structured_json_relaxed(
|
| 2843 |
-
# raw_output_path, structured_intermediate_output_path
|
| 2844 |
-
# )
|
| 2845 |
-
# if not structured_data_list:
|
| 2846 |
-
# print("❌ FAILED at Step 3: BIO conversion failed.")
|
| 2847 |
-
# return None
|
| 2848 |
-
|
| 2849 |
-
# # Logic adjustments
|
| 2850 |
-
# print("... Correcting misalignments and linking context ...")
|
| 2851 |
-
# structured_data_list = correct_misaligned_options(structured_data_list)
|
| 2852 |
-
# structured_data_list = process_context_linking(structured_data_list)
|
| 2853 |
-
# print(f"✅ Step 3 Complete ({time.time() - p3_start:.2f}s)")
|
| 2854 |
-
|
| 2855 |
-
# # --- Phase 4: Base64 & LaTeX ---
|
| 2856 |
-
# print(f"\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
|
| 2857 |
-
# p4_start = time.time()
|
| 2858 |
-
# final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 2859 |
-
# if not final_result:
|
| 2860 |
-
# print("❌ FAILED at Step 4: Final formatting failed.")
|
| 2861 |
-
# return None
|
| 2862 |
-
# print(f"✅ Step 4 Complete ({time.time() - p4_start:.2f}s)")
|
| 2863 |
-
|
| 2864 |
-
|
| 2865 |
-
|
| 2866 |
-
# # --- ADD THIS NEW STEP HERE ---
|
| 2867 |
-
# print(f"\n[Step 4.5/5] Adding Question Type Classification...")
|
| 2868 |
-
# p4_5_start = time.time()
|
| 2869 |
-
# final_result = add_question_type_validation(final_result)
|
| 2870 |
-
# print(f"✅ Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
|
| 2871 |
-
|
| 2872 |
-
|
| 2873 |
-
|
| 2874 |
-
# # --- END OF NEW STEP ---
|
| 2875 |
-
|
| 2876 |
-
|
| 2877 |
-
|
| 2878 |
-
|
| 2879 |
-
# # --- Phase 5: Hierarchical Tagging ---
|
| 2880 |
-
# print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
|
| 2881 |
-
# p5_start = time.time()
|
| 2882 |
-
# classifier = HierarchicalClassifier()
|
| 2883 |
-
# if classifier.load_models():
|
| 2884 |
-
# final_result = post_process_json_with_inference(final_result, classifier)
|
| 2885 |
-
# print(f"✅ Step 5 Complete: Tags added ({time.time() - p5_start:.2f}s)")
|
| 2886 |
-
# else:
|
| 2887 |
-
# print("⚠️ WARNING: Classifier models failed to load. Skipping tags.")
|
| 2888 |
-
|
| 2889 |
-
# except Exception as e:
|
| 2890 |
-
# print(f"\n‼️ FATAL PIPELINE EXCEPTION:")
|
| 2891 |
-
# print(f"Error Message: {str(e)}")
|
| 2892 |
-
# traceback.print_exc()
|
| 2893 |
-
# return None
|
| 2894 |
-
|
| 2895 |
-
# finally:
|
| 2896 |
-
# print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
|
| 2897 |
-
# try:
|
| 2898 |
-
# for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
|
| 2899 |
-
# os.remove(f)
|
| 2900 |
-
# os.rmdir(temp_pipeline_dir)
|
| 2901 |
-
# print("🧹 Cleanup successful.")
|
| 2902 |
-
# except Exception as e:
|
| 2903 |
-
# print(f"⚠️ Cleanup failed: {e}")
|
| 2904 |
-
|
| 2905 |
-
# total_time = time.time() - overall_start
|
| 2906 |
-
# print("\n" + "#" * 80)
|
| 2907 |
-
# print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
|
| 2908 |
-
# print("#" * 80)
|
| 2909 |
-
|
| 2910 |
-
# return final_result
|
| 2911 |
|
| 2912 |
|
| 2913 |
|
|
|
|
| 536 |
return sorted(final_separators)
|
| 537 |
|
| 538 |
#======================================================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
|
| 541 |
def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
|
|
|
|
| 952 |
merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
|
| 953 |
print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
|
| 954 |
|
| 955 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 956 |
|
| 957 |
# ====================================================================
|
| 958 |
# --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
|
|
|
|
| 1063 |
})
|
| 1064 |
else:
|
| 1065 |
# === START OF OPTIMIZED OCR BLOCK ===
|
| 1066 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1067 |
#=============================================================================================================================================================
|
| 1068 |
#=============================================================================================================================================================
|
| 1069 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1070 |
try:
|
| 1071 |
# 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
|
| 1072 |
ocr_zoom = 4.0
|
|
|
|
| 1639 |
# ============================================================================
|
| 1640 |
|
| 1641 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1642 |
|
| 1643 |
|
| 1644 |
|
|
|
|
| 2156 |
|
| 2157 |
|
| 2158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2159 |
|
| 2160 |
|
| 2161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2162 |
|
| 2163 |
|
| 2164 |
|
|
|
|
| 2244 |
import traceback
|
| 2245 |
import glob
|
| 2246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2247 |
|
| 2248 |
|
| 2249 |
|