heerjtdev commited on
Commit
7ce2214
·
verified ·
1 Parent(s): 96274f8

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +2 -656
working_yolo_pipeline.py CHANGED
@@ -256,8 +256,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
256
 
257
 
258
 
259
-
260
-
261
  # ============================================================================
262
  # --- MISSING HELPER FUNCTION ---
263
  # ============================================================================
@@ -317,65 +315,6 @@ def calculate_vertical_gap_coverage(word_data: list, sep_x: int, page_height: fl
317
  return coverage_ratio
318
 
319
 
320
- # def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
321
- # """Calculates X-axis histogram and validates using BRIDGING CHECK and Vertical Coverage."""
322
- # if not word_data: return []
323
-
324
- # x_points = []
325
- # for _, x1, _, x2, *rest in word_data:
326
- # x_points.extend([x1, x2])
327
-
328
- # if not x_points: return []
329
- # max_x = max(x_points)
330
-
331
- # bin_size = params.get('cluster_bin_size', 5)
332
- # smoothing = params.get('cluster_smoothing', 5)
333
- # min_width = params.get('cluster_min_width', 20)
334
- # threshold_percentile = params.get('cluster_threshold_percentile', 85)
335
-
336
- # num_bins = int(np.ceil(max_x / bin_size))
337
- # hist, bin_edges = np.histogram(x_points, bins=num_bins, range=(0, max_x))
338
-
339
- # smoothed_hist = gaussian_filter1d(hist.astype(float), sigma=smoothing)
340
- # inverted_signal = np.max(smoothed_hist) - smoothed_hist
341
-
342
- # peaks, properties = find_peaks(
343
- # inverted_signal,
344
- # height=np.max(inverted_signal) - np.percentile(smoothed_hist, threshold_percentile),
345
- # distance=min_width / bin_size
346
- # )
347
-
348
- # if not peaks.size: return []
349
-
350
- # separator_x_coords = [int(bin_edges[p]) for p in peaks]
351
- # final_separators = []
352
-
353
- # for x_coord in separator_x_coords:
354
- # # 1. BRIDGING CHECK: The "Do Not Cut Words" Constraint
355
- # # Count how many words/blocks physically cross this specific X coordinate.
356
- # bridging_count = 0
357
- # for _, wx1, _, wx2, _ in word_data:
358
- # # Strictly check if a word physically sits on this line
359
- # if wx1 < x_coord and wx2 > x_coord:
360
- # bridging_count += 1
361
-
362
- # # Strict Threshold: If more than 2 items (allow for noise) cross, REJECT.
363
- # if bridging_count > 2:
364
- # print(f" ❌ Separator X={x_coord} REJECTED: Cuts through {bridging_count} words/blocks.")
365
- # continue
366
-
367
- # # 2. VERTICAL COVERAGE CHECK
368
- # # The gap must exist for > 65% of the text height of the page.
369
- # coverage = calculate_vertical_gap_coverage(word_data, x_coord, page_height, gutter_width=min_width)
370
-
371
- # if coverage >= 0.65:
372
- # final_separators.append(x_coord)
373
- # print(f" -> Separator X={x_coord} ACCEPTED (Coverage: {coverage:.1%}, Bridging: {bridging_count})")
374
- # else:
375
- # print(f" ❌ Separator X={x_coord} REJECTED (Coverage: {coverage:.1%}, Bridging: {bridging_count})")
376
-
377
- # return sorted(final_separators)
378
-
379
 
380
 
381
  def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
@@ -456,39 +395,6 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
456
  return sorted(final_separators)
457
 
458
 
459
- # def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
460
- # top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
461
- # """Extract word data with OCR caching to avoid redundant Tesseract runs."""
462
- # word_data = page.get_text("words")
463
-
464
- # if len(word_data) > 0:
465
- # word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
466
- # else:
467
- # if _ocr_cache.has_ocr(pdf_path, page_num):
468
- # word_data = _ocr_cache.get_ocr(pdf_path, page_num)
469
- # else:
470
- # try:
471
- # pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
472
- # img_bytes = pix.tobytes("png")
473
- # img = Image.open(io.BytesIO(img_bytes))
474
- # data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
475
- # full_word_data = []
476
- # for i in range(len(data['level'])):
477
- # if data['text'][i].strip():
478
- # x1, y1 = data['left'][i] / 3, data['top'][i] / 3
479
- # x2, y2 = x1 + data['width'][i] / 3, y1 + data['height'][i] / 3
480
- # full_word_data.append((data['text'][i], x1, y1, x2, y2))
481
- # word_data = full_word_data
482
- # _ocr_cache.set_ocr(pdf_path, page_num, word_data)
483
- # except Exception:
484
- # return []
485
-
486
- # # Apply margin filtering
487
- # page_height = page.rect.height
488
- # y_min = page_height * top_margin_percent
489
- # y_max = page_height * (1 - bottom_margin_percent)
490
- # return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
491
-
492
 
493
 
494
 
@@ -839,221 +745,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
839
 
840
 
841
 
842
-
843
-
844
- # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
845
- # page_num: int, fitz_page: fitz.Page,
846
- # pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
847
- # """
848
- # OPTIMIZED FLOW:
849
- # 1. Run YOLO to find Equations/Tables.
850
- # 2. Mask raw text with YOLO boxes.
851
- # 3. Run Column Detection on the MASKED data (Populates OCR cache).
852
- # 4. Proceed with Final OCR Output (Strictly using the cache).
853
- # """
854
- # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
855
-
856
- # start_time_total = time.time()
857
-
858
- # if original_img is None:
859
- # print(f" ❌ Invalid image for page {page_num}.")
860
- # return None, None
861
-
862
- # # ====================================================================
863
- # # --- STEP 1: YOLO DETECTION ---
864
- # # ====================================================================
865
- # start_time_yolo = time.time()
866
- # results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
867
-
868
- # relevant_detections = []
869
- # if results and results[0].boxes:
870
- # for box in results[0].boxes:
871
- # class_id = int(box.cls[0])
872
- # class_name = model.names[class_id]
873
- # if class_name in TARGET_CLASSES:
874
- # x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
875
- # relevant_detections.append(
876
- # {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
877
- # )
878
-
879
- # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
880
- # print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
881
-
882
- # # ====================================================================
883
- # # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING & CACHING) ---
884
- # # This call to get_word_data_for_detection will execute Tesseract if
885
- # # native words are missing, and save the result to the cache.
886
- # # ====================================================================
887
- # raw_words_for_layout = get_word_data_for_detection(
888
- # fitz_page, pdf_path, page_num,
889
- # top_margin_percent=0.10, bottom_margin_percent=0.10
890
- # )
891
-
892
- # masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
893
-
894
- # # ====================================================================
895
- # # --- STEP 3: COLUMN DETECTION ---
896
- # # ====================================================================
897
- # page_width_pdf = fitz_page.rect.width
898
- # page_height_pdf = fitz_page.rect.height
899
-
900
- # column_detection_params = {
901
- # 'cluster_bin_size': 2, 'cluster_smoothing': 2,
902
- # 'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
903
- # }
904
-
905
- # separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
906
-
907
- # page_separator_x = None
908
- # if separators:
909
- # central_min = page_width_pdf * 0.35
910
- # central_max = page_width_pdf * 0.65
911
- # central_separators = [s for s in separators if central_min <= s <= central_max]
912
-
913
- # if central_separators:
914
- # center_x = page_width_pdf / 2
915
- # page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
916
- # print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
917
- # else:
918
- # print(" ⚠️ Gutter found off-center. Ignoring.")
919
- # else:
920
- # print(" -> Single Column Layout Confirmed.")
921
-
922
- # # ====================================================================
923
- # # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
924
- # # ====================================================================
925
- # start_time_components = time.time()
926
- # component_metadata = []
927
- # fig_count_page = 0
928
- # eq_count_page = 0
929
-
930
- # for detection in merged_detections:
931
- # x1, y1, x2, y2 = detection['coords']
932
- # class_name = detection['class']
933
-
934
- # if class_name == 'figure':
935
- # GLOBAL_FIGURE_COUNT += 1
936
- # counter = GLOBAL_FIGURE_COUNT
937
- # component_word = f"FIGURE{counter}"
938
- # fig_count_page += 1
939
- # elif class_name == 'equation':
940
- # GLOBAL_EQUATION_COUNT += 1
941
- # counter = GLOBAL_EQUATION_COUNT
942
- # component_word = f"EQUATION{counter}"
943
- # eq_count_page += 1
944
- # else:
945
- # continue
946
-
947
- # component_crop = original_img[y1:y2, x1:x2]
948
- # component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
949
- # cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
950
-
951
- # y_midpoint = (y1 + y2) // 2
952
- # component_metadata.append({
953
- # 'type': class_name, 'word': component_word,
954
- # 'bbox': [int(x1), int(y1), int(x2), int(y2)],
955
- # 'y0': int(y_midpoint), 'x0': int(x1)
956
- # })
957
-
958
- # # ====================================================================
959
- # # --- STEP 5: CACHED OCR RETRIEVAL (No Redundant Tesseract) ---
960
- # # ====================================================================
961
- # raw_ocr_output = []
962
- # scale_factor = 2.0 # Pipeline standard scale
963
-
964
- # if _ocr_cache.has_ocr(pdf_path, page_num):
965
- # print(f" ⚡ Using cached OCR (Native or Tesseract) for page {page_num}")
966
- # cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
967
-
968
- # for word_tuple in cached_word_data:
969
- # # Cache stores: (text, x1, y1, x2, y2) in PDF points (see get_word_data_for_detection)
970
- # word_text, x1, y1, x2, y2 = word_tuple
971
-
972
- # # Scale from PDF points back to Pipeline Pixels (2.0)
973
- # x1_pix = int(x1 * scale_factor)
974
- # y1_pix = int(y1 * scale_factor)
975
- # x2_pix = int(x2 * scale_factor)
976
- # y2_pix = int(y2 * scale_factor)
977
-
978
- # raw_ocr_output.append({
979
- # 'type': 'text', 'word': word_text, 'confidence': 95.0, # 95.0 is a default/placeholder confidence
980
- # 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
981
- # 'y0': y1_pix, 'x0': x1_pix
982
- # })
983
- # else:
984
- # # This branch is hit only if the cache check in Step 2 failed to produce text,
985
- # # meaning the page is genuinely textless or entirely composed of images/figures.
986
- # print(f" ⚠️ No text found in cache for page {page_num}. Proceeding without words.")
987
-
988
-
989
- # # ====================================================================
990
- # # --- STEP 6: OCR CLEANING AND MERGING (Original Logic Unchanged) ---
991
- # # ====================================================================
992
- # items_to_sort = []
993
-
994
- # for ocr_word in raw_ocr_output:
995
- # is_suppressed = False
996
- # for component in component_metadata:
997
- # # Do not include words that are inside figure/equation boxes
998
- # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
999
- # if ioa > IOA_SUPPRESSION_THRESHOLD:
1000
- # is_suppressed = True
1001
- # break
1002
- # if not is_suppressed:
1003
- # items_to_sort.append(ocr_word)
1004
-
1005
- # # Add figures/equations back into the flow as "words"
1006
- # items_to_sort.extend(component_metadata)
1007
-
1008
- # # ====================================================================
1009
- # # --- STEP 7: LINE-BASED SORTING (Original Logic Unchanged) ---
1010
- # # ====================================================================
1011
- # items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
1012
- # lines = []
1013
-
1014
- # for item in items_to_sort:
1015
- # placed = False
1016
- # for line in lines:
1017
- # y_ref = min(it['y0'] for it in line)
1018
- # if abs(y_ref - item['y0']) < LINE_TOLERANCE:
1019
- # line.append(item)
1020
- # placed = True
1021
- # break
1022
- # if not placed and item['type'] in ['equation', 'figure']:
1023
- # for line in lines:
1024
- # y_ref = min(it['y0'] for it in line)
1025
- # if abs(y_ref - item['y0']) < 20:
1026
- # line.append(item)
1027
- # placed = True
1028
- # break
1029
- # if not placed:
1030
- # lines.append([item])
1031
-
1032
- # for line in lines:
1033
- # line.sort(key=lambda x: x['x0'])
1034
-
1035
- # final_output = []
1036
- # for line in lines:
1037
- # for item in line:
1038
- # data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
1039
- # if 'tag' in item: data_item['tag'] = item['tag']
1040
- # final_output.append(data_item)
1041
-
1042
- # return final_output, page_separator_x
1043
-
1044
-
1045
-
1046
-
1047
-
1048
-
1049
-
1050
-
1051
-
1052
-
1053
-
1054
-
1055
-
1056
-
1057
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
1058
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
1059
 
@@ -1406,158 +1097,6 @@ def convert_raw_predictions_to_label_studio(page_data_list, output_path: str):
1406
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
1407
  # ============================================================================
1408
 
1409
- # def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
1410
- # print("\n" + "=" * 80)
1411
- # print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
1412
- # print("=" * 80)
1413
- # try:
1414
- # with open(input_path, 'r', encoding='utf-8') as f:
1415
- # predictions_by_page = json.load(f)
1416
- # except Exception as e:
1417
- # print(f"❌ Error loading raw prediction file: {e}")
1418
- # return None
1419
-
1420
- # predictions = []
1421
- # for page_item in predictions_by_page:
1422
- # if isinstance(page_item, dict) and 'data' in page_item:
1423
- # predictions.extend(page_item['data'])
1424
-
1425
- # structured_data = []
1426
- # current_item = None
1427
- # current_option_key = None
1428
- # current_passage_buffer = []
1429
- # current_text_buffer = []
1430
- # first_question_started = False
1431
- # last_entity_type = None
1432
- # just_finished_i_option = False
1433
- # is_in_new_passage = False
1434
-
1435
- # def finalize_passage_to_item(item, passage_buffer):
1436
- # if passage_buffer:
1437
- # passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
1438
- # if item.get('passage'): item['passage'] += ' ' + passage_text
1439
- # else: item['passage'] = passage_text
1440
- # passage_buffer.clear()
1441
-
1442
- # for item in predictions:
1443
- # word = item['word']
1444
- # label = item['predicted_label']
1445
- # entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
1446
- # current_text_buffer.append(word)
1447
- # previous_entity_type = last_entity_type
1448
- # is_passage_label = (entity_type == 'PASSAGE')
1449
-
1450
- # if not first_question_started:
1451
- # if label != 'B-QUESTION' and not is_passage_label:
1452
- # just_finished_i_option = False
1453
- # is_in_new_passage = False
1454
- # continue
1455
- # if is_passage_label:
1456
- # current_passage_buffer.append(word)
1457
- # last_entity_type = 'PASSAGE'
1458
- # just_finished_i_option = False
1459
- # is_in_new_passage = False
1460
- # continue
1461
-
1462
- # if label == 'B-QUESTION':
1463
- # if not first_question_started:
1464
- # header_text = ' '.join(current_text_buffer[:-1]).strip()
1465
- # if header_text or current_passage_buffer:
1466
- # metadata_item = {'type': 'METADATA', 'passage': ''}
1467
- # finalize_passage_to_item(metadata_item, current_passage_buffer)
1468
- # if header_text: metadata_item['text'] = header_text
1469
- # structured_data.append(metadata_item)
1470
- # first_question_started = True
1471
- # current_text_buffer = [word]
1472
-
1473
- # if current_item is not None:
1474
- # finalize_passage_to_item(current_item, current_passage_buffer)
1475
- # current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
1476
- # structured_data.append(current_item)
1477
- # current_text_buffer = [word]
1478
-
1479
- # current_item = {
1480
- # 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
1481
- # }
1482
- # current_option_key = None
1483
- # last_entity_type = 'QUESTION'
1484
- # just_finished_i_option = False
1485
- # is_in_new_passage = False
1486
- # continue
1487
-
1488
- # if current_item is not None:
1489
- # if is_in_new_passage:
1490
- # current_item['new_passage'] += f' {word}'
1491
- # if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
1492
- # is_in_new_passage = False
1493
- # if label.startswith(('B-', 'I-')): last_entity_type = entity_type
1494
- # continue
1495
- # is_in_new_passage = False
1496
-
1497
- # if label.startswith('B-'):
1498
- # if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
1499
- # finalize_passage_to_item(current_item, current_passage_buffer)
1500
- # current_passage_buffer = []
1501
- # last_entity_type = entity_type
1502
- # if entity_type == 'PASSAGE':
1503
- # if previous_entity_type == 'OPTION' and just_finished_i_option:
1504
- # current_item['new_passage'] = word
1505
- # is_in_new_passage = True
1506
- # else:
1507
- # current_passage_buffer.append(word)
1508
- # elif entity_type == 'OPTION':
1509
- # current_option_key = word
1510
- # current_item['options'][current_option_key] = word
1511
- # just_finished_i_option = False
1512
- # elif entity_type == 'ANSWER':
1513
- # current_item['answer'] = word
1514
- # current_option_key = None
1515
- # just_finished_i_option = False
1516
- # elif entity_type == 'QUESTION':
1517
- # current_item['question'] += f' {word}'
1518
- # just_finished_i_option = False
1519
-
1520
- # elif label.startswith('I-'):
1521
- # if entity_type == 'QUESTION':
1522
- # current_item['question'] += f' {word}'
1523
- # elif entity_type == 'PASSAGE':
1524
- # if previous_entity_type == 'OPTION' and just_finished_i_option:
1525
- # current_item['new_passage'] = word
1526
- # is_in_new_passage = True
1527
- # else:
1528
- # if not current_passage_buffer: last_entity_type = 'PASSAGE'
1529
- # current_passage_buffer.append(word)
1530
- # elif entity_type == 'OPTION' and current_option_key is not None:
1531
- # current_item['options'][current_option_key] += f' {word}'
1532
- # just_finished_i_option = True
1533
- # elif entity_type == 'ANSWER':
1534
- # current_item['answer'] += f' {word}'
1535
- # just_finished_i_option = (entity_type == 'OPTION')
1536
-
1537
- # elif label == 'O':
1538
- # if last_entity_type == 'QUESTION':
1539
- # current_item['question'] += f' {word}'
1540
- # just_finished_i_option = False
1541
-
1542
- # if current_item is not None:
1543
- # finalize_passage_to_item(current_item, current_passage_buffer)
1544
- # current_item['text'] = ' '.join(current_text_buffer).strip()
1545
- # structured_data.append(current_item)
1546
-
1547
- # for item in structured_data:
1548
- # item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
1549
- # if 'new_passage' in item:
1550
- # item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
1551
-
1552
- # try:
1553
- # with open(output_path, 'w', encoding='utf-8') as f:
1554
- # json.dump(structured_data, f, indent=2, ensure_ascii=False)
1555
- # except Exception: pass
1556
-
1557
- # return structured_data
1558
-
1559
-
1560
-
1561
 
1562
 
1563
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
@@ -1729,6 +1268,8 @@ def create_query_text(entry: Dict[str, Any]) -> str:
1729
  query_parts.append(value)
1730
  return " ".join(query_parts)
1731
 
 
 
1732
  def calculate_similarity(doc1: str, doc2: str) -> float:
1733
  """Calculates Cosine Similarity between two text strings."""
1734
  if not doc1 or not doc2:
@@ -1759,123 +1300,6 @@ def calculate_similarity(doc1: str, doc2: str) -> float:
1759
 
1760
 
1761
 
1762
-
1763
-
1764
-
1765
-
1766
-
1767
- # def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1768
- # """
1769
- # Links questions to passages based on 'passage' flow vs 'new_passage' priority.
1770
- # """
1771
- # print("\n" + "=" * 80)
1772
- # print("--- STARTING CONTEXT LINKING AND SELF-CORRECTION (DEBUG MODE) ---")
1773
- # print("=" * 80)
1774
-
1775
- # if not data: return []
1776
-
1777
- # # --- PHASE 1: IDENTIFY PASSAGE DEFINERS ---
1778
- # passage_definer_indices = []
1779
- # for i, entry in enumerate(data):
1780
- # # We track metadata indices too now, as they are valid sources
1781
- # if entry.get("passage") and entry["passage"].strip():
1782
- # passage_definer_indices.append(i)
1783
- # if entry.get("new_passage") and entry["new_passage"].strip():
1784
- # if i not in passage_definer_indices:
1785
- # passage_definer_indices.append(i)
1786
-
1787
- # # --- PHASE 2: CONTEXT TRANSFER & LINKING ---
1788
- # current_passage_text = None
1789
- # current_new_passage_text = None
1790
-
1791
- # # DEBUG: Check what the first item is offering
1792
- # if data and data[0].get('type') == 'METADATA':
1793
- # print(f" [Debug] Found METADATA at start. Length of passage: {len(data[0].get('passage', ''))}")
1794
-
1795
- # for i, entry in enumerate(data):
1796
- # item_type = entry.get("type", "Question")
1797
-
1798
- # # A. UNCONDITIONALLY UPDATE CONTEXTS
1799
- # # FIX: Removed 'and entry.get("type") != "METADATA"'
1800
- # # We WANT Metadata to update the current_passage_text
1801
- # if entry.get("passage") and entry["passage"].strip():
1802
- # current_passage_text = entry["passage"]
1803
- # # print(f" [Flow] Updated Standard Context from Item {i} ({item_type})")
1804
-
1805
- # if entry.get("new_passage") and entry["new_passage"].strip():
1806
- # current_new_passage_text = entry["new_passage"]
1807
- # # print(f" [Flow] Updated New/Local Context from Item {i} ({item_type})")
1808
-
1809
- # # B. QUESTION LINKING
1810
- # if entry.get("question") and item_type != "METADATA":
1811
- # combined_query = create_query_text(entry)
1812
-
1813
- # # Skip if query is too short (noise)
1814
- # if len(combined_query.strip()) < 5:
1815
- # continue
1816
-
1817
- # # Calculate scores
1818
- # score_old = calculate_similarity(current_passage_text, combined_query) if current_passage_text else 0.0
1819
- # score_new = calculate_similarity(current_new_passage_text, combined_query) if current_new_passage_text else 0.0
1820
-
1821
- # q_preview = entry['question'][:30] + '...'
1822
-
1823
- # # DEBUG PRINT to see why it might be failing
1824
- # # print(f" [Check Q{i}] Old_Ctx_Len: {len(str(current_passage_text))} | Score: {score_old:.4f}")
1825
-
1826
- # # RESOLUTION LOGIC
1827
- # linked = False
1828
-
1829
- # # 1. Prefer New Passage if significantly better
1830
- # if current_new_passage_text and (score_new > score_old + RESOLUTION_MARGIN) and (score_new >= SIMILARITY_THRESHOLD):
1831
- # entry["passage"] = current_new_passage_text
1832
- # print(f" [Linker] 🚀 Q{i} ('{q_preview}') -> NEW PASSAGE (Score: {score_new:.3f})")
1833
- # linked = True
1834
-
1835
- # # 2. Otherwise use Standard Passage if it meets threshold
1836
- # elif current_passage_text and (score_old >= SIMILARITY_THRESHOLD):
1837
- # entry["passage"] = current_passage_text
1838
- # print(f" [Linker] ✅ Q{i} ('{q_preview}') -> STANDARD PASSAGE (Score: {score_old:.3f})")
1839
- # linked = True
1840
-
1841
- # if not linked:
1842
- # print(f" [Linker] ⚠️ Q{i} NOT LINKED. Max Score: {max(score_old, score_new):.4f} < Threshold {SIMILARITY_THRESHOLD}")
1843
-
1844
- # # --- PHASE 3: CLEANUP AND INTERPOLATION ---
1845
- # print(" [Linker] Running Cleanup & Interpolation...")
1846
-
1847
- # # 3A. Self-Correction (Remove weak links)
1848
- # for i in passage_definer_indices:
1849
- # entry = data[i]
1850
- # # Don't wipe out Metadata passages, only questions that got linked
1851
- # if entry.get("question") and entry.get("type") != "METADATA":
1852
- # passage_to_check = entry.get("passage") or entry.get("new_passage")
1853
- # if passage_to_check:
1854
- # self_sim = calculate_similarity(passage_to_check, create_query_text(entry))
1855
- # if self_sim < SIMILARITY_THRESHOLD:
1856
- # entry["passage"] = ""
1857
- # if "new_passage" in entry: entry["new_passage"] = ""
1858
- # print(f" [Cleanup] Removed weak link for Q{i}")
1859
-
1860
- # # 3B. Interpolation (Fill gaps)
1861
- # for i in range(1, len(data) - 1):
1862
- # current_entry = data[i]
1863
- # is_gap = current_entry.get("question") and not current_entry.get("passage")
1864
- # if is_gap:
1865
- # prev_p = data[i - 1].get("passage")
1866
- # next_p = data[i + 1].get("passage")
1867
- # if prev_p and next_p and (prev_p == next_p):
1868
- # current_entry["passage"] = prev_p
1869
- # print(f" [Linker] 🥪 Q{i} Interopolated from neighbors.")
1870
-
1871
- # return data
1872
-
1873
-
1874
-
1875
-
1876
-
1877
-
1878
-
1879
  def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1880
  """
1881
  Links questions to passages based on 'passage' flow vs 'new_passage' priority.
@@ -2024,8 +1448,6 @@ def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Di
2024
 
2025
 
2026
 
2027
-
2028
-
2029
  # ============================================================================
2030
  # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
2031
  # ============================================================================
@@ -2143,82 +1565,6 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label
2143
  return final_result
2144
 
2145
 
2146
- # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label_studio_output_path: str) -> Optional[List[Dict[str, Any]]]:
2147
- # if not os.path.exists(input_pdf_path): return None
2148
-
2149
- # print("\n" + "#" * 80)
2150
- # print("### STARTING PIPELINE WITH DEBUGGING ENABLED ###")
2151
- # print("#" * 80)
2152
-
2153
- # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2154
- # # Save debug files in the CURRENT directory so you can find them easily
2155
- # debug_dir = os.path.abspath(os.path.dirname(input_pdf_path))
2156
-
2157
- # preprocessed_json_path = os.path.join(tempfile.gettempdir(), f"{pdf_name}_preprocessed.json")
2158
-
2159
- # # --- DEBUG FILE PATHS ---
2160
- # debug_step2_path = os.path.join(debug_dir, f"DEBUG_2_raw_model_predictions.json")
2161
- # debug_step3_path = os.path.join(debug_dir, f"DEBUG_3_bio_parsed_before_linking.json")
2162
- # final_output_path = os.path.join(debug_dir, f"{pdf_name}_final_output.json")
2163
-
2164
- # final_result = None
2165
- # try:
2166
- # # Phase 1: Preprocessing
2167
- # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2168
- # if not preprocessed_json_path_out: return None
2169
-
2170
- # # Phase 2: Inference (Model Predictions)
2171
- # page_raw_predictions_list = run_inference_and_get_raw_words(
2172
- # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2173
- # )
2174
- # if not page_raw_predictions_list: return None
2175
-
2176
- # # 🔍 DEBUG SAVE 1: RAW PREDICTIONS
2177
- # # Open this file to see if the model actually predicted "B-PASSAGE" or "I-PASSAGE"
2178
- # print(f" [DEBUG] Saving Raw Model Predictions to: {debug_step2_path}")
2179
- # with open(debug_step2_path, 'w', encoding='utf-8') as f:
2180
- # json.dump(page_raw_predictions_list, f, indent=4)
2181
-
2182
- # # Phase 3: BIO Decoding
2183
- # structured_data_list = convert_bio_to_structured_json_relaxed(
2184
- # debug_step2_path, debug_step3_path
2185
- # )
2186
- # if not structured_data_list: return None
2187
-
2188
- # # 🔍 DEBUG SAVE 2: STRUCTURED DATA (Before Context Linking)
2189
- # # Open this file to see if 'new_passage' exists BEFORE we run the linker
2190
- # print(f" [DEBUG] Saving Parsed Data (Pre-Link) to: {debug_step3_path}")
2191
- # # (The function convert_bio... already saved to debug_step3_path, so we just use it)
2192
-
2193
- # structured_data_list = correct_misaligned_options(structured_data_list)
2194
-
2195
- # # Phase 3.5: Context Linking
2196
- # # We run this on the memory object 'structured_data_list'
2197
- # structured_data_list = process_context_linking(structured_data_list)
2198
-
2199
- # try:
2200
- # convert_raw_predictions_to_label_studio(page_raw_predictions_list, label_studio_output_path)
2201
- # except Exception as e:
2202
- # print(f"❌ Error during Label Studio conversion: {e}")
2203
-
2204
- # # Phase 4: Embedding
2205
- # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2206
-
2207
- # except Exception as e:
2208
- # print(f"❌ FATAL ERROR: {e}")
2209
- # import traceback
2210
- # traceback.print_exc()
2211
- # return None
2212
-
2213
- # print("\n" + "#" * 80)
2214
- # print("### PIPELINE COMPLETE ###")
2215
- # print("#" * 80)
2216
- # return final_result
2217
-
2218
-
2219
-
2220
-
2221
-
2222
  if __name__ == "__main__":
2223
  parser = argparse.ArgumentParser(description="Complete Pipeline")
2224
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
 
256
 
257
 
258
 
 
 
259
  # ============================================================================
260
  # --- MISSING HELPER FUNCTION ---
261
  # ============================================================================
 
315
  return coverage_ratio
316
 
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
 
320
  def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
 
395
  return sorted(final_separators)
396
 
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
 
400
 
 
745
 
746
 
747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
  def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
749
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
750
 
 
1097
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
1098
  # ============================================================================
1099
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1100
 
1101
 
1102
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
 
1268
  query_parts.append(value)
1269
  return " ".join(query_parts)
1270
 
1271
+
1272
+
1273
  def calculate_similarity(doc1: str, doc2: str) -> float:
1274
  """Calculates Cosine Similarity between two text strings."""
1275
  if not doc1 or not doc2:
 
1300
 
1301
 
1302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1303
  def process_context_linking(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1304
  """
1305
  Links questions to passages based on 'passage' flow vs 'new_passage' priority.
 
1448
 
1449
 
1450
 
 
 
1451
  # ============================================================================
1452
  # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
1453
  # ============================================================================
 
1565
  return final_result
1566
 
1567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1568
  if __name__ == "__main__":
1569
  parser = argparse.ArgumentParser(description="Complete Pipeline")
1570
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")