heerjtdev commited on
Commit
0f4dfc1
·
1 Parent(s): dc4a82b

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +285 -259
working_yolo_pipeline.py CHANGED
@@ -1202,14 +1202,6 @@
1202
 
1203
 
1204
 
1205
-
1206
-
1207
-
1208
-
1209
-
1210
-
1211
-
1212
-
1213
 
1214
 
1215
 
@@ -1598,49 +1590,6 @@ def get_word_data_for_detection(page: fitz.Page, top_margin_percent=0.10, bottom
1598
  return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
1599
 
1600
 
1601
- # def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
1602
- # if not word_data: return []
1603
- # x_points = []
1604
- # for _, x1, _, x2, _ in word_data: x_points.extend([x1, x2])
1605
- # max_x = max(x_points)
1606
- # bin_size = params['cluster_bin_size']
1607
- # num_bins = int(np.ceil(max_x / bin_size))
1608
- # hist, bin_edges = np.histogram(x_points, bins=num_bins, range=(0, max_x))
1609
- # smoothed_hist = gaussian_filter1d(hist.astype(float), sigma=params['cluster_smoothing'])
1610
- # inverted_signal = np.max(smoothed_hist) - smoothed_hist
1611
- #
1612
- # peaks, properties = find_peaks(
1613
- # inverted_signal, height=0, distance=params['cluster_min_width'] / bin_size
1614
- # )
1615
- #
1616
- # if not peaks.size: return []
1617
- #
1618
- # threshold_value = np.percentile(smoothed_hist, params['cluster_threshold_percentile'])
1619
- # inverted_threshold = np.max(smoothed_hist) - threshold_value
1620
- # significant_peaks = peaks[properties['peak_heights'] >= inverted_threshold]
1621
- # separator_x_coords = [int(bin_edges[p]) for p in significant_peaks]
1622
- #
1623
- # final_separators = []
1624
- # prominence_threshold = params['cluster_prominence'] * np.max(smoothed_hist)
1625
- #
1626
- # for x_coord in separator_x_coords:
1627
- # bin_idx = np.searchsorted(bin_edges, x_coord) - 1
1628
- # window_size = int(params['cluster_min_width'] / bin_size)
1629
- #
1630
- # left_start, left_end = max(0, bin_idx - window_size), bin_idx
1631
- # right_start, right_end = bin_idx + 1, min(len(smoothed_hist), bin_idx + 1 + window_size)
1632
- #
1633
- # if left_end <= left_start or right_end <= right_start: continue
1634
- #
1635
- # avg_left_density = np.mean(smoothed_hist[left_start:left_end])
1636
- # avg_right_density = np.mean(smoothed_hist[right_start:right_end])
1637
- #
1638
- # if avg_left_density >= prominence_threshold and avg_right_density >= prominence_threshold:
1639
- # final_separators.append(x_coord)
1640
- #
1641
- # return sorted(final_separators)
1642
-
1643
-
1644
  def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
1645
  """Calculates the X-axis histogram and detects significant gutters."""
1646
  if not word_data: return []
@@ -1953,6 +1902,9 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1953
  final_page_predictions = []
1954
  CHUNK_SIZE = 500
1955
 
 
 
 
1956
  for page_data in preprocessed_data:
1957
  page_num_1_based = page_data['page_number']
1958
  page_num_0_based = page_num_1_based - 1
@@ -2092,189 +2044,131 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
2092
 
2093
 
2094
 
2095
- # def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
2096
- # preprocessed_json_path: str,
2097
- # column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
2098
- # """Runs LayoutLMv3-CRF inference and returns the raw word-level predictions, grouped by page."""
2099
- # start_time_overall = time.time()
2100
- # print("\n" + "=" * 80)
2101
- # print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2102
- # print("=" * 80)
2103
- #
2104
- # # --- MODEL & TOKENIZER SETUP ---
2105
- # start_time_setup = time.time()
2106
- # tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
2107
- # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2108
- #
2109
- # try:
2110
- # model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2111
- # checkpoint = torch.load(model_path, map_location=device)
2112
- # model_state = checkpoint.get('model_state_dict', checkpoint)
2113
- # # Fix for potential key mismatch
2114
- # fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
2115
- # model.load_state_dict(fixed_state_dict)
2116
- # model.to(device)
2117
- # model.eval()
2118
- # print(f" [LOG] Model loaded and moved to {device} in {time.time() - start_time_setup:.2f}s.")
2119
- # except Exception as e:
2120
- # print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
2121
- # return []
2122
- #
2123
- # # --- DATA LOADING ---
2124
- # try:
2125
- # with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
2126
- # preprocessed_data = json.load(f)
2127
- # except Exception as e:
2128
- # print(f" ERROR loading preprocessed JSON: {e}")
2129
- # return []
2130
- #
2131
- # try:
2132
- # doc = fitz.open(pdf_path)
2133
- # except Exception as e:
2134
- # print(f"❌ ERROR loading PDF file: {e}")
2135
- # return []
2136
- #
2137
- # final_page_predictions = []
2138
- # CHUNK_SIZE = 500
2139
- #
2140
- # # --- PAGE ITERATION LOOP ---
2141
- # for page_data in preprocessed_data:
2142
- # start_time_page = time.time()
2143
- # page_num_1_based = page_data['page_number']
2144
- # page_num_0_based = page_num_1_based - 1
2145
- # page_raw_predictions = []
2146
- #
2147
- # fitz_page = doc.load_page(page_num_0_based)
2148
- # page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
2149
- # num_words_on_page = len(page_data['data'])
2150
- #
2151
- # print(f" -> Inferring Page {page_num_1_based} ({num_words_on_page} words)...")
2152
- #
2153
- # # --- COORDINATE NORMALIZATION & DATA PREP ---
2154
- # words, bboxes_raw_pdf_space, normalized_bboxes_list = [], [], []
2155
- # scale_factor = 2.0
2156
- #
2157
- # for item in page_data['data']:
2158
- # word, raw_yolo_bbox = item['word'], item['bbox']
2159
- #
2160
- # bbox_pdf = [
2161
- # int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
2162
- # int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
2163
- # ]
2164
- #
2165
- # normalized_bbox = [
2166
- # max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
2167
- # max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
2168
- # max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
2169
- # max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
2170
- # ]
2171
- #
2172
- # words.append(word)
2173
- # bboxes_raw_pdf_space.append(bbox_pdf)
2174
- # normalized_bboxes_list.append(normalized_bbox)
2175
- #
2176
- # if not words:
2177
- # print(f" [LOG] Skipped Page {page_num_1_based} (0 words found).")
2178
- # continue
2179
- #
2180
- # # --- COLUMN DETECTION & CHUNKING ---
2181
- # start_time_col_detect = time.time()
2182
- # column_detection_params = column_detection_params or {}
2183
- # column_separator_x = detect_column_gutters(pdf_path, page_num_0_based, **column_detection_params)
2184
- # word_chunks = _merge_integrity(words, bboxes_raw_pdf_space, column_separator_x)
2185
- # print(f" [LOG] Column detection and word chunking took {time.time() - start_time_col_detect:.3f}s.")
2186
- #
2187
- # # --- INFERENCE BATCHING ---
2188
- # current_global_index = 0
2189
- # total_inference_time = 0
2190
- #
2191
- # for chunk_words_original in word_chunks:
2192
- # if not chunk_words_original: continue
2193
- #
2194
- # # Reconstruct the aligned chunk (alignment logic unchanged)
2195
- # chunk_words, chunk_normalized_bboxes, chunk_bboxes_pdf = [], [], []
2196
- # temp_global_index = current_global_index
2197
- # for i in range(len(words)):
2198
- # if temp_global_index <= i and words[i] in chunk_words_original:
2199
- # if words[i] == chunk_words_original[len(chunk_words)]:
2200
- # chunk_words.append(words[i])
2201
- # chunk_normalized_bboxes.append(normalized_bboxes_list[i])
2202
- # chunk_bboxes_pdf.append(bboxes_raw_pdf_space[i])
2203
- # current_global_index = i + 1
2204
- # if len(chunk_words) == len(chunk_words_original):
2205
- # break
2206
- #
2207
- # # Inference in sub-batches
2208
- # for i in range(0, len(chunk_words), CHUNK_SIZE):
2209
- # start_time_inference_batch = time.time()
2210
- # sub_words = chunk_words[i:i + CHUNK_SIZE]
2211
- # sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
2212
- # sub_bboxes_pdf = chunk_bboxes_pdf[i:i + CHUNK_SIZE]
2213
- #
2214
- # if not sub_words: continue
2215
- #
2216
- # # Tokenization and Model Call (Inference)
2217
- # encoded_input = tokenizer(
2218
- # sub_words, boxes=sub_bboxes, truncation=True, padding="max_length",
2219
- # max_length=512, return_tensors="pt"
2220
- # )
2221
- #
2222
- # input_ids = encoded_input['input_ids'].to(device)
2223
- # bbox = encoded_input['bbox'].to(device)
2224
- # attention_mask = encoded_input['attention_mask'].to(device)
2225
- #
2226
- # with torch.no_grad():
2227
- # predictions_int_list = model(input_ids, bbox, attention_mask)
2228
- #
2229
- # if not predictions_int_list: continue
2230
- #
2231
- # # Post-processing and prediction mapping (unchanged)
2232
- # predictions_int = predictions_int_list[0]
2233
- # word_ids = encoded_input.word_ids()
2234
- # word_idx_to_pred_id = {}
2235
- #
2236
- # for token_idx, word_idx in enumerate(word_ids):
2237
- # if word_idx is not None and word_idx < len(sub_words):
2238
- # if word_idx not in word_idx_to_pred_id:
2239
- # word_idx_to_pred_id[word_idx] = predictions_int[token_idx]
2240
- #
2241
- # for current_word_idx in range(len(sub_words)):
2242
- # pred_id_or_tensor = word_idx_to_pred_id.get(current_word_idx, 0)
2243
- # pred_id = pred_id_or_tensor.item() if torch.is_tensor(pred_id_or_tensor) else pred_id_or_tensor
2244
- # predicted_label = ID_TO_LABEL[pred_id]
2245
- #
2246
- # page_raw_predictions.append({
2247
- # "word": sub_words[current_word_idx],
2248
- # "bbox": sub_bboxes_pdf[current_word_idx],
2249
- # "predicted_label": predicted_label,
2250
- # "page_number": page_num_1_based
2251
- # })
2252
- #
2253
- # batch_inference_time = time.time() - start_time_inference_batch
2254
- # total_inference_time += batch_inference_time
2255
- # # Optional: Log per-batch inference time if needed for deep debugging
2256
- # # print(f" [LOG] Batch inference ({len(sub_words)} words) took {batch_inference_time:.3f}s.")
2257
- #
2258
- # if page_raw_predictions:
2259
- # final_page_predictions.append({
2260
- # "page_number": page_num_1_based,
2261
- # "data": page_raw_predictions
2262
- # })
2263
- #
2264
- # print(f" [LOG] Total inference time for Page {page_num_1_based} (all batches): {total_inference_time:.2f}s")
2265
- # print(f" [LOG] Total processing time for Page {page_num_1_based}: {time.time() - start_time_page:.2f}s")
2266
- #
2267
- # doc.close()
2268
- #
2269
- # total_elapsed_time = time.time() - start_time_overall
2270
- # print(f"✅ LayoutLMv3 inference complete. Predicted tags for {len(final_page_predictions)} pages.")
2271
- # print(f" [LOG] Overall LayoutLMv3 Inference Pipeline Duration: {total_elapsed_time:.2f}s.")
2272
- # return final_page_predictions
2273
 
2274
 
2275
  # ============================================================================
2276
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER (Modified for In-Memory Return) ---
2277
  # ============================================================================
 
 
 
2278
 
2279
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
2280
  """
@@ -2333,31 +2227,41 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2333
  entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
2334
  current_text_buffer.append(word)
2335
  previous_entity_type = last_entity_type
2336
- is_passage_label = (label == 'B-PASSAGE' or label == 'I-PASSAGE')
2337
 
2338
- if not first_question_started and label != 'B-QUESTION' and not is_passage_label:
2339
- just_finished_i_option = False
2340
- is_in_new_passage = False
2341
- continue
2342
 
2343
- if not first_question_started and is_passage_label:
2344
- if label == 'B-PASSAGE' or label == 'I-PASSAGE' or not current_passage_buffer:
 
 
 
 
 
 
 
 
 
2345
  current_passage_buffer.append(word)
2346
  last_entity_type = 'PASSAGE'
2347
- just_finished_i_option = False
2348
- is_in_new_passage = False
2349
- continue
 
 
 
 
2350
 
2351
  if label == 'B-QUESTION':
2352
  if not first_question_started:
 
2353
  header_text = ' '.join(current_text_buffer[:-1]).strip()
2354
  if header_text or current_passage_buffer:
2355
  metadata_item = {'type': 'METADATA', 'passage': ''}
2356
- if current_passage_buffer:
2357
- finalize_passage_to_item(metadata_item, current_passage_buffer)
2358
- if header_text:
2359
- metadata_item['text'] = header_text
2360
- elif header_text:
2361
  metadata_item['text'] = header_text
2362
  structured_data.append(metadata_item)
2363
  first_question_started = True
@@ -2382,8 +2286,13 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2382
  is_in_new_passage = False
2383
  continue
2384
 
 
 
 
 
2385
  if current_item is not None:
2386
  if is_in_new_passage:
 
2387
  current_item['new_passage'] += f' {word}'
2388
  if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
2389
  is_in_new_passage = False
@@ -2392,18 +2301,25 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2392
  continue
2393
 
2394
  is_in_new_passage = False
 
 
2395
  if label.startswith('B-'):
2396
- if entity_type != 'PASSAGE':
 
 
2397
  finalize_passage_to_item(current_item, current_passage_buffer)
2398
  current_passage_buffer = []
 
2399
  last_entity_type = entity_type
2400
 
2401
  if entity_type == 'PASSAGE':
 
 
2402
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2403
  current_item['new_passage'] = word
2404
  is_in_new_passage = True
2405
  else:
2406
- current_passage_buffer.append(word)
2407
  elif entity_type == 'OPTION':
2408
  current_option_key = word
2409
  current_item['options'][current_option_key] = word
@@ -2416,20 +2332,22 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2416
  current_item['question'] += f' {word}'
2417
  just_finished_i_option = False
2418
 
 
2419
  elif label.startswith('I-'):
2420
  if entity_type == 'QUESTION' and current_item.get('question'):
2421
  current_item['question'] += f' {word}'
2422
  last_entity_type = 'QUESTION'
2423
  just_finished_i_option = False
2424
  elif entity_type == 'PASSAGE':
 
2425
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2426
  current_item['new_passage'] = word
2427
  is_in_new_passage = True
2428
  else:
2429
- if last_entity_type == 'QUESTION' and current_item.get('question'):
2430
- last_entity_type = 'PASSAGE'
2431
  if last_entity_type == 'PASSAGE' or not current_passage_buffer:
2432
- current_passage_buffer.append(word)
2433
  last_entity_type = 'PASSAGE'
2434
  just_finished_i_option = False
2435
  elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
@@ -2441,11 +2359,21 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2441
  else:
2442
  just_finished_i_option = False
2443
 
 
2444
  elif label == 'O':
2445
- if last_entity_type == 'QUESTION' and current_item and 'question' in current_item:
 
 
 
 
 
2446
  current_item['question'] += f' {word}'
2447
  just_finished_i_option = False
2448
 
 
 
 
 
2449
  # --- Finalize last item ---
2450
  if current_item is not None:
2451
  finalize_passage_to_item(current_item, current_passage_buffer)
@@ -2475,6 +2403,74 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2475
  return structured_data
2476
 
2477
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2478
  # ============================================================================
2479
  # --- PHASE 4: IMAGE EMBEDDING (Modified for In-Memory Return) ---
2480
  # ============================================================================
@@ -2554,7 +2550,7 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2554
  # --- MAIN FUNCTION (The Callable Interface) ---
2555
  # ============================================================================
2556
 
2557
- def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[List[Dict[str, Any]]]:
2558
  """
2559
  Executes the full document analysis pipeline: YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed.
2560
 
@@ -2638,6 +2634,16 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
2638
  print("Pipeline aborted: Failed to convert BIO tags to structured data in Phase 3.")
2639
  return None
2640
 
 
 
 
 
 
 
 
 
 
 
2641
  # --- D. PHASE 4: IMAGE EMBEDDING (Base64) ---
2642
  final_result = embed_images_as_base64_in_memory(
2643
  structured_data_list,
@@ -2675,6 +2681,7 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
2675
  return final_result
2676
 
2677
 
 
2678
  if __name__ == "__main__":
2679
  parser = argparse.ArgumentParser(
2680
  description="Complete Document Analysis Pipeline (YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed).")
@@ -2684,18 +2691,38 @@ if __name__ == "__main__":
2684
  default=DEFAULT_LAYOUTLMV3_MODEL_PATH,
2685
  help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
2686
 
 
 
 
 
2687
  args = parser.parse_args()
2688
 
2689
- # --- Call the main function ---
2690
- final_json_data = run_document_pipeline(args.input_pdf, args.layoutlmv3_model_path)
2691
 
2692
- if final_json_data:
2693
- # Example of what to do with the returned data: Save it to a file
2694
- output_file_name = os.path.splitext(os.path.basename(args.input_pdf))[0] + "_final_output_embedded.json"
 
 
 
 
 
 
 
 
2695
 
2696
- # Determine where to save the final output (e.g., current directory)
2697
- final_output_path = os.path.abspath(output_file_name)
2698
 
 
 
 
 
 
 
 
 
 
2699
  with open(final_output_path, 'w', encoding='utf-8') as f:
2700
  json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2701
 
@@ -2703,4 +2730,3 @@ if __name__ == "__main__":
2703
 
2704
 
2705
 
2706
-
 
1202
 
1203
 
1204
 
 
 
 
 
 
 
 
 
1205
 
1206
 
1207
 
 
1590
  return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
1591
 
1592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1593
  def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
1594
  """Calculates the X-axis histogram and detects significant gutters."""
1595
  if not word_data: return []
 
1902
  final_page_predictions = []
1903
  CHUNK_SIZE = 500
1904
 
1905
+
1906
+ all_pages_word_level_results = []
1907
+
1908
  for page_data in preprocessed_data:
1909
  page_num_1_based = page_data['page_number']
1910
  page_num_0_based = page_num_1_based - 1
 
2044
 
2045
 
2046
 
2047
def create_label_studio_span(page_results, start_idx, end_idx, label):
    """Create a Label Studio span with character-level offsets and bbox.

    Args:
        page_results: Per-word dicts for one page; each must carry 'word' and
            'bbox' ([x0, y0, x1, y1]) keys.
        start_idx: Index of the first word of the entity (inclusive).
        end_idx: Index of the last word of the entity (inclusive).
        label: Entity label to attach to the span.

    Returns:
        A Label Studio "labels" result dict. Character offsets are computed
        against the page text formed by joining all page words with single
        spaces.
    """
    # Words and bboxes for the specific entity span (end_idx is inclusive).
    entity_words = [page_results[i]['word'] for i in range(start_idx, end_idx + 1)]
    entity_bboxes = [page_results[i]['bbox'] for i in range(start_idx, end_idx + 1)]

    # Encompassing bbox over all word boxes in the span.
    x0 = min(bbox[0] for bbox in entity_bboxes)
    y0 = min(bbox[1] for bbox in entity_bboxes)
    x1 = max(bbox[2] for bbox in entity_bboxes)
    y1 = max(bbox[3] for bbox in entity_bboxes)

    # Character offsets relative to the space-joined page text.
    all_words_on_page = [r['word'] for r in page_results]
    start_char = len(" ".join(all_words_on_page[:start_idx]))
    if start_idx != 0:
        start_char += 1  # account for the separating space before the span

    # Join once: the span text also determines the end offset.
    # (Fix: the original joined entity_words twice and built an unused
    # full-page text_string.)
    span_text = " ".join(entity_words)
    end_char = start_char + len(span_text)

    return {
        "from_name": "label",
        "to_name": "text",
        "type": "labels",
        "value": {
            "start": start_char,
            "end": end_char,
            "text": span_text,
            "labels": [label],
            "bbox": {"x": x0, "y": y0, "width": x1 - x0, "height": y1 - y0}
        },
        "score": 0.99
    }
2084
+
2085
+
2086
+
2087
+
2088
def convert_raw_predictions_to_label_studio(page_data_list, output_path: str):
    """Convert raw word-level predictions (grouped by page) to Label Studio format.

    Args:
        page_data_list: List of per-page dicts with 'page_number' and 'data',
            where 'data' is a list of word dicts carrying 'word', 'bbox' and a
            BIO 'predicted_label'.
        output_path: Path of the JSON file the Label Studio tasks are written to.
    """
    final_tasks = []
    total_spans = 0

    print("\n[PHASE: LABEL STUDIO CONVERSION]")

    for page_data in page_data_list:
        page_num = page_data['page_number']
        page_results = page_data['data']
        if not page_results:
            continue

        # Page text is the single-space join of all words; span character
        # offsets are computed against this same string by
        # create_label_studio_span.
        original_words = [r['word'] for r in page_results]
        text_string = " ".join(original_words)

        results = []
        current_entity_label = None
        current_entity_start_word_index = None

        # BIO stitching: merge a B- tag and its following same-type I- tags
        # into a single span.
        for i, pred_item in enumerate(page_results):
            label = pred_item['predicted_label']
            # Entity type without the B-/I- prefix ('O' stays as-is).
            tag_only = label.split('-', 1)[-1] if '-' in label else label

            if label.startswith('B-'):
                # A new entity starts: close any currently open span first.
                if current_entity_label:
                    results.append(
                        create_label_studio_span(page_results,
                                                 current_entity_start_word_index,
                                                 i - 1,
                                                 current_entity_label)
                    )
                current_entity_label = tag_only
                current_entity_start_word_index = i

            elif label.startswith('I-') and current_entity_label == tag_only:
                # Continuation of the open entity — extended implicitly, the
                # span end is fixed when the entity closes.
                continue

            else:
                # 'O' tag, or an I- tag that does not match the open entity:
                # close the open span. NOTE(review): a mismatched I- word is
                # itself not added to any span — confirm this is intended.
                if current_entity_label:
                    results.append(
                        create_label_studio_span(page_results,
                                                 current_entity_start_word_index,
                                                 i - 1,
                                                 current_entity_label)
                    )
                current_entity_label = None
                current_entity_start_word_index = None

        # Flush an entity still open at the end of the page.
        if current_entity_label:
            results.append(
                create_label_studio_span(page_results,
                                         current_entity_start_word_index,
                                         len(page_results) - 1,
                                         current_entity_label)
            )

        total_spans += len(results)
        print(f" -> Page {page_num}: Generated {len(results)} labeled spans.")

        # One Label Studio task per page: raw text plus the original word/bbox
        # arrays, with the stitched spans as a single pre-annotation.
        final_tasks.append({
            "data": {
                "text": text_string,
                "original_words": original_words,
                "original_bboxes": [r['bbox'] for r in page_results]
            },
            "annotations": [{"result": results}],
            "meta": {"page_number": page_num}
        })

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(final_tasks, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Label Studio tasks created and saved to {output_path}. Total {total_spans} spans.")
2163
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2164
 
2165
 
2166
  # ============================================================================
2167
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER (Modified for In-Memory Return) ---
2168
  # ============================================================================
2169
+ #
2170
+
2171
+
2172
 
2173
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
2174
  """
 
2227
  entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
2228
  current_text_buffer.append(word)
2229
  previous_entity_type = last_entity_type
2230
+ is_passage_label = (entity_type == 'PASSAGE')
2231
 
2232
+ # ----------------------------------------------------------------------
2233
+ # --- MODIFICATION AREA 1: Pre-Question Content (Metadata/Passage) ---
2234
+ # ----------------------------------------------------------------------
 
2235
 
2236
+ # If we haven't started the first question yet
2237
+ if not first_question_started:
2238
+ # Skip 'O' and non-B/I-PASSAGE tags before the first B-QUESTION
2239
+ if label != 'B-QUESTION' and not is_passage_label:
2240
+ just_finished_i_option = False
2241
+ is_in_new_passage = False
2242
+ continue
2243
+
2244
+ # Handle PASSAGE tokens before the first B-QUESTION
2245
+ if is_passage_label:
2246
+ # B-PASSAGE or I-PASSAGE always appends to the buffer here
2247
  current_passage_buffer.append(word)
2248
  last_entity_type = 'PASSAGE'
2249
+ just_finished_i_option = False
2250
+ is_in_new_passage = False
2251
+ continue
2252
+
2253
+ # ----------------------------------------------------------------------
2254
+ # --- Standard B-QUESTION Start (Split/Finalize previous item) ---
2255
+ # ----------------------------------------------------------------------
2256
 
2257
  if label == 'B-QUESTION':
2258
  if not first_question_started:
2259
+ # Handle initial header/metadata
2260
  header_text = ' '.join(current_text_buffer[:-1]).strip()
2261
  if header_text or current_passage_buffer:
2262
  metadata_item = {'type': 'METADATA', 'passage': ''}
2263
+ finalize_passage_to_item(metadata_item, current_passage_buffer)
2264
+ if header_text:
 
 
 
2265
  metadata_item['text'] = header_text
2266
  structured_data.append(metadata_item)
2267
  first_question_started = True
 
2286
  is_in_new_passage = False
2287
  continue
2288
 
2289
+ # ----------------------------------------------------------------------
2290
+ # --- Processing tokens within an active Question (current_item) ---
2291
+ # ----------------------------------------------------------------------
2292
+
2293
  if current_item is not None:
2294
  if is_in_new_passage:
2295
+ # Handle passage continuation started after an option
2296
  current_item['new_passage'] += f' {word}'
2297
  if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
2298
  is_in_new_passage = False
 
2301
  continue
2302
 
2303
  is_in_new_passage = False
2304
+
2305
+ # Case 1: Beginning of a new entity (B- tag)
2306
  if label.startswith('B-'):
2307
+
2308
+ # Check for termination entities
2309
+ if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
2310
  finalize_passage_to_item(current_item, current_passage_buffer)
2311
  current_passage_buffer = []
2312
+
2313
  last_entity_type = entity_type
2314
 
2315
  if entity_type == 'PASSAGE':
2316
+ # MODIFICATION 2: B-PASSAGE always continues the current passage buffer
2317
+ # unless immediately following an I-OPTION (which starts 'new_passage')
2318
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2319
  current_item['new_passage'] = word
2320
  is_in_new_passage = True
2321
  else:
2322
+ current_passage_buffer.append(word) # Append B-PASSAGE word
2323
  elif entity_type == 'OPTION':
2324
  current_option_key = word
2325
  current_item['options'][current_option_key] = word
 
2332
  current_item['question'] += f' {word}'
2333
  just_finished_i_option = False
2334
 
2335
+ # Case 2: Inside an existing entity (I- tag)
2336
  elif label.startswith('I-'):
2337
  if entity_type == 'QUESTION' and current_item.get('question'):
2338
  current_item['question'] += f' {word}'
2339
  last_entity_type = 'QUESTION'
2340
  just_finished_i_option = False
2341
  elif entity_type == 'PASSAGE':
2342
+ # MODIFICATION 3: I-PASSAGE always continues the current passage buffer
2343
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2344
  current_item['new_passage'] = word
2345
  is_in_new_passage = True
2346
  else:
2347
+ # Ensure last entity was PASSAGE or QUESTION/initial state to append
2348
+ if last_entity_type == 'QUESTION': last_entity_type = 'PASSAGE'
2349
  if last_entity_type == 'PASSAGE' or not current_passage_buffer:
2350
+ current_passage_buffer.append(word) # Append I-PASSAGE word
2351
  last_entity_type = 'PASSAGE'
2352
  just_finished_i_option = False
2353
  elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
 
2359
  else:
2360
  just_finished_i_option = False
2361
 
2362
+ # Case 3: Outside any entity (O tag)
2363
  elif label == 'O':
2364
+ # MODIFICATION 4: Skip 'O' tokens ONLY if the last active entity was PASSAGE.
2365
+ # Otherwise, default O tokens append to QUESTION (original logic preserved)
2366
+ if last_entity_type == 'PASSAGE':
2367
+ # Do nothing to the passage buffer and do not change last_entity_type
2368
+ pass
2369
+ elif last_entity_type == 'QUESTION' and current_item and 'question' in current_item:
2370
  current_item['question'] += f' {word}'
2371
  just_finished_i_option = False
2372
 
2373
+ # ----------------------------------------------------------------------
2374
+ # --- Finalization (Unchanged) ---
2375
+ # ----------------------------------------------------------------------
2376
+
2377
  # --- Finalize last item ---
2378
  if current_item is not None:
2379
  finalize_passage_to_item(current_item, current_passage_buffer)
 
2403
  return structured_data
2404
 
2405
 
2406
def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Corrects common OCR/tagging misalignment in options.

    If option N is "empty" (only contains its identifier, e.g., '(A)')
    AND option N+1 contains its own identifier followed by TWO EQUATION/FIGURE
    tags, then the first tag of option N+1 is moved to option N.

    Args:
        structured_data: Parsed question items; each non-METADATA item may
            carry an 'options' dict mapping option identifiers to option text.

    Returns:
        The same list, with option values corrected in place.
    """
    print("\n" + "=" * 80)
    print("--- 5. STARTING POST-PROCESSING: OPTION ALIGNMENT CORRECTION ---")
    print("=" * 80)

    # Regex to find all EQUATION/FIGURE placeholder tags.
    tag_pattern = re.compile(r'(EQUATION\d+|FIGURE\d+)')

    corrected_count = 0

    for item in structured_data:
        # Metadata items carry no options.
        if item.get('type') in ['METADATA']:
            continue

        options = item.get('options')
        if not options or len(options) < 2:
            continue

        # Option keys in their correct (insertion) order.
        option_keys = list(options.keys())

        for i in range(len(option_keys) - 1):
            current_key = option_keys[i]
            next_key = option_keys[i + 1]

            current_value = options[current_key].strip()
            next_value = options[next_key].strip()

            # --- Condition 1: Check if the current option is "empty" ---
            # An "empty" option only contains its key/identifier (e.g., "(A)")
            is_current_empty = current_value == current_key

            # --- Condition 2: Check if the next option has two content tags ---
            # Remove the option key from the next value to check content
            content_in_next = next_value.replace(next_key, '', 1).strip()
            tags_in_next = tag_pattern.findall(content_in_next)
            has_two_tags = len(tags_in_next) == 2

            if is_current_empty and has_two_tags:
                print(
                    f" -> Correction applied in Item {item.get('question', '...')}: Moving '{tags_in_next[0]}' from {next_key} to {current_key}.")

                # Move the first tag onto the empty option; the next option
                # keeps only its identifier plus the second tag.
                # (Fix: removed dead 'remaining_tags_content' computation that
                # was built over two statements but never used.)
                options[current_key] = f"{current_key} {tags_in_next[0]}".strip()
                options[next_key] = f"{next_key} {tags_in_next[1]}".strip()

                corrected_count += 1

    print(f"✅ Option alignment correction finished. Total corrections: {corrected_count}.")
    return structured_data
2472
+
2473
+
2474
  # ============================================================================
2475
  # --- PHASE 4: IMAGE EMBEDDING (Modified for In-Memory Return) ---
2476
  # ============================================================================
 
2550
  # --- MAIN FUNCTION (The Callable Interface) ---
2551
  # ============================================================================
2552
 
2553
+ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label_studio_output_path: str) -> Optional[List[Dict[str, Any]]]:
2554
  """
2555
  Executes the full document analysis pipeline: YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed.
2556
 
 
2634
  print("Pipeline aborted: Failed to convert BIO tags to structured data in Phase 3.")
2635
  return None
2636
 
2637
+ structured_data_list = correct_misaligned_options(structured_data_list)
2638
+
2639
+ try:
2640
+ # CHANGE: Use the provided, persistent output path
2641
+ convert_raw_predictions_to_label_studio(page_raw_predictions_list, label_studio_output_path)
2642
+ print(f"✅ Label Studio output saved to: {label_studio_output_path}")
2643
+ except Exception as e:
2644
+ print(f"❌ Error during Label Studio conversion: {e}")
2645
+
2646
+
2647
  # --- D. PHASE 4: IMAGE EMBEDDING (Base64) ---
2648
  final_result = embed_images_as_base64_in_memory(
2649
  structured_data_list,
 
2681
  return final_result
2682
 
2683
 
2684
+
2685
  if __name__ == "__main__":
2686
  parser = argparse.ArgumentParser(
2687
  description="Complete Document Analysis Pipeline (YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed).")
 
2691
  default=DEFAULT_LAYOUTLMV3_MODEL_PATH,
2692
  help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
2693
 
2694
+ # NEW ARGUMENT: Optional path for the Label Studio output
2695
+ parser.add_argument("--ls_output_path", type=str, default=None,
2696
+ help="Optional path to save the Label Studio JSON task file.")
2697
+
2698
  args = parser.parse_args()
2699
 
2700
+ pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
 
2701
 
2702
+ # 1. Define Persistent Output Paths
2703
+ final_output_file_name = f"{pdf_name}_final_output_embedded.json"
2704
+ final_output_path = os.path.abspath(final_output_file_name)
2705
+
2706
+ # 2. Determine Label Studio Output Path
2707
+ # If not provided, create a default path next to the script
2708
+ if args.ls_output_path:
2709
+ ls_output_path = os.path.abspath(args.ls_output_path)
2710
+ else:
2711
+ ls_output_file_name = f"{pdf_name}_label_studio_tasks.json"
2712
+ ls_output_path = os.path.abspath(ls_output_file_name)
2713
 
2714
+ # --- Call the main function (Updated to include ls_output_path) ---
2715
+ print(f"\n[SETUP] Label Studio Output will be saved to: {ls_output_path}")
2716
 
2717
+ # NOTE: You must update the signature of run_document_pipeline to accept 3 arguments
2718
+ final_json_data = run_document_pipeline(
2719
+ args.input_pdf,
2720
+ args.layoutlmv3_model_path,
2721
+ ls_output_path # <--- Passing the persistent path
2722
+ )
2723
+
2724
+ if final_json_data:
2725
+ # Save the final structured output
2726
  with open(final_output_path, 'w', encoding='utf-8') as f:
2727
  json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2728
 
 
2730
 
2731
 
2732