heerjtdev commited on
Commit
0f4dfc1
·
1 Parent(s): dc4a82b

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +285 -259
working_yolo_pipeline.py CHANGED
@@ -1202,14 +1202,6 @@
1202
 
1203
 
1204
 
1205
-
1206
-
1207
-
1208
-
1209
-
1210
-
1211
-
1212
-
1213
 
1214
 
1215
 
@@ -1598,49 +1590,6 @@ def get_word_data_for_detection(page: fitz.Page, top_margin_percent=0.10, bottom
1598
  return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
1599
 
1600
 
1601
- # def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
1602
- # if not word_data: return []
1603
- # x_points = []
1604
- # for _, x1, _, x2, _ in word_data: x_points.extend([x1, x2])
1605
- # max_x = max(x_points)
1606
- # bin_size = params['cluster_bin_size']
1607
- # num_bins = int(np.ceil(max_x / bin_size))
1608
- # hist, bin_edges = np.histogram(x_points, bins=num_bins, range=(0, max_x))
1609
- # smoothed_hist = gaussian_filter1d(hist.astype(float), sigma=params['cluster_smoothing'])
1610
- # inverted_signal = np.max(smoothed_hist) - smoothed_hist
1611
- #
1612
- # peaks, properties = find_peaks(
1613
- # inverted_signal, height=0, distance=params['cluster_min_width'] / bin_size
1614
- # )
1615
- #
1616
- # if not peaks.size: return []
1617
- #
1618
- # threshold_value = np.percentile(smoothed_hist, params['cluster_threshold_percentile'])
1619
- # inverted_threshold = np.max(smoothed_hist) - threshold_value
1620
- # significant_peaks = peaks[properties['peak_heights'] >= inverted_threshold]
1621
- # separator_x_coords = [int(bin_edges[p]) for p in significant_peaks]
1622
- #
1623
- # final_separators = []
1624
- # prominence_threshold = params['cluster_prominence'] * np.max(smoothed_hist)
1625
- #
1626
- # for x_coord in separator_x_coords:
1627
- # bin_idx = np.searchsorted(bin_edges, x_coord) - 1
1628
- # window_size = int(params['cluster_min_width'] / bin_size)
1629
- #
1630
- # left_start, left_end = max(0, bin_idx - window_size), bin_idx
1631
- # right_start, right_end = bin_idx + 1, min(len(smoothed_hist), bin_idx + 1 + window_size)
1632
- #
1633
- # if left_end <= left_start or right_end <= right_start: continue
1634
- #
1635
- # avg_left_density = np.mean(smoothed_hist[left_start:left_end])
1636
- # avg_right_density = np.mean(smoothed_hist[right_start:right_end])
1637
- #
1638
- # if avg_left_density >= prominence_threshold and avg_right_density >= prominence_threshold:
1639
- # final_separators.append(x_coord)
1640
- #
1641
- # return sorted(final_separators)
1642
-
1643
-
1644
  def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
1645
  """Calculates the X-axis histogram and detects significant gutters."""
1646
  if not word_data: return []
@@ -1953,6 +1902,9 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1953
  final_page_predictions = []
1954
  CHUNK_SIZE = 500
1955
 
 
 
 
1956
  for page_data in preprocessed_data:
1957
  page_num_1_based = page_data['page_number']
1958
  page_num_0_based = page_num_1_based - 1
@@ -2092,189 +2044,131 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
2092
 
2093
 
2094
 
2095
- # def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
2096
- # preprocessed_json_path: str,
2097
- # column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
2098
- # """Runs LayoutLMv3-CRF inference and returns the raw word-level predictions, grouped by page."""
2099
- # start_time_overall = time.time()
2100
- # print("\n" + "=" * 80)
2101
- # print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2102
- # print("=" * 80)
2103
- #
2104
- # # --- MODEL & TOKENIZER SETUP ---
2105
- # start_time_setup = time.time()
2106
- # tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
2107
- # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2108
- #
2109
- # try:
2110
- # model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2111
- # checkpoint = torch.load(model_path, map_location=device)
2112
- # model_state = checkpoint.get('model_state_dict', checkpoint)
2113
- # # Fix for potential key mismatch
2114
- # fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
2115
- # model.load_state_dict(fixed_state_dict)
2116
- # model.to(device)
2117
- # model.eval()
2118
- # print(f" [LOG] Model loaded and moved to {device} in {time.time() - start_time_setup:.2f}s.")
2119
- # except Exception as e:
2120
- # print(f"❌ FATAL ERROR during LayoutLMv3 model loading: {e}")
2121
- # return []
2122
- #
2123
- # # --- DATA LOADING ---
2124
- # try:
2125
- # with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
2126
- # preprocessed_data = json.load(f)
2127
- # except Exception as e:
2128
- # print(f" ERROR loading preprocessed JSON: {e}")
2129
- # return []
2130
- #
2131
- # try:
2132
- # doc = fitz.open(pdf_path)
2133
- # except Exception as e:
2134
- # print(f"❌ ERROR loading PDF file: {e}")
2135
- # return []
2136
- #
2137
- # final_page_predictions = []
2138
- # CHUNK_SIZE = 500
2139
- #
2140
- # # --- PAGE ITERATION LOOP ---
2141
- # for page_data in preprocessed_data:
2142
- # start_time_page = time.time()
2143
- # page_num_1_based = page_data['page_number']
2144
- # page_num_0_based = page_num_1_based - 1
2145
- # page_raw_predictions = []
2146
- #
2147
- # fitz_page = doc.load_page(page_num_0_based)
2148
- # page_width, page_height = fitz_page.rect.width, fitz_page.rect.height
2149
- # num_words_on_page = len(page_data['data'])
2150
- #
2151
- # print(f" -> Inferring Page {page_num_1_based} ({num_words_on_page} words)...")
2152
- #
2153
- # # --- COORDINATE NORMALIZATION & DATA PREP ---
2154
- # words, bboxes_raw_pdf_space, normalized_bboxes_list = [], [], []
2155
- # scale_factor = 2.0
2156
- #
2157
- # for item in page_data['data']:
2158
- # word, raw_yolo_bbox = item['word'], item['bbox']
2159
- #
2160
- # bbox_pdf = [
2161
- # int(raw_yolo_bbox[0] / scale_factor), int(raw_yolo_bbox[1] / scale_factor),
2162
- # int(raw_yolo_bbox[2] / scale_factor), int(raw_yolo_bbox[3] / scale_factor)
2163
- # ]
2164
- #
2165
- # normalized_bbox = [
2166
- # max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
2167
- # max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
2168
- # max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
2169
- # max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
2170
- # ]
2171
- #
2172
- # words.append(word)
2173
- # bboxes_raw_pdf_space.append(bbox_pdf)
2174
- # normalized_bboxes_list.append(normalized_bbox)
2175
- #
2176
- # if not words:
2177
- # print(f" [LOG] Skipped Page {page_num_1_based} (0 words found).")
2178
- # continue
2179
- #
2180
- # # --- COLUMN DETECTION & CHUNKING ---
2181
- # start_time_col_detect = time.time()
2182
- # column_detection_params = column_detection_params or {}
2183
- # column_separator_x = detect_column_gutters(pdf_path, page_num_0_based, **column_detection_params)
2184
- # word_chunks = _merge_integrity(words, bboxes_raw_pdf_space, column_separator_x)
2185
- # print(f" [LOG] Column detection and word chunking took {time.time() - start_time_col_detect:.3f}s.")
2186
- #
2187
- # # --- INFERENCE BATCHING ---
2188
- # current_global_index = 0
2189
- # total_inference_time = 0
2190
- #
2191
- # for chunk_words_original in word_chunks:
2192
- # if not chunk_words_original: continue
2193
- #
2194
- # # Reconstruct the aligned chunk (alignment logic unchanged)
2195
- # chunk_words, chunk_normalized_bboxes, chunk_bboxes_pdf = [], [], []
2196
- # temp_global_index = current_global_index
2197
- # for i in range(len(words)):
2198
- # if temp_global_index <= i and words[i] in chunk_words_original:
2199
- # if words[i] == chunk_words_original[len(chunk_words)]:
2200
- # chunk_words.append(words[i])
2201
- # chunk_normalized_bboxes.append(normalized_bboxes_list[i])
2202
- # chunk_bboxes_pdf.append(bboxes_raw_pdf_space[i])
2203
- # current_global_index = i + 1
2204
- # if len(chunk_words) == len(chunk_words_original):
2205
- # break
2206
- #
2207
- # # Inference in sub-batches
2208
- # for i in range(0, len(chunk_words), CHUNK_SIZE):
2209
- # start_time_inference_batch = time.time()
2210
- # sub_words = chunk_words[i:i + CHUNK_SIZE]
2211
- # sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
2212
- # sub_bboxes_pdf = chunk_bboxes_pdf[i:i + CHUNK_SIZE]
2213
- #
2214
- # if not sub_words: continue
2215
- #
2216
- # # Tokenization and Model Call (Inference)
2217
- # encoded_input = tokenizer(
2218
- # sub_words, boxes=sub_bboxes, truncation=True, padding="max_length",
2219
- # max_length=512, return_tensors="pt"
2220
- # )
2221
- #
2222
- # input_ids = encoded_input['input_ids'].to(device)
2223
- # bbox = encoded_input['bbox'].to(device)
2224
- # attention_mask = encoded_input['attention_mask'].to(device)
2225
- #
2226
- # with torch.no_grad():
2227
- # predictions_int_list = model(input_ids, bbox, attention_mask)
2228
- #
2229
- # if not predictions_int_list: continue
2230
- #
2231
- # # Post-processing and prediction mapping (unchanged)
2232
- # predictions_int = predictions_int_list[0]
2233
- # word_ids = encoded_input.word_ids()
2234
- # word_idx_to_pred_id = {}
2235
- #
2236
- # for token_idx, word_idx in enumerate(word_ids):
2237
- # if word_idx is not None and word_idx < len(sub_words):
2238
- # if word_idx not in word_idx_to_pred_id:
2239
- # word_idx_to_pred_id[word_idx] = predictions_int[token_idx]
2240
- #
2241
- # for current_word_idx in range(len(sub_words)):
2242
- # pred_id_or_tensor = word_idx_to_pred_id.get(current_word_idx, 0)
2243
- # pred_id = pred_id_or_tensor.item() if torch.is_tensor(pred_id_or_tensor) else pred_id_or_tensor
2244
- # predicted_label = ID_TO_LABEL[pred_id]
2245
- #
2246
- # page_raw_predictions.append({
2247
- # "word": sub_words[current_word_idx],
2248
- # "bbox": sub_bboxes_pdf[current_word_idx],
2249
- # "predicted_label": predicted_label,
2250
- # "page_number": page_num_1_based
2251
- # })
2252
- #
2253
- # batch_inference_time = time.time() - start_time_inference_batch
2254
- # total_inference_time += batch_inference_time
2255
- # # Optional: Log per-batch inference time if needed for deep debugging
2256
- # # print(f" [LOG] Batch inference ({len(sub_words)} words) took {batch_inference_time:.3f}s.")
2257
- #
2258
- # if page_raw_predictions:
2259
- # final_page_predictions.append({
2260
- # "page_number": page_num_1_based,
2261
- # "data": page_raw_predictions
2262
- # })
2263
- #
2264
- # print(f" [LOG] Total inference time for Page {page_num_1_based} (all batches): {total_inference_time:.2f}s")
2265
- # print(f" [LOG] Total processing time for Page {page_num_1_based}: {time.time() - start_time_page:.2f}s")
2266
- #
2267
- # doc.close()
2268
- #
2269
- # total_elapsed_time = time.time() - start_time_overall
2270
- # print(f"✅ LayoutLMv3 inference complete. Predicted tags for {len(final_page_predictions)} pages.")
2271
- # print(f" [LOG] Overall LayoutLMv3 Inference Pipeline Duration: {total_elapsed_time:.2f}s.")
2272
- # return final_page_predictions
2273
 
2274
 
2275
  # ============================================================================
2276
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER (Modified for In-Memory Return) ---
2277
  # ============================================================================
 
 
 
2278
 
2279
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
2280
  """
@@ -2333,31 +2227,41 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2333
  entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
2334
  current_text_buffer.append(word)
2335
  previous_entity_type = last_entity_type
2336
- is_passage_label = (label == 'B-PASSAGE' or label == 'I-PASSAGE')
2337
 
2338
- if not first_question_started and label != 'B-QUESTION' and not is_passage_label:
2339
- just_finished_i_option = False
2340
- is_in_new_passage = False
2341
- continue
2342
 
2343
- if not first_question_started and is_passage_label:
2344
- if label == 'B-PASSAGE' or label == 'I-PASSAGE' or not current_passage_buffer:
 
 
 
 
 
 
 
 
 
2345
  current_passage_buffer.append(word)
2346
  last_entity_type = 'PASSAGE'
2347
- just_finished_i_option = False
2348
- is_in_new_passage = False
2349
- continue
 
 
 
 
2350
 
2351
  if label == 'B-QUESTION':
2352
  if not first_question_started:
 
2353
  header_text = ' '.join(current_text_buffer[:-1]).strip()
2354
  if header_text or current_passage_buffer:
2355
  metadata_item = {'type': 'METADATA', 'passage': ''}
2356
- if current_passage_buffer:
2357
- finalize_passage_to_item(metadata_item, current_passage_buffer)
2358
- if header_text:
2359
- metadata_item['text'] = header_text
2360
- elif header_text:
2361
  metadata_item['text'] = header_text
2362
  structured_data.append(metadata_item)
2363
  first_question_started = True
@@ -2382,8 +2286,13 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2382
  is_in_new_passage = False
2383
  continue
2384
 
 
 
 
 
2385
  if current_item is not None:
2386
  if is_in_new_passage:
 
2387
  current_item['new_passage'] += f' {word}'
2388
  if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
2389
  is_in_new_passage = False
@@ -2392,18 +2301,25 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2392
  continue
2393
 
2394
  is_in_new_passage = False
 
 
2395
  if label.startswith('B-'):
2396
- if entity_type != 'PASSAGE':
 
 
2397
  finalize_passage_to_item(current_item, current_passage_buffer)
2398
  current_passage_buffer = []
 
2399
  last_entity_type = entity_type
2400
 
2401
  if entity_type == 'PASSAGE':
 
 
2402
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2403
  current_item['new_passage'] = word
2404
  is_in_new_passage = True
2405
  else:
2406
- current_passage_buffer.append(word)
2407
  elif entity_type == 'OPTION':
2408
  current_option_key = word
2409
  current_item['options'][current_option_key] = word
@@ -2416,20 +2332,22 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2416
  current_item['question'] += f' {word}'
2417
  just_finished_i_option = False
2418
 
 
2419
  elif label.startswith('I-'):
2420
  if entity_type == 'QUESTION' and current_item.get('question'):
2421
  current_item['question'] += f' {word}'
2422
  last_entity_type = 'QUESTION'
2423
  just_finished_i_option = False
2424
  elif entity_type == 'PASSAGE':
 
2425
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2426
  current_item['new_passage'] = word
2427
  is_in_new_passage = True
2428
  else:
2429
- if last_entity_type == 'QUESTION' and current_item.get('question'):
2430
- last_entity_type = 'PASSAGE'
2431
  if last_entity_type == 'PASSAGE' or not current_passage_buffer:
2432
- current_passage_buffer.append(word)
2433
  last_entity_type = 'PASSAGE'
2434
  just_finished_i_option = False
2435
  elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
@@ -2441,11 +2359,21 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2441
  else:
2442
  just_finished_i_option = False
2443
 
 
2444
  elif label == 'O':
2445
- if last_entity_type == 'QUESTION' and current_item and 'question' in current_item:
 
 
 
 
 
2446
  current_item['question'] += f' {word}'
2447
  just_finished_i_option = False
2448
 
 
 
 
 
2449
  # --- Finalize last item ---
2450
  if current_item is not None:
2451
  finalize_passage_to_item(current_item, current_passage_buffer)
@@ -2475,6 +2403,74 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
2475
  return structured_data
2476
 
2477
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2478
  # ============================================================================
2479
  # --- PHASE 4: IMAGE EMBEDDING (Modified for In-Memory Return) ---
2480
  # ============================================================================
@@ -2554,7 +2550,7 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2554
  # --- MAIN FUNCTION (The Callable Interface) ---
2555
  # ============================================================================
2556
 
2557
- def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[List[Dict[str, Any]]]:
2558
  """
2559
  Executes the full document analysis pipeline: YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed.
2560
 
@@ -2638,6 +2634,16 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
2638
  print("Pipeline aborted: Failed to convert BIO tags to structured data in Phase 3.")
2639
  return None
2640
 
 
 
 
 
 
 
 
 
 
 
2641
  # --- D. PHASE 4: IMAGE EMBEDDING (Base64) ---
2642
  final_result = embed_images_as_base64_in_memory(
2643
  structured_data_list,
@@ -2675,6 +2681,7 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
2675
  return final_result
2676
 
2677
 
 
2678
  if __name__ == "__main__":
2679
  parser = argparse.ArgumentParser(
2680
  description="Complete Document Analysis Pipeline (YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed).")
@@ -2684,18 +2691,38 @@ if __name__ == "__main__":
2684
  default=DEFAULT_LAYOUTLMV3_MODEL_PATH,
2685
  help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
2686
 
 
 
 
 
2687
  args = parser.parse_args()
2688
 
2689
- # --- Call the main function ---
2690
- final_json_data = run_document_pipeline(args.input_pdf, args.layoutlmv3_model_path)
2691
 
2692
- if final_json_data:
2693
- # Example of what to do with the returned data: Save it to a file
2694
- output_file_name = os.path.splitext(os.path.basename(args.input_pdf))[0] + "_final_output_embedded.json"
 
 
 
 
 
 
 
 
2695
 
2696
- # Determine where to save the final output (e.g., current directory)
2697
- final_output_path = os.path.abspath(output_file_name)
2698
 
 
 
 
 
 
 
 
 
 
2699
  with open(final_output_path, 'w', encoding='utf-8') as f:
2700
  json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2701
 
@@ -2703,4 +2730,3 @@ if __name__ == "__main__":
2703
 
2704
 
2705
 
2706
-
 
1202
 
1203
 
1204
 
 
 
 
 
 
 
 
 
1205
 
1206
 
1207
 
 
1590
  return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
1591
 
1592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1593
  def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
1594
  """Calculates the X-axis histogram and detects significant gutters."""
1595
  if not word_data: return []
 
1902
  final_page_predictions = []
1903
  CHUNK_SIZE = 500
1904
 
1905
+
1906
+ all_pages_word_level_results = []
1907
+
1908
  for page_data in preprocessed_data:
1909
  page_num_1_based = page_data['page_number']
1910
  page_num_0_based = page_num_1_based - 1
 
2044
 
2045
 
2046
 
2047
def create_label_studio_span(page_results, start_idx, end_idx, label):
    """Create a Label Studio span with character-level offsets and bbox.

    Args:
        page_results: Per-word dicts for one page; each must carry 'word' and
            'bbox' ([x0, y0, x1, y1]) keys.
        start_idx: Index of the first word of the entity (inclusive).
        end_idx: Index of the last word of the entity (inclusive).
        label: Entity label to attach to the span.

    Returns:
        A Label Studio "labels" result dict. Character offsets are computed
        against the page text formed by joining all page words with single
        spaces.
    """
    # Words and bboxes for the specific entity span (end_idx is inclusive).
    entity_words = [page_results[i]['word'] for i in range(start_idx, end_idx + 1)]
    entity_bboxes = [page_results[i]['bbox'] for i in range(start_idx, end_idx + 1)]

    # Encompassing bbox over all word boxes in the span.
    x0 = min(bbox[0] for bbox in entity_bboxes)
    y0 = min(bbox[1] for bbox in entity_bboxes)
    x1 = max(bbox[2] for bbox in entity_bboxes)
    y1 = max(bbox[3] for bbox in entity_bboxes)

    # Character offsets relative to the space-joined page text.
    all_words_on_page = [r['word'] for r in page_results]
    start_char = len(" ".join(all_words_on_page[:start_idx]))
    if start_idx != 0:
        start_char += 1  # account for the separating space before the span

    # Join once: the span text also determines the end offset.
    # (Fix: the original joined entity_words twice and built an unused
    # full-page text_string.)
    span_text = " ".join(entity_words)
    end_char = start_char + len(span_text)

    return {
        "from_name": "label",
        "to_name": "text",
        "type": "labels",
        "value": {
            "start": start_char,
            "end": end_char,
            "text": span_text,
            "labels": [label],
            "bbox": {"x": x0, "y": y0, "width": x1 - x0, "height": y1 - y0}
        },
        "score": 0.99
    }
2084
+
2085
+
2086
+
2087
+
2088
def convert_raw_predictions_to_label_studio(page_data_list, output_path: str):
    """Convert raw word-level predictions (grouped by page) to Label Studio format.

    Args:
        page_data_list: List of per-page dicts with 'page_number' and 'data',
            where 'data' is a list of word dicts carrying 'word', 'bbox' and a
            BIO 'predicted_label'.
        output_path: Path of the JSON file the Label Studio tasks are written to.
    """
    final_tasks = []
    total_spans = 0

    print("\n[PHASE: LABEL STUDIO CONVERSION]")

    for page_data in page_data_list:
        page_num = page_data['page_number']
        page_results = page_data['data']
        if not page_results:
            continue

        # Page text is the single-space join of all words; span character
        # offsets are computed against this same string by
        # create_label_studio_span.
        original_words = [r['word'] for r in page_results]
        text_string = " ".join(original_words)

        results = []
        current_entity_label = None
        current_entity_start_word_index = None

        # BIO stitching: merge a B- tag and its following same-type I- tags
        # into a single span.
        for i, pred_item in enumerate(page_results):
            label = pred_item['predicted_label']
            # Entity type without the B-/I- prefix ('O' stays as-is).
            tag_only = label.split('-', 1)[-1] if '-' in label else label

            if label.startswith('B-'):
                # A new entity starts: close any currently open span first.
                if current_entity_label:
                    results.append(
                        create_label_studio_span(page_results,
                                                 current_entity_start_word_index,
                                                 i - 1,
                                                 current_entity_label)
                    )
                current_entity_label = tag_only
                current_entity_start_word_index = i

            elif label.startswith('I-') and current_entity_label == tag_only:
                # Continuation of the open entity — extended implicitly, the
                # span end is fixed when the entity closes.
                continue

            else:
                # 'O' tag, or an I- tag that does not match the open entity:
                # close the open span. NOTE(review): a mismatched I- word is
                # itself not added to any span — confirm this is intended.
                if current_entity_label:
                    results.append(
                        create_label_studio_span(page_results,
                                                 current_entity_start_word_index,
                                                 i - 1,
                                                 current_entity_label)
                    )
                current_entity_label = None
                current_entity_start_word_index = None

        # Flush an entity still open at the end of the page.
        if current_entity_label:
            results.append(
                create_label_studio_span(page_results,
                                         current_entity_start_word_index,
                                         len(page_results) - 1,
                                         current_entity_label)
            )

        total_spans += len(results)
        print(f" -> Page {page_num}: Generated {len(results)} labeled spans.")

        # One Label Studio task per page: raw text plus the original word/bbox
        # arrays, with the stitched spans as a single pre-annotation.
        final_tasks.append({
            "data": {
                "text": text_string,
                "original_words": original_words,
                "original_bboxes": [r['bbox'] for r in page_results]
            },
            "annotations": [{"result": results}],
            "meta": {"page_number": page_num}
        })

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(final_tasks, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Label Studio tasks created and saved to {output_path}. Total {total_spans} spans.")
2163
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2164
 
2165
 
2166
  # ============================================================================
2167
  # --- PHASE 3: BIO TO STRUCTURED JSON DECODER (Modified for In-Memory Return) ---
2168
  # ============================================================================
2169
+ #
2170
+
2171
+
2172
 
2173
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
2174
  """
 
2227
  entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
2228
  current_text_buffer.append(word)
2229
  previous_entity_type = last_entity_type
2230
+ is_passage_label = (entity_type == 'PASSAGE')
2231
 
2232
+ # ----------------------------------------------------------------------
2233
+ # --- MODIFICATION AREA 1: Pre-Question Content (Metadata/Passage) ---
2234
+ # ----------------------------------------------------------------------
 
2235
 
2236
+ # If we haven't started the first question yet
2237
+ if not first_question_started:
2238
+ # Skip 'O' and non-B/I-PASSAGE tags before the first B-QUESTION
2239
+ if label != 'B-QUESTION' and not is_passage_label:
2240
+ just_finished_i_option = False
2241
+ is_in_new_passage = False
2242
+ continue
2243
+
2244
+ # Handle PASSAGE tokens before the first B-QUESTION
2245
+ if is_passage_label:
2246
+ # B-PASSAGE or I-PASSAGE always appends to the buffer here
2247
  current_passage_buffer.append(word)
2248
  last_entity_type = 'PASSAGE'
2249
+ just_finished_i_option = False
2250
+ is_in_new_passage = False
2251
+ continue
2252
+
2253
+ # ----------------------------------------------------------------------
2254
+ # --- Standard B-QUESTION Start (Split/Finalize previous item) ---
2255
+ # ----------------------------------------------------------------------
2256
 
2257
  if label == 'B-QUESTION':
2258
  if not first_question_started:
2259
+ # Handle initial header/metadata
2260
  header_text = ' '.join(current_text_buffer[:-1]).strip()
2261
  if header_text or current_passage_buffer:
2262
  metadata_item = {'type': 'METADATA', 'passage': ''}
2263
+ finalize_passage_to_item(metadata_item, current_passage_buffer)
2264
+ if header_text:
 
 
 
2265
  metadata_item['text'] = header_text
2266
  structured_data.append(metadata_item)
2267
  first_question_started = True
 
2286
  is_in_new_passage = False
2287
  continue
2288
 
2289
+ # ----------------------------------------------------------------------
2290
+ # --- Processing tokens within an active Question (current_item) ---
2291
+ # ----------------------------------------------------------------------
2292
+
2293
  if current_item is not None:
2294
  if is_in_new_passage:
2295
+ # Handle passage continuation started after an option
2296
  current_item['new_passage'] += f' {word}'
2297
  if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
2298
  is_in_new_passage = False
 
2301
  continue
2302
 
2303
  is_in_new_passage = False
2304
+
2305
+ # Case 1: Beginning of a new entity (B- tag)
2306
  if label.startswith('B-'):
2307
+
2308
+ # Check for termination entities
2309
+ if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
2310
  finalize_passage_to_item(current_item, current_passage_buffer)
2311
  current_passage_buffer = []
2312
+
2313
  last_entity_type = entity_type
2314
 
2315
  if entity_type == 'PASSAGE':
2316
+ # MODIFICATION 2: B-PASSAGE always continues the current passage buffer
2317
+ # unless immediately following an I-OPTION (which starts 'new_passage')
2318
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2319
  current_item['new_passage'] = word
2320
  is_in_new_passage = True
2321
  else:
2322
+ current_passage_buffer.append(word) # Append B-PASSAGE word
2323
  elif entity_type == 'OPTION':
2324
  current_option_key = word
2325
  current_item['options'][current_option_key] = word
 
2332
  current_item['question'] += f' {word}'
2333
  just_finished_i_option = False
2334
 
2335
+ # Case 2: Inside an existing entity (I- tag)
2336
  elif label.startswith('I-'):
2337
  if entity_type == 'QUESTION' and current_item.get('question'):
2338
  current_item['question'] += f' {word}'
2339
  last_entity_type = 'QUESTION'
2340
  just_finished_i_option = False
2341
  elif entity_type == 'PASSAGE':
2342
+ # MODIFICATION 3: I-PASSAGE always continues the current passage buffer
2343
  if previous_entity_type == 'OPTION' and just_finished_i_option:
2344
  current_item['new_passage'] = word
2345
  is_in_new_passage = True
2346
  else:
2347
+ # Ensure last entity was PASSAGE or QUESTION/initial state to append
2348
+ if last_entity_type == 'QUESTION': last_entity_type = 'PASSAGE'
2349
  if last_entity_type == 'PASSAGE' or not current_passage_buffer:
2350
+ current_passage_buffer.append(word) # Append I-PASSAGE word
2351
  last_entity_type = 'PASSAGE'
2352
  just_finished_i_option = False
2353
  elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
 
2359
  else:
2360
  just_finished_i_option = False
2361
 
2362
+ # Case 3: Outside any entity (O tag)
2363
  elif label == 'O':
2364
+ # MODIFICATION 4: Skip 'O' tokens ONLY if the last active entity was PASSAGE.
2365
+ # Otherwise, default O tokens append to QUESTION (original logic preserved)
2366
+ if last_entity_type == 'PASSAGE':
2367
+ # Do nothing to the passage buffer and do not change last_entity_type
2368
+ pass
2369
+ elif last_entity_type == 'QUESTION' and current_item and 'question' in current_item:
2370
  current_item['question'] += f' {word}'
2371
  just_finished_i_option = False
2372
 
2373
+ # ----------------------------------------------------------------------
2374
+ # --- Finalization (Unchanged) ---
2375
+ # ----------------------------------------------------------------------
2376
+
2377
  # --- Finalize last item ---
2378
  if current_item is not None:
2379
  finalize_passage_to_item(current_item, current_passage_buffer)
 
2403
  return structured_data
2404
 
2405
 
2406
def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Corrects common OCR/tagging misalignment in options.

    If option N is "empty" (only contains its identifier, e.g., '(A)')
    AND option N+1 contains its own identifier followed by TWO EQUATION/FIGURE
    tags, then the first tag of option N+1 is moved to option N.

    Args:
        structured_data: Parsed question items; each non-METADATA item may
            carry an 'options' dict mapping option identifiers to option text.

    Returns:
        The same list, with option values corrected in place.
    """
    print("\n" + "=" * 80)
    print("--- 5. STARTING POST-PROCESSING: OPTION ALIGNMENT CORRECTION ---")
    print("=" * 80)

    # Regex to find all EQUATION/FIGURE placeholder tags.
    tag_pattern = re.compile(r'(EQUATION\d+|FIGURE\d+)')

    corrected_count = 0

    for item in structured_data:
        # Metadata items carry no options.
        if item.get('type') in ['METADATA']:
            continue

        options = item.get('options')
        if not options or len(options) < 2:
            continue

        # Option keys in their correct (insertion) order.
        option_keys = list(options.keys())

        for i in range(len(option_keys) - 1):
            current_key = option_keys[i]
            next_key = option_keys[i + 1]

            current_value = options[current_key].strip()
            next_value = options[next_key].strip()

            # --- Condition 1: Check if the current option is "empty" ---
            # An "empty" option only contains its key/identifier (e.g., "(A)")
            is_current_empty = current_value == current_key

            # --- Condition 2: Check if the next option has two content tags ---
            # Remove the option key from the next value to check content
            content_in_next = next_value.replace(next_key, '', 1).strip()
            tags_in_next = tag_pattern.findall(content_in_next)
            has_two_tags = len(tags_in_next) == 2

            if is_current_empty and has_two_tags:
                print(
                    f" -> Correction applied in Item {item.get('question', '...')}: Moving '{tags_in_next[0]}' from {next_key} to {current_key}.")

                # Move the first tag onto the empty option; the next option
                # keeps only its identifier plus the second tag.
                # (Fix: removed dead 'remaining_tags_content' computation that
                # was built over two statements but never used.)
                options[current_key] = f"{current_key} {tags_in_next[0]}".strip()
                options[next_key] = f"{next_key} {tags_in_next[1]}".strip()

                corrected_count += 1

    print(f"✅ Option alignment correction finished. Total corrections: {corrected_count}.")
    return structured_data
2472
+
2473
+
2474
  # ============================================================================
2475
  # --- PHASE 4: IMAGE EMBEDDING (Modified for In-Memory Return) ---
2476
  # ============================================================================
 
2550
  # --- MAIN FUNCTION (The Callable Interface) ---
2551
  # ============================================================================
2552
 
2553
+ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label_studio_output_path: str) -> Optional[List[Dict[str, Any]]]:
2554
  """
2555
  Executes the full document analysis pipeline: YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed.
2556
 
 
2634
  print("Pipeline aborted: Failed to convert BIO tags to structured data in Phase 3.")
2635
  return None
2636
 
2637
+ structured_data_list = correct_misaligned_options(structured_data_list)
2638
+
2639
+ try:
2640
+ # CHANGE: Use the provided, persistent output path
2641
+ convert_raw_predictions_to_label_studio(page_raw_predictions_list, label_studio_output_path)
2642
+ print(f"✅ Label Studio output saved to: {label_studio_output_path}")
2643
+ except Exception as e:
2644
+ print(f"❌ Error during Label Studio conversion: {e}")
2645
+
2646
+
2647
  # --- D. PHASE 4: IMAGE EMBEDDING (Base64) ---
2648
  final_result = embed_images_as_base64_in_memory(
2649
  structured_data_list,
 
2681
  return final_result
2682
 
2683
 
2684
+
2685
  if __name__ == "__main__":
2686
  parser = argparse.ArgumentParser(
2687
  description="Complete Document Analysis Pipeline (YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed).")
 
2691
  default=DEFAULT_LAYOUTLMV3_MODEL_PATH,
2692
  help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
2693
 
2694
+ # NEW ARGUMENT: Optional path for the Label Studio output
2695
+ parser.add_argument("--ls_output_path", type=str, default=None,
2696
+ help="Optional path to save the Label Studio JSON task file.")
2697
+
2698
  args = parser.parse_args()
2699
 
2700
+ pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
 
2701
 
2702
+ # 1. Define Persistent Output Paths
2703
+ final_output_file_name = f"{pdf_name}_final_output_embedded.json"
2704
+ final_output_path = os.path.abspath(final_output_file_name)
2705
+
2706
+ # 2. Determine Label Studio Output Path
2707
+ # If not provided, create a default path next to the script
2708
+ if args.ls_output_path:
2709
+ ls_output_path = os.path.abspath(args.ls_output_path)
2710
+ else:
2711
+ ls_output_file_name = f"{pdf_name}_label_studio_tasks.json"
2712
+ ls_output_path = os.path.abspath(ls_output_file_name)
2713
 
2714
+ # --- Call the main function (Updated to include ls_output_path) ---
2715
+ print(f"\n[SETUP] Label Studio Output will be saved to: {ls_output_path}")
2716
 
2717
+ # NOTE: You must update the signature of run_document_pipeline to accept 3 arguments
2718
+ final_json_data = run_document_pipeline(
2719
+ args.input_pdf,
2720
+ args.layoutlmv3_model_path,
2721
+ ls_output_path # <--- Passing the persistent path
2722
+ )
2723
+
2724
+ if final_json_data:
2725
+ # Save the final structured output
2726
  with open(final_output_path, 'w', encoding='utf-8') as f:
2727
  json.dump(final_json_data, f, indent=2, ensure_ascii=False)
2728
 
 
2730
 
2731
 
2732