heerjtdev commited on
Commit
37a91ca
·
verified ·
1 Parent(s): 4448236

Upload test_layout_yolo_columns_log.py

Browse files
Files changed (1) hide show
  1. test_layout_yolo_columns_log.py +714 -0
test_layout_yolo_columns_log.py ADDED
@@ -0,0 +1,714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ import os
4
+ import torch
5
+ import torch.nn as nn
6
+ from TorchCRF import CRF
7
+ from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
8
+ import pytesseract
9
+ from PIL import Image
10
+ import fitz # PyMuPDF
11
+ from typing import List, Dict, Any, Optional, Union, Tuple
12
+ import numpy as np
13
+ from scipy.signal import find_peaks
14
+ from scipy.ndimage import gaussian_filter1d
15
+ import sys
16
+ import io
17
+
18
# ============================================================================
# CONSTANTS & MODEL DEFINITION
# ============================================================================

# Labels must match the training labels! (Use the most detailed set)
# BIO tagging scheme: "B-*" opens an entity, "I-*" continues the current
# entity, and "O" marks tokens outside any entity.
ID_TO_LABEL = {
    0: "O",
    1: "B-QUESTION", 2: "I-QUESTION",
    3: "B-OPTION", 4: "I-OPTION",
    5: "B-ANSWER", 6: "I-ANSWER",
    7: "B-SECTION_HEADING", 8: "I-SECTION_HEADING",
    9: "B-PASSAGE", 10: "I-PASSAGE"
}
# Number of distinct labels (11); must equal the classifier/CRF output size.
NUM_LABELS = len(ID_TO_LABEL)
32
+
33
+
34
class LayoutLMv3ForTokenClassification(nn.Module):
    """LayoutLMv3 model with a linear layer and a CRF layer on top.

    The LayoutLMv3 backbone produces per-token hidden states, the linear
    head maps them to per-label emission scores, and the CRF layer models
    label-transition structure (negative log-likelihood loss in training,
    Viterbi decoding at inference).
    """

    def __init__(self, num_labels: int = NUM_LABELS):
        super().__init__()
        self.num_labels = num_labels

        config = LayoutLMv3Config.from_pretrained("microsoft/layoutlmv3-base", num_labels=num_labels)
        self.layoutlmv3 = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base", config=config)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.crf = CRF(num_labels)
        # Only re-initializes the freshly created classifier head; the
        # pretrained backbone weights loaded above are left untouched.
        self.init_weights()

    def init_weights(self):
        # Xavier init for the classification head; zero the bias if present.
        nn.init.xavier_uniform_(self.classifier.weight)
        if self.classifier.bias is not None:
            nn.init.zeros_(self.classifier.bias)

    def forward(
        self,
        input_ids: torch.Tensor,
        bbox: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[List[List[int]], Any]]:
        """Return the CRF loss (training) or decoded label ids (inference).

        Args:
            input_ids: Token ids, shape (batch, seq).
            bbox: Normalized word bounding boxes aligned with the tokens.
            attention_mask: 1 for real tokens, 0 for padding.
            labels: Gold label ids; when given, a scalar loss is returned.

        Returns:
            A scalar loss tensor when ``labels`` is provided, otherwise a
            list (one entry per batch item) of Viterbi-decoded label-id
            sequences.
        """
        outputs = self.layoutlmv3(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            return_dict=True
        )

        sequence_output = outputs.last_hidden_state
        emissions = self.classifier(sequence_output)
        # CRF expects a boolean mask to ignore padding positions.
        mask = attention_mask.bool()

        if labels is not None:
            # TorchCRF returns the per-sequence log-likelihood; negate for a loss.
            log_likelihood = self.crf(emissions, labels, mask=mask)
            loss = -log_likelihood.mean()
            return loss
        else:
            best_paths = self.crf.viterbi_decode(emissions, mask=mask)
            return best_paths
78
+
79
+
80
+ # ============================================================================
81
+ # COLUMN DETECTION MODULE (Re-included for completeness)
82
+ # ============================================================================
83
+
84
def get_word_data_for_detection(page: fitz.Page, top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
    """Collect ``(word, x1, y1, x2, y2)`` tuples for column-gutter detection.

    Words are taken from the PDF text layer; when the page has no extractable
    text, the page is rasterized at 3x and OCR'd with Tesseract instead
    (pixel coordinates scaled back by 3 to approximate PDF space). Words
    overlapping the top/bottom margin bands are discarded.
    """
    raw_words = page.get_text("words")
    if len(raw_words) == 0:
        # No text layer: fall back to Tesseract OCR on a 3x render.
        try:
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            collected = []
            for idx in range(len(ocr['level'])):
                if not ocr['text'][idx].strip():
                    continue
                # Divide by 3 to undo the render scale factor.
                left = ocr['left'][idx] / 3
                top = ocr['top'][idx] / 3
                right = (ocr['left'][idx] + ocr['width'][idx]) / 3
                bottom = (ocr['top'][idx] + ocr['height'][idx]) / 3
                collected.append((ocr['text'][idx], left, top, right, bottom))

            word_tuples = collected
        except Exception:
            # OCR fallback failed as well -- nothing usable on this page.
            return []
    else:
        # fitz word tuples are (x0, y0, x1, y1, word, ...); reorder word-first.
        word_tuples = [(w[4], w[0], w[1], w[2], w[3]) for w in raw_words]

    page_height = page.rect.height
    y_min = page_height * top_margin_percent
    y_max = page_height * (1 - bottom_margin_percent)

    # Keep only words fully inside the vertical band [y_min, y_max].
    return [
        (word, x1, y1, x2, y2)
        for word, x1, y1, x2, y2 in word_tuples
        if y1 >= y_min and y2 <= y_max
    ]
122
+
123
+
124
def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
    """Find candidate column-gutter X positions from word-edge density.

    Builds a histogram of word left/right edge X coordinates, smooths it,
    and searches the inverted signal for deep valleys (low-density vertical
    bands). A valley is kept only when it is flanked on both sides by
    sufficiently dense text. Returns valley X positions sorted ascending.
    """
    if not word_data:
        return []

    # Every word contributes both of its horizontal edges to the histogram.
    edge_xs = [coord for _, x1, _, x2, _ in word_data for coord in (x1, x2)]

    max_x = max(edge_xs)
    bin_size = params['cluster_bin_size']
    num_bins = int(np.ceil(max_x / bin_size))

    hist, bin_edges = np.histogram(edge_xs, bins=num_bins, range=(0, max_x))
    smoothed = gaussian_filter1d(hist.astype(float), sigma=params['cluster_smoothing'])

    # Invert so that density valleys become peaks for find_peaks.
    inverted = np.max(smoothed) - smoothed
    peaks, props = find_peaks(
        inverted,
        height=0,
        distance=params['cluster_min_width'] / bin_size
    )

    if not peaks.size:
        return []

    # Only keep valleys whose density falls below the chosen percentile.
    threshold_value = np.percentile(smoothed, params['cluster_threshold_percentile'])
    inverted_threshold = np.max(smoothed) - threshold_value
    deep_valleys = peaks[props['peak_heights'] >= inverted_threshold]
    candidate_xs = [int(bin_edges[p]) for p in deep_valleys]

    min_flank_density = params['cluster_prominence'] * np.max(smoothed)
    accepted = []

    for x_coord in candidate_xs:
        bin_idx = np.searchsorted(bin_edges, x_coord) - 1
        window = int(params['cluster_min_width'] / bin_size)

        lo_start, lo_end = max(0, bin_idx - window), bin_idx
        hi_start, hi_end = bin_idx + 1, min(len(smoothed), bin_idx + 1 + window)

        # Skip valleys at the histogram border where a side window is empty.
        if lo_end <= lo_start or hi_end <= hi_start:
            continue

        # A true gutter must have dense text on BOTH of its sides.
        left_density = np.mean(smoothed[lo_start:lo_end])
        right_density = np.mean(smoothed[hi_start:hi_end])
        if left_density >= min_flank_density and right_density >= min_flank_density:
            accepted.append(x_coord)

    return sorted(accepted)
172
+
173
+
174
def detect_column_gutters(pdf_path: str, page_num: int, **params) -> Optional[int]:
    """Detect a single vertical column gutter on one PDF page.

    Args:
        pdf_path: Path to the PDF document.
        page_num: 0-based page index.
        **params: Tuning knobs forwarded to the word extraction and
            histogram analysis (see ``calculate_x_gutters``).

    Returns:
        The gutter's X coordinate in PDF points, or ``None`` when the page
        looks single-column or detection fails for any reason.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_num)
            # BUG FIX: capture geometry while the document is still open --
            # the original read page.rect.width after doc.close(), and a
            # page of a closed document is invalid in PyMuPDF.
            page_width = page.rect.width
            word_data = get_word_data_for_detection(page, params.get('top_margin_percent', 0.10),
                                                    params.get('bottom_margin_percent', 0.10))
            if not word_data:
                return None
            separators = calculate_x_gutters(word_data, params)
        finally:
            # Always release the document, including on error paths
            # (the original leaked it when an exception was raised).
            doc.close()

        if len(separators) == 1:
            return separators[0]
        elif len(separators) > 1:
            # Several candidate gutters: prefer the one closest to the
            # horizontal center of the page.
            center_x = page_width / 2
            return min(separators, key=lambda x: abs(x - center_x))
        return None
    except Exception as e:
        print(f"DEBUG: Column detection failed for page {page_num}: {e}")
        return None
199
+
200
+
201
+ def _merge_integrity(all_words_by_page: List[str], all_bboxes_raw: List[List[int]],
202
+ column_separator_x: Optional[int]) -> List[List[str]]:
203
+ """Splits the words/bboxes into two columns if a separator is present."""
204
+ if column_separator_x is None:
205
+ return [all_words_by_page]
206
+
207
+ left_column_words = []
208
+ right_column_words = []
209
+ gutter_min_x = column_separator_x - 10
210
+ gutter_max_x = column_separator_x + 10
211
+
212
+ for i, (word, bbox_raw) in enumerate(zip(all_words_by_page, all_bboxes_raw)):
213
+ x1_raw, _, x2_raw, _ = bbox_raw
214
+ center_x = (x1_raw + x2_raw) / 2
215
+
216
+ if center_x < column_separator_x:
217
+ left_column_words.append(word)
218
+ else:
219
+ right_column_words.append(word)
220
+
221
+ return [c for c in [left_column_words, right_column_words] if c]
222
+
223
+
224
def post_process_predictions(words: List[str], bboxes: List[List[int]], predictions: List[str]) -> List[Dict[str, Any]]:
    """Group word-level BIO predictions into contiguous labeled blocks.

    Walks the (word, bbox, label) triples in order. ``B-*`` opens a new
    block, ``I-*`` with the matching tag extends the open block (growing its
    union bbox), and anything else closes it. Stray labels still produce
    blocks: a plain ``"O"`` becomes a single-word ``'OTHER'`` block, and an
    ``I-*`` without a preceding ``B-*`` becomes a single-word block of its tag.

    Args:
        words: Words in reading order.
        bboxes: Matching ``[x1, y1, x2, y2]`` boxes.
        predictions: Matching BIO label strings (e.g. ``'B-QUESTION'``).

    Returns:
        A list of dicts with keys ``text``, ``tag``, ``words`` (per-word
        detail) and ``bbox`` (union box). Empty input yields an empty list.
    """

    def _new_block(word: str, bbox: List[int], label: str, tag: Optional[str]) -> Dict[str, Any]:
        # Single-word seed block; 'bbox' is copied so later mutation of the
        # union box never aliases the caller's list.
        return {
            'text': word,
            'tag': tag,
            'words': [{'text': word, 'bbox': bbox, 'label': label}],
            'bbox': list(bbox)
        }

    structured_blocks: List[Dict[str, Any]] = []
    current_block: Optional[Dict[str, Any]] = None

    for word, bbox, label in zip(words, bboxes, predictions):
        # "B-QUESTION" -> ("B", "QUESTION"); "O" -> ("O", None).
        prefix, tag = (label.split('-', 1) + [None])[:2]

        if prefix == 'B':
            if current_block:
                structured_blocks.append(current_block)
            current_block = _new_block(word, bbox, label, tag)

        elif prefix == 'I' and current_block and current_block['tag'] == tag:
            current_block['text'] += ' ' + word
            current_block['words'].append({'text': word, 'bbox': bbox, 'label': label})
            # Grow the union bounding box to cover the new word.
            current_block['bbox'][0] = min(current_block['bbox'][0], bbox[0])
            current_block['bbox'][1] = min(current_block['bbox'][1], bbox[1])
            current_block['bbox'][2] = max(current_block['bbox'][2], bbox[2])
            current_block['bbox'][3] = max(current_block['bbox'][3], bbox[3])

        else:  # 'O' or an I-* that does not match the open block's tag
            if current_block:
                structured_blocks.append(current_block)
            current_block = None

            if label == 'O':
                # Keep 'O' words as individual blocks; callers may filter them.
                structured_blocks.append(_new_block(word, bbox, label, 'OTHER'))
            elif prefix == 'I':
                # Isolated I-* that missed its B-*: start a one-word block.
                structured_blocks.append(_new_block(word, bbox, label, tag))

    if current_block:
        structured_blocks.append(current_block)

    return structured_blocks
289
+
290
+
291
+ # ============================================================================
292
+ # CORE INFERENCE FUNCTION (WITH DEBUGGING LOGS)
293
+ # ============================================================================
294
+
295
def run_inference_and_structure(pdf_path: str, model_path: str, inference_output_path: str,
                                preprocessed_json_path: str,
                                column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
    """
    Runs LayoutLMv3-CRF inference with extensive debugging logs.

    Pipeline per page: read YOLO/OCR words from the preprocessed JSON,
    normalize bboxes to the 0-1000 LayoutLM space, detect a column gutter,
    split words into column chunks, run the model on <=CHUNK_SIZE-word
    sub-chunks, map token predictions back to words, and group them into
    structured blocks. Results are written to ``inference_output_path`` as
    JSON and also returned.

    Returns an empty list on any fatal setup error (model, JSON, or PDF
    failing to load).
    """
    print("--- 1. MODEL SETUP ---")
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"DEBUG: Using device: {device}")

    try:
        model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
        checkpoint = torch.load(model_path, map_location=device)

        # Checkpoint may be a raw state dict or wrapped under
        # 'model_state_dict'; older checkpoints used the 'layoutlm.' prefix,
        # so rename keys to the current 'layoutlmv3.' attribute name.
        model_state = checkpoint.get('model_state_dict', checkpoint)
        fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
        model.load_state_dict(fixed_state_dict)
        model.to(device)
        model.eval()
        print(f"✅ Model loaded successfully from {model_path}. Total {len(fixed_state_dict)} keys loaded.")
    except Exception as e:
        print(f"❌ FATAL ERROR during model loading: {e}")
        return []

    # --------------------------------------------------------------------------
    # 2. DATA LOADING & PREPARATION
    # --------------------------------------------------------------------------
    print("\n--- 2. DATA LOADING ---")
    try:
        with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
            preprocessed_data = json.load(f)
        print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
    except Exception as e:
        print(f"❌ Error loading preprocessed JSON: {e}")
        return []

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"❌ Error loading PDF: {e}")
        return []

    all_pages_data = []
    # Words per model call. NOTE(review): 500 words can exceed 512 tokens
    # after subword splitting, so tails may be truncated -- confirm intent.
    CHUNK_SIZE = 500

    for page_data in preprocessed_data:
        page_num_1_based = page_data['page_number']
        page_num_0_based = page_num_1_based - 1
        print(f"\nProcessing Page {page_num_1_based}...")

        fitz_page = doc.load_page(page_num_0_based)
        page_width, page_height = fitz_page.rect.width, fitz_page.rect.height

        words = []
        bboxes_raw_pdf_space = []
        normalized_bboxes_list = []
        # YOLO boxes were produced at 2x render scale -- assumes the
        # preprocessing script rendered at 2x; TODO confirm.
        scale_factor = 2.0

        for item in page_data['data']:
            word = item['word']
            raw_yolo_bbox = item['bbox']

            # Scale YOLO pixel coords back to PDF point space.
            bbox_pdf = [
                int(raw_yolo_bbox[0] / scale_factor),
                int(raw_yolo_bbox[1] / scale_factor),
                int(raw_yolo_bbox[2] / scale_factor),
                int(raw_yolo_bbox[3] / scale_factor)
            ]

            # Normalize to the 0-1000 grid LayoutLMv3 expects, clamped.
            normalized_bbox = [
                max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
                max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
                max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
                max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
            ]

            words.append(word)
            bboxes_raw_pdf_space.append(bbox_pdf)
            normalized_bboxes_list.append(normalized_bbox)

        if not words:
            print(f" DEBUG: Page {page_num_1_based} has no words in preprocessed data. Skipping.")
            continue

        print(f" DEBUG: Page {page_num_1_based} extracted {len(words)} words.")

        # --------------------------------------------------------------------------
        # 3. COLUMN DETECTION & CHUNKING
        # --------------------------------------------------------------------------
        column_detection_params = column_detection_params or {}
        column_separator_x = detect_column_gutters(pdf_path, page_num_0_based, **column_detection_params)

        if column_separator_x is not None:
            print(f" DEBUG: Column detected at X={column_separator_x}. Splitting.")
        else:
            print(f" DEBUG: No column detected. Processing as a single chunk.")

        word_chunks = _merge_integrity(words, bboxes_raw_pdf_space, column_separator_x)
        print(f" DEBUG: Split into {len(word_chunks)} column/chunks.")

        page_structured_data = {'page_number': page_num_1_based, 'structured_blocks': []}

        # --------------------------------------------------------------------------
        # 4. INFERENCE LOOP
        # --------------------------------------------------------------------------

        # Re-alignment is simplified and potentially slow (O(n^2) over words).
        # A proper way would be to split all three lists (words, bboxes_pdf,
        # bboxes_norm) at the same time; kept as-is for minimal changes.
        # NOTE(review): duplicate words can mis-align here because matching
        # is by text with a forward-only search -- verify against real pages.

        current_word_idx = 0

        for chunk_idx, chunk_words in enumerate(word_chunks):
            if not chunk_words: continue

            # Reconstruct the aligned chunk data (bboxes were not split by
            # _merge_integrity, so look each chunk word up in the page lists).
            current_original_index = 0
            temp_chunk_norm_bboxes = []
            temp_chunk_pdf_bboxes = []
            found_words = []

            # Simple, but slow, way to re-align data for the chunk:
            for word_to_find in chunk_words:
                try:
                    # Find the index of the word in the master list, starting search from the last found position
                    i = words[current_original_index:].index(word_to_find) + current_original_index
                    temp_chunk_norm_bboxes.append(normalized_bboxes_list[i])
                    temp_chunk_pdf_bboxes.append(bboxes_raw_pdf_space[i])
                    found_words.append(words[i])
                    current_original_index = i + 1
                except ValueError:
                    pass  # Skip words that cannot be re-located in the master list

            chunk_words = found_words
            chunk_normalized_bboxes = temp_chunk_norm_bboxes
            chunk_bboxes_pdf = temp_chunk_pdf_bboxes

            print(f" DEBUG: Column/Chunk {chunk_idx + 1} has {len(chunk_words)} words.")

            # Sub-chunking for max_seq_len (512)
            for i in range(0, len(chunk_words), CHUNK_SIZE):
                sub_words = chunk_words[i:i + CHUNK_SIZE]
                sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
                sub_bboxes_pdf = chunk_bboxes_pdf[i:i + CHUNK_SIZE]

                encoded_input = tokenizer(
                    sub_words,
                    boxes=sub_bboxes,
                    truncation=True,
                    padding="max_length",
                    max_length=512,
                    # is_split_into_words=True,
                    return_tensors="pt"
                )

                input_ids = encoded_input['input_ids'].to(device)
                bbox = encoded_input['bbox'].to(device)
                attention_mask = encoded_input['attention_mask'].to(device)

                print(f" DEBUG INFER: Sub-chunk size: {len(sub_words)} words. Input shape: {input_ids.shape}")

                with torch.no_grad():
                    # No labels -> Viterbi decode; one id list per batch item.
                    predictions_int_list = model(input_ids, bbox, attention_mask)

                if not predictions_int_list:
                    print(" ❌ INFERENCE FAILED: Model returned empty list of predictions.")
                    continue

                predictions_int = predictions_int_list[0]

                # --- CHECK FOR NON-'O' PREDICTIONS ---
                non_o_count = sum(1 for p in predictions_int if p != 0)
                print(
                    f" DEBUG INFER: Raw predictions (tokens): Total {len(predictions_int)}. Non-'O' tokens: {non_o_count}.")
                if non_o_count == 0:
                    print(" ⚠️ WARNING: Model is predicting 'O' for all tokens. Check training or input quality.")
                # -----------------------------------

                # Map token predictions back to original words
                word_ids = encoded_input.word_ids()
                word_idx_to_pred_id = {}

                for token_idx, word_idx in enumerate(word_ids):
                    if word_idx is not None and word_idx < len(sub_words):
                        # Only take the prediction of the FIRST sub-token for a word
                        if word_idx not in word_idx_to_pred_id:
                            word_idx_to_pred_id[word_idx] = predictions_int[token_idx]

                final_predictions_str = []
                # Map integer IDs back to string labels (words truncated away
                # by the 512-token limit default to id 0, i.e. 'O').
                for current_word_idx in range(len(sub_words)):
                    pred_id_or_tensor = word_idx_to_pred_id.get(current_word_idx, 0)
                    pred_id = pred_id_or_tensor.item() if torch.is_tensor(pred_id_or_tensor) else pred_id_or_tensor

                    # This is the final word-level prediction. If it's always 0, post-processing fails.
                    final_predictions_str.append(ID_TO_LABEL[pred_id])

                # --- POST-PROCESSING ---
                structured_blocks = post_process_predictions(sub_words, sub_bboxes_pdf, final_predictions_str)

                print(f" DEBUG POST: Created {len(structured_blocks)} structured blocks from this sub-chunk.")

                page_structured_data['structured_blocks'].extend(structured_blocks)

        print(
            f" DEBUG: Page {page_num_1_based} final total structured blocks: {len(page_structured_data['structured_blocks'])}")
        all_pages_data.append(page_structured_data)

    doc.close()

    # Save final structured predictions
    with open(inference_output_path, 'w', encoding='utf-8') as f:
        json.dump(all_pages_data, f, indent=4)

    print(f"\n✅ All pages processed. Structured data saved to {os.path.basename(inference_output_path)}")

    return all_pages_data
515
+
516
+
517
+ # --- 5. Label Studio Conversion Utility (Included for completeness) ---
518
+
519
def create_label_studio_span(all_results, start_idx, end_idx, label):
    """Create one Label Studio "labels" result span with char offsets.

    Args:
        all_results: Word-level dicts with 'word' and 'bbox'
            (``[x0, y0, x1, y1]``) keys, in reading order for the page.
        start_idx: Index of the first word of the entity (inclusive).
        end_idx: Index of the last word of the entity (inclusive).
        label: Entity tag, e.g. ``'QUESTION'``.

    Returns:
        A Label Studio result dict whose character offsets are computed
        against the page text formed by joining all words with single
        spaces, with the union bounding box of the entity's words attached.
    """
    entity_words = [all_results[i]['word'] for i in range(start_idx, end_idx + 1)]
    entity_bboxes = [all_results[i]['bbox'] for i in range(start_idx, end_idx + 1)]

    # Union bounding box over all words in the span.
    x0 = min(bbox[0] for bbox in entity_bboxes)
    y0 = min(bbox[1] for bbox in entity_bboxes)
    x1 = max(bbox[2] for bbox in entity_bboxes)
    y1 = max(bbox[3] for bbox in entity_bboxes)

    all_words = [r['word'] for r in all_results]

    # Character offset of the span inside " ".join(all_words): length of the
    # joined prefix plus one separating space (absent when the span starts
    # at word 0). The original also built the full page string here but
    # never used it; that dead local has been removed.
    prefix_words = all_words[:start_idx]
    start_char = len(" ".join(prefix_words)) + (1 if prefix_words else 0)
    span_text = " ".join(entity_words)
    end_char = start_char + len(span_text)

    return {
        "from_name": "label",
        "to_name": "text",
        "type": "labels",
        "value": {
            "start": start_char,
            "end": end_char,
            "text": span_text,
            "labels": [label],
            "bbox": {
                "x": x0,
                "y": y0,
                "width": x1 - x0,
                "height": y1 - y0
            }
        },
        "score": 0.99
    }
555
+
556
+
557
def convert_to_label_studio_format(structured_data: List[Dict[str, Any]],
                                   output_path: str,
                                   pdf_file_name: str) -> None:
    """Convert structured predictions to Label Studio format.

    Flattens each page's blocks back to a word stream, rebuilds contiguous
    BIO entities from the word labels, and emits one Label Studio task per
    page with character-offset spans. The tasks are written to
    ``output_path`` as JSON.
    """
    final_tasks = []

    for page_data in structured_data:
        page_num = page_data['page_number']
        if 'structured_blocks' not in page_data:
            continue

        # Flatten every block back into a word-level stream.
        page_results = [
            {
                'word': word_info['text'],
                'bbox': word_info['bbox'],
                # FIX: Use the full label string (e.g., 'B-QUESTION')
                'predicted_label': word_info['label'],
            }
            for block in page_data['structured_blocks']
            if 'words' in block
            for word_info in block['words']
        ]

        if not page_results:
            print(f"DEBUG LS: Page {page_num} has no word-level results. Skipping.")
            continue

        original_words = [r['word'] for r in page_results]
        original_bboxes = [r['bbox'] for r in page_results]
        text_string = " ".join(original_words)

        results = []
        open_label = None
        open_start = None

        for idx, item in enumerate(page_results):
            label = item['predicted_label']
            # Tag without the BIO prefix (e.g. 'QUESTION' from 'B-QUESTION').
            tag_only = label.split('-', 1)[-1] if '-' in label else label

            if label.startswith('B-'):
                # Close any running entity, then open a new one here.
                if open_label:
                    results.append(create_label_studio_span(
                        page_results, open_start, idx - 1, open_label
                    ))
                open_label = tag_only
                open_start = idx
            elif label.startswith('I-') and open_label == tag_only:
                # Continuation of the running entity -- nothing to emit yet.
                continue
            else:
                # 'O' or a mismatching tag terminates the running entity.
                if open_label:
                    results.append(create_label_studio_span(
                        page_results, open_start, idx - 1, open_label
                    ))
                open_label = None
                open_start = None

        # Flush an entity that runs to the end of the page.
        if open_label:
            results.append(create_label_studio_span(
                page_results, open_start, len(page_results) - 1, open_label
            ))

        print(f"DEBUG LS: Page {page_num} generated {len(results)} Label Studio spans.")

        final_tasks.append({
            "data": {
                "text": text_string,
                "original_words": original_words,
                "original_bboxes": original_bboxes
            },
            "annotations": [{"result": results}],
            "meta": {"page_number": page_num, "column_index": 1}
        })

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(final_tasks, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Label Studio tasks created and saved to {output_path}")
636
+
637
+
638
if __name__ == "__main__":
    # CLI driver: pre-flight checks, inference, optional Label Studio export.
    parser = argparse.ArgumentParser(
        description="LayoutLMv3 Inference Pipeline for PDF and Label Studio OCR Conversion.")
    parser.add_argument("--input_pdf", type=str, required=True,
                        help="Path to the input PDF file for inference.")
    parser.add_argument("--model_path", type=str,
                        default="checkpoints/layoutlmv3_trained_20251031_102846_recovered.pth",
                        help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
    parser.add_argument("--inference_output", type=str, default="structured_yolo_predictions.json",
                        help="Path to save the intermediate structured predictions.")
    parser.add_argument("--label_studio_output", type=str, default="label_studio_import.json",
                        help="Path to save the final Label Studio import JSON.")
    parser.add_argument("--preprocessed_json", type=str, required=True,
                        help="Path to the combined JSON output from the YOLO/OCR script.")
    parser.add_argument("--no_labelstudio", action="store_true",
                        help="If set, skip creating the Label Studio import JSON and only write structured predictions.")
    # NOTE(review): --verbose is parsed but never read anywhere in this script.
    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose printing.")
    args = parser.parse_args()

    # 1. Check for required files
    print("--- 0. PRE-CHECK ---")
    if not os.path.exists(args.model_path):
        print(f"❌ FATAL ERROR: Model checkpoint not found at {args.model_path}.")
        sys.exit(1)
    if not os.path.exists(args.input_pdf):
        print(f"❌ FATAL ERROR: Input PDF not found at {args.input_pdf}.")
        sys.exit(1)
    if not os.path.exists(args.preprocessed_json):
        print(f"❌ FATAL ERROR: Preprocessed JSON not found at {args.preprocessed_json}. Run the YOLO/OCR script first.")
        sys.exit(1)
    print("✅ All required files found.")

    # 2. Column Detection Parameters (Tuning required)
    # NOTE(review): empirical defaults; presumably need re-tuning per corpus.
    column_params = {
        'top_margin_percent': 0.10,
        'bottom_margin_percent': 0.10,
        'cluster_prominence': 0.70,
        'cluster_bin_size': 5,
        'cluster_smoothing': 2,
        'cluster_threshold_percentile': 30,
        'cluster_min_width': 25,
    }

    # 3. Run inference
    try:
        structured_data = run_inference_and_structure(
            args.input_pdf,
            args.model_path,
            args.inference_output,
            args.preprocessed_json,
            column_detection_params=column_params
        )
    except Exception as e:
        # Keep going so the final status section still runs.
        print(f"❌ Fatal error while running inference: {e}")
        structured_data = []

    # 4. If requested, convert to Label Studio format
    if structured_data and not args.no_labelstudio:
        try:
            convert_to_label_studio_format(
                structured_data=structured_data,
                output_path=args.label_studio_output,
                pdf_file_name=args.input_pdf
            )
        except Exception as e:
            print(f"❌ Error while converting to Label Studio format: {e}")
    elif not structured_data:
        print("⚠️ No structured data produced — skipping Label Studio conversion.")
    else:
        print("ℹ️ Skipped Label Studio conversion as requested (--no_labelstudio).")

    # 5. Final status message
    print("\n--- 5. FINAL STATUS ---")
    print(f"Finished. Structured predictions file: {os.path.abspath(args.inference_output)}")
    if structured_data and not args.no_labelstudio:
        print(f"Label Studio import file: {os.path.abspath(args.label_studio_output)}")