heerjtdev commited on
Commit
fc8c0fc
·
verified ·
1 Parent(s): 37a91ca

Rename test_layout_yolo_columns_log.py to app.py

Browse files
Files changed (2) hide show
  1. app.py +59 -0
  2. test_layout_yolo_columns_log.py +0 -714
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party imports: Gradio for the web UI, RapidOCR for the OCR engine.
import gradio as gr
from rapidocr import RapidOCR, OCRVersion

# 1. Initialize the OCR engine once with v5 defaults
# We use v5 for Detection/Recognition and v4 for Classification (most stable v5 setup)
# NOTE(review): constructed at import time, so model weights are loaded once per
# process and shared by every request — presumably RapidOCR is thread-safe here; verify.
engine = RapidOCR(params={
    "Det.ocr_version": OCRVersion.PPOCRV5,
    "Rec.ocr_version": OCRVersion.PPOCRV5,
    "Cls.ocr_version": OCRVersion.PPOCRV4,
})
11
+
12
def perform_ocr(img):
    """Run OCR on *img* and return data for the three Gradio output widgets.

    Args:
        img: numpy image from the Gradio input component, or ``None`` when
            nothing has been uploaded yet.

    Returns:
        tuple: ``(vis_img, word_list, elapsed)`` where ``vis_img`` is the
        annotated preview image (or ``None``), ``word_list`` is a list of
        ``[id, text, confidence]`` rows for the Dataframe, and ``elapsed``
        is the processing time formatted like ``"0.123s"``.
    """
    if img is None:
        # Fix: keep the empty-input return consistent with the success path —
        # an empty row list for the table and the same "<seconds>s" format.
        return None, [], "0.000s"

    # 2. Run OCR. return_word_box=True provides the word/char level detail
    ocr_result = engine(img, return_word_box=True)

    # 3. Get the annotated preview image
    vis_img = ocr_result.vis()

    # 4. Format word-level results for the Dataframe by flattening the
    # per-line tuples of (text, score, box) into sequential table rows.
    word_list = []
    if ocr_result.word_results:
        flat_results = sum(ocr_result.word_results, ())
        for i, (text, score, _) in enumerate(flat_results):
            word_list.append([i + 1, text, round(float(score), 3)])

    return vis_img, word_list, f"{ocr_result.elapse:.3f}s"
31
+
32
# 5. Build a clean, minimal UI
# Layout: inputs (image + run button) on the left, the annotated preview and
# timing on the right, and the word-level results table underneath.
with gr.Blocks(title="Rapid⚡OCR Simple") as demo:
    gr.Markdown("# Rapid⚡OCR v5")
    gr.Markdown("Upload an image to extract text with word-level bounding boxes.")

    with gr.Row():
        with gr.Column():
            # type="numpy" so perform_ocr receives an ndarray directly.
            input_img = gr.Image(label="Input Image", type="numpy")
            run_btn = gr.Button("Run OCR", variant="primary")

        with gr.Column():
            output_img = gr.Image(label="Preview (Bounding Boxes)")
            elapse_info = gr.Textbox(label="Processing Time")

    # Read-only table fed by the [id, text, confidence] rows from perform_ocr.
    result_table = gr.Dataframe(
        headers=["ID", "Text", "Confidence"],
        label="Detected Words",
        interactive=False
    )

    # Wire the button to the OCR callback; output order must match the
    # 3-tuple returned by perform_ocr.
    run_btn.click(
        fn=perform_ocr,
        inputs=[input_img],
        outputs=[output_img, result_table, elapse_info]
    )

if __name__ == "__main__":
    demo.launch()
test_layout_yolo_columns_log.py DELETED
@@ -1,714 +0,0 @@
1
- import json
2
- import argparse
3
- import os
4
- import torch
5
- import torch.nn as nn
6
- from TorchCRF import CRF
7
- from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
8
- import pytesseract
9
- from PIL import Image
10
- import fitz # PyMuPDF
11
- from typing import List, Dict, Any, Optional, Union, Tuple
12
- import numpy as np
13
- from scipy.signal import find_peaks
14
- from scipy.ndimage import gaussian_filter1d
15
- import sys
16
- import io
17
-
18
- # ============================================================================
19
- # CONSTANTS & MODEL DEFINITION
20
- # ============================================================================
21
-
22
# Labels must match the training labels! (Use the most detailed set)
# BIO tagging scheme: "B-" opens an entity span, "I-" continues the current
# one, and "O" marks tokens outside any entity. Index 0 is deliberately "O"
# since downstream code treats prediction id 0 as the default/fallback.
ID_TO_LABEL = {
    0: "O",
    1: "B-QUESTION", 2: "I-QUESTION",
    3: "B-OPTION", 4: "I-OPTION",
    5: "B-ANSWER", 6: "I-ANSWER",
    7: "B-SECTION_HEADING", 8: "I-SECTION_HEADING",
    9: "B-PASSAGE", 10: "I-PASSAGE"
}
# Number of distinct labels (11) — sizes both the linear classifier and the CRF.
NUM_LABELS = len(ID_TO_LABEL)
32
-
33
-
34
class LayoutLMv3ForTokenClassification(nn.Module):
    """LayoutLMv3 model with a linear layer and a CRF layer on top.

    The backbone encodes (token ids, layout bboxes); the linear head maps
    hidden states to per-label emission scores; the CRF scores/decodes label
    sequences over those emissions.
    """

    def __init__(self, num_labels: int = NUM_LABELS):
        super().__init__()
        self.num_labels = num_labels

        # Backbone and config are both pulled from the base checkpoint so
        # hidden_size stays consistent with the classifier head below.
        config = LayoutLMv3Config.from_pretrained("microsoft/layoutlmv3-base", num_labels=num_labels)
        self.layoutlmv3 = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base", config=config)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.crf = CRF(num_labels)
        # Only re-initializes the classifier head; backbone keeps pretrained weights.
        self.init_weights()

    def init_weights(self):
        """Xavier-initialize the classifier head (backbone is left untouched)."""
        nn.init.xavier_uniform_(self.classifier.weight)
        if self.classifier.bias is not None:
            nn.init.zeros_(self.classifier.bias)

    def forward(
        self,
        input_ids: torch.Tensor,
        bbox: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[List[List[int]], Any]]:
        # Training mode (labels given): returns the scalar negative
        # log-likelihood loss. Inference mode: returns the Viterbi best paths
        # (list of label-id lists, one per batch item).
        # NOTE(review): the annotated return type does not exactly match the
        # inference return (a list of paths) — confirm against TorchCRF's API.

        outputs = self.layoutlmv3(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            return_dict=True
        )

        sequence_output = outputs.last_hidden_state
        emissions = self.classifier(sequence_output)
        # CRF expects a boolean mask over valid (non-padding) positions.
        mask = attention_mask.bool()

        if labels is not None:
            log_likelihood = self.crf(emissions, labels, mask=mask)
            loss = -log_likelihood.mean()
            return loss
        else:
            best_paths = self.crf.viterbi_decode(emissions, mask=mask)
            return best_paths
78
-
79
-
80
- # ============================================================================
81
- # COLUMN DETECTION MODULE (Re-included for completeness)
82
- # ============================================================================
83
-
84
def get_word_data_for_detection(page: fitz.Page, top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
    """Extracts word data for column detection with Y-axis filtering.

    Returns a list of ``(word, x1, y1, x2, y2)`` tuples in PDF coordinates,
    with words inside the top/bottom page margins removed (headers/footers
    would otherwise distort the column histogram). Falls back to Tesseract
    OCR when the page has no embedded text layer; returns [] if that fails.
    """
    word_data = page.get_text("words")
    if len(word_data) == 0:
        # Fallback to Tesseract if PyMuPDF finds no words
        try:
            # Render at 3x for OCR quality; coordinates are divided back by 3
            # below so the result stays in PDF (1x) space.
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
            img_bytes = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_bytes))
            data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            full_word_data = []
            for i in range(len(data['level'])):
                if data['text'][i].strip():
                    x1 = data['left'][i] / 3
                    y1 = data['top'][i] / 3
                    x2 = (data['left'][i] + data['width'][i]) / 3
                    y2 = (data['top'][i] + data['height'][i]) / 3
                    word = data['text'][i]
                    full_word_data.append((word, x1, y1, x2, y2))

            word_data = full_word_data
        except Exception as e:
            # Best-effort fallback: any Tesseract/PIL failure yields "no words".
            # print(f"Tesseract fallback failed: {e}")
            return []
    else:
        # PyMuPDF "words" tuples are (x0, y0, x1, y1, text, ...); reorder to
        # the (word, x1, y1, x2, y2) shape used by the rest of this module.
        word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]

    # Keep only words whose box lies fully inside the vertical band between
    # the top and bottom margins.
    page_height = page.rect.height
    y_min = page_height * top_margin_percent
    y_max = page_height * (1 - bottom_margin_percent)

    filtered_data = [
        (word, x1, y1, x2, y2)
        for word, x1, y1, x2, y2 in word_data
        if y1 >= y_min and y2 <= y_max
    ]
    return filtered_data
122
-
123
-
124
- def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
125
- """Calculates the X-axis histogram and detects significant gutters."""
126
- if not word_data: return []
127
-
128
- x_points = []
129
- for _, x1, _, x2, _ in word_data:
130
- x_points.extend([x1, x2])
131
-
132
- max_x = max(x_points)
133
- bin_size = params['cluster_bin_size']
134
- num_bins = int(np.ceil(max_x / bin_size))
135
-
136
- hist, bin_edges = np.histogram(x_points, bins=num_bins, range=(0, max_x))
137
- smoothed_hist = gaussian_filter1d(hist.astype(float), sigma=params['cluster_smoothing'])
138
- inverted_signal = np.max(smoothed_hist) - smoothed_hist
139
-
140
- peaks, properties = find_peaks(
141
- inverted_signal,
142
- height=0,
143
- distance=params['cluster_min_width'] / bin_size
144
- )
145
-
146
- if not peaks.size: return []
147
-
148
- threshold_value = np.percentile(smoothed_hist, params['cluster_threshold_percentile'])
149
- inverted_threshold = np.max(smoothed_hist) - threshold_value
150
- significant_peaks = peaks[properties['peak_heights'] >= inverted_threshold]
151
- separator_x_coords = [int(bin_edges[p]) for p in significant_peaks]
152
-
153
- final_separators = []
154
- prominence_threshold = params['cluster_prominence'] * np.max(smoothed_hist)
155
-
156
- for x_coord in separator_x_coords:
157
- bin_idx = np.searchsorted(bin_edges, x_coord) - 1
158
- window_size = int(params['cluster_min_width'] / bin_size)
159
-
160
- left_start, left_end = max(0, bin_idx - window_size), bin_idx
161
- right_start, right_end = bin_idx + 1, min(len(smoothed_hist), bin_idx + 1 + window_size)
162
-
163
- if left_end <= left_start or right_end <= right_start: continue
164
-
165
- avg_left_density = np.mean(smoothed_hist[left_start:left_end])
166
- avg_right_density = np.mean(smoothed_hist[right_start:right_end])
167
-
168
- if avg_left_density >= prominence_threshold and avg_right_density >= prominence_threshold:
169
- final_separators.append(x_coord)
170
-
171
- return sorted(final_separators)
172
-
173
-
174
def detect_column_gutters(pdf_path: str, page_num: int, **params) -> Optional[int]:
    """Detect a single vertical column gutter on one PDF page.

    Args:
        pdf_path: Path to the PDF file.
        page_num: 0-based page index to analyze.
        **params: Tuning knobs forwarded to get_word_data_for_detection /
            calculate_x_gutters (margins, bin size, smoothing, ...).

    Returns:
        The gutter X coordinate (PDF space) or ``None`` when the page has no
        usable words, no gutter is found, or any error occurs.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_num)
            word_data = get_word_data_for_detection(page, params.get('top_margin_percent', 0.10),
                                                    params.get('bottom_margin_percent', 0.10))
            if not word_data:
                return None

            # BUG FIX: read the page geometry while the document is still
            # open. The original closed the doc first and then touched
            # page.rect, which raises on a closed PyMuPDF document.
            page_width = page.rect.width
            separators = calculate_x_gutters(word_data, params)
        finally:
            # Also fixes a leak: the doc is now closed even when an
            # exception fires mid-analysis.
            doc.close()

        if len(separators) == 1:
            return separators[0]
        elif len(separators) > 1:
            # Several candidates: prefer the one closest to the page center.
            center_x = page_width / 2
            best_separator = min(separators, key=lambda x: abs(x - center_x))
            return best_separator
        return None
    except Exception as e:
        print(f"DEBUG: Column detection failed for page {page_num}: {e}")
        return None
199
-
200
-
201
- def _merge_integrity(all_words_by_page: List[str], all_bboxes_raw: List[List[int]],
202
- column_separator_x: Optional[int]) -> List[List[str]]:
203
- """Splits the words/bboxes into two columns if a separator is present."""
204
- if column_separator_x is None:
205
- return [all_words_by_page]
206
-
207
- left_column_words = []
208
- right_column_words = []
209
- gutter_min_x = column_separator_x - 10
210
- gutter_max_x = column_separator_x + 10
211
-
212
- for i, (word, bbox_raw) in enumerate(zip(all_words_by_page, all_bboxes_raw)):
213
- x1_raw, _, x2_raw, _ = bbox_raw
214
- center_x = (x1_raw + x2_raw) / 2
215
-
216
- if center_x < column_separator_x:
217
- left_column_words.append(word)
218
- else:
219
- right_column_words.append(word)
220
-
221
- return [c for c in [left_column_words, right_column_words] if c]
222
-
223
-
224
def post_process_predictions(words: List[str], bboxes: List[List[int]], predictions: List[str]) -> List[Dict[str, Any]]:
    """Group word-level BIO predictions into structured entity blocks.

    Args:
        words: Word strings, aligned index-for-index with the other two lists.
        bboxes: One [x1, y1, x2, y2] box per word (PDF space).
        predictions: One label per word ("B-TAG", "I-TAG", or "O").

    Returns:
        A list of block dicts, each with 'text' (space-joined words), 'tag'
        ("QUESTION", ..., or "OTHER" for 'O' words), 'words' (per-word
        detail), and 'bbox' (union of member boxes). 'O' words and 'I-'
        labels with no open block each become single-word blocks.

    Cleanup vs. original: removed the dead `block_count` debug counter and the
    commented-out debug prints; logic is unchanged.
    """
    structured_blocks = []
    current_block = None

    for word, bbox, label in zip(words, bboxes, predictions):
        # "B-QUESTION" -> ("B", "QUESTION"); a bare "O" -> ("O", None).
        prefix, tag = (label.split('-', 1) + [None])[:2]

        if prefix == 'B':
            if current_block:
                structured_blocks.append(current_block)

            current_block = {
                'text': word,
                'tag': tag,
                'words': [{'text': word, 'bbox': bbox, 'label': label}],
                'bbox': list(bbox)
            }

        elif prefix == 'I' and current_block and current_block['tag'] == tag:
            current_block['text'] += ' ' + word
            current_block['words'].append({'text': word, 'bbox': bbox, 'label': label})
            # Grow the block's bbox to cover the newly appended word.
            current_block['bbox'][0] = min(current_block['bbox'][0], bbox[0])
            current_block['bbox'][1] = min(current_block['bbox'][1], bbox[1])
            current_block['bbox'][2] = max(current_block['bbox'][2], bbox[2])
            current_block['bbox'][3] = max(current_block['bbox'][3], bbox[3])

        else:  # 'O' or an 'I-' tag that does not continue the open block
            if current_block:
                structured_blocks.append(current_block)
            current_block = None

            # Handle 'O' or isolated 'I'. We include 'O' for completeness,
            # but they might be filtered later.
            if label == 'O':
                structured_blocks.append({
                    'text': word,
                    'tag': 'OTHER',
                    'words': [{'text': word, 'bbox': bbox, 'label': label}],
                    'bbox': list(bbox)
                })
            elif prefix == 'I':
                # An 'I-' with no matching open block starts an isolated
                # single-word block (the model missed the 'B-').
                structured_blocks.append({
                    'text': word,
                    'tag': tag,
                    'words': [{'text': word, 'bbox': bbox, 'label': label}],
                    'bbox': list(bbox)
                })

    # Flush the last open block at end-of-sequence.
    if current_block:
        structured_blocks.append(current_block)

    return structured_blocks
289
-
290
-
291
- # ============================================================================
292
- # CORE INFERENCE FUNCTION (WITH DEBUGGING LOGS)
293
- # ============================================================================
294
-
295
def run_inference_and_structure(pdf_path: str, model_path: str, inference_output_path: str,
                                preprocessed_json_path: str,
                                column_detection_params: Optional[Dict] = None) -> List[Dict[str, Any]]:
    """
    Runs LayoutLMv3-CRF inference with extensive debugging logs.

    Pipeline: load model/tokenizer -> load preprocessed word/bbox JSON and the
    PDF -> per page: rescale YOLO bboxes to PDF space, normalize to 0-1000 for
    LayoutLMv3, split into columns, run CRF decoding per 500-word sub-chunk,
    and group word predictions into structured blocks.

    Args:
        pdf_path: Source PDF (used for page geometry and column detection).
        model_path: Path to the trained LayoutLMv3-CRF checkpoint (.pth).
        inference_output_path: Where the structured-blocks JSON is written.
        preprocessed_json_path: YOLO/OCR output; a list of pages, each with
            'page_number' and 'data' = [{'word', 'bbox'}, ...].
        column_detection_params: Optional kwargs for detect_column_gutters.

    Returns:
        The per-page structured data (also written to inference_output_path),
        or [] on any fatal loading error.
    """
    print("--- 1. MODEL SETUP ---")
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"DEBUG: Using device: {device}")

    try:
        model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
        checkpoint = torch.load(model_path, map_location=device)

        # Fixed Loading Logic
        # Accept either a raw state dict or a {'model_state_dict': ...} wrapper,
        # and rename legacy 'layoutlm.' prefixes to the current 'layoutlmv3.'.
        model_state = checkpoint.get('model_state_dict', checkpoint)
        fixed_state_dict = {key.replace('layoutlm.', 'layoutlmv3.'): value for key, value in model_state.items()}
        model.load_state_dict(fixed_state_dict)
        model.to(device)
        model.eval()
        print(f"✅ Model loaded successfully from {model_path}. Total {len(fixed_state_dict)} keys loaded.")
    except Exception as e:
        print(f"❌ FATAL ERROR during model loading: {e}")
        return []

    # --------------------------------------------------------------------------
    # 2. DATA LOADING & PREPARATION
    # --------------------------------------------------------------------------
    print("\n--- 2. DATA LOADING ---")
    try:
        with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
            preprocessed_data = json.load(f)
        print(f"✅ Loaded preprocessed data with {len(preprocessed_data)} pages.")
    except Exception as e:
        print(f"❌ Error loading preprocessed JSON: {e}")
        return []

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"❌ Error loading PDF: {e}")
        return []

    all_pages_data = []
    # Words per tokenizer call; well under the 512-token model limit for
    # typical words, though long words can still be truncated.
    CHUNK_SIZE = 500

    for page_data in preprocessed_data:
        page_num_1_based = page_data['page_number']
        page_num_0_based = page_num_1_based - 1
        print(f"\nProcessing Page {page_num_1_based}...")

        fitz_page = doc.load_page(page_num_0_based)
        page_width, page_height = fitz_page.rect.width, fitz_page.rect.height

        words = []
        bboxes_raw_pdf_space = []
        normalized_bboxes_list = []
        # NOTE(review): assumes the YOLO/OCR step rendered pages at 2x PDF
        # resolution, so its bboxes are divided by 2 — confirm upstream.
        scale_factor = 2.0

        for item in page_data['data']:
            word = item['word']
            raw_yolo_bbox = item['bbox']

            # YOLO pixel coords -> PDF point coords.
            bbox_pdf = [
                int(raw_yolo_bbox[0] / scale_factor),
                int(raw_yolo_bbox[1] / scale_factor),
                int(raw_yolo_bbox[2] / scale_factor),
                int(raw_yolo_bbox[3] / scale_factor)
            ]

            # PDF coords -> LayoutLMv3's required 0-1000 normalized space,
            # clamped to the valid range.
            normalized_bbox = [
                max(0, min(1000, int(1000 * bbox_pdf[0] / page_width))),
                max(0, min(1000, int(1000 * bbox_pdf[1] / page_height))),
                max(0, min(1000, int(1000 * bbox_pdf[2] / page_width))),
                max(0, min(1000, int(1000 * bbox_pdf[3] / page_height)))
            ]

            words.append(word)
            bboxes_raw_pdf_space.append(bbox_pdf)
            normalized_bboxes_list.append(normalized_bbox)

        if not words:
            print(f" DEBUG: Page {page_num_1_based} has no words in preprocessed data. Skipping.")
            continue

        print(f" DEBUG: Page {page_num_1_based} extracted {len(words)} words.")

        # --------------------------------------------------------------------------
        # 3. COLUMN DETECTION & CHUNKING
        # --------------------------------------------------------------------------
        column_detection_params = column_detection_params or {}
        column_separator_x = detect_column_gutters(pdf_path, page_num_0_based, **column_detection_params)

        if column_separator_x is not None:
            print(f" DEBUG: Column detected at X={column_separator_x}. Splitting.")
        else:
            print(f" DEBUG: No column detected. Processing as a single chunk.")

        word_chunks = _merge_integrity(words, bboxes_raw_pdf_space, column_separator_x)
        print(f" DEBUG: Split into {len(word_chunks)} column/chunks.")

        page_structured_data = {'page_number': page_num_1_based, 'structured_blocks': []}

        # --------------------------------------------------------------------------
        # 4. INFERENCE LOOP
        # --------------------------------------------------------------------------

        # Re-alignment is simplified and potentially slow. A proper way would be to
        # split all three lists (words, bboxes_pdf, bboxes_norm) at the same time.
        # But we stick to your original approach for minimal changes.

        # NOTE(review): this variable is never read before being shadowed by
        # the label-mapping loop below — it looks vestigial.
        current_word_idx = 0

        for chunk_idx, chunk_words in enumerate(word_chunks):
            if not chunk_words: continue

            # Reconstruct the aligned chunk data (Simplified version of your complex loop)
            current_original_index = 0
            temp_chunk_norm_bboxes = []
            temp_chunk_pdf_bboxes = []
            found_words = []

            # Simple, but slow, way to re-align data for the chunk:
            # NOTE(review): .index() matches by word TEXT from the last found
            # position — repeated words can mis-align if column order differs
            # from reading order.
            for word_to_find in chunk_words:
                try:
                    # Find the index of the word in the master list, starting search from the last found position
                    i = words[current_original_index:].index(word_to_find) + current_original_index
                    temp_chunk_norm_bboxes.append(normalized_bboxes_list[i])
                    temp_chunk_pdf_bboxes.append(bboxes_raw_pdf_space[i])
                    found_words.append(words[i])
                    current_original_index = i + 1
                except ValueError:
                    # print(f" WARNING: Word '{word_to_find}' not found during re-alignment.")
                    pass  # Skip missing words

            chunk_words = found_words
            chunk_normalized_bboxes = temp_chunk_norm_bboxes
            chunk_bboxes_pdf = temp_chunk_pdf_bboxes

            print(f" DEBUG: Column/Chunk {chunk_idx + 1} has {len(chunk_words)} words.")

            # Sub-chunking for max_seq_len (512)
            for i in range(0, len(chunk_words), CHUNK_SIZE):
                sub_words = chunk_words[i:i + CHUNK_SIZE]
                sub_bboxes = chunk_normalized_bboxes[i:i + CHUNK_SIZE]
                sub_bboxes_pdf = chunk_bboxes_pdf[i:i + CHUNK_SIZE]

                encoded_input = tokenizer(
                    sub_words,
                    boxes=sub_bboxes,
                    truncation=True,
                    padding="max_length",
                    max_length=512,
                    # is_split_into_words=True,
                    return_tensors="pt"
                )

                input_ids = encoded_input['input_ids'].to(device)
                bbox = encoded_input['bbox'].to(device)
                attention_mask = encoded_input['attention_mask'].to(device)

                print(f" DEBUG INFER: Sub-chunk size: {len(sub_words)} words. Input shape: {input_ids.shape}")

                # No labels passed -> model returns Viterbi-decoded label ids.
                with torch.no_grad():
                    predictions_int_list = model(input_ids, bbox, attention_mask)

                if not predictions_int_list:
                    print(" ❌ INFERENCE FAILED: Model returned empty list of predictions.")
                    continue

                # Batch size is 1, so take the single decoded path.
                predictions_int = predictions_int_list[0]

                # --- CHECK FOR NON-'O' PREDICTIONS ---
                non_o_count = sum(1 for p in predictions_int if p != 0)
                print(
                    f" DEBUG INFER: Raw predictions (tokens): Total {len(predictions_int)}. Non-'O' tokens: {non_o_count}.")
                if non_o_count == 0:
                    print(" ⚠️ WARNING: Model is predicting 'O' for all tokens. Check training or input quality.")
                # -----------------------------------

                # Map token predictions back to original words
                word_ids = encoded_input.word_ids()
                word_idx_to_pred_id = {}

                for token_idx, word_idx in enumerate(word_ids):
                    if word_idx is not None and word_idx < len(sub_words):
                        # Only take the prediction of the FIRST sub-token for a word
                        if word_idx not in word_idx_to_pred_id:
                            word_idx_to_pred_id[word_idx] = predictions_int[token_idx]

                final_predictions_str = []
                # Map integer IDs back to string labels
                for current_word_idx in range(len(sub_words)):
                    # Words with no token prediction (e.g. truncated) default to 0 = 'O'.
                    pred_id_or_tensor = word_idx_to_pred_id.get(current_word_idx, 0)
                    pred_id = pred_id_or_tensor.item() if torch.is_tensor(pred_id_or_tensor) else pred_id_or_tensor

                    # This is the final word-level prediction. If it's always 0, post-processing fails.
                    final_predictions_str.append(ID_TO_LABEL[pred_id])

                # --- POST-PROCESSING ---
                structured_blocks = post_process_predictions(sub_words, sub_bboxes_pdf, final_predictions_str)

                print(f" DEBUG POST: Created {len(structured_blocks)} structured blocks from this sub-chunk.")

                page_structured_data['structured_blocks'].extend(structured_blocks)

        print(
            f" DEBUG: Page {page_num_1_based} final total structured blocks: {len(page_structured_data['structured_blocks'])}")
        all_pages_data.append(page_structured_data)

    doc.close()

    # Save final structured predictions
    with open(inference_output_path, 'w', encoding='utf-8') as f:
        json.dump(all_pages_data, f, indent=4)

    print(f"\n✅ All pages processed. Structured data saved to {os.path.basename(inference_output_path)}")

    return all_pages_data
515
-
516
-
517
- # --- 5. Label Studio Conversion Utility (Included for completeness) ---
518
-
519
def create_label_studio_span(all_results, start_idx, end_idx, label):
    """Create a Label Studio span with character-level offsets.

    Args:
        all_results: Page-wide list of {'word', 'bbox', ...} dicts; character
            offsets are computed against " ".join of all their words.
        start_idx: Index of the first word in the span (inclusive).
        end_idx: Index of the last word in the span (inclusive).
        label: Entity tag (e.g. "QUESTION") to attach to the span.

    Returns:
        A Label Studio "labels" result dict with start/end character offsets,
        the span text, and the union bbox of the member words.
    """
    entity_words = [all_results[i]['word'] for i in range(start_idx, end_idx + 1)]
    entity_bboxes = [all_results[i]['bbox'] for i in range(start_idx, end_idx + 1)]

    # Union bbox over the span's words.
    x0 = min(bbox[0] for bbox in entity_bboxes)
    y0 = min(bbox[1] for bbox in entity_bboxes)
    x1 = max(bbox[2] for bbox in entity_bboxes)
    y1 = max(bbox[3] for bbox in entity_bboxes)

    # Character offset of the span within the space-joined page text: length
    # of the joined prefix plus one separating space (when a prefix exists).
    # (Cleanup vs. original: removed the unused `text_string` local.)
    all_words = [r['word'] for r in all_results]
    prefix_words = all_words[:start_idx]
    start_char = len(" ".join(prefix_words)) + (1 if prefix_words else 0)
    span_text = " ".join(entity_words)
    end_char = start_char + len(span_text)

    return {
        "from_name": "label",
        "to_name": "text",
        "type": "labels",
        "value": {
            "start": start_char,
            "end": end_char,
            "text": span_text,
            "labels": [label],
            "bbox": {
                "x": x0,
                "y": y0,
                "width": x1 - x0,
                "height": y1 - y0
            }
        },
        "score": 0.99
    }
555
-
556
-
557
def convert_to_label_studio_format(structured_data: List[Dict[str, Any]],
                                   output_path: str,
                                   pdf_file_name: str) -> None:
    """Convert structured predictions to Label Studio format.

    Flattens each page's blocks back to a word stream, re-runs BIO grouping
    over the per-word labels, and emits one Label Studio task per page with
    character-offset spans. Writes the task list as JSON to *output_path*.

    NOTE(review): *pdf_file_name* is accepted but never used in this body —
    confirm whether it was meant to go into the task metadata.
    """
    final_tasks = []

    for page_data in structured_data:
        page_num = page_data['page_number']
        if 'structured_blocks' not in page_data: continue

        # Flatten blocks back to a single per-word list for this page.
        page_results = []
        for block in page_data['structured_blocks']:
            if 'words' in block:
                for word_info in block['words']:
                    page_results.append({
                        'word': word_info['text'],
                        'bbox': word_info['bbox'],
                        # FIX: Use the full label string (e.g., 'B-QUESTION')
                        'predicted_label': word_info['label']
                    })

        if not page_results:
            print(f"DEBUG LS: Page {page_num} has no word-level results. Skipping.")
            continue

        original_words = [r['word'] for r in page_results]
        original_bboxes = [r['bbox'] for r in page_results]
        text_string = " ".join(original_words)

        # Re-run BIO span grouping over the word labels to build LS results.
        results = []
        current_entity_label = None
        current_entity_start_word_index = None

        for i, pred_item in enumerate(page_results):
            label = pred_item['predicted_label']

            # Get the tag (e.g., 'QUESTION' from 'B-QUESTION')
            tag_only = label.split('-', 1)[-1] if '-' in label else label

            if label.startswith('B-'):
                # A new entity begins; close any span that is still open.
                if current_entity_label:
                    results.append(create_label_studio_span(
                        page_results, current_entity_start_word_index, i - 1, current_entity_label
                    ))
                current_entity_label = tag_only
                current_entity_start_word_index = i

            elif label.startswith('I-') and current_entity_label == tag_only:
                # Continuation of the open entity: nothing to emit yet.
                continue

            else:  # Label is 'O' or doesn't match current entity
                if current_entity_label:
                    results.append(create_label_studio_span(
                        page_results, current_entity_start_word_index, i - 1, current_entity_label
                    ))
                current_entity_label = None
                current_entity_start_word_index = None

        # Close a span left open at the end of the page.
        if current_entity_label:
            results.append(create_label_studio_span(
                page_results, current_entity_start_word_index, len(page_results) - 1, current_entity_label
            ))

        print(f"DEBUG LS: Page {page_num} generated {len(results)} Label Studio spans.")

        task = {
            "data": {
                "text": text_string,
                "original_words": original_words,
                "original_bboxes": original_bboxes
            },
            "annotations": [{"result": results}],
            "meta": {"page_number": page_num, "column_index": 1}
        }
        final_tasks.append(task)

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(final_tasks, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Label Studio tasks created and saved to {output_path}")
636
-
637
-
638
if __name__ == "__main__":
    # CLI driver: validate inputs, run inference, optionally export to Label Studio.
    parser = argparse.ArgumentParser(
        description="LayoutLMv3 Inference Pipeline for PDF and Label Studio OCR Conversion.")
    parser.add_argument("--input_pdf", type=str, required=True,
                        help="Path to the input PDF file for inference.")
    parser.add_argument("--model_path", type=str,
                        default="checkpoints/layoutlmv3_trained_20251031_102846_recovered.pth",
                        help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
    parser.add_argument("--inference_output", type=str, default="structured_yolo_predictions.json",
                        help="Path to save the intermediate structured predictions.")
    parser.add_argument("--label_studio_output", type=str, default="label_studio_import.json",
                        help="Path to save the final Label Studio import JSON.")
    parser.add_argument("--preprocessed_json", type=str, required=True,
                        help="Path to the combined JSON output from the YOLO/OCR script.")
    parser.add_argument("--no_labelstudio", action="store_true",
                        help="If set, skip creating the Label Studio import JSON and only write structured predictions.")
    # NOTE(review): --verbose is parsed but never read anywhere in this file.
    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose printing.")
    args = parser.parse_args()

    # 1. Check for required files
    print("--- 0. PRE-CHECK ---")
    if not os.path.exists(args.model_path):
        print(f"❌ FATAL ERROR: Model checkpoint not found at {args.model_path}.")
        sys.exit(1)
    if not os.path.exists(args.input_pdf):
        print(f"❌ FATAL ERROR: Input PDF not found at {args.input_pdf}.")
        sys.exit(1)
    if not os.path.exists(args.preprocessed_json):
        print(f"❌ FATAL ERROR: Preprocessed JSON not found at {args.preprocessed_json}. Run the YOLO/OCR script first.")
        sys.exit(1)
    print("✅ All required files found.")

    # 2. Column Detection Parameters (Tuning required)
    # Forwarded as **kwargs to detect_column_gutters / calculate_x_gutters.
    column_params = {
        'top_margin_percent': 0.10,
        'bottom_margin_percent': 0.10,
        'cluster_prominence': 0.70,
        'cluster_bin_size': 5,
        'cluster_smoothing': 2,
        'cluster_threshold_percentile': 30,
        'cluster_min_width': 25,
    }

    # 3. Run inference
    try:
        structured_data = run_inference_and_structure(
            args.input_pdf,
            args.model_path,
            args.inference_output,
            args.preprocessed_json,
            column_detection_params=column_params
        )
    except Exception as e:
        # Any unexpected failure degrades to "no data" so the status
        # reporting below still runs.
        print(f"❌ Fatal error while running inference: {e}")
        structured_data = []

    # 4. If requested, convert to Label Studio format
    if structured_data and not args.no_labelstudio:
        try:
            convert_to_label_studio_format(
                structured_data=structured_data,
                output_path=args.label_studio_output,
                pdf_file_name=args.input_pdf
            )
        except Exception as e:
            print(f"❌ Error while converting to Label Studio format: {e}")
    elif not structured_data:
        print("⚠️ No structured data produced — skipping Label Studio conversion.")
    else:
        print("ℹ️ Skipped Label Studio conversion as requested (--no_labelstudio).")

    # 5. Final status message
    print("\n--- 5. FINAL STATUS ---")
    print(f"Finished. Structured predictions file: {os.path.abspath(args.inference_output)}")
    if structured_data and not args.no_labelstudio:
        print(f"Label Studio import file: {os.path.abspath(args.label_studio_output)}")