Spaces:
Runtime error
Runtime error
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +126 -80
working_yolo_pipeline.py
CHANGED
|
@@ -2761,100 +2761,146 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2761 |
|
| 2762 |
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2763 |
# List[Dict[str, Any]]]:
|
| 2764 |
-
|
| 2765 |
-
|
| 2766 |
|
| 2767 |
-
|
| 2768 |
-
print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
| 2769 |
-
print("#" * 80)
|
| 2770 |
|
| 2771 |
-
|
| 2772 |
-
temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
|
| 2773 |
-
os.makedirs(temp_pipeline_dir, exist_ok=True)
|
| 2774 |
|
| 2775 |
-
|
| 2776 |
-
|
| 2777 |
-
|
|
|
|
|
|
|
|
|
|
| 2778 |
|
| 2779 |
-
final_result = None
|
| 2780 |
-
try:
|
| 2781 |
-
# Phase 1: Preprocessing with YOLO First + Masking
|
| 2782 |
-
preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2783 |
-
if not preprocessed_json_path_out: return None
|
| 2784 |
|
| 2785 |
-
|
| 2786 |
-
|
| 2787 |
-
|
| 2788 |
-
|
| 2789 |
-
|
| 2790 |
-
|
| 2791 |
-
|
| 2792 |
-
|
| 2793 |
-
|
| 2794 |
-
|
| 2795 |
-
|
| 2796 |
-
|
| 2797 |
-
|
| 2798 |
-
|
| 2799 |
-
|
| 2800 |
-
# ----------------------------------------
|
| 2801 |
-
|
| 2802 |
-
# Phase 3: Decoding
|
| 2803 |
-
structured_data_list = convert_bio_to_structured_json_relaxed(
|
| 2804 |
-
raw_output_path, structured_intermediate_output_path
|
| 2805 |
)
|
| 2806 |
-
if not structured_data_list: return None
|
| 2807 |
-
structured_data_list = correct_misaligned_options(structured_data_list)
|
| 2808 |
-
structured_data_list = process_context_linking(structured_data_list)
|
| 2809 |
|
|
|
|
|
|
|
|
|
|
| 2810 |
|
| 2811 |
-
|
| 2812 |
-
|
|
|
|
|
|
|
| 2813 |
|
|
|
|
| 2814 |
|
| 2815 |
|
| 2816 |
-
|
| 2817 |
-
#
|
| 2818 |
-
|
| 2819 |
-
|
| 2820 |
-
|
| 2821 |
-
|
| 2822 |
-
print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
|
| 2823 |
-
print("=" * 80)
|
| 2824 |
-
|
| 2825 |
-
# 1. Initialize and Load the Classifier
|
| 2826 |
-
classifier = HierarchicalClassifier()
|
| 2827 |
-
if classifier.load_models():
|
| 2828 |
-
# 2. Run Classification on the *Final* Result
|
| 2829 |
-
# The function modifies the list in place and returns it
|
| 2830 |
-
final_result = post_process_json_with_inference(
|
| 2831 |
-
final_result, classifier
|
| 2832 |
-
)
|
| 2833 |
-
print("β
Classification complete. Tags added to final output.")
|
| 2834 |
-
else:
|
| 2835 |
-
print("β Classification model loading failed. Outputting un-tagged data.")
|
| 2836 |
|
| 2837 |
-
|
| 2838 |
-
|
|
|
|
|
|
|
| 2839 |
|
| 2840 |
-
|
| 2841 |
-
print(f"β FATAL ERROR: {e}")
|
| 2842 |
-
import traceback
|
| 2843 |
-
traceback.print_exc()
|
| 2844 |
-
return None
|
| 2845 |
|
| 2846 |
-
|
| 2847 |
-
|
| 2848 |
-
|
| 2849 |
-
|
| 2850 |
-
|
| 2851 |
-
|
| 2852 |
-
|
| 2853 |
-
|
| 2854 |
-
|
| 2855 |
-
|
| 2856 |
-
|
| 2857 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2858 |
|
| 2859 |
|
| 2860 |
|
|
|
|
| 2761 |
|
| 2762 |
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
|
| 2763 |
# List[Dict[str, Any]]]:
|
| 2764 |
# ===============================
# CONFIGURATION & DEVICE SETUP
# ===============================
# Default checkpoint locations; callers may override by passing an explicit
# path to run_document_pipeline()/load_models().
DEFAULT_LAYOUTLMV3_MODEL_PATH = "./models/layoutlmv3_model"
WEIGHTS_PATH = "./weights/yolo_weights.pt"

# Run on GPU when available; every model/tensor below is moved here.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Inference-only module: disable autograd globally to save memory/compute.
# NOTE(review): this is a process-wide side effect at import time — any other
# code in this process that needs gradients must re-enable them explicitly.
torch.set_grad_enabled(False)

# ===============================
# GLOBAL CACHED MODELS
# ===============================
# Populated lazily (singleton style) by load_models(); None until first use.
_layoutlm_model = None
_layoutlm_processor = None
_yolo_model = None
|
| 2777 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2778 |
|
| 2779 |
def load_models(layoutlm_path):
    """Load and cache the LayoutLMv3 and YOLO models (singleton pattern).

    Models are loaded once into module-level globals; subsequent calls
    return the cached instances. On CUDA the LayoutLMv3 model is cast to
    fp16 and wrapped with ``torch.compile`` for faster inference.

    Args:
        layoutlm_path: directory or hub id of the LayoutLMv3 checkpoint.

    Returns:
        tuple: ``(layoutlm_model, layoutlm_processor, yolo_model)``.
    """
    global _layoutlm_model, _layoutlm_processor, _yolo_model

    if _layoutlm_model is None:
        print("🔹 Loading LayoutLMv3...")
        _layoutlm_processor = AutoProcessor.from_pretrained(layoutlm_path)

        _layoutlm_model = (
            LayoutLMv3ForTokenClassification
            .from_pretrained(layoutlm_path)
            .to(DEVICE)
            .eval()
        )

        # FIX: keep the half()/compile() step inside the first-load guard.
        # Running it on every call would re-wrap the cached model with
        # torch.compile each time load_models() is invoked.
        if DEVICE.type == "cuda":
            _layoutlm_model = _layoutlm_model.half()
            _layoutlm_model = torch.compile(_layoutlm_model)

    if _yolo_model is None:
        print("🔹 Loading YOLO...")
        _yolo_model = YOLO(WEIGHTS_PATH)
        # Ensure the underlying nn.Module is in eval mode (no dropout/BN updates).
        _yolo_model.model.eval()

    return _layoutlm_model, _layoutlm_processor, _yolo_model
|
| 2806 |
|
| 2807 |
|
| 2808 |
# ===============================
# PDF UTILITIES
# ===============================
def load_pdf_images(pdf_path):
    """Render every page of a PDF to a PIL RGB image at 200 DPI.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        list: one ``PIL.Image.Image`` per page, in page order.
    """
    images = []
    # FIX: close the document when done — the original leaked the fitz
    # handle. fitz.Document supports the context-manager protocol.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # get_pixmap with no alpha yields packed RGB samples, which
            # Image.frombytes("RGB", ...) expects.
            pix = page.get_pixmap(dpi=200)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
    return images
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2821 |
|
| 2822 |
# ===============================
# MAIN PIPELINE
# ===============================
def run_document_pipeline(pdf_path, layoutlm_path):
    """Run YOLO region detection + LayoutLMv3 token classification on a PDF.

    For each page: render it, detect text regions with YOLO, normalize the
    boxes into LayoutLM's 0-1000 coordinate space, and classify them with
    LayoutLMv3. Pages with no detections are skipped.

    Args:
        pdf_path: path to the input PDF.
        layoutlm_path: LayoutLMv3 checkpoint path forwarded to load_models().

    Returns:
        dict: ``{"pdf": <file name>, "pages_processed": <count of pages
        that produced detections>, "results": [<per-page dict with 1-based
        "page", "num_boxes", and raw label-id "predictions">]}``.
    """
    model, processor, yolo = load_models(layoutlm_path)

    images = load_pdf_images(pdf_path)

    results = []

    for page_idx, image in enumerate(images):
        # -------------------------------
        # YOLO DETECTION
        # -------------------------------
        # Fixed size keeps the YOLO input and the LayoutLM box
        # normalization in the same coordinate space.
        image_resized = image.resize((1024, 1024))

        yolo_result = yolo.predict(
            image_resized,
            verbose=False,
            conf=0.25,
        )[0]

        boxes = []
        words = []

        for box in yolo_result.boxes.xyxy.cpu().numpy():
            x1, y1, x2, y2 = box
            boxes.append([int(x1), int(y1), int(x2), int(y2)])
            # Placeholder token per region — no OCR is run here.
            # NOTE(review): feeding real word text would presumably improve
            # predictions; confirm this placeholder is intentional.
            words.append("text")

        if not boxes:
            continue

        # Normalize boxes for LayoutLM. FIX: clamp into [0, 1000] — the
        # model asserts bounding boxes lie in that range.
        w, h = image_resized.size
        norm_boxes = [
            [
                max(0, min(1000, int(1000 * b[0] / w))),
                max(0, min(1000, int(1000 * b[1] / h))),
                max(0, min(1000, int(1000 * b[2] / w))),
                max(0, min(1000, int(1000 * b[3] / h))),
            ]
            for b in boxes
        ]

        # -------------------------------
        # LAYOUTLM INFERENCE
        # -------------------------------
        encoding = processor(
            image_resized,
            words,
            boxes=norm_boxes,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )

        # FIX: match input dtype to the model. On CUDA the model runs in
        # fp16 (see load_models), so floating-point inputs (pixel_values)
        # must be cast to half or the forward pass raises a Half/Float
        # dtype mismatch. Integer tensors (input_ids, bbox, mask) are
        # moved as-is.
        encoding = {
            k: (v.half() if DEVICE.type == "cuda" and v.is_floating_point()
                else v).to(DEVICE)
            for k, v in encoding.items()
        }

        with torch.no_grad():
            outputs = model(**encoding)

        predictions = outputs.logits.argmax(-1).cpu().tolist()

        results.append({
            "page": page_idx + 1,
            "num_boxes": len(boxes),
            "predictions": predictions,
        })

        # Release per-page activations to keep peak GPU memory low.
        if DEVICE.type == "cuda":
            torch.cuda.empty_cache()

    return {
        "pdf": Path(pdf_path).name,
        "pages_processed": len(results),
        "results": results,
    }
|
| 2904 |
|
| 2905 |
|
| 2906 |
|