Spaces:

heerjtdev
/

layout_latex

Running

App Files Community

heerjtdev committed on Dec 19, 2025

Commit

414d12d

verified ·

1 Parent(s): bf0bbaf

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +23 -23

working_yolo_pipeline.py CHANGED Viewed

@@ -2814,13 +2814,11 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
 def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
     """
     Wraps a standard image file into a single-page PyMuPDF Document.
-    This allows images to be processed by existing PDF-based functions
-    like coordinate scaling and column detection without modification.
     """
     img = Image.open(image_path)
     # Convert image to a PDF stream in memory
@@ -2830,53 +2828,47 @@ def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
 def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     """
-    Main pipeline entry point modified to handle both PDF and Image files.
     """
-    # Initialize YOLO and LayoutLMv3 models (kept from original script)
     yolo_model = YOLO(WEIGHTS_PATH)
-    # 1. DETECT FILE TYPE
     ext = os.path.splitext(input_path)[1].lower()
     is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
     all_pages_data = []
     if is_image:
         print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
-        # 2. IMAGE BRANCH: Wrap image into a fitz page
         doc, page = load_image_as_fitz_page(input_path)
-        # Render the image for YOLO analysis (consistent with your PDF logic)
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
         img_np = pixmap_to_numpy(pix)
-        # 3. PROCESS THE PAGE
-        # Because the 'page' has no native text, the Tesseract OCR fallback
-        # in preprocess_and_ocr_page will trigger automatically.
         page_data, _ = preprocess_and_ocr_page(
             img_np,
             yolo_model,
             input_path,
-            0, # Page Index 0
             page,
             os.path.basename(input_path)
         )
         if page_data:
             all_pages_data.append(page_data)
         doc.close()
     else:
-        # 4. PDF BRANCH: Standard processing (your original logic)
         try:
             doc = fitz.open(input_path)
             print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
             for page_index in range(len(doc)):
                 page = doc[page_index]
-                # Render page at 2.0x scale (consistent with your original script)
                 pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                 img_np = pixmap_to_numpy(pix)
@@ -2888,35 +2880,43 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
                     page,
                     os.path.basename(input_path)
                 )
                 if page_data:
                     all_pages_data.append(page_data)
             doc.close()
         except Exception as e:
             print(f"❌ Error opening PDF {input_path}: {e}")
             return None
-    # 5. CONTINUE EXACTLY AS BEFORE: Sequential processing & Inference
     # Sequence all blocks from all pages (or the single image page)
     sequential_blocks = []
     for p_data in all_pages_data:
         sequential_blocks.extend(p_data.get('blocks', []))
     # Run LayoutLMv3 Inference on the gathered blocks
     final_structured_data = run_layoutlmv3_inference_on_blocks(
         sequential_blocks,
         layoutlmv3_model_path
     )
-    # Run Subject/Concept classification (as implemented in your original script)
     classifier = HierarchicalClassifier()
     if classifier.load_models():
         final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
     return final_structured_data
 #================================================================================
         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---

 def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
     """
     Wraps a standard image file into a single-page PyMuPDF Document.
+    This ensures it can be processed by your existing fitz-based functions
+    (coordinate scaling, column detection, etc.) exactly as before.
     """
     img = Image.open(image_path)
     # Convert image to a PDF stream in memory
 def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
     """
+    Main pipeline modified to handle both PDF and Image files.
     """
+    # 1. INITIALIZE MODELS (Preserving original logic)
     yolo_model = YOLO(WEIGHTS_PATH)
+    # 2. DETECT FILE TYPE
     ext = os.path.splitext(input_path)[1].lower()
     is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
     all_pages_data = []
+    # 3. BRANCH LOGIC: IMAGE VS PDF
     if is_image:
         print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
         doc, page = load_image_as_fitz_page(input_path)
+        # Process as Page 0. Because there is no native text, your existing
+        # Tesseract fallback will naturally trigger to read the content.
         pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
         img_np = pixmap_to_numpy(pix)
         page_data, _ = preprocess_and_ocr_page(
             img_np,
             yolo_model,
             input_path,
+            0, # Page 0
             page,
             os.path.basename(input_path)
         )
         if page_data:
             all_pages_data.append(page_data)
         doc.close()
     else:
+        # Standard PDF Processing Loop
         try:
             doc = fitz.open(input_path)
             print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
             for page_index in range(len(doc)):
                 page = doc[page_index]
                 pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                 img_np = pixmap_to_numpy(pix)
                     page,
                     os.path.basename(input_path)
                 )
                 if page_data:
                     all_pages_data.append(page_data)
             doc.close()
         except Exception as e:
             print(f"❌ Error opening PDF {input_path}: {e}")
             return None
+    # 4. CONTINUE EXACTLY AS BEFORE: Gathering and Inference
+    if not all_pages_data:
+        print("❌ No data extracted from document.")
+        return None
     # Sequence all blocks from all pages (or the single image page)
     sequential_blocks = []
     for p_data in all_pages_data:
         sequential_blocks.extend(p_data.get('blocks', []))
+    print("\n" + "=" * 80)
+    print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
+    print("=" * 80)
     # Run LayoutLMv3 Inference on the gathered blocks
     final_structured_data = run_layoutlmv3_inference_on_blocks(
         sequential_blocks,
         layoutlmv3_model_path
     )
+    # Run Hierarchical classification (Subject/Concept tags)
     classifier = HierarchicalClassifier()
     if classifier.load_models():
         final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
+        print("✅ Classification complete. Tags added.")
+    else:
+        print("❌ Classifier not found. Returning untagged data.")
     return final_structured_data
 #================================================================================
         # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---