heerjtdev committed on
Commit
8c28adb
·
verified ·
1 Parent(s): 92d1c66

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +26 -17
working_yolo_pipeline.py CHANGED
@@ -2058,19 +2058,27 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2058
  # ============================================================================
2059
 
2060
 
2061
-
2062
  def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2063
  """
2064
- Wraps an image into a temporary PyMuPDF document/page.
2065
- This allows your existing column detection and coordinate mapping
2066
- to work on images exactly as they do on PDFs.
2067
  """
2068
- img = Image.open(image_path)
2069
- # Convert image to PDF format in memory
2070
- pdf_bytes = fitz.open("pdf", img.tobytes("pdf")).tobytes()
2071
- doc = fitz.open("pdf", pdf_bytes)
 
 
 
 
 
 
2072
  return doc, doc[0]
2073
 
 
 
 
 
2074
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2075
  """
2076
  Modified pipeline that handles both PDFs and Images, running YOLO,
@@ -2079,8 +2087,9 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2079
  # 1. INITIALIZE YOLO
2080
  yolo_model = YOLO(WEIGHTS_PATH)
2081
 
2082
- # 2. DETECT FILE TYPEext = os.path.splitext(input_path)[1].lower()
2083
- ext = os.path.splitext(input_path)[1].lower()
 
2084
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2085
 
2086
  all_pages_data = []
@@ -2089,14 +2098,13 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2089
  try:
2090
  if is_image:
2091
  print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
 
2092
  doc, page = load_image_as_fitz_page(input_path)
2093
 
2094
- # Render for YOLO (using same scale as your PDF logic)
2095
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2096
  img_np = pixmap_to_numpy(pix)
2097
 
2098
- # Since an image has no native text layer, preprocess_and_ocr_page
2099
- # will automatically use Tesseract OCR fallback as intended.
2100
  page_data, _ = preprocess_and_ocr_page(
2101
  img_np, yolo_model, input_path, 0, page, pdf_name
2102
  )
@@ -2128,23 +2136,21 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2128
  for p_data in all_pages_data:
2129
  sequential_blocks.extend(p_data.get('blocks', []))
2130
 
2131
- # --- 4. STARTING LAYOUTLMV3 INFERENCE (Exactly as before) ---
2132
  print("\n" + "=" * 80)
2133
  print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2134
  print("=" * 80)
2135
 
2136
- # (Inlining your existing LayoutLMv3 inference logic)
2137
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
2138
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2139
 
2140
- # This assumes LayoutLMv3ForTokenClassification is defined elsewhere in your script
2141
  model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2142
  checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
2143
  model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
2144
  model.to(device)
2145
  model.eval()
2146
 
2147
- # Run inference on sequential_blocks...
2148
  final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
2149
 
2150
  # 5. POST-PROCESS CLASSIFICATION
@@ -2156,6 +2162,9 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2156
  return final_result
2157
 
2158
  except Exception as e:
 
 
 
2159
  print(f"❌ FATAL ERROR in pipeline: {e}")
2160
  return None
2161
 
 
2058
  # ============================================================================
2059
 
2060
 
 
2061
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap a raster image in an in-memory, single-page PyMuPDF document.

    This lets downstream PDF-oriented code (column detection, coordinate
    mapping, pixmap rendering) treat a plain image exactly like a PDF page.
    The image is converted to a one-page PDF with Pillow's PDF writer into
    a bytes buffer, which sidesteps the 'encoder pdf not available' error
    some PyMuPDF builds raise for direct image-to-PDF conversion.

    Args:
        image_path: Path to the image file (JPEG, PNG, BMP, TIFF, ...).

    Returns:
        Tuple of (document, first page). The caller owns the document and
        should close it when finished; the page object is only valid while
        the document remains open.

    Raises:
        FileNotFoundError: If image_path does not exist.
        PIL.UnidentifiedImageError: If the file is not a readable image.
    """
    pdf_stream = io.BytesIO()
    # Use a context manager so the underlying image file handle is closed
    # promptly instead of leaking until garbage collection. Convert to RGB
    # so Pillow's PDF encoder accepts palette/alpha (P/RGBA) images.
    with Image.open(image_path) as img:
        img.convert("RGB").save(pdf_stream, format="PDF")
    # getvalue() returns the full buffer contents regardless of the current
    # stream position, so no seek(0) is required before handing the bytes
    # to PyMuPDF.
    doc = fitz.open("pdf", pdf_stream.getvalue())
    return doc, doc[0]
2077
 
2078
+
2079
+
2080
+
2081
+
2082
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2083
  """
2084
  Modified pipeline that handles both PDFs and Images, running YOLO,
 
2087
  # 1. INITIALIZE YOLO
2088
  yolo_model = YOLO(WEIGHTS_PATH)
2089
 
2090
+ # 2. DETECT FILE TYPE
2091
+ # FIX: [1] added to get the extension string from the (root, ext) tuple
2092
+ ext = os.path.splitext(input_path)[1].lower()
2093
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2094
 
2095
  all_pages_data = []
 
2098
  try:
2099
  if is_image:
2100
  print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
2101
+ # Use the corrected helper function defined above
2102
  doc, page = load_image_as_fitz_page(input_path)
2103
 
2104
+ # Render for YOLO
2105
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2106
  img_np = pixmap_to_numpy(pix)
2107
 
 
 
2108
  page_data, _ = preprocess_and_ocr_page(
2109
  img_np, yolo_model, input_path, 0, page, pdf_name
2110
  )
 
2136
  for p_data in all_pages_data:
2137
  sequential_blocks.extend(p_data.get('blocks', []))
2138
 
2139
+ # --- 4. STARTING LAYOUTLMV3 INFERENCE ---
2140
  print("\n" + "=" * 80)
2141
  print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2142
  print("=" * 80)
2143
 
 
2144
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
2145
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2146
 
2147
+ # Note: Ensure LayoutLMv3ForTokenClassification is defined in your script
2148
  model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2149
  checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
2150
  model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
2151
  model.to(device)
2152
  model.eval()
2153
 
 
2154
  final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
2155
 
2156
  # 5. POST-PROCESS CLASSIFICATION
 
2162
  return final_result
2163
 
2164
  except Exception as e:
2165
+ # Improved error logging to catch exactly where it fails
2166
+ import traceback
2167
+ traceback.print_exc()
2168
  print(f"❌ FATAL ERROR in pipeline: {e}")
2169
  return None
2170