heerjtdev committed on
Commit
bf0bbaf
·
verified ·
1 Parent(s): 994b14b

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +147 -43
working_yolo_pipeline.py CHANGED
@@ -2759,58 +2759,162 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2759
 
2760
 
2761
 
2762
- # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
- # List[Dict[str, Any]]]:
2764
- def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2765
- if not os.path.exists(input_pdf_path): return None
2766
 
2767
- print("\n" + "#" * 80)
2768
- print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2769
- print("#" * 80)
2770
 
2771
- pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2772
- temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2773
- os.makedirs(temp_pipeline_dir, exist_ok=True)
2774
 
2775
- preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2776
- raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2777
- structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2778
 
2779
- final_result = None
2780
- try:
2781
- # Phase 1: Preprocessing with YOLO First + Masking
2782
- preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2783
- if not preprocessed_json_path_out: return None
2784
 
2785
- # Phase 2: Inference
2786
- page_raw_predictions_list = run_inference_and_get_raw_words(
2787
- input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2788
- )
2789
- if not page_raw_predictions_list: return None
2790
-
2791
- # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2792
- # Save raw predictions to the temporary file
2793
- with open(raw_output_path, 'w', encoding='utf-8') as f:
2794
- json.dump(page_raw_predictions_list, f, indent=4)
2795
-
2796
- # Explicitly copy/save the raw predictions to the user-specified debug path
2797
- # if raw_predictions_output_path:
2798
- # shutil.copy(raw_output_path, raw_predictions_output_path)
2799
- # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2800
- # ----------------------------------------
2801
-
2802
- # Phase 3: Decoding
2803
- structured_data_list = convert_bio_to_structured_json_relaxed(
2804
- raw_output_path, structured_intermediate_output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2805
  )
2806
- if not structured_data_list: return None
2807
- structured_data_list = correct_misaligned_options(structured_data_list)
2808
- structured_data_list = process_context_linking(structured_data_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2809
 
 
 
 
 
 
2810
 
2811
- # Phase 4: Embedding / Equation to LaTeX Conversion
2812
- final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
 
 
 
2813
 
 
 
 
 
 
 
2814
 
2815
 
2816
 
 
2759
 
2760
 
2761
 
2762
+ # # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
+ # # List[Dict[str, Any]]]:
2764
+ # def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2765
+ # if not os.path.exists(input_pdf_path): return None
2766
 
2767
+ # print("\n" + "#" * 80)
2768
+ # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2769
+ # print("#" * 80)
2770
 
2771
+ # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2772
+ # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2773
+ # os.makedirs(temp_pipeline_dir, exist_ok=True)
2774
 
2775
+ # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2776
+ # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2777
+ # structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2778
 
2779
+ # final_result = None
2780
+ # try:
2781
+ # # Phase 1: Preprocessing with YOLO First + Masking
2782
+ # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2783
+ # if not preprocessed_json_path_out: return None
2784
 
2785
+ # # Phase 2: Inference
2786
+ # page_raw_predictions_list = run_inference_and_get_raw_words(
2787
+ # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2788
+ # )
2789
+ # if not page_raw_predictions_list: return None
2790
+
2791
+ # # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2792
+ # # Save raw predictions to the temporary file
2793
+ # with open(raw_output_path, 'w', encoding='utf-8') as f:
2794
+ # json.dump(page_raw_predictions_list, f, indent=4)
2795
+
2796
+ # # Explicitly copy/save the raw predictions to the user-specified debug path
2797
+ # # if raw_predictions_output_path:
2798
+ # # shutil.copy(raw_output_path, raw_predictions_output_path)
2799
+ # # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2800
+ # # ----------------------------------------
2801
+
2802
+ # # Phase 3: Decoding
2803
+ # structured_data_list = convert_bio_to_structured_json_relaxed(
2804
+ # raw_output_path, structured_intermediate_output_path
2805
+ # )
2806
+ # if not structured_data_list: return None
2807
+ # structured_data_list = correct_misaligned_options(structured_data_list)
2808
+ # structured_data_list = process_context_linking(structured_data_list)
2809
+
2810
+
2811
+ # # Phase 4: Embedding / Equation to LaTeX Conversion
2812
+ # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2813
+
2814
+
2815
+
2816
+
2817
+
2818
+
2819
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap a standard image file into a single-page PyMuPDF Document.

    This allows images to be processed by existing PDF-based functions
    (coordinate scaling, column detection, ...) without modification.

    Args:
        image_path: Path to an image file (jpg/png/bmp/tiff/webp, ...).

    Returns:
        (doc, page): the in-memory PDF Document and its single Page.
        The caller is responsible for closing ``doc``.
    """
    # BUG FIX: the original used PIL's ``img.tobytes("pdf")`` — "pdf" is a
    # *save format*, not a valid ``tobytes`` encoder name, so the call raised
    # at runtime. PyMuPDF can open image files directly and convert them to
    # a PDF byte stream itself, which also avoids leaking the PIL handle.
    img_doc = fitz.open(image_path)
    try:
        pdf_bytes = img_doc.convert_to_pdf()
    finally:
        img_doc.close()  # close the intermediate image document
    doc = fitz.open("pdf", pdf_bytes)
    return doc, doc[0]
2830
+
2831
def _render_and_preprocess_page(page, input_path: str, page_index: int, yolo_model):
    """Render one fitz page at 2.0x scale and run YOLO + OCR preprocessing.

    Shared by the image and PDF branches of ``run_document_pipeline`` (the
    original duplicated this sequence verbatim in both branches).

    Returns the page-data dict from ``preprocess_and_ocr_page`` (falsy when
    the page yielded nothing).
    """
    # 2.0x render matrix — consistent with the rest of the pipeline.
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
    img_np = pixmap_to_numpy(pix)
    page_data, _ = preprocess_and_ocr_page(
        img_np,
        yolo_model,
        input_path,
        page_index,
        page,
        os.path.basename(input_path),
    )
    return page_data


def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
    """
    Main pipeline entry point, handling both PDF and image files.

    Images are wrapped into a single-page PyMuPDF document so the existing
    PDF-based processing applies unchanged; because a wrapped image page has
    no native text layer, the Tesseract OCR fallback in
    ``preprocess_and_ocr_page`` triggers automatically.

    Args:
        input_path: Path to a PDF or image file.
        layoutlmv3_model_path: Path to the LayoutLMv3 model used for inference.

    Returns:
        The final structured data from LayoutLMv3 inference (optionally
        post-processed by the hierarchical classifier), or None when the
        input file cannot be opened.
    """
    # Initialize the YOLO model (kept from original script).
    yolo_model = YOLO(WEIGHTS_PATH)

    # 1. Detect file type by extension.
    ext = os.path.splitext(input_path)[1].lower()
    is_image = ext in {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}

    all_pages_data = []

    if is_image:
        print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
        # ROBUSTNESS FIX: the image branch previously had no error handling,
        # unlike the PDF branch, so a corrupt/unreadable image crashed the
        # pipeline instead of returning None.
        try:
            # 2. IMAGE BRANCH: wrap the image into a fitz page.
            doc, page = load_image_as_fitz_page(input_path)
        except Exception as e:
            print(f"❌ Error opening image {input_path}: {e}")
            return None
        try:
            # 3. Process the single page (index 0).
            page_data = _render_and_preprocess_page(page, input_path, 0, yolo_model)
            if page_data:
                all_pages_data.append(page_data)
        finally:
            # LEAK FIX: always close, even if preprocessing raises.
            doc.close()
    else:
        # 4. PDF BRANCH: standard multi-page processing.
        try:
            doc = fitz.open(input_path)
            try:
                print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
                for page_index in range(len(doc)):
                    page_data = _render_and_preprocess_page(
                        doc[page_index], input_path, page_index, yolo_model
                    )
                    if page_data:
                        all_pages_data.append(page_data)
            finally:
                # LEAK FIX: the original skipped doc.close() when an
                # exception fired mid-loop.
                doc.close()
        except Exception as e:
            # Preserved behavior: any failure in the PDF branch returns None.
            print(f"❌ Error opening PDF {input_path}: {e}")
            return None

    # 5. Continue exactly as before: sequence all blocks from all pages
    # (or the single image page).
    sequential_blocks = []
    for p_data in all_pages_data:
        sequential_blocks.extend(p_data.get('blocks', []))

    # Run LayoutLMv3 inference on the gathered blocks.
    final_structured_data = run_layoutlmv3_inference_on_blocks(
        sequential_blocks,
        layoutlmv3_model_path
    )

    # Run Subject/Concept classification when the classifier models load.
    classifier = HierarchicalClassifier()
    if classifier.load_models():
        final_structured_data = post_process_json_with_inference(final_structured_data, classifier)

    return final_structured_data
2918
 
2919
 
2920