heerjtdev committed on
Commit
bf0bbaf
·
verified ·
1 Parent(s): 994b14b

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +147 -43
working_yolo_pipeline.py CHANGED
@@ -2759,58 +2759,162 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2759
 
2760
 
2761
 
2762
- # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
- # List[Dict[str, Any]]]:
2764
- def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2765
- if not os.path.exists(input_pdf_path): return None
2766
 
2767
- print("\n" + "#" * 80)
2768
- print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2769
- print("#" * 80)
2770
 
2771
- pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2772
- temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2773
- os.makedirs(temp_pipeline_dir, exist_ok=True)
2774
 
2775
- preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2776
- raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2777
- structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2778
 
2779
- final_result = None
2780
- try:
2781
- # Phase 1: Preprocessing with YOLO First + Masking
2782
- preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2783
- if not preprocessed_json_path_out: return None
2784
 
2785
- # Phase 2: Inference
2786
- page_raw_predictions_list = run_inference_and_get_raw_words(
2787
- input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2788
- )
2789
- if not page_raw_predictions_list: return None
2790
-
2791
- # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2792
- # Save raw predictions to the temporary file
2793
- with open(raw_output_path, 'w', encoding='utf-8') as f:
2794
- json.dump(page_raw_predictions_list, f, indent=4)
2795
-
2796
- # Explicitly copy/save the raw predictions to the user-specified debug path
2797
- # if raw_predictions_output_path:
2798
- # shutil.copy(raw_output_path, raw_predictions_output_path)
2799
- # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2800
- # ----------------------------------------
2801
-
2802
- # Phase 3: Decoding
2803
- structured_data_list = convert_bio_to_structured_json_relaxed(
2804
- raw_output_path, structured_intermediate_output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2805
  )
2806
- if not structured_data_list: return None
2807
- structured_data_list = correct_misaligned_options(structured_data_list)
2808
- structured_data_list = process_context_linking(structured_data_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2809
 
 
 
 
 
 
2810
 
2811
- # Phase 4: Embedding / Equation to LaTeX Conversion
2812
- final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
 
 
 
2813
 
 
 
 
 
 
 
2814
 
2815
 
2816
 
 
2759
 
2760
 
2761
 
2762
+ # # def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[
2763
+ # # List[Dict[str, Any]]]:
2764
+ # def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
2765
+ # if not os.path.exists(input_pdf_path): return None
2766
 
2767
+ # print("\n" + "#" * 80)
2768
+ # print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
2769
+ # print("#" * 80)
2770
 
2771
+ # pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
2772
+ # temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
2773
+ # os.makedirs(temp_pipeline_dir, exist_ok=True)
2774
 
2775
+ # preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
2776
+ # raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
2777
+ # structured_intermediate_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json")
2778
 
2779
+ # final_result = None
2780
+ # try:
2781
+ # # Phase 1: Preprocessing with YOLO First + Masking
2782
+ # preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
2783
+ # if not preprocessed_json_path_out: return None
2784
 
2785
+ # # Phase 2: Inference
2786
+ # page_raw_predictions_list = run_inference_and_get_raw_words(
2787
+ # input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
2788
+ # )
2789
+ # if not page_raw_predictions_list: return None
2790
+
2791
+ # # --- DEBUG STEP: SAVE RAW PREDICTIONS ---
2792
+ # # Save raw predictions to the temporary file
2793
+ # with open(raw_output_path, 'w', encoding='utf-8') as f:
2794
+ # json.dump(page_raw_predictions_list, f, indent=4)
2795
+
2796
+ # # Explicitly copy/save the raw predictions to the user-specified debug path
2797
+ # # if raw_predictions_output_path:
2798
+ # # shutil.copy(raw_output_path, raw_predictions_output_path)
2799
+ # # print(f"\n✅ DEBUG: Raw predictions saved to: {raw_predictions_output_path}")
2800
+ # # ----------------------------------------
2801
+
2802
+ # # Phase 3: Decoding
2803
+ # structured_data_list = convert_bio_to_structured_json_relaxed(
2804
+ # raw_output_path, structured_intermediate_output_path
2805
+ # )
2806
+ # if not structured_data_list: return None
2807
+ # structured_data_list = correct_misaligned_options(structured_data_list)
2808
+ # structured_data_list = process_context_linking(structured_data_list)
2809
+
2810
+
2811
+ # # Phase 4: Embedding / Equation to LaTeX Conversion
2812
+ # final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2813
+
2814
+
2815
+
2816
+
2817
+
2818
+
2819
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap a standard image file into a single-page PyMuPDF Document.

    This allows images to be processed by existing PDF-based functions
    (coordinate scaling, column detection, ...) without modification.

    Args:
        image_path: Path to an image file (jpg/png/bmp/tiff/webp, ...).

    Returns:
        (doc, page): the in-memory PDF Document and its single Page.
        The caller is responsible for closing ``doc``.
    """
    # BUG FIX: the original used PIL's ``img.tobytes("pdf")`` — "pdf" is a
    # *save format*, not a valid ``tobytes`` encoder name, so the call raised
    # at runtime. PyMuPDF can open image files directly and convert them to
    # a PDF byte stream itself, which also avoids leaking the PIL handle.
    img_doc = fitz.open(image_path)
    try:
        pdf_bytes = img_doc.convert_to_pdf()
    finally:
        img_doc.close()  # close the intermediate image document
    doc = fitz.open("pdf", pdf_bytes)
    return doc, doc[0]
2830
+
2831
def _render_and_preprocess_page(page, input_path: str, page_index: int, yolo_model):
    """Render one fitz page at 2.0x scale and run YOLO + OCR preprocessing.

    Shared by the image and PDF branches of ``run_document_pipeline`` (the
    original duplicated this sequence verbatim in both branches).

    Returns the page-data dict from ``preprocess_and_ocr_page`` (falsy when
    the page yielded nothing).
    """
    # 2.0x render matrix — consistent with the rest of the pipeline.
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
    img_np = pixmap_to_numpy(pix)
    page_data, _ = preprocess_and_ocr_page(
        img_np,
        yolo_model,
        input_path,
        page_index,
        page,
        os.path.basename(input_path),
    )
    return page_data


def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
    """
    Main pipeline entry point, handling both PDF and image files.

    Images are wrapped into a single-page PyMuPDF document so the existing
    PDF-based processing applies unchanged; because a wrapped image page has
    no native text layer, the Tesseract OCR fallback in
    ``preprocess_and_ocr_page`` triggers automatically.

    Args:
        input_path: Path to a PDF or image file.
        layoutlmv3_model_path: Path to the LayoutLMv3 model used for inference.

    Returns:
        The final structured data from LayoutLMv3 inference (optionally
        post-processed by the hierarchical classifier), or None when the
        input file cannot be opened.
    """
    # Initialize the YOLO model (kept from original script).
    yolo_model = YOLO(WEIGHTS_PATH)

    # 1. Detect file type by extension.
    ext = os.path.splitext(input_path)[1].lower()
    is_image = ext in {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}

    all_pages_data = []

    if is_image:
        print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
        # ROBUSTNESS FIX: the image branch previously had no error handling,
        # unlike the PDF branch, so a corrupt/unreadable image crashed the
        # pipeline instead of returning None.
        try:
            # 2. IMAGE BRANCH: wrap the image into a fitz page.
            doc, page = load_image_as_fitz_page(input_path)
        except Exception as e:
            print(f"❌ Error opening image {input_path}: {e}")
            return None
        try:
            # 3. Process the single page (index 0).
            page_data = _render_and_preprocess_page(page, input_path, 0, yolo_model)
            if page_data:
                all_pages_data.append(page_data)
        finally:
            # LEAK FIX: always close, even if preprocessing raises.
            doc.close()
    else:
        # 4. PDF BRANCH: standard multi-page processing.
        try:
            doc = fitz.open(input_path)
            try:
                print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
                for page_index in range(len(doc)):
                    page_data = _render_and_preprocess_page(
                        doc[page_index], input_path, page_index, yolo_model
                    )
                    if page_data:
                        all_pages_data.append(page_data)
            finally:
                # LEAK FIX: the original skipped doc.close() when an
                # exception fired mid-loop.
                doc.close()
        except Exception as e:
            # Preserved behavior: any failure in the PDF branch returns None.
            print(f"❌ Error opening PDF {input_path}: {e}")
            return None

    # 5. Continue exactly as before: sequence all blocks from all pages
    # (or the single image page).
    sequential_blocks = []
    for p_data in all_pages_data:
        sequential_blocks.extend(p_data.get('blocks', []))

    # Run LayoutLMv3 inference on the gathered blocks.
    final_structured_data = run_layoutlmv3_inference_on_blocks(
        sequential_blocks,
        layoutlmv3_model_path
    )

    # Run Subject/Concept classification when the classifier models load.
    classifier = HierarchicalClassifier()
    if classifier.load_models():
        final_structured_data = post_process_json_with_inference(final_structured_data, classifier)

    return final_structured_data
2918
 
2919
 
2920