heerjtdev committed on
Commit
414d12d
·
verified ·
1 Parent(s): bf0bbaf

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +23 -23
working_yolo_pipeline.py CHANGED
@@ -2814,13 +2814,11 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2814
 
2815
 
2816
 
2817
-
2818
-
2819
  def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2820
  """
2821
  Wraps a standard image file into a single-page PyMuPDF Document.
2822
- This allows images to be processed by existing PDF-based functions
2823
- like coordinate scaling and column detection without modification.
2824
  """
2825
  img = Image.open(image_path)
2826
  # Convert image to a PDF stream in memory
@@ -2830,53 +2828,47 @@ def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2830
 
2831
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2832
  """
2833
- Main pipeline entry point modified to handle both PDF and Image files.
2834
  """
2835
- # Initialize YOLO and LayoutLMv3 models (kept from original script)
2836
  yolo_model = YOLO(WEIGHTS_PATH)
2837
 
2838
- # 1. DETECT FILE TYPE
2839
  ext = os.path.splitext(input_path)[1].lower()
2840
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2841
 
2842
  all_pages_data = []
2843
 
 
2844
  if is_image:
2845
  print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
2846
-
2847
- # 2. IMAGE BRANCH: Wrap image into a fitz page
2848
  doc, page = load_image_as_fitz_page(input_path)
2849
 
2850
- # Render the image for YOLO analysis (consistent with your PDF logic)
 
2851
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2852
  img_np = pixmap_to_numpy(pix)
2853
 
2854
- # 3. PROCESS THE PAGE
2855
- # Because the 'page' has no native text, the Tesseract OCR fallback
2856
- # in preprocess_and_ocr_page will trigger automatically.
2857
  page_data, _ = preprocess_and_ocr_page(
2858
  img_np,
2859
  yolo_model,
2860
  input_path,
2861
- 0, # Page Index 0
2862
  page,
2863
  os.path.basename(input_path)
2864
  )
2865
-
2866
  if page_data:
2867
  all_pages_data.append(page_data)
2868
  doc.close()
2869
 
2870
  else:
2871
- # 4. PDF BRANCH: Standard processing (your original logic)
2872
  try:
2873
  doc = fitz.open(input_path)
2874
  print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
2875
 
2876
  for page_index in range(len(doc)):
2877
  page = doc[page_index]
2878
-
2879
- # Render page at 2.0x scale (consistent with your original script)
2880
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2881
  img_np = pixmap_to_numpy(pix)
2882
 
@@ -2888,35 +2880,43 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2888
  page,
2889
  os.path.basename(input_path)
2890
  )
2891
-
2892
  if page_data:
2893
  all_pages_data.append(page_data)
2894
-
2895
  doc.close()
2896
  except Exception as e:
2897
  print(f"❌ Error opening PDF {input_path}: {e}")
2898
  return None
2899
 
2900
- # 5. CONTINUE EXACTLY AS BEFORE: Sequential processing & Inference
 
 
 
 
2901
  # Sequence all blocks from all pages (or the single image page)
2902
  sequential_blocks = []
2903
  for p_data in all_pages_data:
2904
  sequential_blocks.extend(p_data.get('blocks', []))
2905
 
 
 
 
 
2906
  # Run LayoutLMv3 Inference on the gathered blocks
2907
  final_structured_data = run_layoutlmv3_inference_on_blocks(
2908
  sequential_blocks,
2909
  layoutlmv3_model_path
2910
  )
2911
 
2912
- # Run Subject/Concept classification (as implemented in your original script)
2913
  classifier = HierarchicalClassifier()
2914
  if classifier.load_models():
2915
  final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
 
 
 
2916
 
2917
  return final_structured_data
2918
 
2919
-
2920
 
2921
  #================================================================================
2922
  # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
 
2814
 
2815
 
2816
 
 
 
2817
  def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2818
  """
2819
  Wraps a standard image file into a single-page PyMuPDF Document.
2820
+ This ensures it can be processed by your existing fitz-based functions
2821
+ (coordinate scaling, column detection, etc.) exactly as before.
2822
  """
2823
  img = Image.open(image_path)
2824
  # Convert image to a PDF stream in memory
 
2828
 
2829
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2830
  """
2831
+ Main pipeline modified to handle both PDF and Image files.
2832
  """
2833
+ # 1. INITIALIZE MODELS (Preserving original logic)
2834
  yolo_model = YOLO(WEIGHTS_PATH)
2835
 
2836
+ # 2. DETECT FILE TYPE
2837
  ext = os.path.splitext(input_path)[1].lower()
2838
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2839
 
2840
  all_pages_data = []
2841
 
2842
+ # 3. BRANCH LOGIC: IMAGE VS PDF
2843
  if is_image:
2844
  print(f"📸 Image detected: {input_path}. Initializing Single-Page Pipeline.")
 
 
2845
  doc, page = load_image_as_fitz_page(input_path)
2846
 
2847
+ # Process as Page 0. Because there is no native text, your existing
2848
+ # Tesseract fallback will naturally trigger to read the content.
2849
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2850
  img_np = pixmap_to_numpy(pix)
2851
 
 
 
 
2852
  page_data, _ = preprocess_and_ocr_page(
2853
  img_np,
2854
  yolo_model,
2855
  input_path,
2856
+ 0, # Page 0
2857
  page,
2858
  os.path.basename(input_path)
2859
  )
 
2860
  if page_data:
2861
  all_pages_data.append(page_data)
2862
  doc.close()
2863
 
2864
  else:
2865
+ # Standard PDF Processing Loop
2866
  try:
2867
  doc = fitz.open(input_path)
2868
  print(f"📄 Processing PDF with {len(doc)} pages: {input_path}")
2869
 
2870
  for page_index in range(len(doc)):
2871
  page = doc[page_index]
 
 
2872
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2873
  img_np = pixmap_to_numpy(pix)
2874
 
 
2880
  page,
2881
  os.path.basename(input_path)
2882
  )
 
2883
  if page_data:
2884
  all_pages_data.append(page_data)
 
2885
  doc.close()
2886
  except Exception as e:
2887
  print(f"❌ Error opening PDF {input_path}: {e}")
2888
  return None
2889
 
2890
+ # 4. CONTINUE EXACTLY AS BEFORE: Gathering and Inference
2891
+ if not all_pages_data:
2892
+ print("❌ No data extracted from document.")
2893
+ return None
2894
+
2895
  # Sequence all blocks from all pages (or the single image page)
2896
  sequential_blocks = []
2897
  for p_data in all_pages_data:
2898
  sequential_blocks.extend(p_data.get('blocks', []))
2899
 
2900
+ print("\n" + "=" * 80)
2901
+ print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2902
+ print("=" * 80)
2903
+
2904
  # Run LayoutLMv3 Inference on the gathered blocks
2905
  final_structured_data = run_layoutlmv3_inference_on_blocks(
2906
  sequential_blocks,
2907
  layoutlmv3_model_path
2908
  )
2909
 
2910
+ # Run Hierarchical classification (Subject/Concept tags)
2911
  classifier = HierarchicalClassifier()
2912
  if classifier.load_models():
2913
  final_structured_data = post_process_json_with_inference(final_structured_data, classifier)
2914
+ print("✅ Classification complete. Tags added.")
2915
+ else:
2916
+ print("❌ Classifier not found. Returning untagged data.")
2917
 
2918
  return final_structured_data
2919
 
 
2920
 
2921
  #================================================================================
2922
  # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---