heerjtdev committed on
Commit
8c28adb
·
verified ·
1 Parent(s): 92d1c66

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +26 -17
working_yolo_pipeline.py CHANGED
@@ -2058,19 +2058,27 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
2058
  # ============================================================================
2059
 
2060
 
2061
-
2062
  def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
2063
  """
2064
- Wraps an image into a temporary PyMuPDF document/page.
2065
- This allows your existing column detection and coordinate mapping
2066
- to work on images exactly as they do on PDFs.
2067
  """
2068
- img = Image.open(image_path)
2069
- # Convert image to PDF format in memory
2070
- pdf_bytes = fitz.open("pdf", img.tobytes("pdf")).tobytes()
2071
- doc = fitz.open("pdf", pdf_bytes)
 
 
 
 
 
 
2072
  return doc, doc[0]
2073
 
 
 
 
 
2074
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2075
  """
2076
  Modified pipeline that handles both PDFs and Images, running YOLO,
@@ -2079,8 +2087,9 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2079
  # 1. INITIALIZE YOLO
2080
  yolo_model = YOLO(WEIGHTS_PATH)
2081
 
2082
- # 2. DETECT FILE TYPEext = os.path.splitext(input_path)[1].lower()
2083
- ext = os.path.splitext(input_path)[1].lower()
 
2084
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2085
 
2086
  all_pages_data = []
@@ -2089,14 +2098,13 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2089
  try:
2090
  if is_image:
2091
  print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
 
2092
  doc, page = load_image_as_fitz_page(input_path)
2093
 
2094
- # Render for YOLO (using same scale as your PDF logic)
2095
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2096
  img_np = pixmap_to_numpy(pix)
2097
 
2098
- # Since an image has no native text layer, preprocess_and_ocr_page
2099
- # will automatically use Tesseract OCR fallback as intended.
2100
  page_data, _ = preprocess_and_ocr_page(
2101
  img_np, yolo_model, input_path, 0, page, pdf_name
2102
  )
@@ -2128,23 +2136,21 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2128
  for p_data in all_pages_data:
2129
  sequential_blocks.extend(p_data.get('blocks', []))
2130
 
2131
- # --- 4. STARTING LAYOUTLMV3 INFERENCE (Exactly as before) ---
2132
  print("\n" + "=" * 80)
2133
  print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2134
  print("=" * 80)
2135
 
2136
- # (Inlining your existing LayoutLMv3 inference logic)
2137
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
2138
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2139
 
2140
- # This assumes LayoutLMv3ForTokenClassification is defined elsewhere in your script
2141
  model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2142
  checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
2143
  model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
2144
  model.to(device)
2145
  model.eval()
2146
 
2147
- # Run inference on sequential_blocks...
2148
  final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
2149
 
2150
  # 5. POST-PROCESS CLASSIFICATION
@@ -2156,6 +2162,9 @@ def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2156
  return final_result
2157
 
2158
  except Exception as e:
 
 
 
2159
  print(f"❌ FATAL ERROR in pipeline: {e}")
2160
  return None
2161
 
 
2058
  # ============================================================================
2059
 
2060
 
 
2061
def load_image_as_fitz_page(image_path: str) -> Tuple[fitz.Document, fitz.Page]:
    """
    Wrap a raster image in an in-memory, single-page PyMuPDF document.

    This lets downstream PDF-oriented code (column detection, coordinate
    mapping, pixmap rendering) treat a plain image exactly like a PDF page.
    The image is converted to a one-page PDF with Pillow's PDF writer into
    a bytes buffer, which sidesteps the 'encoder pdf not available' error
    some PyMuPDF builds raise for direct image-to-PDF conversion.

    Args:
        image_path: Path to the image file (JPEG, PNG, BMP, TIFF, ...).

    Returns:
        Tuple of (document, first page). The caller owns the document and
        should close it when finished; the page object is only valid while
        the document remains open.

    Raises:
        FileNotFoundError: If image_path does not exist.
        PIL.UnidentifiedImageError: If the file is not a readable image.
    """
    pdf_stream = io.BytesIO()
    # Use a context manager so the underlying image file handle is closed
    # promptly instead of leaking until garbage collection. Convert to RGB
    # so Pillow's PDF encoder accepts palette/alpha (P/RGBA) images.
    with Image.open(image_path) as img:
        img.convert("RGB").save(pdf_stream, format="PDF")
    # getvalue() returns the full buffer contents regardless of the current
    # stream position, so no seek(0) is required before handing the bytes
    # to PyMuPDF.
    doc = fitz.open("pdf", pdf_stream.getvalue())
    return doc, doc[0]
2077
 
2078
+
2079
+
2080
+
2081
+
2082
  def run_document_pipeline(input_path: str, layoutlmv3_model_path: str):
2083
  """
2084
  Modified pipeline that handles both PDFs and Images, running YOLO,
 
2087
  # 1. INITIALIZE YOLO
2088
  yolo_model = YOLO(WEIGHTS_PATH)
2089
 
2090
+ # 2. DETECT FILE TYPE
2091
+ # FIX: [1] added to get the extension string from the (root, ext) tuple
2092
+ ext = os.path.splitext(input_path)[1].lower()
2093
  is_image = ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
2094
 
2095
  all_pages_data = []
 
2098
  try:
2099
  if is_image:
2100
  print(f"📸 Image detected: {input_path}. Processing with YOLO + Tesseract.")
2101
+ # Use the corrected helper function defined above
2102
  doc, page = load_image_as_fitz_page(input_path)
2103
 
2104
+ # Render for YOLO
2105
  pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
2106
  img_np = pixmap_to_numpy(pix)
2107
 
 
 
2108
  page_data, _ = preprocess_and_ocr_page(
2109
  img_np, yolo_model, input_path, 0, page, pdf_name
2110
  )
 
2136
  for p_data in all_pages_data:
2137
  sequential_blocks.extend(p_data.get('blocks', []))
2138
 
2139
+ # --- 4. STARTING LAYOUTLMV3 INFERENCE ---
2140
  print("\n" + "=" * 80)
2141
  print("--- 2. STARTING LAYOUTLMV3 INFERENCE PIPELINE ---")
2142
  print("=" * 80)
2143
 
 
2144
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
2145
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
2146
 
2147
+ # Note: Ensure LayoutLMv3ForTokenClassification is defined in your script
2148
  model = LayoutLMv3ForTokenClassification(num_labels=NUM_LABELS)
2149
  checkpoint = torch.load(layoutlmv3_model_path, map_location=device)
2150
  model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
2151
  model.to(device)
2152
  model.eval()
2153
 
 
2154
  final_result = run_layoutlmv3_inference_on_blocks(sequential_blocks, model, tokenizer, device)
2155
 
2156
  # 5. POST-PROCESS CLASSIFICATION
 
2162
  return final_result
2163
 
2164
  except Exception as e:
2165
+ # Improved error logging to catch exactly where it fails
2166
+ import traceback
2167
+ traceback.print_exc()
2168
  print(f"❌ FATAL ERROR in pipeline: {e}")
2169
  return None
2170