heerjtdev committed on
Commit
b00791d
·
verified ·
1 Parent(s): 1494cae

Rename bababa.py to app.py

Browse files
Files changed (1) hide show
  1. bababa.py → app.py +157 -45
bababa.py → app.py RENAMED
@@ -20,6 +20,7 @@ import argparse
20
  import os
21
  import re
22
 
 
23
  import torch.nn as nn
24
  from TorchCRF import CRF
25
  # from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
@@ -40,12 +41,13 @@ import logging
40
 
41
 
42
  # ============================================================================
43
- # --- TR-OCR/ORT MODEL INITIALIZATION ---
44
  # ============================================================================
45
 
46
  logging.basicConfig(level=logging.WARNING)
47
 
48
-
 
49
 
50
 
51
  # ============================================================================
@@ -244,6 +246,7 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
244
  # 2. Filter out raw words that are inside YOLO boxes
245
  cleaned_word_data = []
246
  for word_tuple in raw_word_data:
 
247
  wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
248
  w_center_x = (wx1 + wx2) / 2
249
  w_center_y = (wy1 + wy2) / 2
@@ -266,15 +269,97 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
266
 
267
 
268
  # ============================================================================
269
- # --- MISSING HELPER FUNCTION ---
270
  # ============================================================================
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
 
275
- def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
 
 
 
 
 
276
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
277
 
 
278
  GLOBAL_FIGURE_COUNT = 0
279
  GLOBAL_EQUATION_COUNT = 0
280
  _ocr_cache.clear()
@@ -285,20 +370,29 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
285
 
286
  if not os.path.exists(pdf_path):
287
  print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
288
- return None
289
 
290
  os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
291
  os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
 
 
 
 
 
 
 
 
 
292
 
293
- model = YOLO(WEIGHTS_PATH)
294
  pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
295
 
296
  try:
297
  doc = fitz.open(pdf_path)
298
- print(f"βœ… Opened PDF: {pdf_name} ({doc.page_count} pages)")
 
299
  except Exception as e:
300
  print(f"❌ ERROR loading PDF file: {e}")
301
- return None
302
 
303
  all_pages_data = []
304
  total_pages_processed = 0
@@ -308,7 +402,7 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
308
 
309
  for page_num_0_based in range(doc.page_count):
310
  page_num = page_num_0_based + 1
311
- print(f" -> Processing Page {page_num}/{doc.page_count}...")
312
 
313
  fitz_page = doc.load_page(page_num_0_based)
314
 
@@ -348,26 +442,28 @@ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) ->
348
  print(f"\n βœ… Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
349
  except Exception as e:
350
  print(f"❌ ERROR saving combined JSON output: {e}")
351
- return None
352
  else:
353
  print("❌ WARNING: No page data generated. Halting pipeline.")
354
- return None
355
 
356
  print("\n" + "=" * 80)
357
  print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
358
  print("=" * 80)
359
 
360
- return preprocessed_json_path
361
-
362
-
363
-
364
 
365
 
 
 
 
366
 
367
  if __name__ == "__main__":
368
  parser = argparse.ArgumentParser(description="Complete Pipeline")
369
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
370
- parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
 
371
 
372
  # --- ADDED ARGUMENT FOR DEBUGGING ---
373
  parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
@@ -377,35 +473,51 @@ if __name__ == "__main__":
377
 
378
  pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
379
  final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
 
381
- # --- CALCULATE RAW PREDICTIONS OUTPUT PATH (Kept commented as per original script) ---
382
- # raw_predictions_output_path = os.path.abspath(
383
- # args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
384
- # ---------------------------------------------
385
-
386
- # --- UPDATED FUNCTION CALL ---
387
- final_json_data = run_document_pipeline(
388
- args.input_pdf,
389
- args.layoutlmv3_model_path)
390
- # -----------------------------
391
 
 
 
 
 
392
  # 🛑 CRITICAL FINAL FIX: AGGRESSIVE CUSTOM JSON SAVING 🛑
393
- if final_json_data:
394
- # 1. Dump the Python object to a standard JSON string.
395
- # This converts the in-memory double backslash ('\\') into a quadruple backslash ('\\\\')
396
- # in the raw json_str string content.
397
- json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
398
-
399
- # 2. **AGGRESSIVE UNDO ESCAPING:** We assume we have quadruple backslashes and
400
- # replace them with the double backslashes needed for the LaTeX command to work.
401
- # This operation essentially replaces four literal backslashes with two literal backslashes.
402
- # final_output_content = json_str.replace('\\\\\\\\', '\\\\')
403
-
404
- # 3. Write the corrected string content to the file.
405
- with open(final_output_path, 'w', encoding='utf-8') as f:
406
- f.write(json_str)
407
-
408
- print(f"\nβœ… Final Data Saved: {final_output_path}")
409
- else:
410
- print("\n❌ Pipeline Failed.")
411
- sys.exit(1)
 
20
  import os
21
  import re
22
 
23
+ # Import torch components if needed (kept from original script)
24
  import torch.nn as nn
25
  from TorchCRF import CRF
26
  # from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
 
41
 
42
 
43
  # ============================================================================
44
+ # --- TR-OCR/ORT MODEL INITIALIZATION (Placeholder) ---
45
  # ============================================================================
46
 
47
  logging.basicConfig(level=logging.WARNING)
48
 
49
+ # Placeholder constant for missing argument
50
+ DEFAULT_LAYOUTLMV3_MODEL_PATH = 'layoutlmv3_placeholder'
51
 
52
 
53
  # ============================================================================
 
246
  # 2. Filter out raw words that are inside YOLO boxes
247
  cleaned_word_data = []
248
  for word_tuple in raw_word_data:
249
+ # word_tuple is (text, x1, y1, x2, y2)
250
  wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
251
  w_center_x = (wx1 + wx2) / 2
252
  w_center_y = (wy1 + wy2) / 2
 
269
 
270
 
271
  # ============================================================================
272
+ # --- MISSING HELPER FUNCTIONS (Placeholders) ---
273
  # ============================================================================
274
 
275
def pixmap_to_numpy(pix: "fitz.Pixmap") -> np.ndarray:
    """Convert a PyMuPDF Pixmap into a 3-channel uint8 array for OpenCV/YOLO.

    The pixmap's raw sample buffer is interpreted as ``(pix.h, pix.w, pix.n)``
    bytes and then normalised to three channels:

    * ``n == 3``: returned as-is (a view over ``pix.samples``; NumPy marks it
      read-only when the buffer is an immutable ``bytes`` object).
    * ``n == 4``: treated as RGBA, the fourth channel is dropped.
    * ``n == 1``: grayscale, replicated into three identical channels.
    * ``n == 2``: grayscale + alpha, alpha is dropped and gray is replicated.
      Previously this case fell through and a 2-channel array was returned.
    * any other ``n``: returned unchanged, as before.

    Args:
        pix: Object exposing ``samples`` (bytes-like), ``h``, ``w`` and ``n``.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` with shape ``(h, w, 3)`` for the
        handled channel counts, otherwise ``(h, w, n)``.

    Raises:
        ValueError: If ``len(pix.samples)`` does not equal ``h * w * n``
            (raised by ``reshape``).
    """
    # The annotation is quoted so it is not evaluated when the function is
    # defined; only numpy is needed at runtime.
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
        (pix.h, pix.w, pix.n)
    )

    if pix.n == 4:
        # Same result as cv2.COLOR_RGBA2RGB: keep the first three channels.
        # NOTE(review): a 4-channel pixmap without alpha (CMYK) would also land
        # here, exactly as in the previous implementation - confirm callers
        # only render RGB(A) pixmaps.
        return np.ascontiguousarray(img[:, :, :3])

    if pix.n in (1, 2):
        # Same result as cv2.COLOR_GRAY2RGB: copy the gray channel three times.
        # For n == 2 the second (alpha) channel is discarded first.
        return np.repeat(img[:, :, :1], 3, axis=2)

    return img
288
+
289
def find_column_separator_x(raw_word_data: list, page_width: float) -> Optional[float]:
    """Return the x-coordinate that splits a two-column page, if any.

    This is a stub: the real column-detection logic is not implemented here,
    so both arguments are currently ignored and every page is reported as
    single-column.

    Args:
        raw_word_data: Word entries for the page (unused by the stub).
        page_width: Width of the page (unused by the stub).

    Returns:
        ``None``, meaning "single column". A full implementation would return
        the separator's x-coordinate for two-column pages.
    """
    # Stub behaviour: no separator is ever detected.
    return None
297
 
298
def preprocess_and_ocr_page(
    image: np.ndarray, model: YOLO, pdf_path: str, page_num: int,
    fitz_page: fitz.Page, pdf_name: str
) -> Tuple[Optional[list], Optional[float]]:
    """Mock page-level step: detection, counter update, word cleanup, column check.

    This is a stand-in for the real pipeline stage. It does not run ``model``
    on ``image``; instead it pushes three hard-coded detections (two equations,
    one figure) through the merge/filter helpers and uses a single hard-coded
    word. ``image``, ``model``, ``pdf_path`` and ``pdf_name`` are accepted but
    not used.

    Side effects:
        Increments the module globals ``GLOBAL_FIGURE_COUNT`` and
        ``GLOBAL_EQUATION_COUNT`` once per surviving detection of that class,
        and prints a one-line per-page summary.

    Args:
        image: Rendered page image (unused by the mock).
        model: YOLO model (unused by the mock).
        pdf_path: Path of the source PDF (unused by the mock).
        page_num: Page number recorded in the emitted blocks and the log line.
        fitz_page: Page object; only ``fitz_page.rect.width`` is read.
        pdf_name: Base name of the PDF (unused by the mock).

    Returns:
        ``(page_blocks, separator_x)``: a list of mock block dicts and the
        value returned by ``find_column_separator_x``.
    """
    global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT

    # Matches the 2.0x render matrix mentioned by the original author.
    render_scale = 2.0

    # Hard-coded detections standing in for a real model(image) call.
    placeholder_boxes = [
        {'coords': (100, 100, 400, 200), 'class': 'equation', 'conf': 0.95},
        {'coords': (100, 300, 400, 400), 'class': 'figure', 'conf': 0.90},
        {'coords': (100, 500, 400, 600), 'class': 'equation', 'conf': 0.85},
    ]

    # Merge overlapping boxes first, then suppress nested ones.
    kept_boxes = filter_nested_boxes(
        merge_overlapping_boxes(placeholder_boxes, IOU_MERGE_THRESHOLD),
        IOA_SUPPRESSION_THRESHOLD,
    )

    # Advance the document-wide counters and keep per-page tallies for logging.
    page_equations = 0
    page_figures = 0
    for box in kept_boxes:
        label = box['class']
        if label == 'figure':
            GLOBAL_FIGURE_COUNT += 1
            page_figures += 1
            # Figure image/caption saving would go here.
        elif label == 'equation':
            GLOBAL_EQUATION_COUNT += 1
            page_equations += 1
            # Equation OCR/LaTeX extraction would go here.

    # Single mock word in place of real text extraction from fitz_page.
    placeholder_words = [("Word", 50.0, 50.0, 80.0, 60.0)]
    merged_words = merge_yolo_into_word_data(placeholder_words, kept_boxes, render_scale)

    separator_x = find_column_separator_x(merged_words, fitz_page.rect.width)

    # Mock output: one text block plus one block per class, tagged with the
    # current value of the corresponding global counter.
    page_blocks = [
        {"type": "text", "text": "Mock Text Block 1"},
        {"type": "yolo_block", "class": "figure", "page_num": page_num, "global_id": GLOBAL_FIGURE_COUNT},
        {"type": "yolo_block", "class": "equation", "page_num": page_num, "global_id": GLOBAL_EQUATION_COUNT},
    ]

    print(f" -> Page {page_num}: Equations={page_equations}, Figures={page_figures}")

    return page_blocks, separator_x
352
 
353
 
354
+ # ============================================================================
355
+ # --- MAIN DOCUMENT PROCESSING FUNCTION ---
356
+ # ============================================================================
357
+
358
+ # MODIFIED: Returns a Tuple containing the JSON path and the three counts.
359
+ def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Tuple[Optional[str], int, int, int]:
360
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
361
 
362
+ # Reset globals for a new document
363
  GLOBAL_FIGURE_COUNT = 0
364
  GLOBAL_EQUATION_COUNT = 0
365
  _ocr_cache.clear()
 
370
 
371
  if not os.path.exists(pdf_path):
372
  print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
373
+ return None, 0, 0, 0
374
 
375
  os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
376
  os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
377
+
378
+ # NOTE: This will fail if best.pt is not present
379
+ try:
380
+ model = YOLO(WEIGHTS_PATH)
381
+ except Exception as e:
382
+ print(f"❌ ERROR loading YOLO model: {e}")
383
+ # Return 0 for counts if model fails to load
384
+ return None, 0, 0, 0
385
+
386
 
 
387
  pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
388
 
389
  try:
390
  doc = fitz.open(pdf_path)
391
+ total_pages = doc.page_count # Capture the total page count
392
+ print(f"βœ… Opened PDF: {pdf_name} ({total_pages} pages)")
393
  except Exception as e:
394
  print(f"❌ ERROR loading PDF file: {e}")
395
+ return None, 0, 0, 0
396
 
397
  all_pages_data = []
398
  total_pages_processed = 0
 
402
 
403
  for page_num_0_based in range(doc.page_count):
404
  page_num = page_num_0_based + 1
405
+ # print(f" -> Processing Page {page_num}/{doc.page_count}...") # Moved print inside the helper for better logging
406
 
407
  fitz_page = doc.load_page(page_num_0_based)
408
 
 
442
  print(f"\n βœ… Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
443
  except Exception as e:
444
  print(f"❌ ERROR saving combined JSON output: {e}")
445
+ return None, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
446
  else:
447
  print("❌ WARNING: No page data generated. Halting pipeline.")
448
+ return None, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
449
 
450
  print("\n" + "=" * 80)
451
  print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
452
  print("=" * 80)
453
 
454
+ # UPDATED RETURN VALUE FOR REQUIRED STATS
455
+ return preprocessed_json_path, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
 
 
456
 
457
 
458
+ # ============================================================================
459
+ # --- MAIN EXECUTION BLOCK (Modified for requested output) ---
460
+ # ============================================================================
461
 
462
  if __name__ == "__main__":
463
  parser = argparse.ArgumentParser(description="Complete Pipeline")
464
  parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
465
+ # Using the placeholder constant
466
+ parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
467
 
468
  # --- ADDED ARGUMENT FOR DEBUGGING ---
469
  parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
 
473
 
474
  pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
475
  final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
476
+
477
+ # Define the output path for the preprocessing step
478
+ os.makedirs(OCR_JSON_OUTPUT_DIR, exist_ok=True)
479
+ preprocessed_json_path = os.path.join(OCR_JSON_OUTPUT_DIR, f"{pdf_name}_preprocessed.json")
480
+
481
+ # --- CORE EXECUTION ---
482
+ print("\nStarting PDF Analysis and Extraction...")
483
+
484
+ # Run the core logic and capture the three required statistics
485
+ json_path_out, num_pages, num_equations, num_figures = run_single_pdf_preprocessing(
486
+ args.input_pdf,
487
+ preprocessed_json_path
488
+ )
489
+
490
+ # --- PRINTING THE REQUIRED STATISTICS ---
491
+ print("\n" + "#" * 50)
492
+ print("## πŸ“Š EXTRACTION SUMMARY")
493
+ print("#" * 50)
494
+
495
+ if json_path_out:
496
+ print(f"**1) Total Pages Detected:** {num_pages}")
497
+ print("**2) Elements Extracted:**")
498
+ print(f" - Equations: {num_equations}")
499
+ print(f" - Figures: {num_figures}")
500
+ else:
501
+ # Note: num_pages might be > 0 even if processing failed (if the PDF opened)
502
+ print(f"**Extraction Failed.** Pages in PDF: {num_pages}. See logs above for errors.")
503
+ sys.exit(1)
504
 
505
+ print("#" * 50 + "\n")
506
+ # --------------------------------------------------------------------------------
 
 
 
 
 
 
 
 
507
 
508
+ # The original script had more logic here (run_document_pipeline, etc.).
509
+ # Since only the pre-processing function and the statistics output were requested,
510
+ # the rest of the original final file saving logic is commented out/removed.
511
+ # To retain the original final file saving placeholder:
512
  # 🛑 CRITICAL FINAL FIX: AGGRESSIVE CUSTOM JSON SAVING 🛑
513
+ # if final_json_data: # final_json_data is not produced by run_single_pdf_preprocessing
514
+ # ...
515
+ # else:
516
+ # print("\n❌ Pipeline Failed.")
517
+ # sys.exit(1)
518
+
519
+ print(f"The preprocessed JSON data is saved to: {preprocessed_json_path}")
520
+ print("Pipeline step complete.")
521
+ sys.exit(0)
522
+
523
+ # End of script