heerjtdev commited on
Commit
314c4a7
·
verified ·
1 Parent(s): 9a971c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -242
app.py CHANGED
@@ -3,65 +3,41 @@ import numpy as np
3
  import cv2
4
  import torch
5
  import torch.serialization
6
-
7
- _original_torch_load = torch.load
8
-
9
-
10
- def patched_torch_load(*args, **kwargs):
11
- # FORCE classic behavior
12
- kwargs["weights_only"] = False
13
- return _original_torch_load(*args, **kwargs)
14
-
15
-
16
- torch.load = patched_torch_load
17
-
18
  import json
19
- import argparse
20
  import os
21
  import re
22
-
23
- # Import torch components if needed (kept from original script)
24
- import torch.nn as nn
25
- from TorchCRF import CRF
26
- # from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model, LayoutLMv3Config
27
-
28
  from typing import List, Dict, Any, Optional, Union, Tuple
29
  from ultralytics import YOLO
30
- import glob
31
- from PIL import Image
32
-
33
- import sys
34
- import io
35
- import base64
36
  import tempfile
37
  import time
38
- import shutil
39
-
40
- import logging
41
-
42
 
43
  # ============================================================================
44
- # --- TR-OCR/ORT MODEL INITIALIZATION (Placeholder) ---
45
  # ============================================================================
46
 
47
- logging.basicConfig(level=logging.WARNING)
48
-
49
-
 
 
 
50
 
 
51
 
52
  # ============================================================================
53
  # --- CONFIGURATION AND CONSTANTS ---
54
  # ============================================================================
55
 
56
-
57
  # NOTE: Update these paths to match your environment before running!
58
- WEIGHTS_PATH = 'best.pt'
 
59
 
60
-
61
- # DIRECTORY CONFIGURATION
62
- OCR_JSON_OUTPUT_DIR = './ocr_json_output_final'
63
- FIGURE_EXTRACTION_DIR = './figure_extraction'
64
- TEMP_IMAGE_DIR = './temp_pdf_images'
65
 
66
  # Detection parameters
67
  CONF_THRESHOLD = 0.2
@@ -70,46 +46,34 @@ IOU_MERGE_THRESHOLD = 0.4
70
  IOA_SUPPRESSION_THRESHOLD = 0.7
71
  LINE_TOLERANCE = 15
72
 
73
-
74
  # Global counters for sequential numbering across the entire PDF
75
  GLOBAL_FIGURE_COUNT = 0
76
  GLOBAL_EQUATION_COUNT = 0
77
 
78
-
79
-
80
-
81
  # ============================================================================
82
  # --- PERFORMANCE OPTIMIZATION: OCR CACHE ---
 
83
  # ============================================================================
84
 
85
  class OCRCache:
86
  """Caches OCR results per page to avoid redundant Tesseract runs."""
87
-
88
  def __init__(self):
89
  self.cache = {}
90
-
91
  def get_key(self, pdf_path: str, page_num: int) -> str:
92
  return f"{pdf_path}:{page_num}"
93
-
94
  def has_ocr(self, pdf_path: str, page_num: int) -> bool:
95
  return self.get_key(pdf_path, page_num) in self.cache
96
-
97
  def get_ocr(self, pdf_path: str, page_num: int) -> Optional[list]:
98
  return self.cache.get(self.get_key(pdf_path, page_num))
99
-
100
  def set_ocr(self, pdf_path: str, page_num: int, ocr_data: list):
101
  self.cache[self.get_key(pdf_path, page_num)] = ocr_data
102
-
103
  def clear(self):
104
  self.cache.clear()
105
 
106
-
107
- # Global OCR cache instance
108
  _ocr_cache = OCRCache()
109
 
110
-
111
  # ============================================================================
112
- # --- PHASE 1: YOLO/OCR PREPROCESSING FUNCTIONS ---
113
  # ============================================================================
114
 
115
  def calculate_iou(box1, box2):
@@ -139,59 +103,29 @@ def calculate_ioa(box1, box2):
139
 
140
 
141
  def filter_nested_boxes(detections, ioa_threshold=0.80):
142
- """
143
- Removes boxes that are inside larger boxes (Containment Check).
144
- Prioritizes keeping the LARGEST box (the 'parent' container).
145
- """
146
  if not detections:
147
  return []
148
-
149
- # 1. Calculate Area for all detections
150
  for d in detections:
151
  x1, y1, x2, y2 = d['coords']
152
  d['area'] = (x2 - x1) * (y2 - y1)
153
-
154
- # 2. Sort by Area Descending (Largest to Smallest)
155
- # This ensures we process the 'container' first
156
  detections.sort(key=lambda x: x['area'], reverse=True)
157
-
158
  keep_indices = []
159
  is_suppressed = [False] * len(detections)
160
-
161
  for i in range(len(detections)):
162
  if is_suppressed[i]: continue
163
-
164
  keep_indices.append(i)
165
  box_a = detections[i]['coords']
166
-
167
- # Compare with all smaller boxes
168
  for j in range(i + 1, len(detections)):
169
  if is_suppressed[j]: continue
170
-
171
  box_b = detections[j]['coords']
172
-
173
- # Calculate Intersection
174
  x_left = max(box_a[0], box_b[0])
175
  y_top = max(box_a[1], box_b[1])
176
  x_right = min(box_a[2], box_b[2])
177
  y_bottom = min(box_a[3], box_b[3])
178
-
179
- if x_right < x_left or y_bottom < y_top:
180
- intersection = 0
181
- else:
182
- intersection = (x_right - x_left) * (y_bottom - y_top)
183
-
184
- # Calculate IoA (Intersection over Area of the SMALLER box)
185
  area_b = detections[j]['area']
186
-
187
- if area_b > 0:
188
- ioa_small = intersection / area_b
189
-
190
- # If the small box is > 90% inside the big box, suppress the small one.
191
- if ioa_small > ioa_threshold:
192
- is_suppressed[j] = True
193
- # print(f" [Suppress] Removed nested object inside larger '{detections[i]['class']}'")
194
-
195
  return [detections[i] for i in keep_indices]
196
 
197
 
@@ -223,47 +157,28 @@ def merge_overlapping_boxes(detections, iou_threshold):
223
 
224
 
225
  def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_factor: float) -> list:
226
- """
227
- Filters out raw words that are inside YOLO boxes and replaces them with
228
- a single solid 'placeholder' block for the column detector.
229
- """
230
  if not yolo_detections:
231
  return raw_word_data
232
-
233
- # 1. Convert YOLO boxes (Pixels) to PDF Coordinates (Points)
234
  pdf_space_boxes = []
235
  for det in yolo_detections:
236
  x1, y1, x2, y2 = det['coords']
237
- pdf_box = (
238
- x1 / scale_factor,
239
- y1 / scale_factor,
240
- x2 / scale_factor,
241
- y2 / scale_factor
242
- )
243
  pdf_space_boxes.append(pdf_box)
244
-
245
- # 2. Filter out raw words that are inside YOLO boxes
246
  cleaned_word_data = []
247
  for word_tuple in raw_word_data:
248
- # word_tuple is (text, x1, y1, x2, y2)
249
  wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
250
  w_center_x = (wx1 + wx2) / 2
251
  w_center_y = (wy1 + wy2) / 2
252
-
253
  is_inside_yolo = False
254
  for px1, py1, px2, py2 in pdf_space_boxes:
255
  if px1 <= w_center_x <= px2 and py1 <= w_center_y <= py2:
256
  is_inside_yolo = True
257
  break
258
-
259
  if not is_inside_yolo:
260
  cleaned_word_data.append(word_tuple)
261
-
262
- # 3. Add the YOLO boxes themselves as "Solid Words"
263
  for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
264
  dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
265
  cleaned_word_data.append(dummy_entry)
266
-
267
  return cleaned_word_data
268
 
269
 
@@ -272,25 +187,16 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
272
  # ============================================================================
273
 
274
  def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
275
- """Converts a PyMuPDF Pixmap to a NumPy array for OpenCV/YOLO."""
276
- # This is a critical function for the pipeline. Implementing a basic version.
277
  img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
278
  (pix.h, pix.w, pix.n)
279
  )
280
  if pix.n == 4:
281
- # Convert RGBA to RGB for most YOLO models
282
  img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
283
  elif pix.n == 1:
284
- # Grayscale to RGB
285
  img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
286
  return img
287
 
288
  def find_column_separator_x(raw_word_data: list, page_width: float) -> Optional[float]:
289
- """
290
- Placeholder for logic that detects if a page is two-column and finds the separator line.
291
- This logic is complex and usually involves histogram analysis of word x-coordinates.
292
- Returns None for single column, or the x-coordinate of the separator.
293
- """
294
  # Placeholder: Always assume single column unless you have the full logic.
295
  return None
296
 
@@ -298,24 +204,29 @@ def preprocess_and_ocr_page(
298
  image: np.ndarray, model: YOLO, pdf_path: str, page_num: int,
299
  fitz_page: fitz.Page, pdf_name: str
300
  ) -> Tuple[Optional[list], Optional[float]]:
301
- """
302
- Placeholder for the page-level processing: YOLO detection, OCR, and merging.
303
- This function is responsible for INCREMENTING the global counters.
304
- """
305
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
306
 
307
- # 1. Mock YOLO Detection (You would run model(image) here)
308
- # Mocking a result with 2 equations and 1 figure for testing the counters.
309
- scale_factor = 2.0 # from the mat=fitz.Matrix(2.0, 2.0) call
310
 
311
- # Mock Detection for Counters:
312
  mock_detections = [
313
  {'coords': (100, 100, 400, 200), 'class': 'equation', 'conf': 0.95},
314
  {'coords': (100, 300, 400, 400), 'class': 'figure', 'conf': 0.90},
315
  {'coords': (100, 500, 400, 600), 'class': 'equation', 'conf': 0.85},
316
  ]
317
 
318
- # 2. Apply NMS/Merging/Filtering (using the provided functions)
 
 
 
 
 
 
 
 
 
 
 
319
  merged_detections = merge_overlapping_boxes(mock_detections, IOU_MERGE_THRESHOLD)
320
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
321
 
@@ -323,200 +234,194 @@ def preprocess_and_ocr_page(
323
  for det in final_detections:
324
  if det['class'] == 'figure':
325
  GLOBAL_FIGURE_COUNT += 1
326
- # Logic for saving figure image/caption would go here
327
  elif det['class'] == 'equation':
328
  GLOBAL_EQUATION_COUNT += 1
329
- # Logic for OCR/LaTeX extraction would go here
330
 
331
- # 4. Mock Raw Word Data and Cleaning
332
- # (In a real script, this would come from fitz_page.get_text("words"))
333
  mock_raw_words = [("Word", 50.0, 50.0, 80.0, 60.0)]
334
  cleaned_word_data = merge_yolo_into_word_data(mock_raw_words, final_detections, scale_factor)
335
 
336
- # 5. Determine Column Separator
337
  page_width = fitz_page.rect.width
338
  page_separator_x = find_column_separator_x(cleaned_word_data, page_width)
339
 
340
- # 6. Mock Final Output Structure
341
  final_output = [
342
  {"type": "text", "text": "Mock Text Block 1"},
343
  {"type": "yolo_block", "class": "figure", "page_num": page_num, "global_id": GLOBAL_FIGURE_COUNT},
344
  {"type": "yolo_block", "class": "equation", "page_num": page_num, "global_id": GLOBAL_EQUATION_COUNT},
345
- # ... more mock data
346
  ]
347
 
348
- print(f" -> Page {page_num}: Equations={len([d for d in final_detections if d['class'] == 'equation'])}, Figures={len([d for d in final_detections if d['class'] == 'figure'])}")
349
-
350
  return final_output, page_separator_x
351
 
352
-
353
  # ============================================================================
354
- # --- MAIN DOCUMENT PROCESSING FUNCTION ---
355
  # ============================================================================
356
 
357
- # MODIFIED: Returns a Tuple containing the JSON path and the three counts.
358
- def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Tuple[Optional[str], int, int, int]:
 
 
359
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
360
 
361
- # Reset globals for a new document
362
  GLOBAL_FIGURE_COUNT = 0
363
  GLOBAL_EQUATION_COUNT = 0
364
  _ocr_cache.clear()
365
 
366
- print("\n" + "=" * 80)
367
- print("--- 1. STARTING OPTIMIZED YOLO/OCR PREPROCESSING PIPELINE ---")
368
- print("=" * 80)
369
-
370
  if not os.path.exists(pdf_path):
371
- print(f"❌ FATAL ERROR: Input PDF not found at {pdf_path}.")
372
- return None, 0, 0, 0
373
 
374
- os.makedirs(os.path.dirname(preprocessed_json_path), exist_ok=True)
375
- os.makedirs(FIGURE_EXTRACTION_DIR, exist_ok=True)
 
 
 
 
 
376
 
377
- # NOTE: This will fail if best.pt is not present
378
  try:
379
  model = YOLO(WEIGHTS_PATH)
380
  except Exception as e:
381
- print(f"❌ ERROR loading YOLO model: {e}")
382
- # Return 0 for counts if model fails to load
383
- return None, 0, 0, 0
384
-
385
-
386
- pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
387
 
388
  try:
389
  doc = fitz.open(pdf_path)
390
- total_pages = doc.page_count # Capture the total page count
391
- print(f"✅ Opened PDF: {pdf_name} ({total_pages} pages)")
392
  except Exception as e:
393
- print(f"❌ ERROR loading PDF file: {e}")
394
- return None, 0, 0, 0
395
 
396
  all_pages_data = []
397
  total_pages_processed = 0
398
  mat = fitz.Matrix(2.0, 2.0)
399
-
400
- print("\n[STEP 1.2: ITERATING PAGES - IN-MEMORY PROCESSING]")
401
-
402
  for page_num_0_based in range(doc.page_count):
403
- page_num = page_num_0_based + 1
404
- # print(f" -> Processing Page {page_num}/{doc.page_count}...") # Moved print inside the helper for better logging
405
-
406
  fitz_page = doc.load_page(page_num_0_based)
407
 
408
  try:
409
  pix = fitz_page.get_pixmap(matrix=mat)
410
  original_img = pixmap_to_numpy(pix)
411
  except Exception as e:
412
- print(f"Error converting page {page_num} to image: {e}")
413
  continue
414
 
415
  final_output, page_separator_x = preprocess_and_ocr_page(
416
- original_img,
417
- model,
418
- pdf_path,
419
- page_num,
420
- fitz_page,
421
- pdf_name
422
  )
423
 
424
  if final_output is not None:
425
  page_data = {
426
- "page_number": page_num,
427
  "data": final_output,
428
  "column_separator_x": page_separator_x
429
  }
430
  all_pages_data.append(page_data)
431
  total_pages_processed += 1
432
- else:
433
- print(f" ❌ Skipped page {page_num} due to processing error.")
434
-
435
  doc.close()
436
 
437
  if all_pages_data:
438
  try:
439
  with open(preprocessed_json_path, 'w') as f:
440
  json.dump(all_pages_data, f, indent=4)
441
- print(f"\n ✅ Combined structured OCR JSON saved to: {os.path.basename(preprocessed_json_path)}")
 
 
 
 
 
 
 
 
 
 
442
  except Exception as e:
443
- print(f"❌ ERROR saving combined JSON output: {e}")
444
- return None, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
445
  else:
446
- print("❌ WARNING: No page data generated. Halting pipeline.")
447
- return None, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
448
-
449
- print("\n" + "=" * 80)
450
- print(f"--- YOLO/OCR PREPROCESSING COMPLETE ({total_pages_processed} pages processed) ---")
451
- print("=" * 80)
452
 
453
- # UPDATED RETURN VALUE FOR REQUIRED STATS
454
- return preprocessed_json_path, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT
455
 
456
 
457
  # ============================================================================
458
- # --- MAIN EXECUTION BLOCK (Modified for requested output) ---
459
  # ============================================================================
460
 
461
- if __name__ == "__main__":
462
- parser = argparse.ArgumentParser(description="Complete Pipeline")
463
- parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
464
- # Using the placeholder constant
 
 
 
 
 
 
 
465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
- # --- ADDED ARGUMENT FOR DEBUGGING ---
468
- parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
469
- help="Debug path for raw BIO tag predictions (JSON).")
470
- # ------------------------------------
471
- args = parser.parse_args()
472
 
473
- pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
474
- final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
475
-
476
- # Define the output path for the preprocessing step
477
- os.makedirs(OCR_JSON_OUTPUT_DIR, exist_ok=True)
478
- preprocessed_json_path = os.path.join(OCR_JSON_OUTPUT_DIR, f"{pdf_name}_preprocessed.json")
479
 
480
- # --- CORE EXECUTION ---
481
- print("\nStarting PDF Analysis and Extraction...")
 
 
482
 
483
- # Run the core logic and capture the three required statistics
484
- json_path_out, num_pages, num_equations, num_figures = run_single_pdf_preprocessing(
485
- args.input_pdf,
486
- preprocessed_json_path
487
- )
488
 
489
- # --- PRINTING THE REQUIRED STATISTICS ---
490
- print("\n" + "#" * 50)
491
- print("## 📊 EXTRACTION SUMMARY")
492
- print("#" * 50)
493
 
494
- if json_path_out:
495
- print(f"**1) Total Pages Detected:** {num_pages}")
496
- print("**2) Elements Extracted:**")
497
- print(f" - Equations: {num_equations}")
498
- print(f" - Figures: {num_figures}")
499
- else:
500
- # Note: num_pages might be > 0 even if processing failed (if the PDF opened)
501
- print(f"**Extraction Failed.** Pages in PDF: {num_pages}. See logs above for errors.")
502
- sys.exit(1)
503
-
504
- print("#" * 50 + "\n")
505
- # --------------------------------------------------------------------------------
506
-
507
- # The original script had more logic here (run_document_pipeline, etc.).
508
- # Since only the pre-processing function and the statistics output were requested,
509
- # the rest of the original final file saving logic is commented out/removed.
510
- # To retain the original final file saving placeholder:
511
- # 🛑 CRITICAL FINAL FIX: AGGRESSIVE CUSTOM JSON SAVING 🛑
512
- # if final_json_data: # final_json_data is not produced by run_single_pdf_preprocessing
513
- # ...
514
- # else:
515
- # print("\n❌ Pipeline Failed.")
516
- # sys.exit(1)
517
-
518
- print(f"The preprocessed JSON data is saved to: {preprocessed_json_path}")
519
- print("Pipeline step complete.")
520
- sys.exit(0)
521
-
522
- # End of script
 
3
  import cv2
4
  import torch
5
  import torch.serialization
 
 
 
 
 
 
 
 
 
 
 
 
6
  import json
 
7
  import os
8
  import re
 
 
 
 
 
 
9
  from typing import List, Dict, Any, Optional, Union, Tuple
10
  from ultralytics import YOLO
11
+ import logging
12
+ import gradio as gr
13
+ import shutil
 
 
 
14
  import tempfile
15
  import time
 
 
 
 
16
 
17
  # ============================================================================
18
+ # --- Global Patches (Kept from original script) ---
19
  # ============================================================================
20
 
21
import functools

# SECURITY NOTE: the wrapper below forces ``weights_only=False``, which makes
# ``torch.load`` run the full pickle machinery. Unpickling can execute arbitrary
# code, so only checkpoints from a trusted source may be loaded by this app.
_original_torch_load = torch.load


@functools.wraps(_original_torch_load)
def patched_torch_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only`` forced to ``False``.

    Any ``weights_only`` value supplied by the caller is overwritten; all other
    positional and keyword arguments are forwarded unchanged, and the return
    value of the original function is passed straight back.
    """
    # FORCE classic behavior
    kwargs["weights_only"] = False
    return _original_torch_load(*args, **kwargs)


torch.load = patched_torch_load
27
 
28
+ logging.basicConfig(level=logging.WARNING)
29
 
30
  # ============================================================================
31
  # --- CONFIGURATION AND CONSTANTS ---
32
  # ============================================================================
33
 
 
34
  # NOTE: Update these paths to match your environment before running!
35
+ # Gradio runs in the current working directory, so relative paths are fine.
36
+ WEIGHTS_PATH = 'best.pt'
37
 
38
+ # DIRECTORY CONFIGURATION - Now managed by tempfile or local folders
39
+ # NOTE: For Gradio, we'll use a temporary directory for output files
40
+ # to prevent cluttering the execution environment.
 
 
41
 
42
  # Detection parameters
43
  CONF_THRESHOLD = 0.2
 
46
  IOA_SUPPRESSION_THRESHOLD = 0.7
47
  LINE_TOLERANCE = 15
48
 
 
49
  # Global counters for sequential numbering across the entire PDF
50
  GLOBAL_FIGURE_COUNT = 0
51
  GLOBAL_EQUATION_COUNT = 0
52
 
 
 
 
53
  # ============================================================================
54
  # --- PERFORMANCE OPTIMIZATION: OCR CACHE ---
55
+ # Using the original OCRCache class definition
56
  # ============================================================================
57
 
class OCRCache:
    """In-memory store of OCR results, one entry per (PDF path, page number).

    Entries live in the ``cache`` dict under the string key
    ``"<pdf_path>:<page_num>"`` produced by :meth:`get_key`.
    """

    def __init__(self):
        self.cache = {}

    def get_key(self, pdf_path: str, page_num: int) -> str:
        """Build the dictionary key used for a given PDF page."""
        return f"{pdf_path}:{page_num}"

    def has_ocr(self, pdf_path: str, page_num: int) -> bool:
        """Tell whether a result has been stored for this page."""
        entry_key = self.get_key(pdf_path, page_num)
        return entry_key in self.cache

    def get_ocr(self, pdf_path: str, page_num: int) -> Optional[list]:
        """Return the stored result for this page, or ``None`` when absent."""
        entry_key = self.get_key(pdf_path, page_num)
        return self.cache.get(entry_key)

    def set_ocr(self, pdf_path: str, page_num: int, ocr_data: list):
        """Store (or overwrite) the result for this page."""
        entry_key = self.get_key(pdf_path, page_num)
        self.cache[entry_key] = ocr_data

    def clear(self):
        """Drop every stored entry."""
        self.cache.clear()


# Module-wide cache instance shared by the pipeline functions.
_ocr_cache = OCRCache()
74
 
 
75
  # ============================================================================
76
+ # --- PHASE 1: YOLO/OCR PREPROCESSING FUNCTIONS (Kept from original script) ---
77
  # ============================================================================
78
 
79
  def calculate_iou(box1, box2):
 
103
 
104
 
def filter_nested_boxes(detections, ioa_threshold=0.80):
    """Drop detections that lie mostly inside a larger detection.

    Boxes are visited from largest to smallest area. A smaller box is
    suppressed when the share of its own area covered by an already-kept
    larger box exceeds ``ioa_threshold``.

    Args:
        detections: list of dicts, each with a ``'coords'`` entry holding
            ``(x1, y1, x2, y2)``. Every dict gains an ``'area'`` entry as a
            side effect; the order of the list itself is left untouched.
        ioa_threshold: containment ratio above which the smaller box is removed.

    Returns:
        A new list with the surviving detection dicts, largest area first.
        An empty list is returned for empty input.
    """
    if not detections:
        return []

    for det in detections:
        x1, y1, x2, y2 = det['coords']
        det['area'] = (x2 - x1) * (y2 - y1)

    # sorted() rather than list.sort(): callers keep their own ordering.
    ordered = sorted(detections, key=lambda det: det['area'], reverse=True)

    survivors = []
    suppressed = [False] * len(ordered)

    for i, outer in enumerate(ordered):
        if suppressed[i]:
            continue
        survivors.append(outer)
        outer_box = outer['coords']

        for j in range(i + 1, len(ordered)):
            if suppressed[j]:
                continue
            inner = ordered[j]
            inner_area = inner['area']
            if inner_area <= 0:
                # Degenerate boxes cannot yield a meaningful ratio; keep them.
                continue
            inner_box = inner['coords']
            overlap_w = max(0, min(outer_box[2], inner_box[2]) - max(outer_box[0], inner_box[0]))
            overlap_h = max(0, min(outer_box[3], inner_box[3]) - max(outer_box[1], inner_box[1]))
            if overlap_w * overlap_h / inner_area > ioa_threshold:
                suppressed[j] = True

    return survivors
130
 
131
 
 
157
 
158
 
def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_factor: float) -> list:
    """Swap words covered by detection boxes for one placeholder per box.

    Each detection's ``'coords'`` are divided by ``scale_factor``. A word tuple
    ``(text, x1, y1, x2, y2)`` is discarded when its centre point falls inside
    (edges included) any scaled box. One ``("BLOCK_<i>", x1, y1, x2, y2)``
    tuple per scaled box is then appended after the remaining words.

    When ``yolo_detections`` is empty, ``raw_word_data`` itself is returned.
    """
    if not yolo_detections:
        return raw_word_data

    regions = []
    for detection in yolo_detections:
        left, top, right, bottom = detection['coords']
        regions.append(
            (left / scale_factor, top / scale_factor, right / scale_factor, bottom / scale_factor)
        )

    def _is_covered(word) -> bool:
        # A word counts as covered when its centre lies within some region.
        centre_x = (word[1] + word[3]) / 2
        centre_y = (word[2] + word[4]) / 2
        return any(
            rx1 <= centre_x <= rx2 and ry1 <= centre_y <= ry2
            for rx1, ry1, rx2, ry2 in regions
        )

    merged = [word for word in raw_word_data if not _is_covered(word)]
    for idx, (rx1, ry1, rx2, ry2) in enumerate(regions):
        merged.append((f"BLOCK_{idx}", rx1, ry1, rx2, ry2))
    return merged
183
 
184
 
 
187
  # ============================================================================
188
 
def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
    """Turn a pixmap's sample buffer into an ``(h, w, n)`` uint8 array.

    Four-channel input is converted with ``cv2.COLOR_RGBA2RGB`` and
    single-channel input with ``cv2.COLOR_GRAY2RGB``. Any other channel count
    is returned as reshaped, without conversion.
    """
    # NOTE(review): in the pass-through case the array is built directly on
    # ``pix.samples`` via np.frombuffer; presumably read-only, so confirm before writing to it.
    pixels = np.frombuffer(pix.samples, dtype=np.uint8)
    pixels = pixels.reshape((pix.h, pix.w, pix.n))

    if pix.n == 4:
        return cv2.cvtColor(pixels, cv2.COLOR_RGBA2RGB)
    if pix.n == 1:
        return cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)
    return pixels
198
 
def find_column_separator_x(raw_word_data: list, page_width: float) -> Optional[float]:
    """Return the x-coordinate of a column separator, or ``None``.

    Stub: no column detection is implemented yet. Both arguments are ignored
    and ``None`` (single column) is always returned.
    """
    # Placeholder: Always assume single column unless you have the full logic.
    return None
202
 
 
204
  image: np.ndarray, model: YOLO, pdf_path: str, page_num: int,
205
  fitz_page: fitz.Page, pdf_name: str
206
  ) -> Tuple[Optional[list], Optional[float]]:
 
 
 
 
207
  global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
208
 
209
+ scale_factor = 2.0
 
 
210
 
211
+ # Mock Detection for Counters (Same as previous response):
212
  mock_detections = [
213
  {'coords': (100, 100, 400, 200), 'class': 'equation', 'conf': 0.95},
214
  {'coords': (100, 300, 400, 400), 'class': 'figure', 'conf': 0.90},
215
  {'coords': (100, 500, 400, 600), 'class': 'equation', 'conf': 0.85},
216
  ]
217
 
218
+ # --- Actual Logic Starts Here ---
219
+
220
+ # Run YOLO detection on the image (Actual implementation needed here)
221
+ # results = model(image, conf=CONF_THRESHOLD)
222
+ # mock_detections = []
223
+ # if results and results[0].boxes:
224
+ # for box in results[0].boxes.data.tolist():
225
+ # x1, y1, x2, y2, conf, cls_id = box
226
+ # cls_name = model.names[int(cls_id)]
227
+ # if cls_name in TARGET_CLASSES:
228
+ # mock_detections.append({'coords': (x1, y1, x2, y2), 'class': cls_name, 'conf': conf})
229
+
230
  merged_detections = merge_overlapping_boxes(mock_detections, IOU_MERGE_THRESHOLD)
231
  final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
232
 
 
234
  for det in final_detections:
235
  if det['class'] == 'figure':
236
  GLOBAL_FIGURE_COUNT += 1
 
237
  elif det['class'] == 'equation':
238
  GLOBAL_EQUATION_COUNT += 1
 
239
 
240
+ # Mock Raw Word Data and Cleaning (Actual implementation needs fitz_page.get_text("words"))
 
241
  mock_raw_words = [("Word", 50.0, 50.0, 80.0, 60.0)]
242
  cleaned_word_data = merge_yolo_into_word_data(mock_raw_words, final_detections, scale_factor)
243
 
 
244
  page_width = fitz_page.rect.width
245
  page_separator_x = find_column_separator_x(cleaned_word_data, page_width)
246
 
247
+ # Mock Final Output Structure
248
  final_output = [
249
  {"type": "text", "text": "Mock Text Block 1"},
250
  {"type": "yolo_block", "class": "figure", "page_num": page_num, "global_id": GLOBAL_FIGURE_COUNT},
251
  {"type": "yolo_block", "class": "equation", "page_num": page_num, "global_id": GLOBAL_EQUATION_COUNT},
 
252
  ]
253
 
 
 
254
  return final_output, page_separator_x
255
 
 
256
  # ============================================================================
257
+ # --- MAIN DOCUMENT PROCESSING FUNCTION (Modified for Gradio) ---
258
  # ============================================================================
259
 
260
def run_single_pdf_preprocessing(pdf_path: str, output_dir: str) -> Tuple[Optional[str], int, int, int, str]:
    """
    Run the preprocessing pipeline on one PDF and write the combined JSON.

    The module-level figure/equation counters and the OCR cache are reset,
    the YOLO model is loaded from ``WEIGHTS_PATH``, every page is rendered with
    a 2x matrix and passed to ``preprocess_and_ocr_page``, and the per-page
    results are dumped to ``<output_dir>/<pdf stem>_preprocessed.json``.

    Args:
        pdf_path: path of the PDF to process.
        output_dir: directory that receives the JSON file and a
            ``figure_extraction`` sub-directory (created if missing).

    Returns:
        ``(json_path, total_pages, equation_count, figure_count, report)``.
        ``json_path`` is ``None`` whenever no JSON file was written. The three
        counts are all 0 when the run stops before the PDF could be opened.
        ``report`` is a human-readable Markdown summary or error message.
    """
    global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT

    GLOBAL_FIGURE_COUNT = 0
    GLOBAL_EQUATION_COUNT = 0
    _ocr_cache.clear()

    if not os.path.exists(pdf_path):
        report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
        return None, 0, 0, 0, report

    # Define output paths inside the provided temporary directory
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    preprocessed_json_path = os.path.join(output_dir, f"{pdf_name}_preprocessed.json")

    # Placeholder for FIGURE_EXTRACTION_DIR; creating it also creates output_dir.
    figure_output_dir = os.path.join(output_dir, 'figure_extraction')
    os.makedirs(figure_output_dir, exist_ok=True)

    try:
        model = YOLO(WEIGHTS_PATH)
    except Exception as e:
        report = f"❌ ERROR loading YOLO model from {WEIGHTS_PATH}: {e}\n(Please ensure 'best.pt' is in the current directory and Ultralytics is installed.)"
        return None, 0, 0, 0, report

    try:
        doc = fitz.open(pdf_path)
        total_pages = doc.page_count
    except Exception as e:
        report = f"❌ ERROR loading PDF file: {e}"
        return None, 0, 0, 0, report

    all_pages_data = []
    mat = fitz.Matrix(2.0, 2.0)

    # try/finally: the document handle is released even if a page blows up.
    try:
        for page_index in range(doc.page_count):
            page_num = page_index + 1
            fitz_page = doc.load_page(page_index)

            try:
                pix = fitz_page.get_pixmap(matrix=mat)
                original_img = pixmap_to_numpy(pix)
            except Exception as e:
                # A page that cannot be rasterised is skipped, not fatal.
                logging.error("Error converting page %s to image: %s", page_num, e)
                continue

            final_output, page_separator_x = preprocess_and_ocr_page(
                original_img, model, pdf_path, page_num, fitz_page, pdf_name
            )

            if final_output is not None:
                all_pages_data.append({
                    "page_number": page_num,
                    "data": final_output,
                    "column_separator_x": page_separator_x,
                })
    finally:
        doc.close()

    total_pages_processed = len(all_pages_data)

    if all_pages_data:
        try:
            with open(preprocessed_json_path, 'w') as f:
                json.dump(all_pages_data, f, indent=4)
            json_path_out = preprocessed_json_path

            report = (
                f"✅ **Processing Complete!**\n"
                f"--- {total_pages_processed} pages processed ---\n"
                f"**1) Total Pages Detected:** {total_pages}\n"
                f"**2) Elements Extracted:**\n"
                f" - Equations: {GLOBAL_EQUATION_COUNT}\n"
                f" - Figures: {GLOBAL_FIGURE_COUNT}\n"
                f"\nDetailed JSON output saved to: `{os.path.basename(json_path_out)}`"
            )
        except Exception as e:
            json_path_out = None
            report = f"❌ ERROR saving combined JSON output: {e}"
    else:
        json_path_out = None
        report = f"❌ WARNING: No page data generated. Halting pipeline. Total pages in PDF: {total_pages}"

    return json_path_out, total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report
 
347
 
348
 
349
  # ============================================================================
350
+ # --- GRADIO INTERFACE FUNCTION ---
351
  # ============================================================================
352
 
353
def gradio_process_pdf(pdf_file) -> Tuple[str, Optional[str]]:
    """
    Gradio callback: run the preprocessing pipeline on an uploaded PDF.

    Args:
        pdf_file: the uploaded file. Accepted forms are a path (``str`` or
            ``os.PathLike``), an object exposing the path as ``.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        ``(report, json_path)``: the text report and the path of the generated
        JSON file, or ``None`` in place of the path when no file is available.
        Exceptions raised by the pipeline are converted into an error report
        rather than propagated.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", None

    # A File input declared with type="filepath" hands over a plain path
    # string, which has no ``.name`` attribute; handle both shapes.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # All pipeline outputs go into a dedicated scratch directory.
    temp_output_dir = tempfile.mkdtemp()
    keep_outputs = False

    try:
        json_path, _num_pages, _num_equations, _num_figures, report = run_single_pdf_preprocessing(
            pdf_path, temp_output_dir
        )

        if json_path and os.path.exists(json_path):
            # The JSON must outlive this call so it can be offered for download.
            keep_outputs = True
            return report, json_path
        return report, None

    except Exception as e:
        return f"An unexpected error occurred during processing: {e}", None
    finally:
        # Nothing is served on failure, so do not leak the scratch directory.
        if not keep_outputs:
            shutil.rmtree(temp_output_dir, ignore_errors=True)
391
 
 
 
 
 
 
392
 
393
+ # ============================================================================
394
+ # --- GRADIO INTERFACE DEFINITION ---
395
+ # ============================================================================
396
+
397
if __name__ == "__main__":
    # Warn early when the weight file is absent; the UI is still started.
    weights_missing = not os.path.exists(WEIGHTS_PATH)
    if weights_missing:
        print("⚠️ WARNING: YOLO weight file 'best.pt' not found.")
        print("The script will run, but the element counting uses placeholder values.")

    # One PDF upload in; a Markdown summary and a downloadable JSON file out.
    interface = gr.Interface(
        fn=gradio_process_pdf,
        inputs=gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"]),
        outputs=[
            gr.Markdown(label="Extraction Summary"),
            gr.File(label="Download Preprocessed JSON", type="filepath", visible=True),
        ],
        title="🔬 PDF Element Extractor (YOLO/OCR Pipeline)",
        description=(
            "Upload a research paper PDF to run the YOLO/OCR pre-processing pipeline.\n"
            "It detects pages, figures, and equations, and returns a summary of the counts "
            "along with the structured JSON output file."
        ),
        allow_flagging='never',
    )

    print("\nStarting Gradio application...")
    # NOTE: pass share=True to launch() to generate a public link (good for testing)
    interface.launch(inbrowser=True)