heerjtdev commited on
Commit
7fbfa32
·
verified ·
1 Parent(s): 70e0005

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -1530
app.py CHANGED
@@ -1,10 +1,4 @@
1
 
2
- # import base64
3
- # from PIL import Image
4
- # import re
5
-
6
-
7
-
8
 
9
 
10
 
@@ -14,7 +8,7 @@
14
  # import torch
15
  # import torch.serialization
16
  # import os
17
- # import time
18
  # from typing import Optional, Tuple, List, Dict, Any
19
  # from ultralytics import YOLO
20
  # import logging
@@ -41,28 +35,7 @@
41
  # # ============================================================================
42
 
43
  # WEIGHTS_PATH = 'best.pt'
44
- # SCALE_FACTOR = 2.0
45
- # # OUTPUT_DIR = "yolo_extracted_regions"
46
- # # OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
47
-
48
- # from transformers import TrOCRProcessor
49
- # from optimum.onnxruntime import ORTModelForVision2Seq
50
-
51
-
52
-
53
- # MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
54
- # processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
55
- # ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
56
-
57
-
58
-
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
 
67
  # # Detection parameters
68
  # CONF_THRESHOLD = 0.2
@@ -75,7 +48,7 @@
75
  # GLOBAL_EQUATION_COUNT = 0
76
 
77
  # # ============================================================================
78
- # # --- BOX COMBINATION LOGIC (Retained for detection accuracy) ---
79
  # # ============================================================================
80
 
81
  # def calculate_iou(box1, box2):
@@ -150,6 +123,7 @@
150
 
151
  # def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
152
  # """Converts a PyMuPDF Pixmap to a NumPy array for OpenCV/YOLO."""
 
153
  # img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
154
  # (pix.h, pix.w, pix.n)
155
  # )
@@ -160,252 +134,83 @@
160
  # return img
161
 
162
 
163
-
164
-
165
  # def run_yolo_detection_and_count(
166
  # image: np.ndarray, model: YOLO, page_num: int
167
- # ) -> Tuple[int, int, List[Dict[str, str]]]:
168
-
 
 
 
169
  # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
170
-
171
  # yolo_detections = []
172
  # page_equations = 0
173
  # page_figures = 0
174
- # detected_items = []
175
-
176
  # try:
177
  # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
178
-
179
  # if results and results[0].boxes:
180
  # for box in results[0].boxes.data.tolist():
181
  # x1, y1, x2, y2, conf, cls_id = box
182
  # cls_name = model.names[int(cls_id)]
183
-
184
  # if cls_name in TARGET_CLASSES:
185
  # yolo_detections.append({
186
- # 'coords': (x1, y1, x2, y2),
187
- # 'class': cls_name,
188
  # 'conf': conf
189
  # })
190
  # except Exception as e:
191
  # logging.error(f"YOLO inference failed on page {page_num}: {e}")
192
- # return 0, 0, []
193
 
 
194
  # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
195
  # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
196
 
 
197
  # for det in final_detections:
198
- # bbox = det["coords"]
199
-
200
- # if det["class"] == "equation":
201
- # GLOBAL_EQUATION_COUNT += 1
202
- # page_equations += 1
203
-
204
- # b64 = crop_and_convert_to_base64(image, bbox)
205
- # detected_items.append({
206
- # "type": "equation",
207
- # "id": f"EQUATION{GLOBAL_EQUATION_COUNT}",
208
- # "base64": b64
209
- # })
210
-
211
- # elif det["class"] == "figure":
212
  # GLOBAL_FIGURE_COUNT += 1
213
  # page_figures += 1
214
-
215
- # b64 = crop_and_convert_to_base64(image, bbox)
216
- # detected_items.append({
217
- # "type": "figure",
218
- # "id": f"FIGURE{GLOBAL_FIGURE_COUNT}",
219
- # "base64": b64
220
- # })
221
-
222
  # logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
223
- # return page_equations, page_figures, detected_items
224
-
225
-
226
-
227
- # def get_latex_from_base64(base64_string: str) -> str:
228
- # if ort_model is None or processor is None:
229
- # return "[MODEL_ERROR: Model not initialized]"
230
-
231
- # try:
232
- # image_data = base64.b64decode(base64_string)
233
- # image = Image.open(io.BytesIO(image_data)).convert('RGB')
234
-
235
- # pixel_values = processor(images=image, return_tensors="pt").pixel_values
236
- # generated_ids = ort_model.generate(pixel_values)
237
- # raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
238
-
239
- # if not raw_text:
240
- # return "[OCR_WARNING: No formula found]"
241
-
242
- # latex = raw_text[0]
243
- # latex = re.sub(r'[\r\n]+', '', latex)
244
-
245
- # return latex
246
-
247
- # except Exception as e:
248
- # return f"[TR_OCR_ERROR: {e}]"
249
-
250
-
251
-
252
-
253
-
254
-
255
-
256
-
257
-
258
-
259
 
260
 
261
-
262
-
263
-
264
-
265
- # def extract_images_from_page_in_memory(page) -> Dict[str, str]:
266
- # """
267
- # Extract images from a page and return:
268
- # { "EQUATION1": base64_string, "FIGURE1": base64_string }
269
- # """
270
- # image_map = {}
271
- # image_list = page.get_images(full=True)
272
-
273
- # for idx, img in enumerate(image_list, start=1):
274
- # xref = img[0]
275
- # base = page.parent.extract_image(xref)
276
- # image_bytes = base["image"]
277
-
278
- # base64_img = base64.b64encode(image_bytes).decode("utf-8")
279
-
280
- # # Convention: first image = FIGURE1, second image = EQUATION1 etc
281
- # # You can tune this if needed
282
- # image_map[f"FIGURE{idx}"] = base64_img
283
-
284
- # return image_map
285
-
286
-
287
-
288
-
289
- # def embed_images_as_base64_in_memory(structured_data, detected_items):
290
- # tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
291
-
292
- # item_lookup = {d["id"]: d for d in detected_items}
293
- # final_data = []
294
-
295
- # for item in structured_data:
296
- # text_fields = [
297
- # item.get('question', ''),
298
- # item.get('passage', ''),
299
- # item.get('new_passage', '')
300
- # ]
301
-
302
- # if 'options' in item:
303
- # text_fields.extend(item['options'].values())
304
-
305
- # used_tags = set()
306
-
307
- # for text in text_fields:
308
- # for m in tag_regex.finditer(text or ""):
309
- # used_tags.add(m.group(0).upper())
310
-
311
- # for tag in used_tags:
312
- # base_key = tag.lower().replace(" ", "")
313
-
314
- # if tag not in item_lookup:
315
- # item[base_key] = "[MISSING_IMAGE]"
316
- # continue
317
-
318
- # entry = item_lookup[tag]
319
-
320
- # if entry["type"] == "equation":
321
- # item[base_key] = get_latex_from_base64(entry["base64"])
322
-
323
- # else:
324
- # item[base_key] = entry["base64"]
325
-
326
- # final_data.append(item)
327
-
328
- # return final_data
329
-
330
-
331
-
332
-
333
-
334
-
335
- # def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
336
- # x1, y1, x2, y2 = map(int, bbox)
337
- # h, w, _ = image.shape
338
-
339
- # x1 = max(0, x1)
340
- # y1 = max(0, y1)
341
- # x2 = min(w, x2)
342
- # y2 = min(h, y2)
343
-
344
- # crop = image[y1:y2, x1:x2]
345
- # _, buffer = cv2.imencode(".png", crop)
346
-
347
- # return base64.b64encode(buffer).decode("utf-8")
348
-
349
-
350
-
351
-
352
-
353
-
354
-
355
-
356
-
357
-
358
-
359
-
360
-
361
-
362
-
363
-
364
-
365
-
366
  # # ============================================================================
367
- # # --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for JSON serialization) ---
368
  # # ============================================================================
369
 
370
- # # NOTE: The return signature now uses Dict[str, int] for the equation counts
371
- # def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, float, Dict[str, int], List[str]]:
372
  # """
373
- # Runs the pipeline, returns counts, report, total time, page counts dict (str keys), and empty list.
 
374
  # """
375
-
376
-
377
  # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
378
  # start_time = time.time()
379
  # log_messages = []
380
- # all_saved_images = []
381
- # all_base64_images: List[str] = []
382
-
383
- # # Dictionary to store {page_number (int): equation_count (int)}
384
- # equation_counts_per_page: Dict[int, int] = {}
385
 
386
  # # Reset globals
387
  # GLOBAL_FIGURE_COUNT = 0
388
  # GLOBAL_EQUATION_COUNT = 0
389
 
390
-
391
-
392
- # # if os.path.exists(OUTPUT_DIR):
393
- # # shutil.rmtree(OUTPUT_DIR)
394
- # # os.makedirs(OUTPUT_DIR, exist_ok=True)
395
-
396
-
397
  # # 1. Validation and Model Loading
398
  # t0 = time.time()
399
  # if not os.path.exists(pdf_path):
400
  # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
401
- # return 0, 0, 0, report, time.time() - start_time, {}, []
402
 
403
  # try:
404
  # model = YOLO(WEIGHTS_PATH)
405
  # logging.warning(f"✅ Loaded YOLO model from: {WEIGHTS_PATH}")
406
  # except Exception as e:
407
  # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
408
- # return 0, 0, 0, report, time.time() - start_time, {}, []
409
  # t1 = time.time()
410
  # log_messages.append(f"Model Loading Time: {t1-t0:.4f}s")
411
 
@@ -417,7 +222,7 @@
417
  # logging.warning(f"✅ Opened PDF with {doc.page_count} pages")
418
  # except Exception as e:
419
  # report = f"❌ ERROR loading PDF file: {e}"
420
- # return 0, 0, 0, report, time.time() - start_time, {}, []
421
  # t3 = time.time()
422
  # log_messages.append(f"PDF Initialization Time: {t3-t2:.4f}s")
423
 
@@ -442,15 +247,9 @@
442
 
443
  # # Core Detection
444
  # detect_start = time.time()
445
- # # page_equations, _ = run_yolo_detection_and_count(original_img, model, page_num)
446
- # page_equations, _, page_images = run_yolo_detection_and_count(original_img, model, page_num)
447
- # all_saved_images.extend(page_images)
448
-
449
  # detect_time = time.time() - detect_start
450
 
451
- # # Store the count in the dictionary (INT keys)
452
- # equation_counts_per_page[page_num] = page_equations
453
-
454
  # page_total_time = time.time() - page_start_time
455
  # log_messages.append(f"Page {page_num} Time: Total={page_total_time:.4f}s (Render={pix_time:.4f}s, Detect={detect_time:.4f}s)")
456
 
@@ -459,11 +258,6 @@
459
  # detection_loop_time = t5 - t4
460
  # log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
461
 
462
- # # FIX APPLIED HERE: Convert integer keys to string keys for JSON serialization
463
- # equation_counts_per_page_str_keys: Dict[str, int] = {
464
- # str(k): v for k, v in equation_counts_per_page.items()
465
- # }
466
-
467
  # # 4. Final Report Generation
468
  # total_execution_time = t5 - start_time
469
 
@@ -480,46 +274,38 @@
480
  # f"\n```"
481
  # )
482
 
483
- # # Return the dictionary with string keys
484
- # # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, []
485
- # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, all_saved_images
486
-
487
 
488
 
489
  # # ============================================================================
490
  # # --- GRADIO INTERFACE FUNCTION (Updated) ---
491
  # # ============================================================================
492
 
493
- # def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[str]]:
 
494
  # """
495
- # Gradio wrapper function to handle file upload and return results.
496
  # """
497
  # if pdf_file is None:
498
- # # Return an empty dict with string keys
499
- # return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
500
 
501
  # pdf_path = pdf_file.name
502
 
503
  # try:
504
- # # Unpack the new return value: equation_counts_per_page (with string keys)
505
- # # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, _ = run_single_pdf_preprocessing(
506
- # # pdf_path
507
- # # )
508
- # # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, images = run_single_pdf_preprocessing(pdf_path)
509
- # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, images = run_single_pdf_preprocessing(pdf_path)
510
-
511
-
512
 
513
- # # Return results (6 items now)
514
- # # return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, []
515
- # return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, images
516
-
517
 
518
  # except Exception as e:
519
  # error_msg = f"An unexpected error occurred: {e}"
520
  # logging.error(error_msg, exc_info=True)
521
- # # Return an empty dict on error
522
- # return "Error", "Error", "Error", error_msg, {}, []
523
 
524
 
525
  # # ============================================================================
@@ -539,43 +325,35 @@
539
  # output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
540
  # output_report = gr.Markdown(label="Processing Summary and Timing")
541
 
542
- # # NEW OUTPUT: JSON component for structured data
543
- # output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
544
-
545
  # # Gradio Gallery is retained but will receive an empty list []
546
  # output_gallery = gr.Gallery(
547
  # label="Detected Equations (Disabled for Speed)",
548
  # columns=5,
549
  # height="auto",
550
  # object_fit="contain",
551
- # allow_preview=False
552
  # )
553
 
554
  # interface = gr.Interface(
555
  # fn=gradio_process_pdf,
556
  # inputs=input_file,
557
- # # Outputs list remains the same, but the JSON component now receives string keys.
558
- # outputs=[
559
- # output_pages,
560
- # output_equations,
561
- # output_figures,
562
- # output_report,
563
- # output_page_counts,
564
- # output_gallery
565
- # ],
566
- # title="📊 YOLO Counting with Per-Page Data & Timing",
567
  # description=(
568
- # "Upload a PDF to run YOLO detection. The results include total counts, a breakdown of "
569
- # "equation counts per page (in JSON format), and detailed timing."
570
  # ),
571
  # )
572
 
573
  # print("\nStarting Gradio application...")
574
- # # interface.launch(inbrowser=True)
575
- # interface.launch(
576
- # inbrowser=True,
577
- # # allowed_paths=[OUTPUT_DIR]
578
- # )
 
 
579
 
580
 
581
 
@@ -587,31 +365,25 @@
587
 
588
 
589
 
590
- # import base64
591
- # from PIL import Image
592
- # import re
593
  # import fitz # PyMuPDF
594
  # import numpy as np
595
  # import cv2
596
  # import torch
597
  # import torch.serialization
598
  # import os
599
- # import time
600
- # from typing import Optional, Tuple, List, Dict, Any, Union
601
  # from ultralytics import YOLO
602
  # import logging
603
  # import gradio as gr
 
 
604
  # import io
605
- # import json
606
 
607
  # # ============================================================================
608
- # # --- Global Setup and Configuration ---
609
  # # ============================================================================
610
 
611
- # # Configure logging to write to a string buffer for display in the report
612
- # log_stream = io.StringIO()
613
- # logging.basicConfig(level=logging.WARNING, stream=log_stream, format='%(levelname)s:%(message)s')
614
-
615
  # # Patch torch.load to prevent weights_only error with older models
616
  # _original_torch_load = torch.load
617
  # def patched_torch_load(*args, **kwargs):
@@ -619,23 +391,14 @@
619
  # return _original_torch_load(*args, **kwargs)
620
  # torch.load = patched_torch_load
621
 
622
- # WEIGHTS_PATH = 'best.pt'
623
- # SCALE_FACTOR = 2.0
624
 
625
- # # --- OCR Model Initialization ---
626
- # from transformers import TrOCRProcessor
627
- # from optimum.onnxruntime import ORTModelForVision2Seq
628
 
629
- # MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
630
- # try:
631
- # processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
632
- # ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
633
- # OCR_MODEL_LOADED = True
634
- # except Exception as e:
635
- # logging.warning(f"OCR model loading failed: {e}")
636
- # processor = None
637
- # ort_model = None
638
- # OCR_MODEL_LOADED = False
639
 
640
  # # Detection parameters
641
  # CONF_THRESHOLD = 0.2
@@ -643,8 +406,12 @@
643
  # IOU_MERGE_THRESHOLD = 0.4
644
  # IOA_SUPPRESSION_THRESHOLD = 0.7
645
 
 
 
 
 
646
  # # ============================================================================
647
- # # --- BOX COMBINATION LOGIC (FIXED) ---
648
  # # ============================================================================
649
 
650
  # def calculate_iou(box1, box2):
@@ -689,11 +456,9 @@
689
 
690
  # def merge_overlapping_boxes(detections, iou_threshold):
691
  # if not detections: return []
692
- # # 1. Sort by confidence (YOLO standard)
693
  # detections.sort(key=lambda d: d['conf'], reverse=True)
694
  # merged_detections = []
695
  # is_merged = [False] * len(detections)
696
-
697
  # for i in range(len(detections)):
698
  # if is_merged[i]: continue
699
  # current_box = detections[i]['coords']
@@ -707,25 +472,16 @@
707
  # merged_x1 = min(merged_x1, other_box[0])
708
  # merged_y1 = min(merged_y1, other_box[1])
709
  # merged_x2 = max(merged_x2, other_box[2])
710
- # merged_y2 = max(other_box[3], other_box[3])
711
  # is_merged[j] = True
712
  # merged_detections.append({
713
  # 'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
714
- # # 'y1' is retained for clarity, though 'coords' contains it
715
- # 'y1': merged_y1,
716
- # 'class': current_class,
717
- # 'conf': detections[i]['conf']
718
  # })
719
-
720
- # # --- FIX IMPLEMENTATION: READING ORDER SORT ---
721
- # # Sort primarily by y1 (vertical position), secondarily by x1 (horizontal position).
722
- # # This correctly handles two-column layouts like Q.10 options (A), (B), (C), (D)
723
- # merged_detections.sort(key=lambda d: (d['coords'][1], d['coords'][0]))
724
-
725
  # return merged_detections
726
 
727
  # # ============================================================================
728
- # # --- UTILITY FUNCTIONS (Retained) ---
729
  # # ============================================================================
730
 
731
  # def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
@@ -740,137 +496,75 @@
740
  # return img
741
 
742
 
743
- # def crop_and_convert_to_pil(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> Image.Image:
744
- # """Crops the numpy array and returns a PIL Image object."""
745
- # x1, y1, x2, y2 = map(int, bbox)
746
- # h, w, _ = image.shape
747
-
748
- # x1 = max(0, x1)
749
- # y1 = max(0, y1)
750
- # x2 = min(w, x2)
751
- # y2 = min(h, y2)
752
-
753
- # crop_np = image[y1:y2, x1:x2]
754
- # crop_pil = Image.fromarray(cv2.cvtColor(crop_np, cv2.COLOR_BGR2RGB))
755
-
756
- # return crop_pil
757
-
758
-
759
- # def pil_to_base64(img: Image.Image) -> str:
760
- # """Converts a PIL Image object to a Base64 encoded string (PNG format) for OCR input."""
761
- # buffer = io.BytesIO()
762
- # img.save(buffer, format="PNG")
763
- # return base64.b64encode(buffer.getvalue()).decode("utf-8")
764
-
765
-
766
- # def get_latex_from_base64(base64_string: str) -> str:
767
- # """Performs the OCR conversion using the globally loaded model."""
768
- # if not OCR_MODEL_LOADED:
769
- # return "[MODEL_ERROR: Model not loaded]"
770
-
771
- # try:
772
- # image_data = base64.b64decode(base64_string)
773
- # image = Image.open(io.BytesIO(image_data)).convert('RGB')
774
-
775
- # pixel_values = processor(images=image, return_tensors="pt").pixel_values
776
- # generated_ids = ort_model.generate(pixel_values)
777
- # raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
778
-
779
- # if not raw_text:
780
- # return "[OCR_WARNING: No formula found]"
781
-
782
- # latex = raw_text[0]
783
- # latex = re.sub(r'[\r\n]+', '', latex)
784
-
785
- # return latex
786
-
787
- # except Exception as e:
788
- # return f"[TR_OCR_ERROR: {e}]"
789
-
790
-
791
  # def run_yolo_detection_and_count(
792
- # image: np.ndarray, model: YOLO, page_num: int,
793
- # current_eq_count: int, current_fig_count: int
794
- # ) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
795
  # """
796
- # Performs YOLO detection and returns a list of detected item dictionaries
797
- # and the updated total counters.
798
  # """
 
799
 
800
- # eq_counter = current_eq_count
801
- # fig_counter = current_fig_count
802
-
803
- # detected_items: List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]] = []
804
  # yolo_detections = []
805
-
 
 
806
  # try:
807
  # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
 
808
  # if results and results[0].boxes:
809
  # for box in results[0].boxes.data.tolist():
810
  # x1, y1, x2, y2, conf, cls_id = box
811
  # cls_name = model.names[int(cls_id)]
 
812
  # if cls_name in TARGET_CLASSES:
813
  # yolo_detections.append({
814
- # 'coords': (x1, y1, x2, y2),
815
- # 'class': cls_name,
816
  # 'conf': conf
817
  # })
818
  # except Exception as e:
819
- # logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
820
- # return [], eq_counter, fig_counter
821
 
 
822
  # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
823
  # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
824
 
825
- # # Note: final_detections is now sorted by (y1, x1) in reading order.
826
-
827
  # for det in final_detections:
828
- # bbox = det["coords"]
829
- # crop_pil = crop_and_convert_to_pil(image, bbox)
830
-
831
- # item = {
832
- # "type": det["class"],
833
- # "coords": bbox,
834
- # "pil_image": crop_pil,
835
- # }
836
-
837
- # if det["class"] == "equation":
838
- # eq_counter += 1
839
- # item["id"] = f"EQUATION{eq_counter}"
840
- # item["latex"] = ""
841
- # elif det["class"] == "figure":
842
- # fig_counter += 1
843
- # item["id"] = f"FIGURE{fig_counter}"
844
- # item["latex"] = "[FIGURE - No LaTeX]"
845
 
846
- # detected_items.append(item)
847
-
848
- # return detected_items, eq_counter, fig_counter
849
 
850
 
851
  # # ============================================================================
852
- # # --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
853
  # # ============================================================================
854
 
855
- # def run_single_pdf_preprocessing(
856
- # pdf_path: str
857
- # ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
858
  # """
859
- # Runs the pipeline, performs OCR, and returns final results.
860
  # """
861
-
862
- # log_stream.truncate(0)
863
- # log_stream.seek(0)
864
-
865
  # start_time = time.time()
 
866
 
867
- # all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []
868
-
869
- # total_figure_count = 0
870
- # total_equation_count = 0
871
 
 
 
 
872
 
873
- # # 1. Validation and Model Loading (YOLO)
874
  # t0 = time.time()
875
  # if not os.path.exists(pdf_path):
876
  # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
@@ -878,28 +572,28 @@
878
 
879
  # try:
880
  # model = YOLO(WEIGHTS_PATH)
881
- # logging.warning(f"INFO: Loaded YOLO model from: {WEIGHTS_PATH}")
882
  # except Exception as e:
883
  # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
884
  # return 0, 0, 0, report, time.time() - start_time, {}, []
885
  # t1 = time.time()
886
- # logging.warning(f"INFO: Model Loading Time: {t1-t0:.4f}s")
887
 
888
- # # 2. PDF Loading (fitz)
889
  # t2 = time.time()
890
  # try:
891
  # doc = fitz.open(pdf_path)
892
  # total_pages = doc.page_count
893
- # logging.warning(f"INFO: Opened PDF with {doc.page_count} pages")
894
  # except Exception as e:
895
  # report = f"❌ ERROR loading PDF file: {e}"
896
  # return 0, 0, 0, report, time.time() - start_time, {}, []
897
  # t3 = time.time()
898
- # logging.warning(f"INFO: PDF Initialization Time: {t3-t2:.4f}s")
899
 
900
  # mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
901
 
902
- # # 3. Page Processing, Detection, and OCR Loop
903
  # t4 = time.time()
904
  # for page_num_0_based in range(doc.page_count):
905
  # page_start_time = time.time()
@@ -913,131 +607,83 @@
913
  # original_img = pixmap_to_numpy(pix)
914
  # pix_time = time.time() - pix_start
915
  # except Exception as e:
916
- # logging.error(f"ERROR: Error converting page {page_num} to image: {e}. Skipping.")
917
  # continue
918
-
919
- # # YOLO Detection
920
  # detect_start = time.time()
921
- # (
922
- # page_extracted_items,
923
- # total_equation_count,
924
- # total_figure_count
925
- # ) = run_yolo_detection_and_count(
926
- # original_img,
927
- # model,
928
- # page_num,
929
- # total_equation_count,
930
- # total_figure_count
931
- # )
932
  # detect_time = time.time() - detect_start
933
 
934
- # # --- OCR/LaTeX Conversion and Logging ---
935
- # ocr_total_time = 0
936
- # page_equations = 0
937
-
938
- # for item in page_extracted_items:
939
- # if item["type"] == "equation":
940
- # page_equations += 1
941
- # ocr_start = time.time()
942
-
943
- # b64_string = pil_to_base64(item["pil_image"])
944
- # item["latex"] = get_latex_from_base64(b64_string)
945
-
946
- # ocr_time = time.time() - ocr_start
947
- # ocr_total_time += ocr_time
948
-
949
- # logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
950
-
951
- # all_extracted_items.extend(page_extracted_items)
952
-
953
- # page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
954
 
955
  # page_total_time = time.time() - page_start_time
956
- # logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
957
 
958
  # doc.close()
959
  # t5 = time.time()
960
  # detection_loop_time = t5 - t4
961
- # logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
962
-
963
- # # 4. Final Report Generation and Gallery Formatting
964
 
965
- # # Create the structured JSON output as requested by the user
966
- # structured_latex_output = {
967
- # "Total Pages": total_pages,
968
- # "Total Equations": total_equation_count,
969
  # }
970
- # for item in all_extracted_items:
971
- # if item["type"] == "equation":
972
- # # Map EQUATION ID to LaTeX code
973
- # structured_latex_output[item["id"]] = item["latex"]
974
-
975
-
976
- # # Format the extracted items for the Gradio Gallery
977
- # gallery_items: List[Tuple[Image.Image, str]] = []
978
-
979
- # for item in all_extracted_items:
980
- # image_label = item["id"]
981
- # if item["type"] == "equation":
982
- # image_label = f'{item["id"]}: {item["latex"]}'
983
-
984
- # gallery_items.append((item["pil_image"], image_label))
985
-
986
 
 
987
  # total_execution_time = t5 - start_time
988
 
989
- # full_log = log_stream.getvalue()
990
-
991
  # report = (
992
- # f"✅ **YOLO Counting & OCR Complete!**\n\n"
993
  # f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
994
- # f"**2) Total Equations Detected:** **{total_equation_count}**\n"
995
- # f"**3) Total Figures Detected:** **{total_figure_count}**\n"
996
  # f"---\n"
997
  # f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
998
- # f"### Full Processing Log\n"
999
- # f"```text\n"
1000
- # f"{full_log}"
1001
  # f"\n```"
1002
  # )
1003
 
1004
- # # Return the new structured_latex_output instead of the page counts
1005
- # return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
1006
 
1007
 
1008
  # # ============================================================================
1009
- # # --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
1010
  # # ============================================================================
1011
 
1012
- # def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
1013
- # """Gradio wrapper function to handle file upload and return results."""
 
 
1014
  # if pdf_file is None:
 
1015
  # return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
1016
 
1017
  # pdf_path = pdf_file.name
1018
 
1019
  # try:
1020
- # (
1021
- # num_pages,
1022
- # num_equations,
1023
- # num_figures,
1024
- # report,
1025
- # total_time,
1026
- # structured_latex_output,
1027
- # gallery_items
1028
- # ) = run_single_pdf_preprocessing(pdf_path)
1029
-
1030
 
1031
- # return str(num_pages), str(num_equations), str(num_figures), report, structured_latex_output, gallery_items
1032
-
1033
 
1034
  # except Exception as e:
1035
  # error_msg = f"An unexpected error occurred: {e}"
1036
- # logging.error(f"FATAL: {error_msg}", exc_info=True)
1037
- # full_log = log_stream.getvalue()
1038
- # error_report = f" CRITICAL ERROR:\n{error_msg}\n\n### Log up to Failure\n```text\n{full_log}\n```"
1039
- # return "Error", "Error", "Error", error_report, {}, []
1040
 
 
 
 
1041
 
1042
  # if __name__ == "__main__":
1043
 
@@ -1046,16 +692,19 @@
1046
 
1047
  # input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
1048
 
 
1049
  # output_pages = gr.Textbox(label="Total Pages in PDF", interactive=False)
1050
  # output_equations = gr.Textbox(label="Total Equations Detected", interactive=False)
1051
  # output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
1052
- # output_report = gr.Markdown(label="Processing Summary and Full Log")
1053
 
1054
- # output_structured_latex = gr.JSON(label="Structured LaTeX Output (EQUATIONx : <latex code>)")
 
1055
 
 
1056
  # output_gallery = gr.Gallery(
1057
- # label="Detected Items (with Extracted LaTeX)",
1058
- # columns=3,
1059
  # height="auto",
1060
  # object_fit="contain",
1061
  # allow_preview=False
@@ -1064,1030 +713,22 @@
1064
  # interface = gr.Interface(
1065
  # fn=gradio_process_pdf,
1066
  # inputs=input_file,
 
1067
  # outputs=[
1068
  # output_pages,
1069
  # output_equations,
1070
  # output_figures,
1071
  # output_report,
1072
- # output_structured_latex,
1073
  # output_gallery
1074
  # ],
1075
- # title="📊 YOLO Detection & Math OCR Pipeline (Reading Order Fix)",
1076
  # description=(
1077
- # "Upload a PDF. YOLO detects equations/figures, and OCR converts equations to LaTeX. Now includes a fix for two-column reading order."
 
1078
  # ),
1079
  # )
1080
 
1081
  # print("\nStarting Gradio application...")
1082
  # interface.launch(inbrowser=True)
1083
 
1084
-
1085
-
1086
-
1087
-
1088
-
1089
-
1090
-
1091
-
1092
-
1093
-
1094
-
1095
-
1096
-
1097
-
1098
-
1099
-
1100
-
1101
-
1102
- import base64
1103
- from PIL import Image
1104
- import re
1105
- import fitz # PyMuPDF
1106
- import numpy as np
1107
- import cv2
1108
- import torch
1109
- import torch.serialization
1110
- import os
1111
- import time
1112
- from typing import Optional, Tuple, List, Dict, Any, Union
1113
- from ultralytics import YOLO
1114
- import logging
1115
- import gradio as gr
1116
- import io
1117
- import json
1118
-
1119
- # ============================================================================
1120
- # --- Global Setup and Configuration (Retained) ---
1121
- # ============================================================================
1122
-
1123
# In-memory sink for log records. The processing functions in this file read
# it back with log_stream.getvalue() to embed the run log in their reports.
log_stream = io.StringIO()
# Only WARNING and above are recorded, which is why informational progress
# lines elsewhere in this file are emitted with logging.warning("INFO: ...").
logging.basicConfig(level=logging.WARNING, stream=log_stream, format='%(levelname)s:%(message)s')
1125
-
1126
# Keep a handle on the genuine loader so the wrapper below can delegate to it.
_original_torch_load = torch.load


def patched_torch_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only`` forced to False.

    Any ``weights_only`` value supplied by the caller is overwritten.

    SECURITY NOTE: with ``weights_only=False`` the checkpoint is fully
    unpickled, which can execute arbitrary code embedded in the file. Only
    load checkpoints that come from a trusted source.
    """
    kwargs["weights_only"] = False
    return _original_torch_load(*args, **kwargs)


# Module-level side effect: every later torch.load call in this process
# goes through the wrapper above.
torch.load = patched_torch_load
1131
-
1132
# Path handed to YOLO(...) when the detector is loaded.
WEIGHTS_PATH = 'best.pt'
# Zoom applied on both axes (fitz.Matrix) when a PDF page is rendered.
SCALE_FACTOR = 2.0

from transformers import TrOCRProcessor
from optimum.onnxruntime import ORTModelForVision2Seq

# Model identifier used for both the OCR processor and the ONNX model.
MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
try:
    processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
    ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
    OCR_MODEL_LOADED = True
except Exception as e:
    # Best effort: the module stays importable without the OCR model;
    # get_latex_from_base64 checks OCR_MODEL_LOADED before using it.
    logging.warning(f"OCR model loading failed: {e}")
    processor = None
    ort_model = None
    OCR_MODEL_LOADED = False

# Minimum confidence passed to model.predict(...).
CONF_THRESHOLD = 0.2
# Only detections whose class name is listed here are kept.
TARGET_CLASSES = ['figure', 'equation']
# IoU above which two same-class boxes are merged (merge_overlapping_boxes).
IOU_MERGE_THRESHOLD = 0.4
# Fraction of a smaller box that must lie inside a larger one for the smaller
# box to be suppressed (filter_nested_boxes).
IOA_SUPPRESSION_THRESHOLD = 0.7
1153
-
1154
- # ============================================================================
1155
- # --- BOX COMBINATION LOGIC (PURE VERTICAL FIX) ---
1156
- # ============================================================================
1157
-
1158
- # def calculate_iou(box1, box2):
1159
- # x1_a, y1_a, x2_a, y2_a = box1
1160
- # x1_b, y1_b, x2_b, y2_b = box2
1161
- # x_left = max(x1_a, x1_b)
1162
- # y_top = max(y1_a, y1_b)
1163
- # x_right = min(x2_a, x2_b)
1164
- # y_bottom = min(y2_a, y2_b)
1165
- # intersection_area = max(0, x_right - x_left) * max(0, y_bottom - y_top)
1166
- # box_a_area = (x2_a - x1_a) * (y2_a - y1_a)
1167
- # box_b_area = (x2_b - x1_b) * (y2_b - y1_b)
1168
- # union_area = float(box_a_area + box_b_area - intersection_area)
1169
- # return intersection_area / union_area if union_area > 0 else 0
1170
-
1171
-
1172
- # def filter_nested_boxes(detections, ioa_threshold=0.80):
1173
- # if not detections: return []
1174
- # for d in detections:
1175
- # x1, y1, x2, y2 = d['coords']
1176
- # d['area'] = (x2 - x1) * (y2 - y1)
1177
- # detections.sort(key=lambda x: x['area'], reverse=True)
1178
- # keep_indices = []
1179
- # is_suppressed = [False] * len(detections)
1180
- # for i in range(len(detections)):
1181
- # if is_suppressed[i]: continue
1182
- # keep_indices.append(i)
1183
- # box_a = detections[i]['coords']
1184
- # for j in range(i + 1, len(detections)):
1185
- # if is_suppressed[j]: continue
1186
- # box_b = detections[j]['coords']
1187
- # x_left = max(box_a[0], box_b[0])
1188
- # y_top = max(box_a[1], box_b[1])
1189
- # x_right = min(box_a[2], box_b[2])
1190
- # y_bottom = min(box_a[3], box_b[3])
1191
- # intersection = max(0, x_right - x_left) * max(0, y_bottom - y_top)
1192
- # area_b = detections[j]['area']
1193
- # if area_b > 0 and intersection / area_b > ioa_threshold:
1194
- # is_suppressed[j] = True
1195
- # return [detections[i] for i in keep_indices]
1196
-
1197
-
1198
- # # --- UPDATED: page_width argument removed ---
1199
- # def merge_overlapping_boxes(detections, iou_threshold):
1200
- # if not detections: return []
1201
- # detections.sort(key=lambda d: d['conf'], reverse=True)
1202
- # merged_detections = []
1203
- # is_merged = [False] * len(detections)
1204
-
1205
- # for i in range(len(detections)):
1206
- # if is_merged[i]: continue
1207
- # current_box = detections[i]['coords']
1208
- # current_class = detections[i]['class']
1209
- # merged_x1, merged_y1, merged_x2, merged_y2 = current_box
1210
- # for j in range(i + 1, len(detections)):
1211
- # if is_merged[j] or detections[j]['class'] != current_class: continue
1212
- # other_box = detections[j]['coords']
1213
- # iou = calculate_iou(current_box, other_box)
1214
- # if iou > iou_threshold:
1215
- # merged_x1 = min(merged_x1, other_box[0])
1216
- # merged_y1 = min(merged_y1, other_box[1])
1217
- # merged_x2 = max(merged_x2, other_box[2])
1218
- # merged_y2 = max(merged_y2, other_box[3])
1219
- # is_merged[j] = True
1220
- # merged_detections.append({
1221
- # 'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
1222
- # 'y1': merged_y1,
1223
- # 'class': current_class,
1224
- # 'conf': detections[i]['conf']
1225
- # })
1226
-
1227
- # # --- PURE VERTICAL FIX IMPLEMENTATION ---
1228
- # # Sort ONLY by the top y-coordinate (coords[1]).
1229
- # # This ignores horizontal position and any complex layout.
1230
- # merged_detections.sort(key=lambda d: d['coords'][1])
1231
-
1232
- # return merged_detections
1233
-
1234
-
1235
-
1236
-
1237
-
1238
-
1239
-
1240
-
1241
-
1242
-
1243
-
1244
-
1245
-
1246
-
1247
-
1248
-
1249
def calculate_iou(box1, box2):
    """Return the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        box1: ``(x1, y1, x2, y2)`` corner coordinates of the first box.
        box2: ``(x1, y1, x2, y2)`` corner coordinates of the second box.

    Returns:
        Intersection area divided by union area, or ``0`` when the union
        area is not positive.
    """
    ax1, ay1, ax2, ay2 = box1
    bx1, by1, bx2, by2 = box2

    # Clamp at zero so disjoint boxes contribute no overlap.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection_area = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union_area = float(area_a + area_b - intersection_area)
    return intersection_area / union_area if union_area > 0 else 0
1262
-
1263
-
1264
def calculate_ioa(box1, box2):
    """Return the intersection of two boxes as a fraction of box1's area.

    Note the denominator is the area of ``box1`` (the first argument), so
    the result answers "how much of box1 is covered by box2".

    Args:
        box1: ``(x1, y1, x2, y2)`` box whose area is the denominator.
        box2: ``(x1, y1, x2, y2)`` box intersected with ``box1``.

    Returns:
        Intersection area divided by the area of ``box1``, or ``0`` when
        that area is not positive.
    """
    ax1, ay1, ax2, ay2 = box1
    bx1, by1, bx2, by2 = box2

    # Clamp at zero so disjoint boxes contribute no overlap.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection_area = overlap_w * overlap_h

    box_a_area = (ax2 - ax1) * (ay2 - ay1)
    return intersection_area / box_a_area if box_a_area > 0 else 0
1275
-
1276
-
1277
def merge_overlapping_boxes(detections, iou_threshold):
    """Merge same-class boxes whose IoU exceeds ``iou_threshold``.

    Detections are visited from highest to lowest confidence. Each unmerged
    detection becomes a seed; every later unmerged detection of the same
    class whose IoU with the *seed's original box* exceeds the threshold is
    absorbed, and the merged box grows to enclose it.

    The caller's list is not reordered or otherwise modified.

    Args:
        detections: List of dicts with keys ``'coords'`` (x1, y1, x2, y2),
            ``'class'`` and ``'conf'``.
        iou_threshold: IoU value that must be exceeded for two boxes to merge.

    Returns:
        A new list of dicts with keys ``'coords'``, ``'y1'``, ``'class'`` and
        ``'conf'`` (the seed's confidence), sorted by top y-coordinate.
    """
    if not detections:
        return []

    # Highest confidence first, on a copy so the caller's ordering survives.
    ranked = sorted(detections, key=lambda d: d['conf'], reverse=True)

    merged_detections = []
    is_merged = [False] * len(ranked)

    for i, seed in enumerate(ranked):
        if is_merged[i]:
            continue

        seed_box = seed['coords']
        seed_class = seed['class']
        merged_x1, merged_y1, merged_x2, merged_y2 = seed_box

        # Try to absorb every lower-confidence box of the same class.
        for j in range(i + 1, len(ranked)):
            other = ranked[j]
            if is_merged[j] or other['class'] != seed_class:
                continue

            other_box = other['coords']
            # IoU is measured against the seed box, not the growing union.
            if calculate_iou(seed_box, other_box) > iou_threshold:
                merged_x1 = min(merged_x1, other_box[0])
                merged_y1 = min(merged_y1, other_box[1])
                merged_x2 = max(merged_x2, other_box[2])
                merged_y2 = max(merged_y2, other_box[3])
                is_merged[j] = True

        merged_detections.append({
            'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
            'y1': merged_y1,
            'class': seed_class,
            'conf': seed['conf']
        })

    # Top-to-bottom order for consistent downstream numbering.
    merged_detections.sort(key=lambda d: d['coords'][1])

    return merged_detections
1326
-
1327
-
1328
def filter_nested_boxes(detections, ioa_threshold=0.80):
    """Drop boxes that lie mostly inside a larger box.

    Boxes are visited from largest to smallest area. A smaller box is
    suppressed when the share of its own area covered by a kept larger box
    exceeds ``ioa_threshold``. The class of the boxes is not considered.

    Each detection dict gains an ``'area'`` key as a side effect; the
    caller's list itself is not reordered.

    Args:
        detections: List of dicts with a ``'coords'`` key (x1, y1, x2, y2).
        ioa_threshold: Covered fraction that must be exceeded to suppress
            the smaller box.

    Returns:
        The surviving detection dicts, sorted by top y-coordinate.
    """
    if not detections:
        return []

    # Annotate every detection with its area (also visible to the caller).
    for det in detections:
        x1, y1, x2, y2 = det['coords']
        det['area'] = (x2 - x1) * (y2 - y1)

    # Largest first so parents are decided before their children; sorted()
    # leaves the caller's list order untouched.
    by_area = sorted(detections, key=lambda det: det['area'], reverse=True)

    suppressed = [False] * len(by_area)
    kept = []

    for i, parent in enumerate(by_area):
        if suppressed[i]:
            continue

        kept.append(parent)
        px1, py1, px2, py2 = parent['coords']

        # Compare against every smaller, not yet suppressed box.
        for j in range(i + 1, len(by_area)):
            if suppressed[j]:
                continue

            cx1, cy1, cx2, cy2 = by_area[j]['coords']
            overlap_w = max(0, min(px2, cx2) - max(px1, cx1))
            overlap_h = max(0, min(py2, cy2) - max(py1, cy1))
            intersection = overlap_w * overlap_h
            child_area = by_area[j]['area']

            # Suppress the smaller box when it is mostly inside the parent.
            if child_area > 0 and intersection / child_area > ioa_threshold:
                suppressed[j] = True

    # Survivors are returned top-to-bottom.
    kept.sort(key=lambda det: det['coords'][1])

    return kept
1379
-
1380
-
1381
-
1382
-
1383
-
1384
-
1385
-
1386
-
1387
-
1388
-
1389
-
1390
-
1391
-
1392
-
1393
-
1394
-
1395
-
1396
-
1397
-
1398
-
1399
-
1400
-
1401
-
1402
- # ============================================================================
1403
- # --- UTILITY FUNCTIONS (Retained) ---
1404
- # ============================================================================
1405
-
1406
def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
    """Convert a PyMuPDF Pixmap into a NumPy image array.

    The raw sample buffer is viewed as ``uint8`` and reshaped to
    ``(pix.h, pix.w, pix.n)``. Four-channel input is converted with
    ``COLOR_RGBA2RGB`` and single-channel input with ``COLOR_GRAY2RGB``;
    any other channel count is returned as reshaped, without conversion.

    Args:
        pix: The rendered page pixmap.

    Returns:
        The image as a NumPy array.
    """
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
        (pix.h, pix.w, pix.n)
    )
    if pix.n == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    elif pix.n == 1:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    return img
1416
-
1417
-
1418
def crop_and_convert_to_pil(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> Image.Image:
    """Crop ``bbox`` out of ``image`` and return the crop as a PIL Image.

    Args:
        image: Page image as an ``(height, width, channels)`` array in RGB
            channel order, as produced by ``pixmap_to_numpy``.
        bbox: ``(x1, y1, x2, y2)`` box; values are truncated to ints and
            clamped to the image bounds before cropping.

    Returns:
        The cropped region as a PIL Image.
    """
    x1, y1, x2, y2 = map(int, bbox)
    h, w, _ = image.shape

    # Clamp the box to the image so slicing never wraps or overruns.
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(w, x2)
    y2 = min(h, y2)

    crop_np = image[y1:y2, x1:x2]
    # The page image is already RGB (see pixmap_to_numpy), so it is passed to
    # PIL as-is. The previous BGR->RGB conversion swapped the red and blue
    # channels of every crop.
    crop_pil = Image.fromarray(np.ascontiguousarray(crop_np))

    return crop_pil
1432
-
1433
-
1434
def pil_to_base64(img: Image.Image) -> str:
    """Encode a PIL Image as a Base64 string of its PNG representation.

    Args:
        img: The image to encode.

    Returns:
        The PNG bytes of ``img``, Base64-encoded and decoded as UTF-8 text.
    """
    png_buffer = io.BytesIO()
    img.save(png_buffer, format="PNG")
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")
1439
-
1440
-
1441
def get_latex_from_base64(base64_string: str) -> str:
    """Run the module-level OCR model on a Base64-encoded image.

    Args:
        base64_string: Base64 text of an image file readable by PIL.

    Returns:
        The first decoded sequence with all carriage returns and newlines
        removed. Failures are reported in-band rather than raised:
        ``"[MODEL_ERROR: Model not loaded]"`` when the OCR model is
        unavailable, ``"[OCR_WARNING: No formula found]"`` when decoding
        yields nothing, and ``"[TR_OCR_ERROR: ...]"`` for any exception.
    """
    if not OCR_MODEL_LOADED:
        return "[MODEL_ERROR: Model not loaded]"

    try:
        image_data = base64.b64decode(base64_string)
        image = Image.open(io.BytesIO(image_data)).convert('RGB')

        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        generated_ids = ort_model.generate(pixel_values)
        raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

        if not raw_text:
            return "[OCR_WARNING: No formula found]"

        # Collapse the result onto a single line.
        return re.sub(r'[\r\n]+', '', raw_text[0])

    except Exception as e:
        # Deliberate best effort: one failed equation must not abort the run.
        return f"[TR_OCR_ERROR: {e}]"
1464
-
1465
-
1466
- # --- UPDATED: page width argument removed from signature and call ---
1467
- # def run_yolo_detection_and_count(
1468
- # image: np.ndarray, model: YOLO, page_num: int,
1469
- # current_eq_count: int, current_fig_count: int
1470
- # ) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
1471
- # """
1472
- # Performs YOLO detection and returns a list of detected item dictionaries
1473
- # and the updated total counters.
1474
- # """
1475
-
1476
- # eq_counter = current_eq_count
1477
- # fig_counter = current_fig_count
1478
-
1479
- # detected_items: List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]] = []
1480
- # yolo_detections = []
1481
-
1482
- # try:
1483
- # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
1484
- # if results and results[0].boxes:
1485
- # for box in results[0].boxes.data.tolist():
1486
- # x1, y1, x2, y2, conf, cls_id = box
1487
- # cls_name = model.names[int(cls_id)]
1488
- # if cls_name in TARGET_CLASSES:
1489
- # yolo_detections.append({
1490
- # 'coords': (x1, y1, x2, y2),
1491
- # 'class': cls_name,
1492
- # 'conf': conf
1493
- # })
1494
- # except Exception as e:
1495
- # logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
1496
- # return [], eq_counter, fig_counter
1497
-
1498
- # # Call merge_overlapping_boxes without page_width
1499
- # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
1500
- # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1501
-
1502
- # # Note: final_detections is now sorted purely by y1
1503
-
1504
- # for det in final_detections:
1505
- # bbox = det["coords"]
1506
- # crop_pil = crop_and_convert_to_pil(image, bbox)
1507
-
1508
- # item = {
1509
- # "type": det["class"],
1510
- # "coords": bbox,
1511
- # "pil_image": crop_pil,
1512
- # }
1513
-
1514
- # if det["class"] == "equation":
1515
- # eq_counter += 1
1516
- # item["id"] = f"EQUATION{eq_counter}"
1517
- # item["latex"] = ""
1518
- # elif det["class"] == "figure":
1519
- # fig_counter += 1
1520
- # item["id"] = f"FIGURE{fig_counter}"
1521
- # item["latex"] = "[FIGURE - No LaTeX]"
1522
-
1523
- # detected_items.append(item)
1524
-
1525
- # return detected_items, eq_counter, fig_counter
1526
-
1527
-
1528
-
1529
-
1530
-
1531
-
1532
-
1533
-
1534
- # def run_yolo_detection_and_count(
1535
- # image: np.ndarray, model: YOLO, page_num: int,
1536
- # current_eq_count: int, current_fig_count: int
1537
- # ) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
1538
- # """
1539
- # Performs YOLO detection and returns a list of detected item dictionaries
1540
- # and the updated total counters.
1541
- # """
1542
-
1543
- # eq_counter = current_eq_count
1544
- # fig_counter = current_fig_count
1545
-
1546
- # detected_items: List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]] = []
1547
- # yolo_detections = []
1548
-
1549
- # try:
1550
- # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
1551
- # if results and results[0].boxes:
1552
- # for box in results[0].boxes.data.tolist():
1553
- # x1, y1, x2, y2, conf, cls_id = box
1554
- # cls_name = model.names[int(cls_id)]
1555
- # if cls_name in TARGET_CLASSES:
1556
- # yolo_detections.append({
1557
- # 'coords': (x1, y1, x2, y2),
1558
- # 'class': cls_name,
1559
- # 'conf': conf
1560
- # })
1561
- # except Exception as e:
1562
- # logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
1563
- # return [], eq_counter, fig_counter
1564
-
1565
- # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
1566
- # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
1567
-
1568
- # for det in final_detections:
1569
- # bbox = det["coords"]
1570
- # crop_pil = crop_and_convert_to_pil(image, bbox)
1571
-
1572
- # item = {
1573
- # "type": det["class"],
1574
- # "coords": bbox,
1575
- # "pil_image": crop_pil,
1576
- # "page_num": page_num, # ← ADD THIS LINE
1577
- # }
1578
-
1579
- # if det["class"] == "equation":
1580
- # eq_counter += 1
1581
- # item["id"] = f"EQUATION{eq_counter}"
1582
- # item["latex"] = ""
1583
- # elif det["class"] == "figure":
1584
- # fig_counter += 1
1585
- # item["id"] = f"FIGURE{fig_counter}"
1586
- # item["latex"] = "[FIGURE - No LaTeX]"
1587
-
1588
- # detected_items.append(item)
1589
-
1590
- # return detected_items, eq_counter, fig_counter
1591
-
1592
-
1593
-
1594
-
1595
def run_yolo_detection_and_count(
    image: np.ndarray, model: YOLO, page_num: int,
    current_eq_count: int, current_fig_count: int
) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
    """Detect figures and equations on one page image and crop each of them.

    Args:
        image: The rendered page image.
        model: Loaded YOLO model used for prediction.
        page_num: Page number stored on every item and used in log messages.
        current_eq_count: Equation counter value before this page.
        current_fig_count: Figure counter value before this page.

    Returns:
        ``(items, eq_counter, fig_counter)``. Each item is a dict with keys
        ``"type"``, ``"coords"``, ``"pil_image"``, ``"page_num"``, ``"id"``
        and ``"latex"``. If inference raises, the error is logged and an
        empty list is returned with the counters unchanged.
    """
    eq_counter = current_eq_count
    fig_counter = current_fig_count

    detected_items = []
    yolo_detections = []

    # NOTE(review): Ultralytics documents NumPy inputs as BGR, while the
    # visible caller passes the RGB output of pixmap_to_numpy; confirm
    # whether the channel order matters for this model.
    try:
        results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
        if results and results[0].boxes:
            for x1, y1, x2, y2, conf, cls_id in results[0].boxes.data.tolist():
                cls_name = model.names[int(cls_id)]
                if cls_name in TARGET_CLASSES:
                    yolo_detections.append({
                        'coords': (x1, y1, x2, y2),
                        'class': cls_name,
                        'conf': conf
                    })
    except Exception as e:
        logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
        return [], eq_counter, fig_counter

    # Order matters: merge overlapping boxes first, then drop nested ones.
    merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
    final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)

    # Build one item per surviving detection.
    for det in final_detections:
        bbox = det["coords"]

        item = {
            "type": det["class"],
            "coords": bbox,
            "pil_image": crop_and_convert_to_pil(image, bbox),
            "page_num": page_num,
        }

        if det["class"] == "equation":
            eq_counter += 1
            item["id"] = f"EQUATION{eq_counter}"
            item["latex"] = ""
        elif det["class"] == "figure":
            fig_counter += 1
            item["id"] = f"FIGURE{fig_counter}"
            item["latex"] = "[FIGURE - No LaTeX]"

        detected_items.append(item)

    return detected_items, eq_counter, fig_counter
1653
-
1654
-
1655
-
1656
-
1657
- # ============================================================================
1658
- # --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
1659
- # ============================================================================
1660
-
1661
- # def run_single_pdf_preprocessing(
1662
- # pdf_path: str
1663
- # ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
1664
- # """
1665
- # Runs the pipeline, performs OCR, and returns final results.
1666
- # """
1667
-
1668
- # log_stream.truncate(0)
1669
- # log_stream.seek(0)
1670
-
1671
- # start_time = time.time()
1672
-
1673
- # all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []
1674
-
1675
- # total_figure_count = 0
1676
- # total_equation_count = 0
1677
-
1678
-
1679
- # # 1. Validation and Model Loading (YOLO)
1680
- # t0 = time.time()
1681
- # if not os.path.exists(pdf_path):
1682
- # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
1683
- # return 0, 0, 0, report, time.time() - start_time, {}, []
1684
-
1685
- # try:
1686
- # model = YOLO(WEIGHTS_PATH)
1687
- # logging.warning(f"INFO: Loaded YOLO model from: {WEIGHTS_PATH}")
1688
- # except Exception as e:
1689
- # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
1690
- # return 0, 0, 0, report, time.time() - start_time, {}, []
1691
- # t1 = time.time()
1692
- # logging.warning(f"INFO: Model Loading Time: {t1-t0:.4f}s")
1693
-
1694
- # # 2. PDF Loading (fitz)
1695
- # t2 = time.time()
1696
- # try:
1697
- # doc = fitz.open(pdf_path)
1698
- # total_pages = doc.page_count
1699
- # logging.warning(f"INFO: Opened PDF with {doc.page_count} pages")
1700
- # except Exception as e:
1701
- # report = f"❌ ERROR loading PDF file: {e}"
1702
- # return 0, 0, 0, report, time.time() - start_time, {}, []
1703
- # t3 = time.time()
1704
- # logging.warning(f"INFO: PDF Initialization Time: {t3-t2:.4f}s")
1705
-
1706
- # mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
1707
-
1708
- # # 3. Page Processing, Detection, and OCR Loop
1709
- # t4 = time.time()
1710
- # for page_num_0_based in range(doc.page_count):
1711
- # page_start_time = time.time()
1712
- # fitz_page = doc.load_page(page_num_0_based)
1713
- # page_num = page_num_0_based + 1
1714
-
1715
- # # Render page to image for YOLO
1716
- # try:
1717
- # pix_start = time.time()
1718
- # pix = fitz_page.get_pixmap(matrix=mat)
1719
- # original_img = pixmap_to_numpy(pix)
1720
- # pix_time = time.time() - pix_start
1721
- # except Exception as e:
1722
- # logging.error(f"ERROR: Error converting page {page_num} to image: {e}. Skipping.")
1723
- # continue
1724
-
1725
- # # YOLO Detection
1726
- # detect_start = time.time()
1727
- # (
1728
- # page_extracted_items,
1729
- # total_equation_count,
1730
- # total_figure_count
1731
- # ) = run_yolo_detection_and_count(
1732
- # original_img,
1733
- # model,
1734
- # page_num,
1735
- # total_equation_count,
1736
- # total_figure_count
1737
- # )
1738
- # detect_time = time.time() - detect_start
1739
-
1740
- # # --- OCR/LaTeX Conversion and Logging ---
1741
- # ocr_total_time = 0
1742
- # page_equations = 0
1743
-
1744
- # for item in page_extracted_items:
1745
- # if item["type"] == "equation":
1746
- # page_equations += 1
1747
- # ocr_start = time.time()
1748
-
1749
- # b64_string = pil_to_base64(item["pil_image"])
1750
- # item["latex"] = get_latex_from_base64(b64_string)
1751
-
1752
- # ocr_time = time.time() - ocr_start
1753
- # ocr_total_time += ocr_time
1754
-
1755
- # logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
1756
-
1757
- # all_extracted_items.extend(page_extracted_items)
1758
-
1759
- # page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
1760
-
1761
- # page_total_time = time.time() - page_start_time
1762
- # logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
1763
-
1764
- # doc.close()
1765
- # t5 = time.time()
1766
- # detection_loop_time = t5 - t4
1767
- # logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
1768
-
1769
- # # 4. Final Report Generation and Gallery Formatting
1770
-
1771
- # # Create the structured JSON output as requested by the user
1772
- # structured_latex_output = {
1773
- # "Total Pages": total_pages,
1774
- # "Total Equations": total_equation_count,
1775
- # }
1776
- # for item in all_extracted_items:
1777
- # if item["type"] == "equation":
1778
- # # Map EQUATION ID to LaTeX code
1779
- # structured_latex_output[item["id"]] = item["latex"]
1780
-
1781
-
1782
- # # Format the extracted items for the Gradio Gallery
1783
- # gallery_items: List[Tuple[Image.Image, str]] = []
1784
-
1785
- # for item in all_extracted_items:
1786
- # image_label = item["id"]
1787
- # if item["type"] == "equation":
1788
- # image_label = f'{item["id"]}: {item["latex"]}'
1789
-
1790
- # gallery_items.append((item["pil_image"], image_label))
1791
-
1792
-
1793
- # total_execution_time = t5 - start_time
1794
-
1795
- # full_log = log_stream.getvalue()
1796
-
1797
- # report = (
1798
- # f"✅ **YOLO Counting & OCR Complete!**\n\n"
1799
- # f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
1800
- # f"**2) Total Equations Detected:** **{total_equation_count}**\n"
1801
- # f"**3) Total Figures Detected:** **{total_figure_count}**\n"
1802
- # f"---\n"
1803
- # f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
1804
- # f"### Full Processing Log\n"
1805
- # f"```text\n"
1806
- # f"{full_log}"
1807
- # f"\n```"
1808
- # )
1809
-
1810
- # # Return the new structured_latex_output instead of the page counts
1811
- # return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
1812
-
1813
-
1814
-
1815
-
1816
-
1817
-
1818
-
1819
-
1820
-
1821
-
1822
-
1823
-
1824
-
1825
-
1826
def run_single_pdf_preprocessing(
    pdf_path: str
) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
    """Run detection and OCR over every page of a PDF and build the report.

    Steps: load the YOLO model, open the PDF, render and detect page by
    page, sort all items by page and top y-coordinate, renumber them, run
    OCR on the equations in that order, then assemble the outputs.

    Args:
        pdf_path: Filesystem path of the PDF to process.

    Returns:
        ``(total_pages, total_equations, total_figures, report,
        execution_seconds, structured_latex_output, gallery_items)``.
        ``structured_latex_output`` maps ``"Total Pages"``,
        ``"Total Equations"`` and every equation ID to its LaTeX string;
        ``gallery_items`` is a list of ``(image, label)`` pairs. When the
        PDF is missing, or the model or the PDF cannot be loaded, the counts
        are ``0``, the report carries the error text, and the dict and list
        are empty.
    """
    # Reset the shared in-memory log so the report only covers this run.
    log_stream.truncate(0)
    log_stream.seek(0)

    start_time = time.time()

    all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []

    total_figure_count = 0
    total_equation_count = 0

    # 1. Validation and Model Loading (YOLO)
    t0 = time.time()
    if not os.path.exists(pdf_path):
        report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
        return 0, 0, 0, report, time.time() - start_time, {}, []

    try:
        model = YOLO(WEIGHTS_PATH)
        logging.warning(f"INFO: Loaded YOLO model from: {WEIGHTS_PATH}")
    except Exception as e:
        report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
        return 0, 0, 0, report, time.time() - start_time, {}, []
    t1 = time.time()
    logging.warning(f"INFO: Model Loading Time: {t1-t0:.4f}s")

    # 2. PDF Loading (fitz)
    t2 = time.time()
    try:
        doc = fitz.open(pdf_path)
        total_pages = doc.page_count
        logging.warning(f"INFO: Opened PDF with {doc.page_count} pages")
    except Exception as e:
        report = f"❌ ERROR loading PDF file: {e}"
        return 0, 0, 0, report, time.time() - start_time, {}, []
    t3 = time.time()
    logging.warning(f"INFO: PDF Initialization Time: {t3-t2:.4f}s")

    mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)

    # 3. Page Processing and Detection Loop
    t4 = time.time()
    try:
        for page_num_0_based in range(doc.page_count):
            page_start_time = time.time()
            fitz_page = doc.load_page(page_num_0_based)
            page_num = page_num_0_based + 1

            # Render page to image for YOLO; a page that fails to render is
            # logged and skipped rather than aborting the whole document.
            try:
                pix = fitz_page.get_pixmap(matrix=mat)
                original_img = pixmap_to_numpy(pix)
            except Exception as e:
                logging.error(f"ERROR: Error converting page {page_num} to image: {e}. Skipping.")
                continue

            # YOLO Detection
            detect_start = time.time()
            (
                page_extracted_items,
                total_equation_count,
                total_figure_count
            ) = run_yolo_detection_and_count(
                original_img,
                model,
                page_num,
                total_equation_count,
                total_figure_count
            )
            detect_time = time.time() - detect_start

            # Store items (OCR is done later, in the final order).
            all_extracted_items.extend(page_extracted_items)

            page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
            page_equations = sum(1 for item in page_extracted_items if item["type"] == "equation")

            page_total_time = time.time() - page_start_time
            logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s)")
    finally:
        # Release the document even if a page raises unexpectedly.
        doc.close()

    t5 = time.time()
    detection_loop_time = t5 - t4
    logging.warning(f"INFO: Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")

    # 4. Sort all items by page number, then by y-coordinate
    logging.warning(f"INFO: Sorting {len(all_extracted_items)} items by page and position...")
    all_extracted_items.sort(key=lambda item: (item['page_num'], item['coords'][1]))

    # 5. Re-assign IDs in the sorted order
    equation_counter = 0
    figure_counter = 0

    for item in all_extracted_items:
        if item["type"] == "equation":
            equation_counter += 1
            item["id"] = f"EQUATION{equation_counter}"
        elif item["type"] == "figure":
            figure_counter += 1
            item["id"] = f"FIGURE{figure_counter}"

    # The totals reported from here on are the renumbered counts.
    total_equation_count = equation_counter
    total_figure_count = figure_counter

    logging.warning(f"INFO: Re-numbered items - Total Equations: {total_equation_count}, Total Figures: {total_figure_count}")

    # 6. Perform OCR in the sorted order
    ocr_total_time = 0

    logging.warning(f"INFO: Starting OCR for {total_equation_count} equations in correct order...")

    for item in all_extracted_items:
        if item["type"] == "equation":
            ocr_start = time.time()

            b64_string = pil_to_base64(item["pil_image"])
            item["latex"] = get_latex_from_base64(b64_string)

            ocr_time = time.time() - ocr_start
            ocr_total_time += ocr_time

            logging.warning(f"LATEX: Page {item['page_num']}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
        elif item["type"] == "figure":
            item["latex"] = "[FIGURE - No LaTeX]"

    t7 = time.time()
    logging.warning(f"INFO: Total OCR Time: {ocr_total_time:.4f}s")

    # 7. Final Report Generation and Gallery Formatting

    # Structured output: summary counts plus one entry per equation ID.
    structured_latex_output = {
        "Total Pages": total_pages,
        "Total Equations": total_equation_count,
    }
    for item in all_extracted_items:
        if item["type"] == "equation":
            structured_latex_output[item["id"]] = item["latex"]

    # Gallery entries: (image, label); equations also show their LaTeX.
    gallery_items: List[Tuple[Image.Image, str]] = []

    for item in all_extracted_items:
        image_label = item["id"]
        if item["type"] == "equation":
            image_label = f'{item["id"]}: {item["latex"]}'

        gallery_items.append((item["pil_image"], image_label))

    total_execution_time = t7 - start_time

    full_log = log_stream.getvalue()

    report = (
        f"✅ **YOLO Counting & OCR Complete!**\n\n"
        f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
        f"**2) Total Equations Detected:** **{total_equation_count}**\n"
        f"**3) Total Figures Detected:** **{total_figure_count}**\n"
        f"---\n"
        f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
        f"### Full Processing Log\n"
        f"```text\n"
        f"{full_log}"
        f"\n```"
    )

    return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
2007
-
2008
-
2009
-
2010
-
2011
-
2012
-
2013
-
2014
-
2015
-
2016
-
2017
-
2018
-
2019
- # ============================================================================
2020
- # --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
2021
- # ============================================================================
2022
-
2023
- def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
2024
- """Gradio wrapper function to handle file upload and return results."""
2025
- if pdf_file is None:
2026
- return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
2027
-
2028
- pdf_path = pdf_file.name
2029
-
2030
- try:
2031
- (
2032
- num_pages,
2033
- num_equations,
2034
- num_figures,
2035
- report,
2036
- total_time,
2037
- structured_latex_output,
2038
- gallery_items
2039
- ) = run_single_pdf_preprocessing(pdf_path)
2040
-
2041
-
2042
- return str(num_pages), str(num_equations), str(num_figures), report, structured_latex_output, gallery_items
2043
-
2044
-
2045
- except Exception as e:
2046
- error_msg = f"An unexpected error occurred: {e}"
2047
- logging.error(f"FATAL: {error_msg}", exc_info=True)
2048
- full_log = log_stream.getvalue()
2049
- error_report = f"❌ CRITICAL ERROR:\n{error_msg}\n\n### Log up to Failure\n```text\n{full_log}\n```"
2050
- return "Error", "Error", "Error", error_report, {}, []
2051
-
2052
-
2053
- if __name__ == "__main__":
2054
-
2055
- if not os.path.exists(WEIGHTS_PATH):
2056
- logging.error(f"❌ FATAL ERROR: YOLO weight file '{WEIGHTS_PATH}' not found. Cannot run live inference.")
2057
-
2058
- input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
2059
-
2060
- output_pages = gr.Textbox(label="Total Pages in PDF", interactive=False)
2061
- output_equations = gr.Textbox(label="Total Equations Detected", interactive=False)
2062
- output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
2063
- output_report = gr.Markdown(label="Processing Summary and Full Log")
2064
-
2065
- output_structured_latex = gr.JSON(label="Structured LaTeX Output (EQUATIONx : <latex code>)")
2066
-
2067
- output_gallery = gr.Gallery(
2068
- label="Detected Items (with Extracted LaTeX)",
2069
- columns=3,
2070
- height="auto",
2071
- object_fit="contain",
2072
- allow_preview=False
2073
- )
2074
-
2075
- interface = gr.Interface(
2076
- fn=gradio_process_pdf,
2077
- inputs=input_file,
2078
- outputs=[
2079
- output_pages,
2080
- output_equations,
2081
- output_figures,
2082
- output_report,
2083
- output_structured_latex,
2084
- output_gallery
2085
- ],
2086
- title="📊 YOLO Detection & Math OCR Pipeline (Pure Vertical Sort)",
2087
- description=(
2088
- "Upload a PDF. YOLO detects equations/figures, and OCR converts equations to LaTeX. The output is now strictly sorted by the top bounding box Y-coordinate."
2089
- ),
2090
- )
2091
-
2092
- print("\nStarting Gradio application...")
2093
- interface.launch(inbrowser=True)
 
1
 
 
 
 
 
 
 
2
 
3
 
4
 
 
8
  # import torch
9
  # import torch.serialization
10
  # import os
11
+ # import time # Import for timing
12
  # from typing import Optional, Tuple, List, Dict, Any
13
  # from ultralytics import YOLO
14
  # import logging
 
35
  # # ============================================================================
36
 
37
  # WEIGHTS_PATH = 'best.pt'
38
+ # SCALE_FACTOR = 2.0 # Used for page rendering and coordinate scaling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # # Detection parameters
41
  # CONF_THRESHOLD = 0.2
 
48
  # GLOBAL_EQUATION_COUNT = 0
49
 
50
  # # ============================================================================
51
+ # # --- BOX COMBINATION LOGIC ---
52
  # # ============================================================================
53
 
54
  # def calculate_iou(box1, box2):
 
123
 
124
  # def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
125
  # """Converts a PyMuPDF Pixmap to a NumPy array for OpenCV/YOLO."""
126
+ # # This function is retained as it's required to convert PDF page to image for YOLO input.
127
  # img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
128
  # (pix.h, pix.w, pix.n)
129
  # )
 
134
  # return img
135
 
136
 
 
 
137
  # def run_yolo_detection_and_count(
138
  # image: np.ndarray, model: YOLO, page_num: int
139
+ # ) -> Tuple[int, int]: # Removed equation_results list from return
140
+ # """
141
+ # Runs YOLO inference, applies NMS/filtering, and updates global counters.
142
+ # Returns page counts only.
143
+ # """
144
  # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
145
+
146
  # yolo_detections = []
147
  # page_equations = 0
148
  # page_figures = 0
149
+
 
150
  # try:
151
  # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
152
+
153
  # if results and results[0].boxes:
154
  # for box in results[0].boxes.data.tolist():
155
  # x1, y1, x2, y2, conf, cls_id = box
156
  # cls_name = model.names[int(cls_id)]
157
+
158
  # if cls_name in TARGET_CLASSES:
159
  # yolo_detections.append({
160
+ # 'coords': (x1, y1, x2, y2),
161
+ # 'class': cls_name,
162
  # 'conf': conf
163
  # })
164
  # except Exception as e:
165
  # logging.error(f"YOLO inference failed on page {page_num}: {e}")
166
+ # return 0, 0
167
 
168
+ # # Apply NMS/Merging/Filtering
169
  # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
170
  # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
171
 
172
+ # # Update Global Counters
173
  # for det in final_detections:
174
+ # if det['class'] == 'figure':
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # GLOBAL_FIGURE_COUNT += 1
176
  # page_figures += 1
177
+ # elif det['class'] == 'equation':
178
+ # GLOBAL_EQUATION_COUNT += 1
179
+ # page_equations += 1
180
+
 
 
 
 
181
  # logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
182
+ # return page_equations, page_figures
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  # # ============================================================================
186
+ # # --- MAIN DOCUMENT PROCESSING FUNCTION (Optimized) ---
187
  # # ============================================================================
188
 
189
+ # def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, float, List[str]]:
 
190
  # """
191
+ # Runs the pipeline, returns counts, report, total time, and an empty list
192
+ # (maintaining the expected return signature for Gradio but with None for gallery).
193
  # """
 
 
194
  # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
195
  # start_time = time.time()
196
  # log_messages = []
 
 
 
 
 
197
 
198
  # # Reset globals
199
  # GLOBAL_FIGURE_COUNT = 0
200
  # GLOBAL_EQUATION_COUNT = 0
201
 
 
 
 
 
 
 
 
202
  # # 1. Validation and Model Loading
203
  # t0 = time.time()
204
  # if not os.path.exists(pdf_path):
205
  # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
206
+ # return 0, 0, 0, report, time.time() - start_time, []
207
 
208
  # try:
209
  # model = YOLO(WEIGHTS_PATH)
210
  # logging.warning(f"✅ Loaded YOLO model from: {WEIGHTS_PATH}")
211
  # except Exception as e:
212
  # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
213
+ # return 0, 0, 0, report, time.time() - start_time, []
214
  # t1 = time.time()
215
  # log_messages.append(f"Model Loading Time: {t1-t0:.4f}s")
216
 
 
222
  # logging.warning(f"✅ Opened PDF with {doc.page_count} pages")
223
  # except Exception as e:
224
  # report = f"❌ ERROR loading PDF file: {e}"
225
+ # return 0, 0, 0, report, time.time() - start_time, []
226
  # t3 = time.time()
227
  # log_messages.append(f"PDF Initialization Time: {t3-t2:.4f}s")
228
 
 
247
 
248
  # # Core Detection
249
  # detect_start = time.time()
250
+ # run_yolo_detection_and_count(original_img, model, page_num)
 
 
 
251
  # detect_time = time.time() - detect_start
252
 
 
 
 
253
  # page_total_time = time.time() - page_start_time
254
  # log_messages.append(f"Page {page_num} Time: Total={page_total_time:.4f}s (Render={pix_time:.4f}s, Detect={detect_time:.4f}s)")
255
 
 
258
  # detection_loop_time = t5 - t4
259
  # log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
260
 
 
 
 
 
 
261
  # # 4. Final Report Generation
262
  # total_execution_time = t5 - start_time
263
 
 
274
  # f"\n```"
275
  # )
276
 
277
+ # # Return total_execution_time and an empty list for the gallery output
278
+ # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, []
 
 
279
 
280
 
281
  # # ============================================================================
282
  # # --- GRADIO INTERFACE FUNCTION (Updated) ---
283
  # # ============================================================================
284
 
285
+ # # NOTE: The return signature has changed. We removed 'temp_output_dir' as it's no longer used.
286
+ # def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, List[str]]:
287
  # """
288
+ # Gradio wrapper function to handle file upload and return results (no image handling).
289
  # """
290
  # if pdf_file is None:
291
+ # return "N/A", "N/A", "N/A", "Please upload a PDF file.", []
 
292
 
293
  # pdf_path = pdf_file.name
294
 
295
  # try:
296
+ # # Run the core logic
297
+ # # Note the change: temp_output_dir is removed, and total_time is returned
298
+ # num_pages, num_equations, num_figures, report, total_time, _ = run_single_pdf_preprocessing(
299
+ # pdf_path
300
+ # )
 
 
 
301
 
302
+ # # Return results (the last item is an empty list for the now-empty gallery)
303
+ # return str(num_pages), str(num_equations), str(num_figures), report, []
 
 
304
 
305
  # except Exception as e:
306
  # error_msg = f"An unexpected error occurred: {e}"
307
  # logging.error(error_msg, exc_info=True)
308
+ # return "Error", "Error", "Error", error_msg, []
 
309
 
310
 
311
  # # ============================================================================
 
325
  # output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
326
  # output_report = gr.Markdown(label="Processing Summary and Timing")
327
 
 
 
 
328
  # # Gradio Gallery is retained but will receive an empty list []
329
  # output_gallery = gr.Gallery(
330
  # label="Detected Equations (Disabled for Speed)",
331
  # columns=5,
332
  # height="auto",
333
  # object_fit="contain",
334
+ # allow_preview=False # Disable preview since it's empty
335
  # )
336
 
337
  # interface = gr.Interface(
338
  # fn=gradio_process_pdf,
339
  # inputs=input_file,
340
+ # # The number of outputs remains 5 (3 textboxes, 1 markdown, 1 gallery)
341
+ # outputs=[output_pages, output_equations, output_figures, output_report, output_gallery],
342
+ # title="🚀 Optimized YOLO Counting with Timing",
 
 
 
 
 
 
 
343
  # description=(
344
+ # "Upload a PDF to run YOLO detection. Image cropping is disabled for maximum speed. "
345
+ # "Timing for each step is included in the summary report."
346
  # ),
347
  # )
348
 
349
  # print("\nStarting Gradio application...")
350
+ # interface.launch(inbrowser=True)
351
+
352
+
353
+
354
+
355
+
356
+
357
 
358
 
359
 
 
365
 
366
 
367
 
 
 
 
368
  # import fitz # PyMuPDF
369
  # import numpy as np
370
  # import cv2
371
  # import torch
372
  # import torch.serialization
373
  # import os
374
+ # import time
375
+ # from typing import Optional, Tuple, List, Dict, Any
376
  # from ultralytics import YOLO
377
  # import logging
378
  # import gradio as gr
379
+ # import shutil
380
+ # import tempfile
381
  # import io
 
382
 
383
  # # ============================================================================
384
+ # # --- Global Patches and Setup ---
385
  # # ============================================================================
386
 
 
 
 
 
387
  # # Patch torch.load to prevent weights_only error with older models
388
  # _original_torch_load = torch.load
389
  # def patched_torch_load(*args, **kwargs):
 
391
  # return _original_torch_load(*args, **kwargs)
392
  # torch.load = patched_torch_load
393
 
394
+ # logging.basicConfig(level=logging.WARNING)
 
395
 
396
+ # # ============================================================================
397
+ # # --- CONFIGURATION AND CONSTANTS ---
398
+ # # ============================================================================
399
 
400
+ # WEIGHTS_PATH = 'best.pt'
401
+ # SCALE_FACTOR = 2.0
 
 
 
 
 
 
 
 
402
 
403
  # # Detection parameters
404
  # CONF_THRESHOLD = 0.2
 
406
  # IOU_MERGE_THRESHOLD = 0.4
407
  # IOA_SUPPRESSION_THRESHOLD = 0.7
408
 
409
+ # # Global counters (Reset per run)
410
+ # GLOBAL_FIGURE_COUNT = 0
411
+ # GLOBAL_EQUATION_COUNT = 0
412
+
413
  # # ============================================================================
414
+ # # --- BOX COMBINATION LOGIC (Retained for detection accuracy) ---
415
  # # ============================================================================
416
 
417
  # def calculate_iou(box1, box2):
 
456
 
457
  # def merge_overlapping_boxes(detections, iou_threshold):
458
  # if not detections: return []
 
459
  # detections.sort(key=lambda d: d['conf'], reverse=True)
460
  # merged_detections = []
461
  # is_merged = [False] * len(detections)
 
462
  # for i in range(len(detections)):
463
  # if is_merged[i]: continue
464
  # current_box = detections[i]['coords']
 
472
  # merged_x1 = min(merged_x1, other_box[0])
473
  # merged_y1 = min(merged_y1, other_box[1])
474
  # merged_x2 = max(merged_x2, other_box[2])
475
+ # merged_y2 = max(merged_y2, other_box[3])
476
  # is_merged[j] = True
477
  # merged_detections.append({
478
  # 'coords': (merged_x1, merged_y1, merged_x2, merged_y2),
479
+ # 'y1': merged_y1, 'class': current_class, 'conf': detections[i]['conf']
 
 
 
480
  # })
 
 
 
 
 
 
481
  # return merged_detections
482
 
483
  # # ============================================================================
484
+ # # --- UTILITY FUNCTIONS ---
485
  # # ============================================================================
486
 
487
  # def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
 
496
  # return img
497
 
498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  # def run_yolo_detection_and_count(
500
+ # image: np.ndarray, model: YOLO, page_num: int
501
+ # ) -> Tuple[int, int]:
 
502
  # """
503
+ # Runs YOLO inference, applies NMS/filtering, and updates global counters.
504
+ # Returns page counts only.
505
  # """
506
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
507
 
 
 
 
 
508
  # yolo_detections = []
509
+ # page_equations = 0
510
+ # page_figures = 0
511
+
512
  # try:
513
  # results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
514
+
515
  # if results and results[0].boxes:
516
  # for box in results[0].boxes.data.tolist():
517
  # x1, y1, x2, y2, conf, cls_id = box
518
  # cls_name = model.names[int(cls_id)]
519
+
520
  # if cls_name in TARGET_CLASSES:
521
  # yolo_detections.append({
522
+ # 'coords': (x1, y1, x2, y2),
523
+ # 'class': cls_name,
524
  # 'conf': conf
525
  # })
526
  # except Exception as e:
527
+ # logging.error(f"YOLO inference failed on page {page_num}: {e}")
528
+ # return 0, 0
529
 
530
+ # # Apply NMS/Merging/Filtering
531
  # merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
532
  # final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
533
 
534
+ # # Update Global Counters
 
535
  # for det in final_detections:
536
+ # if det['class'] == 'figure':
537
+ # GLOBAL_FIGURE_COUNT += 1
538
+ # page_figures += 1
539
+ # elif det['class'] == 'equation':
540
+ # GLOBAL_EQUATION_COUNT += 1
541
+ # page_equations += 1
 
 
 
 
 
 
 
 
 
 
 
542
 
543
+ # logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
544
+ # return page_equations, page_figures
 
545
 
546
 
547
  # # ============================================================================
548
+ # # --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for JSON serialization) ---
549
  # # ============================================================================
550
 
551
+ # # NOTE: The return signature now uses Dict[str, int] for the equation counts
552
+ # def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, float, Dict[str, int], List[str]]:
 
553
  # """
554
+ # Runs the pipeline, returns counts, report, total time, page counts dict (str keys), and empty list.
555
  # """
556
+ # global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 
 
 
557
  # start_time = time.time()
558
+ # log_messages = []
559
 
560
+ # # Dictionary to store {page_number (int): equation_count (int)}
561
+ # equation_counts_per_page: Dict[int, int] = {}
 
 
562
 
563
+ # # Reset globals
564
+ # GLOBAL_FIGURE_COUNT = 0
565
+ # GLOBAL_EQUATION_COUNT = 0
566
 
567
+ # # 1. Validation and Model Loading
568
  # t0 = time.time()
569
  # if not os.path.exists(pdf_path):
570
  # report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
 
572
 
573
  # try:
574
  # model = YOLO(WEIGHTS_PATH)
575
+ # logging.warning(f" Loaded YOLO model from: {WEIGHTS_PATH}")
576
  # except Exception as e:
577
  # report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
578
  # return 0, 0, 0, report, time.time() - start_time, {}, []
579
  # t1 = time.time()
580
+ # log_messages.append(f"Model Loading Time: {t1-t0:.4f}s")
581
 
582
+ # # 2. PDF Loading
583
  # t2 = time.time()
584
  # try:
585
  # doc = fitz.open(pdf_path)
586
  # total_pages = doc.page_count
587
+ # logging.warning(f" Opened PDF with {doc.page_count} pages")
588
  # except Exception as e:
589
  # report = f"❌ ERROR loading PDF file: {e}"
590
  # return 0, 0, 0, report, time.time() - start_time, {}, []
591
  # t3 = time.time()
592
+ # log_messages.append(f"PDF Initialization Time: {t3-t2:.4f}s")
593
 
594
  # mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
595
 
596
+ # # 3. Page Processing and Detection Loop
597
  # t4 = time.time()
598
  # for page_num_0_based in range(doc.page_count):
599
  # page_start_time = time.time()
 
607
  # original_img = pixmap_to_numpy(pix)
608
  # pix_time = time.time() - pix_start
609
  # except Exception as e:
610
+ # logging.error(f"Error converting page {page_num} to image: {e}. Skipping.")
611
  # continue
612
+
613
+ # # Core Detection
614
  # detect_start = time.time()
615
+ # page_equations, _ = run_yolo_detection_and_count(original_img, model, page_num)
 
 
 
 
 
 
 
 
 
 
616
  # detect_time = time.time() - detect_start
617
 
618
+ # # Store the count in the dictionary (INT keys)
619
+ # equation_counts_per_page[page_num] = page_equations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
  # page_total_time = time.time() - page_start_time
622
+ # log_messages.append(f"Page {page_num} Time: Total={page_total_time:.4f}s (Render={pix_time:.4f}s, Detect={detect_time:.4f}s)")
623
 
624
  # doc.close()
625
  # t5 = time.time()
626
  # detection_loop_time = t5 - t4
627
+ # log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
 
 
628
 
629
+ # # FIX APPLIED HERE: Convert integer keys to string keys for JSON serialization
630
+ # equation_counts_per_page_str_keys: Dict[str, int] = {
631
+ # str(k): v for k, v in equation_counts_per_page.items()
 
632
  # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
633
 
634
+ # # 4. Final Report Generation
635
  # total_execution_time = t5 - start_time
636
 
 
 
637
  # report = (
638
+ # f"✅ **YOLO Counting Complete!**\n\n"
639
  # f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
640
+ # f"**2) Total Equations Detected:** **{GLOBAL_EQUATION_COUNT}**\n"
641
+ # f"**3) Total Figures Detected:** **{GLOBAL_FIGURE_COUNT}**\n"
642
  # f"---\n"
643
  # f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
644
+ # f"### Detailed Step Timing\n"
645
+ # f"```\n"
646
+ # + "\n".join(log_messages) +
647
  # f"\n```"
648
  # )
649
 
650
+ # # Return the dictionary with string keys
651
+ # return total_pages, GLOBAL_EQUATION_COUNT, GLOBAL_FIGURE_COUNT, report, total_execution_time, equation_counts_per_page_str_keys, []
652
 
653
 
654
  # # ============================================================================
655
+ # # --- GRADIO INTERFACE FUNCTION (Updated) ---
656
  # # ============================================================================
657
 
658
+ # def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[str]]:
659
+ # """
660
+ # Gradio wrapper function to handle file upload and return results.
661
+ # """
662
  # if pdf_file is None:
663
+ # # Return an empty dict with string keys
664
  # return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
665
 
666
  # pdf_path = pdf_file.name
667
 
668
  # try:
669
+ # # Unpack the new return value: equation_counts_per_page (with string keys)
670
+ # num_pages, num_equations, num_figures, report, total_time, equation_counts_per_page, _ = run_single_pdf_preprocessing(
671
+ # pdf_path
672
+ # )
 
 
 
 
 
 
673
 
674
+ # # Return results (6 items now)
675
+ # return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, []
676
 
677
  # except Exception as e:
678
  # error_msg = f"An unexpected error occurred: {e}"
679
+ # logging.error(error_msg, exc_info=True)
680
+ # # Return an empty dict on error
681
+ # return "Error", "Error", "Error", error_msg, {}, []
682
+
683
 
684
+ # # ============================================================================
685
+ # # --- GRADIO INTERFACE DEFINITION (Updated) ---
686
+ # # ============================================================================
687
 
688
  # if __name__ == "__main__":
689
 
 
692
 
693
  # input_file = gr.File(label="Upload PDF Document", type="filepath", file_types=[".pdf"])
694
 
695
+ # # Outputs
696
  # output_pages = gr.Textbox(label="Total Pages in PDF", interactive=False)
697
  # output_equations = gr.Textbox(label="Total Equations Detected", interactive=False)
698
  # output_figures = gr.Textbox(label="Total Figures Detected", interactive=False)
699
+ # output_report = gr.Markdown(label="Processing Summary and Timing")
700
 
701
+ # # NEW OUTPUT: JSON component for structured data
702
+ # output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
703
 
704
+ # # Gradio Gallery is retained but will receive an empty list []
705
  # output_gallery = gr.Gallery(
706
+ # label="Detected Equations (Disabled for Speed)",
707
+ # columns=5,
708
  # height="auto",
709
  # object_fit="contain",
710
  # allow_preview=False
 
713
  # interface = gr.Interface(
714
  # fn=gradio_process_pdf,
715
  # inputs=input_file,
716
+ # # Outputs list remains the same, but the JSON component now receives string keys.
717
  # outputs=[
718
  # output_pages,
719
  # output_equations,
720
  # output_figures,
721
  # output_report,
722
+ # output_page_counts,
723
  # output_gallery
724
  # ],
725
+ # title="📊 YOLO Counting with Per-Page Data & Timing",
726
  # description=(
727
+ # "Upload a PDF to run YOLO detection. The results include total counts, a breakdown of "
728
+ # "equation counts per page (in JSON format), and detailed timing."
729
  # ),
730
  # )
731
 
732
  # print("\nStarting Gradio application...")
733
  # interface.launch(inbrowser=True)
734