Spaces:

heerjtdev
/

feeedback

Running

App Files Files Community

heerjtdev committed 24 days ago

Commit

3dd4c9e

verified ·

1 Parent(s): 0add556

Update app.py

Browse files

Files changed (1) hide show

app.py +302 -25

app.py CHANGED Viewed

@@ -1296,6 +1296,73 @@ def get_latex_from_base64(base64_string: str) -> str:
 # --- UPDATED: page width argument removed from signature and call ---
 def run_yolo_detection_and_count(
         image: np.ndarray, model: YOLO, page_num: int,
         current_eq_count: int, current_fig_count: int
@@ -1327,12 +1394,9 @@ def run_yolo_detection_and_count(
         logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
         return [], eq_counter, fig_counter
-    # Call merge_overlapping_boxes without page_width
     merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
-    # Note: final_detections is now sorted purely by y1
     for det in final_detections:
         bbox = det["coords"]
         crop_pil = crop_and_convert_to_pil(image, bbox)
@@ -1341,6 +1405,7 @@ def run_yolo_detection_and_count(
             "type": det["class"],
             "coords": bbox,
             "pil_image": crop_pil,
         }
         if det["class"] == "equation":
@@ -1357,10 +1422,182 @@ def run_yolo_detection_and_count(
     return detected_items, eq_counter, fig_counter
 # ============================================================================
 # --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
 # ============================================================================
 def run_single_pdf_preprocessing(
     pdf_path: str
 ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
@@ -1408,7 +1645,7 @@ def run_single_pdf_preprocessing(
     mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
-    # 3. Page Processing, Detection, and OCR Loop
     t4 = time.time()
     for page_num_0_based in range(doc.page_count):
         page_start_time = time.time()
@@ -1440,36 +1677,66 @@ def run_single_pdf_preprocessing(
         )
         detect_time = time.time() - detect_start
-        # --- OCR/LaTeX Conversion and Logging ---
-        ocr_total_time = 0
-        page_equations = 0
-        for item in page_extracted_items:
-            if item["type"] == "equation":
-                page_equations += 1
-                ocr_start = time.time()
-                b64_string = pil_to_base64(item["pil_image"])
-                item["latex"] = get_latex_from_base64(b64_string)
-                ocr_time = time.time() - ocr_start
-                ocr_total_time += ocr_time
-                logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
         all_extracted_items.extend(page_extracted_items)
         page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
         page_total_time = time.time() - page_start_time
-        logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
     doc.close()
     t5 = time.time()
     detection_loop_time = t5 - t4
-    logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
-    # 4. Final Report Generation and Gallery Formatting
     # Create the structured JSON output as requested by the user
     structured_latex_output = {
@@ -1493,7 +1760,7 @@ def run_single_pdf_preprocessing(
         gallery_items.append((item["pil_image"], image_label))
-    total_execution_time = t5 - start_time
     full_log = log_stream.getvalue()
@@ -1514,6 +1781,16 @@ def run_single_pdf_preprocessing(
     return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
 # ============================================================================
 # --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
 # ============================================================================

 # --- UPDATED: page width argument removed from signature and call ---
+# def run_yolo_detection_and_count(
+#         image: np.ndarray, model: YOLO, page_num: int,
+#         current_eq_count: int, current_fig_count: int
+# ) -> Tuple[List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]], int, int]:
+#     """
+#     Performs YOLO detection and returns a list of detected item dictionaries
+#     and the updated total counters.
+#     """
+#     eq_counter = current_eq_count
+#     fig_counter = current_fig_count
+#     detected_items: List[Dict[str, Union[Image.Image, str, Tuple[float,...]]]] = []
+#     yolo_detections = []
+#     try:
+#         results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
+#         if results and results[0].boxes:
+#             for box in results[0].boxes.data.tolist():
+#                 x1, y1, x2, y2, conf, cls_id = box
+#                 cls_name = model.names[int(cls_id)]
+#                 if cls_name in TARGET_CLASSES:
+#                     yolo_detections.append({
+#                         'coords': (x1, y1, x2, y2),
+#                         'class': cls_name,
+#                         'conf': conf
+#                     })
+#     except Exception as e:
+#         logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
+#         return [], eq_counter, fig_counter
+#     # Call merge_overlapping_boxes without page_width
+#     merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
+#     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
+#     # Note: final_detections is now sorted purely by y1
+#     for det in final_detections:
+#         bbox = det["coords"]
+#         crop_pil = crop_and_convert_to_pil(image, bbox)
+#         item = {
+#             "type": det["class"],
+#             "coords": bbox,
+#             "pil_image": crop_pil,
+#         }
+#         if det["class"] == "equation":
+#             eq_counter += 1
+#             item["id"] = f"EQUATION{eq_counter}"
+#             item["latex"] = ""
+#         elif det["class"] == "figure":
+#             fig_counter += 1
+#             item["id"] = f"FIGURE{fig_counter}"
+#             item["latex"] = "[FIGURE - No LaTeX]"
+#         detected_items.append(item)
+#     return detected_items, eq_counter, fig_counter
 def run_yolo_detection_and_count(
         image: np.ndarray, model: YOLO, page_num: int,
         current_eq_count: int, current_fig_count: int
         logging.error(f"ERROR: YOLO inference failed on page {page_num}: {e}")
         return [], eq_counter, fig_counter
     merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
     for det in final_detections:
         bbox = det["coords"]
         crop_pil = crop_and_convert_to_pil(image, bbox)
             "type": det["class"],
             "coords": bbox,
             "pil_image": crop_pil,
+            "page_num": page_num,  # recorded so items can later be sorted by page, then y-coordinate
         }
         if det["class"] == "equation":
     return detected_items, eq_counter, fig_counter
 # ============================================================================
 # --- MAIN DOCUMENT PROCESSING FUNCTION (Retained Logic) ---
 # ============================================================================
+# def run_single_pdf_preprocessing(
+#     pdf_path: str
+# ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
+#     """
+#     Runs the pipeline, performs OCR, and returns final results.
+#     """
+#     log_stream.truncate(0)
+#     log_stream.seek(0)
+#     start_time = time.time()
+#     all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []
+#     total_figure_count = 0
+#     total_equation_count = 0
+#     # 1. Validation and Model Loading (YOLO)
+#     t0 = time.time()
+#     if not os.path.exists(pdf_path):
+#         report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
+#         return 0, 0, 0, report, time.time() - start_time, {}, []
+#     try:
+#         model = YOLO(WEIGHTS_PATH)
+#         logging.warning(f"INFO: Loaded YOLO model from: {WEIGHTS_PATH}")
+#     except Exception as e:
+#         report = f"❌ ERROR loading YOLO model: {e}\n(Ensure 'best.pt' is available and valid.)"
+#         return 0, 0, 0, report, time.time() - start_time, {}, []
+#     t1 = time.time()
+#     logging.warning(f"INFO: Model Loading Time: {t1-t0:.4f}s")
+#     # 2. PDF Loading (fitz)
+#     t2 = time.time()
+#     try:
+#         doc = fitz.open(pdf_path)
+#         total_pages = doc.page_count
+#         logging.warning(f"INFO: Opened PDF with {doc.page_count} pages")
+#     except Exception as e:
+#         report = f"❌ ERROR loading PDF file: {e}"
+#         return 0, 0, 0, report, time.time() - start_time, {}, []
+#     t3 = time.time()
+#     logging.warning(f"INFO: PDF Initialization Time: {t3-t2:.4f}s")
+#     mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
+#     # 3. Page Processing, Detection, and OCR Loop
+#     t4 = time.time()
+#     for page_num_0_based in range(doc.page_count):
+#         page_start_time = time.time()
+#         fitz_page = doc.load_page(page_num_0_based)
+#         page_num = page_num_0_based + 1
+#         # Render page to image for YOLO
+#         try:
+#             pix_start = time.time()
+#             pix = fitz_page.get_pixmap(matrix=mat)
+#             original_img = pixmap_to_numpy(pix)
+#             pix_time = time.time() - pix_start
+#         except Exception as e:
+#             logging.error(f"ERROR: Error converting page {page_num} to image: {e}. Skipping.")
+#             continue
+#         # YOLO Detection
+#         detect_start = time.time()
+#         (
+#             page_extracted_items,
+#             total_equation_count,
+#             total_figure_count
+#         ) = run_yolo_detection_and_count(
+#             original_img,
+#             model,
+#             page_num,
+#             total_equation_count,
+#             total_figure_count
+#         )
+#         detect_time = time.time() - detect_start
+#         # --- OCR/LaTeX Conversion and Logging ---
+#         ocr_total_time = 0
+#         page_equations = 0
+#         for item in page_extracted_items:
+#             if item["type"] == "equation":
+#                 page_equations += 1
+#                 ocr_start = time.time()
+#                 b64_string = pil_to_base64(item["pil_image"])
+#                 item["latex"] = get_latex_from_base64(b64_string)
+#                 ocr_time = time.time() - ocr_start
+#                 ocr_total_time += ocr_time
+#                 logging.warning(f"LATEX: Page {page_num}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
+#         all_extracted_items.extend(page_extracted_items)
+#         page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
+#         page_total_time = time.time() - page_start_time
+#         logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s, OCR Total={ocr_total_time:.4f}s)")
+#     doc.close()
+#     t5 = time.time()
+#     detection_loop_time = t5 - t4
+#     logging.warning(f"INFO: Total Detection and OCR Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
+#     # 4. Final Report Generation and Gallery Formatting
+#     # Create the structured JSON output as requested by the user
+#     structured_latex_output = {
+#         "Total Pages": total_pages,
+#         "Total Equations": total_equation_count,
+#     }
+#     for item in all_extracted_items:
+#         if item["type"] == "equation":
+#             # Map EQUATION ID to LaTeX code
+#             structured_latex_output[item["id"]] = item["latex"]
+#     # Format the extracted items for the Gradio Gallery
+#     gallery_items: List[Tuple[Image.Image, str]] = []
+#     for item in all_extracted_items:
+#         image_label = item["id"]
+#         if item["type"] == "equation":
+#             image_label = f'{item["id"]}: {item["latex"]}'
+#         gallery_items.append((item["pil_image"], image_label))
+#     total_execution_time = t5 - start_time
+#     full_log = log_stream.getvalue()
+#     report = (
+#         f"✅ **YOLO Counting & OCR Complete!**\n\n"
+#         f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
+#         f"**2) Total Equations Detected:** **{total_equation_count}**\n"
+#         f"**3) Total Figures Detected:** **{total_figure_count}**\n"
+#         f"---\n"
+#         f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
+#         f"### Full Processing Log\n"
+#         f"```text\n"
+#         f"{full_log}"
+#         f"\n```"
+#     )
+#     # Return the new structured_latex_output instead of the page counts
+#     return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
 def run_single_pdf_preprocessing(
     pdf_path: str
 ) -> Tuple[int, int, int, str, float, Dict[str, Union[int, str]], List[Tuple[Image.Image, str]]]:
     mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
+    # 3. Page Processing and Detection Loop
     t4 = time.time()
     for page_num_0_based in range(doc.page_count):
         page_start_time = time.time()
         )
         detect_time = time.time() - detect_start
+        # Store items (OCR will be done later in correct order)
         all_extracted_items.extend(page_extracted_items)
         page_figures = sum(1 for item in page_extracted_items if item["type"] == "figure")
+        page_equations = sum(1 for item in page_extracted_items if item["type"] == "equation")
         page_total_time = time.time() - page_start_time
+        logging.warning(f"SUMMARY: Page {page_num}: EQs={page_equations}, Figs={page_figures} | Page Time: {page_total_time:.4f}s (Detect={detect_time:.4f}s)")
     doc.close()
     t5 = time.time()
     detection_loop_time = t5 - t4
+    logging.warning(f"INFO: Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
+    # 4. Sort all items by page number, then by y-coordinate
+    logging.warning(f"INFO: Sorting {len(all_extracted_items)} items by page and position...")
+    all_extracted_items.sort(key=lambda item: (item['page_num'], item['coords'][1]))
+    # 5. Re-assign IDs in the correct order
+    equation_counter = 0
+    figure_counter = 0
+    for item in all_extracted_items:
+        if item["type"] == "equation":
+            equation_counter += 1
+            item["id"] = f"EQUATION{equation_counter}"
+        elif item["type"] == "figure":
+            figure_counter += 1
+            item["id"] = f"FIGURE{figure_counter}"
+    # Update the total counts with the correct values
+    total_equation_count = equation_counter
+    total_figure_count = figure_counter
+    logging.warning(f"INFO: Re-numbered items - Total Equations: {total_equation_count}, Total Figures: {total_figure_count}")
+    # 6. Perform OCR in the correct order
+    t6 = time.time()
+    ocr_total_time = 0
+    logging.warning(f"INFO: Starting OCR for {total_equation_count} equations in correct order...")
+    for item in all_extracted_items:
+        if item["type"] == "equation":
+            ocr_start = time.time()
+            b64_string = pil_to_base64(item["pil_image"])
+            item["latex"] = get_latex_from_base64(b64_string)
+            ocr_time = time.time() - ocr_start
+            ocr_total_time += ocr_time
+            logging.warning(f"LATEX: Page {item['page_num']}, ID {item['id']} -> Time: {ocr_time:.4f}s, Formula: {item['latex'][:50]}...")
+        elif item["type"] == "figure":
+            item["latex"] = "[FIGURE - No LaTeX]"
+    t7 = time.time()
+    logging.warning(f"INFO: Total OCR Time: {ocr_total_time:.4f}s")
+    # 7. Final Report Generation and Gallery Formatting
     # Create the structured JSON output as requested by the user
     structured_latex_output = {
         gallery_items.append((item["pil_image"], image_label))
+    total_execution_time = t7 - start_time
     full_log = log_stream.getvalue()
     return total_pages, total_equation_count, total_figure_count, report, total_execution_time, structured_latex_output, gallery_items
 # ============================================================================
 # --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
 # ============================================================================