Update app.py
Browse files
app.py
CHANGED
|
@@ -581,8 +581,6 @@
|
|
| 581 |
|
| 582 |
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
| 586 |
import base64
|
| 587 |
from PIL import Image
|
| 588 |
import re
|
|
@@ -621,17 +619,16 @@ logging.basicConfig(level=logging.WARNING)
|
|
| 621 |
WEIGHTS_PATH = 'best.pt'
|
| 622 |
SCALE_FACTOR = 2.0
|
| 623 |
|
| 624 |
-
# --- OCR Model Initialization
|
| 625 |
from transformers import TrOCRProcessor
|
| 626 |
from optimum.onnxruntime import ORTModelForVision2Seq
|
| 627 |
|
| 628 |
MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
|
| 629 |
-
# Note: These models are kept global but unused in the main flow,
|
| 630 |
-
# as the user did not explicitly ask to remove the heavy OCR dependency yet.
|
| 631 |
try:
|
| 632 |
processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
|
| 633 |
ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
|
| 634 |
except Exception as e:
|
|
|
|
| 635 |
logging.warning(f"OCR model loading failed (expected if dependencies are missing): {e}")
|
| 636 |
processor = None
|
| 637 |
ort_model = None
|
|
@@ -642,10 +639,7 @@ TARGET_CLASSES = ['figure', 'equation']
|
|
| 642 |
IOU_MERGE_THRESHOLD = 0.4
|
| 643 |
IOA_SUPPRESSION_THRESHOLD = 0.7
|
| 644 |
|
| 645 |
-
#
|
| 646 |
-
# GLOBAL_FIGURE_COUNT = 0
|
| 647 |
-
# GLOBAL_EQUATION_COUNT = 0
|
| 648 |
-
|
| 649 |
|
| 650 |
# ============================================================================
|
| 651 |
# --- BOX COMBINATION LOGIC (Retained) ---
|
|
@@ -718,7 +712,7 @@ def merge_overlapping_boxes(detections, iou_threshold):
|
|
| 718 |
return merged_detections
|
| 719 |
|
| 720 |
# ============================================================================
|
| 721 |
-
# --- UTILITY FUNCTIONS ---
|
| 722 |
# ============================================================================
|
| 723 |
|
| 724 |
def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
|
|
@@ -733,7 +727,9 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
|
|
| 733 |
return img
|
| 734 |
|
| 735 |
|
| 736 |
-
|
|
|
|
|
|
|
| 737 |
x1, y1, x2, y2 = map(int, bbox)
|
| 738 |
h, w, _ = image.shape
|
| 739 |
|
|
@@ -742,27 +738,22 @@ def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, floa
|
|
| 742 |
x2 = min(w, x2)
|
| 743 |
y2 = min(h, y2)
|
| 744 |
|
| 745 |
-
|
| 746 |
-
|
|
|
|
|
|
|
| 747 |
|
| 748 |
-
return
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
# --- NEW: Function to format base64 for Gradio Gallery ---
|
| 752 |
-
def base64_to_gradio_gallery_tuple(base64_str: str, label: str) -> Tuple[str, str]:
|
| 753 |
-
"""Converts raw base64 to a data URI tuple for Gradio Gallery."""
|
| 754 |
-
# Format: ('data:image/png;base64,...', 'label')
|
| 755 |
-
return (f"data:image/png;base64,{base64_str}", label)
|
| 756 |
|
| 757 |
|
| 758 |
-
# --- UPDATED: run_yolo_detection_and_count
|
| 759 |
def run_yolo_detection_and_count(
|
| 760 |
image: np.ndarray, model: YOLO, page_num: int,
|
| 761 |
current_eq_count: int, current_fig_count: int
|
| 762 |
-
) -> Tuple[int, int, List[
|
| 763 |
"""
|
| 764 |
-
Performs YOLO detection and returns page counts, detected items,
|
| 765 |
-
and the updated
|
| 766 |
"""
|
| 767 |
|
| 768 |
# Use the passed counters as starting points for this page
|
|
@@ -771,7 +762,8 @@ def run_yolo_detection_and_count(
|
|
| 771 |
|
| 772 |
page_equations = 0
|
| 773 |
page_figures = 0
|
| 774 |
-
detected_items
|
|
|
|
| 775 |
yolo_detections = []
|
| 776 |
|
| 777 |
try:
|
|
@@ -797,38 +789,30 @@ def run_yolo_detection_and_count(
|
|
| 797 |
|
| 798 |
for det in final_detections:
|
| 799 |
bbox = det["coords"]
|
|
|
|
|
|
|
|
|
|
| 800 |
|
| 801 |
if det["class"] == "equation":
|
| 802 |
eq_counter += 1
|
| 803 |
page_equations += 1
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
detected_items.append({
|
| 807 |
-
"type": "equation",
|
| 808 |
-
"id": f"EQUATION{eq_counter}",
|
| 809 |
-
"base64": b64
|
| 810 |
-
})
|
| 811 |
|
| 812 |
elif det["class"] == "figure":
|
| 813 |
fig_counter += 1
|
| 814 |
page_figures += 1
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
detected_items.append({
|
| 818 |
-
"type": "figure",
|
| 819 |
-
"id": f"FIGURE{fig_counter}",
|
| 820 |
-
"base64": b64
|
| 821 |
-
})
|
| 822 |
|
| 823 |
logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
|
| 824 |
-
# Return page counts, detected items, and the UPDATED total counters
|
| 825 |
return page_equations, page_figures, detected_items, eq_counter, fig_counter
|
| 826 |
|
| 827 |
|
| 828 |
-
# --- Other unused functions (get_latex_from_base64, etc.) are kept but not modified as
|
| 829 |
-
# the focus is on the concurrency and Gradio Gallery fix. ---
|
| 830 |
-
|
| 831 |
def get_latex_from_base64(base64_string: str) -> str:
|
|
|
|
|
|
|
| 832 |
if ort_model is None or processor is None:
|
| 833 |
return "[MODEL_ERROR: Model not initialized]"
|
| 834 |
|
|
@@ -852,7 +836,33 @@ def get_latex_from_base64(base64_string: str) -> str:
|
|
| 852 |
return f"[TR_OCR_ERROR: {e}]"
|
| 853 |
|
| 854 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 855 |
def embed_images_as_base64_in_memory(structured_data, detected_items):
|
|
|
|
|
|
|
|
|
|
| 856 |
tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
|
| 857 |
|
| 858 |
item_lookup = {d["id"]: d for d in detected_items}
|
|
@@ -882,41 +892,59 @@ def embed_images_as_base64_in_memory(structured_data, detected_items):
|
|
| 882 |
continue
|
| 883 |
|
| 884 |
entry = item_lookup[tag]
|
|
|
|
|
|
|
|
|
|
| 885 |
|
| 886 |
-
if entry["type"] == "equation":
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
item[base_key] = entry["base64"]
|
| 891 |
|
| 892 |
final_data.append(item)
|
| 893 |
|
| 894 |
return final_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 895 |
|
| 896 |
# ============================================================================
|
| 897 |
-
# --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for
|
| 898 |
# ============================================================================
|
| 899 |
|
| 900 |
-
# --- UPDATED return type for clarity ---
|
| 901 |
def run_single_pdf_preprocessing(
|
| 902 |
pdf_path: str
|
| 903 |
-
) -> Tuple[int, int, int, str, float, Dict[str, int], List[Tuple[
|
| 904 |
"""
|
| 905 |
Runs the pipeline, returns counts, report, total time, page counts dict (str keys),
|
| 906 |
-
and a list of (
|
| 907 |
"""
|
| 908 |
|
| 909 |
-
# --- INITIALIZE LOCAL COUNTERS ---
|
| 910 |
start_time = time.time()
|
| 911 |
log_messages = []
|
| 912 |
|
| 913 |
-
# This list now holds (
|
| 914 |
-
all_gradio_gallery_items: List[Tuple[
|
| 915 |
|
| 916 |
# Dictionary to store {page_number (int): equation_count (int)}
|
| 917 |
equation_counts_per_page: Dict[int, int] = {}
|
| 918 |
|
| 919 |
-
#
|
| 920 |
total_figure_count = 0
|
| 921 |
total_equation_count = 0
|
| 922 |
|
|
@@ -925,7 +953,6 @@ def run_single_pdf_preprocessing(
|
|
| 925 |
t0 = time.time()
|
| 926 |
if not os.path.exists(pdf_path):
|
| 927 |
report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
|
| 928 |
-
# Return empty list of tuples for gallery on error
|
| 929 |
return 0, 0, 0, report, time.time() - start_time, {}, []
|
| 930 |
|
| 931 |
try:
|
|
@@ -970,11 +997,10 @@ def run_single_pdf_preprocessing(
|
|
| 970 |
|
| 971 |
# Core Detection
|
| 972 |
detect_start = time.time()
|
| 973 |
-
# --- PASSING AND RECEIVING THE COUNTERS HERE (Concurrency Fix) ---
|
| 974 |
(
|
| 975 |
page_equations,
|
| 976 |
page_figures,
|
| 977 |
-
|
| 978 |
total_equation_count,
|
| 979 |
total_figure_count
|
| 980 |
) = run_yolo_detection_and_count(
|
|
@@ -985,10 +1011,8 @@ def run_single_pdf_preprocessing(
|
|
| 985 |
total_figure_count
|
| 986 |
)
|
| 987 |
|
| 988 |
-
#
|
| 989 |
-
|
| 990 |
-
gradio_tuple = base64_to_gradio_gallery_tuple(item["base64"], item["id"])
|
| 991 |
-
all_gradio_gallery_items.append(gradio_tuple)
|
| 992 |
|
| 993 |
detect_time = time.time() - detect_start
|
| 994 |
|
|
@@ -1014,8 +1038,8 @@ def run_single_pdf_preprocessing(
|
|
| 1014 |
report = (
|
| 1015 |
f"✅ **YOLO Counting Complete!**\n\n"
|
| 1016 |
f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
|
| 1017 |
-
f"**2) Total Equations Detected:** **{total_equation_count}**\n"
|
| 1018 |
-
f"**3) Total Figures Detected:** **{total_figure_count}**\n"
|
| 1019 |
f"---\n"
|
| 1020 |
f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
|
| 1021 |
f"### Detailed Step Timing\n"
|
|
@@ -1024,27 +1048,25 @@ def run_single_pdf_preprocessing(
|
|
| 1024 |
f"\n```"
|
| 1025 |
)
|
| 1026 |
|
| 1027 |
-
# Return the dictionary with string keys and the properly formatted gallery items
|
| 1028 |
return total_pages, total_equation_count, total_figure_count, report, total_execution_time, equation_counts_per_page_str_keys, all_gradio_gallery_items
|
| 1029 |
|
| 1030 |
|
| 1031 |
# ============================================================================
|
| 1032 |
-
# --- GRADIO INTERFACE FUNCTION (Updated) ---
|
| 1033 |
# ============================================================================
|
| 1034 |
|
| 1035 |
-
#
|
| 1036 |
-
def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[Tuple[
|
| 1037 |
"""
|
| 1038 |
Gradio wrapper function to handle file upload and return results.
|
| 1039 |
"""
|
| 1040 |
if pdf_file is None:
|
| 1041 |
-
# Return empty list of tuples for gallery on error
|
| 1042 |
return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
|
| 1043 |
|
| 1044 |
pdf_path = pdf_file.name
|
| 1045 |
|
| 1046 |
try:
|
| 1047 |
-
# Unpack the new return value: equation_counts_per_page (with string keys)
|
| 1048 |
(
|
| 1049 |
num_pages,
|
| 1050 |
num_equations,
|
|
@@ -1052,23 +1074,21 @@ def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], Li
|
|
| 1052 |
report,
|
| 1053 |
total_time,
|
| 1054 |
equation_counts_per_page,
|
| 1055 |
-
gallery_items # Now
|
| 1056 |
) = run_single_pdf_preprocessing(pdf_path)
|
| 1057 |
|
| 1058 |
|
| 1059 |
-
# Return results (6 items now)
|
| 1060 |
return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, gallery_items
|
| 1061 |
|
| 1062 |
|
| 1063 |
except Exception as e:
|
| 1064 |
error_msg = f"An unexpected error occurred: {e}"
|
| 1065 |
logging.error(error_msg, exc_info=True)
|
| 1066 |
-
# Return empty list of tuples for gallery on error
|
| 1067 |
return "Error", "Error", "Error", error_msg, {}, []
|
| 1068 |
|
| 1069 |
|
| 1070 |
# ============================================================================
|
| 1071 |
-
# --- GRADIO INTERFACE DEFINITION (
|
| 1072 |
# ============================================================================
|
| 1073 |
|
| 1074 |
if __name__ == "__main__":
|
|
@@ -1087,9 +1107,9 @@ if __name__ == "__main__":
|
|
| 1087 |
# NEW OUTPUT: JSON component for structured data
|
| 1088 |
output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
|
| 1089 |
|
| 1090 |
-
# Gradio Gallery is
|
| 1091 |
output_gallery = gr.Gallery(
|
| 1092 |
-
label="Detected Items (Gallery
|
| 1093 |
columns=5,
|
| 1094 |
height="auto",
|
| 1095 |
object_fit="contain",
|
|
@@ -1099,7 +1119,6 @@ if __name__ == "__main__":
|
|
| 1099 |
interface = gr.Interface(
|
| 1100 |
fn=gradio_process_pdf,
|
| 1101 |
inputs=input_file,
|
| 1102 |
-
# Outputs list remains the same, but the gallery now works
|
| 1103 |
outputs=[
|
| 1104 |
output_pages,
|
| 1105 |
output_equations,
|
|
@@ -1108,11 +1127,11 @@ if __name__ == "__main__":
|
|
| 1108 |
output_page_counts,
|
| 1109 |
output_gallery
|
| 1110 |
],
|
| 1111 |
-
title="📊 YOLO Counting with Per-Page Data & Timing (
|
| 1112 |
description=(
|
| 1113 |
-
"Upload a PDF to run YOLO detection.
|
| 1114 |
),
|
| 1115 |
)
|
| 1116 |
|
| 1117 |
print("\nStarting Gradio application...")
|
| 1118 |
-
interface.launch(inbrowser=True)
|
|
|
|
| 581 |
|
| 582 |
|
| 583 |
|
|
|
|
|
|
|
| 584 |
import base64
|
| 585 |
from PIL import Image
|
| 586 |
import re
|
|
|
|
| 619 |
WEIGHTS_PATH = 'best.pt'
|
| 620 |
SCALE_FACTOR = 2.0
|
| 621 |
|
| 622 |
+
# --- OCR Model Initialization ---
|
| 623 |
from transformers import TrOCRProcessor
|
| 624 |
from optimum.onnxruntime import ORTModelForVision2Seq
|
| 625 |
|
| 626 |
MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
|
|
|
|
|
|
|
| 627 |
try:
|
| 628 |
processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
|
| 629 |
ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
|
| 630 |
except Exception as e:
|
| 631 |
+
# This warning is included to alert the user if the optional, unused dependencies fail
|
| 632 |
logging.warning(f"OCR model loading failed (expected if dependencies are missing): {e}")
|
| 633 |
processor = None
|
| 634 |
ort_model = None
|
|
|
|
| 639 |
IOU_MERGE_THRESHOLD = 0.4
|
| 640 |
IOA_SUPPRESSION_THRESHOLD = 0.7
|
| 641 |
|
| 642 |
+
# Note: The original GLOBAL_COUNT variables have been removed to fix concurrency.
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
# ============================================================================
|
| 645 |
# --- BOX COMBINATION LOGIC (Retained) ---
|
|
|
|
| 712 |
return merged_detections
|
| 713 |
|
| 714 |
# ============================================================================
|
| 715 |
+
# --- UTILITY FUNCTIONS (Updated for PIL/Concurrency) ---
|
| 716 |
# ============================================================================
|
| 717 |
|
| 718 |
def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
|
|
|
|
| 727 |
return img
|
| 728 |
|
| 729 |
|
| 730 |
+
# --- PIL-based replacement for crop_and_convert_to_base64 (the original is retained further down) ---
|
| 731 |
+
def crop_and_convert_to_pil(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> Image.Image:
|
| 732 |
+
"""Crops the numpy array and returns a PIL Image object."""
|
| 733 |
x1, y1, x2, y2 = map(int, bbox)
|
| 734 |
h, w, _ = image.shape
|
| 735 |
|
|
|
|
| 738 |
x2 = min(w, x2)
|
| 739 |
y2 = min(h, y2)
|
| 740 |
|
| 741 |
+
crop_np = image[y1:y2, x1:x2]
|
| 742 |
+
# Convert OpenCV/BGR (if applicable) or RGB numpy array to PIL Image
|
| 743 |
+
# Using BGR2RGB conversion just in case OpenCV read the image in BGR format
|
| 744 |
+
crop_pil = Image.fromarray(cv2.cvtColor(crop_np, cv2.COLOR_BGR2RGB))
|
| 745 |
|
| 746 |
+
return crop_pil
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
|
| 748 |
|
| 749 |
+
# --- UPDATED: run_yolo_detection_and_count for Concurrency and PIL output ---
|
| 750 |
def run_yolo_detection_and_count(
|
| 751 |
image: np.ndarray, model: YOLO, page_num: int,
|
| 752 |
current_eq_count: int, current_fig_count: int
|
| 753 |
+
) -> Tuple[int, int, List[Tuple[Image.Image, str]], int, int]:
|
| 754 |
"""
|
| 755 |
+
Performs YOLO detection and returns page counts, detected items as (PIL.Image, label),
|
| 756 |
+
and the updated total counters.
|
| 757 |
"""
|
| 758 |
|
| 759 |
# Use the passed counters as starting points for this page
|
|
|
|
| 762 |
|
| 763 |
page_equations = 0
|
| 764 |
page_figures = 0
|
| 765 |
+
# Change: detected_items now holds (PIL.Image, label) for direct Gradio use
|
| 766 |
+
detected_items: List[Tuple[Image.Image, str]] = []
|
| 767 |
yolo_detections = []
|
| 768 |
|
| 769 |
try:
|
|
|
|
| 789 |
|
| 790 |
for det in final_detections:
|
| 791 |
bbox = det["coords"]
|
| 792 |
+
|
| 793 |
+
# --- NEW: Get PIL image directly ---
|
| 794 |
+
crop_pil = crop_and_convert_to_pil(image, bbox)
|
| 795 |
|
| 796 |
if det["class"] == "equation":
|
| 797 |
eq_counter += 1
|
| 798 |
page_equations += 1
|
| 799 |
+
label = f"EQUATION{eq_counter}"
|
| 800 |
+
detected_items.append((crop_pil, label))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
|
| 802 |
elif det["class"] == "figure":
|
| 803 |
fig_counter += 1
|
| 804 |
page_figures += 1
|
| 805 |
+
label = f"FIGURE{fig_counter}"
|
| 806 |
+
detected_items.append((crop_pil, label))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
|
| 808 |
logging.warning(f" -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
|
| 809 |
+
# Return page counts, detected items (as PIL tuples), and the UPDATED total counters
|
| 810 |
return page_equations, page_figures, detected_items, eq_counter, fig_counter
|
| 811 |
|
| 812 |
|
|
|
|
|
|
|
|
|
|
| 813 |
def get_latex_from_base64(base64_string: str) -> str:
|
| 814 |
+
# NOTE: This function still expects base64 input,
|
| 815 |
+
# but the main detection flow no longer provides it.
|
| 816 |
if ort_model is None or processor is None:
|
| 817 |
return "[MODEL_ERROR: Model not initialized]"
|
| 818 |
|
|
|
|
| 836 |
return f"[TR_OCR_ERROR: {e}]"
|
| 837 |
|
| 838 |
|
| 839 |
+
def extract_images_from_page_in_memory(page) -> Dict[str, str]:
    """
    Extract every embedded image on a page as a base64 string.

    The result maps "FIGURE<n>" to the base64-encoded raw image bytes, where
    <n> is the 1-based position of the image in ``page.get_images(full=True)``.
    Only "FIGURE" keys are produced; no "EQUATION" keys are emitted.

    (NOTE: This is unused dead code from the original script, retained as requested)

    Args:
        page: Object exposing ``get_images(full=True)`` and a ``parent`` with
            ``extract_image(xref)`` (presumably a PyMuPDF page -- confirm
            against the caller).

    Returns:
        Dict of "FIGURE<n>" -> base64 text. Entries whose extraction yields no
        image bytes are skipped, so the numbering may contain gaps.
    """
    image_map: Dict[str, str] = {}

    for idx, img in enumerate(page.get_images(full=True), start=1):
        xref = img[0]  # the code treats the first field of each entry as the xref
        extracted = page.parent.extract_image(xref)

        # extract_image may hand back None / an empty dict for an xref that
        # cannot be extracted; skip it instead of crashing on the lookup.
        image_bytes = extracted.get("image") if extracted else None
        if not image_bytes:
            continue

        image_map[f"FIGURE{idx}"] = base64.b64encode(image_bytes).decode("utf-8")

    return image_map
|
| 860 |
+
|
| 861 |
+
|
| 862 |
def embed_images_as_base64_in_memory(structured_data, detected_items):
|
| 863 |
+
"""
|
| 864 |
+
(NOTE: This is unused dead code from the original script, retained as requested)
|
| 865 |
+
"""
|
| 866 |
tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
|
| 867 |
|
| 868 |
item_lookup = {d["id"]: d for d in detected_items}
|
|
|
|
| 892 |
continue
|
| 893 |
|
| 894 |
entry = item_lookup[tag]
|
| 895 |
+
# This logic assumes detected_items still contained the raw dicts,
|
| 896 |
+
# which is no longer true in the main flow.
|
| 897 |
+
# This section is functionally broken but left untouched as per request.
|
| 898 |
|
| 899 |
+
# if entry["type"] == "equation":
|
| 900 |
+
# item[base_key] = get_latex_from_base64(entry["base64"])
|
| 901 |
+
# else:
|
| 902 |
+
# item[base_key] = entry["base64"]
|
|
|
|
| 903 |
|
| 904 |
final_data.append(item)
|
| 905 |
|
| 906 |
return final_data
|
| 907 |
+
|
| 908 |
+
def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
    """
    Crop ``bbox`` out of ``image`` and return the crop as base64-encoded PNG text.

    Original function definition (now deprecated in main flow, but retained).

    Args:
        image: Array whose first two axes are (height, width); a trailing
            channel axis is optional.
        bbox: (x1, y1, x2, y2) in pixel coordinates. Values are truncated to
            int and clamped to the image bounds.

    Returns:
        The PNG-encoded crop as a UTF-8 base64 string.

    Raises:
        ValueError: If the clamped box has no area or PNG encoding fails.
    """
    x1, y1, x2, y2 = map(int, bbox)
    # Only the first two axes are needed, so 2-D (grayscale) arrays work too.
    h, w = image.shape[:2]

    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(w, x2)
    y2 = min(h, y2)

    # An inverted or fully out-of-bounds box leaves nothing to encode. Without
    # this check a negative x2/y2 would be read as "count from the far end" by
    # the slice below and silently yield the wrong region.
    if x2 <= x1 or y2 <= y1:
        raise ValueError(f"Empty crop for bbox {tuple(bbox)} on a {w}x{h} image")

    crop = image[y1:y2, x1:x2]
    # imencode reports success through its first return value.
    ok, buffer = cv2.imencode(".png", crop)
    if not ok:
        raise ValueError("cv2.imencode failed to encode the crop as PNG")

    return base64.b64encode(buffer).decode("utf-8")
|
| 924 |
+
|
| 925 |
|
| 926 |
# ============================================================================
|
| 927 |
+
# --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for PIL output) ---
|
| 928 |
# ============================================================================
|
| 929 |
|
|
|
|
| 930 |
def run_single_pdf_preprocessing(
|
| 931 |
pdf_path: str
|
| 932 |
+
) -> Tuple[int, int, int, str, float, Dict[str, int], List[Tuple[Image.Image, str]]]:
|
| 933 |
"""
|
| 934 |
Runs the pipeline, returns counts, report, total time, page counts dict (str keys),
|
| 935 |
+
and a list of (PIL.Image, label) for the Gradio gallery.
|
| 936 |
"""
|
| 937 |
|
|
|
|
| 938 |
start_time = time.time()
|
| 939 |
log_messages = []
|
| 940 |
|
| 941 |
+
# This list now holds (PIL.Image, label) tuples
|
| 942 |
+
all_gradio_gallery_items: List[Tuple[Image.Image, str]] = []
|
| 943 |
|
| 944 |
# Dictionary to store {page_number (int): equation_count (int)}
|
| 945 |
equation_counts_per_page: Dict[int, int] = {}
|
| 946 |
|
| 947 |
+
# Local counters for thread safety (Concurrency Fix)
|
| 948 |
total_figure_count = 0
|
| 949 |
total_equation_count = 0
|
| 950 |
|
|
|
|
| 953 |
t0 = time.time()
|
| 954 |
if not os.path.exists(pdf_path):
|
| 955 |
report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
|
|
|
|
| 956 |
return 0, 0, 0, report, time.time() - start_time, {}, []
|
| 957 |
|
| 958 |
try:
|
|
|
|
| 997 |
|
| 998 |
# Core Detection
|
| 999 |
detect_start = time.time()
|
|
|
|
| 1000 |
(
|
| 1001 |
page_equations,
|
| 1002 |
page_figures,
|
| 1003 |
+
page_images_pil_tuples, # Now (PIL.Image, label)
|
| 1004 |
total_equation_count,
|
| 1005 |
total_figure_count
|
| 1006 |
) = run_yolo_detection_and_count(
|
|
|
|
| 1011 |
total_figure_count
|
| 1012 |
)
|
| 1013 |
|
| 1014 |
+
# Append the PIL tuples directly to the master list
|
| 1015 |
+
all_gradio_gallery_items.extend(page_images_pil_tuples)
|
|
|
|
|
|
|
| 1016 |
|
| 1017 |
detect_time = time.time() - detect_start
|
| 1018 |
|
|
|
|
| 1038 |
report = (
|
| 1039 |
f"✅ **YOLO Counting Complete!**\n\n"
|
| 1040 |
f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
|
| 1041 |
+
f"**2) Total Equations Detected:** **{total_equation_count}**\n"
|
| 1042 |
+
f"**3) Total Figures Detected:** **{total_figure_count}**\n"
|
| 1043 |
f"---\n"
|
| 1044 |
f"**4) Total Execution Time:** **{total_execution_time:.4f}s**\n"
|
| 1045 |
f"### Detailed Step Timing\n"
|
|
|
|
| 1048 |
f"\n```"
|
| 1049 |
)
|
| 1050 |
|
| 1051 |
+
# Return the dictionary with string keys and the properly formatted gallery items (PIL tuples)
|
| 1052 |
return total_pages, total_equation_count, total_figure_count, report, total_execution_time, equation_counts_per_page_str_keys, all_gradio_gallery_items
|
| 1053 |
|
| 1054 |
|
| 1055 |
# ============================================================================
|
| 1056 |
+
# --- GRADIO INTERFACE FUNCTION (Updated for PIL output) ---
|
| 1057 |
# ============================================================================
|
| 1058 |
|
| 1059 |
+
# The return type now uses PIL.Image for the gallery list
|
| 1060 |
+
def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[Tuple[Image.Image, str]]]:
|
| 1061 |
"""
|
| 1062 |
Gradio wrapper function to handle file upload and return results.
|
| 1063 |
"""
|
| 1064 |
if pdf_file is None:
|
|
|
|
| 1065 |
return "N/A", "N/A", "N/A", "Please upload a PDF file.", {}, []
|
| 1066 |
|
| 1067 |
pdf_path = pdf_file.name
|
| 1068 |
|
| 1069 |
try:
|
|
|
|
| 1070 |
(
|
| 1071 |
num_pages,
|
| 1072 |
num_equations,
|
|
|
|
| 1074 |
report,
|
| 1075 |
total_time,
|
| 1076 |
equation_counts_per_page,
|
| 1077 |
+
gallery_items # Now (PIL.Image, label) tuples
|
| 1078 |
) = run_single_pdf_preprocessing(pdf_path)
|
| 1079 |
|
| 1080 |
|
|
|
|
| 1081 |
return str(num_pages), str(num_equations), str(num_figures), report, equation_counts_per_page, gallery_items
|
| 1082 |
|
| 1083 |
|
| 1084 |
except Exception as e:
|
| 1085 |
error_msg = f"An unexpected error occurred: {e}"
|
| 1086 |
logging.error(error_msg, exc_info=True)
|
|
|
|
| 1087 |
return "Error", "Error", "Error", error_msg, {}, []
|
| 1088 |
|
| 1089 |
|
| 1090 |
# ============================================================================
|
| 1091 |
+
# --- GRADIO INTERFACE DEFINITION (Unchanged) ---
|
| 1092 |
# ============================================================================
|
| 1093 |
|
| 1094 |
if __name__ == "__main__":
|
|
|
|
| 1107 |
# NEW OUTPUT: JSON component for structured data
|
| 1108 |
output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
|
| 1109 |
|
| 1110 |
+
# Gradio Gallery is configured to accept the (PIL.Image, label) format
|
| 1111 |
output_gallery = gr.Gallery(
|
| 1112 |
+
label="Detected Items (Gallery Fixed for Stability)",
|
| 1113 |
columns=5,
|
| 1114 |
height="auto",
|
| 1115 |
object_fit="contain",
|
|
|
|
| 1119 |
interface = gr.Interface(
|
| 1120 |
fn=gradio_process_pdf,
|
| 1121 |
inputs=input_file,
|
|
|
|
| 1122 |
outputs=[
|
| 1123 |
output_pages,
|
| 1124 |
output_equations,
|
|
|
|
| 1127 |
output_page_counts,
|
| 1128 |
output_gallery
|
| 1129 |
],
|
| 1130 |
+
title="📊 YOLO Counting with Per-Page Data & Timing (Stable)",
|
| 1131 |
description=(
|
| 1132 |
+
"Upload a PDF to run YOLO detection. Concurrency and Gallery display issues are resolved."
|
| 1133 |
),
|
| 1134 |
)
|
| 1135 |
|
| 1136 |
print("\nStarting Gradio application...")
|
| 1137 |
+
interface.launch(inbrowser=True)
|