Spaces:

heerjtdev
/

feeedback

Running

App Files Files Community

heerjtdev committed on Dec 9, 2025

Commit

4a3866a

verified ·

1 Parent(s): 1ad06d0

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -140

app.py CHANGED Viewed

@@ -580,7 +580,6 @@
 import base64
 from PIL import Image
 import re
@@ -627,11 +626,12 @@ MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
 try:
     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
     ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
 except Exception as e:
-    # This warning is included to alert the user if the optional, unused dependencies fail
     logging.warning(f"OCR model loading failed (expected if dependencies are missing): {e}")
     processor = None
     ort_model = None
 # Detection parameters
 CONF_THRESHOLD = 0.2
@@ -639,8 +639,6 @@ TARGET_CLASSES = ['figure', 'equation']
 IOU_MERGE_THRESHOLD = 0.4
 IOA_SUPPRESSION_THRESHOLD = 0.7
-# Note: The original GLOBAL_COUNT variables have been removed to fix concurrency.
 # ============================================================================
 # --- BOX COMBINATION LOGIC (Retained) ---
 # ============================================================================
@@ -712,7 +710,7 @@ def merge_overlapping_boxes(detections, iou_threshold):
     return merged_detections
 # ============================================================================
-# --- UTILITY FUNCTIONS (Updated for PIL/Concurrency) ---
 # ============================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
@@ -727,7 +725,6 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     return img
-# --- REPLACED CROP_AND_CONVERT_TO_BASE64 ---
 def crop_and_convert_to_pil(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> Image.Image:
     """Crops the numpy array and returns a PIL Image object."""
     x1, y1, x2, y2 = map(int, bbox)
@@ -739,33 +736,39 @@ def crop_and_convert_to_pil(image: np.ndarray, bbox: Tuple[float, float, float,
     y2 = min(h, y2)
     crop_np = image[y1:y2, x1:x2]
-    # Convert OpenCV/BGR (if applicable) or RGB numpy array to PIL Image
-    # Using BGR2RGB conversion just in case OpenCV read the image in BGR format
     crop_pil = Image.fromarray(cv2.cvtColor(crop_np, cv2.COLOR_BGR2RGB))
     return crop_pil
-# --- UPDATED: run_yolo_detection_and_count for Concurrency and PIL output ---
 def run_yolo_detection_and_count(
         image: np.ndarray, model: YOLO, page_num: int,
         current_eq_count: int, current_fig_count: int
-) -> Tuple[int, int, List[Tuple[Image.Image, str]], int, int]:
     """
-    Performs YOLO detection and returns page counts, detected items as (PIL.Image, label),
-    and the updated total counters.
     """
-    # Use the passed counters as starting points for this page
     eq_counter = current_eq_count
     fig_counter = current_fig_count
     page_equations = 0
     page_figures = 0
-    # Change: detected_items now holds (PIL.Image, label) for direct Gradio use
-    detected_items: List[Tuple[Image.Image, str]] = []
     yolo_detections = []
     try:
         results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
@@ -790,33 +793,41 @@ def run_yolo_detection_and_count(
     for det in final_detections:
         bbox = det["coords"]
-        # --- NEW: Get PIL image directly ---
         crop_pil = crop_and_convert_to_pil(image, bbox)
         if det["class"] == "equation":
             eq_counter += 1
             page_equations += 1
-            label = f"EQUATION{eq_counter}"
-            detected_items.append((crop_pil, label))
         elif det["class"] == "figure":
             fig_counter += 1
             page_figures += 1
-            label = f"FIGURE{fig_counter}"
-            detected_items.append((crop_pil, label))
     logging.warning(f"  -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
-    # Return page counts, detected items (as PIL tuples), and the UPDATED total counters
     return page_equations, page_figures, detected_items, eq_counter, fig_counter
 def get_latex_from_base64(base64_string: str) -> str:
-    # NOTE: This function still expects base64 input,
-    # but the main detection flow no longer provides it.
-    if ort_model is None or processor is None:
-        return "[MODEL_ERROR: Model not initialized]"
     try:
         image_data = base64.b64decode(base64_string)
         image = Image.open(io.BytesIO(image_data)).convert('RGB')
@@ -836,120 +847,45 @@ def get_latex_from_base64(base64_string: str) -> str:
         return f"[TR_OCR_ERROR: {e}]"
-def extract_images_from_page_in_memory(page) -> Dict[str, str]:
-    """
-    Extract images from a page and return:
-    { "EQUATION1": base64_string, "FIGURE1": base64_string }
-    (NOTE: This is unused dead code from the original script, retained as requested)
-    """
-    image_map = {}
-    image_list = page.get_images(full=True)
-    for idx, img in enumerate(image_list, start=1):
-        xref = img[0]
-        base = page.parent.extract_image(xref)
-        image_bytes = base["image"]
-        base64_img = base64.b64encode(image_bytes).decode("utf-8")
-        # Convention: first image = FIGURE1, second image = EQUATION1 etc
-        # You can tune this if needed
-        image_map[f"FIGURE{idx}"] = base64_img
-    return image_map
 def embed_images_as_base64_in_memory(structured_data, detected_items):
-    """
-    (NOTE: This is unused dead code from the original script, retained as requested)
-    """
-    tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
-    item_lookup = {d["id"]: d for d in detected_items}
-    final_data = []
-    for item in structured_data:
-        text_fields = [
-            item.get('question', ''),
-            item.get('passage', ''),
-            item.get('new_passage', '')
-        ]
-        if 'options' in item:
-            text_fields.extend(item['options'].values())
-        used_tags = set()
-        for text in text_fields:
-            for m in tag_regex.finditer(text or ""):
-                used_tags.add(m.group(0).upper())
-        for tag in used_tags:
-            base_key = tag.lower().replace(" ", "")
-            if tag not in item_lookup:
-                item[base_key] = "[MISSING_IMAGE]"
-                continue
-            entry = item_lookup[tag]
-            # This logic assumes detected_items still contained the raw dicts,
-            # which is no longer true in the main flow.
-            # This section is functionally broken but left untouched as per request.
-            # if entry["type"] == "equation":
-            #     item[base_key] = get_latex_from_base64(entry["base64"])
-            # else:
-            #     item[base_key] = entry["base64"]
-        final_data.append(item)
-    return final_data
 def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
-    """
-    Original function definition (now deprecated in main flow, but retained).
-    """
-    x1, y1, x2, y2 = map(int, bbox)
-    h, w, _ = image.shape
-    x1 = max(0, x1)
-    y1 = max(0, y1)
-    x2 = min(w, x2)
-    y2 = min(h, y2)
-    crop = image[y1:y2, x1:x2]
-    _, buffer = cv2.imencode(".png", crop)
-    return base64.b64encode(buffer).decode("utf-8")
 # ============================================================================
-# --- MAIN DOCUMENT PROCESSING FUNCTION (Fixed for PIL output) ---
 # ============================================================================
 def run_single_pdf_preprocessing(
     pdf_path: str
 ) -> Tuple[int, int, int, str, float, Dict[str, int], List[Tuple[Image.Image, str]]]:
     """
-    Runs the pipeline, returns counts, report, total time, page counts dict (str keys),
-    and a list of (PIL.Image, label) for the Gradio gallery.
     """
     start_time = time.time()
     log_messages = []
-    # This list now holds (PIL.Image, label) tuples
-    all_gradio_gallery_items: List[Tuple[Image.Image, str]] = []
-    # Dictionary to store {page_number (int): equation_count (int)}
     equation_counts_per_page: Dict[int, int] = {}
-    # Local counters for thread safety (Concurrency Fix)
     total_figure_count = 0
     total_equation_count = 0
-    # 1. Validation and Model Loading
     t0 = time.time()
     if not os.path.exists(pdf_path):
         report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
@@ -964,7 +900,7 @@ def run_single_pdf_preprocessing(
     t1 = time.time()
     log_messages.append(f"Model Loading Time: {t1-t0:.4f}s")
-    # 2. PDF Loading
     t2 = time.time()
     try:
         doc = fitz.open(pdf_path)
@@ -978,7 +914,7 @@ def run_single_pdf_preprocessing(
     mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
-    # 3. Page Processing and Detection Loop
     t4 = time.time()
     for page_num_0_based in range(doc.page_count):
         page_start_time = time.time()
@@ -986,6 +922,7 @@ def run_single_pdf_preprocessing(
         page_num = page_num_0_based + 1
         # Render page to image for YOLO
         try:
             pix_start = time.time()
             pix = fitz_page.get_pixmap(matrix=mat)
@@ -994,13 +931,13 @@ def run_single_pdf_preprocessing(
         except Exception as e:
             logging.error(f"Error converting page {page_num} to image: {e}. Skipping.")
             continue
-        # Core Detection
         detect_start = time.time()
         (
             page_equations,
             page_figures,
-            page_images_pil_tuples, # Now (PIL.Image, label)
             total_equation_count,
             total_figure_count
         ) = run_yolo_detection_and_count(
@@ -1011,32 +948,60 @@ def run_single_pdf_preprocessing(
             total_figure_count
         )
-        # Append the PIL tuples directly to the master list
-        all_gradio_gallery_items.extend(page_images_pil_tuples)
         detect_time = time.time() - detect_start
         # Store the count in the dictionary (INT keys)
         equation_counts_per_page[page_num] = page_equations
         page_total_time = time.time() - page_start_time
-        log_messages.append(f"Page {page_num} Time: Total={page_total_time:.4f}s (Render={pix_time:.4f}s, Detect={detect_time:.4f}s)")
     doc.close()
     t5 = time.time()
     detection_loop_time = t5 - t4
     log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
     # Convert integer keys to string keys for JSON serialization
     equation_counts_per_page_str_keys: Dict[str, int] = {
         str(k): v for k, v in equation_counts_per_page.items()
     }
-    # 4. Final Report Generation
-    total_execution_time = t5 - start_time
     report = (
-        f"✅ **YOLO Counting Complete!**\n\n"
         f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
         f"**2) Total Equations Detected:** **{total_equation_count}**\n"
         f"**3) Total Figures Detected:** **{total_figure_count}**\n"
@@ -1048,15 +1013,13 @@ def run_single_pdf_preprocessing(
         f"\n```"
     )
-    # Return the dictionary with string keys and the properly formatted gallery items (PIL tuples)
-    return total_pages, total_equation_count, total_figure_count, report, total_execution_time, equation_counts_per_page_str_keys, all_gradio_gallery_items
 # ============================================================================
-# --- GRADIO INTERFACE FUNCTION (Updated for PIL output) ---
 # ============================================================================
-# The return type now uses PIL.Image for the gallery list
 def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[Tuple[Image.Image, str]]]:
     """
     Gradio wrapper function to handle file upload and return results.
@@ -1074,7 +1037,7 @@ def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], Li
             report,
             total_time,
             equation_counts_per_page,
-            gallery_items # Now (PIL.Image, label) tuples
         ) = run_single_pdf_preprocessing(pdf_path)
@@ -1087,10 +1050,6 @@ def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], Li
         return "Error", "Error", "Error", error_msg, {}, []
-# ============================================================================
-# --- GRADIO INTERFACE DEFINITION (Unchanged) ---
-# ============================================================================
 if __name__ == "__main__":
     if not os.path.exists(WEIGHTS_PATH):
@@ -1107,10 +1066,10 @@ if __name__ == "__main__":
     # NEW OUTPUT: JSON component for structured data
     output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
-    # Gradio Gallery is configured to accept the (PIL.Image, label) format
     output_gallery = gr.Gallery(
-        label="Detected Items (Gallery Fixed for Stability)",
-        columns=5,
         height="auto",
         object_fit="contain",
         allow_preview=False
@@ -1127,9 +1086,9 @@ if __name__ == "__main__":
             output_page_counts,
             output_gallery
         ],
-        title="📊 YOLO Counting with Per-Page Data & Timing (Stable)",
         description=(
-            "Upload a PDF to run YOLO detection. Concurrency and Gallery display issues are resolved."
         ),
     )

 import base64
 from PIL import Image
 import re
 try:
     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
     ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
+    OCR_MODEL_LOADED = True
 except Exception as e:
     logging.warning(f"OCR model loading failed (expected if dependencies are missing): {e}")
     processor = None
     ort_model = None
+    OCR_MODEL_LOADED = False
 # Detection parameters
 CONF_THRESHOLD = 0.2
 IOU_MERGE_THRESHOLD = 0.4
 IOA_SUPPRESSION_THRESHOLD = 0.7
 # ============================================================================
 # --- BOX COMBINATION LOGIC (Retained) ---
 # ============================================================================
     return merged_detections
 # ============================================================================
+# --- UTILITY FUNCTIONS (UPDATED) ---
 # ============================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     return img
 def crop_and_convert_to_pil(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> Image.Image:
     """Crops the numpy array and returns a PIL Image object."""
     x1, y1, x2, y2 = map(int, bbox)
     y2 = min(h, y2)
     crop_np = image[y1:y2, x1:x2]
     crop_pil = Image.fromarray(cv2.cvtColor(crop_np, cv2.COLOR_BGR2RGB))
     return crop_pil
+# --- NEW: Utility to convert PIL Image to Base64 (for OCR input) ---
+def pil_to_base64(img: Image.Image) -> str:
+    """Converts a PIL Image object to a Base64 encoded string (PNG format)."""
+    buffer = io.BytesIO()
+    img.save(buffer, format="PNG")
+    return base64.b64encode(buffer.getvalue()).decode("utf-8")
+# --- UPDATED: run_yolo_detection_and_count to return a list of dictionaries with PIL images ---
 def run_yolo_detection_and_count(
         image: np.ndarray, model: YOLO, page_num: int,
         current_eq_count: int, current_fig_count: int
+) -> Tuple[int, int, List[Dict[str, Union[Image.Image, str]]], int, int]:
     """
+    Performs YOLO detection and returns page counts, detected items (as dicts
+    containing the PIL Image), and the updated total counters.
     """
     eq_counter = current_eq_count
     fig_counter = current_fig_count
     page_equations = 0
     page_figures = 0
+    # Change: detected_items now holds dictionaries: {'type', 'id', 'pil_image', 'latex'}
+    detected_items: List[Dict[str, Union[Image.Image, str]]] = []
     yolo_detections = []
+    # ... (YOLO inference logic is the same)
     try:
         results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
     for det in final_detections:
         bbox = det["coords"]
         crop_pil = crop_and_convert_to_pil(image, bbox)
         if det["class"] == "equation":
             eq_counter += 1
             page_equations += 1
+            detected_items.append({
+                "type": "equation",
+                "id": f"EQUATION{eq_counter}",
+                "pil_image": crop_pil,
+                "latex": "" # Placeholder for OCR result
+            })
         elif det["class"] == "figure":
             fig_counter += 1
             page_figures += 1
+            detected_items.append({
+                "type": "figure",
+                "id": f"FIGURE{fig_counter}",
+                "pil_image": crop_pil,
+                "latex": "[FIGURE - No LaTeX]" # Figures don't get OCR
+            })
     logging.warning(f"  -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
     return page_equations, page_figures, detected_items, eq_counter, fig_counter
 def get_latex_from_base64(base64_string: str) -> str:
+    """
+    Performs the OCR conversion. Expects Base64 string input.
+    """
+    if not OCR_MODEL_LOADED:
+        return "[MODEL_ERROR: Model not initialized or failed to load]"
     try:
+        # OCR logic (unchanged)
         image_data = base64.b64decode(base64_string)
         image = Image.open(io.BytesIO(image_data)).convert('RGB')
         return f"[TR_OCR_ERROR: {e}]"
+# --- UNUSED ORIGINAL FUNCTIONS: SIGNATURES RETAINED, BODIES STUBBED OUT ---
+def extract_images_from_page_in_memory(page) -> Dict[str, str]:
+    # Body removed in this revision; the stub returns None despite the Dict[str, str] annotation.
+    pass
 def embed_images_as_base64_in_memory(structured_data, detected_items):
+    # Body removed in this revision; the stub ignores its arguments and returns None.
+    pass
 def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
+    # Body removed in this revision; the stub returns None despite the str annotation.
+    pass
 # ============================================================================
+# --- MAIN DOCUMENT PROCESSING FUNCTION (UPDATED for OCR) ---
 # ============================================================================
 def run_single_pdf_preprocessing(
     pdf_path: str
 ) -> Tuple[int, int, int, str, float, Dict[str, int], List[Tuple[Image.Image, str]]]:
     """
+    Runs the pipeline, performs OCR, and returns final results.
     """
     start_time = time.time()
     log_messages = []
+    # This will store all final extracted item dicts (image, ID, type, LATEX)
+    all_extracted_items: List[Dict[str, Union[Image.Image, str]]] = []
     equation_counts_per_page: Dict[int, int] = {}
     total_figure_count = 0
     total_equation_count = 0
+    # 1. Validation and Model Loading (YOLO)
     t0 = time.time()
     if not os.path.exists(pdf_path):
         report = f"❌ FATAL ERROR: Input PDF not found at {pdf_path}."
     t1 = time.time()
     log_messages.append(f"Model Loading Time: {t1-t0:.4f}s")
+    # 2. PDF Loading (fitz)
     t2 = time.time()
     try:
         doc = fitz.open(pdf_path)
     mat = fitz.Matrix(SCALE_FACTOR, SCALE_FACTOR)
+    # 3. Page Processing, Detection, and OCR Loop
     t4 = time.time()
     for page_num_0_based in range(doc.page_count):
         page_start_time = time.time()
         page_num = page_num_0_based + 1
         # Render page to image for YOLO
+        # ... (image rendering logic retained)
         try:
             pix_start = time.time()
             pix = fitz_page.get_pixmap(matrix=mat)
         except Exception as e:
             logging.error(f"Error converting page {page_num} to image: {e}. Skipping.")
             continue
+        # YOLO Detection
         detect_start = time.time()
         (
             page_equations,
             page_figures,
+            page_extracted_items, # List of dicts: {'type', 'id', 'pil_image', 'latex'}
             total_equation_count,
             total_figure_count
         ) = run_yolo_detection_and_count(
             total_figure_count
         )
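+        # --- NEW: OCR/LaTeX Conversion for Equations ---
+        # NOTE(review): detect_time is computed after this loop, so the logged
+        # "Detect" figure also includes the OCR time reported separately as "OCR".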
+        ocr_start = time.time()
+        for item in page_extracted_items:
+            if item["type"] == "equation":
+                # 1. Convert PIL image to Base64
+                b64_string = pil_to_base64(item["pil_image"])
+                # 2. Run OCR
+                item["latex"] = get_latex_from_base64(b64_string)
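+                # OPTIONAL: Clean up large image data if memory is a concern.
+                # NOTE(review): the gallery-building loop later reads item["pil_image"],
+                # so enabling this deletion as-is would raise KeyError there.
+                # del item["pil_image"]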
+        ocr_time = time.time() - ocr_start
+        # Append all extracted item dictionaries
+        all_extracted_items.extend(page_extracted_items)
         detect_time = time.time() - detect_start
         # Store the count in the dictionary (INT keys)
         equation_counts_per_page[page_num] = page_equations
         page_total_time = time.time() - page_start_time
+        log_messages.append(f"Page {page_num} Time: Total={page_total_time:.4f}s (Render={pix_time:.4f}s, Detect={detect_time:.4f}s, OCR={ocr_time:.4f}s)")
     doc.close()
     t5 = time.time()
     detection_loop_time = t5 - t4
     log_messages.append(f"Total Detection Loop Time ({total_pages} pages): {detection_loop_time:.4f}s")
+    # 4. Final Report Generation and Gallery Formatting
+    # Format the extracted items for the Gradio Gallery
+    gallery_items: List[Tuple[Image.Image, str]] = []
+    # We will include the LATEX code as the image label in the gallery
+    # If the item is a Figure, the label is just the ID.
+    for item in all_extracted_items:
+        image_label = item["id"]
+        if item["type"] == "equation":
+            image_label = f'{item["id"]}: {item["latex"]}'
+        gallery_items.append((item["pil_image"], image_label))
+    total_execution_time = t5 - start_time
     # Convert integer keys to string keys for JSON serialization
     equation_counts_per_page_str_keys: Dict[str, int] = {
         str(k): v for k, v in equation_counts_per_page.items()
     }
     report = (
+        f"✅ **YOLO Counting & OCR Complete!**\n\n"
         f"**1) Total Pages Detected in PDF:** **{total_pages}**\n"
         f"**2) Total Equations Detected:** **{total_equation_count}**\n"
         f"**3) Total Figures Detected:** **{total_figure_count}**\n"
         f"\n```"
     )
+    return total_pages, total_equation_count, total_figure_count, report, total_execution_time, equation_counts_per_page_str_keys, gallery_items
 # ============================================================================
+# --- GRADIO INTERFACE FUNCTION & DEFINITION (Retained) ---
 # ============================================================================
 def gradio_process_pdf(pdf_file) -> Tuple[str, str, str, str, Dict[str, int], List[Tuple[Image.Image, str]]]:
     """
     Gradio wrapper function to handle file upload and return results.
             report,
             total_time,
             equation_counts_per_page,
+            gallery_items
         ) = run_single_pdf_preprocessing(pdf_path)
         return "Error", "Error", "Error", error_msg, {}, []
 if __name__ == "__main__":
     if not os.path.exists(WEIGHTS_PATH):
     # NEW OUTPUT: JSON component for structured data
     output_page_counts = gr.JSON(label="Equation Count Per Page (Dictionary)")
+    # Gradio Gallery now shows the LaTeX code as the label
     output_gallery = gr.Gallery(
+        label="Detected Items (with Extracted LaTeX)",
+        columns=3,
         height="auto",
         object_fit="contain",
         allow_preview=False
             output_page_counts,
             output_gallery
         ],
+        title="📊 YOLO Detection & Math OCR Pipeline",
         description=(
+            "Upload a PDF. YOLO detects equations, and the TrOCR model converts them to LaTeX."
         ),
     )