Spaces:

heerjtdev
/

feeedback

Running

App Files Community

heerjtdev committed on Dec 8, 2025

Commit

2665493

verified ·

1 Parent(s): e12f847

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -171

app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import base64
 from PIL import Image
@@ -41,7 +44,7 @@ logging.basicConfig(level=logging.WARNING)
 WEIGHTS_PATH = 'best.pt'
 SCALE_FACTOR = 2.0
 # OUTPUT_DIR = "yolo_extracted_regions"
-OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
@@ -155,53 +158,92 @@ def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     return img
 def run_yolo_detection_and_count(
         image: np.ndarray, model: YOLO, page_num: int
-) -> Tuple[int, int]:
-    """
-    Runs YOLO inference, applies NMS/filtering, and updates global counters.
-    Returns page counts only.
-    """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
     yolo_detections = []
     page_equations = 0
     page_figures = 0
     try:
         results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
         if results and results[0].boxes:
             for box in results[0].boxes.data.tolist():
                 x1, y1, x2, y2, conf, cls_id = box
                 cls_name = model.names[int(cls_id)]
                 if cls_name in TARGET_CLASSES:
                     yolo_detections.append({
-                        'coords': (x1, y1, x2, y2),
-                        'class': cls_name,
                         'conf': conf
                     })
     except Exception as e:
         logging.error(f"YOLO inference failed on page {page_num}: {e}")
-        return 0, 0
-    # Apply NMS/Merging/Filtering
     merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
-    # Update Global Counters
     for det in final_detections:
-        if det['class'] == 'figure':
-            GLOBAL_FIGURE_COUNT += 1
-            page_figures += 1
-        elif det['class'] == 'equation':
             GLOBAL_EQUATION_COUNT += 1
             page_equations += 1
     logging.warning(f"  -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
-    return page_equations, page_figures
@@ -242,132 +284,11 @@ def extract_images_from_page_in_memory(page) -> Dict[str, str]:
-def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
-    """
-    Crop bounding box from image and return as base64 string.
-    """
-    x1, y1, x2, y2 = map(int, bbox)
-    h, w, _ = image.shape
-    # Clamp to image bounds
-    x1 = max(0, x1)
-    y1 = max(0, y1)
-    x2 = min(w, x2)
-    y2 = min(h, y2)
-    crop = image[y1:y2, x1:x2]
-    # Convert to PNG
-    _, buffer = cv2.imencode(".png", crop)
-    b64 = base64.b64encode(buffer).decode("utf-8")
-    return f"data:image/png;base64,{b64}"
-def crop_and_save(image: np.ndarray, bbox, label: str, index: int) -> str:
-    """Crop bounding box and save to disk. Return file path."""
-    x1, y1, x2, y2 = map(int, bbox)
-    h, w, _ = image.shape
-    x1 = max(0, x1)
-    y1 = max(0, y1)
-    x2 = min(w, x2)
-    y2 = min(h, y2)
-    crop = image[y1:y2, x1:x2]
-    filename = f"{label}{index}.png"
-    filepath = os.path.join(OUTPUT_DIR, filename)
-    cv2.imwrite(filepath, crop)
-    return filepath
-def run_yolo_detection_and_count(
-        image: np.ndarray, model: YOLO, page_num: int
-) -> Tuple[int, int, List[str]]:
-    """
-    Runs YOLO inference, saves crops, and returns file paths.
-    """
-    global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
-    yolo_detections = []
-    page_equations = 0
-    page_figures = 0
-    saved_images = []
-    try:
-        results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
-        if results and results[0].boxes:
-            for box in results[0].boxes.data.tolist():
-                x1, y1, x2, y2, conf, cls_id = box
-                cls_name = model.names[int(cls_id)]
-                if cls_name in TARGET_CLASSES:
-                    yolo_detections.append({
-                        'coords': (x1, y1, x2, y2),
-                        'class': cls_name,
-                        'conf': conf
-                    })
-    except Exception as e:
-        logging.error(f"YOLO inference failed on page {page_num}: {e}")
-        return 0, 0, []
-    merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
-    final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
-    for det in final_detections:
-        bbox = det["coords"]
-        if det["class"] == "equation":
-            GLOBAL_EQUATION_COUNT += 1
-            page_equations += 1
-            path = crop_and_save(image, bbox, "EQUATION", GLOBAL_EQUATION_COUNT)
-            saved_images.append(path)
-        elif det["class"] == "figure":
-            GLOBAL_FIGURE_COUNT += 1
-            page_figures += 1
-            path = crop_and_save(image, bbox, "FIGURE", GLOBAL_FIGURE_COUNT)
-            saved_images.append(path)
-    logging.warning(f"  -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
-    return page_equations, page_figures, saved_images
-def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], pdf_doc) -> List[Dict[str, Any]]:
-    print("\n" + "="*80)
-    print("--- IN-MEMORY IMAGE + EQUATION TO LATEX PIPELINE ---")
-    print("="*80)
-    if not structured_data:
-        return []
-    # Build global image map from all pages (in memory only)
-    full_image_lookup = {}
-    for page_index in range(len(pdf_doc)):
-        page = pdf_doc[page_index]
-        page_images = extract_images_from_page_in_memory(page)
-        for tag, base64_img in page_images.items():
-            full_image_lookup[tag] = base64_img
-    print(f" -> Found {len(full_image_lookup)} total in-memory images.")
     tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
-    final_structured_data = []
     for item in structured_data:
         text_fields = [
@@ -377,41 +298,52 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], pdf_
         ]
         if 'options' in item:
-            for opt in item['options'].values():
-                text_fields.append(opt)
-        unique_tags = set()
         for text in text_fields:
-            if not text:
-                continue
-            for match in tag_regex.finditer(text):
-                unique_tags.add(match.group(0).upper())
-        for tag in sorted(unique_tags):
-            base_key = tag.lower().replace(' ', '')
-            if tag not in full_image_lookup:
                 item[base_key] = "[MISSING_IMAGE]"
                 continue
-            base64_img = full_image_lookup[tag]
-            if "EQUATION" in tag:
-                latex = get_latex_from_base64(base64_img)
-                item[base_key] = latex
-                print(f" ✅ {tag} → LaTeX")
-            elif "FIGURE" in tag:
-                item[base_key] = base64_img
-                print(f" ✅ {tag} → Base64")
-        final_structured_data.append(item)
-    print("✅ In-memory embedding completed")
-    return final_structured_data
@@ -455,9 +387,9 @@ def run_single_pdf_preprocessing(pdf_path: str) -> Tuple[int, int, int, str, flo
-    if os.path.exists(OUTPUT_DIR):
-       shutil.rmtree(OUTPUT_DIR)
-       os.makedirs(OUTPUT_DIR, exist_ok=True)
     # 1. Validation and Model Loading
@@ -640,7 +572,7 @@ if __name__ == "__main__":
     # interface.launch(inbrowser=True)
     interface.launch(
     inbrowser=True,
-    allowed_paths=[OUTPUT_DIR]
 )

 import base64
 from PIL import Image
+import re
+from transformers import TrOCRProcessor
+from optimum.onnxruntime import ORTModelForVision2Seq
 WEIGHTS_PATH = 'best.pt'
 SCALE_FACTOR = 2.0
 # OUTPUT_DIR = "yolo_extracted_regions"
+# OUTPUT_DIR = os.path.join(tempfile.gettempdir(), "yolo_extracted_regions")
     return img
 def run_yolo_detection_and_count(
         image: np.ndarray, model: YOLO, page_num: int
+) -> Tuple[int, int, List[Dict[str, str]]]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
     yolo_detections = []
     page_equations = 0
     page_figures = 0
+    detected_items = []
     try:
         results = model.predict(image, conf=CONF_THRESHOLD, verbose=False)
         if results and results[0].boxes:
             for box in results[0].boxes.data.tolist():
                 x1, y1, x2, y2, conf, cls_id = box
                 cls_name = model.names[int(cls_id)]
                 if cls_name in TARGET_CLASSES:
                     yolo_detections.append({
+                        'coords': (x1, y1, x2, y2),
+                        'class': cls_name,
                         'conf': conf
                     })
     except Exception as e:
         logging.error(f"YOLO inference failed on page {page_num}: {e}")
+        return 0, 0, []
     merged_detections = merge_overlapping_boxes(yolo_detections, IOU_MERGE_THRESHOLD)
     final_detections = filter_nested_boxes(merged_detections, IOA_SUPPRESSION_THRESHOLD)
     for det in final_detections:
+        bbox = det["coords"]
+        if det["class"] == "equation":
             GLOBAL_EQUATION_COUNT += 1
             page_equations += 1
+            b64 = crop_and_convert_to_base64(image, bbox)
+            detected_items.append({
+                "type": "equation",
+                "id": f"EQUATION{GLOBAL_EQUATION_COUNT}",
+                "base64": b64
+            })
+        elif det["class"] == "figure":
+            GLOBAL_FIGURE_COUNT += 1
+            page_figures += 1
+            b64 = crop_and_convert_to_base64(image, bbox)
+            detected_items.append({
+                "type": "figure",
+                "id": f"FIGURE{GLOBAL_FIGURE_COUNT}",
+                "base64": b64
+            })
     logging.warning(f"  -> Page {page_num}: EQs={page_equations}, Figs={page_figures}")
+    return page_equations, page_figures, detected_items
+def get_latex_from_base64(base64_string: str) -> str:
+    if ort_model is None or processor is None:
+        return "[MODEL_ERROR: Model not initialized]"
+    try:
+        image_data = base64.b64decode(base64_string)
+        image = Image.open(io.BytesIO(image_data)).convert('RGB')
+        pixel_values = processor(images=image, return_tensors="pt").pixel_values
+        generated_ids = ort_model.generate(pixel_values)
+        raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        if not raw_text:
+            return "[OCR_WARNING: No formula found]"
+        latex = raw_text[0]
+        latex = re.sub(r'[\r\n]+', '', latex)
+        return latex
+    except Exception as e:
+        return f"[TR_OCR_ERROR: {e}]"
+def embed_images_as_base64_in_memory(structured_data, detected_items):
     tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
+    item_lookup = {d["id"]: d for d in detected_items}
+    final_data = []
     for item in structured_data:
         text_fields = [
         ]
         if 'options' in item:
+            text_fields.extend(item['options'].values())
+        used_tags = set()
         for text in text_fields:
+            for m in tag_regex.finditer(text or ""):
+                used_tags.add(m.group(0).upper())
+        for tag in used_tags:
+            base_key = tag.lower().replace(" ", "")
+            if tag not in item_lookup:
                 item[base_key] = "[MISSING_IMAGE]"
                 continue
+            entry = item_lookup[tag]
+            if entry["type"] == "equation":
+                item[base_key] = get_latex_from_base64(entry["base64"])
+            else:
+                item[base_key] = entry["base64"]
+        final_data.append(item)
+    return final_data
+def crop_and_convert_to_base64(image: np.ndarray, bbox: Tuple[float, float, float, float]) -> str:
+    x1, y1, x2, y2 = map(int, bbox)
+    h, w, _ = image.shape
+    x1 = max(0, x1)
+    y1 = max(0, y1)
+    x2 = min(w, x2)
+    y2 = min(h, y2)
+    crop = image[y1:y2, x1:x2]
+    _, buffer = cv2.imencode(".png", crop)
+    return base64.b64encode(buffer).decode("utf-8")
+    # if os.path.exists(OUTPUT_DIR):
+    #    shutil.rmtree(OUTPUT_DIR)
+    #    os.makedirs(OUTPUT_DIR, exist_ok=True)
     # 1. Validation and Model Loading
     # interface.launch(inbrowser=True)
     interface.launch(
     inbrowser=True,
+    # allowed_paths=[OUTPUT_DIR]
 )