GradioPDF-Extractor

Runtime error

App Files Files Community

VolarisLLC committed 18 days ago

Commit

0e28d43

verified ·

1 Parent(s): 72c550c

Update main.py

Browse files

Files changed (1) hide show

main.py +772 -1262

main.py CHANGED Viewed

@@ -1,1324 +1,834 @@
-import os
 import json
-import signal
-import sys
 from pathlib import Path
-from typing import List, Dict, Tuple, Optional, Sequence, Set, Any
-from multiprocessing import Pool, cpu_count
-from functools import partial
-import fitz  # PyMuPDF (Still needed for drawing output PDF)
-import pypdfium2 as pdfium
 import torch
-from doclayout_yolo import YOLOv10
-from huggingface_hub import hf_hub_download
-from loguru import logger
-from PIL import Image
-import numpy as np
-try:
-    import pymupdf4llm  # type: ignore
-except ImportError:  # pragma: no cover - optional dependency
-    pymupdf4llm = None  # type: ignore
-try:
-    import spaces
-except ImportError:
-    # Mock spaces for local execution
-    class spaces:
-        @staticmethod
-        def GPU(func):
-            return func
-# ----------------------------------------------------------------------
-# CONFIGURATION
-# ----------------------------------------------------------------------
-# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Removed for ZeroGPU compatibility (lazy check instead)
-# Model options
-MODEL_SIZE = 1024
-REPO_ID = "juliozhao/DocLayout-YOLO-DocStructBench"
-WEIGHTS_FILE = f"doclayout_yolo_docstructbench_imgsz{MODEL_SIZE}.pt"
-# Detection settings
-CONF_THRESHOLD = 0.25
-# Multiprocessing settings
-NUM_WORKERS = None  # None = auto (cpu_count - 1), or set to specific number like 4
-USE_MULTIPROCESSING = False  # Set to False to disable parallel processing entirely (Required for ZeroGPU)
-# ----------------------------------------------------------------------
-# Color map for the layout classes
-# ----------------------------------------------------------------------
-CLASS_COLORS = {
-    "text": (0, 128, 0),          # Dark Green
-    "title": (192, 0, 0),        # Dark Red
-    "figure": (0, 0, 192),       # Dark Blue
-    "table": (218, 165, 32),     # Goldenrod (Dark Yellow)
-    "list": (128, 0, 128),       # Purple
-    "header": (0, 128, 128),     # Teal
-    "footer": (100, 100, 100),   # Dark Gray
-    "figure_caption": (0, 0, 128), # Navy
-    "table_caption": (139, 69, 19),  # Saddle Brown
-    "table_footnote": (128, 0, 128), # Purple
-}
-# Global model instance (will be None in worker processes until loaded)
-_model = None
-_shutdown_requested = False
-# ----------------------------------------------------------------------
-# Signal handler for graceful shutdown
-# ----------------------------------------------------------------------
-def signal_handler(signum, frame):
-    """Handle interrupt signals gracefully."""
-    global _shutdown_requested
-    if not _shutdown_requested:
-        _shutdown_requested = True
-        logger.warning("\n⚠️  Interrupt received! Finishing current page and shutting down gracefully...")
-        logger.warning("Press Ctrl+C again to force quit (may leave incomplete files)")
-    else:
-        logger.error("\n❌ Force quit requested. Exiting immediately.")
-        sys.exit(1)
-def setup_signal_handlers():
-    """Setup signal handlers for graceful shutdown."""
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-# ----------------------------------------------------------------------
-# Model loader function
-# ----------------------------------------------------------------------
-def get_model():
-    """Lazy load the model (only once per process)."""
-    global _model
-    if _model is None:
-        weights_path = hf_hub_download(repo_id=REPO_ID, filename=WEIGHTS_FILE)
-        _model = YOLOv10(weights_path)
-        logger.info(f"✓ Model loaded in worker process (PID: {os.getpid()})")
-    return _model
-# ----------------------------------------------------------------------
-# Worker initialization function
-# ----------------------------------------------------------------------
-def init_worker():
-    """Initialize worker process - loads model once at startup."""
-    try:
-        get_model()
-        logger.success(f"Worker {os.getpid()} ready")
-    except Exception as e:
-        logger.error(f"Failed to initialize worker {os.getpid()}: {e}")
-        raise
-# ----------------------------------------------------------------------
-# Run layout detection on a single page image (YOLO)
-# ----------------------------------------------------------------------
-@spaces.GPU
-def detect_page(pil_img: Image.Image) -> List[dict]:
-    """Detect layout elements using YOLO model."""
-    # Re-check device availability inside the decorated function (ZeroGPU context)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = get_model()  # Will return already-loaded model in worker
-    img_cv = np.array(pil_img)
-    results = model.predict(
-        img_cv,
-        imgsz=MODEL_SIZE,
-        conf=CONF_THRESHOLD,
-        device=device,
-        verbose=False
-    )
-    dets = []
-    for i, box in enumerate(results[0].boxes):
-        cls_id = int(box.cls.item())
-        name = results[0].names[cls_id]
-        conf = float(box.conf.item())
-        x0, y0, x1, y1 = box.xyxy[0].cpu().numpy().tolist()
-        dets.append({
-            "name": name,
-            "bbox": [x0, y0, x1, y1],
-            "conf": conf,
-            "source": "yolo",
-            "index": i
-        })
-    return dets
-# ----------------------------------------------------------------------
-# Crop & save figure/table regions (with captions)
-# ----------------------------------------------------------------------
-def get_union_box(box1: List[float], box2: List[float]) -> List[float]:
-    """Get the bounding box enclosing two boxes."""
-    x0 = min(box1[0], box2[0])
-    y0 = min(box1[1], box2[1])
-    x1 = max(box1[2], box2[2])
-    y1 = max(box1[3], box2[3])
-    return [x0, y0, x1, y1]
-def collect_caption_elements(
-    element: Dict,
-    all_dets: List[Dict],
-    target_name: str,
-    max_vertical_gap: float = 60.0,
-    min_overlap: float = 0.25,
-) -> List[Dict]:
-    """
-    Collect contiguous caption detections directly below a figure/table.
-    """
-    base_box = element["bbox"]
-    base_bottom = base_box[3]
-    selected: List[Dict] = []
-    last_bottom = base_bottom
-    relevant = [
-        d for d in all_dets
-        if d["name"] == target_name and d["bbox"][1] >= base_bottom - 5
-    ]
-    relevant.sort(key=lambda d: d["bbox"][1])
-    for cand in relevant:
-        cand_box = cand["bbox"]
-        top = cand_box[1]
-        if selected and top - last_bottom > max_vertical_gap:
-            break
-        if selected:
-            overlap = _horizontal_overlap_ratio(selected[-1]["bbox"], cand_box)
-        else:
-            overlap = _horizontal_overlap_ratio(base_box, cand_box)
-        if overlap < min_overlap:
-            continue
-        selected.append(cand)
-        last_bottom = cand_box[3]
-    return selected
-def collect_title_and_text_segments(
-    element: Dict,
-    all_dets: List[Dict],
-    processed_indices: Set[int],
-    settings: Optional[Dict[str, float]] = None,
-) -> Tuple[List[Dict], List[Dict]]:
     """
-    Locate a title below the element and any contiguous text blocks directly beneath it.
     """
-    if settings is None:
-        settings = TITLE_TEXT_ASSOCIATION
-    if not element.get("bbox"):
-        return [], []
-    figure_box = element["bbox"]
-    figure_bottom = figure_box[3]
-    candidates = [
-        d for d in all_dets
-        if d.get("bbox") and d["index"] not in processed_indices
-    ]
-    candidates.sort(key=lambda d: d["bbox"][1])
-    titles: List[Dict] = []
-    texts: List[Dict] = []
-    for idx, det in enumerate(candidates):
-        if det["name"] != "title":
-            continue
-        title_box = det["bbox"]
-        if title_box[1] < figure_bottom - 5:
-            continue
-        vertical_gap = title_box[1] - figure_bottom
-        if vertical_gap > settings["max_title_gap"]:
-            break
-        overlap = _horizontal_overlap_ratio(figure_box, title_box)
-        if overlap < settings["min_overlap"]:
-            continue
-        titles.append(det)
-        last_bottom = title_box[3]
-        for follower in candidates[idx + 1 :]:
-            if follower["name"] == "title":
-                break
-            if follower["name"] != "text":
-                continue
-            text_box = follower["bbox"]
-            if text_box[1] < title_box[1]:
-                continue
-            gap = text_box[1] - last_bottom
-            if gap > settings["max_text_gap"]:
-                break
-            if _horizontal_overlap_ratio(title_box, text_box) < settings["min_overlap"]:
-                continue
-            texts.append(follower)
-            last_bottom = text_box[3]
-        break
-    return titles, texts
-def save_layout_elements(pil_img: Image.Image, page_num: int,
-                         dets: List[dict], out_dir: Path) -> List[dict]:
-    """Save figure and table crops, merging captions."""
-    fig_dir = out_dir / "figures"
-    tab_dir = out_dir / "tables"
-    os.makedirs(fig_dir, exist_ok=True)
-    os.makedirs(tab_dir, exist_ok=True)
-    infos = []
-    fig_count = 0
-    tab_count = 0
-    processed_indices = set()
-    for i, d in enumerate(dets):
-        if d["index"] in processed_indices:
-            continue
-        name = d["name"].lower()
-        final_box = d["bbox"]
-        caption_segments: List[Dict] = []
-        title_segments: List[Dict] = []
-        text_segments: List[Dict] = []
-        if name == "figure":
-            elem_type = "figure"
-            path_template = fig_dir / f"page_{page_num + 1}_fig_{fig_count}.png"
-            fig_count += 1
-            caption_segments = collect_caption_elements(d, dets, "figure_caption")
-            for cap in caption_segments:
-                final_box = get_union_box(final_box, cap["bbox"])
-                processed_indices.add(cap["index"])
-            title_segments, text_segments = collect_title_and_text_segments(
-                d, dets, processed_indices
-            )
-            for seg in title_segments + text_segments:
-                final_box = get_union_box(final_box, seg["bbox"])
-                processed_indices.add(seg["index"])
-        elif name == "table":
-            elem_type = "table"
-            path_template = tab_dir / f"page_{page_num + 1}_tab_{tab_count}.png"
-            tab_count += 1
-            caption_segments = collect_caption_elements(d, dets, "table_caption")
-            for cap in caption_segments:
-                final_box = get_union_box(final_box, cap["bbox"])
-                processed_indices.add(cap["index"])
-        else:
-            continue
-        x0, y0, x1, y1 = map(int, final_box)
-        crop = pil_img.crop((x0, y0, x1, y1))
-        if crop.mode == "CMYK":
-            crop = crop.convert("RGB")
-        crop.save(path_template)
-        info_data = {
-            "type": elem_type,
-            "page": page_num + 1,
-            "bbox_pixels": final_box,
-            "conf": d["conf"],
-            "source": d.get("source", "yolo"),
-            "image_path": str(path_template.relative_to(out_dir)),
-            "width": int(x1 - x0),
-            "height": int(y1 - y0),
-            "page_width": pil_img.width,
-            "page_height": pil_img.height,
-        }
-        if caption_segments:
-            info_data["captions"] = [
-                {
-                    "bbox": cap["bbox"],
-                    "conf": cap.get("conf"),
-                    "index": cap["index"],
-                    "source": cap.get("source"),
-                    "page": page_num + 1,
-                }
-                for cap in caption_segments
-            ]
-        if title_segments:
-            info_data["titles"] = [
-                {
-                    "bbox": seg["bbox"],
-                    "conf": seg.get("conf"),
-                    "index": seg["index"],
-                    "source": seg.get("source"),
-                    "page": page_num + 1,
-                }
-                for seg in title_segments
-            ]
-        if text_segments:
-            info_data["texts"] = [
-                {
-                    "bbox": seg["bbox"],
-                    "conf": seg.get("conf"),
-                    "index": seg["index"],
-                    "source": seg.get("source"),
-                    "page": page_num + 1,
-                }
-                for seg in text_segments
-            ]
-        infos.append(info_data)
-    return infos
-TABLE_STITCH_TOLERANCES = {
-    "x_tol": 60,
-    "y_tol": 60,
-    "width_tol": 120,
-    "height_tol": 120,
-}
-CROSS_PAGE_CAPTION_THRESHOLDS = {
-    "max_top_ratio": 0.35,
-    "max_top_pixels": 220,
-    "x_tol": 120,
-    "width_tol": 200,
-    "min_overlap": 0.05,
-}
-TITLE_TEXT_ASSOCIATION = {
-    "max_title_gap": 220,
-    "max_text_gap": 160,
-    "min_overlap": 0.2,
-}
-def _horizontal_overlap_ratio(box1: List[float], box2: List[float]) -> float:
-    """Compute horizontal overlap ratio between two bounding boxes."""
-    x_left = max(box1[0], box2[0])
-    x_right = min(box1[2], box2[2])
-    overlap = max(0.0, x_right - x_left)
-    if overlap <= 0:
-        return 0.0
-    width_union = max(box1[2], box2[2]) - min(box1[0], box2[0])
-    if width_union <= 0:
-        return 0.0
-    return overlap / width_union
-def _bbox_to_rect(bbox: List[float]) -> Tuple[int, int, int, int]:
-    """Convert [x0, y0, x1, y1] into (x, y, w, h)."""
-    x0, y0, x1, y1 = bbox
-    return int(x0), int(y0), int(x1 - x0), int(y1 - y0)
-def _open_table_image(elem: Dict, out_dir: Path) -> Optional[Image.Image]:
-    """Open a table image relative to the output directory."""
-    image_path = out_dir / elem["image_path"]
-    if not image_path.exists():
-        logger.warning(f"Missing table crop for stitching: {image_path}")
-        return None
-    img = Image.open(image_path)
-    if img.mode != "RGB":
-        img = img.convert("RGB")
-    return img
-def _pad_width(img: Image.Image, target_width: int) -> Image.Image:
-    if img.width >= target_width:
-        return img
-    canvas = Image.new("RGB", (target_width, img.height), color=(255, 255, 255))
-    canvas.paste(img, (0, 0))
-    return canvas
-def _pad_height(img: Image.Image, target_height: int) -> Image.Image:
-    if img.height >= target_height:
-        return img
-    canvas = Image.new("RGB", (img.width, target_height), color=(255, 255, 255))
-    canvas.paste(img, (0, 0))
-    return canvas
-def _append_segment_image(
-    base_img: Image.Image,
-    segment_img: Image.Image,
-    resize_to_base: bool = False,
-) -> Image.Image:
-    """Append segment image below base image with optional width alignment."""
-    if base_img.mode != "RGB":
-        base_img = base_img.convert("RGB")
-    if segment_img.mode != "RGB":
-        segment_img = segment_img.convert("RGB")
-    if resize_to_base and segment_img.width > 0 and base_img.width > 0:
-        segment_img = segment_img.resize(
-            (
-                base_img.width,
-                max(1, int(segment_img.height * (base_img.width / segment_img.width))),
-            ),
-            Image.Resampling.LANCZOS,
         )
-    target_width = max(base_img.width, segment_img.width)
-    base_img = _pad_width(base_img, target_width)
-    segment_img = _pad_width(segment_img, target_width)
-    stitched = Image.new(
-        "RGB",
-        (target_width, base_img.height + segment_img.height),
-        color=(255, 255, 255),
     )
-    stitched.paste(base_img, (0, 0))
-    stitched.paste(segment_img, (0, base_img.height))
-    return stitched
-def _render_pdf_page(
-    pdf_doc: pdfium.PdfDocument,
-    page_index: int,
-    scale: float,
-    cache: Dict[int, Image.Image],
-) -> Optional[Image.Image]:
-    """Render a PDF page to a PIL image with caching."""
-    if page_index in cache:
-        return cache[page_index]
-    try:
-        page = pdf_doc[page_index]
-        bitmap = page.render(scale=scale)
-        pil_img = bitmap.to_pil()
-        page.close()
-    except Exception as exc:
-        logger.error(f"Failed to render page {page_index + 1} for caption stitching: {exc}")
-        return None
-    cache[page_index] = pil_img
-    return pil_img
-def _crop_pdf_region(
-    page_img: Optional[Image.Image], bbox: List[float]
-) -> Optional[Image.Image]:
-    """Crop a region from a rendered PDF page."""
-    if page_img is None:
-        return None
-    x0, y0, x1, y1 = map(int, bbox)
-    x0 = max(0, x0)
-    y0 = max(0, y0)
-    x1 = min(page_img.width, max(x0 + 1, x1))
-    y1 = min(page_img.height, max(y0 + 1, y1))
-    if x0 >= x1 or y0 >= y1:
-        return None
-    crop = page_img.crop((x0, y0, x1, y1))
-    if crop.mode == "CMYK":
-        crop = crop.convert("RGB")
-    return crop
-def write_markdown_document(pdf_path: Path, out_dir: Path) -> Optional[Path]:
     """
-    Extract markdown text from a PDF using PyMuPDF4LLM and write it to disk.
     """
-    if pymupdf4llm is None:
-        logger.warning(
-            "Skipping markdown extraction for %s because pymupdf4llm is not installed.",
-            pdf_path.name,
-        )
-        return None
-    try:
-        markdown_content = pymupdf4llm.to_markdown(str(pdf_path))
-    except Exception as exc:
-        logger.error(f"  Failed to create markdown for {pdf_path.name}: {exc}")
-        return None
-    if isinstance(markdown_content, list):
-        markdown_content = "\n\n".join(
-            part for part in markdown_content if isinstance(part, str)
-        )
-    if not isinstance(markdown_content, str):
-        logger.error(
-            f"  Unexpected markdown output type {type(markdown_content)} for {pdf_path.name}"
-        )
-        return None
-    markdown_content = markdown_content.strip()
-    if not markdown_content:
-        logger.warning(f"  No textual content extracted from {pdf_path.name}")
-        return None
-    if not markdown_content.endswith("\n"):
-        markdown_content += "\n"
-    md_path = out_dir / f"{pdf_path.stem}.md"
-    md_path.write_text(markdown_content, encoding="utf-8")
-    logger.info(f"  Saved markdown to {md_path.name}")
-    return md_path
-def _collect_text_under_title_cross_page(
-    title_det: Dict,
-    sorted_dets: List[Dict],
-    start_idx: int,
-    page_idx: int,
-    used_indices: Set[Tuple[int, int]],
-    settings: Optional[Dict[str, float]] = None,
-) -> List[Dict]:
-    """Collect text elements directly below a title on the next page."""
-    if settings is None:
-        settings = TITLE_TEXT_ASSOCIATION
-    texts: List[Dict] = []
-    title_box = title_det["bbox"]
-    last_bottom = title_box[3]
-    for follower in sorted_dets[start_idx + 1 :]:
-        det_index = follower.get("index")
-        if det_index is None or (page_idx, det_index) in used_indices:
-            continue
-        if follower["name"] == "title":
-            break
-        if follower["name"] != "text":
-            continue
-        text_box = follower["bbox"]
-        if text_box[1] < title_box[1]:
-            continue
-        gap = text_box[1] - last_bottom
-        if gap > settings["max_text_gap"]:
-            break
-        if _horizontal_overlap_ratio(title_box, text_box) < settings["min_overlap"]:
-            continue
-        texts.append(follower)
-        last_bottom = text_box[3]
-    return texts
-def attach_cross_page_figure_captions(
-    elements: List[Dict],
-    all_dets: Sequence[Optional[List[Dict[str, Any]]]],
-    pdf_bytes: bytes,
-    out_dir: Path,
-    scale: float,
-) -> List[Dict]:
-    """
-    If a figure caption appears on the next page, stitch it to the prior figure.
-    """
-    figures = [elem for elem in elements if elem.get("type") == "figure"]
-    if not figures or not all_dets:
-        return elements
-    try:
-        pdf_doc = pdfium.PdfDocument(pdf_bytes)
-    except Exception as exc:
-        logger.error(f"Unable to reopen PDF for figure caption stitching: {exc}")
-        return elements
-    page_cache: Dict[int, Image.Image] = {}
-    used_following_ids: Set[Tuple[int, int]] = set()
-    # Mark existing caption/title/text detections as used
-    for elem in figures:
-        for key in ("captions", "titles", "texts"):
-            for seg in elem.get(key, []) or []:
-                idx = seg.get("index")
-                page_no = seg.get("page")
-                if idx is None or page_no is None:
-                    continue
-                used_following_ids.add((page_no - 1, idx))
-    for elem in figures:
-        page_no = elem.get("page")
-        bbox = elem.get("bbox_pixels")
-        if page_no is None or bbox is None:
-            continue
-        current_idx = page_no - 1
-        next_idx = current_idx + 1
-        if next_idx >= len(all_dets):
-            continue
-        next_dets = all_dets[next_idx]
-        if not next_dets:
-            continue
-        fig_width = bbox[2] - bbox[0]
-        page_img = _render_pdf_page(pdf_doc, next_idx, scale, page_cache)
-        if page_img is None:
-            continue
-        next_page_height = page_img.height
-        max_top_allowed = min(
-            CROSS_PAGE_CAPTION_THRESHOLDS["max_top_pixels"],
-            int(next_page_height * CROSS_PAGE_CAPTION_THRESHOLDS["max_top_ratio"]),
         )
-        sorted_next = sorted(
-            [det for det in next_dets if det.get("bbox")],
-            key=lambda det: det["bbox"][1],
-        )
-        caption_candidate: Optional[Tuple[Dict, int]] = None
-        caption_candidates = []
-        for det in sorted_next:
-            if det.get("name") != "figure_caption":
-                continue
-            det_index = det.get("index")
-            if det_index is None or (next_idx, det_index) in used_following_ids:
-                continue
-            det_bbox = det.get("bbox")
-            if not det_bbox or det_bbox[1] > max_top_allowed:
-                continue
-            overlap = _horizontal_overlap_ratio(bbox, det_bbox)
-            x_diff = abs(bbox[0] - det_bbox[0])
-            width_diff = abs((bbox[2] - bbox[0]) - (det_bbox[2] - det_bbox[0]))
-            if overlap < CROSS_PAGE_CAPTION_THRESHOLDS["min_overlap"]:
-                if (
-                    x_diff > CROSS_PAGE_CAPTION_THRESHOLDS["x_tol"]
-                    or width_diff > CROSS_PAGE_CAPTION_THRESHOLDS["width_tol"]
-                ):
-                    continue
-            score = width_diff + 0.5 * x_diff
-            caption_candidates.append((score, det, det_index))
-        if caption_candidates:
-            caption_candidates.sort(key=lambda item: item[0])
-            _, best_det, best_index = caption_candidates[0]
-            caption_candidate = (best_det, best_index)
-        title_candidate: Optional[Tuple[Dict, int]] = None
-        title_texts: List[Dict] = []
-        for idx_sorted, det in enumerate(sorted_next):
-            if det.get("name") != "title":
-                continue
-            det_index = det.get("index")
-            if det_index is None or (next_idx, det_index) in used_following_ids:
-                continue
-            det_bbox = det.get("bbox")
-            if not det_bbox or det_bbox[1] > max_top_allowed:
-                continue
-            overlap = _horizontal_overlap_ratio(bbox, det_bbox)
-            x_diff = abs(bbox[0] - det_bbox[0])
-            if (
-                overlap < TITLE_TEXT_ASSOCIATION["min_overlap"]
-                and x_diff > CROSS_PAGE_CAPTION_THRESHOLDS["x_tol"]
-            ):
-                continue
-            title_candidate = (det, det_index)
-            title_texts = _collect_text_under_title_cross_page(
-                det, sorted_next, idx_sorted, next_idx, used_following_ids
-            )
-            break
-        if not caption_candidate and not title_candidate and not title_texts:
-            continue
-        figure_path = out_dir / elem["image_path"]
-        if not figure_path.exists():
-            continue
-        figure_img = Image.open(figure_path)
-        if figure_img.mode == "CMYK":
-            figure_img = figure_img.convert("RGB")
-        segments_added = False
-        if caption_candidate:
-            cap_det, cap_index = caption_candidate
-            caption_crop = _crop_pdf_region(page_img, cap_det["bbox"])
-            if caption_crop is not None:
-                figure_img = _append_segment_image(
-                    figure_img, caption_crop, resize_to_base=True
-                )
-                elem.setdefault("captions", [])
-                elem["captions"].append(
-                    {
-                        "bbox": cap_det["bbox"],
-                        "conf": cap_det.get("conf"),
-                        "index": cap_index,
-                        "source": cap_det.get("source"),
-                        "page": next_idx + 1,
-                    }
-                )
-                used_following_ids.add((next_idx, cap_index))
-                segments_added = True
-        if title_candidate:
-            title_det, title_index = title_candidate
-            title_crop = _crop_pdf_region(page_img, title_det["bbox"])
-            if title_crop is not None:
-                figure_img = _append_segment_image(figure_img, title_crop)
-                elem.setdefault("titles", [])
-                elem["titles"].append(
-                    {
-                        "bbox": title_det["bbox"],
-                        "conf": title_det.get("conf"),
-                        "index": title_index,
-                        "source": title_det.get("source"),
-                        "page": next_idx + 1,
-                    }
-                )
-                used_following_ids.add((next_idx, title_index))
-                segments_added = True
-            for text_det in title_texts:
-                text_index = text_det.get("index")
-                text_crop = _crop_pdf_region(page_img, text_det["bbox"])
-                if text_crop is None:
-                    continue
-                figure_img = _append_segment_image(figure_img, text_crop)
-                elem.setdefault("texts", [])
-                elem["texts"].append(
-                    {
-                        "bbox": text_det["bbox"],
-                        "conf": text_det.get("conf"),
-                        "index": text_index,
-                        "source": text_det.get("source"),
-                        "page": next_idx + 1,
-                    }
-                )
-                if text_index is not None:
-                    used_following_ids.add((next_idx, text_index))
-                segments_added = True
-        if not segments_added:
-            continue
-        figure_img.save(figure_path)
-        elem["width"] = figure_img.width
-        elem["height"] = figure_img.height
-        span = elem.get("page_span")
-        if span:
-            if next_idx + 1 not in span:
-                span.append(next_idx + 1)
-        else:
-            base_page = elem.get("page")
-            new_span = [page for page in (base_page, next_idx + 1) if page is not None]
-            elem["page_span"] = new_span
-    pdf_doc.close()
-    return elements
-def _stitch_table_pair(
-    base_elem: Dict,
-    candidate_elem: Dict,
-    out_dir: Path,
-    merge_index: int,
-    stitch_type: str,
-) -> Optional[Dict]:
-    """Stitch two table crops either vertically or horizontally."""
-    base_img = _open_table_image(base_elem, out_dir)
-    candidate_img = _open_table_image(candidate_elem, out_dir)
-    if base_img is None or candidate_img is None:
-        return None
-    tables_dir = out_dir / "tables"
-    tables_dir.mkdir(parents=True, exist_ok=True)
-    if stitch_type == "vertical":
-        target_width = max(base_img.width, candidate_img.width)
-        base_img = _pad_width(base_img, target_width)
-        candidate_img = _pad_width(candidate_img, target_width)
-        merged_height = base_img.height + candidate_img.height
-        stitched = Image.new("RGB", (target_width, merged_height), color=(255, 255, 255))
-        stitched.paste(base_img, (0, 0))
-        stitched.paste(candidate_img, (0, base_img.height))
-    else:
-        target_height = max(base_img.height, candidate_img.height)
-        base_img = _pad_height(base_img, target_height)
-        candidate_img = _pad_height(candidate_img, target_height)
-        merged_width = base_img.width + candidate_img.width
-        stitched = Image.new("RGB", (merged_width, target_height), color=(255, 255, 255))
-        stitched.paste(base_img, (0, 0))
-        stitched.paste(candidate_img, (base_img.width, 0))
-    merged_name = (
-        f"page_{base_elem['page']}_to_{candidate_elem['page']}_"
-        f"table_merged_{merge_index}.png"
-    )
-    merged_path = tables_dir / merged_name
-    stitched.save(merged_path)
-    # Remove original partial crops to avoid duplicates
-    (out_dir / base_elem["image_path"]).unlink(missing_ok=True)
-    (out_dir / candidate_elem["image_path"]).unlink(missing_ok=True)
-    new_bbox = [
-        min(base_elem["bbox_pixels"][0], candidate_elem["bbox_pixels"][0]),
-        min(base_elem["bbox_pixels"][1], candidate_elem["bbox_pixels"][1]),
-        max(base_elem["bbox_pixels"][2], candidate_elem["bbox_pixels"][2]),
-        max(base_elem["bbox_pixels"][3], candidate_elem["bbox_pixels"][3]),
-    ]
-    merged_elem = base_elem.copy()
-    merged_elem["page_span"] = [base_elem["page"], candidate_elem["page"]]
-    merged_elem["box_refs"] = [
-        {"page": base_elem["page"], "image_path": base_elem["image_path"]},
-        {"page": candidate_elem["page"], "image_path": candidate_elem["image_path"]},
-    ]
-    merged_elem["bbox_pixels"] = new_bbox
-    merged_elem["image_path"] = str(merged_path.relative_to(out_dir))
-    merged_elem["width"] = stitched.width
-    merged_elem["height"] = stitched.height
-    merged_elem["page_height"] = stitched.height
-    merged_elem["conf"] = min(
-        base_elem.get("conf", 1.0), candidate_elem.get("conf", 1.0)
-    )
-    return merged_elem
def merge_spanning_tables(elements: List[Dict], out_dir: Path) -> List[Dict]:
    """
    Stitch table crops that continue from one page onto the next page.

    Tables are grouped by page number. Each table is compared against the
    not-yet-consumed tables of the following page; the first candidate whose
    left/right edges (vertical stitch) or top/bottom edges (horizontal stitch)
    agree within ``TABLE_STITCH_TOLERANCES`` is merged via
    ``_stitch_table_pair``.

    Returns a new list holding the merged/unmerged tables in page order,
    followed by every element that is not a table (or has a non-integer page).
    An empty input is returned unchanged.
    """
    if not elements:
        return elements

    # Split the input into per-page table buckets and "everything else".
    page_tables: Dict[int, List[Dict]] = {}
    others: List[Dict] = []
    for item in elements:
        page_no = item.get("page")
        if item.get("type") == "table" and isinstance(page_no, int):
            page_tables.setdefault(page_no, []).append(item)
        else:
            others.append(item)

    output: List[Dict] = []
    # page number -> indices of tables on that page already merged into the
    # table of the preceding page
    consumed: Dict[int, set] = {}
    stitch_serial = 0

    for page_no in sorted(page_tables):
        following = page_tables.get(page_no + 1, [])
        taken_here = consumed.get(page_no, set())
        taken_next = consumed.get(page_no + 1, set())

        for pos, table in enumerate(page_tables[page_no]):
            if pos in taken_here:
                # Already absorbed by a table on the previous page.
                continue
            if not following:
                output.append(table)
                continue

            left, top, width, height = _bbox_to_rect(table["bbox_pixels"])
            stitched = None
            for cand_pos, cand in enumerate(following):
                if cand_pos in taken_next or cand.get("type") != "table":
                    continue
                c_left, c_top, c_width, c_height = _bbox_to_rect(
                    cand["bbox_pixels"]
                )
                # Vertical stitching takes precedence over horizontal.
                if (
                    abs(left - c_left) <= TABLE_STITCH_TOLERANCES["x_tol"]
                    and abs((left + width) - (c_left + c_width))
                    <= TABLE_STITCH_TOLERANCES["width_tol"]
                ):
                    direction = "vertical"
                elif (
                    abs(top - c_top) <= TABLE_STITCH_TOLERANCES["y_tol"]
                    and abs((top + height) - (c_top + c_height))
                    <= TABLE_STITCH_TOLERANCES["height_tol"]
                ):
                    direction = "horizontal"
                else:
                    continue

                # The serial advances on every attempt, successful or not.
                stitch_serial += 1
                stitched = _stitch_table_pair(
                    table, cand, out_dir, stitch_serial, direction
                )
                if stitched is None:
                    continue
                taken_next.add(cand_pos)
                break

            output.append(table if stitched is None else stitched)

        consumed[page_no + 1] = taken_next

    output.extend(others)
    return output
-# ----------------------------------------------------------------------
-# Draw layout boxes on the original PDF
-# ----------------------------------------------------------------------
def draw_layout_pdf(pdf_bytes: bytes, all_dets: List[List[dict]],
                    scale: float, out_path: Path):
    """
    Annotate a PDF with semi-transparent bounding boxes and labels.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        all_dets: One list of detections per page, indexed by page number.
            Each detection is read for ``name``, ``conf``, ``bbox`` and the
            optional ``source`` key. Empty or ``None`` entries are skipped.
        scale: Factor the ``bbox`` coordinates are divided by before drawing.
        out_path: Destination of the annotated PDF.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page_total = len(doc)
        for page_no, dets in enumerate(all_dets):
            if not dets:
                continue
            if page_no >= page_total:
                # More detection lists than pages: nothing left to draw on.
                break
            page = doc[page_no]
            for d in dets:
                rgb = CLASS_COLORS.get(d["name"], (0, 0, 0))
                # The same class colour is used for border, fill and label.
                class_color = [c / 255 for c in rgb]
                rect = fitz.Rect([c / scale for c in d["bbox"]])
                page.draw_rect(
                    rect,
                    color=class_color,
                    fill=class_color,
                    width=1.5,
                    overlay=True,
                    fill_opacity=0.15,
                )
                label = f"{d['name']} {d['conf']:.2f}"
                if d.get("source"):
                    label += f" [{d['source'][0].upper()}]"
                text_bg = fitz.Rect(rect.x0, rect.y0 - 10, rect.x0 + 60, rect.y0)
                # A 4-component colour is interpreted as CMYK, so the former
                # (1, 1, 1, 0.6) rendered as near-black. Use white RGB and
                # express the transparency through fill_opacity instead.
                page.draw_rect(
                    text_bg,
                    color=None,
                    fill=(1, 1, 1),
                    fill_opacity=0.6,
                    overlay=True,
                )
                page.insert_text(
                    (rect.x0 + 2, rect.y0 - 8),
                    label,
                    fontsize=6.5,
                    color=class_color,
                    overlay=True,
                )
        doc.save(str(out_path))
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
-# ----------------------------------------------------------------------
-# Process a single PDF Page (for parallel execution)
-# ----------------------------------------------------------------------
def process_page(task_data: Tuple[int, bytes, float, Path, str]) -> Optional[Tuple[int, List[dict], List[dict]]]:
    """
    Process a single page of a PDF in a worker process.

    Args:
        task_data: ``(page_index, pdf_bytes, render_scale, out_dir, pdf_name)``.

    Returns:
        ``(page_number, detections, elements)``, or ``None`` when a shutdown
        was requested or the page could not be processed.
    """
    pno, pdf_bytes, scale, out_dir, pdf_name = task_data
    if _shutdown_requested:
        return None
    pdf_pdfium = None
    page = None
    try:
        pdf_pdfium = pdfium.PdfDocument(pdf_bytes)
        page = pdf_pdfium[pno]
        pil = page.render(scale=scale).to_pil()
        dets = detect_page(pil)
        elements = save_layout_elements(pil, pno, dets, out_dir)
        page_figures = sum(1 for d in dets if d['name'] == 'figure')
        page_tables = sum(1 for d in dets if d['name'] == 'table')
        logger.info(f"  [{pdf_name}] Page {pno + 1}: {page_figures} figs, {page_tables} tables")
        return (pno, dets, elements)
    except Exception as e:
        logger.error(f"Failed to process page {pno + 1} of {pdf_name}: {e}")
        return None
    finally:
        # Close the page before its parent document, on success and failure.
        if page is not None:
            page.close()
        if pdf_pdfium is not None:
            pdf_pdfium.close()
-# ----------------------------------------------------------------------
-# Process a full PDF using the persistent worker pool
-# ----------------------------------------------------------------------
def process_pdf_with_pool(
    pdf_path: Path,
    out_dir: Path,
    pool: Optional[Pool] = None,
    *,
    extract_images: bool = True,
    extract_markdown: bool = True,
):
    """
    Main processing pipeline for a PDF file.

    Args:
        pdf_path: PDF to process.
        out_dir: Directory receiving the JSON content list, the annotated
            layout PDF and the markdown output.
        pool: Worker pool. It is only used when it is not ``None`` and
            ``USE_MULTIPROCESSING`` is true; otherwise pages are processed
            serially in the calling process.
        extract_images: Run layout detection and write the element crops,
            content list and annotated PDF.
        extract_markdown: Run ``write_markdown_document`` for the PDF.

    Returns nothing; failures are logged and the function returns early.
    """
    if _shutdown_requested:
        logger.warning(f"Skipping {pdf_path.name} due to shutdown request")
        return
    stem = pdf_path.stem
    logger.info(f"Processing {pdf_path.name}")
    pdf_bytes = pdf_path.read_bytes()

    # Open once just to learn the page count (and to validate the file).
    doc = None
    try:
        doc = pdfium.PdfDocument(pdf_bytes)
        page_count = len(doc)
    except Exception as e:
        logger.error(f"Failed to open PDF {pdf_path.name}: {e}. Skipping.")
        return
    finally:
        if doc is not None:
            doc.close()

    scale = 2.0
    all_elements: List[Dict] = []
    if extract_images:
        # Indexed by page number; None marks a page that was not processed.
        all_dets: List[Optional[List[dict]]] = [None] * page_count
        if pool is not None and USE_MULTIPROCESSING:
            logger.info(f"  Using worker pool for {page_count} pages...")
            tasks = [
                (pno, pdf_bytes, scale, out_dir, pdf_path.name)
                for pno in range(page_count)
            ]
            try:
                for res in pool.map(process_page, tasks):
                    if res:
                        pno, dets, elements = res
                        all_dets[pno] = dets
                        all_elements.extend(elements)
            except KeyboardInterrupt:
                logger.warning("Processing interrupted during parallel execution")
                raise
        else:
            logger.info("Using serial processing...")
            pdf_pdfium = None
            try:
                pdf_pdfium = pdfium.PdfDocument(pdf_bytes)
                for pno in range(page_count):
                    if _shutdown_requested:
                        logger.warning(
                            f"Stopping at page {pno + 1}/{page_count} due to shutdown request"
                        )
                        break
                    page = None
                    try:
                        logger.info(f"  Processing page {pno + 1}/{page_count}")
                        page = pdf_pdfium[pno]
                        pil = page.render(scale=scale).to_pil()
                        dets = detect_page(pil)
                        all_dets[pno] = dets
                        elements = save_layout_elements(pil, pno, dets, out_dir)
                        all_elements.extend(elements)
                        page_figures = sum(1 for d in dets if d["name"] == "figure")
                        page_tables = sum(1 for d in dets if d["name"] == "table")
                        logger.info(
                            f"    Found {page_figures} figures and {page_tables} tables"
                        )
                    except Exception as e:
                        logger.error(f"Failed to process page {pno + 1}: {e}. Skipping page.")
                    finally:
                        # Release the page handle on failure as well.
                        if page is not None:
                            page.close()
            except Exception as e:
                logger.error(f"Fatal error processing {pdf_path.name}: {e}")
                return
            finally:
                if pdf_pdfium is not None:
                    pdf_pdfium.close()

        if all_elements:
            all_elements = merge_spanning_tables(all_elements, out_dir)
            all_elements = attach_cross_page_figure_captions(
                all_elements, list(all_dets), pdf_bytes, out_dir, scale
            )
        if all_elements:
            content_list_path = out_dir / f"{stem}_content_list.json"
            with open(content_list_path, "w", encoding="utf-8") as f:
                json.dump(all_elements, f, ensure_ascii=False, indent=4)
            logger.info(f"  Saved {len(all_elements)} elements to JSON")

        if any(dets is not None for dets in all_dets):
            # Keep one entry per page so boxes land on the page they were
            # detected on; dropping failed pages would shift every later
            # page's detections forward.
            page_aligned_dets = [dets if dets is not None else [] for dets in all_dets]
            draw_layout_pdf(
                pdf_bytes, page_aligned_dets, scale, out_dir / f"{stem}_layout.pdf"
            )
            logger.info("  Generated annotated PDF")
        else:
            logger.warning(f"No detections found for {stem}. Skipping layout PDF.")
    else:
        logger.info("  Image extraction skipped per configuration.")

    if extract_markdown:
        markdown_path = write_markdown_document(pdf_path, out_dir)
        if markdown_path is None:
            logger.warning(f"  Markdown extraction yielded no content for {stem}.")

    if _shutdown_requested:
        logger.warning(f"⚠️  Partial results saved for {stem} → {out_dir}")
    elif extract_images:
        logger.success(
            f"✓ {stem} → {out_dir} ({len(all_elements)} elements extracted)"
        )
    else:
        logger.success(f"✓ {stem} → {out_dir} (image extraction skipped)")
-# ----------------------------------------------------------------------
-# Main
-# ----------------------------------------------------------------------
if __name__ == "__main__":
    # 'spawn' is required for multiprocessing on Windows/macOS.
    torch.multiprocessing.set_start_method('spawn', force=True)
    # Install the signal handlers used for graceful shutdown.
    setup_signal_handlers()

    INPUT_DIR = Path("./pdfs")
    OUTPUT_DIR = Path("./output")
    INPUT_DIR.mkdir(parents=True, exist_ok=True)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdf_files = list(INPUT_DIR.glob("*.pdf"))
    if not pdf_files:
        logger.warning("No PDF files found in ./pdfs")
        logger.info("Please add PDF files to the ./pdfs directory")
        logger.info("The script will exit gracefully. No errors occurred.")
        sys.exit(0)

    logger.info(f"Found {len(pdf_files)} PDF file(s) to process")
    logger.info(f"Settings: MODEL_SIZE={MODEL_SIZE}, CONF={CONF_THRESHOLD}")

    # Worker count: all CPUs but one by default, otherwise the configured
    # number capped at the CPU count; never fewer than one.
    total_cpus = cpu_count()
    num_workers = max(
        1,
        total_cpus - 1 if NUM_WORKERS is None else min(NUM_WORKERS, total_cpus),
    )

    # The pool is only used for CPU inference on machines with >= 4 cores.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    use_pool = USE_MULTIPROCESSING and DEVICE == "cpu" and total_cpus >= 4
    if use_pool:
        logger.info(f"🚀 Creating persistent worker pool with {num_workers} workers...")
    elif not USE_MULTIPROCESSING:
        logger.info("Multiprocessing disabled by configuration")
    elif DEVICE != "cpu":
        logger.info(f"Using serial GPU processing (device: {DEVICE})")
    else:
        logger.info(f"Using serial CPU processing (CPU count {total_cpus} too low)")

    pool = None
    try:
        if use_pool:
            # One persistent pool is shared by every PDF.
            pool = Pool(processes=num_workers, initializer=init_worker)
            logger.success(f"✓ Worker pool ready with {num_workers} workers\n")
        else:
            # Serial execution: load the model in this process up front.
            logger.info("Initializing model in main process...")
            get_model()
            logger.success(f"✓ Model loaded (device: {DEVICE})\n")

        for i, pdf_path in enumerate(pdf_files, 1):
            if _shutdown_requested:
                logger.warning(f"\nShutdown requested. Processed {i-1}/{len(pdf_files)} files.")
                break

            logger.info(f"\n{'='*60}")
            logger.info(f"📄 File {i}/{len(pdf_files)}: {pdf_path.name}")
            logger.info(f"{'='*60}")

            sub_out = OUTPUT_DIR / pdf_path.stem
            sub_out.mkdir(parents=True, exist_ok=True)
            try:
                process_pdf_with_pool(pdf_path, sub_out, pool)
            except KeyboardInterrupt:
                logger.warning(f"\nInterrupted while processing {pdf_path.name}")
                break
            except Exception as e:
                logger.error(f"Error processing {pdf_path.name}: {e}")
                if _shutdown_requested:
                    break
                logger.info("Continuing with next file...")

        if _shutdown_requested:
            logger.warning(f"\n⚠️  Processing interrupted. Partial results saved in {OUTPUT_DIR}")
        else:
            logger.success(f"\n✨ All done! Results are in {OUTPUT_DIR}")
    except KeyboardInterrupt:
        logger.error("\n❌ Processing interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"\n❌ Fatal error: {e}")
        sys.exit(1)
    finally:
        # Always tear the pool down, including on the sys.exit paths above.
        if pool is not None:
            logger.info("\n🧹 Shutting down worker pool...")
            pool.close()
            pool.join()
            logger.success("✓ Worker pool closed cleanly")

 import json
+import os
+import shutil
+import shutil
+import threading
+import uuid
+import time
+import multiprocessing
 from pathlib import Path
+from typing import Dict, List, Optional, Any
+from enum import Enum
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, Request, File, UploadFile, Form, BackgroundTasks, HTTPException
+from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+import re
+import gradio as gr
+# from werkzeug.utils import secure_filename # Removed dependency
 import torch
+import main as extractor
+from loguru import logger
+# --------------------------------------------------------------------------------
+# CONFIGURATION
+# --------------------------------------------------------------------------------
# Intended upload size ceiling (500 MiB); nothing in this module enforces it.
MAX_CONTENT_LENGTH = 500 * 1024 * 1024

# Working directories, created at import time if they do not exist yet.
UPLOAD_FOLDER = Path("./uploads")
OUTPUT_FOLDER = Path("./output")
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Lazily loaded layout model (see load_model_once).
_model = None

# task_id -> progress record; every access is guarded by _progress_lock.
_progress_tracker: Dict[str, Dict] = {}
_progress_lock = threading.RLock()

# Process pool handed to the extractor; None means serial processing.
_pool = None
def secure_filename(filename: str) -> str:
    """
    Sanitize a client-supplied filename so it is safe to join onto a directory.

    Directory components are dropped, every character outside
    ``[A-Za-z0-9_.-]`` becomes ``_``, and leading dots are stripped so the
    result can never be ``"."``, ``".."`` or a hidden file. If nothing usable
    remains (empty or ``None`` input, or a name made only of dots), the
    placeholder ``"unnamed"`` is returned.

    Simplistic replacement for werkzeug's helper of the same name.
    """
    # Keep only the final path component.
    name = Path(filename or "").name
    # Keep only alphanumerics, dots, hyphens, and underscores.
    name = re.sub(r'[^a-zA-Z0-9_.-]', '_', name)
    # "." and ".." survive the two steps above and would resolve to the
    # target directory or its parent when joined onto a folder.
    name = name.lstrip('.')
    return name or "unnamed"
def get_device_info() -> Dict[str, Any]:
    """
    Describe the compute device torch reports.

    Returns a dict with ``device`` ("cuda" or "cpu"), ``cuda_available``,
    ``device_name`` (name of GPU 0, or ``None`` without CUDA) and
    ``device_count`` (0 without CUDA).
    """
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        gpu_name = torch.cuda.get_device_name(0)
        gpu_count = torch.cuda.device_count()
    else:
        gpu_name, gpu_count = None, 0
    return {
        "device": "cuda" if has_cuda else "cpu",
        "cuda_available": has_cuda,
        "device_name": gpu_name,
        "device_count": gpu_count,
    }
def load_model_once():
    """Return the cached model, loading it through the extractor on first use."""
    global _model
    if _model is not None:
        return _model
    logger.info("Loading DocLayout-YOLO model...")
    _model = extractor.get_model()
    logger.info("Model loaded successfully")
    return _model
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan hook covering startup and shutdown.

    On startup it requests the 'spawn' multiprocessing start method and
    leaves the global worker pool unset (``_pool = None``), so no pool is
    created here. On shutdown it only logs.
    """
    global _pool

    logger.info("Starting up PDF Layout Extractor...")

    # Request the start method used with PyTorch/CUDA; a RuntimeError from
    # the call is ignored.
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        pass

    # No worker pool: on ZeroGPU / Spaces, multiprocessing is disabled.
    logger.info("Multiprocessing disabled for ZeroGPU compatibility.")
    _pool = None

    yield

    logger.info("Shutting down PDF Layout Extractor...")
app = FastAPI(
    title="PDF Layout Extractor API",
    description="A polished API for extracting layout information (text, tables, figures) from PDFs using DocLayout-YOLO.",
    version="1.0.0",
    lifespan=lifespan
)
# Enable CORS.
# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site make credentialed requests; confirm this is intended, or list the
# allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Serve generated images/PDFs straight from the output folder. The
# OUTPUT_FOLDER constant is used so the mount always matches the directory
# the processing code writes to.
app.mount("/output", StaticFiles(directory=OUTPUT_FOLDER), name="output")
+# --------------------------------------------------------------------------------
+# Pydantic Models for Response Documentation
+# --------------------------------------------------------------------------------
# Response body of the device-info endpoint; mirrors get_device_info().
class DeviceInfo(BaseModel):
    device: str = Field(
        ..., description="Compute device being used (e.g., 'cuda' or 'cpu')."
    )
    cuda_available: bool = Field(
        ..., description="Whether CUDA GPU acceleration is available."
    )
    device_name: Optional[str] = Field(
        default=None, description="Name of the GPU if available."
    )
    device_count: int = Field(
        ..., description="Number of GPU devices detected."
    )
# Response body returned when an upload has been accepted for processing.
class TaskStartResponse(BaseModel):
    task_id: str = Field(
        ..., description="Unique identifier for the background processing task."
    )
    message: str = Field(
        ..., description="Status message confirming start."
    )
    total_files: int = Field(
        ..., description="Number of PDF files accepted for processing."
    )
# Outcome for one processed file; on failure only filename and error are set.
class ProcessingResult(BaseModel):
    filename: str = Field(..., description="Name of the processed file.")
    stem: Optional[str] = Field(
        default=None, description="Filename without extension."
    )
    output_dir: Optional[str] = Field(
        default=None, description="Relative path to the output directory."
    )
    figures_count: Optional[int] = Field(
        default=0, description="Total figures detected."
    )
    tables_count: Optional[int] = Field(
        default=0, description="Total tables detected."
    )
    elements_count: Optional[int] = Field(
        default=0, description="Total layout elements (text, tables, figures)."
    )
    annotated_pdf: Optional[str] = Field(
        default=None, description="Path to the PDF with layout bounding boxes drawn."
    )
    markdown_path: Optional[str] = Field(
        default=None, description="Path to the extracted markdown file."
    )
    # Absolute URLs, computed when the progress response is built.
    annotated_pdf_url: Optional[str] = Field(
        default=None, description="Full URL to access the annotated PDF."
    )
    markdown_url: Optional[str] = Field(
        default=None, description="Full URL to access the extracted markdown."
    )
    figure_urls: Optional[List[Dict[str, Any]]] = Field(
        default=None, description="List of URLs for extracted figure images."
    )
    table_urls: Optional[List[Dict[str, Any]]] = Field(
        default=None, description="List of URLs for extracted table images."
    )
    error: Optional[str] = Field(
        default=None, description="Error message if processing failed."
    )
# What to extract from an upload. Members are also plain strings, so they
# compare equal to their literal values.
class ExtractionMode(str, Enum):
    images = "images"      # layout crops (figures/tables) only
    markdown = "markdown"  # markdown text only
    both = "both"          # images and markdown
# Snapshot of a background task as reported by the progress endpoint.
class ProgressResponse(BaseModel):
    status: str = Field(
        ..., description="Current status of the task (e.g., 'processing', 'completed')."
    )
    progress: int = Field(
        ..., description="Overall progress percentage (0-100)."
    )
    message: str = Field(..., description="Current status message.")
    results: List[ProcessingResult] = Field(
        default=[], description="List of results for processed files."
    )
    file_progress: Optional[Dict[str, int]] = Field(
        default=None, description="Progress percentage per file."
    )
# One processed PDF as listed in PDFListResponse.
class PDFInfo(BaseModel):
    stem: str = Field(
        ..., description="Unique identifier/stem of the PDF."
    )
    output_dir: str = Field(
        ..., description="Directory where results are stored."
    )
# Wrapper around the list of processed PDFs.
class PDFListResponse(BaseModel):
    pdfs: List[PDFInfo] = Field(
        ..., description="List of processed PDFs available on the server."
    )
+# --------------------------------------------------------------------------------
+# Helper Functions
+# --------------------------------------------------------------------------------
def _update_task_progress(task_id: str, filename: str, file_progress: int, message: str):
    """
    Record one file's progress and refresh the task's overall percentage.

    The overall value is the truncated mean of all per-file percentages
    recorded so far for the task. Unknown task ids are ignored.
    """
    with _progress_lock:
        task = _progress_tracker.get(task_id)
        if task is None:
            return
        per_file = task.setdefault('file_progress', {})
        per_file[filename] = file_progress
        # per_file holds at least the entry written above, so the mean is
        # always defined.
        task['progress'] = int(sum(per_file.values()) / len(per_file))
        task['message'] = message
def _record_file_outcome(task_id: str, filename: str, result: Dict[str, Any],
                         message: Optional[str], final_message: str):
    """Store one file's result and mark the task completed once every file has reported."""
    with _progress_lock:
        task = _progress_tracker.get(task_id)
        if task is None:
            # The task entry is gone; there is nothing left to update.
            logger.warning(f"Task {task_id} not found while recording result for {filename}")
            return
        per_file = task.setdefault('file_progress', {})
        # Finished (successfully or not) means this file is done.
        per_file[filename] = 100
        task['progress'] = int(sum(per_file.values()) / len(per_file))
        task.setdefault('results', []).append(result)
        if message is not None:
            task['message'] = message
        # Successes and failures both count toward completion, so the
        # frontend stops polling even when some files failed.
        if len(task['results']) >= task.get('total_files', 1):
            task['status'] = 'completed'
            task['progress'] = 100
            task['message'] = final_message


def process_file_background_task(task_id: str, file_data: bytes, filename: str, extraction_mode: str):
    """
    Process a single uploaded PDF as a background task.

    The file is written to the upload folder, moved into ``OUTPUT_FOLDER/<stem>``,
    run through ``extractor.process_pdf_with_pool`` and its outputs are
    summarised into the task's progress record.

    Args:
        task_id: Key of the task in the progress tracker.
        file_data: Raw bytes of the uploaded PDF.
        filename: Client-supplied name; sanitised with ``secure_filename``.
        extraction_mode: 'images', 'markdown' or 'both'.

    Exceptions are not propagated: they are logged and recorded as an
    ``error`` result for the file.
    """
    filename = secure_filename(filename)
    try:
        _update_task_progress(task_id, filename, 5, f'Processing {filename}...')
        stem = Path(filename).stem
        include_images = extraction_mode != 'markdown'
        include_markdown = extraction_mode != 'images'

        # Make sure the upload directory exists before writing into it.
        UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
        upload_path = UPLOAD_FOLDER / filename
        upload_path.write_bytes(file_data)
        _update_task_progress(task_id, filename, 15, f'Saved {filename}, preparing output...')

        output_dir = OUTPUT_FOLDER / stem
        output_dir.mkdir(parents=True, exist_ok=True)
        # Move the upload into the output directory, overwriting any previous
        # copy in one step instead of unlink-then-rename.
        pdf_path = output_dir / filename
        upload_path.replace(pdf_path)
        _update_task_progress(task_id, filename, 25, f'Loading model and processing {filename}...')

        # Disable multiprocessing for ZeroGPU compatibility.
        extractor.USE_MULTIPROCESSING = False
        logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
        _update_task_progress(task_id, filename, 30, f'Extracting content from {filename}...')
        # With _pool set to None the extractor processes pages serially,
        # blocking this worker thread until the file is done.
        extractor.process_pdf_with_pool(
            pdf_path,
            output_dir,
            pool=_pool,
            extract_images=include_images,
            extract_markdown=include_markdown,
        )
        _update_task_progress(task_id, filename, 85, f'Collecting results for {filename}...')

        # Collect what the extractor wrote to disk.
        elements = []
        json_path = output_dir / f"{stem}_content_list.json"
        if include_images and json_path.exists():
            text_content = json_path.read_text(encoding='utf-8')
            if text_content.strip():
                elements = json.loads(text_content)

        annotated_pdf = None
        if include_images:
            candidate_pdf = output_dir / f"{stem}_layout.pdf"
            if candidate_pdf.exists():
                annotated_pdf = str(candidate_pdf.relative_to(OUTPUT_FOLDER))

        markdown_path = None
        if include_markdown:
            candidate_md = output_dir / f"{stem}.md"
            if candidate_md.exists():
                markdown_path = str(candidate_md.relative_to(OUTPUT_FOLDER))

        result = {
            'filename': filename,
            'stem': stem,
            'output_dir': str(output_dir.relative_to(OUTPUT_FOLDER)),
            'figures_count': sum(1 for e in elements if e.get('type') == 'figure'),
            'tables_count': sum(1 for e in elements if e.get('type') == 'table'),
            'elements_count': len(elements),
            'annotated_pdf': annotated_pdf,
            'markdown_path': markdown_path,
            'include_images': include_images,
            'include_markdown': include_markdown,
        }
    except Exception as e:
        logger.error(f"Error processing {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        _record_file_outcome(
            task_id,
            filename,
            {'filename': filename, 'error': str(e)},
            None,
            'Finished with errors.',
        )
        return

    with _progress_lock:
        total_files = _progress_tracker.get(task_id, {}).get('total_files', 1)
    _record_file_outcome(
        task_id,
        filename,
        result,
        f'Completed processing {filename}',
        f'All {total_files} file(s) processed.',
    )
+# --------------------------------------------------------------------------------
+# Routes
+# --------------------------------------------------------------------------------
@app.get("/api/docs", response_class=HTMLResponse, tags=["UI"], include_in_schema=False)
async def api_docs_redirect():
    """Serve a small HTML page that forwards legacy /api/docs visitors to /docs."""
    # Meta-refresh page with a fallback link to /docs.
    redirect_page = """
        <html>
            <head>
                <meta http-equiv="refresh" content="0; url=/docs" />
            </head>
            <body>
                <p>Redirecting to <a href="/docs">/docs</a>...</p>
            </body>
        </html>
        """
    return HTMLResponse(redirect_page)
@app.get("/api/device-info", response_model=DeviceInfo, tags=["System"])
async def device_info_endpoint():
    """Report which device (CPU/GPU) is available for processing."""
    device_details = get_device_info()
    return device_details
@app.post("/api/upload", response_model=TaskStartResponse, tags=["Processing"])
async def upload_files(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    extraction_mode: ExtractionMode = Form(ExtractionMode.images, description="Select extraction mode: 'images' (figures/tables), 'markdown' (text), or 'both'.")
):
    """
    Upload one or more PDF files for background processing.

    Files whose name does not end in ``.pdf`` (case-insensitive) are ignored.
    Responds with the id of the task to poll for progress.

    Raises:
        HTTPException: 400 when no file, or no PDF file, was supplied.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided")
    # An upload part may arrive without a filename; treat that as "not a PDF"
    # instead of failing on None.
    pdf_files = [f for f in files if (f.filename or "").lower().endswith('.pdf')]
    if not pdf_files:
        raise HTTPException(status_code=400, detail="No valid PDF files selected")

    # UploadFile is a stream, so the bytes are read here and handed to the
    # background task. Reading happens before the task is registered, so a
    # failed read cannot leave a task stuck in 'processing'.
    # Be careful with RAM for huge files; spool to disk first if that matters.
    payloads = [(upload.filename, await upload.read()) for upload in pdf_files]

    task_id = str(uuid.uuid4())
    with _progress_lock:
        _progress_tracker[task_id] = {
            'status': 'processing',
            'progress': 0,
            'message': 'Starting upload...',
            'results': [],
            'total_files': len(payloads)
        }
    for name, content in payloads:
        background_tasks.add_task(
            process_file_background_task,
            task_id,
            content,
            name,
            extraction_mode
        )
    return {
        "task_id": task_id,
        "message": "Processing started",
        "total_files": len(payloads)
    }
+@app.get("/api/progress/{task_id}", response_model=ProgressResponse, tags=["Processing"])
+async def get_progress(task_id: str, request: Request):
+    """Check the progress of a processing task."""
+    with _progress_lock:
+        progress = _progress_tracker.get(task_id)
+        if not progress:
+            raise HTTPException(status_code=404, detail="Task not found")
+        # Deep copy to modify for response (adding URLs) without changing state
+        # Or just build the response object.
+        # Since we are adding computed URLs, we shouldn't modify the stored state every time.
+        response_data = progress.copy()
+        # Use request.base_url for absolute URLs
+        base_url = str(request.base_url).rstrip('/')
+        if 'hf.space' in base_url or request.headers.get("x-forwarded-proto") == "https":
+            base_url = base_url.replace("http://", "https://")
+        # Process results to add URLs
+        results_with_urls = []
+        for res in response_data.get('results', []):
+            res_copy = res.copy()
+            # Helper to make url
+            def make_url(rel_path):
+                if not rel_path: return None
+                # Clean windows paths to forward slashes for URLs
+                clean_path = str(rel_path).replace('\\', '/')
+                return f"{base_url}/output/{clean_path}"
+            res_copy['annotated_pdf_url'] = make_url(res.get('annotated_pdf'))
+            res_copy['markdown_url'] = make_url(res.get('markdown_path'))
+            # Figures and Tables URLs need to be discovered from disk if not stored
+            # The original code loaded JSON every time. That's a bit heavy but ensures freshness.
+            # Let's try to do it if stem is present.
+            stem = res.get('stem')
+            if stem:
+                output_dir = OUTPUT_FOLDER / stem
+                if output_dir.exists():
+                    json_files = list(output_dir.glob('*_content_list.json'))
+                    if json_files:
+                        try:
+                            elements = json.loads(json_files[0].read_text(encoding='utf-8'))
+                            figures = [e for e in elements if e.get('type') == 'figure']
+                            tables = [e for e in elements if e.get('type') == 'table']
+                            fig_urls = []
+                            for fig in figures:
+                                if fig.get('image_path'):
+                                    path = Path(fig['image_path']) # relative to unique output folder usually?
+                                    # Actually in main.py it saves relative to out_dir
+                                    # so image_path is like "figures/page_1_fig_0.png"
+                                    # We need relative to "output" folder for URL
+                                    # output_dir is "output/stem_timestamp"
+                                    # so full path is "output/stem_timestamp/figures/..."
+                                    # The URL mount is /output/ -> output/
+                                    # "image_path" in JSON is relative to the specific STEM folder (implied by main.py logic)
+                                    # Wait, main.py says: "image_path": str(path_template.relative_to(out_dir))
+                                    # So yes, it is "figures/..."
+                                    full_rel_path = f"{stem}/{fig['image_path']}"
+                                    fig_urls.append({
+                                        "page": fig.get('page'),
+                                        "url": make_url(full_rel_path),
+                                        "path": full_rel_path
+                                    })
+                            res_copy['figure_urls'] = fig_urls
+                            tab_urls = []
+                            for tab in tables:
+                                if tab.get('image_path'):
+                                    full_rel_path = f"{stem}/{tab['image_path']}"
+                                    tab_urls.append({
+                                        "page": tab.get('page'),
+                                        "url": make_url(full_rel_path),
+                                        "path": full_rel_path
+                                    })
+                            res_copy['table_urls'] = tab_urls
+                        except Exception as e:
+                            logger.error(f"Error reading details for {stem}: {e}")
+            results_with_urls.append(res_copy)
+        response_data['results'] = results_with_urls
+        return response_data
+@app.get("/api/pdf-list", response_model=PDFListResponse, tags=["Retrieval"])
+async def pdf_list():
+    """List previously processed PDFs."""
+    output_dir = OUTPUT_FOLDER
+    pdfs = []
+    if output_dir.exists():
+        for item in output_dir.iterdir():
+            if item.is_dir():
+                # Check for indicators of success
+                if list(item.glob('*_content_list.json')) or list(item.glob('*.md')):
+                    pdfs.append({
+                        'stem': item.name,
+                        'output_dir': item.name # returning the name as relative dir
+                    })
+    return {'pdfs': pdfs}
+@app.get("/api/pdf-details/{pdf_stem}", tags=["Retrieval"])
+async def pdf_details(pdf_stem: str, request: Request):
+    """Get detailed information about a processed PDF."""
+    output_dir = OUTPUT_FOLDER / pdf_stem
+    if not output_dir.exists():
+        raise HTTPException(status_code=404, detail="PDF not found")
+    base_url = str(request.base_url).rstrip('/')
+    if 'hf.space' in base_url or request.headers.get("x-forwarded-proto") == "https":
+        base_url = base_url.replace("http://", "https://")
+    def make_url(rel_path):
+        if not rel_path: return None
+        clean_path = str(rel_path).replace('\\', '/')
+        return f"{base_url}/output/{clean_path}"
+    # Load content list
+    json_files = list(output_dir.glob('*_content_list.json'))
+    elements = []
+    if json_files:
+        elements = json.loads(json_files[0].read_text(encoding='utf-8'))
+    figures = [e for e in elements if e.get('type') == 'figure']
+    tables = [e for e in elements if e.get('type') == 'table']
+    # PDF Layout
+    annotated_pdf = None
+    pdf_files = list(output_dir.glob('*_layout.pdf'))
+    if pdf_files:
+        annotated_pdf = f"{pdf_stem}/{pdf_files[0].name}"
+    # Markdown
+    markdown_path = None
+    md_files = list(output_dir.glob('*.md'))
+    if md_files:
+        markdown_path = f"{pdf_stem}/{md_files[0].name}"
+    # Image lists
+    figure_images = []
+    fig_dir = output_dir / 'figures'
+    if fig_dir.exists():
+        figure_images = [f"{pdf_stem}/figures/{f.name}" for f in sorted(fig_dir.glob('*.png'))]
+    table_images = []
+    tab_dir = output_dir / 'tables'
+    if tab_dir.exists():
+        table_images = [f"{pdf_stem}/tables/{f.name}" for f in sorted(tab_dir.glob('*.png'))]
+    return {
+        'stem': pdf_stem,
+        'figures': figures,
+        'tables': tables,
+        'figures_count': len(figures),
+        'tables_count': len(tables),
+        'elements_count': len(elements),
+        'annotated_pdf': annotated_pdf,
+        'markdown_path': markdown_path,
+        'figure_images': figure_images,
+        'table_images': table_images,
+        'urls': {
+            'annotated_pdf': make_url(annotated_pdf),
+            'markdown': make_url(markdown_path),
+            'figures': [make_url(img) for img in figure_images],
+            'tables': [make_url(img) for img in table_images],
+        }
+    }
+@app.post("/api/predict", tags=["Legacy"], include_in_schema=True)
+async def predict(
+    file: UploadFile = File(...),
+    request: Request = None
+):
     """
+    Direct API endpoint for extracting text/tables/figures from a single PDF.
+    Waits for completion and returns JSON result.
     """
+    if not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
+    # Create unique output directory
+    filename = secure_filename(file.filename)
+    stem = Path(filename).stem
+    unique_id = f"{stem}_{int(time.time())}"
+    output_dir = OUTPUT_FOLDER / unique_id
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Save file
+    pdf_path = output_dir / filename
+    content = await file.read()
+    pdf_path.write_bytes(content)
     try:
+        # Load model logic (sync call to stay simple for this endpoint)
+        load_model_once()
+        extractor.USE_MULTIPROCESSING = False
+        # Process
+        extractor.process_pdf_with_pool(
+            pdf_path,
+            output_dir,
+            pool=None,
+            extract_images=True,
+            extract_markdown=True,
+        )
+        # Build Result
+        base_url = str(request.base_url).rstrip('/')
+        if 'hf.space' in base_url or request.headers.get("x-forwarded-proto") == "https":
+            base_url = base_url.replace("http://", "https://")
+        def make_url(rel_path):
+             return f"{base_url}/output/{unique_id}/{rel_path}"
+        result = {
+            "status": "success",
+            "filename": filename,
+            "text": "",
+            "tables": [],
+            "figures": [],
+            "summary": {}
+        }
+        # Text
+        md_path = output_dir / f"{stem}.md"
+        if md_path.exists():
+            result['text'] = md_path.read_text(encoding='utf-8')
+        # JSON content
+        json_path = output_dir / f"{stem}_content_list.json"
+        if json_path.exists():
+            elements = json.loads(json_path.read_text(encoding='utf-8'))
+            figures = [e for e in elements if e.get('type') == 'figure']
+            result['figures'] = [{
+                **fig,
+                'image_url': make_url(fig.get('image_path')) if fig.get('image_path') else None
+            } for fig in figures]
+            tables = [e for e in elements if e.get('type') == 'table']
+            result['tables'] = [{
+                **tab,
+                'image_url': make_url(tab.get('image_path')) if tab.get('image_path') else None
+            } for tab in tables]
+            result['summary'] = {
+                'figures_count': len(figures),
+                'tables_count': len(tables),
+                'elements_count': len(elements)
+            }
+        return result
     except Exception as e:
+        logger.error(f"Error in predict: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/api/delete", tags=["Processing"])
+async def delete_pdf(stem: str = Form(...)):
+    """Delete a processed PDF and its output directory."""
+    if not stem:
+        raise HTTPException(status_code=400, detail="Missing stem")
+    # Resolve output directory safely
+    output_root = OUTPUT_FOLDER.resolve()
+    target_dir = (output_root / stem).resolve()
+    # Prevent path traversal
+    if output_root not in target_dir.parents and target_dir != output_root:
+         raise HTTPException(status_code=400, detail="Invalid stem path")
+    if not target_dir.exists() or not target_dir.is_dir():
+        raise HTTPException(status_code=404, detail="Not found")
     try:
+        shutil.rmtree(target_dir)
+        return {"status": "success", "message": f"Deleted {stem}"}
     except Exception as e:
+        # Try to fix read-only files (common on Windows)
+        try:
+            import stat
+            def on_rm_error(func, path, exc_info):
+                os.chmod(path, stat.S_IWRITE)
+                func(path)
+            shutil.rmtree(target_dir, onerror=on_rm_error)
+            return {"status": "success", "message": f"Deleted {stem}"}
+        except Exception as e2:
+            logger.error(f"Error deleting {stem}: {e2}")
+            raise HTTPException(status_code=500, detail=f"Failed to delete: {str(e2)}")
+# --------------------------------------------------------------------------------
+# Gradio Interface
+# --------------------------------------------------------------------------------
+def gradio_process(pdf_file, mode_str):
+    """
+    Wrapper for Gradio to call the extractor logic.
+    """
+    if pdf_file is None:
+        return None, None, None, "No file uploaded."
+    try:
+        # Create unique directory
+        filename = secure_filename(Path(pdf_file.name).name)
+        stem = Path(filename).stem
+        unique_id = f"{stem}_{int(time.time())}"
+        output_dir = OUTPUT_FOLDER / unique_id
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Copy file
+        dest_path = output_dir / filename
+        shutil.copy(pdf_file.name, dest_path)
+        # Determine flags
+        include_images = (mode_str != "markdown")
+        include_markdown = (mode_str != "images")
+        # Process using the multiprocessing pool for speed
+        # The global pool is already initialized in lifespan
+        extractor.USE_MULTIPROCESSING = False
+        extractor.process_pdf_with_pool(
+            dest_path,
+            output_dir,
+            pool=None,  # Use the global pool instead of None
+            extract_images=include_images,
+            extract_markdown=include_markdown
+        )
+        # Collect outputs
+        md_text = ""
+        md_path = output_dir / f"{stem}.md"
+        if md_path.exists():
+            md_text = md_path.read_text(encoding='utf-8')
+        annotated_pdf = None
+        pdf_layout_path = output_dir / f"{stem}_layout.pdf"
+        if pdf_layout_path.exists():
+            annotated_pdf = str(pdf_layout_path)
+        gallery = []
+        if include_images:
+            fig_dir = output_dir / 'figures'
+            if fig_dir.exists():
+                gallery.extend([str(p) for p in fig_dir.glob('*.png')])
+            tab_dir = output_dir / 'tables'
+            if tab_dir.exists():
+                gallery.extend([str(p) for p in tab_dir.glob('*.png')])
+        return md_text, gallery, annotated_pdf, f"Processed {filename} successfully."
+    except Exception as e:
+        logger.error(f"Gradio Error: {e}")
+        return str(e), None, None, f"Error: {e}"
+# Define Gradio App
+# Build the Gradio UI; the Blocks object is bound to `demo`.
+with gr.Blocks(title="PDF Layout Extractor") as demo:
+    # Page heading and a short usage hint.
+    gr.Markdown("# PDF Layout Extractor")
+    gr.Markdown("Upload a PDF to extract text (Markdown), figures, tables, and visualization.")
+    # First row: inputs in the first column, status and Markdown in the second.
+    with gr.Row():
+        with gr.Column():
+            input_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
+            # These three values are the mode strings gradio_process compares against.
+            mode_input = gr.Radio(["both", "images", "markdown"], label="Extraction Mode", value="both")
+            process_btn = gr.Button("Extract Layout", variant="primary")
+        with gr.Column():
+            status_msg = gr.Textbox(label="Status", interactive=False)
+            output_md = gr.Code(label="Extracted Simple Markdown", language="markdown")
+    # Second row: the annotated PDF download and the extracted image gallery.
+    with gr.Row():
+        output_pdf = gr.File(label="Annotated PDF Layout")
+        output_gallery = gr.Gallery(label="Extracted Images (Figures/Tables)")
+    # The outputs list follows the order of gradio_process's return tuple:
+    # (markdown text, gallery paths, annotated PDF path, status message).
+    process_btn.click(
+        fn=gradio_process,
+        inputs=[input_pdf, mode_input],
+        outputs=[output_md, output_gallery, output_pdf, status_msg]
+    )
+    # Enable queueing for better stability and performance on Spaces
+    # (queue() is given default_concurrency_limit=5).
+    demo.queue(default_concurrency_limit=5)
+# --------------------------------------------------------------------------------
+# Integrate Gradio with FastAPI
+# --------------------------------------------------------------------------------
+# Mount Gradio at /gradio path (this ensures static files work correctly).
+# `app` is rebound to whatever mount_gradio_app returns, so every later
+# reference to `app` uses the combined application.
+# NOTE(review): allowed_paths presumably lets Gradio serve files from
+# ./output and ./uploads (e.g. the gallery images) - confirm against the
+# Gradio documentation.
+app = gr.mount_gradio_app(
+    app,
+    demo,
+    path="/gradio",
+    allowed_paths=["./output", "./uploads"],
+    ssr_mode=False
+)
+# Redirect root to Gradio interface
+@app.get("/", response_class=HTMLResponse, include_in_schema=False)
+async def root_redirect():
+    """Redirect to Gradio interface."""
+    return HTMLResponse('<meta http-equiv="refresh" content="0; url=/gradio/" />')
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)