saifisvibin committed on
Commit
fd2b9fb
Β·
verified Β·
1 Parent(s): 93b8ce2

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +845 -0
  2. main.py +1309 -0
  3. packages.txt +0 -0
  4. requirements.txt +16 -0
app.py ADDED
@@ -0,0 +1,845 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard library
import json
import multiprocessing
import os
import re
import shutil
import threading
import time
import uuid
from contextlib import asynccontextmanager
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

# Third-party
import gradio as gr
import torch
from fastapi import (
    BackgroundTasks,
    FastAPI,
    File,
    Form,
    HTTPException,
    Request,
    UploadFile,
)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from loguru import logger
from pydantic import BaseModel, Field

# Local
import main as extractor

# --------------------------------------------------------------------------------
# CONFIGURATION
# --------------------------------------------------------------------------------

# Advisory only: FastAPI does not enforce a request-body size limit by itself.
MAX_CONTENT_LENGTH = 500 * 1024 * 1024
UPLOAD_FOLDER = Path('./uploads')
OUTPUT_FOLDER = Path('./output')

UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Lazily-initialized global model instance (see load_model_once).
_model = None
# task_id -> progress state dict; every access must hold _progress_lock.
_progress_tracker: Dict[str, Dict] = {}
_progress_lock = threading.RLock()
# Global process pool, created in the lifespan handler (None on failure).
_pool = None
def secure_filename(filename: str) -> str:
    """Return a filesystem-safe version of *filename*.

    Strips any directory components, then replaces every character that is
    not alphanumeric, a dot, a hyphen, or an underscore with '_'.
    Minimal stand-in for werkzeug.utils.secure_filename.
    """
    base_name = Path(filename).name
    return re.sub(r'[^a-zA-Z0-9_.-]', '_', base_name)
def get_device_info() -> Dict[str, Any]:
    """Report which compute device (GPU or CPU) is available to PyTorch."""
    has_cuda = torch.cuda.is_available()

    # When no GPU is present, device_name stays None and device_count 0.
    info: Dict[str, Any] = {
        "device": "cuda" if has_cuda else "cpu",
        "cuda_available": has_cuda,
        "device_name": torch.cuda.get_device_name(0) if has_cuda else None,
        "device_count": torch.cuda.device_count() if has_cuda else 0,
    }
    return info
def load_model_once():
    """Return the cached DocLayout-YOLO model, loading it on first call."""
    global _model
    if _model is not None:
        return _model
    logger.info("Loading DocLayout-YOLO model...")
    _model = extractor.get_model()
    logger.info("Model loaded successfully")
    return _model
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Startup/shutdown hook for the FastAPI application.

    On startup: force the 'spawn' start method (safer with PyTorch/CUDA)
    and create the shared multiprocessing pool. On shutdown: drain the pool.
    """
    global _pool
    logger.info("Starting up PDF Layout Extractor...")

    # 'spawn' avoids forking a process that may already hold CUDA state.
    try:
        multiprocessing.set_start_method('spawn', force=True)
    except RuntimeError:
        pass  # start method was already configured

    try:
        # Leave one core free for the event loop / web server.
        worker_count = max(1, multiprocessing.cpu_count() - 1)
        logger.info(f"Initializing background process pool with {worker_count} workers...")
        _pool = multiprocessing.Pool(processes=worker_count, initializer=extractor.init_worker)
    except Exception as exc:
        # Non-fatal: downstream code falls back to serial processing while
        # _pool remains None.
        logger.error(f"Failed to initialize pool: {exc}")

    yield

    logger.info("Shutting down PDF Layout Extractor...")
    if _pool:
        _pool.close()
        _pool.join()
# Application instance; `lifespan` manages the worker-pool lifecycle.
app = FastAPI(
    title="PDF Layout Extractor API",
    description="A polished API for extracting layout information (text, tables, figures) from PDFs using DocLayout-YOLO.",
    version="1.0.0",
    lifespan=lifespan
)

# Enable CORS.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive; acceptable for a public demo Space, tighten for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve the output directory as static files so generated images, PDFs and
# markdown are directly downloadable under /output/... URLs.
app.mount("/output", StaticFiles(directory="output"), name="output")
# --------------------------------------------------------------------------------
# Pydantic Models for Response Documentation
# --------------------------------------------------------------------------------

class DeviceInfo(BaseModel):
    device: str = Field(..., description="Compute device being used (e.g., 'cuda' or 'cpu').")
    cuda_available: bool = Field(..., description="Whether CUDA GPU acceleration is available.")
    device_name: Optional[str] = Field(None, description="Name of the GPU if available.")
    device_count: int = Field(..., description="Number of GPU devices detected.")

class TaskStartResponse(BaseModel):
    task_id: str = Field(..., description="Unique identifier for the background processing task.")
    message: str = Field(..., description="Status message confirming start.")
    total_files: int = Field(..., description="Number of PDF files accepted for processing.")

class ProcessingResult(BaseModel):
    filename: str = Field(..., description="Name of the processed file.")
    stem: Optional[str] = Field(None, description="Filename without extension.")
    output_dir: Optional[str] = Field(None, description="Relative path to the output directory.")
    figures_count: Optional[int] = Field(0, description="Total figures detected.")
    tables_count: Optional[int] = Field(0, description="Total tables detected.")
    elements_count: Optional[int] = Field(0, description="Total layout elements (text, tables, figures).")
    annotated_pdf: Optional[str] = Field(None, description="Path to the PDF with layout bounding boxes drawn.")
    markdown_path: Optional[str] = Field(None, description="Path to the extracted markdown file.")
    # Extended URLs — computed per request in get_progress, not persisted.
    annotated_pdf_url: Optional[str] = Field(None, description="Full URL to access the annotated PDF.")
    markdown_url: Optional[str] = Field(None, description="Full URL to access the extracted markdown.")
    figure_urls: Optional[List[Dict[str, Any]]] = Field(None, description="List of URLs for extracted figure images.")
    table_urls: Optional[List[Dict[str, Any]]] = Field(None, description="List of URLs for extracted table images.")
    error: Optional[str] = Field(None, description="Error message if processing failed.")

# String-valued enum so values compare equal to plain strings
# (process_file_background_task compares against 'markdown'/'images').
class ExtractionMode(str, Enum):
    images = "images"
    markdown = "markdown"
    both = "both"

class ProgressResponse(BaseModel):
    status: str = Field(..., description="Current status of the task (e.g., 'processing', 'completed').")
    progress: int = Field(..., description="Overall progress percentage (0-100).")
    message: str = Field(..., description="Current status message.")
    results: List[ProcessingResult] = Field([], description="List of results for processed files.")
    file_progress: Optional[Dict[str, int]] = Field(None, description="Progress percentage per file.")

class PDFInfo(BaseModel):
    stem: str = Field(..., description="Unique identifier/stem of the PDF.")
    output_dir: str = Field(..., description="Directory where results are stored.")

class PDFListResponse(BaseModel):
    pdfs: List[PDFInfo] = Field(..., description="List of processed PDFs available on the server.")

# --------------------------------------------------------------------------------
# Helper Functions
# --------------------------------------------------------------------------------
def _update_task_progress(task_id: str, filename: str, file_progress: int, message: str):
    """Record per-file progress and refresh the task's overall percentage.

    Overall progress is the arithmetic mean of all per-file values.
    Silently returns when *task_id* is unknown. Thread-safe via _progress_lock.
    """
    with _progress_lock:
        task = _progress_tracker.get(task_id)
        if task is None:
            return

        per_file = task.setdefault('file_progress', {})
        per_file[filename] = file_progress

        # Overall = average of every file seen so far.
        if per_file:
            task['progress'] = int(sum(per_file.values()) / len(per_file))

        task['message'] = message
def process_file_background_task(task_id: str, file_data: bytes, filename: str, extraction_mode: str):
    """
    Process a single uploaded PDF in the background.

    Runs inside the Starlette thread pool (fastapi.BackgroundTasks); the heavy
    per-page work is delegated to the shared process pool when available. For
    truly heavy CPU/GPU workloads a dedicated queue (e.g. Celery) would be
    preferable, but this matches the existing design.

    Side effects: writes the PDF and extraction results under OUTPUT_FOLDER,
    and appends a result (or error) entry to _progress_tracker[task_id].

    NOTE(review): the progress/log f-strings below contain the literal text
    "(unknown)" with no placeholder — presumably the filename was meant to be
    interpolated; confirm against the intended messages.
    """
    filename = secure_filename(filename)

    try:
        _update_task_progress(task_id, filename, 5, f'Processing (unknown)...')

        stem = Path(filename).stem
        # 'both' enables both flags; 'images'/'markdown' disable the other.
        include_images = extraction_mode != 'markdown'
        include_markdown = extraction_mode != 'images'

        # Persist the uploaded bytes to the staging area first.
        upload_dir = UPLOAD_FOLDER
        upload_path = upload_dir / filename
        upload_path.write_bytes(file_data)

        _update_task_progress(task_id, filename, 15, f'Saved (unknown), preparing output...')

        # Per-document output directory, keyed by the filename stem.
        output_dir = OUTPUT_FOLDER / stem
        output_dir.mkdir(parents=True, exist_ok=True)

        # Move the staged upload into the output directory, replacing any
        # leftover from a previous run of the same filename.
        pdf_path = output_dir / filename
        if pdf_path.exists():
            pdf_path.unlink()
        upload_path.rename(pdf_path)

        _update_task_progress(task_id, filename, 25, f'Loading model and processing (unknown)...')

        # Enable multiprocessing so the worker pool (not this thread) does the
        # heavy lifting; workers lazily load the model via init_worker.
        extractor.USE_MULTIPROCESSING = True
        logger.info(f"Processing (unknown) (images={include_images}, markdown={include_markdown})")

        _update_task_progress(task_id, filename, 30, f'Extracting content from (unknown)...')

        # If _pool is None (pool init failed), main.py falls back to serial
        # processing — blocking this thread but still completing.
        extractor.process_pdf_with_pool(
            pdf_path,
            output_dir,
            pool=_pool,
            extract_images=include_images,
            extract_markdown=include_markdown,
        )

        _update_task_progress(task_id, filename, 85, f'Collecting results for (unknown)...')

        # Collect results: the extractor writes a JSON element list alongside
        # the annotated PDF and markdown.
        json_path = output_dir / f"{stem}_content_list.json"
        elements = []
        if include_images and json_path.exists():
            text_content = json_path.read_text(encoding='utf-8')
            if text_content.strip():
                elements = json.loads(text_content)

        # Annotated-layout PDF (only produced in image mode).
        annotated_pdf = None
        if include_images:
            candidate_pdf = output_dir / f"{stem}_layout.pdf"
            if candidate_pdf.exists():
                annotated_pdf = str(candidate_pdf.relative_to(OUTPUT_FOLDER))

        # Extracted markdown (only produced in markdown mode).
        markdown_path = None
        if include_markdown:
            candidate_md = output_dir / f"{stem}.md"
            if candidate_md.exists():
                markdown_path = str(candidate_md.relative_to(OUTPUT_FOLDER))

        figures = [e for e in elements if e.get('type') == 'figure']
        tables = [e for e in elements if e.get('type') == 'table']

        # Result entry stored in the tracker; paths are relative to OUTPUT_FOLDER.
        result = {
            'filename': filename,
            'stem': stem,
            'output_dir': str(output_dir.relative_to(OUTPUT_FOLDER)),
            'figures_count': len(figures),
            'tables_count': len(tables),
            'elements_count': len(elements),
            'annotated_pdf': annotated_pdf,
            'markdown_path': markdown_path,
            'include_images': include_images,
            'include_markdown': include_markdown,
        }

        with _progress_lock:
            # Mark this file 100% and recompute the overall average.
            if 'file_progress' not in _progress_tracker[task_id]:
                _progress_tracker[task_id]['file_progress'] = {}
            _progress_tracker[task_id]['file_progress'][filename] = 100

            file_progresses = _progress_tracker[task_id]['file_progress']
            if file_progresses:
                total_prog = sum(file_progresses.values()) / len(file_progresses)
                _progress_tracker[task_id]['progress'] = int(total_prog)

            _progress_tracker[task_id]['results'].append(result)
            _progress_tracker[task_id]['message'] = f'Completed processing (unknown)'

            # Task is complete once every file has either a result or an error.
            total_files = _progress_tracker[task_id].get('total_files', 1)
            completed_count = len([r for r in _progress_tracker[task_id]['results'] if 'error' not in r])
            error_count = len([r for r in _progress_tracker[task_id]['results'] if 'error' in r])

            if completed_count + error_count >= total_files:
                _progress_tracker[task_id]['status'] = 'completed'
                _progress_tracker[task_id]['progress'] = 100
                _progress_tracker[task_id]['message'] = f'All {total_files} file(s) processed.'

    except Exception as e:
        logger.error(f"Error processing (unknown): {e}")
        import traceback
        logger.error(traceback.format_exc())
        with _progress_lock:
            # Record the failure so the frontend can display it per file.
            _progress_tracker[task_id]['results'].append({
                'filename': filename,
                'error': str(e)
            })
            # If this was the last pending file, mark the task completed so
            # the frontend stops polling even though errors occurred.
            total_files = _progress_tracker[task_id].get('total_files', 1)
            if len(_progress_tracker[task_id]['results']) >= total_files:
                _progress_tracker[task_id]['status'] = 'completed'
                _progress_tracker[task_id]['message'] = f'Finished with errors.'


# --------------------------------------------------------------------------------
# Routes
# --------------------------------------------------------------------------------
@app.get("/api/docs", response_class=HTMLResponse, tags=["UI"], include_in_schema=False)
async def api_docs_redirect():
    """Redirect legacy /api/docs to the auto-generated Swagger UI at /docs."""
    # Meta-refresh redirect keeps this endpoint a plain GET with no 3xx status.
    return HTMLResponse(
        """
        <html>
        <head>
            <meta http-equiv="refresh" content="0; url=/docs" />
        </head>
        <body>
            <p>Redirecting to <a href="/docs">/docs</a>...</p>
        </body>
        </html>
        """
    )
@app.get("/api/device-info", response_model=DeviceInfo, tags=["System"])
async def device_info_endpoint():
    """Report the compute device (CPU/GPU) the server will use."""
    device_report = get_device_info()
    return device_report
@app.post("/api/upload", response_model=TaskStartResponse, tags=["Processing"])
async def upload_files(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    extraction_mode: ExtractionMode = Form(ExtractionMode.images, description="Select extraction mode: 'images' (figures/tables), 'markdown' (text), or 'both'.")
):
    """
    Upload one or more PDF files for background processing.

    Returns a task_id that can be polled via /api/progress/{task_id}.
    Raises HTTP 400 when no files — or no PDF files — are supplied.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided")

    # UploadFile.filename may be None for malformed multipart parts —
    # guard before calling .lower() (previously an AttributeError).
    pdf_files = [f for f in files if f.filename and f.filename.lower().endswith('.pdf')]
    if not pdf_files:
        raise HTTPException(status_code=400, detail="No valid PDF files selected")

    task_id = str(uuid.uuid4())

    # Register the task before scheduling any work so polling never 404s.
    with _progress_lock:
        _progress_tracker[task_id] = {
            'status': 'processing',
            'progress': 0,
            'message': 'Starting upload...',
            'results': [],
            'total_files': len(pdf_files)
        }

    # Read each file fully into memory to hand to the background task
    # (UploadFile is a stream tied to this request's lifetime).
    # NOTE: large uploads are buffered in RAM; spill to temp files if needed.
    for file in pdf_files:
        content = await file.read()
        background_tasks.add_task(
            process_file_background_task,
            task_id,
            content,
            file.filename,
            extraction_mode
        )

    return {
        "task_id": task_id,
        "message": "Processing started",
        "total_files": len(pdf_files)
    }
@app.get("/api/progress/{task_id}", response_model=ProgressResponse, tags=["Processing"])
async def get_progress(task_id: str, request: Request):
    """Check the progress of a processing task.

    Returns the tracker state for *task_id* enriched with absolute URLs for
    the annotated PDF, markdown, and any extracted figure/table images.
    Raises HTTP 404 when the task id is unknown.
    """
    with _progress_lock:
        progress = _progress_tracker.get(task_id)
        if not progress:
            raise HTTPException(status_code=404, detail="Task not found")

        # Shallow copy so the computed URLs below never leak into the
        # stored tracker state.
        response_data = progress.copy()

    # Build absolute URLs from the request; behind an HTTPS proxy (or on
    # hf.space) force the https scheme.
    base_url = str(request.base_url).rstrip('/')
    if 'hf.space' in base_url or request.headers.get("x-forwarded-proto") == "https":
        base_url = base_url.replace("http://", "https://")

    results_with_urls = []
    for res in response_data.get('results', []):
        res_copy = res.copy()

        # Map an OUTPUT_FOLDER-relative path to its /output/ static URL.
        def make_url(rel_path):
            if not rel_path: return None
            # Normalize Windows separators for URLs.
            clean_path = str(rel_path).replace('\\', '/')
            return f"{base_url}/output/{clean_path}"

        res_copy['annotated_pdf_url'] = make_url(res.get('annotated_pdf'))
        res_copy['markdown_url'] = make_url(res.get('markdown_path'))

        # Figure/table URLs are rediscovered from the on-disk JSON each poll;
        # heavier than caching but always fresh.
        stem = res.get('stem')
        if stem:
            output_dir = OUTPUT_FOLDER / stem
            if output_dir.exists():
                json_files = list(output_dir.glob('*_content_list.json'))
                if json_files:
                    try:
                        elements = json.loads(json_files[0].read_text(encoding='utf-8'))
                        figures = [e for e in elements if e.get('type') == 'figure']
                        tables = [e for e in elements if e.get('type') == 'table']

                        fig_urls = []
                        for fig in figures:
                            if fig.get('image_path'):
                                path = Path(fig['image_path'])  # NOTE: unused; kept as-is
                                # image_path in the JSON is relative to the stem
                                # folder (e.g. "figures/page_1_fig_0.png"), so
                                # prefix the stem to make it OUTPUT-relative.
                                full_rel_path = f"{stem}/{fig['image_path']}"
                                fig_urls.append({
                                    "page": fig.get('page'),
                                    "url": make_url(full_rel_path),
                                    "path": full_rel_path
                                })
                        res_copy['figure_urls'] = fig_urls

                        tab_urls = []
                        for tab in tables:
                            if tab.get('image_path'):
                                full_rel_path = f"{stem}/{tab['image_path']}"
                                tab_urls.append({
                                    "page": tab.get('page'),
                                    "url": make_url(full_rel_path),
                                    "path": full_rel_path
                                })
                        res_copy['table_urls'] = tab_urls

                    except Exception as e:
                        # Best-effort enrichment: log and return the bare result.
                        logger.error(f"Error reading details for {stem}: {e}")

        results_with_urls.append(res_copy)

    response_data['results'] = results_with_urls
    return response_data
@app.get("/api/pdf-list", response_model=PDFListResponse, tags=["Retrieval"])
async def pdf_list():
    """List previously processed PDFs found under the output directory."""
    pdfs = []
    root = OUTPUT_FOLDER

    if root.exists():
        for entry in root.iterdir():
            if not entry.is_dir():
                continue
            # A directory counts as a processed PDF when it holds either a
            # content-list JSON or a markdown file.
            has_results = list(entry.glob('*_content_list.json')) or list(entry.glob('*.md'))
            if has_results:
                pdfs.append({
                    'stem': entry.name,
                    'output_dir': entry.name  # name doubles as the relative dir
                })
    return {'pdfs': pdfs}
@app.get("/api/pdf-details/{pdf_stem}", tags=["Retrieval"])
async def pdf_details(pdf_stem: str, request: Request):
    """Get detailed information about a processed PDF.

    Discovers all artifacts (content JSON, annotated PDF, markdown, figure and
    table PNGs) under OUTPUT_FOLDER/<pdf_stem> and returns them with URLs.
    Raises HTTP 404 when the directory does not exist.

    NOTE(review): pdf_stem comes straight from the URL and is joined onto
    OUTPUT_FOLDER without a traversal check — confirm whether it needs the
    same sanitization as /api/delete.
    """
    output_dir = OUTPUT_FOLDER / pdf_stem

    if not output_dir.exists():
        raise HTTPException(status_code=404, detail="PDF not found")

    # Force https behind a proxy / on hf.space.
    base_url = str(request.base_url).rstrip('/')
    if 'hf.space' in base_url or request.headers.get("x-forwarded-proto") == "https":
        base_url = base_url.replace("http://", "https://")

    # Map an OUTPUT_FOLDER-relative path to its /output/ static URL.
    def make_url(rel_path):
        if not rel_path: return None
        clean_path = str(rel_path).replace('\\', '/')
        return f"{base_url}/output/{clean_path}"

    # Load the element list produced by the extractor, if present.
    json_files = list(output_dir.glob('*_content_list.json'))
    elements = []
    if json_files:
        elements = json.loads(json_files[0].read_text(encoding='utf-8'))

    figures = [e for e in elements if e.get('type') == 'figure']
    tables = [e for e in elements if e.get('type') == 'table']

    # Annotated layout PDF (first match wins).
    annotated_pdf = None
    pdf_files = list(output_dir.glob('*_layout.pdf'))
    if pdf_files:
        annotated_pdf = f"{pdf_stem}/{pdf_files[0].name}"

    # Extracted markdown (first match wins).
    markdown_path = None
    md_files = list(output_dir.glob('*.md'))
    if md_files:
        markdown_path = f"{pdf_stem}/{md_files[0].name}"

    # Figure/table image paths, relative to OUTPUT_FOLDER, sorted for stability.
    figure_images = []
    fig_dir = output_dir / 'figures'
    if fig_dir.exists():
        figure_images = [f"{pdf_stem}/figures/{f.name}" for f in sorted(fig_dir.glob('*.png'))]

    table_images = []
    tab_dir = output_dir / 'tables'
    if tab_dir.exists():
        table_images = [f"{pdf_stem}/tables/{f.name}" for f in sorted(tab_dir.glob('*.png'))]

    return {
        'stem': pdf_stem,
        'figures': figures,
        'tables': tables,
        'figures_count': len(figures),
        'tables_count': len(tables),
        'elements_count': len(elements),
        'annotated_pdf': annotated_pdf,
        'markdown_path': markdown_path,
        'figure_images': figure_images,
        'table_images': table_images,
        'urls': {
            'annotated_pdf': make_url(annotated_pdf),
            'markdown': make_url(markdown_path),
            'figures': [make_url(img) for img in figure_images],
            'tables': [make_url(img) for img in table_images],
        }
    }
@app.post("/api/predict", tags=["Legacy"], include_in_schema=True)
async def predict(
    file: UploadFile = File(...),
    request: Request = None
):
    """
    Direct API endpoint for extracting text/tables/figures from a single PDF.

    Unlike /api/upload this is synchronous: it processes serially in-request
    (no pool) and returns the full JSON result when done. Raises HTTP 400 for
    non-PDF uploads and HTTP 500 on processing failure.
    """
    if not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    # Timestamped directory so repeated uploads of the same file never collide.
    filename = secure_filename(file.filename)
    stem = Path(filename).stem
    unique_id = f"{stem}_{int(time.time())}"
    output_dir = OUTPUT_FOLDER / unique_id
    output_dir.mkdir(parents=True, exist_ok=True)

    # Persist the upload next to where results will be written.
    pdf_path = output_dir / filename
    content = await file.read()
    pdf_path.write_bytes(content)

    try:
        # Synchronous path: load the model in-process and run serially.
        load_model_once()
        extractor.USE_MULTIPROCESSING = False

        extractor.process_pdf_with_pool(
            pdf_path,
            output_dir,
            pool=None,
            extract_images=True,
            extract_markdown=True,
        )

        # Build absolute URLs; force https behind a proxy / on hf.space.
        base_url = str(request.base_url).rstrip('/')
        if 'hf.space' in base_url or request.headers.get("x-forwarded-proto") == "https":
            base_url = base_url.replace("http://", "https://")

        # Paths in the JSON are relative to this run's unique output dir.
        def make_url(rel_path):
            return f"{base_url}/output/{unique_id}/{rel_path}"

        result = {
            "status": "success",
            "filename": filename,
            "text": "",
            "tables": [],
            "figures": [],
            "summary": {}
        }

        # Extracted markdown text, when produced.
        md_path = output_dir / f"{stem}.md"
        if md_path.exists():
            result['text'] = md_path.read_text(encoding='utf-8')

        # Element list: split into figures/tables and attach image URLs.
        json_path = output_dir / f"{stem}_content_list.json"
        if json_path.exists():
            elements = json.loads(json_path.read_text(encoding='utf-8'))

            figures = [e for e in elements if e.get('type') == 'figure']
            result['figures'] = [{
                **fig,
                'image_url': make_url(fig.get('image_path')) if fig.get('image_path') else None
            } for fig in figures]

            tables = [e for e in elements if e.get('type') == 'table']
            result['tables'] = [{
                **tab,
                'image_url': make_url(tab.get('image_path')) if tab.get('image_path') else None
            } for tab in tables]

            result['summary'] = {
                'figures_count': len(figures),
                'tables_count': len(tables),
                'elements_count': len(elements)
            }

        return result

    except Exception as e:
        logger.error(f"Error in predict: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/delete", tags=["Processing"])
async def delete_pdf(stem: str = Form(...)):
    """Delete a processed PDF and its output directory.

    Raises HTTP 400 for an empty or path-traversing stem, 404 when the
    directory does not exist, and 500 when deletion ultimately fails.
    """
    if not stem:
        raise HTTPException(status_code=400, detail="Missing stem")

    # Resolve the target under the output root.
    output_root = OUTPUT_FOLDER.resolve()
    target_dir = (output_root / stem).resolve()

    # Prevent path traversal AND refuse the root itself: the old check
    # (`... and target_dir != output_root`) let stem="." resolve to the
    # output root and would have deleted every result.
    if output_root not in target_dir.parents:
        raise HTTPException(status_code=400, detail="Invalid stem path")

    if not target_dir.exists() or not target_dir.is_dir():
        raise HTTPException(status_code=404, detail="Not found")

    try:
        shutil.rmtree(target_dir)
        return {"status": "success", "message": f"Deleted {stem}"}
    except Exception:
        # Retry once, clearing the read-only bit first (common on Windows).
        try:
            import stat

            def on_rm_error(func, path, exc_info):
                os.chmod(path, stat.S_IWRITE)
                func(path)

            shutil.rmtree(target_dir, onerror=on_rm_error)
            return {"status": "success", "message": f"Deleted {stem}"}
        except Exception as e2:
            logger.error(f"Error deleting {stem}: {e2}")
            raise HTTPException(status_code=500, detail=f"Failed to delete: {str(e2)}") from e2
# --------------------------------------------------------------------------------
# Gradio Interface
# --------------------------------------------------------------------------------

def gradio_process(pdf_file, mode_str):
    """
    Wrapper for Gradio to call the extractor logic synchronously.

    Returns a 4-tuple matching the UI outputs: (markdown text, image gallery
    paths, annotated PDF path, status message). On error, the exception text
    is returned in both the markdown and status slots.
    """
    if pdf_file is None:
        return None, None, None, "No file uploaded."

    try:
        # Timestamped directory so repeat runs of the same file never collide.
        filename = secure_filename(Path(pdf_file.name).name)
        stem = Path(filename).stem
        unique_id = f"{stem}_{int(time.time())}"
        output_dir = OUTPUT_FOLDER / unique_id
        output_dir.mkdir(parents=True, exist_ok=True)

        # Copy Gradio's temp file into the output directory.
        dest_path = output_dir / filename
        shutil.copy(pdf_file.name, dest_path)

        # 'both' enables both flags; 'images'/'markdown' disable the other.
        include_images = (mode_str != "markdown")
        include_markdown = (mode_str != "images")

        # Gradio callbacks run in a thread, so process serially in-process.
        load_model_once()
        extractor.USE_MULTIPROCESSING = False

        extractor.process_pdf_with_pool(
            dest_path,
            output_dir,
            pool=None,
            extract_images=include_images,
            extract_markdown=include_markdown
        )

        # Collect outputs written by the extractor.
        md_text = ""
        md_path = output_dir / f"{stem}.md"
        if md_path.exists():
            md_text = md_path.read_text(encoding='utf-8')

        annotated_pdf = None
        pdf_layout_path = output_dir / f"{stem}_layout.pdf"
        if pdf_layout_path.exists():
            annotated_pdf = str(pdf_layout_path)

        # Gallery combines figure and table crops.
        gallery = []
        if include_images:
            fig_dir = output_dir / 'figures'
            if fig_dir.exists():
                gallery.extend([str(p) for p in fig_dir.glob('*.png')])
            tab_dir = output_dir / 'tables'
            if tab_dir.exists():
                gallery.extend([str(p) for p in tab_dir.glob('*.png')])

        return md_text, gallery, annotated_pdf, f"Processed (unknown) successfully."

    except Exception as e:
        logger.error(f"Gradio Error: {e}")
        return str(e), None, None, f"Error: {e}"
# Define the Gradio UI: file + mode inputs on the left, status/markdown on the
# right, annotated PDF and image gallery below. Wired to gradio_process.
with gr.Blocks(title="PDF Layout Extractor") as demo:
    gr.Markdown("# PDF Layout Extractor")
    gr.Markdown("Upload a PDF to extract text (Markdown), figures, tables, and visualization.")

    with gr.Row():
        with gr.Column():
            input_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
            mode_input = gr.Radio(["both", "images", "markdown"], label="Extraction Mode", value="both")
            process_btn = gr.Button("Extract Layout", variant="primary")

        with gr.Column():
            status_msg = gr.Textbox(label="Status", interactive=False)
            output_md = gr.Code(label="Extracted Simple Markdown", language="markdown")

    with gr.Row():
        output_pdf = gr.File(label="Annotated PDF Layout")
        output_gallery = gr.Gallery(label="Extracted Images (Figures/Tables)")

    # Output order must match gradio_process's return tuple.
    process_btn.click(
        fn=gradio_process,
        inputs=[input_pdf, mode_input],
        outputs=[output_md, output_gallery, output_pdf, status_msg]
    )
# Mount the Gradio UI at the application root. The API routes registered above
# (/api/*, /output/*, /docs) were added to the FastAPI app first and keep
# precedence; gr.mount_gradio_app wraps and returns the same app instance.
# HF Spaces expects the primary UI at "/".
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn
    # reload=True is a development convenience; disable it in production.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
main.py ADDED
@@ -0,0 +1,1309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import signal
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import List, Dict, Tuple, Optional, Sequence, Set, Any
7
+ from multiprocessing import Pool, cpu_count
8
+ from functools import partial
9
+
10
+ import fitz # PyMuPDF (Still needed for drawing output PDF)
11
+ import pypdfium2 as pdfium
12
+ import torch
13
+ from doclayout_yolo import YOLOv10
14
+ from huggingface_hub import hf_hub_download
15
+ from loguru import logger
16
+ from PIL import Image
17
+ import numpy as np
18
+
19
+ try:
20
+ import pymupdf4llm # type: ignore
21
+ except ImportError: # pragma: no cover - optional dependency
22
+ pymupdf4llm = None # type: ignore
23
+
24
# ----------------------------------------------------------------------
# CONFIGURATION
# ----------------------------------------------------------------------
# Inference device: prefer CUDA when available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Model options
MODEL_SIZE = 1024  # square input size (pixels) matching the weight file name below
REPO_ID = "juliozhao/DocLayout-YOLO-DocStructBench"
WEIGHTS_FILE = f"doclayout_yolo_docstructbench_imgsz{MODEL_SIZE}.pt"

# Detection settings
CONF_THRESHOLD = 0.25  # minimum YOLO confidence for a detection to be kept

# Multiprocessing settings
NUM_WORKERS = None  # None = auto (cpu_count - 1), or set to specific number like 4
USE_MULTIPROCESSING = True  # Set to False to disable parallel processing entirely

# ----------------------------------------------------------------------
# Color map for the layout classes
# ----------------------------------------------------------------------
# RGB colors used when drawing annotated bounding boxes per detected class.
CLASS_COLORS = {
    "text": (0, 128, 0),  # Dark Green
    "title": (192, 0, 0),  # Dark Red
    "figure": (0, 0, 192),  # Dark Blue
    "table": (218, 165, 32),  # Goldenrod (Dark Yellow)
    "list": (128, 0, 128),  # Purple
    "header": (0, 128, 128),  # Teal
    "footer": (100, 100, 100),  # Dark Gray
    "figure_caption": (0, 0, 128),  # Navy
    "table_caption": (139, 69, 19),  # Saddle Brown
    "table_footnote": (128, 0, 128),  # Purple
}

# Global model instance (will be None in worker processes until loaded)
_model = None
# Set to True by the signal handler after the first SIGINT/SIGTERM.
_shutdown_requested = False
60
+
61
+ # ----------------------------------------------------------------------
62
+ # Signal handler for graceful shutdown
63
+ # ----------------------------------------------------------------------
64
def signal_handler(signum, frame):
    """Handle SIGINT/SIGTERM: the first signal requests a graceful stop,
    a second one forces an immediate exit."""
    global _shutdown_requested
    if _shutdown_requested:
        # Second interrupt: abandon graceful shutdown and exit now.
        logger.error("\n❌ Force quit requested. Exiting immediately.")
        sys.exit(1)
    _shutdown_requested = True
    logger.warning("\n⚠️ Interrupt received! Finishing current page and shutting down gracefully...")
    logger.warning("Press Ctrl+C again to force quit (may leave incomplete files)")
74
+
75
def setup_signal_handlers():
    """Install the graceful-shutdown handler for SIGINT and SIGTERM."""
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, signal_handler)
79
+
80
+ # ----------------------------------------------------------------------
81
+ # Model loader function
82
+ # ----------------------------------------------------------------------
83
def get_model():
    """Return the process-wide YOLO model, downloading and loading it on first use.

    The loaded instance is cached in the module global ``_model`` so each
    worker process pays the load cost only once.
    """
    global _model
    if _model is not None:
        return _model
    weights_path = hf_hub_download(repo_id=REPO_ID, filename=WEIGHTS_FILE)
    _model = YOLOv10(weights_path)
    logger.info(f"βœ“ Model loaded in worker process (PID: {os.getpid()})")
    return _model
91
+
92
+ # ----------------------------------------------------------------------
93
+ # Worker initialization function
94
+ # ----------------------------------------------------------------------
95
def init_worker():
    """Initialize worker process - loads model once at startup.

    Used as the Pool initializer so detection pages never pay the model
    load cost. Re-raises on failure so the pool surfaces broken workers.
    """
    try:
        get_model()
        logger.success(f"Worker {os.getpid()} ready")
    except Exception as e:
        logger.error(f"Failed to initialize worker {os.getpid()}: {e}")
        raise
103
+
104
+ # ----------------------------------------------------------------------
105
+ # Run layout detection on a single page image (YOLO)
106
+ # ----------------------------------------------------------------------
107
def detect_page(pil_img: Image.Image) -> List[dict]:
    """Run DocLayout-YOLO on a rendered page image.

    Returns one dict per detection with keys: ``name`` (class label),
    ``bbox`` ([x0, y0, x1, y1] in pixels), ``conf``, ``source`` ("yolo"),
    and ``index`` (position within this page's detections).
    """
    model = get_model()  # already resident in worker processes
    frame = np.array(pil_img)
    results = model.predict(
        frame,
        imgsz=MODEL_SIZE,
        conf=CONF_THRESHOLD,
        device=DEVICE,
        verbose=False
    )
    page_result = results[0]
    detections = []
    for idx, box in enumerate(page_result.boxes):
        coords = box.xyxy[0].cpu().numpy().tolist()
        detections.append({
            "name": page_result.names[int(box.cls.item())],
            "bbox": coords,
            "conf": float(box.conf.item()),
            "source": "yolo",
            "index": idx,
        })
    return detections
132
+
133
+ # ----------------------------------------------------------------------
134
+ # Crop & save figure/table regions (with captions)
135
+ # ----------------------------------------------------------------------
136
def get_union_box(box1: List[float], box2: List[float]) -> List[float]:
    """Return the smallest axis-aligned box [x0, y0, x1, y1] enclosing both boxes."""
    mins = [min(a, b) for a, b in zip(box1[:2], box2[:2])]
    maxs = [max(a, b) for a, b in zip(box1[2:4], box2[2:4])]
    return mins + maxs
143
+
144
def collect_caption_elements(
    element: Dict,
    all_dets: List[Dict],
    target_name: str,
    max_vertical_gap: float = 60.0,
    min_overlap: float = 0.25,
) -> List[Dict]:
    """
    Collect contiguous caption detections directly below a figure/table.

    Args:
        element: The figure/table detection the captions belong to.
        all_dets: All detections on the same page.
        target_name: Caption class to match ("figure_caption" / "table_caption").
        max_vertical_gap: Max pixel gap allowed between stacked caption lines.
        min_overlap: Minimum horizontal-overlap ratio with the previous box.

    Returns:
        Caption detections ordered top-to-bottom (possibly empty).
    """
    base_box = element["bbox"]
    base_bottom = base_box[3]
    selected: List[Dict] = []
    last_bottom = base_bottom

    # Candidate captions must start at (or within 5px above) the bottom edge
    # of the element itself.
    relevant = [
        d for d in all_dets
        if d["name"] == target_name and d["bbox"][1] >= base_bottom - 5
    ]

    relevant.sort(key=lambda d: d["bbox"][1])

    for cand in relevant:
        cand_box = cand["bbox"]
        top = cand_box[1]
        # A large vertical gap ends the contiguous caption stack.
        if selected and top - last_bottom > max_vertical_gap:
            break

        # Require horizontal alignment with the previous caption line, or
        # with the element for the first line.
        if selected:
            overlap = _horizontal_overlap_ratio(selected[-1]["bbox"], cand_box)
        else:
            overlap = _horizontal_overlap_ratio(base_box, cand_box)

        if overlap < min_overlap:
            continue

        selected.append(cand)
        last_bottom = cand_box[3]

    return selected
184
+
185
+
186
def collect_title_and_text_segments(
    element: Dict,
    all_dets: List[Dict],
    processed_indices: Set[int],
    settings: Optional[Dict[str, float]] = None,
) -> Tuple[List[Dict], List[Dict]]:
    """
    Locate a title below the element and any contiguous text blocks directly beneath it.

    Only the FIRST title matching the gap/overlap criteria is used (the outer
    loop breaks after it); at most one title plus its trailing text blocks is
    returned.

    Args:
        element: Figure detection to anchor the search below.
        all_dets: All detections on the page.
        processed_indices: Detection indices already consumed by other merges.
        settings: Gap/overlap thresholds; defaults to TITLE_TEXT_ASSOCIATION.

    Returns:
        (titles, texts) lists of detections, each possibly empty.
    """
    if settings is None:
        settings = TITLE_TEXT_ASSOCIATION

    if not element.get("bbox"):
        return [], []

    figure_box = element["bbox"]
    figure_bottom = figure_box[3]

    # Only unconsumed detections with a bbox, sorted top-to-bottom.
    candidates = [
        d for d in all_dets
        if d.get("bbox") and d["index"] not in processed_indices
    ]
    candidates.sort(key=lambda d: d["bbox"][1])

    titles: List[Dict] = []
    texts: List[Dict] = []

    for idx, det in enumerate(candidates):
        if det["name"] != "title":
            continue

        title_box = det["bbox"]
        # Title must start below the figure (5px slack).
        if title_box[1] < figure_bottom - 5:
            continue

        vertical_gap = title_box[1] - figure_bottom
        # Candidates are sorted, so once the gap is too large no later title
        # can qualify either.
        if vertical_gap > settings["max_title_gap"]:
            break

        overlap = _horizontal_overlap_ratio(figure_box, title_box)
        if overlap < settings["min_overlap"]:
            continue

        titles.append(det)
        last_bottom = title_box[3]

        # Walk detections after the title, collecting contiguous text blocks
        # until another title, a large gap, or the end.
        for follower in candidates[idx + 1 :]:
            if follower["name"] == "title":
                break
            if follower["name"] != "text":
                continue
            text_box = follower["bbox"]
            if text_box[1] < title_box[1]:
                continue

            gap = text_box[1] - last_bottom
            if gap > settings["max_text_gap"]:
                break

            if _horizontal_overlap_ratio(title_box, text_box) < settings["min_overlap"]:
                continue

            texts.append(follower)
            last_bottom = text_box[3]

        # Only the first qualifying title is considered.
        break

    return titles, texts
254
+
255
+
256
def save_layout_elements(pil_img: Image.Image, page_num: int,
                         dets: List[dict], out_dir: Path) -> List[dict]:
    """Save figure and table crops, merging captions.

    Figures additionally absorb an associated title and its trailing text
    blocks; tables only absorb their captions. Crops are written under
    ``out_dir/figures`` and ``out_dir/tables``.

    Args:
        pil_img: Rendered page image (pixel space matches detection bboxes).
        page_num: Zero-based page index (file names/metadata use 1-based).
        dets: Detections for this page (as produced by detect_page).
        out_dir: Root output directory.

    Returns:
        One metadata dict per saved crop (type, page, bbox, paths, sizes, and
        any merged caption/title/text segments).
    """
    fig_dir = out_dir / "figures"
    tab_dir = out_dir / "tables"
    os.makedirs(fig_dir, exist_ok=True)
    os.makedirs(tab_dir, exist_ok=True)

    infos = []
    fig_count = 0
    tab_count = 0

    # Detections merged into another element are skipped as primaries.
    processed_indices = set()

    for i, d in enumerate(dets):
        if d["index"] in processed_indices:
            continue

        name = d["name"].lower()
        final_box = d["bbox"]
        caption_segments: List[Dict] = []
        title_segments: List[Dict] = []
        text_segments: List[Dict] = []

        if name == "figure":
            elem_type = "figure"
            path_template = fig_dir / f"page_{page_num + 1}_fig_{fig_count}.png"
            fig_count += 1
            # Grow the crop box to include caption lines below the figure.
            caption_segments = collect_caption_elements(d, dets, "figure_caption")
            for cap in caption_segments:
                final_box = get_union_box(final_box, cap["bbox"])
                processed_indices.add(cap["index"])
            # Also absorb an adjacent title and its text blocks.
            title_segments, text_segments = collect_title_and_text_segments(
                d, dets, processed_indices
            )
            for seg in title_segments + text_segments:
                final_box = get_union_box(final_box, seg["bbox"])
                processed_indices.add(seg["index"])

        elif name == "table":
            elem_type = "table"
            path_template = tab_dir / f"page_{page_num + 1}_tab_{tab_count}.png"
            tab_count += 1
            caption_segments = collect_caption_elements(d, dets, "table_caption")
            for cap in caption_segments:
                final_box = get_union_box(final_box, cap["bbox"])
                processed_indices.add(cap["index"])
        else:
            # Other classes (text, headers, ...) are not cropped.
            continue

        x0, y0, x1, y1 = map(int, final_box)
        crop = pil_img.crop((x0, y0, x1, y1))

        # PNG cannot store CMYK; normalize to RGB first.
        if crop.mode == "CMYK":
            crop = crop.convert("RGB")

        crop.save(path_template)

        info_data = {
            "type": elem_type,
            "page": page_num + 1,
            "bbox_pixels": final_box,
            "conf": d["conf"],
            "source": d.get("source", "yolo"),
            "image_path": str(path_template.relative_to(out_dir)),
            "width": int(x1 - x0),
            "height": int(y1 - y0),
            "page_width": pil_img.width,
            "page_height": pil_img.height,
        }
        # Record merged segments so downstream cross-page stitching can mark
        # them as already consumed.
        if caption_segments:
            info_data["captions"] = [
                {
                    "bbox": cap["bbox"],
                    "conf": cap.get("conf"),
                    "index": cap["index"],
                    "source": cap.get("source"),
                    "page": page_num + 1,
                }
                for cap in caption_segments
            ]
        if title_segments:
            info_data["titles"] = [
                {
                    "bbox": seg["bbox"],
                    "conf": seg.get("conf"),
                    "index": seg["index"],
                    "source": seg.get("source"),
                    "page": page_num + 1,
                }
                for seg in title_segments
            ]
        if text_segments:
            info_data["texts"] = [
                {
                    "bbox": seg["bbox"],
                    "conf": seg.get("conf"),
                    "index": seg["index"],
                    "source": seg.get("source"),
                    "page": page_num + 1,
                }
                for seg in text_segments
            ]

        infos.append(info_data)

    return infos
363
+
364
+
365
# Pixel tolerances used by merge_spanning_tables to decide that two table
# crops on adjacent pages are fragments of the same table.
TABLE_STITCH_TOLERANCES = {
    "x_tol": 60,
    "y_tol": 60,
    "width_tol": 120,
    "height_tol": 120,
}

# Limits used by attach_cross_page_figure_captions when matching a caption
# near the top of the NEXT page to a figure on the current page.
CROSS_PAGE_CAPTION_THRESHOLDS = {
    "max_top_ratio": 0.35,
    "max_top_pixels": 220,
    "x_tol": 120,
    "width_tol": 200,
    "min_overlap": 0.05,
}

# Gap/overlap thresholds for grouping a title (and its following text blocks)
# with a figure; see collect_title_and_text_segments.
TITLE_TEXT_ASSOCIATION = {
    "max_title_gap": 220,
    "max_text_gap": 160,
    "min_overlap": 0.2,
}
385
+
386
+
387
+ def _horizontal_overlap_ratio(box1: List[float], box2: List[float]) -> float:
388
+ """Compute horizontal overlap ratio between two bounding boxes."""
389
+ x_left = max(box1[0], box2[0])
390
+ x_right = min(box1[2], box2[2])
391
+ overlap = max(0.0, x_right - x_left)
392
+ if overlap <= 0:
393
+ return 0.0
394
+ width_union = max(box1[2], box2[2]) - min(box1[0], box2[0])
395
+ if width_union <= 0:
396
+ return 0.0
397
+ return overlap / width_union
398
+
399
+
400
+ def _bbox_to_rect(bbox: List[float]) -> Tuple[int, int, int, int]:
401
+ """Convert [x0, y0, x1, y1] into (x, y, w, h)."""
402
+ x0, y0, x1, y1 = bbox
403
+ return int(x0), int(y0), int(x1 - x0), int(y1 - y0)
404
+
405
+
406
def _open_table_image(elem: Dict, out_dir: Path) -> Optional[Image.Image]:
    """Open a table image relative to the output directory.

    Returns the image converted to RGB, or None (with a warning) when the
    crop file referenced by ``elem["image_path"]`` is missing.
    """
    image_path = out_dir / elem["image_path"]
    if not image_path.exists():
        logger.warning(f"Missing table crop for stitching: {image_path}")
        return None
    img = Image.open(image_path)
    # Normalize mode so later pasting onto RGB canvases is safe.
    if img.mode != "RGB":
        img = img.convert("RGB")
    return img
416
+
417
+
418
def _pad_width(img: Image.Image, target_width: int) -> Image.Image:
    """Right-pad *img* with white up to target_width; no-op if already wide enough."""
    if img.width < target_width:
        padded = Image.new("RGB", (target_width, img.height), color=(255, 255, 255))
        padded.paste(img, (0, 0))
        return padded
    return img
424
+
425
+
426
def _pad_height(img: Image.Image, target_height: int) -> Image.Image:
    """Bottom-pad *img* with white up to target_height; no-op if already tall enough."""
    if img.height < target_height:
        padded = Image.new("RGB", (img.width, target_height), color=(255, 255, 255))
        padded.paste(img, (0, 0))
        return padded
    return img
432
+
433
+
434
def _append_segment_image(
    base_img: Image.Image,
    segment_img: Image.Image,
    resize_to_base: bool = False,
) -> Image.Image:
    """Append segment image below base image with optional width alignment.

    Args:
        base_img: Image placed on top.
        segment_img: Image placed underneath.
        resize_to_base: When True, scale the segment to the base's width
            (keeping aspect ratio) before stacking.

    Returns:
        A new RGB image with white padding where widths differ.
    """
    if base_img.mode != "RGB":
        base_img = base_img.convert("RGB")
    if segment_img.mode != "RGB":
        segment_img = segment_img.convert("RGB")

    if resize_to_base and segment_img.width > 0 and base_img.width > 0:
        # Preserve aspect ratio; height is at least 1px to keep PIL happy.
        segment_img = segment_img.resize(
            (
                base_img.width,
                max(1, int(segment_img.height * (base_img.width / segment_img.width))),
            ),
            Image.Resampling.LANCZOS,
        )

    # Pad both images to the wider of the two so they stack cleanly.
    target_width = max(base_img.width, segment_img.width)
    base_img = _pad_width(base_img, target_width)
    segment_img = _pad_width(segment_img, target_width)

    stitched = Image.new(
        "RGB",
        (target_width, base_img.height + segment_img.height),
        color=(255, 255, 255),
    )
    stitched.paste(base_img, (0, 0))
    stitched.paste(segment_img, (0, base_img.height))
    return stitched
466
+
467
+
468
def _render_pdf_page(
    pdf_doc: pdfium.PdfDocument,
    page_index: int,
    scale: float,
    cache: Dict[int, Image.Image],
) -> Optional[Image.Image]:
    """Render a PDF page to a PIL image with caching.

    Args:
        pdf_doc: Open pypdfium2 document.
        page_index: Zero-based page to render.
        scale: Render scale (must match the scale used for detection bboxes).
        cache: page_index -> rendered image; mutated in place on success.

    Returns:
        The rendered image, or None when rendering fails (error is logged).
    """
    if page_index in cache:
        return cache[page_index]

    try:
        page = pdf_doc[page_index]
        bitmap = page.render(scale=scale)
        pil_img = bitmap.to_pil()
        page.close()
    except Exception as exc:
        logger.error(f"Failed to render page {page_index + 1} for caption stitching: {exc}")
        return None

    cache[page_index] = pil_img
    return pil_img
489
+
490
+
491
def _crop_pdf_region(
    page_img: Optional[Image.Image], bbox: List[float]
) -> Optional[Image.Image]:
    """Crop a region from a rendered PDF page.

    Coordinates are clamped to the page bounds; returns None when the page
    image is missing or the clamped region is empty.
    """
    if page_img is None:
        return None

    x0, y0, x1, y1 = map(int, bbox)
    # Clamp into the page while keeping the box at least 1px wide/tall.
    x0 = max(0, x0)
    y0 = max(0, y0)
    x1 = min(page_img.width, max(x0 + 1, x1))
    y1 = min(page_img.height, max(y0 + 1, y1))

    if x0 >= x1 or y0 >= y1:
        return None

    crop = page_img.crop((x0, y0, x1, y1))
    # PNG output elsewhere cannot store CMYK.
    if crop.mode == "CMYK":
        crop = crop.convert("RGB")
    return crop
511
+
512
+
513
def write_markdown_document(pdf_path: Path, out_dir: Path) -> Optional[Path]:
    """
    Extract markdown text from a PDF using PyMuPDF4LLM and write it to disk.

    Args:
        pdf_path: Source PDF; the output file reuses its stem with ``.md``.
        out_dir: Directory the markdown file is written into.

    Returns:
        Path of the written markdown file, or None when pymupdf4llm is not
        installed, extraction fails, or the PDF yields no text.
    """
    if pymupdf4llm is None:
        # BUG FIX: loguru formats messages with "{}" placeholders, not
        # printf-style "%s" + lazy args, so the original call logged the
        # literal "%s" and silently dropped the filename.
        logger.warning(
            f"Skipping markdown extraction for {pdf_path.name} because pymupdf4llm is not installed."
        )
        return None

    try:
        markdown_content = pymupdf4llm.to_markdown(str(pdf_path))
    except Exception as exc:
        logger.error(f" Failed to create markdown for {pdf_path.name}: {exc}")
        return None

    # Some pymupdf4llm configurations return per-page chunks; join them.
    if isinstance(markdown_content, list):
        markdown_content = "\n\n".join(
            part for part in markdown_content if isinstance(part, str)
        )

    if not isinstance(markdown_content, str):
        logger.error(
            f" Unexpected markdown output type {type(markdown_content)} for {pdf_path.name}"
        )
        return None

    markdown_content = markdown_content.strip()
    if not markdown_content:
        logger.warning(f" No textual content extracted from {pdf_path.name}")
        return None

    # Ensure a trailing newline for POSIX-friendly text files.
    if not markdown_content.endswith("\n"):
        markdown_content += "\n"

    md_path = out_dir / f"{pdf_path.stem}.md"
    md_path.write_text(markdown_content, encoding="utf-8")
    logger.info(f" Saved markdown to {md_path.name}")
    return md_path
553
+
554
+
555
def _collect_text_under_title_cross_page(
    title_det: Dict,
    sorted_dets: List[Dict],
    start_idx: int,
    page_idx: int,
    used_indices: Set[Tuple[int, int]],
    settings: Optional[Dict[str, float]] = None,
) -> List[Dict]:
    """Collect text elements directly below a title on the next page.

    Args:
        title_det: The title detection anchoring the search.
        sorted_dets: Next-page detections sorted top-to-bottom.
        start_idx: Position of ``title_det`` within ``sorted_dets``.
        page_idx: Zero-based index of the next page (for the used-set keys).
        used_indices: (page_idx, detection_index) pairs already consumed.
        settings: Gap/overlap thresholds; defaults to TITLE_TEXT_ASSOCIATION.

    Returns:
        Contiguous "text" detections under the title, top-to-bottom.
    """
    if settings is None:
        settings = TITLE_TEXT_ASSOCIATION
    texts: List[Dict] = []
    title_box = title_det["bbox"]
    last_bottom = title_box[3]

    for follower in sorted_dets[start_idx + 1 :]:
        det_index = follower.get("index")
        if det_index is None or (page_idx, det_index) in used_indices:
            continue

        # A new title ends the run of associated text blocks.
        if follower["name"] == "title":
            break

        if follower["name"] != "text":
            continue

        text_box = follower["bbox"]
        if text_box[1] < title_box[1]:
            continue

        gap = text_box[1] - last_bottom
        if gap > settings["max_text_gap"]:
            break

        if _horizontal_overlap_ratio(title_box, text_box) < settings["min_overlap"]:
            continue

        texts.append(follower)
        last_bottom = text_box[3]

    return texts
596
+
597
+
598
def attach_cross_page_figure_captions(
    elements: List[Dict],
    all_dets: Sequence[Optional[List[Dict[str, Any]]]],
    pdf_bytes: bytes,
    out_dir: Path,
    scale: float,
) -> List[Dict]:
    """
    If a figure caption appears on the next page, stitch it to the prior figure.

    For each figure element, the top band of the following page is searched
    for an unclaimed caption (and/or a title plus its text blocks); matching
    crops are appended below the saved figure image, which is rewritten in
    place and its metadata (captions/titles/texts, size, page_span) updated.

    Args:
        elements: Saved layout elements (mutated in place for figures).
        all_dets: Per-page detection lists, indexed by zero-based page.
        pdf_bytes: Original PDF bytes, reopened to render next pages.
        out_dir: Root output directory containing the figure crops.
        scale: Render scale matching the detection coordinate space.

    Returns:
        The (mutated) elements list.
    """
    figures = [elem for elem in elements if elem.get("type") == "figure"]
    if not figures or not all_dets:
        return elements

    try:
        pdf_doc = pdfium.PdfDocument(pdf_bytes)
    except Exception as exc:
        logger.error(f"Unable to reopen PDF for figure caption stitching: {exc}")
        return elements

    page_cache: Dict[int, Image.Image] = {}
    # Keys are (zero-based page index, detection index) of consumed segments.
    used_following_ids: Set[Tuple[int, int]] = set()

    # Mark existing caption/title/text detections as used
    for elem in figures:
        for key in ("captions", "titles", "texts"):
            for seg in elem.get(key, []) or []:
                idx = seg.get("index")
                page_no = seg.get("page")
                if idx is None or page_no is None:
                    continue
                used_following_ids.add((page_no - 1, idx))

    for elem in figures:
        page_no = elem.get("page")
        bbox = elem.get("bbox_pixels")
        if page_no is None or bbox is None:
            continue

        current_idx = page_no - 1
        next_idx = current_idx + 1
        if next_idx >= len(all_dets):
            continue

        next_dets = all_dets[next_idx]
        if not next_dets:
            continue

        fig_width = bbox[2] - bbox[0]  # NOTE(review): currently unused
        page_img = _render_pdf_page(pdf_doc, next_idx, scale, page_cache)
        if page_img is None:
            continue

        # Candidates must start within the top band of the next page.
        next_page_height = page_img.height
        max_top_allowed = min(
            CROSS_PAGE_CAPTION_THRESHOLDS["max_top_pixels"],
            int(next_page_height * CROSS_PAGE_CAPTION_THRESHOLDS["max_top_ratio"]),
        )

        sorted_next = sorted(
            [det for det in next_dets if det.get("bbox")],
            key=lambda det: det["bbox"][1],
        )

        # --- Find the best-aligned unclaimed caption on the next page ---
        caption_candidate: Optional[Tuple[Dict, int]] = None
        caption_candidates = []
        for det in sorted_next:
            if det.get("name") != "figure_caption":
                continue
            det_index = det.get("index")
            if det_index is None or (next_idx, det_index) in used_following_ids:
                continue

            det_bbox = det.get("bbox")
            if not det_bbox or det_bbox[1] > max_top_allowed:
                continue

            overlap = _horizontal_overlap_ratio(bbox, det_bbox)
            x_diff = abs(bbox[0] - det_bbox[0])
            width_diff = abs((bbox[2] - bbox[0]) - (det_bbox[2] - det_bbox[0]))

            # With little horizontal overlap, fall back to x/width alignment.
            if overlap < CROSS_PAGE_CAPTION_THRESHOLDS["min_overlap"]:
                if (
                    x_diff > CROSS_PAGE_CAPTION_THRESHOLDS["x_tol"]
                    or width_diff > CROSS_PAGE_CAPTION_THRESHOLDS["width_tol"]
                ):
                    continue

            # Lower score = better alignment with the figure.
            score = width_diff + 0.5 * x_diff
            caption_candidates.append((score, det, det_index))

        if caption_candidates:
            caption_candidates.sort(key=lambda item: item[0])
            _, best_det, best_index = caption_candidates[0]
            caption_candidate = (best_det, best_index)

        # --- Find a title (plus its text blocks) in the same top band ---
        title_candidate: Optional[Tuple[Dict, int]] = None
        title_texts: List[Dict] = []
        for idx_sorted, det in enumerate(sorted_next):
            if det.get("name") != "title":
                continue
            det_index = det.get("index")
            if det_index is None or (next_idx, det_index) in used_following_ids:
                continue

            det_bbox = det.get("bbox")
            if not det_bbox or det_bbox[1] > max_top_allowed:
                continue

            overlap = _horizontal_overlap_ratio(bbox, det_bbox)
            x_diff = abs(bbox[0] - det_bbox[0])
            if (
                overlap < TITLE_TEXT_ASSOCIATION["min_overlap"]
                and x_diff > CROSS_PAGE_CAPTION_THRESHOLDS["x_tol"]
            ):
                continue

            title_candidate = (det, det_index)
            title_texts = _collect_text_under_title_cross_page(
                det, sorted_next, idx_sorted, next_idx, used_following_ids
            )
            break

        if not caption_candidate and not title_candidate and not title_texts:
            continue

        figure_path = out_dir / elem["image_path"]
        if not figure_path.exists():
            continue

        figure_img = Image.open(figure_path)
        if figure_img.mode == "CMYK":
            figure_img = figure_img.convert("RGB")

        segments_added = False

        # Append the caption crop (scaled to the figure width) and record it.
        if caption_candidate:
            cap_det, cap_index = caption_candidate
            caption_crop = _crop_pdf_region(page_img, cap_det["bbox"])
            if caption_crop is not None:
                figure_img = _append_segment_image(
                    figure_img, caption_crop, resize_to_base=True
                )
                elem.setdefault("captions", [])
                elem["captions"].append(
                    {
                        "bbox": cap_det["bbox"],
                        "conf": cap_det.get("conf"),
                        "index": cap_index,
                        "source": cap_det.get("source"),
                        "page": next_idx + 1,
                    }
                )
                used_following_ids.add((next_idx, cap_index))
                segments_added = True

        # Append the title crop (unscaled) and record it.
        if title_candidate:
            title_det, title_index = title_candidate
            title_crop = _crop_pdf_region(page_img, title_det["bbox"])
            if title_crop is not None:
                figure_img = _append_segment_image(figure_img, title_crop)
                elem.setdefault("titles", [])
                elem["titles"].append(
                    {
                        "bbox": title_det["bbox"],
                        "conf": title_det.get("conf"),
                        "index": title_index,
                        "source": title_det.get("source"),
                        "page": next_idx + 1,
                    }
                )
                used_following_ids.add((next_idx, title_index))
                segments_added = True

        # Append each text block under the title.
        for text_det in title_texts:
            text_index = text_det.get("index")
            text_crop = _crop_pdf_region(page_img, text_det["bbox"])
            if text_crop is None:
                continue
            figure_img = _append_segment_image(figure_img, text_crop)
            elem.setdefault("texts", [])
            elem["texts"].append(
                {
                    "bbox": text_det["bbox"],
                    "conf": text_det.get("conf"),
                    "index": text_index,
                    "source": text_det.get("source"),
                    "page": next_idx + 1,
                }
            )
            if text_index is not None:
                used_following_ids.add((next_idx, text_index))
            segments_added = True

        if not segments_added:
            continue

        # Overwrite the crop on disk and refresh the stored dimensions.
        figure_img.save(figure_path)
        elem["width"] = figure_img.width
        elem["height"] = figure_img.height

        # Track which pages this (now multi-page) figure spans.
        span = elem.get("page_span")
        if span:
            if next_idx + 1 not in span:
                span.append(next_idx + 1)
        else:
            base_page = elem.get("page")
            new_span = [page for page in (base_page, next_idx + 1) if page is not None]
            elem["page_span"] = new_span

    pdf_doc.close()
    return elements
810
+
811
+
812
def _stitch_table_pair(
    base_elem: Dict,
    candidate_elem: Dict,
    out_dir: Path,
    merge_index: int,
    stitch_type: str,
) -> Optional[Dict]:
    """Stitch two table crops either vertically or horizontally.

    Writes the merged image to ``out_dir/tables``, deletes the two partial
    crops, and returns a merged metadata dict (or None when either crop
    image cannot be opened).

    Args:
        base_elem: Table element from the earlier page.
        candidate_elem: Matching table element from the following page.
        out_dir: Root output directory.
        merge_index: Sequence number used in the merged file name.
        stitch_type: "vertical" to stack, anything else to place side-by-side.
    """
    base_img = _open_table_image(base_elem, out_dir)
    candidate_img = _open_table_image(candidate_elem, out_dir)
    if base_img is None or candidate_img is None:
        return None

    tables_dir = out_dir / "tables"
    tables_dir.mkdir(parents=True, exist_ok=True)

    if stitch_type == "vertical":
        # Pad both to the wider width, then stack base on top.
        target_width = max(base_img.width, candidate_img.width)
        base_img = _pad_width(base_img, target_width)
        candidate_img = _pad_width(candidate_img, target_width)
        merged_height = base_img.height + candidate_img.height
        stitched = Image.new("RGB", (target_width, merged_height), color=(255, 255, 255))
        stitched.paste(base_img, (0, 0))
        stitched.paste(candidate_img, (0, base_img.height))
    else:
        # Pad both to the taller height, then place base on the left.
        target_height = max(base_img.height, candidate_img.height)
        base_img = _pad_height(base_img, target_height)
        candidate_img = _pad_height(candidate_img, target_height)
        merged_width = base_img.width + candidate_img.width
        stitched = Image.new("RGB", (merged_width, target_height), color=(255, 255, 255))
        stitched.paste(base_img, (0, 0))
        stitched.paste(candidate_img, (base_img.width, 0))

    merged_name = (
        f"page_{base_elem['page']}_to_{candidate_elem['page']}_"
        f"table_merged_{merge_index}.png"
    )
    merged_path = tables_dir / merged_name
    stitched.save(merged_path)

    # Remove original partial crops to avoid duplicates
    (out_dir / base_elem["image_path"]).unlink(missing_ok=True)
    (out_dir / candidate_elem["image_path"]).unlink(missing_ok=True)

    # Union of both page-space boxes (note: pages differ, so this is only
    # meaningful per-axis).
    new_bbox = [
        min(base_elem["bbox_pixels"][0], candidate_elem["bbox_pixels"][0]),
        min(base_elem["bbox_pixels"][1], candidate_elem["bbox_pixels"][1]),
        max(base_elem["bbox_pixels"][2], candidate_elem["bbox_pixels"][2]),
        max(base_elem["bbox_pixels"][3], candidate_elem["bbox_pixels"][3]),
    ]

    merged_elem = base_elem.copy()
    merged_elem["page_span"] = [base_elem["page"], candidate_elem["page"]]
    merged_elem["box_refs"] = [
        {"page": base_elem["page"], "image_path": base_elem["image_path"]},
        {"page": candidate_elem["page"], "image_path": candidate_elem["image_path"]},
    ]
    merged_elem["bbox_pixels"] = new_bbox
    merged_elem["image_path"] = str(merged_path.relative_to(out_dir))
    merged_elem["width"] = stitched.width
    merged_elem["height"] = stitched.height
    merged_elem["page_height"] = stitched.height
    # Conservative confidence: the weaker of the two detections.
    merged_elem["conf"] = min(
        base_elem.get("conf", 1.0), candidate_elem.get("conf", 1.0)
    )
    return merged_elem
878
+
879
+
880
def merge_spanning_tables(elements: List[Dict], out_dir: Path) -> List[Dict]:
    """
    Stitch table crops that continue across adjacent pages using the heuristic
    from the legacy OpenCV-based extractor.

    Tables on page N and N+1 whose left/right edges (vertical continuation)
    or top/bottom edges (horizontal continuation) align within
    TABLE_STITCH_TOLERANCES are merged via _stitch_table_pair. Non-table
    elements pass through unchanged; order of results is not preserved.
    """
    if not elements:
        return elements

    # Bucket tables by 1-based page; everything else passes straight through.
    tables_by_page: Dict[int, List[Dict]] = {}
    non_tables: List[Dict] = []

    for elem in elements:
        if elem.get("type") != "table":
            non_tables.append(elem)
            continue
        page = elem.get("page")
        if not isinstance(page, int):
            non_tables.append(elem)
            continue
        tables_by_page.setdefault(page, []).append(elem)

    merged_results: List[Dict] = []
    # page -> indices of that page's tables already consumed by a merge
    # with the preceding page.
    used_next: Dict[int, Set[int]] = {}
    merge_counter = 0

    for page in sorted(tables_by_page.keys()):
        current_tables = tables_by_page.get(page, [])
        next_page_tables = tables_by_page.get(page + 1, [])
        next_used_indices = used_next.get(page + 1, set())
        current_used_indices = used_next.get(page, set())

        for idx_current, table_elem in enumerate(current_tables):
            # Skip tables already absorbed into a merge with the prior page.
            if idx_current in current_used_indices:
                continue

            if not next_page_tables:
                merged_results.append(table_elem)
                continue

            x, y, w, h = _bbox_to_rect(table_elem["bbox_pixels"])
            matched = False

            for idx, candidate in enumerate(next_page_tables):
                if idx in next_used_indices:
                    continue
                if candidate.get("type") != "table":
                    continue

                cx, cy, cw, ch = _bbox_to_rect(candidate["bbox_pixels"])

                # Same column (left edge + width align) -> vertical split.
                vertical_match = (
                    abs(x - cx) <= TABLE_STITCH_TOLERANCES["x_tol"]
                    and abs((x + w) - (cx + cw)) <= TABLE_STITCH_TOLERANCES["width_tol"]
                )
                # Same row (top edge + height align) -> horizontal split.
                horizontal_match = (
                    abs(y - cy) <= TABLE_STITCH_TOLERANCES["y_tol"]
                    and abs((y + h) - (cy + ch))
                    <= TABLE_STITCH_TOLERANCES["height_tol"]
                )

                stitch_type = "vertical" if vertical_match else None
                if not stitch_type and horizontal_match:
                    stitch_type = "horizontal"

                if not stitch_type:
                    continue

                merge_counter += 1
                merged_elem = _stitch_table_pair(
                    table_elem, candidate, out_dir, merge_counter, stitch_type
                )
                if merged_elem is None:
                    continue

                merged_results.append(merged_elem)
                next_used_indices.add(idx)
                matched = True
                break

            if not matched:
                merged_results.append(table_elem)

        used_next[page + 1] = next_used_indices

    merged_results.extend(non_tables)
    return merged_results
966
+
967
+
968
+
969
+ # ----------------------------------------------------------------------
970
+ # Draw layout boxes on the original PDF
971
+ # ----------------------------------------------------------------------
972
def draw_layout_pdf(pdf_bytes: bytes, all_dets: List[List[dict]],
                    scale: float, out_path: Path):
    """Annotate a PDF with semi-transparent bounding boxes and labels.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        all_dets: Per-page detection lists; each detection dict carries
            ``name``, ``conf``, ``bbox`` (pixel coords at ``scale``) and
            optionally ``source``.
        scale: Render scale that produced the pixel bboxes; used to map
            them back into PDF point coordinates.
        out_path: Destination path for the annotated PDF.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page_no, dets in enumerate(all_dets):
            if page_no >= len(doc):
                # Defensive: never index past the document's last page.
                break
            page = doc[page_no]

            for d in dets:
                rgb = CLASS_COLORS.get(d["name"], (0, 0, 0))
                # Pixel bbox -> PDF points (undo the render scale).
                rect = fitz.Rect([c / scale for c in d["bbox"]])

                border_color = [c / 255 for c in rgb]

                page.draw_rect(
                    rect,
                    color=border_color,
                    fill=border_color,
                    width=1.5,
                    overlay=True,
                    fill_opacity=0.15,
                )

                label = f"{d['name']} {d['conf']:.2f}"
                if d.get("source"):
                    label += f" [{d['source'][0].upper()}]"

                # White backdrop behind the label. BUGFIX: a 4-tuple fill is
                # interpreted by PyMuPDF as a CMYK color (near-black), not
                # "white with alpha" -- opacity must go through fill_opacity.
                text_bg = fitz.Rect(rect.x0, rect.y0 - 10, rect.x0 + 60, rect.y0)
                page.draw_rect(
                    text_bg,
                    color=None,
                    fill=(1, 1, 1),
                    fill_opacity=0.6,
                    overlay=True,
                )

                page.insert_text(
                    (rect.x0 + 2, rect.y0 - 8),
                    label,
                    fontsize=6.5,
                    color=border_color,
                    overlay=True,
                )

        doc.save(str(out_path))
    finally:
        # Release the document even if drawing or saving raises.
        doc.close()
1015
+
1016
+ # ----------------------------------------------------------------------
1017
+ # Process a single PDF Page (for parallel execution)
1018
+ # ----------------------------------------------------------------------
1019
def process_page(task_data: Tuple[int, bytes, float, Path, str]) -> Optional[Tuple[int, List[dict], List[dict]]]:
    """
    Process a single page of a PDF in a worker process.

    Args:
        task_data: ``(page_number, pdf_bytes, render_scale, output_dir,
            pdf_name)`` tuple prepared by the dispatcher.

    Returns:
        ``(page_number, detections, elements)`` on success, or ``None``
        when shutdown was requested or the page failed to process.
    """
    pno, pdf_bytes, scale, out_dir, pdf_name = task_data

    if _shutdown_requested:
        return None

    pdf_pdfium = None
    page = None
    try:
        # Each worker re-opens the document from bytes: pdfium handles are
        # not picklable and cannot be shared across processes.
        pdf_pdfium = pdfium.PdfDocument(pdf_bytes)

        page = pdf_pdfium[pno]
        bitmap = page.render(scale=scale)
        pil = bitmap.to_pil()

        dets = detect_page(pil)
        elements = save_layout_elements(pil, pno, dets, out_dir)

        page_figures = sum(1 for d in dets if d['name'] == 'figure')
        page_tables = sum(1 for d in dets if d['name'] == 'table')
        logger.info(f" [{pdf_name}] Page {pno + 1}: {page_figures} figs, {page_tables} tables")

        return (pno, dets, elements)

    except Exception as e:
        logger.error(f"Failed to process page {pno + 1} of {pdf_name}: {e}")
        return None

    finally:
        # BUGFIX: the original closed handles only on the success path (and
        # the doc in the except branch); a failure after page lookup leaked
        # the page handle. Always release both.
        if page is not None:
            page.close()
        if pdf_pdfium is not None:
            pdf_pdfium.close()
1054
+
1055
+ # ----------------------------------------------------------------------
1056
+ # Process a full PDF using the persistent worker pool
1057
+ # ----------------------------------------------------------------------
1058
def process_pdf_with_pool(
    pdf_path: Path,
    out_dir: Path,
    pool: Optional[Pool] = None,
    *,
    extract_images: bool = True,
    extract_markdown: bool = True,
):
    """
    Main processing pipeline for a PDF file.
    If pool is provided, uses it. Otherwise processes serially.

    Steps (image branch): render each page, detect layout elements, save
    crops, stitch cross-page tables, attach cross-page figure captions,
    dump a ``*_content_list.json`` and an annotated ``*_layout.pdf``.
    Optionally also writes a markdown export of the document.

    Args:
        pdf_path: Source PDF file.
        out_dir: Directory that receives crops, JSON and annotated PDF.
        pool: Optional multiprocessing pool of pre-initialized workers;
            used only when USE_MULTIPROCESSING is enabled.
        extract_images: When False, the whole detection/crop branch is
            skipped.
        extract_markdown: When False, markdown export is skipped.
    """

    # Honor a pending shutdown before doing any work.
    if _shutdown_requested:
        logger.warning(f"Skipping {pdf_path.name} due to shutdown request")
        return

    stem = pdf_path.stem
    logger.info(f"Processing {pdf_path.name}")

    # Read once; the same bytes are re-opened by workers and by the
    # annotation step, avoiding shared pdfium handles across processes.
    pdf_bytes = pdf_path.read_bytes()

    # Open briefly just to validate the PDF and learn the page count.
    doc = None
    try:
        doc = pdfium.PdfDocument(pdf_bytes)
        page_count = len(doc)
    except Exception as e:
        logger.error(f"Failed to open PDF {pdf_path.name}: {e}. Skipping.")
        return
    finally:
        if doc is not None:
            doc.close()

    scale = 2.0  # render scale; bboxes downstream are divided by this
    all_elements: List[Dict] = []
    filtered_dets: List[List[dict]] = []

    if extract_images:
        # One slot per page; stays None for pages that failed or were skipped.
        all_dets: List[Optional[List[dict]]] = [None] * page_count

        if pool is not None and USE_MULTIPROCESSING:
            # Parallel path: fan pages out to the persistent worker pool.
            logger.info(f" Using worker pool for {page_count} pages...")

            tasks = [
                (pno, pdf_bytes, scale, out_dir, pdf_path.name)
                for pno in range(page_count)
            ]

            try:
                results = pool.map(process_page, tasks)

                # Workers return None on failure/shutdown; keep successes
                # indexed by page number.
                for res in results:
                    if res:
                        pno, dets, elements = res
                        all_dets[pno] = dets
                        all_elements.extend(elements)

            except KeyboardInterrupt:
                logger.warning("Processing interrupted during parallel execution")
                raise

        else:
            # Serial path: one pdfium document, page by page.
            logger.info("Using serial processing...")

            try:
                pdf_pdfium = pdfium.PdfDocument(pdf_bytes)

                for pno in range(page_count):
                    # Check between pages so Ctrl+C leaves partial output.
                    if _shutdown_requested:
                        logger.warning(
                            f"Stopping at page {pno + 1}/{page_count} due to shutdown request"
                        )
                        break

                    try:
                        logger.info(f" Processing page {pno + 1}/{page_count}")

                        page = pdf_pdfium[pno]
                        bitmap = page.render(scale=scale)
                        pil = bitmap.to_pil()

                        dets = detect_page(pil)
                        all_dets[pno] = dets

                        elements = save_layout_elements(pil, pno, dets, out_dir)
                        all_elements.extend(elements)

                        page_figures = len([d for d in dets if d["name"] == "figure"])
                        page_tables = len([d for d in dets if d["name"] == "table"])
                        logger.info(
                            f" Found {page_figures} figures and {page_tables} tables"
                        )

                        page.close()

                    except Exception as e:
                        # A bad page should not abort the whole document.
                        logger.error(f"Failed to process page {pno + 1}: {e}. Skipping page.")

                pdf_pdfium.close()

            except Exception as e:
                logger.error(f"Fatal error processing {pdf_path.name}: {e}")
                # pdf_pdfium may not exist if opening failed; guard via locals().
                if "pdf_pdfium" in locals() and pdf_pdfium:
                    pdf_pdfium.close()
                return

        # Per-page view (keeps None placeholders) for caption attachment,
        # which needs page alignment.  NOTE(review): this comprehension is
        # an identity copy of all_dets -- presumably kept for isolation.
        dets_per_page: List[Optional[List[Dict[str, Any]]]] = [
            det if det is not None else None for det in all_dets
        ]

        # Dense list (drops failed pages) for the annotated-PDF step.
        filtered_dets = [d for d in all_dets if d is not None]

        if all_elements:
            all_elements = merge_spanning_tables(all_elements, out_dir)
            all_elements = attach_cross_page_figure_captions(
                all_elements, dets_per_page, pdf_bytes, out_dir, scale
            )

        if all_elements:
            content_list_path = out_dir / f"{stem}_content_list.json"
            with open(content_list_path, "w", encoding="utf-8") as f:
                json.dump(all_elements, f, ensure_ascii=False, indent=4)
            logger.info(f" Saved {len(all_elements)} elements to JSON")

        if filtered_dets:
            draw_layout_pdf(
                pdf_bytes, filtered_dets, scale, out_dir / f"{stem}_layout.pdf"
            )
            logger.info(" Generated annotated PDF")
        else:
            logger.warning(f"No detections found for {stem}. Skipping layout PDF.")

    else:
        logger.info(" Image extraction skipped per configuration.")

    markdown_path = None
    if extract_markdown:
        markdown_path = write_markdown_document(pdf_path, out_dir)
        if markdown_path is None:
            logger.warning(f" Markdown extraction yielded no content for {stem}.")

    # Final status: distinguish a clean run from an interrupted one.
    if _shutdown_requested:
        logger.warning(f"⚠️ Partial results saved for {stem} β†’ {out_dir}")
    else:
        if extract_images:
            logger.success(
                f"βœ“ {stem} β†’ {out_dir} ({len(all_elements)} elements extracted)"
            )
        else:
            logger.success(f"βœ“ {stem} β†’ {out_dir} (image extraction skipped)")
1208
+
1209
+ # ----------------------------------------------------------------------
1210
+ # Main
1211
+ # ----------------------------------------------------------------------
1212
+ if __name__ == "__main__":
1213
+ # Important for multiprocessing on Windows/macOS
1214
+ torch.multiprocessing.set_start_method('spawn', force=True)
1215
+
1216
+ # Setup signal handlers for graceful shutdown
1217
+ setup_signal_handlers()
1218
+
1219
+ INPUT_DIR = Path("./pdfs")
1220
+ OUTPUT_DIR = Path("./output")
1221
+
1222
+ os.makedirs(INPUT_DIR, exist_ok=True)
1223
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
1224
+
1225
+ pdf_files = list(INPUT_DIR.glob("*.pdf"))
1226
+ if not pdf_files:
1227
+ logger.warning("No PDF files found in ./pdfs")
1228
+ logger.info("Please add PDF files to the ./pdfs directory")
1229
+ logger.info("The script will exit gracefully. No errors occurred.")
1230
+ sys.exit(0)
1231
+
1232
+ logger.info(f"Found {len(pdf_files)} PDF file(s) to process")
1233
+ logger.info(f"Settings: MODEL_SIZE={MODEL_SIZE}, CONF={CONF_THRESHOLD}")
1234
+
1235
+ # Determine worker count
1236
+ total_cpus = cpu_count()
1237
+ if NUM_WORKERS is None:
1238
+ num_workers = max(1, total_cpus - 1)
1239
+ else:
1240
+ num_workers = max(1, min(NUM_WORKERS, total_cpus))
1241
+
1242
+ # Decide whether to use multiprocessing
1243
+ use_pool = USE_MULTIPROCESSING and DEVICE == "cpu" and total_cpus >= 4
1244
+
1245
+ if use_pool:
1246
+ logger.info(f"πŸš€ Creating persistent worker pool with {num_workers} workers...")
1247
+ else:
1248
+ if not USE_MULTIPROCESSING:
1249
+ logger.info("Multiprocessing disabled by configuration")
1250
+ elif DEVICE != "cpu":
1251
+ logger.info(f"Using serial GPU processing (device: {DEVICE})")
1252
+ else:
1253
+ logger.info(f"Using serial CPU processing (CPU count {total_cpus} too low)")
1254
+
1255
+ pool = None
1256
+ try:
1257
+ # Create persistent pool ONCE for all PDFs
1258
+ if use_pool:
1259
+ pool = Pool(processes=num_workers, initializer=init_worker)
1260
+ logger.success(f"βœ“ Worker pool ready with {num_workers} workers\n")
1261
+ else:
1262
+ # Load model in main process for serial execution
1263
+ logger.info("Initializing model in main process...")
1264
+ get_model()
1265
+ logger.success(f"βœ“ Model loaded (device: {DEVICE})\n")
1266
+
1267
+ # Process all PDFs using the same pool
1268
+ for i, pdf_path in enumerate(pdf_files, 1):
1269
+ if _shutdown_requested:
1270
+ logger.warning(f"\nShutdown requested. Processed {i-1}/{len(pdf_files)} files.")
1271
+ break
1272
+
1273
+ logger.info(f"\n{'='*60}")
1274
+ logger.info(f"πŸ“„ File {i}/{len(pdf_files)}: {pdf_path.name}")
1275
+ logger.info(f"{'='*60}")
1276
+
1277
+ sub_out = OUTPUT_DIR / pdf_path.stem
1278
+ os.makedirs(sub_out, exist_ok=True)
1279
+
1280
+ try:
1281
+ process_pdf_with_pool(pdf_path, sub_out, pool)
1282
+ except KeyboardInterrupt:
1283
+ logger.warning(f"\nInterrupted while processing {pdf_path.name}")
1284
+ break
1285
+ except Exception as e:
1286
+ logger.error(f"Error processing {pdf_path.name}: {e}")
1287
+ if _shutdown_requested:
1288
+ break
1289
+ logger.info("Continuing with next file...")
1290
+ continue
1291
+
1292
+ if _shutdown_requested:
1293
+ logger.warning(f"\n⚠️ Processing interrupted. Partial results saved in {OUTPUT_DIR}")
1294
+ else:
1295
+ logger.success(f"\n✨ All done! Results are in {OUTPUT_DIR}")
1296
+
1297
+ except KeyboardInterrupt:
1298
+ logger.error("\n❌ Processing interrupted by user")
1299
+ sys.exit(1)
1300
+ except Exception as e:
1301
+ logger.error(f"\n❌ Fatal error: {e}")
1302
+ sys.exit(1)
1303
+ finally:
1304
+ # Clean up pool if it exists
1305
+ if pool is not None:
1306
+ logger.info("\n🧹 Shutting down worker pool...")
1307
+ pool.close()
1308
+ pool.join()
1309
+ logger.success("βœ“ Worker pool closed cleanly")
packages.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles>=23.2.1
2
+ fastapi>=0.109.0
3
+ gradio>=4.0.0
4
+ huggingface-hub>=0.20.0
5
+ jinja2>=3.1.3
6
+ loguru>=0.7.2
7
+ numpy<2.0.0
8
+ pillow>=10.2.0
9
+ pymupdf>=1.23.0
10
+ pymupdf4llm>=0.0.1
11
+ pypdfium2>=4.26.0
12
+ python-multipart>=0.0.9
13
+ torch>=2.0.0
14
+ torchvision>=0.15.0
15
+ uvicorn>=0.27.0
16
+ doclayout-yolo>=0.0.2