bluenevus commited on
Commit
06e6609
·
verified ·
1 Parent(s): f5ad955

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +718 -5
app.py CHANGED
@@ -1,7 +1,720 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Depends, BackgroundTasks, Security, status
2
+ from fastapi.security import APIKeyHeader
3
+ from fastapi.responses import JSONResponse, FileResponse
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from pydantic import BaseModel, Field, validator
6
+ from typing import List, Optional, Dict, Any, Tuple
7
+ import asyncio
8
+ import aiofiles
9
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
10
+ import hashlib
11
+ import uuid
12
+ from datetime import datetime, timedelta
13
+ import os
14
+ import shutil
15
+ from pathlib import Path
16
+ import pikepdf # Python wrapper for qpdf
17
+ import io
18
+ import logging
19
+ from queue import Queue, PriorityQueue
20
+ import threading
21
+ import time
22
+ from enum import Enum
23
+ from contextlib import asynccontextmanager
24
+ import tempfile
25
+ import traceback
26
+ import json
27
+ from dataclasses import dataclass, asdict
28
+ import redis
29
+ from celery import Celery
30
 
31
# Logging setup: emit to a local file and to the console with one shared format.
_log_handlers = [
    logging.FileHandler('pdf_processor.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
41
 
42
# Central configuration. Values are class attributes so call sites can read
# them as Config.X without instantiating anything.
class Config:
    """Static settings for upload limits, storage paths, workers, and backends."""

    # --- size limits -------------------------------------------------------
    MAX_FILE_SIZE_MB = 5
    CHUNK_SIZE_MB = 4.5
    MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
    CHUNK_SIZE_BYTES = int(CHUNK_SIZE_MB * 1024 * 1024)

    # --- filesystem layout -------------------------------------------------
    UPLOAD_DIR = Path("uploads")
    OUTPUT_DIR = Path("outputs")
    TEMP_DIR = Path("temp")

    # --- concurrency / queueing -------------------------------------------
    MAX_WORKERS = min(32, (os.cpu_count() or 1) * 2)
    MAX_QUEUE_SIZE = 1000

    # --- auth and backends -------------------------------------------------
    API_KEY_HEADER = "X-API-Key"
    REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
    CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
    CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")

    # --- qpdf specific settings -------------------------------------------
    QPDF_COMPRESSION_LEVEL = 9  # Maximum compression
    QPDF_STREAM_DATA = "compress"  # Keep streams compressed
    QPDF_OBJECT_STREAMS = "generate"  # Generate object streams for better compression
62
+
63
# Create directories up front so upload/split code never races mkdir.
for dir_path in [Config.UPLOAD_DIR, Config.OUTPUT_DIR, Config.TEMP_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

# Initialize Redis. decode_responses=True returns str instead of bytes,
# which the JSON (de)serialization below relies on.
redis_client = redis.from_url(Config.REDIS_URL, decode_responses=True)

# Initialize Celery.
# NOTE(review): no Celery tasks are defined anywhere in this file — the app
# processes jobs on an in-process thread instead; confirm whether this app
# instance is still needed.
celery_app = Celery(
    'pdf_processor',
    broker=Config.CELERY_BROKER_URL,
    backend=Config.CELERY_RESULT_BACKEND
)
76
+
77
# API Key Management
class APIKeyManager:
    """Holds the set of accepted API keys and enforces per-key rate limits."""

    def __init__(self):
        self.valid_keys = self._load_api_keys()

    def _load_api_keys(self) -> Dict[str, Dict]:
        """Load API keys from Redis or environment"""
        # Demo configuration: a comma-separated list from the environment.
        configured = os.getenv("API_KEYS", "demo-key-123,test-key-456")
        return {
            key: {
                "created_at": datetime.utcnow().isoformat(),
                "rate_limit": 100,
                "active": True,
            }
            for key in configured.split(",")
        }

    def validate_key(self, api_key: str) -> bool:
        """Validate API key and check rate limits"""
        key_info = self.valid_keys.get(api_key)
        if key_info is None or not key_info.get("active", False):
            return False

        # Fixed 1-hour rate-limit window tracked as a Redis counter.
        key_name = f"rate_limit:{api_key}"
        try:
            current_count = redis_client.incr(key_name)
            if current_count == 1:
                redis_client.expire(key_name, 3600)  # 1 hour window
            if current_count > key_info.get("rate_limit", 100):
                return False
        except Exception as e:
            # Fails open: a Redis outage must not lock out valid keys.
            logger.error(f"Rate limit check failed: {e}")

        return True


# Initialize API Key Manager
api_key_manager = APIKeyManager()
120
+
121
# Job Status Enum
class JobStatus(str, Enum):
    """Lifecycle states of a processing job.

    Subclasses str so instances serialize directly to JSON and compare
    equal to their plain-string values.
    """
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
128
+
129
# Job Model
@dataclass
class ProcessingJob:
    """Mutable record tracking one PDF-splitting job through its lifecycle."""
    job_id: str  # UUID assigned at upload time
    api_key: str  # key that created the job; checked on status/download access
    filename: str  # stored name under Config.UPLOAD_DIR ("{job_id}_{original}")
    status: JobStatus
    created_at: datetime
    updated_at: datetime
    total_pages: Optional[int] = None
    segments_created: Optional[int] = None
    segments_discarded: Optional[int] = None
    error_message: Optional[str] = None
    # Optional because None is the sentinel normalized in __post_init__;
    # a literal [] default would be shared across instances.
    output_files: Optional[List[str]] = None
    processing_time: Optional[float] = None

    def __post_init__(self):
        # Normalize the sentinel so callers can always iterate output_files.
        if self.output_files is None:
            self.output_files = []
148
+
149
# PDF Processor with qpdf
class QPDFProcessor:
    """Splits and optimizes PDFs via pikepdf (Python bindings over qpdf)."""

    def __init__(self):
        # NOTE(review): this executor is never used by the methods below —
        # both run pikepdf work inline even though they are declared async;
        # confirm whether offloading to the pool was intended.
        self.executor = ThreadPoolExecutor(max_workers=Config.MAX_WORKERS)

    async def split_pdf_with_qpdf(self,
                                  input_path: Path,
                                  output_dir: Path,
                                  job_id: str) -> Tuple[List[Path], List[Path], Dict]:
        """
        Split PDF using qpdf for efficient handling of compressed PDFs

        Returns (kept_files, discarded_files, stats): segment paths that fit
        under the size cap, oversized single-page segments, and counters
        (total_pages, segments_created, segments_discarded,
        compression_ratio, processing_time).
        """
        try:
            start_time = time.time()
            kept_files = []
            discarded_files = []
            stats = {
                "total_pages": 0,
                "segments_created": 0,
                "segments_discarded": 0,
                "compression_ratio": 0
            }

            # Open PDF with pikepdf (qpdf wrapper)
            with pikepdf.open(input_path) as pdf:
                total_pages = len(pdf.pages)
                stats["total_pages"] = total_pages

                # Estimate pages-per-segment from the source file's average
                # bytes per page; refined below if a segment comes out too big.
                file_size = input_path.stat().st_size
                avg_page_size = file_size / total_pages
                pages_per_segment = max(1, int(Config.CHUNK_SIZE_BYTES / avg_page_size))

                segment_num = 0
                page_start = 0

                while page_start < total_pages:
                    page_end = min(page_start + pages_per_segment, total_pages)
                    segment_num += 1

                    # Create output path for segment
                    segment_filename = f"{job_id}_segment_{segment_num:04d}.pdf"
                    segment_path = output_dir / segment_filename

                    # Create new PDF with selected pages
                    segment_pdf = pikepdf.new()

                    # Copy pages efficiently without decompressing
                    for page_num in range(page_start, page_end):
                        segment_pdf.pages.append(pdf.pages[page_num])

                    # Save with compression settings
                    segment_pdf.save(
                        segment_path,
                        compress_streams=True,
                        stream_decode_level=pikepdf.StreamDecodeLevel.none,  # Don't decode streams
                        object_stream_mode=pikepdf.ObjectStreamMode.generate,
                        linearize=True,  # Web optimization
                        min_version=pdf.pdf_version
                    )

                    # Check segment size
                    segment_size = segment_path.stat().st_size

                    if segment_size <= Config.MAX_FILE_SIZE_BYTES:
                        kept_files.append(segment_path)
                        stats["segments_created"] += 1
                        logger.info(f"Created segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")
                    else:
                        # Oversized segment: retry the SAME page range with half
                        # as many pages (page_start is deliberately not advanced
                        # before the continue). Note segment_num was already
                        # incremented, so numbering can skip on retries.
                        if pages_per_segment > 1:
                            logger.warning(f"Segment {segment_num} too large ({segment_size / 1024 / 1024:.2f} MB), re-splitting...")
                            segment_path.unlink()  # Delete oversized segment

                            # Adjust pages per segment
                            pages_per_segment = max(1, pages_per_segment // 2)
                            continue
                        else:
                            # Single page is too large, discard (caller deletes
                            # the file; it is returned for accounting).
                            discarded_files.append(segment_path)
                            stats["segments_discarded"] += 1
                            logger.warning(f"Discarded segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")

                    page_start = page_end

                # Calculate compression ratio of kept output vs. the original.
                original_size = input_path.stat().st_size
                total_output_size = sum(f.stat().st_size for f in kept_files)
                if original_size > 0:
                    stats["compression_ratio"] = (1 - total_output_size / original_size) * 100

                stats["processing_time"] = time.time() - start_time

                return kept_files, discarded_files, stats

        except Exception as e:
            logger.error(f"Error splitting PDF with qpdf: {str(e)}")
            raise

    async def optimize_pdf(self, input_path: Path, output_path: Path) -> Path:
        """
        Optimize PDF using qpdf's advanced features

        Rewrites input_path to output_path with unreferenced resources
        removed and aggressive recompression; returns output_path.
        """
        try:
            with pikepdf.open(input_path) as pdf:
                # Remove unnecessary elements
                pdf.remove_unreferenced_resources()

                # Save with maximum optimization
                pdf.save(
                    output_path,
                    compress_streams=True,
                    object_stream_mode=pikepdf.ObjectStreamMode.generate,
                    linearize=True,
                    recompress_flate=True,
                    deterministic_id=True
                )

            return output_path

        except Exception as e:
            logger.error(f"Error optimizing PDF: {str(e)}")
            raise
273
+
274
# Job Queue Manager
class JobQueueManager:
    """In-process priority queue of PDF jobs with Redis-backed persistence.

    A single daemon thread drains the queue and processes jobs one at a
    time. Job state lives in ``self.jobs`` while the process is up and is
    mirrored to Redis with a 24h TTL so status queries survive restarts.
    """

    def __init__(self):
        self.queue = PriorityQueue(maxsize=Config.MAX_QUEUE_SIZE)
        self.jobs: Dict[str, ProcessingJob] = {}
        self.lock = threading.Lock()
        self.processor = QPDFProcessor()
        self.processing_thread = threading.Thread(target=self._process_queue, daemon=True)
        self.processing_thread.start()

    def add_job(self, job: ProcessingJob) -> str:
        """Enqueue a job; raises HTTP 503 when the queue is full."""
        with self.lock:
            if self.queue.full():
                raise HTTPException(
                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                    detail="Queue is full. Please try again later."
                )

            # Lower number = higher priority; premium keys jump the line.
            # Ties break on (created_at, job_id) so entries stay comparable.
            priority = 1 if job.api_key in ["premium-key"] else 2
            self.queue.put((priority, job.created_at, job.job_id))
            self.jobs[job.job_id] = job

            # Mirror to Redis so status survives a process restart.
            redis_client.setex(
                f"job:{job.job_id}",
                86400,  # 24 hours TTL
                json.dumps(asdict(job), default=str)
            )

            return job.job_id

    def get_job_status(self, job_id: str) -> Optional[ProcessingJob]:
        """Return the job from memory, falling back to Redis; None if unknown."""
        with self.lock:
            if job_id in self.jobs:
                return self.jobs[job_id]

        # Redis read happens outside the lock so a slow backend never stalls
        # the processing thread.
        job_data = redis_client.get(f"job:{job_id}")
        if job_data:
            data = json.loads(job_data)
            # BUG FIX: the JSON round-trip (json.dumps(..., default=str))
            # stringifies datetimes and the status enum; the original passed
            # them through raw, so callers invoking .isoformat() on a
            # rehydrated job crashed. Restore the real types here.
            for field_name in ("created_at", "updated_at"):
                if isinstance(data.get(field_name), str):
                    data[field_name] = datetime.fromisoformat(data[field_name])
            if isinstance(data.get("status"), str):
                data["status"] = JobStatus(data["status"])
            return ProcessingJob(**data)

        return None

    def _process_queue(self):
        """Daemon loop: pop job ids and process them sequentially."""
        while True:
            try:
                if not self.queue.empty():
                    _, _, job_id = self.queue.get(timeout=1)

                    with self.lock:
                        job = self.jobs.get(job_id)

                    if job and job.status == JobStatus.PENDING:
                        # Processor methods are coroutines; run each on a
                        # fresh event loop owned by this worker thread.
                        asyncio.run(self._process_job(job))

                time.sleep(0.1)

            except Exception as e:
                logger.error(f"Queue processing error: {str(e)}")

    async def _process_job(self, job: ProcessingJob):
        """Run one job end-to-end and record its outcome (COMPLETED/FAILED)."""
        try:
            start_time = time.time()

            # Mark in-flight before doing any work.
            job.status = JobStatus.PROCESSING
            job.updated_at = datetime.utcnow()
            self._update_job(job)

            input_path = Config.UPLOAD_DIR / job.filename
            output_dir = Config.OUTPUT_DIR / job.job_id
            output_dir.mkdir(parents=True, exist_ok=True)

            # Split PDF using qpdf.
            kept_files, discarded_files, stats = await self.processor.split_pdf_with_qpdf(
                input_path, output_dir, job.job_id
            )

            # Record results on the job.
            job.status = JobStatus.COMPLETED
            job.total_pages = stats["total_pages"]
            job.segments_created = stats["segments_created"]
            job.segments_discarded = stats["segments_discarded"]
            job.output_files = [str(f.name) for f in kept_files]
            job.processing_time = time.time() - start_time
            job.updated_at = datetime.utcnow()

            # Oversized segments are never served; remove them from disk.
            for file in discarded_files:
                try:
                    file.unlink()
                except Exception as e:
                    logger.error(f"Error deleting discarded file: {e}")

            self._update_job(job)
            logger.info(f"Job {job.job_id} completed successfully")

        except Exception as e:
            # BUG FIX: original had f"Job processing failed: {str(e}" — an
            # unbalanced brace inside the f-string, i.e. a SyntaxError that
            # prevented this module from importing at all.
            logger.error(f"Job processing failed: {str(e)}")
            job.status = JobStatus.FAILED
            job.error_message = str(e)
            job.updated_at = datetime.utcnow()
            self._update_job(job)

    def _update_job(self, job: ProcessingJob):
        """Write job state to the in-memory map and refresh the Redis copy."""
        with self.lock:
            self.jobs[job.job_id] = job
            redis_client.setex(
                f"job:{job.job_id}",
                86400,
                json.dumps(asdict(job), default=str)
            )


# Initialize Job Queue Manager
job_queue_manager = JobQueueManager()
395
+
396
# Application lifespan hook wired into FastAPI below.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Log service startup before serving and shutdown after."""
    logger.info("Starting PDF Processor API")
    yield
    logger.info("Shutting down PDF Processor API")
404
+
405
# Initialize FastAPI app with OpenAPI documentation
app = FastAPI(
    title="PDF Splitter API",
    description="High-performance API for splitting large PDFs into segments using qpdf",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
    lifespan=lifespan
)

# Add CORS middleware
# NOTE(review): per the CORS spec, browsers reject credentialed requests when
# the allowed origin is the wildcard "*"; either drop allow_credentials or
# pin explicit origins — confirm which callers need credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
424
+
425
# Security dependency: key is read from the configured header; auto_error is
# off so we can return our own 401 wording.
api_key_header = APIKeyHeader(name=Config.API_KEY_HEADER, auto_error=False)


async def verify_api_key(api_key: str = Security(api_key_header)) -> str:
    """FastAPI dependency: return the caller's API key or raise 401/403."""
    # Missing header entirely -> 401.
    if not api_key:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="API key required"
        )
    # Unknown, inactive, or rate-limited key -> 403.
    if not api_key_manager.validate_key(api_key):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or rate-limited API key"
        )
    return api_key
443
+
444
# Response Models
class JobResponse(BaseModel):
    """Returned by the upload endpoint when a job is accepted (202)."""
    job_id: str = Field(..., description="Unique job identifier")
    status: JobStatus = Field(..., description="Current job status")
    message: str = Field(..., description="Status message")
    created_at: str = Field(..., description="Job creation timestamp")

class JobStatusResponse(BaseModel):
    """Full job-state snapshot returned by the status endpoint."""
    job_id: str
    status: JobStatus
    filename: str
    created_at: str
    updated_at: str
    total_pages: Optional[int] = None
    segments_created: Optional[int] = None
    segments_discarded: Optional[int] = None
    error_message: Optional[str] = None
    # Pydantic copies field defaults per instance, so a [] default here does
    # not suffer Python's shared-mutable-default pitfall.
    output_files: List[str] = []
    processing_time: Optional[float] = None

class ErrorResponse(BaseModel):
    """Uniform error envelope referenced by the endpoints' responses maps."""
    detail: str
    error_code: Optional[str] = None
    timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())
468
+
469
# API Endpoints
@app.post(
    "/api/v1/upload",
    response_model=JobResponse,
    status_code=status.HTTP_202_ACCEPTED,
    summary="Upload PDF for splitting",
    description="Upload a large PDF file to be split into 4.5MB segments",
    responses={
        202: {"description": "Job accepted and queued for processing"},
        400: {"model": ErrorResponse, "description": "Invalid file or request"},
        401: {"model": ErrorResponse, "description": "Missing or invalid API key"},
        403: {"model": ErrorResponse, "description": "Rate limit exceeded"},
        413: {"model": ErrorResponse, "description": "File too large"},
        503: {"model": ErrorResponse, "description": "Service unavailable"}
    }
)
async def upload_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(..., description="PDF file to upload"),
    api_key: str = Depends(verify_api_key)
):
    """
    Upload a PDF file for splitting into segments.

    - Files are split into segments of approximately 4.5MB
    - Segments larger than 5MB are discarded
    - Processing is done asynchronously
    - Returns a job ID for tracking progress
    """
    try:
        # BUG FIX: UploadFile.filename is optional and can be None; the
        # original called .lower() on it unconditionally, turning a malformed
        # request into a 500 instead of a 400.
        if not file.filename or not file.filename.lower().endswith('.pdf'):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Only PDF files are accepted"
            )

        # Generate unique job ID; same timestamp is used for created/updated.
        job_id = str(uuid.uuid4())
        timestamp = datetime.utcnow()

        # Job-prefixed name avoids collisions between concurrent uploads.
        upload_path = Config.UPLOAD_DIR / f"{job_id}_{file.filename}"

        # Stream to disk in 1MB chunks so large uploads never sit in memory.
        async with aiofiles.open(upload_path, 'wb') as f:
            chunk_size = 1024 * 1024  # 1MB chunks
            while content := await file.read(chunk_size):
                await f.write(content)

        # Verify it's a valid PDF using pikepdf before queueing.
        try:
            with pikepdf.open(upload_path) as pdf:
                page_count = len(pdf.pages)
                logger.info(f"Valid PDF uploaded: {file.filename}, {page_count} pages")
        except Exception as e:
            upload_path.unlink()  # Delete invalid file
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid PDF file: {str(e)}"
            )

        # Create job record in PENDING state.
        job = ProcessingJob(
            job_id=job_id,
            api_key=api_key,
            filename=upload_path.name,
            status=JobStatus.PENDING,
            created_at=timestamp,
            updated_at=timestamp
        )

        # Add to queue (raises 503 if the queue is full).
        job_queue_manager.add_job(job)

        return JobResponse(
            job_id=job_id,
            status=JobStatus.PENDING,
            message="PDF uploaded successfully and queued for processing",
            created_at=timestamp.isoformat()
        )

    except HTTPException:
        # Re-raise our own deliberate HTTP errors unchanged.
        raise
    except Exception as e:
        logger.error(f"Upload error: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Upload failed: {str(e)}"
        )
559
+
560
@app.get(
    "/api/v1/job/{job_id}",
    response_model=JobStatusResponse,
    summary="Get job status",
    description="Check the status of a PDF splitting job",
    responses={
        200: {"description": "Job status retrieved successfully"},
        401: {"model": ErrorResponse, "description": "Missing or invalid API key"},
        404: {"model": ErrorResponse, "description": "Job not found"}
    }
)
async def get_job_status(
    job_id: str,
    api_key: str = Depends(verify_api_key)
):
    """
    Get the current status of a PDF splitting job.

    BUG FIX: the original declared ``job_id: str = Path(..., description=...)``,
    but only ``pathlib.Path`` is imported here — pathlib's constructor rejects
    the ``description`` keyword, so the module crashed at import time. A bare
    ``job_id: str`` is already a required FastAPI path parameter, so the
    bogus default is simply dropped.
    """
    job = job_queue_manager.get_job_status(job_id)

    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    # Only the key that created the job may inspect it.
    if job.api_key != api_key:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied to this job"
        )

    # Jobs rehydrated from Redis may carry ISO strings instead of datetimes;
    # accept either form rather than crashing on .isoformat().
    created = job.created_at if isinstance(job.created_at, str) else job.created_at.isoformat()
    updated = job.updated_at if isinstance(job.updated_at, str) else job.updated_at.isoformat()

    return JobStatusResponse(
        job_id=job.job_id,
        status=job.status,
        filename=job.filename,
        created_at=created,
        updated_at=updated,
        total_pages=job.total_pages,
        segments_created=job.segments_created,
        segments_discarded=job.segments_discarded,
        error_message=job.error_message,
        output_files=job.output_files,
        processing_time=job.processing_time
    )
606
+
607
@app.get(
    "/api/v1/job/{job_id}/download/{segment_name}",
    response_class=FileResponse,
    summary="Download segment",
    description="Download a specific segment from a completed job",
    responses={
        200: {"description": "Segment file", "content": {"application/pdf": {}}},
        401: {"model": ErrorResponse, "description": "Missing or invalid API key"},
        404: {"model": ErrorResponse, "description": "Job or segment not found"}
    }
)
async def download_segment(
    job_id: str,
    segment_name: str,
    api_key: str = Depends(verify_api_key)
):
    """
    Download a specific PDF segment from a completed job.

    BUG FIX: both parameters originally defaulted to ``Path(..., description=...)``,
    which resolves to ``pathlib.Path`` here (fastapi's ``Path`` helper is not
    imported) and raises TypeError at import time. Bare ``str`` annotations
    declare the same required path parameters.
    """
    job = job_queue_manager.get_job_status(job_id)

    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    # Only the key that created the job may download its output.
    if job.api_key != api_key:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied"
        )

    if job.status != JobStatus.COMPLETED:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Job is {job.status}, not completed"
        )

    # Whitelist check against the job's recorded outputs; this also prevents
    # path traversal via segment_name since arbitrary names are rejected.
    if segment_name not in job.output_files:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Segment {segment_name} not found"
        )

    file_path = Config.OUTPUT_DIR / job_id / segment_name

    if not file_path.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Segment file not found on disk"
        )

    return FileResponse(
        path=file_path,
        media_type="application/pdf",
        filename=segment_name
    )
665
+
666
@app.get(
    "/api/v1/health",
    summary="Health check",
    description="Check API health and system status"
)
async def health_check():
    """
    Health check endpoint for monitoring.

    Always returns 200 with an overall "healthy" status; the per-component
    fields (redis, queue_size, active_jobs) carry the actual signal.
    """
    try:
        # Check Redis connection.
        redis_status = "healthy" if redis_client.ping() else "unhealthy"
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; Exception is the correct breadth for "Redis
        # unreachable".
        redis_status = "unhealthy"

    return {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat(),
        "redis": redis_status,
        "queue_size": job_queue_manager.queue.qsize(),
        "active_jobs": len(job_queue_manager.jobs)
    }
688
+
689
# Error handlers
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc: HTTPException):
    """Render HTTPExceptions as a JSON envelope with a timestamp."""
    payload = {
        "detail": exc.detail,
        "timestamp": datetime.utcnow().isoformat()
    }
    return JSONResponse(status_code=exc.status_code, content=payload)


@app.exception_handler(Exception)
async def general_exception_handler(request, exc: Exception):
    """Last-resort handler: log the error, return an opaque 500 to the client."""
    logger.error(f"Unhandled exception: {str(exc)}")
    payload = {
        "detail": "Internal server error",
        "timestamp": datetime.utcnow().isoformat()
    }
    return JSONResponse(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        content=payload
    )
710
+
711
if __name__ == "__main__":
    # Local/dev entry point; in deployment the app object is served by an
    # external ASGI runner instead.
    import uvicorn
    # NOTE(review): reload=True starts a file-watching auto-reloader — dev
    # convenience only; confirm it is disabled for production deployments.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
        log_level="info",
        access_log=True
    )