bluenevus committed on
Commit
c21b0a9
Β·
verified Β·
1 Parent(s): 6e431b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +271 -623
app.py CHANGED
@@ -1,183 +1,66 @@
1
- from fastapi import FastAPI, File, UploadFile, HTTPException, Depends, BackgroundTasks, Security, status
2
- from fastapi.security import APIKeyHeader
3
- from fastapi.responses import JSONResponse, FileResponse
4
- from fastapi.middleware.cors import CORSMiddleware
5
- from pydantic import BaseModel, Field, validator
6
- from typing import List, Optional, Dict, Any, Tuple
7
- import asyncio
8
- import aiofiles
9
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
10
- import hashlib
11
- import uuid
12
- from datetime import datetime, timedelta
13
  import os
 
 
14
  import shutil
15
  from pathlib import Path
16
- import pikepdf # Python wrapper for qpdf
17
- import io
18
- import logging
19
- from queue import Queue, PriorityQueue
20
  import threading
21
  import time
22
- from enum import Enum
23
- from contextlib import asynccontextmanager
24
- import tempfile
25
- import traceback
26
- import json
27
- from dataclasses import dataclass, asdict
28
- import redis
29
- from celery import Celery
30
 
31
  # Configure logging
32
- logging.basicConfig(
33
- level=logging.INFO,
34
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
35
- handlers=[
36
- logging.FileHandler('pdf_processor.log'),
37
- logging.StreamHandler()
38
- ]
39
- )
40
  logger = logging.getLogger(__name__)
41
 
42
  # Configuration
43
- class Config:
44
- MAX_FILE_SIZE_MB = 5
45
- CHUNK_SIZE_MB = 4.5
46
- MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
47
- CHUNK_SIZE_BYTES = int(CHUNK_SIZE_MB * 1024 * 1024)
48
- UPLOAD_DIR = Path("uploads")
49
- OUTPUT_DIR = Path("outputs")
50
- TEMP_DIR = Path("temp")
51
- MAX_WORKERS = min(32, (os.cpu_count() or 1) * 2)
52
- MAX_QUEUE_SIZE = 1000
53
- API_KEY_HEADER = "X-API-Key"
54
- REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
55
- CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
56
- CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")
57
-
58
- # qpdf specific settings
59
- QPDF_COMPRESSION_LEVEL = 9 # Maximum compression
60
- QPDF_STREAM_DATA = "compress" # Keep streams compressed
61
- QPDF_OBJECT_STREAMS = "generate" # Generate object streams for better compression
62
 
63
- # Create directories
64
- for dir_path in [Config.UPLOAD_DIR, Config.OUTPUT_DIR, Config.TEMP_DIR]:
65
- dir_path.mkdir(parents=True, exist_ok=True)
66
 
67
- # Initialize Redis
68
- redis_client = redis.from_url(Config.REDIS_URL, decode_responses=True)
69
 
70
- # Initialize Celery
71
- celery_app = Celery(
72
- 'pdf_processor',
73
- broker=Config.CELERY_BROKER_URL,
74
- backend=Config.CELERY_RESULT_BACKEND
75
- )
76
-
77
- # API Key Management
78
- class APIKeyManager:
79
- def __init__(self):
80
- self.valid_keys = self._load_api_keys()
81
-
82
- def _load_api_keys(self) -> Dict[str, Dict]:
83
- """Load API keys from Redis or environment"""
84
- keys = {}
85
- # Load from environment for demo
86
- demo_keys = os.getenv("API_KEYS", "demo-key-123,test-key-456").split(",")
87
- for key in demo_keys:
88
- keys[key] = {
89
- "created_at": datetime.utcnow().isoformat(),
90
- "rate_limit": 100,
91
- "active": True
92
- }
93
- return keys
94
 
95
- def validate_key(self, api_key: str) -> bool:
96
- """Validate API key and check rate limits"""
97
- if api_key not in self.valid_keys:
98
- return False
99
-
100
- key_info = self.valid_keys[api_key]
101
- if not key_info.get("active", False):
102
- return False
103
-
104
- # Check rate limit using Redis
105
- key_name = f"rate_limit:{api_key}"
106
- try:
107
- current_count = redis_client.incr(key_name)
108
- if current_count == 1:
109
- redis_client.expire(key_name, 3600) # 1 hour window
110
-
111
- if current_count > key_info.get("rate_limit", 100):
112
- return False
113
- except Exception as e:
114
- logger.error(f"Rate limit check failed: {e}")
115
-
116
- return True
117
-
118
- # Initialize API Key Manager
119
- api_key_manager = APIKeyManager()
120
-
121
- # Job Status Enum
122
- class JobStatus(str, Enum):
123
- PENDING = "pending"
124
- PROCESSING = "processing"
125
- COMPLETED = "completed"
126
- FAILED = "failed"
127
- CANCELLED = "cancelled"
128
-
129
- # Job Model
130
- @dataclass
131
- class ProcessingJob:
132
- job_id: str
133
- api_key: str
134
- filename: str
135
- status: JobStatus
136
- created_at: datetime
137
- updated_at: datetime
138
- total_pages: Optional[int] = None
139
- segments_created: Optional[int] = None
140
- segments_discarded: Optional[int] = None
141
- error_message: Optional[str] = None
142
- output_files: List[str] = None
143
- processing_time: Optional[float] = None
144
-
145
- def __post_init__(self):
146
- if self.output_files is None:
147
- self.output_files = []
148
-
149
- # PDF Processor with qpdf
150
- class QPDFProcessor:
151
- def __init__(self):
152
- self.executor = ThreadPoolExecutor(max_workers=Config.MAX_WORKERS)
153
-
154
- async def split_pdf_with_qpdf(self,
155
- input_path: Path,
156
- output_dir: Path,
157
- job_id: str) -> Tuple[List[Path], List[Path], Dict]:
158
  """
159
- Split PDF using qpdf for efficient handling of compressed PDFs
 
160
  """
 
 
 
 
 
 
 
 
 
161
  try:
162
- start_time = time.time()
163
- kept_files = []
164
- discarded_files = []
165
- stats = {
166
- "total_pages": 0,
167
- "segments_created": 0,
168
- "segments_discarded": 0,
169
- "compression_ratio": 0
170
- }
171
 
172
- # Open PDF with pikepdf (qpdf wrapper)
173
  with pikepdf.open(input_path) as pdf:
174
  total_pages = len(pdf.pages)
175
  stats["total_pages"] = total_pages
176
 
177
- # Calculate pages per segment based on file size
178
  file_size = input_path.stat().st_size
179
- avg_page_size = file_size / total_pages
180
- pages_per_segment = max(1, int(Config.CHUNK_SIZE_BYTES / avg_page_size))
181
 
182
  segment_num = 0
183
  page_start = 0
@@ -186,535 +69,300 @@ class QPDFProcessor:
186
  page_end = min(page_start + pages_per_segment, total_pages)
187
  segment_num += 1
188
 
189
- # Create output path for segment
190
- segment_filename = f"{job_id}_segment_{segment_num:04d}.pdf"
 
 
 
 
 
191
  segment_path = output_dir / segment_filename
192
 
193
  # Create new PDF with selected pages
194
  segment_pdf = pikepdf.new()
195
-
196
- # Copy pages efficiently without decompressing
197
  for page_num in range(page_start, page_end):
198
  segment_pdf.pages.append(pdf.pages[page_num])
199
 
200
- # Save with compression settings
201
  segment_pdf.save(
202
  segment_path,
203
  compress_streams=True,
204
- stream_decode_level=pikepdf.StreamDecodeLevel.none, # Don't decode streams
205
  object_stream_mode=pikepdf.ObjectStreamMode.generate,
206
- linearize=True, # Web optimization
207
- min_version=pdf.pdf_version
208
  )
209
 
210
  # Check segment size
211
  segment_size = segment_path.stat().st_size
212
 
213
- if segment_size <= Config.MAX_FILE_SIZE_BYTES:
214
- kept_files.append(segment_path)
215
  stats["segments_created"] += 1
 
216
  logger.info(f"Created segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")
217
  else:
218
- # Try to re-split if segment is too large
219
- if pages_per_segment > 1:
220
- # Recursively split this segment
221
- logger.warning(f"Segment {segment_num} too large ({segment_size / 1024 / 1024:.2f} MB), re-splitting...")
222
- segment_path.unlink() # Delete oversized segment
223
-
224
- # Adjust pages per segment
 
225
  pages_per_segment = max(1, pages_per_segment // 2)
226
  continue
227
- else:
228
- # Single page is too large, discard
229
- discarded_files.append(segment_path)
230
- stats["segments_discarded"] += 1
231
- logger.warning(f"Discarded segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")
232
 
233
  page_start = page_end
234
-
235
- # Calculate compression ratio
236
- original_size = input_path.stat().st_size
237
- total_output_size = sum(f.stat().st_size for f in kept_files)
238
- if original_size > 0:
239
- stats["compression_ratio"] = (1 - total_output_size / original_size) * 100
240
-
241
- stats["processing_time"] = time.time() - start_time
242
-
243
- return kept_files, discarded_files, stats
244
-
245
- except Exception as e:
246
- logger.error(f"Error splitting PDF with qpdf: {str(e)}")
247
- raise
248
-
249
- async def optimize_pdf(self, input_path: Path, output_path: Path) -> Path:
250
- """
251
- Optimize PDF using qpdf's advanced features
252
- """
253
- try:
254
- with pikepdf.open(input_path) as pdf:
255
- # Remove unnecessary elements
256
- pdf.remove_unreferenced_resources()
257
 
258
- # Save with maximum optimization
259
- pdf.save(
260
- output_path,
261
- compress_streams=True,
262
- object_stream_mode=pikepdf.ObjectStreamMode.generate,
263
- linearize=True,
264
- recompress_flate=True,
265
- deterministic_id=True
266
- )
267
-
268
- return output_path
269
-
270
  except Exception as e:
271
- logger.error(f"Error optimizing PDF: {str(e)}")
272
  raise
 
 
273
 
274
- # Job Queue Manager
275
- class JobQueueManager:
276
- def __init__(self):
277
- self.queue = PriorityQueue(maxsize=Config.MAX_QUEUE_SIZE)
278
- self.jobs: Dict[str, ProcessingJob] = {}
279
- self.lock = threading.Lock()
280
- self.processor = QPDFProcessor()
281
- self.processing_thread = threading.Thread(target=self._process_queue, daemon=True)
282
- self.processing_thread.start()
283
-
284
- def add_job(self, job: ProcessingJob) -> str:
285
- """Add job to queue"""
286
- with self.lock:
287
- if self.queue.full():
288
- raise HTTPException(
289
- status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
290
- detail="Queue is full. Please try again later."
291
- )
292
-
293
- priority = 1 if job.api_key in ["premium-key"] else 2
294
- self.queue.put((priority, job.created_at, job.job_id))
295
- self.jobs[job.job_id] = job
296
-
297
- # Store in Redis for persistence
298
- redis_client.setex(
299
- f"job:{job.job_id}",
300
- 86400, # 24 hours TTL
301
- json.dumps(asdict(job), default=str)
302
- )
303
-
304
- return job.job_id
305
 
306
- def get_job_status(self, job_id: str) -> Optional[ProcessingJob]:
307
- """Get job status"""
308
- with self.lock:
309
- if job_id in self.jobs:
310
- return self.jobs[job_id]
311
-
312
- # Try to get from Redis
313
- job_data = redis_client.get(f"job:{job_id}")
314
- if job_data:
315
- return ProcessingJob(**json.loads(job_data))
316
-
317
- return None
318
-
319
- def _process_queue(self):
320
- """Background thread to process queue"""
321
- while True:
322
- try:
323
- if not self.queue.empty():
324
- _, _, job_id = self.queue.get(timeout=1)
325
-
326
- with self.lock:
327
- job = self.jobs.get(job_id)
328
-
329
- if job and job.status == JobStatus.PENDING:
330
- asyncio.run(self._process_job(job))
331
-
332
- time.sleep(0.1)
333
-
334
- except Exception as e:
335
- logger.error(f"Queue processing error: {str(e)}")
336
 
337
- async def _process_job(self, job: ProcessingJob):
338
- """Process a single job"""
339
- try:
340
- start_time = time.time()
341
-
342
- # Update job status
343
- job.status = JobStatus.PROCESSING
344
- job.updated_at = datetime.utcnow()
345
- self._update_job(job)
346
-
347
- # Process PDF
348
- input_path = Config.UPLOAD_DIR / job.filename
349
- output_dir = Config.OUTPUT_DIR / job.job_id
350
- output_dir.mkdir(parents=True, exist_ok=True)
351
-
352
- # Split PDF using qpdf
353
- kept_files, discarded_files, stats = await self.processor.split_pdf_with_qpdf(
354
- input_path, output_dir, job.job_id
355
- )
356
-
357
- # Update job with results
358
- job.status = JobStatus.COMPLETED
359
- job.total_pages = stats["total_pages"]
360
- job.segments_created = stats["segments_created"]
361
- job.segments_discarded = stats["segments_discarded"]
362
- job.output_files = [str(f.name) for f in kept_files]
363
- job.processing_time = time.time() - start_time
364
- job.updated_at = datetime.utcnow()
365
-
366
- # Clean up discarded files
367
- for file in discarded_files:
368
  try:
369
- file.unlink()
 
 
370
  except Exception as e:
371
- logger.error(f"Error deleting discarded file: {e}")
372
-
373
- self._update_job(job)
374
- logger.info(f"Job {job.job_id} completed successfully")
375
-
376
- except Exception as e:
377
- logger.error(f"Job processing failed: {str(e)}")
378
- job.status = JobStatus.FAILED
379
- job.error_message = str(e)
380
- job.updated_at = datetime.utcnow()
381
- self._update_job(job)
382
-
383
- def _update_job(self, job: ProcessingJob):
384
- """Update job in memory and Redis"""
385
- with self.lock:
386
- self.jobs[job.job_id] = job
387
- redis_client.setex(
388
- f"job:{job.job_id}",
389
- 86400,
390
- json.dumps(asdict(job), default=str)
391
- )
392
-
393
- # Initialize Job Queue Manager
394
- job_queue_manager = JobQueueManager()
395
-
396
- # Lifespan context manager for startup/shutdown
397
- @asynccontextmanager
398
- async def lifespan(app: FastAPI):
399
- # Startup
400
- logger.info("Starting PDF Processor API")
401
- yield
402
- # Shutdown
403
- logger.info("Shutting down PDF Processor API")
404
-
405
- # Initialize FastAPI app with OpenAPI documentation
406
- app = FastAPI(
407
- title="PDF Splitter API",
408
- description="High-performance API for splitting large PDFs into segments using qpdf",
409
- version="1.0.0",
410
- docs_url="/docs",
411
- redoc_url="/redoc",
412
- openapi_url="/openapi.json",
413
- lifespan=lifespan
414
- )
415
-
416
- # Add CORS middleware
417
- app.add_middleware(
418
- CORSMiddleware,
419
- allow_origins=["*"],
420
- allow_credentials=True,
421
- allow_methods=["*"],
422
- allow_headers=["*"],
423
- )
424
-
425
- # Security dependency
426
- api_key_header = APIKeyHeader(name=Config.API_KEY_HEADER, auto_error=False)
427
-
428
- async def verify_api_key(api_key: str = Security(api_key_header)) -> str:
429
- """Verify API key"""
430
- if not api_key:
431
- raise HTTPException(
432
- status_code=status.HTTP_401_UNAUTHORIZED,
433
- detail="API key required"
434
- )
435
-
436
- if not api_key_manager.validate_key(api_key):
437
- raise HTTPException(
438
- status_code=status.HTTP_403_FORBIDDEN,
439
- detail="Invalid or rate-limited API key"
440
- )
441
-
442
- return api_key
443
-
444
- # Response Models
445
- class JobResponse(BaseModel):
446
- job_id: str = Field(..., description="Unique job identifier")
447
- status: JobStatus = Field(..., description="Current job status")
448
- message: str = Field(..., description="Status message")
449
- created_at: str = Field(..., description="Job creation timestamp")
450
 
451
- class JobStatusResponse(BaseModel):
452
- job_id: str
453
- status: JobStatus
454
- filename: str
455
- created_at: str
456
- updated_at: str
457
- total_pages: Optional[int] = None
458
- segments_created: Optional[int] = None
459
- segments_discarded: Optional[int] = None
460
- error_message: Optional[str] = None
461
- output_files: List[str] = []
462
- processing_time: Optional[float] = None
463
 
464
- class ErrorResponse(BaseModel):
465
- detail: str
466
- error_code: Optional[str] = None
467
- timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())
468
 
469
- # API Endpoints
470
- @app.post(
471
- "/api/v1/upload",
472
- response_model=JobResponse,
473
- status_code=status.HTTP_202_ACCEPTED,
474
- summary="Upload PDF for splitting",
475
- description="Upload a large PDF file to be split into 4.5MB segments",
476
- responses={
477
- 202: {"description": "Job accepted and queued for processing"},
478
- 400: {"model": ErrorResponse, "description": "Invalid file or request"},
479
- 401: {"model": ErrorResponse, "description": "Missing or invalid API key"},
480
- 403: {"model": ErrorResponse, "description": "Rate limit exceeded"},
481
- 413: {"model": ErrorResponse, "description": "File too large"},
482
- 503: {"model": ErrorResponse, "description": "Service unavailable"}
483
- }
484
- )
485
- async def upload_pdf(
486
- background_tasks: BackgroundTasks,
487
- file: UploadFile = File(..., description="PDF file to upload"),
488
- api_key: str = Depends(verify_api_key)
489
- ):
490
  """
491
- Upload a PDF file for splitting into segments.
492
-
493
- - Files are split into segments of approximately 4.5MB
494
- - Segments larger than 5MB are discarded
495
- - Processing is done asynchronously
496
- - Returns a job ID for tracking progress
497
  """
 
 
 
 
 
 
498
  try:
499
- # Validate file type
500
- if not file.filename.lower().endswith('.pdf'):
501
- raise HTTPException(
502
- status_code=status.HTTP_400_BAD_REQUEST,
503
- detail="Only PDF files are accepted"
504
- )
505
-
506
- # Generate unique job ID
507
- job_id = str(uuid.uuid4())
508
- timestamp = datetime.utcnow()
509
 
510
  # Save uploaded file
511
- upload_path = Config.UPLOAD_DIR / f"{job_id}_{file.filename}"
512
 
513
- # Stream file to disk to handle large files efficiently
514
- async with aiofiles.open(upload_path, 'wb') as f:
515
- chunk_size = 1024 * 1024 # 1MB chunks
516
- while content := await file.read(chunk_size):
517
- await f.write(content)
 
518
 
519
- # Verify it's a valid PDF using pikepdf
520
- try:
521
- with pikepdf.open(upload_path) as pdf:
522
- page_count = len(pdf.pages)
523
- logger.info(f"Valid PDF uploaded: {file.filename}, {page_count} pages")
524
- except Exception as e:
525
- upload_path.unlink() # Delete invalid file
526
- raise HTTPException(
527
- status_code=status.HTTP_400_BAD_REQUEST,
528
- detail=f"Invalid PDF file: {str(e)}"
529
- )
530
 
531
- # Create job
532
- job = ProcessingJob(
533
- job_id=job_id,
534
- api_key=api_key,
535
- filename=upload_path.name,
536
- status=JobStatus.PENDING,
537
- created_at=timestamp,
538
- updated_at=timestamp
539
- )
540
 
541
- # Add to queue
542
- job_queue_manager.add_job(job)
543
 
544
- return JobResponse(
545
- job_id=job_id,
546
- status=JobStatus.PENDING,
547
- message="PDF uploaded successfully and queued for processing",
548
- created_at=timestamp.isoformat()
 
 
 
 
549
  )
550
 
551
- except HTTPException:
552
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
  except Exception as e:
554
- logger.error(f"Upload error: {str(e)}")
555
- raise HTTPException(
556
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
557
- detail=f"Upload failed: {str(e)}"
558
- )
 
 
559
 
560
- @app.get(
561
- "/api/v1/job/{job_id}",
562
- response_model=JobStatusResponse,
563
- summary="Get job status",
564
- description="Check the status of a PDF splitting job",
565
- responses={
566
- 200: {"description": "Job status retrieved successfully"},
567
- 401: {"model": ErrorResponse, "description": "Missing or invalid API key"},
568
- 404: {"model": ErrorResponse, "description": "Job not found"}
 
 
569
  }
570
- )
571
- async def get_job_status(
572
- job_id: str = Path(..., description="Job ID to check"),
573
- api_key: str = Depends(verify_api_key)
574
- ):
575
- """
576
- Get the current status of a PDF splitting job.
577
  """
578
- job = job_queue_manager.get_job_status(job_id)
579
 
580
- if not job:
581
- raise HTTPException(
582
- status_code=status.HTTP_404_NOT_FOUND,
583
- detail=f"Job {job_id} not found"
584
- )
585
 
586
- # Verify API key matches
587
- if job.api_key != api_key:
588
- raise HTTPException(
589
- status_code=status.HTTP_403_FORBIDDEN,
590
- detail="Access denied to this job"
591
- )
592
 
593
- return JobStatusResponse(
594
- job_id=job.job_id,
595
- status=job.status,
596
- filename=job.filename,
597
- created_at=job.created_at.isoformat(),
598
- updated_at=job.updated_at.isoformat(),
599
- total_pages=job.total_pages,
600
- segments_created=job.segments_created,
601
- segments_discarded=job.segments_discarded,
602
- error_message=job.error_message,
603
- output_files=job.output_files,
604
- processing_time=job.processing_time
605
- )
606
-
607
- @app.get(
608
- "/api/v1/job/{job_id}/download/{segment_name}",
609
- response_class=FileResponse,
610
- summary="Download segment",
611
- description="Download a specific segment from a completed job",
612
- responses={
613
- 200: {"description": "Segment file", "content": {"application/pdf": {}}},
614
- 401: {"model": ErrorResponse, "description": "Missing or invalid API key"},
615
- 404: {"model": ErrorResponse, "description": "Job or segment not found"}
616
- }
617
- )
618
- async def download_segment(
619
- job_id: str = Path(..., description="Job ID"),
620
- segment_name: str = Path(..., description="Segment filename"),
621
- api_key: str = Depends(verify_api_key)
622
- ):
623
- """
624
- Download a specific PDF segment from a completed job.
625
- """
626
- job = job_queue_manager.get_job_status(job_id)
627
 
628
- if not job:
629
- raise HTTPException(
630
- status_code=status.HTTP_404_NOT_FOUND,
631
- detail=f"Job {job_id} not found"
632
- )
633
 
634
- if job.api_key != api_key:
635
- raise HTTPException(
636
- status_code=status.HTTP_403_FORBIDDEN,
637
- detail="Access denied"
638
- )
639
 
640
- if job.status != JobStatus.COMPLETED:
641
- raise HTTPException(
642
- status_code=status.HTTP_400_BAD_REQUEST,
643
- detail=f"Job is {job.status}, not completed"
644
- )
 
 
 
 
 
 
 
 
645
 
646
- if segment_name not in job.output_files:
647
- raise HTTPException(
648
- status_code=status.HTTP_404_NOT_FOUND,
649
- detail=f"Segment {segment_name} not found"
650
- )
651
 
652
- file_path = Config.OUTPUT_DIR / job_id / segment_name
 
653
 
654
- if not file_path.exists():
655
- raise HTTPException(
656
- status_code=status.HTTP_404_NOT_FOUND,
657
- detail=f"Segment file not found on disk"
658
  )
659
 
660
- return FileResponse(
661
- path=file_path,
662
- media_type="application/pdf",
663
- filename=segment_name
 
664
  )
665
-
666
- @app.get(
667
- "/api/v1/health",
668
- summary="Health check",
669
- description="Check API health and system status"
670
- )
671
- async def health_check():
672
- """
673
- Health check endpoint for monitoring.
674
- """
675
- try:
676
- # Check Redis connection
677
- redis_status = "healthy" if redis_client.ping() else "unhealthy"
678
- except:
679
- redis_status = "unhealthy"
680
 
681
- return {
682
- "status": "healthy",
683
- "timestamp": datetime.utcnow().isoformat(),
684
- "redis": redis_status,
685
- "queue_size": job_queue_manager.queue.qsize(),
686
- "active_jobs": len(job_queue_manager.jobs)
687
- }
688
-
689
- # Error handlers
690
- @app.exception_handler(HTTPException)
691
- async def http_exception_handler(request, exc: HTTPException):
692
- return JSONResponse(
693
- status_code=exc.status_code,
694
- content={
695
- "detail": exc.detail,
696
- "timestamp": datetime.utcnow().isoformat()
697
- }
698
- )
699
-
700
- @app.exception_handler(Exception)
701
- async def general_exception_handler(request, exc: Exception):
702
- logger.error(f"Unhandled exception: {str(exc)}")
703
- return JSONResponse(
704
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
705
- content={
706
- "detail": "Internal server error",
707
- "timestamp": datetime.utcnow().isoformat()
708
- }
709
- )
710
 
 
711
  if __name__ == "__main__":
712
- import uvicorn
713
- uvicorn.run(
714
- "app:app",
715
- host="0.0.0.0",
716
- port=8000,
717
- reload=True,
718
- log_level="info",
719
- access_log=True
720
  )
 
1
import gradio as gr
import pikepdf
import os
import zipfile
import tempfile
import shutil
from pathlib import Path
import uuid
from datetime import datetime, timedelta
import threading
import time
from typing import Tuple, Optional
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
MAX_FILE_SIZE_MB = 5       # hard upper bound for one segment (oversized single pages are still kept)
CHUNK_SIZE_MB = 4.5        # target size of each output segment
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
CHUNK_SIZE_BYTES = int(CHUNK_SIZE_MB * 1024 * 1024)
TEMP_DIR = Path("temp_files")   # root for per-session working directories
CLEANUP_AFTER_MINUTES = 10      # sessions older than this are removed by the cleanup thread

# Create temp directory
TEMP_DIR.mkdir(exist_ok=True)

# Store user sessions: session_id -> {"created": datetime, "dir": Path}
user_sessions = {}
32
 
33
class PDFProcessor:
    """Handle PDF splitting with qpdf/pikepdf."""

    @staticmethod
    def split_pdf(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[list, dict]:
        """
        Split a PDF into chunks using pikepdf (qpdf wrapper).

        Args:
            input_path: Source PDF file on disk.
            output_dir: Existing directory that receives the segment files.
            progress_callback: Optional callable (fraction, message) invoked
                while segments are produced.

        Returns:
            (list of output file Paths, statistics dict with keys
            total_pages, segments_created, segments_discarded,
            original_size_mb, total_output_size_mb)

        Raises:
            Re-raises any pikepdf/OS error after logging it.
        """
        output_files = []
        stats = {
            "total_pages": 0,
            "segments_created": 0,
            "segments_discarded": 0,
            "original_size_mb": 0,
            "total_output_size_mb": 0
        }

        try:
            # Record the original file size for the statistics report.
            stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024

            # Open PDF with pikepdf
            with pikepdf.open(input_path) as pdf:
                total_pages = len(pdf.pages)
                stats["total_pages"] = total_pages

                # Estimate how many pages fit into one target-sized chunk.
                file_size = input_path.stat().st_size
                avg_page_size = file_size / total_pages if total_pages > 0 else file_size
                pages_per_segment = max(1, int(CHUNK_SIZE_BYTES / avg_page_size))

                segment_num = 0
                page_start = 0

                while page_start < total_pages:
                    page_end = min(page_start + pages_per_segment, total_pages)
                    segment_num += 1

                    # Update progress
                    if progress_callback:
                        progress_callback(page_start / total_pages,
                                          f"Processing segment {segment_num}...")

                    # Create segment filename
                    segment_filename = f"segment_{segment_num:04d}_pages_{page_start+1}-{page_end}.pdf"
                    segment_path = output_dir / segment_filename

                    # Create new PDF with selected pages
                    segment_pdf = pikepdf.new()
                    for page_num in range(page_start, page_end):
                        segment_pdf.pages.append(pdf.pages[page_num])

                    # Save with compression
                    segment_pdf.save(
                        segment_path,
                        compress_streams=True,
                        object_stream_mode=pikepdf.ObjectStreamMode.generate,
                        linearize=True
                    )

                    # Check segment size
                    segment_size = segment_path.stat().st_size

                    if segment_size <= MAX_FILE_SIZE_BYTES:
                        output_files.append(segment_path)
                        stats["segments_created"] += 1
                        stats["total_output_size_mb"] += segment_size / 1024 / 1024
                        logger.info(f"Created segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")
                    elif page_end - page_start == 1:
                        # A single page exceeds the limit: keep it (it cannot
                        # be split further) but count it as "discarded".
                        output_files.append(segment_path)
                        stats["segments_discarded"] += 1
                        # FIX: oversized-but-kept pages now count toward the
                        # reported total output size (they are in the ZIP).
                        stats["total_output_size_mb"] += segment_size / 1024 / 1024
                        logger.warning(f"Segment {segment_num} exceeds size limit but kept (single page)")
                    else:
                        # Segment too large: halve the page count and retry the
                        # same range with fewer pages.
                        segment_path.unlink()
                        pages_per_segment = max(1, pages_per_segment // 2)
                        # FIX: roll back the counter so the retried range
                        # reuses the number and filenames stay gapless.
                        segment_num -= 1
                        continue

                    page_start = page_end

            if progress_callback:
                progress_callback(1.0, "Splitting complete!")

        except Exception as e:
            logger.error(f"Error splitting PDF: {str(e)}")
            raise

        return output_files, stats
124
 
125
class SessionManager:
    """Manage per-upload session directories and their expiry."""

    @staticmethod
    def create_session(session_id: str) -> Path:
        """Create (and register) the working directory for one session."""
        session_dir = TEMP_DIR / session_id
        session_dir.mkdir(exist_ok=True)
        user_sessions[session_id] = {
            "created": datetime.now(),
            "dir": session_dir
        }
        return session_dir

    @staticmethod
    def cleanup_old_sessions():
        """Remove session directories older than CLEANUP_AFTER_MINUTES."""
        current_time = datetime.now()
        sessions_to_remove = []

        # FIX: snapshot the items — this runs on the cleanup thread while
        # Gradio request handlers insert new sessions concurrently, and
        # iterating the live dict could raise "dictionary changed size
        # during iteration".
        for session_id, session_info in list(user_sessions.items()):
            if current_time - session_info["created"] > timedelta(minutes=CLEANUP_AFTER_MINUTES):
                try:
                    shutil.rmtree(session_info["dir"], ignore_errors=True)
                    sessions_to_remove.append(session_id)
                    logger.info(f"Cleaned up session: {session_id}")
                except Exception as e:
                    logger.error(f"Error cleaning session {session_id}: {e}")

        for session_id in sessions_to_remove:
            # FIX: pop() instead of del — tolerate the entry having been
            # removed by another thread in the meantime.
            user_sessions.pop(session_id, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
# Start cleanup thread
def cleanup_worker():
    """Background thread that periodically removes expired session dirs."""
    while True:
        try:
            SessionManager.cleanup_old_sessions()
        except Exception as e:
            logger.error(f"Cleanup error: {e}")
        # FIX: sleep outside the try — previously an exception skipped the
        # sleep and the loop spun hot, logging errors continuously.
        time.sleep(60)  # Check every minute

cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
cleanup_thread.start()
 
 
169
 
170
def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, str]:
    """
    Main processing function for the Gradio interface.

    Args:
        file_obj: Uploaded file — a filepath string (gr.File type="filepath"),
            a file-like object, or raw bytes.
        progress: Gradio progress tracker (injected by Gradio; the default
            Progress() instance is the documented Gradio idiom).

    Returns:
        (zip_file_path or None, statistics_html, status_message)
    """
    if file_obj is None:
        return None, "", "⚠️ Please upload a PDF file"

    session_id = str(uuid.uuid4())
    session_dir = SessionManager.create_session(session_id)

    try:
        progress(0.1, "Initializing...")

        # Save uploaded file into the session directory.
        input_path = session_dir / "input.pdf"

        # Handle both file path string and file object / raw bytes.
        if isinstance(file_obj, str):
            shutil.copy(file_obj, input_path)
        else:
            with open(input_path, 'wb') as f:
                f.write(file_obj.read() if hasattr(file_obj, 'read') else file_obj)

        # Verify it's a valid PDF before doing any work.
        progress(0.2, "Verifying PDF...")
        with pikepdf.open(input_path) as pdf:
            page_count = len(pdf.pages)
            logger.info(f"Valid PDF with {page_count} pages")

        # Create output directory for the segments.
        output_dir = session_dir / "output"
        output_dir.mkdir(exist_ok=True)

        # Split PDF with progress updates.
        progress(0.3, "Splitting PDF into segments...")

        def update_progress(value, message):
            # Scale progress from 0.3 to 0.8 for the splitting phase.
            progress(0.3 + (value * 0.5), message)

        output_files, stats = PDFProcessor.split_pdf(
            input_path,
            output_dir,
            progress_callback=update_progress
        )

        if not output_files:
            return None, "", "❌ No valid segments created"

        # Bundle all segments into one ZIP for download.
        progress(0.9, "Creating ZIP archive...")
        zip_path = session_dir / f"pdf_segments_{session_id[:8]}.zip"

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in output_files:
                zipf.write(file_path, file_path.name)

        # FIX: guard the compression-ratio division — a zero-byte original
        # would previously raise ZeroDivisionError inside the f-string.
        original_mb = stats['original_size_mb']
        compression_pct = ((1 - stats['total_output_size_mb'] / original_mb) * 100) if original_mb > 0 else 0.0

        # Generate statistics HTML
        stats_html = f"""
        <div style="padding: 20px; background: #f0f9ff; border-radius: 10px; margin: 10px 0;">
            <h3 style="color: #0369a1; margin-top: 0;">📊 Processing Results</h3>
            <table style="width: 100%; border-collapse: collapse;">
                <tr style="border-bottom: 1px solid #e0e0e0;">
                    <td style="padding: 8px; font-weight: bold;">📄 Total Pages:</td>
                    <td style="padding: 8px; text-align: right;">{stats['total_pages']}</td>
                </tr>
                <tr style="border-bottom: 1px solid #e0e0e0;">
                    <td style="padding: 8px; font-weight: bold;">✅ Segments Created:</td>
                    <td style="padding: 8px; text-align: right;">{stats['segments_created']}</td>
                </tr>
                <tr style="border-bottom: 1px solid #e0e0e0;">
                    <td style="padding: 8px; font-weight: bold;">📦 Original Size:</td>
                    <td style="padding: 8px; text-align: right;">{stats['original_size_mb']:.2f} MB</td>
                </tr>
                <tr style="border-bottom: 1px solid #e0e0e0;">
                    <td style="padding: 8px; font-weight: bold;">📁 Total Output Size:</td>
                    <td style="padding: 8px; text-align: right;">{stats['total_output_size_mb']:.2f} MB</td>
                </tr>
                <tr>
                    <td style="padding: 8px; font-weight: bold;">💾 Compression Ratio:</td>
                    <td style="padding: 8px; text-align: right;">
                        {compression_pct:.1f}%
                    </td>
                </tr>
            </table>
            <p style="margin-top: 15px; color: #059669; font-weight: bold;">
                ✨ Your file has been split successfully!
            </p>
            <p style="margin-top: 10px; color: #6b7280; font-size: 0.9em;">
                ⏱️ Files will be automatically deleted after {CLEANUP_AFTER_MINUTES} minutes
            </p>
        </div>
        """

        progress(1.0, "Complete! 🎉")

        # Clean up the input copy to save space; segments and the ZIP stay
        # until the session expires.
        input_path.unlink()

        return str(zip_path), stats_html, "✅ Processing complete! Download your ZIP file below."

    except Exception as e:
        logger.error(f"Processing error: {str(e)}")
        # Best-effort cleanup on error (ignore_errors=True never raises, so
        # the original redundant try/except around it was removed).
        shutil.rmtree(session_dir, ignore_errors=True)
        return None, "", f"❌ Error: {str(e)}"
282
 
283
# Create Gradio interface
# Single-page Blocks layout: upload -> split button -> status/stats -> ZIP download.
with gr.Blocks(
    title="PDF Splitter - Fast & Simple",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 800px;
        margin: auto;
    }
    footer {
        display: none !important;
    }
    """
) as app:

    gr.Markdown("""
    # 📄 PDF Splitter Tool

    **Split large PDFs into smaller segments quickly and efficiently!**

    This tool uses advanced compression to split your PDF into segments of approximately **4.5 MB** each.
    Files are processed using qpdf for optimal performance without decompressing the PDF.

    ### How to use:
    1. Upload your PDF file
    2. Click "Split PDF"
    3. Download the ZIP file containing all segments

    *Note: Files are automatically deleted after 10 minutes for your privacy.*
    """)

    with gr.Row():
        with gr.Column():
            # type="filepath" means process_pdf receives a path string.
            file_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )

            split_btn = gr.Button(
                "🚀 Split PDF",
                variant="primary",
                size="lg"
            )

    with gr.Row():
        status_text = gr.Markdown("Ready to process your PDF...")

    with gr.Row():
        stats_output = gr.HTML()

    with gr.Row():
        download_file = gr.File(
            label="📦 Download ZIP",
            visible=True
        )

    # Handle processing: outputs map to process_pdf's (zip, stats_html, status).
    split_btn.click(
        fn=process_pdf,
        inputs=[file_input],
        outputs=[download_file, stats_output, status_text]
    )

    # Add examples
    gr.Markdown("""
    ---
    ### 💡 Features:
    - ✅ Handles compressed PDFs efficiently using qpdf
    - ✅ Automatic file cleanup for privacy
    - ✅ Progress tracking during processing
    - ✅ Creates ZIP archive for easy download
    - ✅ Optimized for Hugging Face Spaces

    ### 🔒 Privacy:
    All uploaded files are automatically deleted after processing and download.
    No files are stored permanently on the server.
    """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
# Launch the app
if __name__ == "__main__":
    # Bind on all interfaces at the standard Hugging Face Spaces port;
    # no public share link is created.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app.launch(**launch_options)