Spaces:

triflix
/

sortitout

Sleeping

App Files Files Community

triflix committed on Nov 20, 2025

Commit

a09fca3

verified ·

1 Parent(s): 7da07f9

Update app.py

Browse files

Files changed (1) hide show

app.py +326 -134

app.py CHANGED Viewed

@@ -1,186 +1,378 @@
 import io
-import gc
-import logging
-from typing import List, Dict, Any
-from PIL import Image
 import numpy as np
-from fastapi import FastAPI, File, UploadFile, HTTPException
 from paddleocr import PaddleOCR
-from pdf2image import convert_from_bytes
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Global OCR instance (loaded once at startup)
-ocr_engine = None
-def get_ocr_engine():
-    """Singleton pattern for OCR model"""
-    global ocr_engine
-    if ocr_engine is None:
-        logger.info("Initializing PaddleOCR model...")
-        ocr_engine = PaddleOCR(
             lang="mr",
             use_doc_orientation_classify=False,
             use_doc_unwarping=False,
             use_textline_orientation=False,
-            show_log=False  # Reduce clutter
         )
-    return ocr_engine
-app = FastAPI(title="PaddleOCR Marathi API")
-def resize_image(image: Image.Image, max_pixels: int = 2500) -> Image.Image:
-    """Resize if any dimension exceeds limit to control memory usage"""
-    if max(image.size) > max_pixels:
-        ratio = max_pixels / max(image.size)
-        new_size = (int(image.width * ratio), int(image.height * ratio))
-        logger.info(f"Resizing {image.size} -> {new_size}")
-        return image.resize(new_size, Image.Resampling.LANCZOS)
-    return image
-def process_image(contents: bytes, filename: str) -> Dict[str, Any]:
-    """Process single image entirely in memory"""
     try:
-        image = Image.open(io.BytesIO(contents)).convert('RGB')
-        image = resize_image(image)
         img_array = np.array(image)
-        ocr = get_ocr_engine()
         result = ocr.ocr(img_array, cls=False)
-        texts, scores, bboxes = [], [], []
-        if result and result[0]:
-            for line in result[0]:
-                bbox, (text, score) = line
-                texts.append(text)
-                scores.append(float(score))
-                bboxes.append(bbox)
-        # Immediate cleanup
-        del image, img_array
-        gc.collect()
         return {
-            "filename": filename,
-            "type": "image",
-            "success": True,
-            "results": [{"text": t, "confidence": s, "bbox": b}
-                       for t, s, b in zip(texts, scores, bboxes)]
         }
     except Exception as e:
-        logger.error(f"Image processing failed: {e}")
-        return {"filename": filename, "type": "image", "success": False, "error": str(e)}
-def process_pdf(contents: bytes, filename: str) -> Dict[str, Any]:
-    """Process PDF page-by-page with memory cleanup between pages"""
     try:
-        # Convert PDF to images (poppler handles memory efficiently)
-        images = convert_from_bytes(contents, dpi=200, fmt='png')
-        pages = []
-        for page_num, image in enumerate(images, 1):
-            image = resize_image(image.convert('RGB'))
-            img_array = np.array(image)
-            ocr = get_ocr_engine()
-            result = ocr.ocr(img_array, cls=False)
-            texts, scores, bboxes = [], [], []
-            if result and result[0]:
-                for line in result[0]:
-                    bbox, (text, score) = line
-                    texts.append(text)
-                    scores.append(float(score))
-                    bboxes.append(bbox)
-            pages.append({
-                "page_number": page_num,
-                "results": [{"text": t, "confidence": s, "bbox": b}
-                           for t, s, b in zip(texts, scores, bboxes)]
             })
-            # Clean up per page
-            del image, img_array
             gc.collect()
-            # REMOVED: await asyncio.sleep(0.05)  # This was causing the error
-        # Final cleanup
-        del images
         gc.collect()
         return {
-            "filename": filename,
-            "type": "pdf",
-            "success": True,
-            "page_count": len(pages),
-            "pages": pages
         }
     except Exception as e:
-        logger.error(f"PDF processing failed: {e}")
-        return {"filename": filename, "type": "pdf", "success": False, "error": str(e)}
-@app.post("/ocr/image")
-async def ocr_image(file: UploadFile = File(...)):
-    """Single image endpoint"""
-    if not file.content_type.startswith('image/'):
-        raise HTTPException(400, "Invalid image file")
-    try:
-        contents = await file.read()
-        return process_image(contents, file.filename)
-    finally:
-        await file.close()
-@app.post("/ocr/pdf")
-async def ocr_pdf(file: UploadFile = File(...)):
-    """Single PDF endpoint"""
-    if not (file.content_type == 'application/pdf' or file.filename.endswith('.pdf')):
-        raise HTTPException(400, "Invalid PDF file")
-    try:
-        contents = await file.read()
-        return process_pdf(contents, file.filename)
-    finally:
-        await file.close()
-@app.post("/ocr/batch")
-async def ocr_batch(files: List[UploadFile] = File(...)):
-    """Batch processing endpoint - max 5 files to prevent OOM"""
-    if len(files) > 5:
-        raise HTTPException(400, "Maximum 5 files per batch")
     results = []
     for file in files:
-        try:
-            contents = await file.read()
-            is_pdf = file.content_type == 'application/pdf' or file.filename.endswith('.pdf')
-            result = process_pdf(contents, file.filename) if is_pdf else process_image(contents, file.filename)
-            results.append(result)
-        except Exception as e:
-            results.append({"filename": file.filename, "success": False, "error": str(e)})
-        finally:
-            await file.close()
-    return {"processed": len(results), "files": results}
-@app.get("/health")
-async def health():
-    """Check if model is loaded"""
-    try:
-        get_ocr_engine()
-        return {"status": "ready", "model": "loaded"}
-    except:
-        raise HTTPException(503, "Model not loaded")
-@app.on_event("startup")
-async def load_model():
-    logger.info("Preloading OCR model...")
-    get_ocr_engine()
-@app.on_event("shutdown")
-async def cleanup():
-    global ocr_engine
-    ocr_engine = None
-    gc.collect()

+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from typing import List
 import io
 import numpy as np
+from PIL import Image
+import pdf2image
+import cv2
 from paddleocr import PaddleOCR
+import gc
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import logging
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+app = FastAPI(
+    title="Marathi OCR API",
+    description="OCR API for Marathi text extraction from images and PDFs",
+    version="1.0.0"
+)
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Global OCR instance; stays None until get_ocr() builds it on first use.
+ocr_instance = None
+# Shared worker pool for OCR calls; two workers cap concurrent processing.
+executor = ThreadPoolExecutor(max_workers=2)  # Limit concurrent processing
+# Constants
+# Upload size ceiling in bytes, enforced by validate_file().
+MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+# Lower-case extensions (with leading dot) accepted as images.
+ALLOWED_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
+# Every accepted extension: the image set plus PDF.
+ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXTENSIONS | {".pdf"}
+# Upper bound on the number of files in one batch request.
+MAX_FILES_PER_REQUEST = 10
+# Rasterisation resolution passed to pdf2image.
+PDF_DPI = 200  # Balance between quality and RAM usage
+def get_ocr():
+    """Return the shared PaddleOCR instance, creating it on the first call.
+
+    The instance is stored in the module-level ``ocr_instance`` so later
+    calls reuse it instead of constructing the model again.
+    """
+    global ocr_instance
+    if ocr_instance is None:
+        logger.info("Initializing PaddleOCR...")
+        # NOTE(review): `use_angle_cls` looks like the older name for
+        # `use_textline_orientation`, and `show_log` may not be accepted by
+        # newer PaddleOCR releases; passing both spellings could be rejected
+        # at construction time. Confirm against the pinned paddleocr version.
+        ocr_instance = PaddleOCR(
             lang="mr",
             use_doc_orientation_classify=False,
             use_doc_unwarping=False,
             use_textline_orientation=False,
+            use_angle_cls=False,  # Disable angle classification for speed
+            show_log=False
         )
+        logger.info("PaddleOCR initialized successfully")
+    return ocr_instance
+def validate_file(file: UploadFile, file_size: int):
+    """Validate uploaded file"""
+    # Check file size
+    if file_size > MAX_FILE_SIZE:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File too large. Maximum size: {MAX_FILE_SIZE / 1024 / 1024}MB"
+        )
+    # Check extension
+    file_ext = file.filename.lower().split('.')[-1]
+    if f".{file_ext}" not in ALLOWED_EXTENSIONS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Invalid file type. Allowed: {', '.join(ALLOWED_EXTENSIONS)}"
+        )
+    return f".{file_ext}"
+def process_image_bytes(image_bytes: bytes) -> np.ndarray:
+    """Convert image bytes to numpy array"""
     try:
+        image = Image.open(io.BytesIO(image_bytes))
+        # Convert to RGB if necessary
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        # Convert to numpy array
         img_array = np.array(image)
+        # Optional: Resize if image is too large to save RAM
+        max_dimension = 4096
+        h, w = img_array.shape[:2]
+        if max(h, w) > max_dimension:
+            scale = max_dimension / max(h, w)
+            new_w, new_h = int(w * scale), int(h * scale)
+            img_array = cv2.resize(img_array, (new_w, new_h))
+            logger.info(f"Resized image from {w}x{h} to {new_w}x{new_h}")
+        return img_array
+    except Exception as e:
+        logger.error(f"Error processing image: {e}")
+        raise HTTPException(status_code=400, detail=f"Invalid image format: {str(e)}")
+def pdf_to_images(pdf_bytes: bytes) -> List[np.ndarray]:
+    """Convert PDF to list of image arrays without saving to disk"""
+    try:
+        # Convert PDF bytes to images in memory
+        images = pdf2image.convert_from_bytes(
+            pdf_bytes,
+            dpi=PDF_DPI,
+            fmt='RGB',
+            thread_count=1  # Limit threads to control RAM
+        )
+        # Convert PIL images to numpy arrays
+        img_arrays = []
+        for img in images:
+            img_array = np.array(img)
+            img_arrays.append(img_array)
+        logger.info(f"Converted PDF to {len(img_arrays)} images")
+        return img_arrays
+    except Exception as e:
+        logger.error(f"Error converting PDF: {e}")
+        raise HTTPException(status_code=400, detail=f"Invalid PDF format: {str(e)}")
+def run_ocr(img_array: np.ndarray) -> dict:
+    """Run OCR on image array"""
+    try:
+        ocr = get_ocr()
         result = ocr.ocr(img_array, cls=False)
+        if not result or not result[0]:
+            return {
+                "texts": [],
+                "scores": [],
+                "details": []
+            }
+        # Extract data
+        texts = []
+        scores = []
+        details = []
+        for line in result[0]:
+            bbox = line[0]  # Bounding box coordinates
+            text = line[1][0]  # Recognized text
+            score = line[1][1]  # Confidence score
+            texts.append(text)
+            scores.append(float(score))
+            details.append({
+                "text": text,
+                "confidence": float(score),
+                "bbox": [[int(point[0]), int(point[1])] for point in bbox]
+            })
         return {
+            "texts": texts,
+            "scores": scores,
+            "details": details
         }
     except Exception as e:
+        logger.error(f"OCR processing error: {e}")
+        raise HTTPException(status_code=500, detail=f"OCR failed: {str(e)}")
+async def process_single_file(file: UploadFile) -> dict:
+    """Process a single file (image or PDF)"""
     try:
+        # Read file into memory
+        file_bytes = await file.read()
+        file_size = len(file_bytes)
+        # Validate
+        file_ext = validate_file(file, file_size)
+        logger.info(f"Processing file: {file.filename} ({file_size / 1024:.2f}KB)")
+        results = []
+        if file_ext == ".pdf":
+            # Process PDF
+            img_arrays = pdf_to_images(file_bytes)
+            # Process each page
+            for page_num, img_array in enumerate(img_arrays, 1):
+                logger.info(f"Processing PDF page {page_num}/{len(img_arrays)}")
+                # Run OCR in thread pool to avoid blocking
+                loop = asyncio.get_event_loop()
+                ocr_result = await loop.run_in_executor(executor, run_ocr, img_array)
+                results.append({
+                    "page": page_num,
+                    **ocr_result
+                })
+                # Clean up
+                del img_array
+                gc.collect()
+        else:
+            # Process single image
+            img_array = process_image_bytes(file_bytes)
+            # Run OCR in thread pool
+            loop = asyncio.get_event_loop()
+            ocr_result = await loop.run_in_executor(executor, run_ocr, img_array)
+            results.append({
+                "page": 1,
+                **ocr_result
             })
+            # Clean up
+            del img_array
             gc.collect()
+        # Clean up file bytes
+        del file_bytes
         gc.collect()
         return {
+            "filename": file.filename,
+            "file_type": file_ext,
+            "total_pages": len(results),
+            "results": results,
+            "status": "success"
         }
+    except HTTPException:
+        raise
     except Exception as e:
+        logger.error(f"Error processing file {file.filename}: {e}")
+        return {
+            "filename": file.filename,
+            "status": "error",
+            "error": str(e)
+        }
+@app.on_event("startup")
+async def startup_event():
+    """Load the OCR model when the application starts.
+
+    Calling get_ocr() here builds the shared instance up front rather than
+    on the first request.
+    """
+    logger.info("Starting OCR API...")
+    # Pre-load OCR model
+    get_ocr()
+    logger.info("API ready!")
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Stop the shared thread pool when the application shuts down."""
+    logger.info("Shutting down...")
+    # wait=True blocks until already-submitted jobs have finished
+    executor.shutdown(wait=True)
+@app.get("/")
+async def root():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "message": "Marathi OCR API is running",
+        "endpoints": {
+            "single_file": "/ocr/",
+            "multiple_files": "/ocr/batch/",
+            "health": "/health"
+        }
+    }
+@app.get("/health")
+async def health():
+    """Detailed health check"""
+    return {
+        "status": "healthy",
+        "ocr_loaded": ocr_instance is not None,
+        "max_file_size_mb": MAX_FILE_SIZE / 1024 / 1024,
+        "max_files_per_request": MAX_FILES_PER_REQUEST,
+        "supported_formats": list(ALLOWED_EXTENSIONS)
+    }
+@app.post("/ocr/")
+async def ocr_single_file(file: UploadFile = File(...)):
+    """
+    OCR for a single image or PDF file
+    - **file**: Image (JPG, PNG, etc.) or PDF file
+    Returns OCR results with text, confidence scores, and bounding boxes
+    """
+    result = await process_single_file(file)
+    if result["status"] == "error":
+        raise HTTPException(status_code=500, detail=result["error"])
+    return JSONResponse(content=result)
+@app.post("/ocr/batch/")
+async def ocr_batch_files(files: List[UploadFile] = File(...)):
+    """
+    OCR for multiple images or PDF files
+    - **files**: List of image or PDF files (max 10)
+    Returns OCR results for each file
+    """
+    if len(files) > MAX_FILES_PER_REQUEST:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Too many files. Maximum: {MAX_FILES_PER_REQUEST}"
+        )
+    logger.info(f"Processing batch of {len(files)} files")
+    # Process files sequentially to manage RAM
     results = []
     for file in files:
+        result = await process_single_file(file)
+        results.append(result)
+        # Force garbage collection between files
+        gc.collect()
+    return JSONResponse(content={
+        "total_files": len(files),
+        "results": results
+    })
+@app.post("/ocr/extract-text/")
+async def extract_text_only(file: UploadFile = File(...)):
+    """
+    Extract only text from image/PDF (simplified response)
+    - **file**: Image or PDF file
+    Returns only extracted text without bounding boxes
+    """
+    result = await process_single_file(file)
+    if result["status"] == "error":
+        raise HTTPException(status_code=500, detail=result["error"])
+    # Simplify response
+    simplified = {
+        "filename": result["filename"],
+        "file_type": result["file_type"],
+        "pages": []
+    }
+    for page_result in result["results"]:
+        simplified["pages"].append({
+            "page": page_result["page"],
+            "text": " ".join(page_result["texts"]),
+            "word_count": len(page_result["texts"]),
+            "average_confidence": sum(page_result["scores"]) / len(page_result["scores"]) if page_result["scores"] else 0
+        })
+    return JSONResponse(content=simplified)
+# Direct-run entry point: serve the app with uvicorn on all interfaces, port 7860.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)