Spaces:

triflix
/

sortitout

Sleeping

App Files Files Community

triflix committed on Nov 20, 2025

Commit

4cfe185

verified ·

1 Parent(s): cb997d4

Update app.py

Browse files

Files changed (1) hide show

app.py +229 -274

app.py CHANGED Viewed

@@ -1,280 +1,244 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from typing import List
-import io
-import numpy as np
-from PIL import Image
-import pdf2image
-import cv2
 from paddleocr import PaddleOCR
-import gc
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
-import logging
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-app = FastAPI(
-    title="Marathi OCR API",
-    description="OCR API for Marathi text extraction from images and PDFs",
-    version="1.0.0"
-)
-# CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Global OCR instance (initialized once)
 ocr_instance = None
-executor = ThreadPoolExecutor(max_workers=2)  # Limit concurrent processing
-# Constants
-MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
-ALLOWED_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
-ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXTENSIONS | {".pdf"}
-MAX_FILES_PER_REQUEST = 10
-PDF_DPI = 200  # Balance between quality and RAM usage
 def get_ocr():
-    """Lazy load OCR instance"""
     global ocr_instance
     if ocr_instance is None:
-        logger.info("Initializing PaddleOCR...")
         ocr_instance = PaddleOCR(
             lang="mr",
             use_doc_orientation_classify=False,
             use_doc_unwarping=False,
             use_textline_orientation=False,
-            use_angle_cls=False,  # Disable angle classification for speed
-            show_log=False
         )
-        logger.info("PaddleOCR initialized successfully")
     return ocr_instance
-def validate_file(file: UploadFile, file_size: int):
     """Validate uploaded file"""
-    # Check file size
-    if file_size > MAX_FILE_SIZE:
         raise HTTPException(
-            status_code=413,
-            detail=f"File too large. Maximum size: {MAX_FILE_SIZE / 1024 / 1024}MB"
         )
-    # Check extension
-    file_ext = file.filename.lower().split('.')[-1]
-    if f".{file_ext}" not in ALLOWED_EXTENSIONS:
         raise HTTPException(
             status_code=400,
-            detail=f"Invalid file type. Allowed: {', '.join(ALLOWED_EXTENSIONS)}"
         )
-    return f".{file_ext}"
-def process_image_bytes(image_bytes: bytes) -> np.ndarray:
-    """Convert image bytes to numpy array"""
     try:
-        image = Image.open(io.BytesIO(image_bytes))
-        # Convert to RGB if necessary
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        # Convert to numpy array
-        img_array = np.array(image)
-        # Optional: Resize if image is too large to save RAM
-        max_dimension = 4096
-        h, w = img_array.shape[:2]
-        if max(h, w) > max_dimension:
-            scale = max_dimension / max(h, w)
-            new_w, new_h = int(w * scale), int(h * scale)
-            img_array = cv2.resize(img_array, (new_w, new_h))
-            logger.info(f"Resized image from {w}x{h} to {new_w}x{new_h}")
-        return img_array
-    except Exception as e:
-        logger.error(f"Error processing image: {e}")
-        raise HTTPException(status_code=400, detail=f"Invalid image format: {str(e)}")
-def pdf_to_images(pdf_bytes: bytes) -> List[np.ndarray]:
-    """Convert PDF to list of image arrays without saving to disk"""
-    try:
-        # Convert PDF bytes to images in memory
-        images = pdf2image.convert_from_bytes(
-            pdf_bytes,
-            dpi=PDF_DPI,
-            fmt='RGB',
-            thread_count=1  # Limit threads to control RAM
-        )
-        # Convert PIL images to numpy arrays
-        img_arrays = []
-        for img in images:
-            img_array = np.array(img)
-            img_arrays.append(img_array)
-        logger.info(f"Converted PDF to {len(img_arrays)} images")
-        return img_arrays
-    except Exception as e:
-        logger.error(f"Error converting PDF: {e}")
-        raise HTTPException(status_code=400, detail=f"Invalid PDF format: {str(e)}")
-def run_ocr(img_array: np.ndarray) -> dict:
-    """Run OCR on image array"""
-    try:
         ocr = get_ocr()
-        result = ocr.ocr(img_array, cls=False)
-        if not result or not result[0]:
             return {
-                "texts": [],
-                "scores": [],
-                "details": []
             }
-        # Extract data
-        texts = []
-        scores = []
-        details = []
-        for line in result[0]:
-            bbox = line[0]  # Bounding box coordinates
-            text = line[1][0]  # Recognized text
-            score = line[1][1]  # Confidence score
-            texts.append(text)
-            scores.append(float(score))
-            details.append({
-                "text": text,
-                "confidence": float(score),
-                "bbox": [[int(point[0]), int(point[1])] for point in bbox]
-            })
         return {
-            "texts": texts,
-            "scores": scores,
-            "details": details
         }
-    except Exception as e:
-        logger.error(f"OCR processing error: {e}")
-        raise HTTPException(status_code=500, detail=f"OCR failed: {str(e)}")
-async def process_single_file(file: UploadFile) -> dict:
-    """Process a single file (image or PDF)"""
     try:
-        # Read file into memory
-        file_bytes = await file.read()
-        file_size = len(file_bytes)
-        # Validate
-        file_ext = validate_file(file, file_size)
-        logger.info(f"Processing file: {file.filename} ({file_size / 1024:.2f}KB)")
-        results = []
-        if file_ext == ".pdf":
-            # Process PDF
-            img_arrays = pdf_to_images(file_bytes)
-            # Process each page
-            for page_num, img_array in enumerate(img_arrays, 1):
-                logger.info(f"Processing PDF page {page_num}/{len(img_arrays)}")
-                # Run OCR in thread pool to avoid blocking
-                loop = asyncio.get_event_loop()
-                ocr_result = await loop.run_in_executor(executor, run_ocr, img_array)
-                results.append({
-                    "page": page_num,
-                    **ocr_result
-                })
-                # Clean up
-                del img_array
-                gc.collect()
-        else:
-            # Process single image
-            img_array = process_image_bytes(file_bytes)
-            # Run OCR in thread pool
-            loop = asyncio.get_event_loop()
-            ocr_result = await loop.run_in_executor(executor, run_ocr, img_array)
-            results.append({
-                "page": 1,
-                **ocr_result
-            })
             # Clean up
-            del img_array
             gc.collect()
-        # Clean up file bytes
-        del file_bytes
-        gc.collect()
         return {
-            "filename": file.filename,
-            "file_type": file_ext,
-            "total_pages": len(results),
-            "results": results,
-            "status": "success"
         }
-    except HTTPException:
-        raise
     except Exception as e:
-        logger.error(f"Error processing file {file.filename}: {e}")
         return {
-            "filename": file.filename,
-            "status": "error",
             "error": str(e)
         }
-@app.on_event("startup")
-async def startup_event():
-    """Initialize on startup"""
-    logger.info("Starting OCR API...")
-    # Pre-load OCR model
-    get_ocr()
-    logger.info("API ready!")
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Cleanup on shutdown"""
-    logger.info("Shutting down...")
-    executor.shutdown(wait=True)
 @app.get("/")
 async def root():
-    """Health check endpoint"""
     return {
-        "status": "healthy",
-        "message": "Marathi OCR API is running",
         "endpoints": {
-            "single_file": "/ocr/",
-            "multiple_files": "/ocr/batch/",
             "health": "/health"
         }
     }
@@ -282,58 +246,81 @@ async def root():
 @app.get("/health")
 async def health():
-    """Detailed health check"""
-    return {
-        "status": "healthy",
-        "ocr_loaded": ocr_instance is not None,
-        "max_file_size_mb": MAX_FILE_SIZE / 1024 / 1024,
-        "max_files_per_request": MAX_FILES_PER_REQUEST,
-        "supported_formats": list(ALLOWED_EXTENSIONS)
-    }
-@app.post("/ocr/")
 async def ocr_single_file(file: UploadFile = File(...)):
     """
-    OCR for a single image or PDF file
     - **file**: Image (JPG, PNG, etc.) or PDF file
-    Returns OCR results with text, confidence scores, and bounding boxes
     """
-    result = await process_single_file(file)
-    if result["status"] == "error":
-        raise HTTPException(status_code=500, detail=result["error"])
     return JSONResponse(content=result)
-@app.post("/ocr/batch/")
 async def ocr_batch_files(files: List[UploadFile] = File(...)):
     """
-    OCR for multiple images or PDF files
     - **files**: List of image or PDF files (max 10)
-    Returns OCR results for each file
     """
-    if len(files) > MAX_FILES_PER_REQUEST:
         raise HTTPException(
             status_code=400,
-            detail=f"Too many files. Maximum: {MAX_FILES_PER_REQUEST}"
         )
-    logger.info(f"Processing batch of {len(files)} files")
-    # Process files sequentially to manage RAM
     results = []
     for file in files:
-        result = await process_single_file(file)
-        results.append(result)
-        # Force garbage collection between files
-        gc.collect()
     return JSONResponse(content={
         "total_files": len(files),
@@ -341,38 +328,6 @@ async def ocr_batch_files(files: List[UploadFile] = File(...)):
     })
-@app.post("/ocr/extract-text/")
-async def extract_text_only(file: UploadFile = File(...)):
-    """
-    Extract only text from image/PDF (simplified response)
-    - **file**: Image or PDF file
-    Returns only extracted text without bounding boxes
-    """
-    result = await process_single_file(file)
-    if result["status"] == "error":
-        raise HTTPException(status_code=500, detail=result["error"])
-    # Simplify response
-    simplified = {
-        "filename": result["filename"],
-        "file_type": result["file_type"],
-        "pages": []
-    }
-    for page_result in result["results"]:
-        simplified["pages"].append({
-            "page": page_result["page"],
-            "text": " ".join(page_result["texts"]),
-            "word_count": len(page_result["texts"]),
-            "average_confidence": sum(page_result["scores"]) / len(page_result["scores"]) if page_result["scores"] else 0
-        })
-    return JSONResponse(content=simplified)
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

+import os
+import tempfile
+import asyncio
+from pathlib import Path
+from typing import List, Optional
+import gc
+from contextlib import asynccontextmanager
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from paddleocr import PaddleOCR
+from PIL import Image
+import io
+# PDF support
+try:
+    from pdf2image import convert_from_path
+    PDF_SUPPORT = True
+except ImportError:
+    PDF_SUPPORT = False
+# Process-wide PaddleOCR handle; stays None until get_ocr() builds it.
 ocr_instance = None
+# Upload extensions (lower-case, leading dot) accepted as images.
+SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
+# PDFs are accepted only when the pdf2image import above succeeded.
+SUPPORTED_FORMATS = (
+    SUPPORTED_IMAGE_FORMATS | {'.pdf'} if PDF_SUPPORT else set(SUPPORTED_IMAGE_FORMATS)
+)
+# Request limits
+MAX_FILE_SIZE = 10 * (1024 ** 2)  # bytes, i.e. 10MB per file
+MAX_FILES = 10  # uploads accepted by one batch request
+MAX_PDF_PAGES = 20  # pages allowed in a single PDF
 def get_ocr():
+    """Return the shared PaddleOCR instance, creating it on first use.
+
+    The engine is built once and cached in the module-level ``ocr_instance``
+    so that later calls reuse the same object.
+
+    ``show_log`` and ``use_gpu`` are PaddleOCR 2.x switches, while
+    ``text_recognition_model_name`` and the ``use_doc_*`` options belong to
+    the 3.x pipeline API, which rejects the legacy switches. The original
+    call is tried first, so behaviour is unchanged wherever it already works;
+    if the installed release refuses it, the engine is built with the 3.x
+    ``device`` option instead.
+
+    Returns:
+        The process-wide ``PaddleOCR`` object.
+    """
     global ocr_instance
     if ocr_instance is None:
+        options = dict(
+            lang="mr",
+            text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
+            use_doc_orientation_classify=False,
+            use_doc_unwarping=False,
+            use_textline_orientation=False,
+        )
+        try:
+            ocr_instance = PaddleOCR(
+                show_log=False,  # Reduce console noise
+                use_gpu=False,  # HuggingFace Spaces usually don't have GPU
+                **options,
+            )
+        except (TypeError, ValueError):
+            # Legacy switches refused: retry with the 3.x spelling. A genuine
+            # configuration error is raised again by this second call, so
+            # nothing is hidden.
+            ocr_instance = PaddleOCR(device="cpu", **options)
     return ocr_instance
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Lifespan hook: build the OCR engine before serving, tidy up afterwards."""
+    # --- before the application starts handling requests ---
+    print("🚀 Initializing PaddleOCR...")
+    get_ocr()  # fills the module-level singleton so the first request does not pay for it
+    print("✅ PaddleOCR ready!")
+
+    yield
+
+    # --- after the application has stopped ---
+    print("🧹 Cleaning up...")
+    gc.collect()
+# ASGI application; `lifespan` loads the OCR engine at startup.
+app = FastAPI(
+    title="Marathi OCR API",
+    description="PaddleOCR API for Marathi/Devanagari text recognition",
+    version="1.0.0",
+    lifespan=lifespan
+)
+# CORS middleware: the API is open to every origin, method and header.
+# Credentials stay disabled because the FastAPI CORS guide states that
+# wildcard origins/methods/headers cannot be combined with
+# allow_credentials=True, and no handler in this file reads cookies or
+# authentication headers.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+def validate_file(file: UploadFile) -> None:
+    """Check an upload's extension and size before it is processed.
+
+    Args:
+        file: The incoming upload. Its stream is rewound to the start before
+            returning, so the caller can still read the full content.
+
+    Raises:
+        HTTPException: status 400 when the upload has no filename, when its
+            extension is not in ``SUPPORTED_FORMATS``, or when it is larger
+            than ``MAX_FILE_SIZE``.
+    """
+    # An upload can arrive without a filename; reject it explicitly instead
+    # of letting Path(None) fail with a TypeError.
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="Missing filename")
+    # Check file extension
+    file_ext = Path(file.filename).suffix.lower()
+    if file_ext not in SUPPORTED_FORMATS:
         raise HTTPException(
+            status_code=400,
+            # sorted() keeps the message stable; set iteration order is not
+            detail=f"Unsupported file format. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
         )
+    # Check file size: seek to the end, read the offset, then rewind
+    stream = file.file
+    stream.seek(0, os.SEEK_END)
+    file_size = stream.tell()
+    stream.seek(0)
+    if file_size > MAX_FILE_SIZE:
         raise HTTPException(
             status_code=400,
+            detail=f"File too large. Maximum size: {MAX_FILE_SIZE // 1024 // 1024}MB"
         )
+def _read_ocr_lines(result):
+    """Split a PaddleOCR result into parallel ``(texts, scores)`` lists.
+
+    Handles the 2.x layout (``result[0]`` is a list of
+    ``[box, (text, score)]`` entries) and the 3.x layout (``result[0]`` is a
+    mapping holding ``rec_texts`` / ``rec_scores``). Returns two empty lists
+    when nothing was recognised.
+    """
+    if not result or not result[0]:
+        return [], []
+    page = result[0]
+    if hasattr(page, "keys") and "rec_texts" in page:
+        texts = list(page["rec_texts"])
+        scores = [float(score) for score in page.get("rec_scores", [])]
+    else:
+        texts = [line[1][0] for line in page]
+        scores = [float(line[1][1]) for line in page]
+    return texts, scores
+
+
+async def process_image_bytes(image_bytes: bytes, filename: str) -> dict:
+    """Run OCR on an encoded image and summarise the recognised text.
+
+    The bytes are written to a temporary file, which is removed again before
+    returning, because the OCR engine is handed a file path.
+
+    Args:
+        image_bytes: Encoded image content.
+        filename: Name echoed back in the result. Its extension is reused for
+            the temporary file when it is a supported image extension.
+
+    Returns:
+        On success, a dict with ``filename``, ``success`` (True),
+        ``text_count``, ``results`` (one ``{"text", "confidence"}`` entry per
+        recognised line) and ``full_text``; a ``message`` key is added when
+        no text was found. On failure, a dict with ``filename``, ``success``
+        (False) and ``error``. Errors are reported through that dict rather
+        than raised.
+
+    NOTE(review): recognition runs synchronously inside this coroutine, so
+    the event loop is busy for the duration of the OCR call.
+    """
+    temp_path = None
     try:
+        # PDF pages arrive with names such as "scan.pdf_page_1", whose
+        # "suffix" is not an image extension; those renders are JPEG-encoded,
+        # so fall back to ".jpg" for anything that is not a known image type.
+        suffix = Path(filename).suffix.lower()
+        if suffix not in SUPPORTED_IMAGE_FORMATS:
+            suffix = ".jpg"
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(image_bytes)
+            temp_path = tmp.name
+        # Run OCR
         ocr = get_ocr()
+        try:
+            result = ocr.ocr(temp_path, cls=False)
+        except TypeError:
+            # PaddleOCR 3.x forwards ocr() to predict(), which does not
+            # accept the 2.x `cls` keyword.
+            result = ocr.predict(temp_path)
+        # Extract results
+        texts, scores = _read_ocr_lines(result)
+        if not texts:
             return {
+                "filename": filename,
+                "success": True,
+                "text_count": 0,
+                "results": [],
+                "full_text": "",
+                "message": "No text detected"
             }
+        return {
+            "filename": filename,
+            "success": True,
+            "text_count": len(texts),
+            "results": [
+                {"text": text, "confidence": score}
+                for text, score in zip(texts, scores)
+            ],
+            "full_text": "\n".join(texts)
+        }
+    except Exception as e:
         return {
+            "filename": filename,
+            "success": False,
+            "error": str(e)
         }
+    finally:
+        # Best-effort removal of the temporary file; only filesystem errors
+        # are ignored, so unrelated failures are no longer swallowed.
+        if temp_path and os.path.exists(temp_path):
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
+        # Force garbage collection
+        gc.collect()
+async def process_pdf(file_bytes: bytes, filename: str) -> dict:
+    """Rasterise a PDF and run OCR on every page.
+
+    Args:
+        file_bytes: Raw PDF content.
+        filename: Original upload name; page results are labelled
+            ``"<filename>_page_<n>"``.
+
+    Returns:
+        On success, a dict with ``filename``, ``success`` (True),
+        ``total_pages``, ``pages`` (one ``process_image_bytes`` result per
+        page, each extended with ``page_number``) and ``combined_text``.
+        On an unexpected failure, a dict with ``filename``, ``success``
+        (False) and ``error``.
+
+    Raises:
+        HTTPException: status 400 when PDF support is unavailable or the
+            document has more than ``MAX_PDF_PAGES`` pages.
+    """
+    if not PDF_SUPPORT:
+        raise HTTPException(status_code=400, detail="PDF support not available")
+    temp_pdf_path = None
     try:
+        # Save PDF temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+            tmp.write(file_bytes)
+            temp_pdf_path = tmp.name
+        # Render at most one page beyond the limit: enough to detect an
+        # oversized document without rasterising all of it first.
+        images = convert_from_path(
+            temp_pdf_path, dpi=200, fmt='jpeg', last_page=MAX_PDF_PAGES + 1
+        )
+        if len(images) > MAX_PDF_PAGES:
+            raise HTTPException(
+                status_code=400,
+                detail=f"PDF has too many pages. Maximum: {MAX_PDF_PAGES}"
+            )
+        # Process each page
+        all_results = []
+        for page_num, image in enumerate(images, 1):
+            # Convert PIL Image to bytes
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format='JPEG')
+            # Process page
+            result = await process_image_bytes(
+                img_byte_arr.getvalue(),
+                f"{filename}_page_{page_num}"
+            )
+            result["page_number"] = page_num
+            all_results.append(result)
+            # Drop both references to the rendered page (the list slot and
+            # the loop variable) so it can be reclaimed before the next page.
+            images[page_num - 1] = None
+            del image
             gc.collect()
         return {
+            "filename": filename,
+            "success": True,
+            "total_pages": len(images),
+            "pages": all_results,
+            "combined_text": "\n\n".join([
+                f"=== Page {r['page_number']} ===\n{r.get('full_text', '')}"
+                for r in all_results
+            ])
         }
+    except HTTPException:
+        # Deliberate HTTP errors (such as the page limit) must reach the
+        # client as such instead of being folded into a "success": False dict.
+        raise
     except Exception as e:
         return {
+            "filename": filename,
+            "success": False,
             "error": str(e)
         }
+    finally:
+        # Best-effort removal of the temporary PDF; only filesystem errors
+        # are ignored.
+        if temp_pdf_path and os.path.exists(temp_pdf_path):
+            try:
+                os.unlink(temp_pdf_path)
+            except OSError:
+                pass
+        gc.collect()
 @app.get("/")
 async def root():
+    """API information"""
+    # Static description of the service: limits, accepted formats, routes.
+    size_limit_mb = MAX_FILE_SIZE // 1024 // 1024
+    routes = {
+        "single_file": "/ocr",
+        "multiple_files": "/ocr/batch",
+        "health": "/health"
+    }
+    info = {
+        "name": "Marathi OCR API",
+        "status": "running",
+        "supported_formats": list(SUPPORTED_FORMATS),
+        "pdf_support": PDF_SUPPORT,
+        "max_file_size_mb": size_limit_mb,
+        "max_files_per_request": MAX_FILES,
+        "endpoints": routes
+    }
+    return info
 @app.get("/health")
 async def health():
+    """Health check"""
+    # The engine counts as loaded once get_ocr() has filled the singleton.
+    engine_loaded = ocr_instance is not None
+    return {"status": "healthy", "ocr_loaded": engine_loaded}
+@app.post("/ocr")
 async def ocr_single_file(file: UploadFile = File(...)):
     """
+    Process a single image or PDF file
     - **file**: Image (JPG, PNG, etc.) or PDF file
     """
+    # Reject unsupported or oversized uploads before reading them.
+    validate_file(file)
+    payload = await file.read()
+    # PDFs go through the page-by-page pipeline, everything else is one image.
+    is_pdf = Path(file.filename).suffix.lower() == '.pdf'
+    handler = process_pdf if is_pdf else process_image_bytes
+    result = await handler(payload, file.filename)
+    # Release the upload buffer before responding.
+    del payload
+    gc.collect()
     return JSONResponse(content=result)
+# Batch endpoint: files are handled one after another, and a failure in one
+# file is recorded in its own result entry instead of aborting the request.
+@app.post("/ocr/batch")
 async def ocr_batch_files(files: List[UploadFile] = File(...)):
     """
+    Process multiple image/PDF files
     - **files**: List of image or PDF files (max 10)
     """
+    # Only the file count is rejected for the request as a whole.
+    if len(files) > MAX_FILES:
         raise HTTPException(
             status_code=400,
+            detail=f"Too many files. Maximum: {MAX_FILES}"
         )
     results = []
     for file in files:
+        try:
+            validate_file(file)
+            file_bytes = await file.read()
+            file_ext = Path(file.filename).suffix.lower()
+            # Process based on file type
+            if file_ext == '.pdf':
+                result = await process_pdf(file_bytes, file.filename)
+            else:
+                result = await process_image_bytes(file_bytes, file.filename)
+            results.append(result)
+            # Clean up after each file
+            del file_bytes
+            gc.collect()
+        # Validation and processing errors raised as HTTPException become a
+        # per-file failure entry carrying the exception's detail text.
+        except HTTPException as he:
+            results.append({
+                "filename": file.filename,
+                "success": False,
+                "error": he.detail
+            })
+        # Any other error is reported the same way, using its string form.
+        except Exception as e:
+            results.append({
+                "filename": file.filename,
+                "success": False,
+                "error": str(e)
+            })
+    # NOTE(review): the diff view skips lines between "total_files" and the
+    # closing brace (hunk boundary), so the rest of the payload is not visible
+    # here; confirm the remaining keys against the full file.
     return JSONResponse(content={
         "total_files": len(files),
     })
+# Script entry point: serve the app with uvicorn on all interfaces, port 7860.
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)