Spaces:

sarveshpatel
/

ocr

Sleeping

App Files Community

sarveshpatel committed on Nov 27, 2025

Commit

946e1b3

verified ·

1 Parent(s): f7beca4

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -223

app.py CHANGED Viewed

@@ -1,129 +1,91 @@
 import os
 import uuid
-import gc
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
-from typing import List, Optional
 import fitz
 from PIL import Image
-import io
 # -------------------------------------------------------------------
 # FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
 # -------------------------------------------------------------------
 os.environ["PADDLE_HOME"] = "/app/paddle_home"
 os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"
-os.environ["OMP_NUM_THREADS"] = "2"  # Match CPU count
-os.environ["MKL_NUM_THREADS"] = "2"
 os.makedirs("/app/paddle_home", exist_ok=True)
 os.makedirs("/app/xdg_cache", exist_ok=True)
 from paddleocr import PaddleOCR
 # -------------------------------------------------------------------
 # CONFIGURATION
 # -------------------------------------------------------------------
-MAX_IMAGE_DIMENSION = 1024  # Max width or height
-OPTIMAL_DPI = 150  # Lower DPI for faster PDF rendering (was 220)
-JPEG_QUALITY = 85  # Good balance of quality and size
-UPLOAD_DIR = "/app/uploads"
-PDF_IMAGES_DIR = "/app/pdf_images"
-os.makedirs(UPLOAD_DIR, exist_ok=True)
-os.makedirs(PDF_IMAGES_DIR, exist_ok=True)
 # -------------------------------------------------------------------
-# IMAGE OPTIMIZATION UTILITIES
 # -------------------------------------------------------------------
-def optimize_image(image_path: str, output_path: Optional[str] = None) -> str:
-    """
-    Resize and optimize image for faster OCR processing.
-    - Resizes to max dimension of MAX_IMAGE_DIMENSION while maintaining aspect ratio
-    - Converts to RGB (removes alpha channel if present)
-    - Saves as optimized JPEG
-    """
-    if output_path is None:
-        output_path = image_path
-    with Image.open(image_path) as img:
-        # Convert to RGB if necessary (handles PNG with alpha, etc.)
         if img.mode in ('RGBA', 'LA', 'P'):
             img = img.convert('RGB')
         elif img.mode != 'RGB':
             img = img.convert('RGB')
-        # Get current dimensions
         width, height = img.size
-        # Only resize if larger than max dimension
-        if width > MAX_IMAGE_DIMENSION or height > MAX_IMAGE_DIMENSION:
-            # Calculate new dimensions maintaining aspect ratio
             if width > height:
-                new_width = MAX_IMAGE_DIMENSION
-                new_height = int(height * (MAX_IMAGE_DIMENSION / width))
             else:
-                new_height = MAX_IMAGE_DIMENSION
-                new_width = int(width * (MAX_IMAGE_DIMENSION / height))
-            # Use LANCZOS for high-quality downscaling
             img = img.resize((new_width, new_height), Image.LANCZOS)
-        # Save optimized image
-        img.save(output_path, 'JPEG', quality=JPEG_QUALITY, optimize=True)
     return output_path
-def cleanup_file(file_path: str) -> None:
-    """Safely remove a file if it exists."""
-    try:
-        if file_path and os.path.exists(file_path):
-            os.remove(file_path)
-    except Exception:
-        pass
-def cleanup_files(file_paths: List[str]) -> None:
-    """Remove multiple files."""
-    for fp in file_paths:
-        cleanup_file(fp)
 # -------------------------------------------------------------------
-# PDF → OPTIMIZED IMAGES
 # -------------------------------------------------------------------
-def pdf_to_images(pdf_path: str, max_pages: Optional[int] = 3) -> List[str]:
-    """
-    Convert PDF pages to optimized images.
-    Uses lower DPI and resizes for faster OCR.
-    """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(pdf_path)
     doc = fitz.open(pdf_path)
     page_count = len(doc)
     limit = page_count if max_pages is None else min(max_pages, page_count)
     output_paths: List[str] = []
     for i in range(limit):
         page = doc.load_page(i)
-        # Use lower DPI for faster rendering
-        pix = page.get_pixmap(dpi=OPTIMAL_DPI)
-        # Generate unique filename
         img_name = f"{uuid.uuid4()}.jpg"
-        img_path = os.path.join(PDF_IMAGES_DIR, img_name)
-        # Save initial image
-        pix.save(img_path)
-        # Free pixmap memory immediately
-        pix = None
-        # Optimize the saved image (resize if needed)
-        optimize_image(img_path, img_path)
         output_paths.append(img_path)
@@ -132,184 +94,103 @@ def pdf_to_images(pdf_path: str, max_pages: Optional[int] = 3) -> List[str]:
 # -------------------------------------------------------------------
-# OCR ENGINE - Singleton with optimized settings
 # -------------------------------------------------------------------
-class OCREngine:
-    """Singleton OCR engine to avoid re-initialization."""
-    _instance = None
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = PaddleOCR(
-                lang="mr",
-                text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
-                use_doc_orientation_classify=False,
-                use_doc_unwarping=False,
-                use_textline_orientation=False,
-                show_log=False,  # Reduce logging overhead
-            )
-        return cls._instance
-def extract_text(image_path: str) -> List[dict]:
-    """
-    Extract text from an optimized image.
-    Returns list of {text, confidence} dicts.
-    """
-    ocr = OCREngine.get_instance()
-    result = ocr.predict(input=image_path)
     output = []
     for block in result:
-        texts = block.get("rec_texts", [])
-        scores = block.get("rec_scores", [])
         for t, s in zip(texts, scores):
-            if t.strip():  # Skip empty text
-                output.append({
-                    "text": t,
-                    "confidence": round(float(s), 4)
-                })
     return output
-def process_single_image(image_path: str, is_temp: bool = False) -> List[dict]:
-    """
-    Process a single image: optimize, OCR, cleanup.
-    """
-    optimized_path = None
-    try:
-        # Create optimized version
-        optimized_name = f"opt_{uuid.uuid4()}.jpg"
-        optimized_path = os.path.join(UPLOAD_DIR, optimized_name)
-        optimize_image(image_path, optimized_path)
-        # Run OCR on optimized image
-        results = extract_text(optimized_path)
-        return results
-    finally:
-        # Cleanup optimized image
-        cleanup_file(optimized_path)
-        # Force garbage collection after each image
-        gc.collect()
 # -------------------------------------------------------------------
-# FASTAPI APPLICATION
 # -------------------------------------------------------------------
-app = FastAPI(title="Optimized Marathi OCR API")
 @app.post("/ocr")
-async def ocr_endpoint(
-    files: List[UploadFile] = File(...),
-    max_pages: Optional[int] = 3
-):
-    """
-    OCR endpoint supporting PDF and image files.
-    - Maximum 15 files per request
-    - PDFs: processes up to max_pages (default 3)
-    - Images: jpg, jpeg, png supported
-    """
     if len(files) > 15:
         raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")
     structured_output = {"files": []}
-    temp_files_to_cleanup = []
-    try:
-        for index, file in enumerate(files, start=1):
-            filename = file.filename.lower() if file.filename else f"unknown_{index}"
-            ext = filename.rsplit(".", 1)[-1] if "." in filename else ""
-            # Save uploaded file
-            temp_name = f"{uuid.uuid4()}.{ext}"
-            temp_path = os.path.join(UPLOAD_DIR, temp_name)
-            temp_files_to_cleanup.append(temp_path)
-            content = await file.read()
-            with open(temp_path, "wb") as f:
-                f.write(content)
-            # Free memory from upload content
-            del content
-            file_record = {
-                "file_id": f"file_{index}",
-                "filename": filename,
-                "pages": []
-            }
-            # -------------------------------
-            # PDF PROCESSING
-            # -------------------------------
-            if filename.endswith(".pdf"):
-                img_paths = []
-                try:
-                    img_paths = pdf_to_images(temp_path, max_pages=max_pages)
-                    for page_idx, img_path in enumerate(img_paths):
-                        ocr_results = extract_text(img_path)
-                        file_record["pages"].append({
-                            "page_index": page_idx,
-                            "results": ocr_results
-                        })
-                        # Cleanup each page image immediately after processing
-                        cleanup_file(img_path)
-                        gc.collect()
-                finally:
-                    # Ensure all PDF images are cleaned up
-                    cleanup_files(img_paths)
-            # -------------------------------
-            # IMAGE PROCESSING
-            # -------------------------------
-            elif filename.endswith((".jpg", ".jpeg", ".png", ".webp", ".bmp")):
-                ocr_results = process_single_image(temp_path)
                 file_record["pages"].append({
-                    "page_index": 0,
                     "results": ocr_results
                 })
-            else:
-                raise HTTPException(
-                    status_code=400,
-                    detail=f"Unsupported file type: {filename}. Supported: pdf, jpg, jpeg, png, webp, bmp"
-                )
-            structured_output["files"].append(file_record)
-            # Cleanup temp file after processing
-            cleanup_file(temp_path)
-            gc.collect()
-    finally:
-        # Final cleanup of any remaining temp files
-        cleanup_files(temp_files_to_cleanup)
-        gc.collect()
-    return JSONResponse(structured_output)
-@app.get("/health")
-async def health_check():
-    """Health check endpoint."""
-    return {"status": "healthy", "max_dimension": MAX_IMAGE_DIMENSION}
-@app.on_event("startup")
-async def startup_event():
-    """Pre-initialize OCR engine on startup."""
-    # Warm up the OCR engine
-    OCREngine.get_instance()
-    gc.collect()

 import os
 import uuid
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
+from typing import List
 import fitz
 from PIL import Image
 # -------------------------------------------------------------------
 # FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
 # -------------------------------------------------------------------
 os.environ["PADDLE_HOME"] = "/app/paddle_home"
 os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"
 os.makedirs("/app/paddle_home", exist_ok=True)
 os.makedirs("/app/xdg_cache", exist_ok=True)
+# now safe to import paddlex/paddleocr
 from paddleocr import PaddleOCR
 # -------------------------------------------------------------------
 # CONFIGURATION
 # -------------------------------------------------------------------
+MAX_DIMENSION = 1024  # Max width or height for OCR processing
+PDF_DPI = 150  # Lower DPI = faster (was 220)
 # -------------------------------------------------------------------
+# IMAGE OPTIMIZATION
 # -------------------------------------------------------------------
+def optimize_image_for_ocr(input_path: str, output_path: str) -> str:
+    """Resize image if too large, keeping aspect ratio."""
+    with Image.open(input_path) as img:
+        # Convert to RGB if needed
         if img.mode in ('RGBA', 'LA', 'P'):
             img = img.convert('RGB')
         elif img.mode != 'RGB':
             img = img.convert('RGB')
         width, height = img.size
+        # Only resize if larger than MAX_DIMENSION
+        if width > MAX_DIMENSION or height > MAX_DIMENSION:
             if width > height:
+                new_width = MAX_DIMENSION
+                new_height = int(height * (MAX_DIMENSION / width))
             else:
+                new_height = MAX_DIMENSION
+                new_width = int(width * (MAX_DIMENSION / height))
             img = img.resize((new_width, new_height), Image.LANCZOS)
+        img.save(output_path, 'JPEG', quality=85)
     return output_path
 # -------------------------------------------------------------------
+# PDF → IMAGE (optimized)
 # -------------------------------------------------------------------
+def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(pdf_path)
     doc = fitz.open(pdf_path)
     page_count = len(doc)
     limit = page_count if max_pages is None else min(max_pages, page_count)
     output_paths: List[str] = []
+    out_dir = "/app/pdf_images"
+    os.makedirs(out_dir, exist_ok=True)
     for i in range(limit):
         page = doc.load_page(i)
+        pix = page.get_pixmap(dpi=PDF_DPI)  # Lower DPI for speed
         img_name = f"{uuid.uuid4()}.jpg"
+        img_path = os.path.join(out_dir, img_name)
+        # Save initial
+        temp_path = img_path + ".tmp.jpg"
+        pix.save(temp_path)
+        # Optimize (resize if needed)
+        optimize_image_for_ocr(temp_path, img_path)
+        # Cleanup temp
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
         output_paths.append(img_path)
 # -------------------------------------------------------------------
+# OCR ENGINE
 # -------------------------------------------------------------------
+ocr_engine = PaddleOCR(
+    lang="mr",
+    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
+    use_doc_orientation_classify=False,
+    use_doc_unwarping=False,
+    use_textline_orientation=False
+)
+def extract_text(image_path: str):
+    result = ocr_engine.predict(input=image_path)
     output = []
     for block in result:
+        texts = block["rec_texts"]
+        scores = block["rec_scores"]
         for t, s in zip(texts, scores):
+            output.append({"text": t, "confidence": float(s)})
     return output
 # -------------------------------------------------------------------
+# FASTAPI
 # -------------------------------------------------------------------
+app = FastAPI()
+UPLOAD_DIR = "/app/uploads"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
 @app.post("/ocr")
+async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
     if len(files) > 15:
         raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")
     structured_output = {"files": []}
+    for index, file in enumerate(files, start=1):
+        filename = file.filename.lower()
+        ext = filename.split(".")[-1]
+        temp_name = f"{uuid.uuid4()}.{ext}"
+        temp_path = os.path.join(UPLOAD_DIR, temp_name)
+        with open(temp_path, "wb") as f:
+            f.write(await file.read())
+        file_record = {
+            "file_id": f"file_{index}",
+            "filename": filename,
+            "pages": []
+        }
+        # -------------------------------
+        # PDF
+        # -------------------------------
+        if filename.endswith(".pdf"):
+            img_paths = pdf_to_images(temp_path, max_pages=max_pages)
+            for page_idx, img_path in enumerate(img_paths):
+                ocr_results = extract_text(img_path)
                 file_record["pages"].append({
+                    "page_index": page_idx,
                     "results": ocr_results
                 })
+                # Cleanup processed image
+                if os.path.exists(img_path):
+                    os.remove(img_path)
+        # -------------------------------
+        # IMAGE
+        # -------------------------------
+        elif filename.endswith((".jpg", ".jpeg", ".png")):
+            # Optimize image before OCR
+            optimized_path = os.path.join(UPLOAD_DIR, f"opt_{uuid.uuid4()}.jpg")
+            optimize_image_for_ocr(temp_path, optimized_path)
+            ocr_results = extract_text(optimized_path)
+            file_record["pages"].append({
+                "page_index": 0,
+                "results": ocr_results
+            })
+            # Cleanup optimized image
+            if os.path.exists(optimized_path):
+                os.remove(optimized_path)
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
+        # Cleanup uploaded file
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+        structured_output["files"].append(file_record)
+    return JSONResponse(structured_output)