Spaces:

triflix
/

sortitout

Sleeping

App Files Files Community

triflix committed on Nov 20, 2025

Commit

76565ed

verified ·

1 Parent(s): 283bdfb

Update app.py

Browse files

Files changed (1) hide show

app.py +297 -55

app.py CHANGED Viewed

@@ -1,14 +1,75 @@
 import os
-import uuid
 import shutil
 import tempfile
-from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import JSONResponse
 from paddleocr import PaddleOCR
-from pdf2image import convert_from_bytes
-# OCR instance
-ocr_engine = PaddleOCR(
     lang="mr",
     text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
     use_doc_orientation_classify=False,
@@ -16,64 +77,245 @@ ocr_engine = PaddleOCR(
     use_textline_orientation=False
 )
-app = FastAPI()
-def run_ocr_on_image(image_path: str):
-    result = ocr_engine.predict(input=image_path)
-    collected = []
-    for block in result:
-        texts = block.get("rec_texts", [])
-        scores = block.get("rec_scores", [])
-        pairs = [{"text": t, "score": float(s)} for t, s in zip(texts, scores)]
-        collected.append(pairs)
-    return collected
-@app.post("/ocr")
-async def ocr_endpoint(files: list[UploadFile] = File(...)):
-    session_dir = tempfile.mkdtemp(prefix="ocr_")
-    response_data = {}
     try:
-        for file in files:
-            original_name = file.filename
-            file_ext = original_name.lower()
-            saved_path = os.path.join(session_dir, f"{uuid.uuid4()}_{original_name}")
-            with open(saved_path, "wb") as tmp:
-                tmp.write(await file.read())
-            # ------ PDF ------
-            if file_ext.endswith(".pdf"):
-                pdf_bytes = open(saved_path, "rb").read()
-                pages = convert_from_bytes(pdf_bytes)
-                page_results = []
-                for idx, page in enumerate(pages):
-                    img_path = os.path.join(session_dir, f"{uuid.uuid4()}_page{idx}.jpg")
-                    page.save(img_path, "JPEG")
-                    page_results.append({
-                        "page": idx,
-                        "ocr": run_ocr_on_image(img_path)
-                    })
-                response_data[original_name] = {
-                    "type": "pdf",
-                    "pages": page_results
-                }
-            # ------ Images ------
             else:
-                image_result = run_ocr_on_image(saved_path)
-                response_data[original_name] = {
-                    "type": "image",
-                    "ocr": image_result
-                }
-    finally:
-        shutil.rmtree(session_dir, ignore_errors=True)
-    return JSONResponse(response_data)

+# app.py
+"""
+Single-file FastAPI app for HuggingFace Space (CPU) supporting:
+- Batch upload of images and PDFs (in any combination), capped at total_limit processed pages/images per request (default DEFAULT_TOTAL_LIMIT).
+- PDF -> images conversion (PyMuPDF) with per-pdf page limit.
+- Parallel image OCR (ThreadPoolExecutor) with safe concurrency defaults.
+- Detailed per-file results, per-page breakdown, and per-item error reporting.
+- Secure defaults: file type & size validation, temp-directory isolation, cleanup, non-root user compatibility.
+Usage (example):
+  POST /ocr?per_pdf_pages=3&total_limit=15
+  multipart/form-data body: repeat the "files" field once per uploaded file
+Produces JSON:
+{
+  "summary": { "processed_files": 3, "total_pages_images": 6 },
+  "files": [
+    {
+      "filename": "CVC.jpg",
+      "type": "image",
+      "page": null,
+      "results": [{"text":"...","confidence":0.99}, ...],
+      "error": null
+    },
+    {
+      "filename": "doc.pdf",
+      "type": "pdf",
+      "page": 1,
+      "results": [...],
+      "error": null
+    }
+  ]
+}
+"""
+from __future__ import annotations
 import os
 import shutil
 import tempfile
+import uuid
+import math
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Optional, Dict, Any, Tuple
+from fastapi import FastAPI, UploadFile, File, HTTPException, Query
 from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
+from pathlib import Path
+# OCR backend imports (local)
+# PaddleOCR heavy initialization occurs once at startup
 from paddleocr import PaddleOCR
+import fitz  # PyMuPDF
+from PIL import Image
+# --- Configuration and secure defaults ---
+# Accepted file suffixes: lower-case, including the leading dot (compared against safe_extension()).
+ALLOWED_IMAGE_EXT = {".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".webp"}
+ALLOWED_DOC_EXT = {".pdf"}
+ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXT.union(ALLOWED_DOC_EXT)
+# Defaults and upper bound for the /ocr query parameters per_pdf_pages and total_limit.
+DEFAULT_PER_PDF_PAGES = 3
+DEFAULT_TOTAL_LIMIT = 15  # max total pages/images processed per request
+MAX_PER_PDF_PAGES = 10
+MAX_FILE_SIZE_BYTES = 25 * 1024 * 1024  # 25 MB per uploaded file
+OCR_DPI = 220  # dpi used when converting PDF pages to images
+# Never more than 4 OCR threads; falls back to 2 when os.cpu_count() returns None.
+MAX_WORKERS = min(4, (os.cpu_count() or 2))  # conservative concurrency
+# Logging
+# NOTE: basicConfig() runs at import time, so importing this module configures the root logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("ocr_service")
+# --- Initialize PaddleOCR once (reuse across requests) ---
+# Language and model consistent with user's request (Marathi / Devanagari mobile recognizer).
+# The engine is built at import time and shared by every request.
+# NOTE(review): ocr_image_path() calls this one instance from several ThreadPoolExecutor
+# workers at once — confirm that PaddleOCR.predict is safe to call concurrently.
+OCR_ENGINE = PaddleOCR(
     lang="mr",
     text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
     use_doc_orientation_classify=False,
     use_textline_orientation=False
 )
+# --- Response Schemas ---
+# One recognized text line together with its recognition confidence.
+class OCRText(BaseModel):
+    text: str = Field(..., description="Recognized text line")
+    # Validation rejects values outside [0.0, 1.0]; ocr_image_path() clips scores before building this model.
+    confidence: float = Field(..., ge=0.0, le=1.0)
+# OCR outcome for one processed unit: a standalone image or a single PDF page.
+# A multi-page PDF therefore yields several FileResult entries sharing one filename.
+class FileResult(BaseModel):
+    filename: str
+    type: str  # "image" or "pdf"
+    page: Optional[int] = None  # for pdf pages; null for images
+    results: List[OCRText] = Field(default_factory=list)
+    # Error message when conversion or OCR failed for this unit; None on success.
+    error: Optional[str] = None
+# Top-level response body of POST /ocr.
+class OCROutput(BaseModel):
+    # Request-level counters and the effective limits (keys are set in ocr_batch_endpoint).
+    summary: Dict[str, Any]
+    files: List[FileResult]
+# --- Utility functions ---
+def safe_extension(filename: Optional[str]) -> str:
+    """
+    Return the lower-cased final suffix of filename, including the leading dot.
+    Returns "" when the name has no suffix, is empty, or is None
+    (an upload's filename is optional, so None must not crash here).
+    """
+    if not filename:
+        return ""
+    return Path(filename).suffix.lower()
+def validate_extension(filename: str) -> None:
+    """
+    Reject uploads whose file extension is not in ALLOWED_EXTENSIONS.
+    Raises HTTPException(400) when the filename is missing/empty or its
+    lower-cased suffix is not an allowed image or document extension.
+    """
+    if not filename:
+        # A missing filename would otherwise surface as a TypeError (HTTP 500) inside Path().
+        raise HTTPException(status_code=400, detail="Uploaded file has no filename")
+    ext = safe_extension(filename)
+    if ext not in ALLOWED_EXTENSIONS:
+        raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
+def save_upload_to_temp(upload: UploadFile, dest_dir: str) -> str:
+    """
+    Stream an UploadFile into a uniquely named file inside dest_dir.
+    The upload is copied in 64 KiB chunks so a large file is never held in memory
+    at once, and the copy is abandoned as soon as more than MAX_FILE_SIZE_BYTES
+    have been read.
+    Returns the full path of the saved file.
+    Raises HTTPException(413) when the upload exceeds MAX_FILE_SIZE_BYTES.
+    Whenever the copy does not complete, the partially written file is removed.
+    """
+    # filename is optional on an upload; fall back to "" so the suffix lookup cannot fail.
+    ext = safe_extension(upload.filename or "")
+    tmp_path = os.path.join(dest_dir, f"{uuid.uuid4()}{ext}")
+    total = 0
+    try:
+        with open(tmp_path, "wb") as out_f:
+            # iter(callable, sentinel) stops at the empty read that marks end of stream.
+            for chunk in iter(lambda: upload.file.read(1024 * 64), b""):
+                total += len(chunk)
+                if total > MAX_FILE_SIZE_BYTES:
+                    raise HTTPException(status_code=413, detail=f"File too large: {upload.filename}")
+                out_f.write(chunk)
+    except Exception:
+        # The with-block has already closed the handle, so the partial file can be
+        # deleted safely; this also covers read/write errors, not only the size limit.
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+        raise
+    return tmp_path
+def estimate_pdf_pages(pdf_path: str) -> int:
+    """
+    Return the number of pages in the PDF at pdf_path without rendering any page.
+    Errors raised by PyMuPDF while opening the file propagate to the caller.
+    """
+    # The context manager closes the document even if counting pages raises.
+    with fitz.open(pdf_path) as doc:
+        return len(doc)
+def convert_pdf_to_images(pdf_path: str, dest_dir: str, pages_to_convert: int) -> List[Tuple[str, int]]:
+    """
+    Render the first pages_to_convert pages of a PDF to image files in dest_dir.
+    Pages are rendered at OCR_DPI and saved under random ".jpg" names.
+    If the PDF has fewer pages than requested, all of its pages are rendered.
+    Returns a list of tuples: (image_path, page_number), page_number being 1-based.
+    Errors raised while opening, rendering or saving propagate to the caller.
+    """
+    images: List[Tuple[str, int]] = []
+    # The caller catches conversion errors and keeps going, so the document must be
+    # closed on the failure path too; the context manager guarantees that.
+    with fitz.open(pdf_path) as doc:
+        limit = min(len(doc), pages_to_convert)
+        for i in range(limit):
+            pix = doc.load_page(i).get_pixmap(dpi=OCR_DPI)
+            img_path = os.path.join(dest_dir, f"{uuid.uuid4()}.jpg")
+            pix.save(img_path)
+            images.append((img_path, i + 1))  # page index 1-based
+    return images
+def _normalize_confidence(score: Any) -> float:
+    """Coerce a raw recognition score to a float in [0.0, 1.0]; unusable scores become 0.0."""
+    try:
+        conf = float(score)
+    except (TypeError, ValueError, OverflowError):
+        return 0.0
+    # NaN compares false against everything, so max(0.0, min(1.0, nan)) would report 1.0.
+    if math.isnan(conf):
+        return 0.0
+    return max(0.0, min(1.0, conf))
+def ocr_image_path(image_path: str) -> List[OCRText]:
+    """
+    Run PaddleOCR on a single image path and return the recognized lines as OCRText.
+    Every entry of the engine result is read through its 'rec_texts' and 'rec_scores'
+    keys; the two sequences are paired positionally and flattened into one list.
+    Raises RuntimeError (chained to the original exception) when the engine call fails.
+    """
+    # Use .predict(input=...) as in the user's examples.
+    try:
+        res = OCR_ENGINE.predict(input=image_path)
+    except Exception as e:
+        logger.exception("PaddleOCR failed on %s", image_path)
+        raise RuntimeError(f"OCR engine failure: {str(e)}") from e
+    aggregated: List[OCRText] = []
+    # res expected to be a list of blocks/dicts with keys 'rec_texts' and 'rec_scores'
+    for block in res:
+        rec_texts = block.get("rec_texts") or []
+        rec_scores = block.get("rec_scores") or []
+        for t, s in zip(rec_texts, rec_scores):
+            aggregated.append(OCRText(text=str(t), confidence=_normalize_confidence(s)))
+    return aggregated
+# --- FastAPI app and endpoint ---
+app = FastAPI(title="Batch PaddleOCR API (PDF+Image)", version="1.0")
+@app.post("/ocr", response_model=OCROutput)
+async def ocr_batch_endpoint(
+    files: List[UploadFile] = File(..., description="Upload up to 'total_limit' images/pages across files."),
+    per_pdf_pages: int = Query(DEFAULT_PER_PDF_PAGES, ge=1, le=MAX_PER_PDF_PAGES, description="Max pages to convert per PDF"),
+    total_limit: int = Query(DEFAULT_TOTAL_LIMIT, ge=1, le=50, description="Maximum total pages/images processed in request"),
+):
+    """
+    Accepts multiple files (images and PDFs). Converts PDFs -> images (first per_pdf_pages pages)
+    and runs OCR on each image. Ensures total converted pages/images <= total_limit.
+    Returns per-file per-page OCR results and summary.
+    Raises HTTPException:
+      400 - no files, unsupported extension, unreadable PDF, or nothing to process.
+      413 - an upload exceeds MAX_FILE_SIZE_BYTES, or the request needs more than total_limit pages/images.
+    Failures of a single PDF conversion or a single OCR call do not abort the request;
+    they are reported through the 'error' field of the affected FileResult.
+    """
+    # NOTE(review): this is an 'async def' handler, yet nothing below is awaited — file
+    # copies, PDF rendering and the wait on the thread pool all run synchronously, which
+    # looks like it blocks the event loop for the whole request. Confirm this is intended.
+    if len(files) == 0:
+        raise HTTPException(status_code=400, detail="No files uploaded")
+    # Save uploaded files to request-scoped temporary directory; ensures cleanup
+    request_tmpdir = tempfile.mkdtemp(prefix="ocrreq_")
+    saved_files: List[Tuple[str, str]] = []  # (original_filename, saved_path)
     try:
+        # 1) Validate and save uploads
+        # Any invalid or oversized upload aborts the whole request (all-or-nothing).
+        for up in files:
+            validate_extension(up.filename)
+            saved_path = save_upload_to_temp(up, request_tmpdir)
+            saved_files.append((up.filename, saved_path))
+        # 2) Pre-scan PDFs to count required pages and enforce total_limit
+        # Counting happens before any rendering so an over-limit request is rejected cheaply.
+        total_pages_images = 0
+        pdfs_to_convert: List[Tuple[str, str, int]] = []  # (orig_name, saved_path, pages_to_convert)
+        image_files: List[Tuple[str, str]] = []  # (orig_name, saved_path)
+        for orig_name, path in saved_files:
+            ext = safe_extension(orig_name)
+            if ext in ALLOWED_IMAGE_EXT:
+                total_pages_images += 1
+                image_files.append((orig_name, path))
+            elif ext == ".pdf":
+                try:
+                    pages = estimate_pdf_pages(path)
+                except Exception as e:
+                    raise HTTPException(status_code=400, detail=f"Unable to read PDF {orig_name}: {str(e)}")
+                # Only the first per_pdf_pages pages of each PDF count toward the total.
+                pages_to_convert = min(pages, per_pdf_pages)
+                pdfs_to_convert.append((orig_name, path, pages_to_convert))
+                total_pages_images += pages_to_convert
             else:
+                # Shouldn't reach due to earlier validation
+                raise HTTPException(status_code=400, detail=f"Unsupported extension for {orig_name}")
+        # Reached when every upload is a PDF with zero pages.
+        if total_pages_images == 0:
+            raise HTTPException(status_code=400, detail="No valid images/pages to process")
+        if total_pages_images > total_limit:
+            raise HTTPException(
+                status_code=413,
+                detail=f"Request would process {total_pages_images} pages/images which exceeds total_limit {total_limit}"
+            )
+        # 3) Convert PDFs to images (store list of (filename,page,image_path))
+        converted_images: List[Tuple[str, Optional[int], str]] = []  # (orig_filename, page_or_None, image_path)
+        for orig_name, pdf_path, pages_to_convert in pdfs_to_convert:
+            try:
+                imgs = convert_pdf_to_images(pdf_path, request_tmpdir, pages_to_convert)
+            except Exception as e:
+                # if conversion fails for a file, record as zero and continue
+                # The failure travels through the image_path slot as a sentinel string
+                # ("__error__conversion__:<message>") that step 4 recognizes and unpacks.
+                logger.exception("PDF conversion failed for %s", orig_name)
+                converted_images.append((orig_name, None, f"__error__conversion__:{str(e)}"))
+                continue
+            for img_path, page_num in imgs:
+                converted_images.append((orig_name, page_num, img_path))
+        # include standalone image files
+        # page is None for standalone images; step 4 uses that to label the entry "image".
+        for orig_name, img_path in image_files:
+            converted_images.append((orig_name, None, img_path))
+        # 4) OCR all images - use ThreadPoolExecutor for parallelism within safe workers
+        results_per_file: List[FileResult] = []
+        futures = {}
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
+            for orig_name, page_num, img_path in converted_images:
+                if isinstance(img_path, str) and img_path.startswith("__error__conversion__"):
+                    # embed conversion error immediately
+                    err_msg = img_path.split(":", 1)[1] if ":" in img_path else "Conversion error"
+                    fr = FileResult(filename=orig_name, type="pdf", page=page_num, results=[], error=err_msg)
+                    results_per_file.append(fr)
+                    continue
+                futures[ex.submit(ocr_image_path, img_path)] = (orig_name, page_num, img_path)
+            # NOTE(review): as_completed() yields futures in completion order, so the
+            # entries in 'files' are not in upload/page order (conversion errors come
+            # first). Clients must order by (filename, page) themselves.
+            for fut in as_completed(list(futures.keys())):
+                orig_name, page_num, img_path = futures[fut]
+                try:
+                    ocr_texts = fut.result()
+                    fr = FileResult(
+                        filename=orig_name,
+                        type=("pdf" if page_num is not None else "image"),
+                        page=page_num,
+                        results=ocr_texts,
+                        error=None,
+                    )
+                except Exception as e:
+                    # One failed page/image is reported in place; the others still complete.
+                    logger.exception("OCR failed for %s (page=%s): %s", orig_name, page_num, str(e))
+                    fr = FileResult(
+                        filename=orig_name,
+                        type=("pdf" if page_num is not None else "image"),
+                        page=page_num,
+                        results=[],
+                        error=str(e),
+                    )
+                results_per_file.append(fr)
+        # 5) Build summary and return
+        # NOTE(review): despite its name, processed_files counts result entries (one per
+        # image or PDF page) that have no error, not distinct uploaded files.
+        processed_files_count = len([r for r in results_per_file if r.error is None or r.results])
+        summary = {
+            "requested_files": len(files),
+            "processed_files": processed_files_count,
+            "total_pages_images": total_pages_images,
+            "per_pdf_pages": per_pdf_pages,
+            "total_limit": total_limit,
+        }
+        return JSONResponse(OCROutput(summary=summary, files=results_per_file).model_dump())
+    finally:
+        # Cleanup temp files and directory
+        # Runs on success and on every raised HTTPException; a cleanup failure is only logged.
+        try:
+            shutil.rmtree(request_tmpdir)
+        except Exception:
+            logger.warning("Failed to cleanup tempdir %s", request_tmpdir)