Spaces:

triflix
/

sortitout

Sleeping

App Files Community

triflix committed on Nov 20, 2025

Commit

c08cd96

verified ·

1 Parent(s): 76565ed

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -291

app.py CHANGED Viewed

@@ -1,321 +1,107 @@
-# app.py
-"""
-Single-file FastAPI app for HuggingFace Space (CPU) supporting:
-- Batch upload of images and PDFs (combination) up to TOTAL_FILE_LIMIT processed pages/images.
-- PDF -> images conversion (PyMuPDF) with per-pdf page limit.
-- Parallel image OCR (ThreadPoolExecutor) with safe concurrency defaults.
-- Detailed per-file results, per-page breakdown, and per-item error reporting.
-- Secure defaults: file type & size validation, temp-directory isolation, cleanup, non-root user compatibility.
-Usage (example):
-  POST /ocr?per_pdf_pages=3&total_limit=15
-  multipart/form-data files: file field can be repeated
-Produces JSON:
-{
-  "summary": { "processed_files": 3, "total_pages_images": 6 },
-  "files": [
-    {
-      "filename": "CVC.jpg",
-      "type": "image",
-      "page": null,
-      "results": [{"text":"...","confidence":0.99}, ...],
-      "error": null
-    },
-    {
-      "filename": "doc.pdf",
-      "type": "pdf",
-      "page": 1,
-      "results": [...],
-      "error": null
-    }
-  ]
-}
-"""
-from __future__ import annotations
 import os
-import shutil
-import tempfile
 import uuid
-import math
-import logging
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Optional, Dict, Any, Tuple
-from fastapi import FastAPI, UploadFile, File, HTTPException, Query
 from fastapi.responses import JSONResponse
-from pydantic import BaseModel, Field
-from pathlib import Path
-# OCR backend imports (local)
-# PaddleOCR heavy initialization occurs once at startup
 from paddleocr import PaddleOCR
-import fitz  # PyMuPDF
-from PIL import Image
-# --- Configuration and secure defaults ---
-ALLOWED_IMAGE_EXT = {".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".webp"}
-ALLOWED_DOC_EXT = {".pdf"}
-ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXT.union(ALLOWED_DOC_EXT)
-DEFAULT_PER_PDF_PAGES = 3
-DEFAULT_TOTAL_LIMIT = 15  # max total pages/images processed per request
-MAX_PER_PDF_PAGES = 10
-MAX_FILE_SIZE_BYTES = 25 * 1024 * 1024  # 25 MB per uploaded file
-OCR_DPI = 220  # dpi used when converting PDF pages to images
-MAX_WORKERS = min(4, (os.cpu_count() or 2))  # conservative concurrency
-# Logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("ocr_service")
-# --- Initialize PaddleOCR once (reuse across requests) ---
-# Language and model consistent with user's request (Marathi / Devanagari mobile recognizer).
-OCR_ENGINE = PaddleOCR(
-    lang="mr",
-    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
-    use_doc_orientation_classify=False,
-    use_doc_unwarping=False,
-    use_textline_orientation=False
-)
-# --- Response Schemas ---
-class OCRText(BaseModel):
-    text: str = Field(..., description="Recognized text line")
-    confidence: float = Field(..., ge=0.0, le=1.0)
-class FileResult(BaseModel):
-    filename: str
-    type: str  # "image" or "pdf"
-    page: Optional[int] = None  # for pdf pages; null for images
-    results: List[OCRText] = Field(default_factory=list)
-    error: Optional[str] = None
-class OCROutput(BaseModel):
-    summary: Dict[str, Any]
-    files: List[FileResult]
-# --- Utility functions ---
-def safe_extension(filename: str) -> str:
-    return Path(filename).suffix.lower()
-def validate_extension(filename: str) -> None:
-    ext = safe_extension(filename)
-    if ext not in ALLOWED_EXTENSIONS:
-        raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
-def save_upload_to_temp(upload: UploadFile, dest_dir: str) -> str:
-    """
-    Save UploadFile to a uniquely named temp file in dest_dir.
-    Validates max size and uses streaming write to avoid memory spikes.
-    Returns full path to saved file.
-    """
-    ext = safe_extension(upload.filename)
-    tmp_name = f"{uuid.uuid4()}{ext}"
-    tmp_path = os.path.join(dest_dir, tmp_name)
-    total = 0
-    with open(tmp_path, "wb") as out_f:
-        while True:
-            chunk = upload.file.read(1024 * 64)
-            if not chunk:
-                break
-            total += len(chunk)
-            if total > MAX_FILE_SIZE_BYTES:
-                out_f.close()
-                os.remove(tmp_path)
-                raise HTTPException(status_code=413, detail=f"File too large: {upload.filename}")
-            out_f.write(chunk)
-    return tmp_path
-def estimate_pdf_pages(pdf_path: str) -> int:
-    """Return number of pages in PDF without conversion."""
     doc = fitz.open(pdf_path)
-    count = len(doc)
-    doc.close()
-    return count
-def convert_pdf_to_images(pdf_path: str, dest_dir: str, pages_to_convert: int) -> List[Tuple[str, int]]:
-    """
-    Convert first N pages of PDF to images.
-    Returns list of tuples: (image_path, page_index1based)
-    """
-    doc = fitz.open(pdf_path)
-    page_count = len(doc)
-    limit = min(page_count, pages_to_convert)
-    images: List[Tuple[str, int]] = []
     for i in range(limit):
         page = doc.load_page(i)
-        pix = page.get_pixmap(dpi=OCR_DPI)
         img_name = f"{uuid.uuid4()}.jpg"
-        img_path = os.path.join(dest_dir, img_name)
         pix.save(img_path)
-        images.append((img_path, i + 1))  # page index 1-based
-    doc.close()
-    return images
-def ocr_image_path(image_path: str) -> List[OCRText]:
-    """
-    Run PaddleOCR on a single image path and return list of OCRText.
-    This function isolates the OCR call and normalizes the output.
-    """
-    # PaddleOCR's predict/ocr returns a nested result structure.
-    # Use .predict(input=...) as in the user's examples.
-    try:
-        res = OCR_ENGINE.predict(input=image_path)
-    except Exception as e:
-        logger.exception("PaddleOCR failed on %s", image_path)
-        raise RuntimeError(f"OCR engine failure: {str(e)}")
-    aggregated: List[OCRText] = []
-    # res expected to be a list of blocks/dicts with keys 'rec_texts' and 'rec_scores'
-    for block in res:
-        rec_texts = block.get("rec_texts") or []
-        rec_scores = block.get("rec_scores") or []
-        for t, s in zip(rec_texts, rec_scores):
-            # enforce numeric confidence and clip to [0,1]
-            try:
-                conf = float(s)
-            except Exception:
-                conf = 0.0
-            conf = max(0.0, min(1.0, conf))
-            aggregated.append(OCRText(text=str(t), confidence=conf))
-    return aggregated
-# --- FastAPI app and endpoint ---
-app = FastAPI(title="Batch PaddleOCR API (PDF+Image)", version="1.0")
-@app.post("/ocr", response_model=OCROutput)
-async def ocr_batch_endpoint(
-    files: List[UploadFile] = File(..., description="Upload up to 'total_limit' images/pages across files."),
-    per_pdf_pages: int = Query(DEFAULT_PER_PDF_PAGES, ge=1, le=MAX_PER_PDF_PAGES, description="Max pages to convert per PDF"),
-    total_limit: int = Query(DEFAULT_TOTAL_LIMIT, ge=1, le=50, description="Maximum total pages/images processed in request"),
-):
-    """
-    Accepts multiple files (images and PDFs). Converts PDFs -> images (first per_pdf_pages pages)
-    and runs OCR on each image. Ensures total converted pages/images <= total_limit.
-    Returns per-file per-page OCR results and summary.
-    """
-    if len(files) == 0:
-        raise HTTPException(status_code=400, detail="No files uploaded")
-    # Save uploaded files to request-scoped temporary directory; ensures cleanup
-    request_tmpdir = tempfile.mkdtemp(prefix="ocrreq_")
-    saved_files: List[Tuple[str, str]] = []  # (original_filename, saved_path)
-    try:
-        # 1) Validate and save uploads
-        for up in files:
-            validate_extension(up.filename)
-            saved_path = save_upload_to_temp(up, request_tmpdir)
-            saved_files.append((up.filename, saved_path))
-        # 2) Pre-scan PDFs to count required pages and enforce total_limit
-        total_pages_images = 0
-        pdfs_to_convert: List[Tuple[str, str, int]] = []  # (orig_name, saved_path, pages_to_convert)
-        image_files: List[Tuple[str, str]] = []  # (orig_name, saved_path)
-        for orig_name, path in saved_files:
-            ext = safe_extension(orig_name)
-            if ext in ALLOWED_IMAGE_EXT:
-                total_pages_images += 1
-                image_files.append((orig_name, path))
-            elif ext == ".pdf":
-                try:
-                    pages = estimate_pdf_pages(path)
-                except Exception as e:
-                    raise HTTPException(status_code=400, detail=f"Unable to read PDF {orig_name}: {str(e)}")
-                pages_to_convert = min(pages, per_pdf_pages)
-                pdfs_to_convert.append((orig_name, path, pages_to_convert))
-                total_pages_images += pages_to_convert
-            else:
-                # Shouldn't reach due to earlier validation
-                raise HTTPException(status_code=400, detail=f"Unsupported extension for {orig_name}")
-        if total_pages_images == 0:
-            raise HTTPException(status_code=400, detail="No valid images/pages to process")
-        if total_pages_images > total_limit:
-            raise HTTPException(
-                status_code=413,
-                detail=f"Request would process {total_pages_images} pages/images which exceeds total_limit {total_limit}"
-            )
-        # 3) Convert PDFs to images (store list of (filename,page,image_path))
-        converted_images: List[Tuple[str, Optional[int], str]] = []  # (orig_filename, page_or_None, image_path)
-        for orig_name, pdf_path, pages_to_convert in pdfs_to_convert:
-            try:
-                imgs = convert_pdf_to_images(pdf_path, request_tmpdir, pages_to_convert)
-            except Exception as e:
-                # if conversion fails for a file, record as zero and continue
-                logger.exception("PDF conversion failed for %s", orig_name)
-                converted_images.append((orig_name, None, f"__error__conversion__:{str(e)}"))
-                continue
-            for img_path, page_num in imgs:
-                converted_images.append((orig_name, page_num, img_path))
-        # include standalone image files
-        for orig_name, img_path in image_files:
-            converted_images.append((orig_name, None, img_path))
-        # 4) OCR all images - use ThreadPoolExecutor for parallelism within safe workers
-        results_per_file: List[FileResult] = []
-        futures = {}
-        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
-            for orig_name, page_num, img_path in converted_images:
-                if isinstance(img_path, str) and img_path.startswith("__error__conversion__"):
-                    # embed conversion error immediately
-                    err_msg = img_path.split(":", 1)[1] if ":" in img_path else "Conversion error"
-                    fr = FileResult(filename=orig_name, type="pdf", page=page_num, results=[], error=err_msg)
-                    results_per_file.append(fr)
-                    continue
-                futures[ex.submit(ocr_image_path, img_path)] = (orig_name, page_num, img_path)
-            for fut in as_completed(list(futures.keys())):
-                orig_name, page_num, img_path = futures[fut]
-                try:
-                    ocr_texts = fut.result()
-                    fr = FileResult(
-                        filename=orig_name,
-                        type=("pdf" if page_num is not None else "image"),
-                        page=page_num,
-                        results=ocr_texts,
-                        error=None,
-                    )
-                except Exception as e:
-                    logger.exception("OCR failed for %s (page=%s): %s", orig_name, page_num, str(e))
-                    fr = FileResult(
-                        filename=orig_name,
-                        type=("pdf" if page_num is not None else "image"),
-                        page=page_num,
-                        results=[],
-                        error=str(e),
-                    )
-                results_per_file.append(fr)
-        # 5) Build summary and return
-        processed_files_count = len([r for r in results_per_file if r.error is None or r.results])
-        summary = {
-            "requested_files": len(files),
-            "processed_files": processed_files_count,
-            "total_pages_images": total_pages_images,
-            "per_pdf_pages": per_pdf_pages,
-            "total_limit": total_limit,
-        }
-        return JSONResponse(OCROutput(summary=summary, files=results_per_file).model_dump())
-    finally:
-        # Cleanup temp files and directory
-        try:
-            shutil.rmtree(request_tmpdir)
-        except Exception:
-            logger.warning("Failed to cleanup tempdir %s", request_tmpdir)

 import os
 import uuid
+from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
+from typing import List
+import fitz
+# -------------------------------------------------------------------
+# FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
+# -------------------------------------------------------------------
+os.environ["PADDLE_HOME"] = "/app/paddle_home"
+os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"
+os.makedirs("/app/paddle_home", exist_ok=True)
+os.makedirs("/app/xdg_cache", exist_ok=True)
+# now safe to import paddlex/paddleocr
 from paddleocr import PaddleOCR
+# -------------------------------------------------------------------
+# PDF → IMAGE
+# -------------------------------------------------------------------
+def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(pdf_path)
     doc = fitz.open(pdf_path)
+    page_count = len(doc)
+    limit = page_count if max_pages is None else min(max_pages, page_count)
+    output_paths: List[str] = []
+    out_dir = "/app/pdf_images"
+    os.makedirs(out_dir, exist_ok=True)
     for i in range(limit):
         page = doc.load_page(i)
+        pix = page.get_pixmap(dpi=220)
         img_name = f"{uuid.uuid4()}.jpg"
+        img_path = os.path.join(out_dir, img_name)
         pix.save(img_path)
+        output_paths.append(img_path)
+    return output_paths
+# -------------------------------------------------------------------
+# OCR ENGINE
+# -------------------------------------------------------------------
+ocr_engine = PaddleOCR(
+    lang="mr",
+    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
+    use_doc_orientation_classify=False,
+    use_doc_unwarping=False,
+    use_textline_orientation=False
+)
+def extract_text(image_path: str):
+    result = ocr_engine.predict(input=image_path)
+    output = []
+    for block in result:
+        texts = block["rec_texts"]
+        scores = block["rec_scores"]
+        for t, s in zip(texts, scores):
+            output.append({"text": t, "confidence": float(s)})
+    return output
+# -------------------------------------------------------------------
+# FASTAPI
+# -------------------------------------------------------------------
+app = FastAPI()
+UPLOAD_DIR = "/app/uploads"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+@app.post("/ocr")
+async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
+    if len(files) > 15:
+        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")
+    final_output = []
+    for file in files:
+        filename = file.filename.lower()
+        ext = filename.split(".")[-1]
+        temp_name = f"{uuid.uuid4()}.{ext}"
+        temp_path = os.path.join(UPLOAD_DIR, temp_name)
+        with open(temp_path, "wb") as f:
+            f.write(await file.read())
+        # PDF
+        if filename.endswith(".pdf"):
+            img_paths = pdf_to_images(temp_path, max_pages=max_pages)
+            for img_path in img_paths:
+                final_output.extend(extract_text(img_path))
+        # Images
+        elif filename.endswith((".jpg", ".jpeg", ".png")):
+            final_output.extend(extract_text(temp_path))
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
+    return JSONResponse({"results": final_output})