Spaces:

Sugamdeol
/

ocr-api

Sleeping

App Files Files Community

Sugamdeol committed on Jun 21, 2025

Commit

0776232

verified ·

1 Parent(s): e77d774

Upload 5 files

Browse files

Files changed (5) hide show

README.md +15 -7
app.py +86 -0
image_processor.py +40 -0
packages.txt +2 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,11 +1,19 @@
 ---
-title: Ocr Api
-emoji: 🌍
-colorFrom: red
-colorTo: purple
 sdk: docker
-pinned: false
-license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: High-Speed OCR API
+emoji: ⚡️📄
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 ---
+## High-Speed OCR API
+This Space provides a REST API for fast OCR on images and PDFs.
+**Endpoints:**
+- `/docs`: Interactive API documentation.
+- `/ocr-image`: Extracts text from a single image.
+- `/ocr-pdf`: Extracts text from all pages of a PDF document.
+Built with FastAPI and Tesseract, optimized for performance on free hardware.

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# app.py
+import io
+import os
+from typing import List
+import uvicorn
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.concurrency import run_in_threadpool
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
+from PIL import Image
+from PIL import UnidentifiedImageError
+from pydantic import BaseModel
+# For simplicity, we directly use the fast functions.
+# The `image_processor.py` file now contains the optimized versions.
+from image_processor import enhance_image_fast, extract_text_from_image_fast, process_pdf_in_parallel
+app = FastAPI(
+    title="High-Speed OCR API",
+    description="An API to extract text from images and PDFs, optimized for speed.",
+    version="4.0.0-hf"
+)
+# CORS Middleware to allow requests from any origin
+origins = ["*"]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Pydantic Models for structured responses
+class ImageOCRResponse(BaseModel):  # response body of POST /ocr-image
+    filename: str  # name of the uploaded file, echoed back to the caller
+    text: str  # text extracted from the image by OCR
+class PageResult(BaseModel):  # OCR output for one page of a PDF
+    page_number: int  # 1-based position of the page in the document
+    text: str  # text extracted from that page
+class PDFOCRResponse(BaseModel):  # response body of POST /ocr-pdf
+    filename: str  # name of the uploaded PDF, echoed back to the caller
+    total_pages: int  # number of entries in `results`
+    results: List[PageResult]  # one PageResult per page, in page order
+# API Endpoints
+@app.get("/", tags=["General"])
+def read_root():
+    return {"message": "Welcome to the High-Speed OCR API. See /docs for documentation."}
+@app.post("/ocr-image", response_model=ImageOCRResponse, tags=["OCR"])
+async def ocr_image_endpoint(file: UploadFile = File(...)):
+    """Accepts an image, enhances it, and returns the extracted text."""
+    if not file.content_type.startswith("image/"):
+        raise HTTPException(status_code=400, detail="File must be an image.")
+    try:
+        contents = await file.read()
+        image = Image.open(io.BytesIO(contents))
+        enhanced_image = enhance_image_fast(image)
+        text = extract_text_from_image_fast(enhanced_image)
+        return {"filename": file.filename, "text": text}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing image OCR: {e}")
+@app.post("/ocr-pdf", response_model=PDFOCRResponse, tags=["OCR"])
+async def ocr_pdf_endpoint(file: UploadFile = File(...)):
+    """Accepts a PDF, extracts text from each page in parallel, and returns structured results."""
+    if file.content_type != "application/pdf":
+        raise HTTPException(status_code=400, detail="File must be a PDF.")
+    try:
+        contents = await file.read()
+        results = process_pdf_in_parallel(contents)
+        return {
+            "filename": file.filename,
+            "total_pages": len(results),
+            "results": results
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing PDF: {e}")
+# This block allows running the app locally for testing
+if __name__ == "__main__":
+    # Hugging Face Spaces expects the app to run on port 7860
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=True)

image_processor.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# image_processor.py
+from PIL import Image
+import pytesseract
+from pdf2image import convert_from_bytes
+import os
+from concurrent.futures import ThreadPoolExecutor
+def enhance_image_fast(image: Image.Image) -> Image.Image:
+    """A lightweight image enhancement pipeline optimized for speed."""
+    return image.convert('L').point(lambda x: 0 if x < 155 else 255, '1')
+def extract_text_from_image_fast(image: Image.Image) -> str:
+    """Extracts text using Tesseract with a configuration favoring speed."""
+    fast_config = r'--oem 1 --psm 6'
+    text = pytesseract.image_to_string(image, config=fast_config)
+    return text
+def _process_single_page_fast(page_image: Image.Image) -> str:
+    """Helper function that uses the new fast methods."""
+    enhanced_image = enhance_image_fast(page_image)
+    return extract_text_from_image_fast(enhanced_image)
+def process_pdf_in_parallel(pdf_bytes: bytes) -> list[dict]:
+    """Converts a PDF and processes pages in parallel using the FAST pipeline."""
+    print("FAST MODE: Converting PDF pages at 150 DPI...")
+    images = convert_from_bytes(pdf_bytes, dpi=150)
+    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")
+    page_results = []
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        results = executor.map(_process_single_page_fast, images)
+        for i, text in enumerate(results):
+            page_results.append({
+                "page_number": i + 1,
+                "text": text
+            })
+    print("FAST MODE: Finished all pages.")
+    return page_results

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ tesseract-ocr
2	+ poppler-utils

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+fastapi
+Pillow
+python-multipart
+pytesseract
+pdf2image