Upload 5 files
Browse files- README.md +15 -7
- app.py +86 -0
- image_processor.py +40 -0
- packages.txt +2 -0
- requirements.txt +5 -0
README.md
CHANGED
|
@@ -1,11 +1,19 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
-
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: High-Speed OCR API
|
| 3 |
+
emoji: ⚡️📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
+
## High-Speed OCR API
|
| 11 |
+
|
| 12 |
+
This Space provides a REST API for fast OCR on images and PDFs.
|
| 13 |
+
|
| 14 |
+
**Endpoints:**
|
| 15 |
+
- `/docs`: Interactive API documentation.
|
| 16 |
+
- `POST /ocr-image`: Extracts text from a single image uploaded as the multipart form field `file`.
|
| 17 |
+
- `POST /ocr-pdf`: Extracts text from all pages of a PDF document uploaded as the multipart form field `file`.
|
| 18 |
+
|
| 19 |
+
Built with FastAPI and Tesseract, optimized for performance on free hardware.
|
app.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import io
|
| 5 |
+
import uvicorn
|
| 6 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 7 |
+
from fastapi.responses import StreamingResponse
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
+
from typing import List
|
| 12 |
+
|
| 13 |
+
# For simplicity, we directly use the fast functions.
|
| 14 |
+
# The `image_processor.py` file now contains the optimized versions.
|
| 15 |
+
from image_processor import enhance_image_fast, extract_text_from_image_fast, process_pdf_in_parallel
|
| 16 |
+
|
| 17 |
+
# Application object; the metadata below is what FastAPI shows on /docs.
app = FastAPI(
    title="High-Speed OCR API",
    description="An API to extract text from images and PDFs, optimized for speed.",
    version="4.0.0-hf",
)

# CORS middleware: allow requests from any origin.
# Credentials are disabled on purpose: a wildcard origin must not be combined
# with credentialed requests (the FastAPI CORS documentation requires explicit
# origins when allow_credentials=True). None of the endpoints in this file
# read cookies or authorization headers, so nothing here needs them.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 32 |
+
|
| 33 |
+
# Pydantic Models for structured responses
|
| 34 |
+
class ImageOCRResponse(BaseModel):
    # Response body of the /ocr-image endpoint.
    # NOTE: documented with `#` comments rather than a docstring so the
    # generated OpenAPI schema stays exactly as it was.

    # Name of the uploaded file, as supplied with the upload.
    filename: str
    # Text extracted from the image.
    text: str
|
| 37 |
+
|
| 38 |
+
class PageResult(BaseModel):
    # OCR output for a single PDF page.
    # NOTE: documented with `#` comments rather than a docstring so the
    # generated OpenAPI schema stays exactly as it was.

    # Page index; process_pdf_in_parallel numbers pages starting at 1.
    page_number: int
    # Text extracted from that page.
    text: str
|
| 41 |
+
|
| 42 |
+
class PDFOCRResponse(BaseModel):
    # Response body of the /ocr-pdf endpoint.
    # NOTE: documented with `#` comments rather than a docstring so the
    # generated OpenAPI schema stays exactly as it was.

    # Name of the uploaded file, as supplied with the upload.
    filename: str
    # Number of entries in `results` (the endpoint fills it with len(results)).
    total_pages: int
    # One entry per processed page.
    results: List[PageResult]
|
| 46 |
+
|
| 47 |
+
# API Endpoints
|
| 48 |
+
@app.get("/", tags=["General"])
def read_root():
    # Landing route: returns a short greeting that points callers at /docs.
    # (Kept as a comment, not a docstring, so the OpenAPI description of this
    # operation is unchanged.)
    welcome_text = "Welcome to the High-Speed OCR API. See /docs for documentation."
    return {"message": welcome_text}
|
| 51 |
+
|
| 52 |
+
@app.post("/ocr-image", response_model=ImageOCRResponse, tags=["OCR"])
async def ocr_image_endpoint(file: UploadFile = File(...)):
    """Accepts an image, enhances it, and returns the extracted text.

    Responds with HTTP 400 when the upload is not declared as an image or
    cannot be decoded as one, and with HTTP 500 when processing fails.
    """
    # Function-scope imports keep this handler self-contained; both packages
    # are already dependencies of this module.
    from fastapi.concurrency import run_in_threadpool
    from PIL import UnidentifiedImageError

    # content_type is None when the client omits the part's Content-Type
    # header; treat that as "not an image" instead of raising AttributeError.
    if not (file.content_type or "").startswith("image/"):
        raise HTTPException(status_code=400, detail="File must be an image.")

    def _run_ocr(data: bytes) -> str:
        # Decoding, enhancement and Tesseract are all blocking calls.
        with Image.open(io.BytesIO(data)) as image:
            enhanced_image = enhance_image_fast(image)
        return extract_text_from_image_fast(enhanced_image)

    try:
        contents = await file.read()
        # Run the blocking pipeline in a worker thread so the event loop can
        # keep serving other requests while OCR is in progress.
        text = await run_in_threadpool(_run_ocr, contents)
    except UnidentifiedImageError as e:
        # The bytes are not a recognisable image: that is a client error.
        raise HTTPException(status_code=400, detail="File is not a valid image.") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing image OCR: {e}") from e
    return {"filename": file.filename, "text": text}
|
| 65 |
+
|
| 66 |
+
@app.post("/ocr-pdf", response_model=PDFOCRResponse, tags=["OCR"])
async def ocr_pdf_endpoint(file: UploadFile = File(...)):
    """Accepts a PDF, extracts text from each page in parallel, and returns structured results.

    Responds with HTTP 400 when the upload is not declared as
    ``application/pdf`` and with HTTP 500 when processing fails.
    """
    # Function-scope import keeps this handler self-contained; fastapi is
    # already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="File must be a PDF.")

    try:
        contents = await file.read()
        # PDF rasterisation and OCR are blocking; run them in a worker thread
        # so the event loop is not stalled for the duration of the job.
        results = await run_in_threadpool(process_pdf_in_parallel, contents)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {e}") from e
    return {
        "filename": file.filename,
        "total_pages": len(results),
        "results": results,
    }
|
| 81 |
+
|
| 82 |
+
# This block allows running the app locally for testing
|
| 83 |
+
if __name__ == "__main__":
    # Hugging Face Spaces expects the app on port 7860; the PORT environment
    # variable overrides that default when it is set.
    listen_port = int(os.getenv("PORT", "7860"))
    uvicorn.run("app:app", host="0.0.0.0", port=listen_port, reload=True)
|
image_processor.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# image_processor.py
|
| 2 |
+
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import pytesseract
|
| 5 |
+
from pdf2image import convert_from_bytes
|
| 6 |
+
import os
|
| 7 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
+
|
| 9 |
+
def enhance_image_fast(image: Image.Image, threshold: int = 155) -> Image.Image:
    """Binarize *image* for OCR using a lightweight, speed-oriented pipeline.

    The image is converted to 8-bit grayscale and then thresholded: pixels
    darker than *threshold* become black, all others white.

    Args:
        image: Source image in any Pillow mode.
        threshold: Grayscale cut-off in the 0-255 range. Defaults to 155.

    Returns:
        A new 1-bit (mode ``'1'``) image; the input is not modified.
    """
    # convert('L') discards the alpha channel, so a transparent background
    # would usually come out black and swallow dark text. Flatten such images
    # onto a white canvas first.
    has_alpha = image.mode in ("RGBA", "LA") or (
        image.mode == "P" and "transparency" in image.info
    )
    if has_alpha:
        rgba = image.convert("RGBA")
        white_canvas = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
        image = Image.alpha_composite(white_canvas, rgba)

    grayscale = image.convert("L")
    return grayscale.point(lambda level: 0 if level < threshold else 255, "1")
|
| 12 |
+
|
| 13 |
+
def extract_text_from_image_fast(image: Image.Image) -> str:
    """Run Tesseract on *image* and return the recognized text.

    Tesseract is invoked with ``--oem 1 --psm 6``; per the Tesseract
    command-line documentation that selects the LSTM engine and treats the
    input as a single uniform block of text.
    """
    return pytesseract.image_to_string(image, config=r'--oem 1 --psm 6')
|
| 18 |
+
|
| 19 |
+
def _process_single_page_fast(page_image: Image.Image) -> str:
    """Enhance one rendered page and return the text OCR finds on it."""
    return extract_text_from_image_fast(enhance_image_fast(page_image))
|
| 23 |
+
|
| 24 |
+
def process_pdf_in_parallel(
    pdf_bytes: bytes, *, dpi: int = 150, max_workers: int = 2
) -> list[dict]:
    """Rasterize a PDF and OCR its pages concurrently with the fast pipeline.

    Args:
        pdf_bytes: Raw content of the PDF file.
        dpi: Resolution used when rendering pages to images. Defaults to 150.
        max_workers: Size of the thread pool that runs per-page OCR.
            Defaults to 2.

    Returns:
        One ``{"page_number": int, "text": str}`` dict per page, in document
        order, with page numbers starting at 1.
    """
    print(f"FAST MODE: Converting PDF pages at {dpi} DPI...")
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so enumerating them
        # from 1 gives the matching page numbers.
        page_texts = executor.map(_process_single_page_fast, images)
        page_results = [
            {"page_number": page_number, "text": text}
            for page_number, text in enumerate(page_texts, start=1)
        ]

    print("FAST MODE: Finished all pages.")
    return page_results
|
packages.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr
|
| 2 |
+
poppler-utils
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
uvicorn
Pillow
python-multipart
pytesseract
pdf2image
|