Sugamdeol committed on
Commit
0776232
·
verified ·
1 Parent(s): e77d774

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +15 -7
  2. app.py +86 -0
  3. image_processor.py +40 -0
  4. packages.txt +2 -0
  5. requirements.txt +5 -0
README.md CHANGED
@@ -1,11 +1,19 @@
1
  ---
2
- title: Ocr Api
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: purple
6
  sdk: docker
7
- pinned: false
8
- license: mit
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: High-Speed OCR API
3
+ emoji: ⚡️📄
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
 
8
  ---
9
 
10
+ ## High-Speed OCR API
11
+
12
+ This Space provides a REST API for fast OCR on images and PDFs.
13
+
14
+ **Endpoints:**
15
+ - `/docs`: Interactive API documentation.
16
+ - `/ocr-image`: Extracts text from a single image.
17
+ - `/ocr-pdf`: Extracts text from all pages of a PDF document.
18
+
19
+ Built with FastAPI and Tesseract, optimized for performance on free hardware.
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import os
4
+ import io
5
+ import uvicorn
6
+ from fastapi import FastAPI, File, UploadFile, HTTPException
7
+ from fastapi.responses import StreamingResponse
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from PIL import Image
10
+ from pydantic import BaseModel
11
+ from typing import List
12
+
13
+ # For simplicity, we directly use the fast functions.
14
+ # The `image_processor.py` file now contains the optimized versions.
15
+ from image_processor import enhance_image_fast, extract_text_from_image_fast, process_pdf_in_parallel
16
+
17
# Application instance; title, description and version are the API metadata
# handed to FastAPI.
app = FastAPI(
    title="High-Speed OCR API",
    description="An API to extract text from images and PDFs, optimized for speed.",
    version="4.0.0-hf",
)

# CORS middleware: allow requests from any origin.
# Credentials are disabled on purpose. FastAPI's CORSMiddleware documentation
# states that wildcard origins/methods/headers must not be combined with
# allow_credentials=True, and none of the endpoints in this file read cookies
# or authorization headers.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
32
+
33
+ # Pydantic Models for structured responses
34
class ImageOCRResponse(BaseModel):
    # Response body of POST /ocr-image.
    # (Documented with comments rather than a docstring so the generated
    # schema is left exactly as it was.)
    filename: str  # filename reported for the uploaded file
    text: str  # text extracted from the image
37
+
38
class PageResult(BaseModel):
    # OCR output for a single PDF page.
    # (Documented with comments rather than a docstring so the generated
    # schema is left exactly as it was.)
    page_number: int  # position of the page within the document
    text: str  # text extracted from that page
41
+
42
class PDFOCRResponse(BaseModel):
    # Response body of POST /ocr-pdf.
    # (Documented with comments rather than a docstring so the generated
    # schema is left exactly as it was.)
    filename: str  # filename reported for the uploaded file
    total_pages: int  # the endpoint fills this with len(results)
    results: List[PageResult]  # one entry per processed page
46
+
47
+ # API Endpoints
48
@app.get("/", tags=["General"])
def read_root():
    # Landing route: returns a static greeting that points clients at /docs.
    # Kept as a comment (not a docstring) so the published route description
    # stays unchanged.
    greeting = "Welcome to the High-Speed OCR API. See /docs for documentation."
    return {"message": greeting}
51
+
52
@app.post("/ocr-image", response_model=ImageOCRResponse, tags=["OCR"])
async def ocr_image_endpoint(file: UploadFile = File(...)):
    """Accepts an image, enhances it, and returns the extracted text."""
    # Function-scope imports keep this handler self-contained.
    from fastapi.concurrency import run_in_threadpool
    from PIL import UnidentifiedImageError

    # Responses:
    #   200 -> {"filename": ..., "text": ...}
    #   400 -> upload is not declared as image/* or cannot be decoded as an image
    #   500 -> any other failure during enhancement or OCR

    # content_type may be None when the client omits the part's Content-Type;
    # treat that as "not an image" instead of crashing on None.startswith.
    if not (file.content_type or "").startswith("image/"):
        raise HTTPException(status_code=400, detail="File must be an image.")
    try:
        contents = await file.read()
        try:
            image = Image.open(io.BytesIO(contents))
        except UnidentifiedImageError as e:
            # Bad client input, not a server fault.
            raise HTTPException(status_code=400, detail="File is not a valid image.") from e

        # Enhancement and Tesseract are blocking calls; run them in a worker
        # thread so the event loop can keep serving other requests.
        def _run_ocr() -> str:
            return extract_text_from_image_fast(enhance_image_fast(image))

        text = await run_in_threadpool(_run_ocr)
        return {"filename": file.filename, "text": text}
    except HTTPException:
        # Let deliberate HTTP errors through instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing image OCR: {e}") from e
65
+
66
@app.post("/ocr-pdf", response_model=PDFOCRResponse, tags=["OCR"])
async def ocr_pdf_endpoint(file: UploadFile = File(...)):
    """Accepts a PDF, extracts text from each page in parallel, and returns structured results."""
    # Function-scope import keeps this handler self-contained.
    from fastapi.concurrency import run_in_threadpool

    # Responses:
    #   200 -> {"filename": ..., "total_pages": ..., "results": [...]}
    #   400 -> upload is not declared as application/pdf, or is empty
    #   500 -> any failure while converting or OCR-ing the document
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="File must be a PDF.")
    try:
        contents = await file.read()
        if not contents:
            # An empty body is a client error; report it as such.
            raise HTTPException(status_code=400, detail="Uploaded PDF is empty.")
        # PDF rasterisation and OCR are blocking; run them off the event loop.
        results = await run_in_threadpool(process_pdf_in_parallel, contents)
        return {
            "filename": file.filename,
            "total_pages": len(results),
            "results": results,
        }
    except HTTPException:
        # Let deliberate HTTP errors through instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {e}") from e
81
+
82
+ # This block allows running the app locally for testing
83
if __name__ == "__main__":
    # Direct-execution entry point.
    # Hugging Face Spaces expects the app to run on port 7860, so that is the
    # fallback when the PORT environment variable is not set.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 7860)),
        reload=True,
    )
image_processor.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # image_processor.py
2
+
3
+ from PIL import Image
4
+ import pytesseract
5
+ from pdf2image import convert_from_bytes
6
+ import os
7
+ from concurrent.futures import ThreadPoolExecutor
8
+
9
def enhance_image_fast(image: Image.Image, threshold: int = 155) -> Image.Image:
    """A lightweight image enhancement pipeline optimized for speed.

    Converts the image to mode 'L' and then maps every pixel to pure black or
    pure white, producing a mode '1' image.

    Args:
        image: Source image.
        threshold: Cut-off level. Pixels with a value below it become 0 and
            all others become 255. Defaults to 155, the value that used to be
            hard-coded, so existing callers behave exactly as before.

    Returns:
        A new binarized image; the input image is not modified.
    """
    grayscale = image.convert('L')
    return grayscale.point(lambda level: 0 if level < threshold else 255, '1')
12
+
13
def extract_text_from_image_fast(image: Image.Image) -> str:
    """Run Tesseract on *image* and return the recognized text.

    Tesseract is invoked with ``--oem 1 --psm 6``.
    """
    # NOTE(review): per the Tesseract manual, OEM 1 selects the LSTM engine and
    # PSM 6 assumes a single uniform block of text — confirm against the
    # installed Tesseract version.
    return pytesseract.image_to_string(image, config=r'--oem 1 --psm 6')
18
+
19
def _process_single_page_fast(page_image: Image.Image) -> str:
    """OCR one page image: enhance it, then extract its text."""
    return extract_text_from_image_fast(enhance_image_fast(page_image))
23
+
24
def process_pdf_in_parallel(pdf_bytes: bytes) -> list[dict]:
    """Render a PDF to page images and OCR them with a two-worker thread pool.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        One dict per page, in document order, of the form
        ``{"page_number": <1-based index>, "text": <extracted text>}``.
    """
    print("FAST MODE: Converting PDF pages at 150 DPI...")
    pages = convert_from_bytes(pdf_bytes, dpi=150)
    print(f"FAST MODE: PDF has {len(pages)} pages. Starting optimized parallel OCR...")

    with ThreadPoolExecutor(max_workers=2) as pool:
        # Executor.map yields results in input order, so enumerating from 1
        # gives each text its page number.
        page_texts = pool.map(_process_single_page_fast, pages)
        page_results = [
            {"page_number": number, "text": page_text}
            for number, page_text in enumerate(page_texts, start=1)
        ]

    print("FAST MODE: Finished all pages.")
    return page_results
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tesseract-ocr
2
+ poppler-utils
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi
2
+ Pillow
3
+ python-multipart
4
+ pytesseract
5
+ pdf2image