ocr-api2 / app.py
bk939448's picture
Update app.py
b1c5894 verified
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
import pytesseract
import io
from concurrent.futures import ThreadPoolExecutor
# FastAPI application object; the /ocr route below is registered on it.
app = FastAPI()
# Runs OCR on a single page image.
def ocr_page(image):
    """Return the Hindi+English OCR text for one page image.

    Any exception raised while recognising the page is converted into a
    "⚠️ Error: ..." string and returned instead of being propagated, so a
    single failing page does not abort the caller.
    """
    try:
        page_text = pytesseract.image_to_string(image, lang="hin+eng")
    except Exception as exc:
        # Best-effort: report the failure in place of the page's text.
        return f"⚠️ Error: {exc!s}"
    else:
        return page_text
@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    """Extract Hindi/English text from an uploaded JPG, PNG, or PDF.

    A PDF is converted to page images with ``convert_from_bytes``; each page
    is passed to ``ocr_page`` on a thread pool and the page texts are joined
    with a blank line. Any other accepted file is opened as a single image
    and passed to pytesseract directly.

    Args:
        file: The uploaded file. Its type is decided only from the filename
            extension (case-insensitive), not from the file content.

    Returns:
        ``{"text": ...}`` on success, with a placeholder message when the
        OCR result is empty or whitespace only; a 400 JSON error when the
        filename is missing or has an unsupported extension; a 500 JSON
        error carrying the exception text when conversion or OCR fails.
    """
    # NOTE(review): conversion and OCR below are synchronous calls inside a
    # coroutine, so the event loop is blocked while a file is processed.

    # The upload's filename is optional; treat a missing name like an
    # unsupported extension instead of failing on None.lower().
    filename = (file.filename or "").lower()
    allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")
    if not filename.endswith(allowed_ext):
        return JSONResponse(
            content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
            status_code=400
        )

    contents = await file.read()
    try:
        if filename.endswith(".pdf"):
            images = convert_from_bytes(contents)
            # Send all pages to OCR in parallel; map() yields results in
            # page order.
            with ThreadPoolExecutor() as executor:
                results = executor.map(ocr_page, images)
                extracted_text = "\n\n".join(results)
        else:
            # Close the image as soon as OCR has finished with it.
            with Image.open(io.BytesIO(contents)) as image:
                extracted_text = pytesseract.image_to_string(image, lang="hin+eng")
        return {"text": extracted_text.strip() or "⚠️ No text found."}
    except Exception as e:
        # Top-level boundary: report any processing failure as a 500.
        return JSONResponse(
            content={"error": "🚫 Failed to process file", "details": str(e)},
            status_code=500
        )