ocr-api2

Sleeping

File size: 1,659 Bytes

import asyncio
import io
from concurrent.futures import ThreadPoolExecutor

import pytesseract
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image

# Application object; the /ocr route below registers itself on it via @app.post.
app = FastAPI()

# Runs OCR on a single page image.
def ocr_page(image):
    """Recognize the text in one page image using Hindi + English models.

    Args:
        image: The page image handed to ``pytesseract.image_to_string``.

    Returns:
        The result of ``pytesseract.image_to_string``. If recognition raises
        any ``Exception``, it is not propagated; a string of the form
        ``"⚠️ Error: <message>"`` is returned instead, so one bad page does
        not abort the caller.
    """
    try:
        page_text = pytesseract.image_to_string(image, lang="hin+eng")
    except Exception as exc:
        # Best-effort: report the failure inline rather than raising.
        page_text = "⚠️ Error: " + str(exc)
    return page_text

def _ocr_upload(contents, is_pdf):
    """Run the blocking OCR work for one upload and return the recognized text.

    Args:
        contents: Raw bytes of the uploaded file.
        is_pdf: True to rasterize ``contents`` as a PDF and OCR every page;
            False to open ``contents`` as a single image.

    Returns:
        The recognized text. For PDFs, the per-page results are joined with a
        blank line between pages.

    Raises:
        Exception: Whatever the PDF rasterizer, PIL, or pytesseract raise for
            unreadable input; the caller turns these into a 500 response.
    """
    if is_pdf:
        pages = convert_from_bytes(contents)
        # OCR all pages in parallel; collect the results while the pool is
        # still open so every page is finished before joining.
        with ThreadPoolExecutor() as executor:
            page_texts = list(executor.map(ocr_page, pages))
        return "\n\n".join(page_texts)

    # Close the image deterministically once OCR is done.
    with Image.open(io.BytesIO(contents)) as image:
        return pytesseract.image_to_string(image, lang="hin+eng")


@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    """Extract text from an uploaded JPG, PNG, or PDF file via OCR.

    Args:
        file: The uploaded file. Its name must end in .jpg, .jpeg, .png or
            .pdf (case-insensitive).

    Returns:
        ``{"text": ...}`` on success, with a placeholder message when no text
        was recognized. A 400 JSON error for an unsupported extension or an
        empty upload, and a 500 JSON error (with the exception message in
        ``details``) when processing fails.
    """
    # The filename is optional on an upload; treat a missing one as
    # "no extension" instead of crashing on None.lower().
    filename = (file.filename or "").lower()
    allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")

    if not filename.endswith(allowed_ext):
        return JSONResponse(
            content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
            status_code=400
        )

    contents = await file.read()
    if not contents:
        # An empty body is a client error, not a processing failure.
        return JSONResponse(
            content={"error": "❌ Uploaded file is empty."},
            status_code=400
        )

    try:
        # Rasterizing and OCR are blocking; run them off the event loop so
        # other requests keep being served while this one is processed.
        loop = asyncio.get_running_loop()
        extracted_text = await loop.run_in_executor(
            None, _ocr_upload, contents, filename.endswith(".pdf")
        )

        return {"text": extracted_text.strip() or "⚠️ No text found."}

    except Exception as e:
        return JSONResponse(
            content={"error": "🚫 Failed to process file", "details": str(e)},
            status_code=500
        )