Spaces:

bk939448
/

ocr-api

Sleeping

File size: 1,302 Bytes

import asyncio
import io

from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
import pytesseract

# Application instance; the `/ocr` route in this file is registered on it.
app = FastAPI()

def _run_ocr(contents: bytes, is_pdf: bool, lang: str = "hin+eng") -> str:
    """Run Tesseract OCR over an uploaded document and return the raw text.

    This is a synchronous, blocking helper; it is meant to be executed in a
    worker thread rather than directly on the event loop.

    Args:
        contents: Raw bytes of the uploaded file.
        is_pdf: True to rasterise ``contents`` as a PDF and OCR every page;
            False to decode it as a single image.
        lang: Tesseract language spec forwarded to ``image_to_string``.

    Returns:
        The recognised text. For PDFs the per-page texts are joined with
        ``"\\n\\n"``.

    Raises:
        Exception: Whatever pdf2image, Pillow or pytesseract raise for input
            they cannot process; the caller converts these into a response.
    """
    if is_pdf:
        pages = convert_from_bytes(contents)
        return "\n\n".join(
            pytesseract.image_to_string(page, lang=lang) for page in pages
        )

    # Context manager so the decoded image is released even if OCR raises.
    with Image.open(io.BytesIO(contents)) as image:
        return pytesseract.image_to_string(image, lang=lang)


@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    """Extract text from an uploaded JPG, PNG or PDF using Tesseract OCR.

    Args:
        file: The uploaded file. Its type is decided from the filename
            extension (case-insensitive).

    Returns:
        On success, a dict ``{"text": ...}`` holding the stripped OCR output,
        or a placeholder message when no text was recognised.
        A 400 ``JSONResponse`` when the extension is not supported (or the
        upload carries no filename).
        A 500 ``JSONResponse`` with ``error`` and ``details`` keys when the
        file cannot be processed.
    """
    # `filename` can be None on an upload; treat that as "no extension"
    # instead of crashing on `.lower()`.
    filename = (file.filename or "").lower()
    allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")

    if not filename.endswith(allowed_ext):
        return JSONResponse(
            content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
            status_code=400
        )

    contents = await file.read()

    try:
        # OCR is CPU-bound and blocking; run it on the default executor so
        # the event loop keeps serving other requests meanwhile.
        loop = asyncio.get_running_loop()
        extracted_text = await loop.run_in_executor(
            None, _run_ocr, contents, filename.endswith(".pdf")
        )
    except Exception as e:
        return JSONResponse(
            content={"error": "🚫 Failed to process file", "details": str(e)},
            status_code=500
        )

    return {"text": extracted_text.strip() or "⚠️ No text found."}