File size: 1,659 Bytes
1540402
1db1a4f
1540402
 
1db1a4f
1540402
b1c5894
1540402
 
 
b1c5894
 
 
 
 
 
 
1540402
 
6298ba6
 
 
 
 
 
 
 
 
1db1a4f
6298ba6
 
 
 
 
b1c5894
 
 
 
 
 
 
6298ba6
 
b1c5894
6298ba6
 
b1c5894
6298ba6
 
 
 
b1c5894
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import asyncio
import io
from concurrent.futures import ThreadPoolExecutor

import pytesseract
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image

app = FastAPI()

# एक page को OCR में डालने वाली function
def ocr_page(image):
    try:
        return pytesseract.image_to_string(image, lang="hin+eng")
    except Exception as e:
        return f"⚠️ Error: {str(e)}"

@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    filename = file.filename.lower()
    allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")

    if not filename.endswith(allowed_ext):
        return JSONResponse(
            content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
            status_code=400
        )

    contents = await file.read()
    extracted_text = ""

    try:
        if filename.endswith(".pdf"):
            images = convert_from_bytes(contents)

            # ✅ सभी pages को parallel OCR में भेजते हैं
            with ThreadPoolExecutor() as executor:
                results = executor.map(ocr_page, images)

            extracted_text = "\n\n".join(results)

        else:
            image = Image.open(io.BytesIO(contents))
            extracted_text = pytesseract.image_to_string(image, lang="hin+eng")

        return {"text": extracted_text.strip() or "⚠️ No text found."}

    except Exception as e:
        return JSONResponse(
            content={"error": "🚫 Failed to process file", "details": str(e)},
            status_code=500
        )