ocr-api2

Sleeping

ocr-api2 / app.py

Update app.py

b1c5894 verified 5 months ago

1.66 kB

	import asyncio
	import io
	from concurrent.futures import ThreadPoolExecutor

	import pytesseract
	from fastapi import FastAPI, UploadFile, File
	from fastapi.responses import JSONResponse
	from pdf2image import convert_from_bytes
	from PIL import Image

	# FastAPI application object; endpoints register on it through its route
	# decorators (e.g. ``@app.post``).
	app = FastAPI()

	# Runs OCR on a single page image.
	def ocr_page(image):
	    """Return the text Tesseract recognises in ``image``.

	    The image is handed to ``pytesseract.image_to_string`` with
	    ``lang="hin+eng"``. Any exception raised by that call is caught and
	    reported as an ``"⚠️ Error: ..."`` string instead of propagating, so a
	    single bad page cannot abort a multi-page job.
	    """
	    try:
	        page_text = pytesseract.image_to_string(image, lang="hin+eng")
	    except Exception as exc:
	        # Best effort: surface the failure in place of the page's text.
	        return "⚠️ Error: " + str(exc)
	    return page_text

	def _ocr_upload(contents, is_pdf):
	    """Run the blocking OCR pipeline on one upload and return the raw text.

	    Args:
	        contents: Raw bytes of the uploaded file.
	        is_pdf: True when the upload is a PDF; otherwise ``contents`` is
	            opened as a single image.

	    Returns:
	        The recognised text. For a PDF, the per-page texts are joined with
	        a blank line between pages.

	    Raises:
	        Exception: Whatever ``convert_from_bytes``, ``Image.open`` or
	            ``pytesseract.image_to_string`` raise for unreadable input.
	    """
	    if is_pdf:
	        pages = convert_from_bytes(contents)
	        # Send all pages to OCR in parallel. ocr_page reports per-page
	        # failures as text, so one bad page does not fail the whole PDF.
	        with ThreadPoolExecutor() as executor:
	            page_texts = list(executor.map(ocr_page, pages))
	        return "\n\n".join(page_texts)

	    # The context manager releases the image once OCR has finished.
	    with Image.open(io.BytesIO(contents)) as image:
	        return pytesseract.image_to_string(image, lang="hin+eng")


	@app.post("/ocr")
	async def extract_text(file: UploadFile = File(...)):
	    """Extract text from an uploaded JPG, PNG or PDF file.

	    The upload is accepted or rejected by its filename suffix
	    (``.jpg``, ``.jpeg``, ``.png``, ``.pdf``, case-insensitive). PDFs are
	    converted to page images with ``convert_from_bytes`` and every page is
	    OCR'd; other files are opened as a single image.

	    Args:
	        file: The uploaded file from the multipart request.

	    Returns:
	        ``{"text": ...}`` on success (a placeholder message when nothing
	        was recognised), a 400 ``JSONResponse`` for an unsupported or
	        missing filename, or a 500 ``JSONResponse`` when processing fails.
	    """
	    # The filename may be absent; treat that as an unsupported upload
	    # rather than failing on ``None.lower()``.
	    filename = (file.filename or "").lower()
	    allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")

	    if not filename.endswith(allowed_ext):
	        return JSONResponse(
	            content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
	            status_code=400,
	        )

	    contents = await file.read()

	    try:
	        # PDF rendering and Tesseract calls block for a long time; run them
	        # in a worker thread so the event loop can keep serving requests.
	        loop = asyncio.get_running_loop()
	        extracted_text = await loop.run_in_executor(
	            None, _ocr_upload, contents, filename.endswith(".pdf")
	        )
	    except Exception as e:
	        # NOTE(review): the raw exception text is sent to the client; kept
	        # for backward compatibility, but consider logging it instead.
	        return JSONResponse(
	            content={"error": "🚫 Failed to process file", "details": str(e)},
	            status_code=500,
	        )

	    return {"text": extracted_text.strip() or "⚠️ No text found."}