Spaces:

venirdev
/

pdf-ocr

Sleeping

pdf-ocr / app.py

Upload 3 files

dcdbb2a verified 4 months ago

1.17 kB

	from fastapi import FastAPI, UploadFile, File, HTTPException
	from pdf2image import convert_from_bytes
	import pytesseract
	import os

	# ASGI application object; the OpenAPI metadata below is what FastAPI
	# shows in the generated docs. Route decorators in this file attach to it.
	app = FastAPI(
	    title="PDF OCR API",
	    description="Extract text from PDF using PyTesseract",
	    version="1.0",
	)

	def _ocr_pdf_bytes(pdf_bytes, lang="mar+eng"):
	    """Rasterize a PDF and OCR every page into one combined string.

	    This helper is fully synchronous (no awaits), so an async caller
	    should run it in a worker thread rather than on the event loop.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.
	        lang: Tesseract language spec passed to ``image_to_string``.
	            The default ``"mar+eng"`` requests Marathi plus English
	            (assumes both language packs are installed - TODO confirm
	            against the deployment image).

	    Returns:
	        The text of all pages, each preceded by a ``--- Page N ---``
	        header (1-based), with surrounding whitespace stripped.
	    """
	    images = convert_from_bytes(pdf_bytes)
	    sections = [
	        f"\n\n--- Page {page_no} ---\n\n"
	        f"{pytesseract.image_to_string(image, lang=lang).strip()}"
	        for page_no, image in enumerate(images, start=1)
	    ]
	    # Join once instead of repeated ``+=``; the final strip removes the
	    # leading separator of the first page (and any trailing blank page).
	    return "".join(sections).strip()


	@app.post("/extract-text/")
	async def extract_text_from_pdf(file: UploadFile = File(...)):
	    """Extract text from an uploaded PDF via OCR.

	    Args:
	        file: The uploaded file; its name must end in ``.pdf``
	            (case-insensitive).

	    Returns:
	        A dict with the original ``filename`` and the ``extracted_text``
	        of all pages.

	    Raises:
	        HTTPException: 400 if the upload is not named like a PDF
	            (including uploads with no filename); 500 if reading,
	            rasterizing, or OCR fails.
	    """
	    import asyncio

	    # ``filename`` can be None for some uploads; treat that as "not a PDF"
	    # instead of crashing with AttributeError before validation.
	    filename = file.filename or ""
	    if not filename.lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are supported")

	    try:
	        pdf_bytes = await file.read()
	        # Rasterization and OCR are blocking; run them in the default
	        # executor so other requests keep being served meanwhile.
	        loop = asyncio.get_running_loop()
	        extracted_text = await loop.run_in_executor(None, _ocr_pdf_bytes, pdf_bytes)
	    except Exception as e:
	        # Endpoint boundary: convert any failure into a 500 response while
	        # keeping the original exception chained for server-side logs.
	        raise HTTPException(status_code=500, detail=f"Error processing file: {e}") from e

	    return {"filename": file.filename, "extracted_text": extracted_text}

	@app.get("/")
	def home():
	    """Root endpoint: report that the service is up and point at the OCR route."""
	    status_message = "PDF OCR API is running! Use /extract-text endpoint to upload a PDF."
	    return {"message": status_message}