"""FastAPI service that extracts text from uploaded PDF files via Tesseract OCR."""

import os

import pytesseract
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.concurrency import run_in_threadpool
from pdf2image import convert_from_bytes

# Language string handed to pytesseract; codes are joined with "+".
# The original inline comment labelled "mar" as Marathi (alongside "eng").
# NOTE(review): presumably the matching Tesseract language data must be
# installed on the host — confirm for the deployment image.
OCR_LANGUAGES = "mar+eng"

app = FastAPI(
    title="PDF OCR API",
    description="Extract text from PDF using PyTesseract",
    version="1.0",
)


def _ocr_pdf_bytes(pdf_bytes: bytes) -> str:
    """Convert *pdf_bytes* to page images and return their combined OCR text.

    Each page contributes a ``--- Page N ---`` header (N starts at 1) followed
    by that page's stripped text. The joined result is stripped once more, so
    the output never starts or ends with whitespace.
    """
    page_images = convert_from_bytes(pdf_bytes)
    sections = [
        f"\n\n--- Page {page_number} ---\n\n"
        f"{pytesseract.image_to_string(image, lang=OCR_LANGUAGES).strip()}"
        for page_number, image in enumerate(page_images, start=1)
    ]
    # Join once instead of growing a string with "+=" on every page.
    return "".join(sections).strip()


@app.post("/extract-text/")
async def extract_text_from_pdf(file: UploadFile = File(...)):
    """Run OCR over an uploaded PDF and return the extracted text.

    Args:
        file: The uploaded file; its name must end with ``.pdf``
            (case-insensitive).

    Returns:
        A dict with ``filename`` (the uploaded file's name) and
        ``extracted_text`` (per-page text, each page preceded by a
        ``--- Page N ---`` header).

    Raises:
        HTTPException: 400 when the upload has no filename or the name does
            not end with ``.pdf``; 500 when reading, rasterizing or OCR fails.
    """
    filename = file.filename
    # The filename can be missing; reject that with the same 400 rather than
    # crashing on ``None.lower()``.
    if not filename or not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
        # Rasterization and OCR are blocking calls; run them in a worker
        # thread so the event loop stays free to serve other requests.
        extracted_text = await run_in_threadpool(_ocr_pdf_bytes, pdf_bytes)
    except Exception as exc:
        # Boundary handler: translate any processing failure into a 500 while
        # keeping the original exception chained for server-side tracebacks.
        raise HTTPException(
            status_code=500, detail=f"Error processing file: {str(exc)}"
        ) from exc

    return {"filename": filename, "extracted_text": extracted_text}


@app.get("/")
def home():
    """Return a short status message pointing clients at the OCR endpoint."""
    return {"message": "PDF OCR API is running! Use /extract-text endpoint to upload a PDF."}