File size: 1,166 Bytes
dcdbb2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os

import pytesseract
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.concurrency import run_in_threadpool
from pdf2image import convert_from_bytes

# ASGI application object; the route decorators in this module register
# their handlers on it. The keyword arguments are passed to FastAPI as-is.
app = FastAPI(
    title="PDF OCR API",
    description="Extract text from PDF using PyTesseract",
    version="1.0",
)

def _ocr_pdf_bytes(pdf_bytes: bytes, lang: str = "mar+eng") -> str:
    """Render a PDF to images and OCR every page into one labelled string.

    This helper is deliberately synchronous: both the rendering and the OCR
    calls block, so the async endpoint runs it in a worker thread rather
    than on the event loop.

    Args:
        pdf_bytes: Raw content of the PDF document.
        lang: Language spec forwarded to ``pytesseract.image_to_string``.
            The default "mar+eng" requests Marathi plus English.

    Returns:
        The stripped text of each page, each preceded by a
        "--- Page N ---" header (N starts at 1), with pages separated by a
        blank line. An empty string when the PDF yields no pages.
    """
    page_images = convert_from_bytes(pdf_bytes)
    # Collect the per-page sections and join once instead of growing a
    # string with += inside the loop.
    sections = [
        f"--- Page {page_no} ---\n\n"
        f"{pytesseract.image_to_string(image, lang=lang).strip()}"
        for page_no, image in enumerate(page_images, start=1)
    ]
    return "\n\n".join(sections).strip()


@app.post("/extract-text/")
async def extract_text_from_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF using OCR.

    Every page is rendered to an image and run through Tesseract. The
    response contains the uploaded filename and the text of all pages, each
    page introduced by a "--- Page N ---" header.

    Raises:
        HTTPException: 400 when the upload has no filename ending in ".pdf"
            or when the uploaded file is empty; 500 when reading, rendering
            or OCR fails.
    """
    # The filename can be missing from the multipart part; treat that as
    # "not a PDF" instead of letting .lower() fail on None and surface as an
    # unhandled server error.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
        if not pdf_bytes:
            # An empty body is a client mistake, not a processing failure.
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Rendering and OCR block for as long as the document takes; running
        # them in the threadpool keeps the event loop free for other requests.
        extracted_text = await run_in_threadpool(_ocr_pdf_bytes, pdf_bytes)
    except HTTPException:
        # Let deliberate HTTP errors through without rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error processing file: {e}"
        ) from e

    return {"filename": file.filename, "extracted_text": extracted_text}

# Landing route: returns a fixed JSON message confirming the service is up
# and pointing callers at the upload endpoint.
@app.get("/")
def home():
    # Documented with comments rather than a docstring so the generated
    # OpenAPI description for this route stays exactly as it was.
    status_message = "PDF OCR API is running! Use /extract-text endpoint to upload a PDF."
    return {"message": status_message}