|
|
from fastapi import FastAPI, UploadFile, File, HTTPException
|
|
|
from pdf2image import convert_from_bytes
|
|
|
import pytesseract
|
|
|
import os
|
|
|
|
|
|
# ASGI application object; the route decorators in this module register on it.
app = FastAPI(
    title="PDF OCR API",
    description="Extract text from PDF using PyTesseract",
    version="1.0",
)
|
|
|
|
|
|
def _ocr_pdf_bytes(pdf_bytes):
    """Rasterise a PDF held in memory and OCR every page.

    Synchronous and blocking; intended to be run off the event loop.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The OCR text of all pages, each preceded by a ``--- Page N ---``
        header (1-based), separated by blank lines and stripped of
        leading/trailing whitespace. Empty string when there are no pages.
    """
    pages = convert_from_bytes(pdf_bytes)

    sections = []
    for page_number, page in enumerate(pages, start=1):
        # OCR with the Marathi + English language models.
        text = pytesseract.image_to_string(page, lang="mar+eng")
        sections.append(f"--- Page {page_number} ---\n\n{text.strip()}")

    # Joining once avoids repeated string concatenation in the loop; the
    # final strip matches the previous output format exactly.
    return "\n\n".join(sections).strip()


@app.post("/extract-text/")
async def extract_text_from_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF via OCR.

    Args:
        file: The uploaded file; its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        A dict with ``filename`` (the uploaded filename) and
        ``extracted_text`` (per-page OCR output with page headers).

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.pdf``; 500 when reading, converting or OCR-ing the file fails.
    """
    # Imported locally so this block does not depend on a file-level import.
    import asyncio

    # The filename may be absent; treat that the same as a non-PDF upload
    # instead of failing with AttributeError on ``None.lower()``.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
        # PDF conversion and OCR are blocking calls; run them in the default
        # executor so other requests are not stalled while they run.
        loop = asyncio.get_running_loop()
        extracted_text = await loop.run_in_executor(None, _ocr_pdf_bytes, pdf_bytes)
    except Exception as e:
        # Top-level boundary: report the failure to the client and keep the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}") from e

    return {"filename": file.filename, "extracted_text": extracted_text}
|
|
|
|
|
|
@app.get("/")
def home():
    """Return a static status message pointing at the OCR endpoint."""
    status = {
        "message": "PDF OCR API is running! Use /extract-text endpoint to upload a PDF.",
    }
    return status
|
|
|
|