Spaces:

venirdev
/

pdf-ocr

Sleeping

venirdev committed on Oct 10, 2025

Commit

dcdbb2a

verified ·

1 Parent(s): 00efdb9

Upload 3 files

Files changed (3) hide show

Dockerfile ADDED Viewed

+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+# System packages: the Tesseract OCR engine, its Marathi language data
+# (app.py requests lang="mar+eng"), and poppler-utils, which pdf2image
+# relies on to rasterize PDF pages. The apt cache is removed in the same
+# layer to keep the image small.
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    tesseract-ocr-mar \
+    poppler-utils \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Run as an unprivileged user with UID 1000 rather than root.
+RUN useradd -m -u 1000 user
+USER user
+# Make console scripts from per-user pip installs (e.g. uvicorn) resolvable.
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Copy requirements.txt on its own first so the dependency layer is reused
+# from cache when only application code changes.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# Serve the FastAPI object "app" from app.py on all interfaces, port 7860
+# (presumably the port the Space expects; see the doc linked above).
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

+from fastapi import FastAPI, UploadFile, File, HTTPException
+from pdf2image import convert_from_bytes
+import pytesseract
+import os
+app = FastAPI(title="PDF OCR API", description="Extract text from PDF using PyTesseract", version="1.0")
+@app.post("/extract-text/")
+async def extract_text_from_pdf(file: UploadFile = File(...)):
+    if not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported")
+    try:
+        pdf_bytes = await file.read()
+        images = convert_from_bytes(pdf_bytes)
+        extracted_text = ""
+        for i, image in enumerate(images):
+            text = pytesseract.image_to_string(image, lang="mar+eng")  # or "mar+eng" if you include Marathi
+            extracted_text += f"\n\n--- Page {i+1} ---\n\n{text.strip()}"
+        return {"filename": file.filename, "extracted_text": extracted_text.strip()}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
+@app.get("/")
+def home():
+    return {"message": "PDF OCR API is running! Use /extract-text endpoint to upload a PDF."}

requirements.txt ADDED Viewed

+# Web framework and the ASGI server launched by the Dockerfile CMD
+fastapi
+uvicorn
+# OCR wrapper, PDF-to-image conversion, and the imaging library they work with
+pytesseract
+pdf2image
+pillow
+# Multipart form parsing, required by FastAPI for UploadFile/File uploads
+python-multipart