venirdev committed on
Commit
dcdbb2a
·
verified ·
1 Parent(s): 00efdb9

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +23 -0
  2. app.py +29 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# System packages needed at runtime: the Tesseract OCR engine with the
# Marathi language pack, and poppler-utils for turning PDF pages into images.
# The apt lists are removed in the same layer so they do not bloat the image.
RUN apt-get update \
    && apt-get install -y \
        poppler-utils \
        tesseract-ocr \
        tesseract-ocr-mar \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Run everything from here on as an unprivileged user with UID 1000.
RUN useradd -m -u 1000 user
USER user
# pip installs console scripts (e.g. uvicorn) into the user's local bin dir.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy only the requirements first so the dependency layer stays cached
# when just the application source changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app

# Serve the FastAPI object "app" from app.py on port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from pdf2image import convert_from_bytes
3
+ import pytesseract
4
+ import os
5
+
6
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI(
    title="PDF OCR API",
    description="Extract text from PDF using PyTesseract",
    version="1.0",
)
7
+
8
def _ocr_pdf_bytes(pdf_bytes):
    """Rasterise a PDF held in memory and return the OCR text of all pages.

    Each page's text is stripped and prefixed with a ``--- Page N ---``
    header (1-based); the combined result is stripped once more so it does
    not start or end with blank lines.
    """
    sections = []
    for number, page in enumerate(convert_from_bytes(pdf_bytes), start=1):
        # Marathi + English models are both loaded for every page.
        page_text = pytesseract.image_to_string(page, lang="mar+eng")
        sections.append(f"\n\n--- Page {number} ---\n\n{page_text.strip()}")
    return "".join(sections).strip()


@app.post("/extract-text/")
async def extract_text_from_pdf(file: UploadFile = File(...)):
    """Run OCR over every page of an uploaded PDF and return the text.

    Args:
        file: The uploaded file. Its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A dict with the original ``filename`` and the ``extracted_text``,
        where each page's text follows a ``--- Page N ---`` header.

    Raises:
        HTTPException: 400 when the upload has no filename or is not a
            ``.pdf`` file; 500 when reading, rasterising or OCR fails.
    """
    # Function-scope import so this handler does not depend on the
    # module's import block being edited.
    import asyncio

    # ``filename`` may be missing on a malformed upload; treat that the same
    # as a wrong extension instead of crashing on ``None.lower()``.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        pdf_bytes = await file.read()
        # Rasterisation and OCR are CPU-bound and blocking; run them in a
        # worker thread so the event loop keeps serving other requests.
        extracted_text = await asyncio.to_thread(_ocr_pdf_bytes, pdf_bytes)
    except Exception as e:
        # Top-level boundary: report any processing failure as a 500 while
        # keeping the original exception chained for the server logs.
        raise HTTPException(
            status_code=500, detail=f"Error processing file: {str(e)}"
        ) from e

    return {"filename": file.filename, "extracted_text": extracted_text}
26
+
27
@app.get("/")
def home():
    """Landing route: report that the service is up and name the OCR endpoint."""
    status_message = (
        "PDF OCR API is running! Use /extract-text endpoint to upload a PDF."
    )
    return {"message": status_message}
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pytesseract
4
+ pdf2image
5
+ pillow
6
+ python-multipart