Rivalcoder committed on
Commit
e4b7f54
·
1 Parent(s): 01b6cfc
Files changed (3) hide show
  1. Dockerfile +24 -0
  2. app.py +46 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# System packages installed into the image:
#   poppler-utils, poppler-data - PDF rendering tools that pdf2image shells out to
#   tesseract-ocr               - Tesseract engine (app.py in this commit only
#                                 calls EasyOCR; kept so the image contents
#                                 do not change)
#   libglib2.0-0, libsm6, libxext6, libxrender-dev
#                               - shared libraries, presumably for the image
#                                 stack pulled in by EasyOCR (TODO confirm)
# --no-install-recommends keeps the layer small. poppler-data is listed
# explicitly because it is only a recommended package of poppler and would
# otherwise no longer be installed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    poppler-data \
    tesseract-ocr \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python deps first so this layer is reused when only app.py changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app
COPY app.py .

# Port the uvicorn server below listens on
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import easyocr
4
+ from fastapi import FastAPI, File, UploadFile
5
+ from fastapi.responses import JSONResponse
6
+ from pdf2image import convert_from_bytes
7
+ from tempfile import NamedTemporaryFile
8
+ import uvicorn
9
+
10
# ASGI application object; title and description are passed to FastAPI as
# API metadata.
app = FastAPI(
    title="OCR Backend API",
    description="Extract text from PDF or Images using EasyOCR",
)

# Single EasyOCR reader created at import time (English + Hindi; add more
# language codes to the list to recognise other languages).
reader = easyocr.Reader(["en", "hi"])
14
+
15
@app.post("/extract-text/")
async def extract_text(file: UploadFile = File(...)):
    """Run OCR on an uploaded PDF or image and return the recognised text.

    An upload whose filename ends with ``.pdf`` (case-insensitive) is
    converted to one image per page and each page is OCR'd separately; the
    page texts are emitted under ``--- Page N ---`` headers. Any other
    upload is treated as a single image.

    Args:
        file: The uploaded file taken from the multipart form field ``file``.

    Returns:
        JSONResponse: ``{"extracted_text": <str>}`` on success;
        ``{"error": <str>}`` with status 400 when the upload is empty, or
        with status 500 when reading, conversion or OCR raises.
    """
    try:
        # Read uploaded file
        contents = await file.read()
        if not contents:
            # Reject early instead of letting the OCR libraries fail with an
            # opaque decoding error.
            return JSONResponse(
                {"error": "Uploaded file is empty"}, status_code=400
            )

        # The filename may be missing; fall back to "" so the upload is
        # handled as an image rather than crashing on None.lower().
        filename = (file.filename or "").lower()

        # NOTE(review): convert_from_bytes and reader.readtext are
        # synchronous calls made directly inside this coroutine, so the
        # event loop is blocked while they run. Consider a threadpool once
        # it is confirmed that the shared reader tolerates concurrent use.
        if filename.endswith(".pdf"):
            # Convert PDF to images, one per page
            pages = convert_from_bytes(contents)
            sections = []
            for page_number, page in enumerate(pages, start=1):
                detections = reader.readtext(np.array(page))
                page_text = " ".join(text for _, text, _ in detections)
                sections.append(f"--- Page {page_number} ---\n{page_text}\n\n")
            extracted_text = "".join(sections)
        else:
            # Treat as image: write the bytes to a temporary file and pass
            # its path to the reader.
            temp_path = None
            try:
                with NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                    temp_path = temp_file.name
                    temp_file.write(contents)
                # The file is closed (and therefore flushed) before OCR.
                detections = reader.readtext(temp_path)
            finally:
                # delete=False leaves the file on disk, so remove it here;
                # otherwise every image request leaks one temporary file.
                if temp_path is not None:
                    try:
                        os.unlink(temp_path)
                    except OSError:
                        # Best-effort cleanup: a failed delete must not mask
                        # the OCR result or the original exception.
                        pass
            extracted_text = " ".join(text for _, text, _ in detections)

        return JSONResponse({"extracted_text": extracted_text})

    except Exception as e:
        # Top-level boundary: report any failure to the client as JSON.
        return JSONResponse({"error": str(e)}, status_code=500)
44
+
45
if __name__ == "__main__":
    # Direct execution: serve on all interfaces, taking the port from the
    # PORT environment variable and defaulting to 7860 when it is unset.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
fastapi
uvicorn
# Required by FastAPI to parse multipart/form-data uploads (File / UploadFile);
# without it the upload route cannot be registered.
python-multipart
easyocr
pdf2image
numpy
Pillow