badman99dev committed on
Commit
1540402
·
0 Parent(s):

🚀 Final OCR API with PDF + Image support

Browse files
Files changed (4) hide show
  1. Dockerfile +18 -0
  2. app.py +30 -0
  3. packages.txt +7 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

# System dependencies for the OCR service:
#   - tesseract-ocr / tesseract-ocr-hin: OCR engine plus Hindi language data
#     (the app requests lang='hin+eng').
#   - poppler-utils: PDF rasterizer used by pdf2image.
#   - libglib2.0-0, libsm6, libxrender1, libxext6: shared libraries kept from
#     the original package list.
# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-hin \
    poppler-utils \
    libglib2.0-0 \
    libsm6 \
    libxrender1 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies before copying the application code so that
# this layer stays cached when only the source files change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Serve the FastAPI app on port 7860 on all interfaces.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File
2
+ from pdf2image import convert_from_bytes
3
+ import pytesseract
4
+ from PIL import Image
5
+ import io
6
+
7
+ app = FastAPI()
8
+
9
@app.get("/")
def read_root():
    """Return a static payload confirming that the API is reachable."""
    status = {"message": "✅ Hindi OCR API is live!"}
    return status
12
+
13
def _ocr_pdf(content):
    """Rasterize every page of a PDF and return the OCR text of all pages.

    Pages are rendered at 300 dpi (high quality) and their texts are joined
    with a newline between consecutive pages.
    """
    pages = convert_from_bytes(content, dpi=300)
    return "\n".join(
        pytesseract.image_to_string(page, lang="hin+eng") for page in pages
    )


def _ocr_image(content):
    """Decode a single raster image from bytes and return its OCR text."""
    # The context manager releases the decoded image once OCR is done.
    with Image.open(io.BytesIO(content)) as image:
        return pytesseract.image_to_string(image, lang="hin+eng")


@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    """Run Hindi+English OCR on an uploaded PDF or image.

    The file type is chosen from the upload's filename extension, compared
    case-insensitively: ``.pdf`` is rasterized page by page, while ``.jpg``,
    ``.jpeg`` and ``.png`` are processed as a single image.

    Args:
        file: The uploaded file from the multipart request.

    Returns:
        ``{"text": <recognized text>}`` with surrounding whitespace stripped,
        or ``{"error": "❌ Unsupported file format"}`` when the filename is
        missing or has an unsupported extension.
    """
    # Local import keeps this block self-contained; asyncio.to_thread needs
    # Python 3.9+, and the Dockerfile pins python:3.10.
    import asyncio

    # filename may be None for uploads sent without one; treat as unsupported.
    name = (file.filename or "").lower()
    if name.endswith(".pdf"):
        recognize = _ocr_pdf
    elif name.endswith((".jpg", ".jpeg", ".png")):
        recognize = _ocr_image
    else:
        return {"error": "❌ Unsupported file format"}

    # Read the payload only after the type check so unsupported uploads are
    # rejected without being loaded into memory.
    content = await file.read()

    # Rasterization and OCR are synchronous calls; run them in a worker
    # thread so the event loop can keep serving other requests.
    text = await asyncio.to_thread(recognize, content)
    return {"text": text.strip()}
packages.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ tesseract-ocr
2
+ tesseract-ocr-hin
3
+ poppler-utils
4
+ libglib2.0-0
5
+ libsm6
6
+ libxrender1
7
+ libxext6
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
fastapi
uvicorn
pytesseract
pdf2image
Pillow
# Required by FastAPI to parse multipart/form-data uploads (UploadFile / File).
python-multipart