Spaces:

bk939448
/

ocr-api

Paused

badman99dev committed on Jun 23, 2025

Commit

1540402

0 Parent(s):

🚀 Final OCR API with PDF + Image support

Files changed (4) hide show

Dockerfile ADDED Viewed

+FROM python:3.10
+# System dependencies: Tesseract (plus Hindi language data) for OCR,
+# Poppler for PDF rasterisation, and shared libraries for imaging.
+# The apt lists are removed in the same layer so they do not bloat the image.
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    tesseract-ocr-hin \
+    poppler-utils \
+    libglib2.0-0 \
+    libsm6 \
+    libxrender1 \
+    libxext6 \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install Python dependencies before copying the rest of the source so this
+# layer stays cached when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Serve the FastAPI instance named "app" from app.py on port 7860.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

+from fastapi import FastAPI, UploadFile, File
+from pdf2image import convert_from_bytes
+import pytesseract
+from PIL import Image
+import io
+# ASGI application object; the Dockerfile's uvicorn command serves it as "app:app".
+app = FastAPI()
+@app.get("/")
+def read_root():
+    return {"message": "✅ Hindi OCR API is live!"}
+@app.post("/ocr")
+async def extract_text(file: UploadFile = File(...)):
+    content = await file.read()
+    if file.filename.endswith(".pdf"):
+        images = convert_from_bytes(content, dpi=300)  # high quality
+        full_text = ""
+        for img in images:
+            text = pytesseract.image_to_string(img, lang='hin+eng')
+            full_text += text + "\n"
+        return {"text": full_text.strip()}
+    elif file.filename.endswith((".jpg", ".jpeg", ".png")):
+        image = Image.open(io.BytesIO(content))
+        text = pytesseract.image_to_string(image, lang='hin+eng')
+        return {"text": text.strip()}
+    return {"error": "❌ Unsupported file format"}

packages.txt ADDED Viewed

+tesseract-ocr
+tesseract-ocr-hin
+poppler-utils
+libglib2.0-0
+libsm6
+libxrender1
+libxext6

requirements.txt ADDED Viewed

+fastapi
+uvicorn
+# Required by FastAPI to parse multipart/form-data uploads (UploadFile / File).
+python-multipart
+pytesseract
+pdf2image
+Pillow