Spaces:

bk939448
/

ocr-api

Paused

badman99dev committed on Jun 25, 2025

Commit

6298ba6

1 Parent(s): 1db1a4f

🚀 Final OCR API with format filter and multi-page PDF

Files changed (5) hide show

Dockerfile CHANGED Viewed

@@ -1,17 +1,10 @@
 FROM python:3.10
-RUN apt-get update && apt-get install -y \
-    tesseract-ocr \
-    tesseract-ocr-hin \
-    poppler-utils \
-    libglib2.0-0 \
-    libsm6 \
-    libxrender1 \
-    libxext6 \
- && apt-get clean
-WORKDIR /app
 COPY . /app
-RUN pip install -r requirements.txt
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.10
+RUN apt-get update && \
+    apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-hin && \
+    pip install --no-cache-dir fastapi uvicorn pytesseract pillow pdf2image python-multipart
 COPY . /app
+WORKDIR /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,9 +1,10 @@
 ---
 title: Hindi OCR API
-emoji: 📖
-colorFrom: blue
-colorTo: indigo
 sdk: docker
 app_file: app.py
-pinned: false
 ---

 ---
 title: Hindi OCR API
+emoji: 🧾
+colorFrom: indigo
+colorTo: pink
 sdk: docker
+sdk_version: "1.0"
 app_file: app.py
+pinned: true
 ---

app.py CHANGED Viewed

@@ -7,16 +7,35 @@ import io
 app = FastAPI()
-def ocr_image(image: Image.Image) -> str:
-    return pytesseract.image_to_string(image, lang='hin+eng')
 @app.post("/ocr")
 async def extract_text(file: UploadFile = File(...)):
     contents = await file.read()
-    if file.filename.lower().endswith(".pdf"):
-        images = convert_from_bytes(contents)
-        text = "\n".join([ocr_image(img) for img in images])
-    else:
-        image = Image.open(io.BytesIO(contents))
-        text = ocr_image(image)
-    return JSONResponse(content={"text": text})

 app = FastAPI()
 @app.post("/ocr")
 async def extract_text(file: UploadFile = File(...)):
+    filename = file.filename.lower()
+    allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")
+    if not filename.endswith(allowed_ext):
+        return JSONResponse(
+            content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
+            status_code=400
+        )
     contents = await file.read()
+    extracted_text = ""
+    try:
+        if filename.endswith(".pdf"):
+            images = convert_from_bytes(contents)
+            for page in images:
+                text = pytesseract.image_to_string(page, lang="hin+eng")
+                extracted_text += text + "\n\n"
+        else:
+            image = Image.open(io.BytesIO(contents))
+            text = pytesseract.image_to_string(image, lang="hin+eng")
+            extracted_text = text
+        return {"text": extracted_text.strip() or "⚠️ No text found."}
+    except Exception as e:
+        return JSONResponse(
+            content={"error": "🚫 Failed to process file", "details": str(e)},
+            status_code=500
+        )

packages.txt CHANGED Viewed

@@ -1,7 +1,3 @@
 tesseract-ocr
 tesseract-ocr-hin
-poppler-utils
-libglib2.0-0
-libsm6
-libxrender1
-libxext6

+poppler-utils
 tesseract-ocr
 tesseract-ocr-hin

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 fastapi
 uvicorn
 pytesseract
 pdf2image
 python-multipart
-Pillow

 fastapi
 uvicorn
 pytesseract
+pillow
 pdf2image
 python-multipart