Spaces:

bk939448
/

hindi-ocr-api

Paused

badman99dev committed on Jun 22, 2025

Commit

279020a

1 Parent(s): 31ce858

🛠️ Updated OCR files with PDF support

Files changed (3) hide show

Dockerfile ADDED Viewed

+FROM python:3.10-slim
+# Tesseract OCR with the Hindi language pack, plus the shared libraries
+# listed below; the apt package index is removed afterwards to keep the
+# image small.
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    tesseract-ocr-hin \
+    libglib2.0-0 \
+    libsm6 \
+    libxrender1 \
+    libxext6 \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install Python dependencies before copying the application source so that
+# this layer stays cached when only the code (not requirements.txt) changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# The Flask app in app.py listens on this port.
+EXPOSE 7860
+CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -1,39 +1,44 @@
 from flask import Flask, request, jsonify
 import pytesseract
 from PIL import Image
 import fitz  # PyMuPDF
 import os
 app = Flask(__name__)
-@app.route('/ocr', methods=['POST'])
 def ocr():
-    if 'file' not in request.files:
-        return jsonify({'error': 'No file part'}), 400
-    file = request.files['file']
     filename = file.filename
-    temp_path = f"temp_{filename}"
-    file.save(temp_path)
-    extracted_text = ""
-    if filename.lower().endswith(".pdf"):
-        pdf = fitz.open(temp_path)
-        for page_num in range(len(pdf)):
-            page = pdf.load_page(page_num)
-            pix = page.get_pixmap(dpi=300)
-            img_path = f"page_{page_num}.png"
             pix.save(img_path)
             img = Image.open(img_path)
-            extracted_text += pytesseract.image_to_string(img, lang="hin+eng") + "\n"
-            os.remove(img_path)
     else:
-        img = Image.open(temp_path)
-        extracted_text = pytesseract.image_to_string(img, lang="hin+eng")
-    os.remove(temp_path)
-    return jsonify({'text': extracted_text.strip()})

 from flask import Flask, request, jsonify
+from flask_cors import CORS
 import pytesseract
 from PIL import Image
 import fitz  # PyMuPDF
 import os
 app = Flask(__name__)
+CORS(app)
+@app.route("/")
+def home():
+    return "🚀 Hindi OCR API is running!"
+@app.route("/api/ocr", methods=["POST"])
 def ocr():
+    if "file" not in request.files:
+        return jsonify({"error": "❌ No file uploaded"}), 400
+    file = request.files["file"]
     filename = file.filename
+    if filename.endswith(".pdf"):
+        doc = fitz.open(stream=file.read(), filetype="pdf")
+        text = ""
+        for page in doc:
+            pix = page.get_pixmap()
+            img_path = "temp.png"
             pix.save(img_path)
             img = Image.open(img_path)
+            text += pytesseract.image_to_string(img, lang="hin+eng") + "\n"
+        os.remove("temp.png")
+        return jsonify({"text": text.strip()})
+    elif filename.endswith((".png", ".jpg", ".jpeg")):
+        img = Image.open(file.stream)
+        text = pytesseract.image_to_string(img, lang="hin+eng")
+        return jsonify({"text": text.strip()})
     else:
+        return jsonify({"error": "❌ Unsupported file type"}), 400
+if __name__ == "__main__":  # only when executed directly, e.g. `python app.py`
+    app.run(host="0.0.0.0", port=7860)  # listen on all interfaces, port 7860

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 flask
 pytesseract
-Pillow
 PyMuPDF

 flask
+flask-cors
 pytesseract
 PyMuPDF
+pillow