Spaces:

bk939448
/

hindi-ocr-api

Paused

badman99dev committed on Jun 22, 2025

Commit

31ce858

1 Parent(s): d67f610

🧠 Switched to Flask API for Hindi OCR

Files changed (3) hide show

app.py CHANGED Viewed

@@ -1,38 +1,39 @@
-import gradio as gr
 import pytesseract
-from pdf2image import convert_from_bytes
 from PIL import Image
-def process_file(file):
-    try:
-        if file is None:
-            return "❌ कोई फ़ाइल नहीं मिली।"
-        # Check if it's a PDF
-        if file.name.endswith(".pdf"):
-            pdf_bytes = file.read()
-            images = convert_from_bytes(pdf_bytes, dpi=100)
-        else:
-            # For image
-            img = Image.open(file)
-            images = [img]
-        final_text = ""
-        for img in images:
-            text = pytesseract.image_to_string(img, lang="hin+eng")
-            final_text += text + "\n"
-        return final_text.strip() if final_text.strip() else "❌ कोई टेक्स्ट नहीं मिला।"
-    except Exception as e:
-        return f"⚠️ Error: {str(e)}"
-demo = gr.Interface(
-    fn=process_file,
-    inputs=gr.File(label="📤 PDF या इमेज अपलोड करें"),
-    outputs=gr.Textbox(label="📝 OCR से निकाला गया टेक्स्ट"),
-    title="🧠 Hindi-English OCR",
-    description="PDF और Images से हिंदी + English टेक्स्ट निकालो 🔥"
-)
-demo.launch()

+from flask import Flask, request, jsonify
 import pytesseract
 from PIL import Image
+import fitz  # PyMuPDF
+import os
+app = Flask(__name__)
+@app.route('/ocr', methods=['POST'])
+def ocr():
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file part'}), 400
+    file = request.files['file']
+    filename = file.filename
+    temp_path = f"temp_{filename}"
+    file.save(temp_path)
+    extracted_text = ""
+    if filename.lower().endswith(".pdf"):
+        pdf = fitz.open(temp_path)
+        for page_num in range(len(pdf)):
+            page = pdf.load_page(page_num)
+            pix = page.get_pixmap(dpi=300)
+            img_path = f"page_{page_num}.png"
+            pix.save(img_path)
+            img = Image.open(img_path)
+            extracted_text += pytesseract.image_to_string(img, lang="hin+eng") + "\n"
+            os.remove(img_path)
+    else:
+        img = Image.open(temp_path)
+        extracted_text = pytesseract.image_to_string(img, lang="hin+eng")
+    os.remove(temp_path)
+    return jsonify({'text': extracted_text.strip()})

packages.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 tesseract-ocr
 tesseract-ocr-hin
-poppler-utils
 libglib2.0-0
 libsm6
 libxrender1

 tesseract-ocr
 tesseract-ocr-hin
 libglib2.0-0
 libsm6
 libxrender1

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio
 pytesseract
-pillow
-pdf2image

+flask
 pytesseract
+Pillow
+PyMuPDF