badman99dev committed on
Commit
31ce858
·
1 Parent(s): d67f610

🧠 Switched to Flask API for Hindi OCR

Browse files
Files changed (3) hide show
  1. app.py +36 -35
  2. packages.txt +0 -1
  3. requirements.txt +3 -3
app.py CHANGED
@@ -1,38 +1,39 @@
1
- import gradio as gr
2
  import pytesseract
3
- from pdf2image import convert_from_bytes
4
  from PIL import Image
 
 
5
 
6
- def process_file(file):
7
- try:
8
- if file is None:
9
- return "❌ कोई फ़ाइल नहीं मिली।"
10
-
11
- # Check if it's a PDF
12
- if file.name.endswith(".pdf"):
13
- pdf_bytes = file.read()
14
- images = convert_from_bytes(pdf_bytes, dpi=100)
15
- else:
16
- # For image
17
- img = Image.open(file)
18
- images = [img]
19
-
20
- final_text = ""
21
- for img in images:
22
- text = pytesseract.image_to_string(img, lang="hin+eng")
23
- final_text += text + "\n"
24
-
25
- return final_text.strip() if final_text.strip() else "❌ कोई टेक्स्ट नहीं मिला।"
26
-
27
- except Exception as e:
28
- return f"⚠️ Error: {str(e)}"
29
-
30
- demo = gr.Interface(
31
- fn=process_file,
32
- inputs=gr.File(label="📤 PDF या इमेज अपलोड करें"),
33
- outputs=gr.Textbox(label="📝 OCR से निकाला गया टेक्स्ट"),
34
- title="🧠 Hindi-English OCR",
35
- description="PDF और Images से हिंदी + English टेक्स्ट निकालो 🔥"
36
- )
37
-
38
- demo.launch()
 
1
+ from flask import Flask, request, jsonify
2
  import pytesseract
 
3
  from PIL import Image
4
+ import fitz # PyMuPDF
5
+ import os
6
 
7
+ app = Flask(__name__)
8
+
9
@app.route('/ocr', methods=['POST'])
def ocr():
    """Run Hindi+English OCR on an uploaded PDF or image.

    Expects a multipart POST with the upload under the ``file`` field.
    Uploads whose name ends in ``.pdf`` (case-insensitive) are rendered
    page by page at 300 DPI and each page is OCR'd; any other upload is
    opened directly as an image.

    Returns:
        A JSON response ``{"text": <extracted text>}`` on success, or
        ``{"error": ...}`` with HTTP status 400 when the ``file`` part is
        missing or carries no filename.

    Exceptions raised while decoding or recognising the upload are not
    caught here; the scratch directory is removed in that case as well.
    """
    # Function-scope import keeps this handler self-contained with respect
    # to the module's import block.
    import tempfile

    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    filename = file.filename
    if not filename:
        # An empty file input submits a part with no filename.
        return jsonify({'error': 'No selected file'}), 400

    is_pdf = filename.lower().endswith(".pdf")

    # A private per-request directory avoids collisions between concurrent
    # requests and guarantees cleanup even when OCR raises. The
    # client-supplied filename is never used as a path component, so it
    # cannot steer where the upload is written.
    with tempfile.TemporaryDirectory() as work_dir:
        upload_path = os.path.join(
            work_dir, "upload.pdf" if is_pdf else "upload"
        )
        file.save(upload_path)

        if is_pdf:
            page_texts = []
            with fitz.open(upload_path) as pdf:
                for page_num in range(len(pdf)):
                    page = pdf.load_page(page_num)
                    pix = page.get_pixmap(dpi=300)
                    img_path = os.path.join(work_dir, f"page_{page_num}.png")
                    pix.save(img_path)

                    with Image.open(img_path) as img:
                        page_texts.append(
                            pytesseract.image_to_string(img, lang="hin+eng")
                        )

                    # Drop each page image right away so long PDFs do not
                    # accumulate 300-DPI renders on disk.
                    os.remove(img_path)
            extracted_text = "\n".join(page_texts)
        else:
            with Image.open(upload_path) as img:
                extracted_text = pytesseract.image_to_string(
                    img, lang="hin+eng"
                )

    return jsonify({'text': extracted_text.strip()})
packages.txt CHANGED
@@ -1,6 +1,5 @@
1
  tesseract-ocr
2
  tesseract-ocr-hin
3
- poppler-utils
4
  libglib2.0-0
5
  libsm6
6
  libxrender1
 
1
  tesseract-ocr
2
  tesseract-ocr-hin
 
3
  libglib2.0-0
4
  libsm6
5
  libxrender1
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio
2
  pytesseract
3
- pillow
4
- pdf2image
 
1
+ flask
2
  pytesseract
3
+ Pillow
4
+ PyMuPDF