badman99dev committed on
Commit
279020a
·
1 Parent(s): 31ce858

🛠️ Updated OCR files with PDF support

Browse files
Files changed (3) hide show
  1. Dockerfile +17 -0
  2. app.py +28 -23
  3. requirements.txt +2 -1
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ RUN apt-get update && apt-get install -y \
4
+ tesseract-ocr \
5
+ tesseract-ocr-hin \
6
+ libglib2.0-0 \
7
+ libsm6 \
8
+ libxrender1 \
9
+ libxext6 \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ WORKDIR /app
13
+ COPY . .
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
+ EXPOSE 7860
16
+
17
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -1,39 +1,44 @@
1
  from flask import Flask, request, jsonify
 
2
  import pytesseract
3
  from PIL import Image
4
  import fitz # PyMuPDF
5
  import os
6
 
7
  app = Flask(__name__)
 
8
 
9
- @app.route('/ocr', methods=['POST'])
 
 
 
 
10
  def ocr():
11
- if 'file' not in request.files:
12
- return jsonify({'error': 'No file part'}), 400
13
 
14
- file = request.files['file']
15
  filename = file.filename
16
 
17
- temp_path = f"temp_{filename}"
18
- file.save(temp_path)
19
-
20
- extracted_text = ""
21
-
22
- if filename.lower().endswith(".pdf"):
23
- pdf = fitz.open(temp_path)
24
- for page_num in range(len(pdf)):
25
- page = pdf.load_page(page_num)
26
- pix = page.get_pixmap(dpi=300)
27
- img_path = f"page_{page_num}.png"
28
  pix.save(img_path)
29
-
30
  img = Image.open(img_path)
31
- extracted_text += pytesseract.image_to_string(img, lang="hin+eng") + "\n"
32
-
33
- os.remove(img_path)
 
 
 
 
 
 
34
  else:
35
- img = Image.open(temp_path)
36
- extracted_text = pytesseract.image_to_string(img, lang="hin+eng")
37
 
38
- os.remove(temp_path)
39
- return jsonify({'text': extracted_text.strip()})
 
1
  from flask import Flask, request, jsonify
2
+ from flask_cors import CORS
3
  import pytesseract
4
  from PIL import Image
5
  import fitz # PyMuPDF
6
  import os
7
 
8
  app = Flask(__name__)
9
+ CORS(app)
10
 
11
+ @app.route("/")
12
+ def home():
13
+ return "🚀 Hindi OCR API is running!"
14
+
15
+ @app.route("/api/ocr", methods=["POST"])
16
  def ocr():
17
+ if "file" not in request.files:
18
+ return jsonify({"error": "❌ No file uploaded"}), 400
19
 
20
+ file = request.files["file"]
21
  filename = file.filename
22
 
23
+ if filename.endswith(".pdf"):
24
+ doc = fitz.open(stream=file.read(), filetype="pdf")
25
+ text = ""
26
+ for page in doc:
27
+ pix = page.get_pixmap()
28
+ img_path = "temp.png"
 
 
 
 
 
29
  pix.save(img_path)
 
30
  img = Image.open(img_path)
31
+ text += pytesseract.image_to_string(img, lang="hin+eng") + "\n"
32
+ os.remove("temp.png")
33
+ return jsonify({"text": text.strip()})
34
+
35
+ elif filename.endswith((".png", ".jpg", ".jpeg")):
36
+ img = Image.open(file.stream)
37
+ text = pytesseract.image_to_string(img, lang="hin+eng")
38
+ return jsonify({"text": text.strip()})
39
+
40
  else:
41
+ return jsonify({"error": "❌ Unsupported file type"}), 400
 
42
 
43
+ if __name__ == "__main__":
44
+ app.run(host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  flask
 
2
  pytesseract
3
- Pillow
4
  PyMuPDF
 
 
1
  flask
2
+ flask-cors
3
  pytesseract
 
4
  PyMuPDF
5
+ pillow