Gagandeep12 committed on
Commit
1e39f6a
·
verified ·
1 Parent(s): 2234729

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -106
app.py CHANGED
@@ -1,112 +1,130 @@
1
- from flask import Flask, request, jsonify, render_template
2
- from flask_cors import CORS
3
- import pytesseract
4
- import numpy as np
5
- from PIL import Image
6
- import fitz
7
- import io
8
- import easyocr
9
  import os
10
-
11
# Flask application object; CORS is enabled for it.
app = Flask(__name__)
CORS(app)

# ------------------------------------------------------------------
# EasyOCR model storage lives under /tmp to avoid permission errors.
# ------------------------------------------------------------------
EASY_OCR_DIR = os.path.join("/tmp", ".EasyOCR")
os.makedirs(EASY_OCR_DIR, exist_ok=True)

# A single shared reader for English and Hindi, created once at import
# time with GPU use turned off.
reader = easyocr.Reader(
    ['en', 'hi'],
    gpu=False,
    model_storage_directory=EASY_OCR_DIR,
    user_network_directory=os.path.join(EASY_OCR_DIR, "user_network"),
)
27
-
28
-
29
-
30
@app.route('/')
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
33
-
34
-
35
@app.route('/extract', methods=['POST'])
def extract_text():
    """Handle ``POST /extract``: run text extraction on an uploaded file.

    The upload is read from the ``file`` field and the OCR engine from the
    ``method`` form field (``'tesseract'`` when absent). PDFs go to
    ``extract_text_from_pdf``; PNG/JPG/JPEG images go to EasyOCR when
    ``method`` is ``'easyocr'`` and to the Tesseract path otherwise.

    Returns:
        A JSON body with ``text`` on success, or a JSON body with ``error``
        and status 400 (missing file / unsupported extension) or 500
        (any exception raised while processing).
    """
    upload = request.files.get('file')
    engine = request.form.get('method', 'tesseract')  # default: tesseract

    if not upload:
        return jsonify({'error': 'No file uploaded'}), 400

    try:
        name = upload.filename.lower()
        print(f"Received file: {name} with method: {engine}")  # Debug

        if name.endswith('.pdf'):
            return jsonify({'text': extract_text_from_pdf(upload)})

        if not name.endswith(('.png', '.jpg', '.jpeg')):
            return jsonify({'error': 'Unsupported file format'}), 400

        # Image upload: pick the extractor that matches the requested engine.
        if engine == 'easyocr':
            extractor = extract_text_with_easyocr
        else:
            extractor = extract_text_from_image
        return jsonify({'text': extractor(upload)})
    except Exception as e:
        print(f"Error processing file: {str(e)}")  # Debug
        return jsonify({'error': str(e)}), 500
59
-
60
-
61
def extract_text_from_pdf(file):
    """Extract text from every page of an uploaded PDF.

    Each page's embedded text is used when it is non-blank; otherwise the
    page is rendered at 300 DPI and passed to Tesseract (Hindi + English).
    A failure on one page is recorded in the output and does not stop the
    remaining pages from being processed.

    Args:
        file: Object with a ``read()`` method returning the PDF bytes.

    Returns:
        The per-page sections concatenated, with surrounding whitespace
        stripped. Each section starts with a ``--- Page N ---`` header
        (suffixed with ``(OCR)`` or ``(Error)`` where applicable).
    """
    pdf_bytes = file.read()
    sections = []

    # The context manager closes the document even if iteration fails;
    # previously the document was never closed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page_number, page in enumerate(pdf, start=1):
            try:
                page_text = page.get_text()
                if page_text.strip():
                    sections.append(f"\n--- Page {page_number} ---\n")
                    sections.append(page_text + "\n")
                else:
                    # No embedded text: rasterise the page and OCR it.
                    pix = page.get_pixmap(dpi=300)
                    image_bytes = pix.tobytes("png")
                    image = Image.open(io.BytesIO(image_bytes))
                    ocr_text = pytesseract.image_to_string(
                        image, lang='hin+eng', config='--oem 3 --psm 6'
                    )
                    sections.append(f"\n--- Page {page_number} (OCR) ---\n")
                    sections.append(ocr_text + "\n")
            except Exception as e:
                # Best effort: note the failure and carry on with the next page.
                sections.append(
                    f"\n--- Page {page_number} (Error) ---\n"
                    f"Error extracting text: {str(e)}\n"
                )

    return "".join(sections).strip()
86
-
87
-
88
def extract_text_from_image(file, method='tesseract'):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        file: Object with a ``read()`` method returning the image bytes.
        method: ``'easyocr'`` selects the shared EasyOCR reader; any other
            value selects Tesseract with the Hindi + English languages.

    Returns:
        The recognised text. For EasyOCR the detected strings are joined
        with newlines.
    """
    rgb_image = Image.open(io.BytesIO(file.read())).convert("RGB")

    if method != 'easyocr':
        return pytesseract.image_to_string(
            rgb_image, lang='hin+eng', config=r'--oem 3 --psm 6'
        )

    # detail=0 makes readtext return plain strings.
    pieces = reader.readtext(np.array(rgb_image), detail=0)
    return '\n'.join(pieces)
100
-
101
-
102
def extract_text_with_easyocr(file):
    """OCR an uploaded image with EasyOCR and return its text top-to-bottom.

    Args:
        file: Object with a ``read()`` method returning the image bytes.

    Returns:
        The text of each detection, one per line, ordered by the value at
        ``detection[0][0][1]`` (the top y coordinate of the detection).
    """
    raw_bytes = file.read()
    pixels = np.array(Image.open(io.BytesIO(raw_bytes)).convert("RGB"))
    detections = reader.readtext(pixels)

    def top_y(detection):
        # sort by top y
        return detection[0][0][1]

    ordered = sorted(detections, key=top_y)
    return "\n".join(detection[1] for detection in ordered)
109
-
110
-
111
if __name__ == '__main__':
    # The interactive debugger lets anyone who can reach the server execute
    # arbitrary code, so it must not be on by default while listening on all
    # interfaces. Set FLASK_DEBUG=1 to opt in during local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import tempfile
3
+ import time
4
+ import requests
5
+ from flask import Flask, request, jsonify, send_from_directory
6
+ from werkzeug.utils import secure_filename
7
+ from PyPDF2 import PdfReader, PdfWriter
8
+ from dotenv import load_dotenv
9
+
10
# Load variables from a .env file, then read the Azure settings.
load_dotenv()
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_KEY = os.getenv("AZURE_KEY")

# Both settings are mandatory; fail at import time when either is missing.
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Drop trailing slashes so request URLs can be built with a single "/".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

# Directory that receives uploaded files.
UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Lower-case file extensions accepted for upload.
ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

app = Flask(__name__, static_folder="static", static_url_path="/static")
25
+
26
+
27
+ # --- Helpers ---
28
def allowed_file(filename):
    """Return True when *filename* has an extension in ``ALLOWED_EXTENSIONS``.

    The extension is the text after the last ``.``, compared
    case-insensitively. Names without a dot are rejected.
    """
    _, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
30
+
31
def read_file_bytes(path):
    """Return the full contents of the file at *path* as bytes."""
    with open(path, mode="rb") as handle:
        payload = handle.read()
    return payload
34
+
35
def submit_read_api(file_path, timeout=60):
    """Submit a file to the Computer Vision Read API.

    Args:
        file_path: Path of the image or PDF to send as the request body.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The URL from the response's ``Operation-Location`` header, which is
        polled to obtain the OCR result.

    Raises:
        requests.HTTPError: The service answered with an error status.
        requests.Timeout: The request did not complete within *timeout*.
        RuntimeError: The response carried no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)

    # Requests never times out on its own; without this a stalled
    # connection would block the caller indefinitely.
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    resp.raise_for_status()

    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
51
+
52
def poll_read_result(operation_location, timeout=180, interval=2.0, request_timeout=30):
    """Poll a Read API operation until it finishes and return its text.

    Args:
        operation_location: URL to poll for the operation's status.
        timeout: Overall budget in seconds for the operation to finish.
        interval: Seconds to sleep between polls.
        request_timeout: Seconds to wait for each individual HTTP request.

    Returns:
        The ``text`` of every line in every entry of
        ``analyzeResult.readResults``, joined with newlines.

    Raises:
        requests.HTTPError: A poll request returned an error status.
        RuntimeError: The operation reported ``failed`` or did not reach a
            final status before *timeout* elapsed.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # monotonic() is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout

    # Always poll at least once so ``status`` and ``payload`` are defined
    # even when *timeout* is zero or negative.
    while True:
        r = requests.get(operation_location, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        payload = r.json()
        # Tolerate a missing or null status field.
        status = str(payload.get("status") or "").lower()
        if status in ("succeeded", "failed"):
            break
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"OCR timed out after {timeout}s. Status={status}, Response={payload}"
            )
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={payload}")

    results = payload.get("analyzeResult", {})
    lines = []
    for read_result in results.get("readResults", []):
        lines.extend(line["text"] for line in read_result.get("lines", []))

    return "\n".join(lines)
76
+
77
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temporary PDF files of at most *chunk_size* pages.

    Args:
        pdf_path: Path of the PDF to split.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        Paths of the chunk files, in page order. The files are created with
        ``delete=False``, so the caller is responsible for removing them.

    Raises:
        ValueError: *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    try:
        for start in range(0, total_pages, chunk_size):
            writer = PdfWriter()
            for p in range(start, min(start + chunk_size, total_pages)):
                writer.add_page(reader.pages[p])
            # Write through the temp file's own handle and close it on exit,
            # instead of leaving it open and reopening the path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                chunk_files.append(tmp.name)
                writer.write(tmp)
    except Exception:
        # Do not leave partial output behind when splitting fails midway.
        for leftover in chunk_files:
            try:
                os.remove(leftover)
            except OSError:
                pass  # best-effort cleanup; the original error matters more
        raise
    return chunk_files
90
+
91
+
92
+ # --- Routes ---
93
@app.route("/")
def index():
    """Serve ``index.html`` from the ``static`` directory."""
    response = send_from_directory("static", "index.html")
    return response
96
+
97
@app.route("/upload", methods=["POST"])
def upload():
    """Handle ``POST /upload``: OCR an uploaded image or PDF.

    The upload is stored under a unique temporary name in ``UPLOAD_DIR``.
    PDFs are split into two-page chunks that are submitted one after the
    other and whose texts are joined with blank lines; other files are
    submitted as a single request. All temporary files are removed before
    the response is returned.

    Returns:
        JSON ``{"text": ...}`` on success; JSON ``{"error": ...}`` with
        status 400 for a missing, unnamed or disallowed file, or status 500
        (with ``details``) when OCR raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "Empty filename"}), 400
    if not allowed_file(file.filename):
        return jsonify({"error": "File type not allowed"}), 400

    # allowed_file() has checked this extension against the allow-list, so
    # it is safe to use as a suffix. The client-supplied name itself is not
    # used for the path: sanitising it can leave it empty or without an
    # extension, and identical names from concurrent uploads would collide.
    extension = file.filename.rsplit(".", 1)[1].lower()
    fd, path = tempfile.mkstemp(dir=UPLOAD_DIR, suffix=f".{extension}")
    os.close(fd)
    temp_paths = [path]

    try:
        file.save(path)
        try:
            if extension == "pdf":
                chunks = split_pdf_into_chunks(path, chunk_size=2)
                temp_paths.extend(chunks)
                merged_results = []
                for chunk_file in chunks:
                    op_location = submit_read_api(chunk_file)
                    merged_results.append(poll_read_result(op_location))
                extracted_text = "\n\n".join(merged_results)
            else:
                op_location = submit_read_api(path)
                extracted_text = poll_read_result(op_location)
        except Exception as e:
            return jsonify({"error": "OCR failed", "details": str(e)}), 500
    finally:
        # Remove the upload and any PDF chunks whether or not OCR succeeded.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; never mask the real response

    return jsonify({"text": extracted_text})
127
+
128
+
129
if __name__ == "__main__":
    # The interactive debugger lets anyone who can reach the server execute
    # arbitrary code, so it must not be on by default while listening on all
    # interfaces. Set FLASK_DEBUG=1 to opt in during local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)