Gagandeep12 committed on
Commit
05a982f
·
verified ·
1 Parent(s): 4f9dba0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -13
app.py CHANGED
@@ -1,13 +1,21 @@
1
  import os
 
2
  import time
 
3
  from flask import Flask, request, jsonify, send_from_directory
4
  from werkzeug.utils import secure_filename
 
5
  from dotenv import load_dotenv
6
- from azure_ocr import submit_read_api, poll_read_result, split_pdf_into_chunks, clean_extracted_text
7
 
8
  # Load env
9
  load_dotenv()
 
 
10
 
 
 
 
 
11
  UPLOAD_DIR = "/tmp/uploads"
12
  os.makedirs(UPLOAD_DIR, exist_ok=True)
13
 
@@ -20,13 +28,72 @@ app = Flask(__name__, static_folder="static", static_url_path="/static")
20
  def allowed_file(filename):
21
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # --- Routes ---
25
  @app.route("/")
26
  def index():
27
  return send_from_directory("static", "index.html")
28
 
29
-
30
  @app.route("/upload", methods=["POST"])
31
  def upload():
32
  if "file" not in request.files:
@@ -45,23 +112,14 @@ def upload():
45
  if filename.lower().endswith(".pdf"):
46
  chunks = split_pdf_into_chunks(path, chunk_size=2)
47
  merged_results = []
48
- for idx, chunk_file in enumerate(chunks, 1):
49
- print(f"📄 Processing chunk {idx}/{len(chunks)}")
50
  op_location = submit_read_api(chunk_file)
51
  chunk_text = poll_read_result(op_location)
52
  merged_results.append(chunk_text)
53
-
54
- # throttle between requests
55
- time.sleep(2)
56
-
57
  extracted_text = "\n\n".join(merged_results)
58
- else: # images
59
  op_location = submit_read_api(path)
60
  extracted_text = poll_read_result(op_location)
61
-
62
- # cleanup text
63
- extracted_text = clean_extracted_text(extracted_text)
64
-
65
  except Exception as e:
66
  return jsonify({"error": "OCR failed", "details": str(e)}), 500
67
 
 
1
  import os
2
+ import tempfile
3
  import time
4
+ import requests
5
  from flask import Flask, request, jsonify, send_from_directory
6
  from werkzeug.utils import secure_filename
7
+ from PyPDF2 import PdfReader, PdfWriter
8
  from dotenv import load_dotenv
 
9
 
10
  # Load env
11
  load_dotenv()
12
# Azure Computer Vision credentials, read from the process environment.
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_KEY = os.getenv("AZURE_KEY")

# Fail at import time when either setting is missing or empty.
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Drop trailing slashes so request paths can be appended with a single "/".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
19
  UPLOAD_DIR = "/tmp/uploads"
20
  os.makedirs(UPLOAD_DIR, exist_ok=True)
21
 
 
28
  def allowed_file(filename):
29
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
30
 
31
def read_file_bytes(path):
    """Return the full contents of the file at *path* as bytes."""
    with open(path, mode="rb") as stream:
        payload = stream.read()
    return payload
34
+
35
def submit_read_api(file_path, timeout=30):
    """Submit a file to the Computer Vision Read API and return its operation URL.

    The file is uploaded as raw bytes to ``{AZURE_ENDPOINT}/vision/v3.2/read/analyze``.

    Args:
        file_path: Path of the image or PDF to analyze.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, a stalled connection would block the calling
            worker indefinitely.

    Returns:
        The value of the ``Operation-Location`` response header, i.e. the URL
        to poll for the OCR result.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the request exceeds *timeout*.
        RuntimeError: If the response carries no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)

    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    resp.raise_for_status()

    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
51
+
52
def poll_read_result(operation_location, timeout=180, interval=2.0, request_timeout=30):
    """Poll a Read API operation until it finishes and return the recognized text.

    Args:
        operation_location: URL returned in the ``Operation-Location`` header
            when the file was submitted.
        timeout: Overall number of seconds to keep polling.
        interval: Seconds to sleep between polls.
        request_timeout: Seconds to wait for each individual HTTP request.

    Returns:
        All recognized lines, across every page, joined with newlines.

    Raises:
        requests.HTTPError: If a poll request returns an error status.
        requests.Timeout: If a single poll request exceeds *request_timeout*.
        RuntimeError: If the operation reports failure or does not reach the
            "succeeded" status before *timeout* elapses.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    # Always poll at least once so `status` and `j` are defined even when
    # timeout <= 0 (the original loop could be skipped entirely).
    while True:
        r = requests.get(operation_location, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        j = r.json()
        # Tolerate a missing or null "status" field instead of crashing on .lower().
        status = (j.get("status") or "").lower()
        if status in ("succeeded", "failed"):
            break
        if time.monotonic() >= deadline:
            break
        time.sleep(interval)

    if status == "failed":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    if status != "succeeded":
        # Distinguish "gave up waiting" from a failure reported by the service.
        raise RuntimeError(
            f"OCR timed out after {timeout}s. Status={status}, Response={j}"
        )

    results = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in results.get("readResults", [])
        for line in read_result.get("lines", [])
    ]
    return "\n".join(lines)
76
+
77
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temporary PDF files of at most *chunk_size* pages each.

    Args:
        pdf_path: Path of the source PDF.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        A list of paths to the temporary chunk files, in page order. The files
        are created with ``delete=False``, so the caller is responsible for
        removing them.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        # A zero step makes range() raise and a negative one silently yields
        # no chunks; reject both explicitly.
        raise ValueError("chunk_size must be at least 1")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for p in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[p])
        # Write through the temp file's own handle and close it on exit.
        # Reopening it by name while still open leaks a file descriptor per
        # chunk and fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            writer.write(tmp)
        chunk_files.append(tmp.name)
    return chunk_files
90
+
91
 
92
  # --- Routes ---
93
  @app.route("/")
94
  def index():
95
  return send_from_directory("static", "index.html")
96
 
 
97
  @app.route("/upload", methods=["POST"])
98
  def upload():
99
  if "file" not in request.files:
 
112
  if filename.lower().endswith(".pdf"):
113
  chunks = split_pdf_into_chunks(path, chunk_size=2)
114
  merged_results = []
115
+ for chunk_file in chunks:
 
116
  op_location = submit_read_api(chunk_file)
117
  chunk_text = poll_read_result(op_location)
118
  merged_results.append(chunk_text)
 
 
 
 
119
  extracted_text = "\n\n".join(merged_results)
120
+ else:
121
  op_location = submit_read_api(path)
122
  extracted_text = poll_read_result(op_location)
 
 
 
 
123
  except Exception as e:
124
  return jsonify({"error": "OCR failed", "details": str(e)}), 500
125