Gagandeep12 committed on
Commit
74adb16
·
verified ·
1 Parent(s): 301336a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -7
app.py CHANGED
@@ -5,15 +5,13 @@ import requests
5
  from flask import Flask, request, jsonify, send_from_directory
6
  from werkzeug.utils import secure_filename
7
  from PyPDF2 import PdfReader, PdfWriter
8
- from dotenv import load_dotenv
9
 
10
- # Load env
11
- load_dotenv()
12
  AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
13
  AZURE_KEY = os.environ.get("AZURE_KEY")
14
 
15
  if not AZURE_ENDPOINT or not AZURE_KEY:
16
- raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
17
 
18
  AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
19
  UPLOAD_DIR = "/tmp/uploads"
@@ -28,27 +26,33 @@ app = Flask(__name__, static_folder="static", static_url_path="/static")
28
  def allowed_file(filename):
29
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
30
 
 
31
  def read_file_bytes(path):
32
  with open(path, "rb") as f:
33
  return f.read()
34
 
 
35
  def submit_read_api(file_path):
36
- """Submit file to Computer Vision Read API"""
37
  url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
38
  headers = {
39
  "Ocp-Apim-Subscription-Key": AZURE_KEY,
40
- "Content-Type": "application/octet-stream"
41
  }
42
  data = read_file_bytes(file_path)
43
 
44
  resp = requests.post(url, headers=headers, data=data)
45
- resp.raise_for_status()
 
 
46
 
 
47
  op_location = resp.headers.get("Operation-Location")
48
  if not op_location:
49
  raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
50
  return op_location
51
 
 
52
  def poll_read_result(operation_location, timeout=180, interval=2.0):
53
  """Poll until OCR is finished"""
54
  headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
@@ -59,6 +63,7 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
59
  r.raise_for_status()
60
  j = r.json()
61
  status = j.get("status", "").lower()
 
62
  if status in ("succeeded", "failed"):
63
  break
64
  time.sleep(interval)
@@ -72,9 +77,12 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
72
  for line in read_result.get("lines", []):
73
  lines.append(line["text"])
74
 
 
75
  return "\n".join(lines)
76
 
 
77
  def split_pdf_into_chunks(pdf_path, chunk_size=2):
 
78
  reader = PdfReader(pdf_path)
79
  total_pages = len(reader.pages)
80
  chunk_files = []
@@ -94,6 +102,7 @@ def split_pdf_into_chunks(pdf_path, chunk_size=2):
94
  def index():
95
  return send_from_directory("static", "index.html")
96
 
 
97
  @app.route("/upload", methods=["POST"])
98
  def upload():
99
  if "file" not in request.files:
@@ -121,10 +130,24 @@ def upload():
121
  op_location = submit_read_api(path)
122
  extracted_text = poll_read_result(op_location)
123
  except Exception as e:
 
 
 
 
124
  return jsonify({"error": "OCR failed", "details": str(e)}), 500
125
 
126
  return jsonify({"text": extracted_text})
127
 
128
 
 
 
 
 
 
 
 
 
 
 
129
  if __name__ == "__main__":
130
  app.run(host="0.0.0.0", port=7860, debug=True)
 
5
  from flask import Flask, request, jsonify, send_from_directory
6
  from werkzeug.utils import secure_filename
7
  from PyPDF2 import PdfReader, PdfWriter
 
8
 
9
+ # --- Load env from Hugging Face Secrets ---
 
10
  AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
11
  AZURE_KEY = os.environ.get("AZURE_KEY")
12
 
13
  if not AZURE_ENDPOINT or not AZURE_KEY:
14
+ raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")
15
 
16
  AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
17
  UPLOAD_DIR = "/tmp/uploads"
 
26
  def allowed_file(filename):
27
  return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
28
 
29
+
30
  def read_file_bytes(path):
31
  with open(path, "rb") as f:
32
  return f.read()
33
 
34
+
35
  def submit_read_api(file_path):
36
+ """Submit file to Azure Computer Vision OCR Read API"""
37
  url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
38
  headers = {
39
  "Ocp-Apim-Subscription-Key": AZURE_KEY,
40
+ "Content-Type": "application/octet-stream",
41
  }
42
  data = read_file_bytes(file_path)
43
 
44
  resp = requests.post(url, headers=headers, data=data)
45
+ print("➡️ Azure OCR request:", url)
46
+ print("➡️ Status:", resp.status_code)
47
+ print("➡️ Headers:", resp.headers)
48
 
49
+ resp.raise_for_status()
50
  op_location = resp.headers.get("Operation-Location")
51
  if not op_location:
52
  raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
53
  return op_location
54
 
55
+
56
  def poll_read_result(operation_location, timeout=180, interval=2.0):
57
  """Poll until OCR is finished"""
58
  headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
 
63
  r.raise_for_status()
64
  j = r.json()
65
  status = j.get("status", "").lower()
66
+ print("📡 Polling Azure OCR:", status)
67
  if status in ("succeeded", "failed"):
68
  break
69
  time.sleep(interval)
 
77
  for line in read_result.get("lines", []):
78
  lines.append(line["text"])
79
 
80
+ print(f"✅ Extracted {len(lines)} lines of text")
81
  return "\n".join(lines)
82
 
83
+
84
  def split_pdf_into_chunks(pdf_path, chunk_size=2):
85
+ """Split large PDF into smaller chunks for OCR"""
86
  reader = PdfReader(pdf_path)
87
  total_pages = len(reader.pages)
88
  chunk_files = []
 
102
  def index():
103
  return send_from_directory("static", "index.html")
104
 
105
+
106
  @app.route("/upload", methods=["POST"])
107
  def upload():
108
  if "file" not in request.files:
 
130
  op_location = submit_read_api(path)
131
  extracted_text = poll_read_result(op_location)
132
  except Exception as e:
133
+ import traceback
134
+
135
+ print("❌ OCR Error:", e)
136
+ traceback.print_exc()
137
  return jsonify({"error": "OCR failed", "details": str(e)}), 500
138
 
139
  return jsonify({"text": extracted_text})
140
 
141
 
142
+ # Health check
143
+ @app.route("/ping-azure")
144
+ def ping_azure():
145
+ try:
146
+ r = requests.get(AZURE_ENDPOINT, timeout=5)
147
+ return {"status": r.status_code}
148
+ except Exception as e:
149
+ return {"error": str(e)}
150
+
151
+
152
  if __name__ == "__main__":
153
  app.run(host="0.0.0.0", port=7860, debug=True)