Gagandeep12 committed on
Commit
ffdd6a7
·
verified ·
1 Parent(s): 2563511

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -11
app.py CHANGED
@@ -53,19 +53,33 @@ def submit_read_api(file_path):
53
  return op_location
54
 
55
 
56
- def poll_read_result(operation_location, timeout=180, interval=2.0):
57
- """Poll until OCR is finished"""
58
  headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
59
  deadline = time.time() + timeout
 
60
 
61
  while time.time() < deadline:
62
- r = requests.get(operation_location, headers=headers)
63
- r.raise_for_status()
64
- j = r.json()
65
- status = j.get("status", "").lower()
66
- print("📡 Polling Azure OCR:", status)
67
- if status in ("succeeded", "failed"):
68
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  time.sleep(interval)
70
 
71
  if status != "succeeded":
@@ -81,6 +95,7 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
81
  return "\n".join(lines)
82
 
83
 
 
84
  def split_pdf_into_chunks(pdf_path, chunk_size=2):
85
  """Split large PDF into smaller chunks for OCR"""
86
  reader = PdfReader(pdf_path)
@@ -126,12 +141,11 @@ def upload():
126
  op_location = submit_read_api(chunk_file)
127
  chunk_text = poll_read_result(op_location)
128
  merged_results.append(chunk_text)
129
-
130
- # ⏳ wait 2 seconds before next request to avoid 429 errors
131
  if i < len(chunks) - 1:
132
  print("⏳ Sleeping 2s before next chunk...")
133
  time.sleep(2)
134
 
 
135
  extracted_text = "\n\n".join(merged_results)
136
  else:
137
  op_location = submit_read_api(path)
 
53
  return op_location
54
 
55
 
56
+ def poll_read_result(operation_location, timeout=180, interval=5.0):
57
+ """Poll until OCR is finished, with retry/backoff on 429"""
58
  headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
59
  deadline = time.time() + timeout
60
+ attempt = 0
61
 
62
  while time.time() < deadline:
63
+ try:
64
+ r = requests.get(operation_location, headers=headers)
65
+ if r.status_code == 429:
66
+ wait = min(2 ** attempt, 30) # exponential backoff, max 30s
67
+ print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
68
+ time.sleep(wait)
69
+ attempt += 1
70
+ continue
71
+
72
+ r.raise_for_status()
73
+ j = r.json()
74
+ status = j.get("status", "").lower()
75
+ print("📡 Polling Azure OCR:", status)
76
+ if status in ("succeeded", "failed"):
77
+ break
78
+
79
+ except requests.exceptions.RequestException as e:
80
+ print("⚠️ Polling error:", e)
81
+ time.sleep(interval)
82
+
83
  time.sleep(interval)
84
 
85
  if status != "succeeded":
 
95
  return "\n".join(lines)
96
 
97
 
98
+
99
  def split_pdf_into_chunks(pdf_path, chunk_size=2):
100
  """Split large PDF into smaller chunks for OCR"""
101
  reader = PdfReader(pdf_path)
 
141
  op_location = submit_read_api(chunk_file)
142
  chunk_text = poll_read_result(op_location)
143
  merged_results.append(chunk_text)
 
 
144
  if i < len(chunks) - 1:
145
  print("⏳ Sleeping 2s before next chunk...")
146
  time.sleep(2)
147
 
148
+
149
  extracted_text = "\n\n".join(merged_results)
150
  else:
151
  op_location = submit_read_api(path)