Spaces:

Gagandeep12
/

extopen-src

Sleeping

App Files Files Community

Gagandeep12 committed on Sep 27, 2025

Commit

ffdd6a7

verified ·

1 Parent(s): 2563511

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -11

app.py CHANGED Viewed

@@ -53,19 +53,33 @@ def submit_read_api(file_path):
     return op_location
-def poll_read_result(operation_location, timeout=180, interval=2.0):
-    """Poll until OCR is finished"""
     headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
     deadline = time.time() + timeout
     while time.time() < deadline:
-        r = requests.get(operation_location, headers=headers)
-        r.raise_for_status()
-        j = r.json()
-        status = j.get("status", "").lower()
-        print("📡 Polling Azure OCR:", status)
-        if status in ("succeeded", "failed"):
-            break
         time.sleep(interval)
     if status != "succeeded":
@@ -81,6 +95,7 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
     return "\n".join(lines)
 def split_pdf_into_chunks(pdf_path, chunk_size=2):
     """Split large PDF into smaller chunks for OCR"""
     reader = PdfReader(pdf_path)
@@ -126,12 +141,11 @@ def upload():
                 op_location = submit_read_api(chunk_file)
                 chunk_text = poll_read_result(op_location)
                 merged_results.append(chunk_text)
-                # ⏳ wait 2 seconds before next request to avoid 429 errors
                 if i < len(chunks) - 1:
                     print("⏳ Sleeping 2s before next chunk...")
                     time.sleep(2)
             extracted_text = "\n\n".join(merged_results)
         else:
             op_location = submit_read_api(path)

     return op_location
+def poll_read_result(operation_location, timeout=180, interval=5.0):
+    """Poll until OCR is finished, with retry/backoff on 429"""
     headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
     deadline = time.time() + timeout
+    attempt = 0
     while time.time() < deadline:
+        try:
+            r = requests.get(operation_location, headers=headers)
+            if r.status_code == 429:
+                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
+                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
+                time.sleep(wait)
+                attempt += 1
+                continue
+            r.raise_for_status()
+            j = r.json()
+            status = j.get("status", "").lower()
+            print("📡 Polling Azure OCR:", status)
+            if status in ("succeeded", "failed"):
+                break
+        except requests.exceptions.RequestException as e:
+            print("⚠️ Polling error:", e)
+            time.sleep(interval)
         time.sleep(interval)
     if status != "succeeded":
     return "\n".join(lines)
 def split_pdf_into_chunks(pdf_path, chunk_size=2):
     """Split large PDF into smaller chunks for OCR"""
     reader = PdfReader(pdf_path)
                 op_location = submit_read_api(chunk_file)
                 chunk_text = poll_read_result(op_location)
                 merged_results.append(chunk_text)
                 if i < len(chunks) - 1:
                     print("⏳ Sleeping 2s before next chunk...")
                     time.sleep(2)
             extracted_text = "\n\n".join(merged_results)
         else:
             op_location = submit_read_api(path)