Spaces:
Sleeping
Sleeping
Update azure_ocr.py
Browse files- azure_ocr.py +67 -36
azure_ocr.py
CHANGED
|
@@ -1,25 +1,32 @@
|
|
| 1 |
-
import os
|
| 2 |
import time
|
|
|
|
| 3 |
import requests
|
|
|
|
| 4 |
from PyPDF2 import PdfReader, PdfWriter
|
| 5 |
import tempfile
|
| 6 |
-
from dotenv import load_dotenv
|
| 7 |
import re
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def read_file_bytes(path):
|
| 17 |
with open(path, "rb") as f:
|
| 18 |
return f.read()
|
| 19 |
|
| 20 |
|
| 21 |
-
def
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
|
| 24 |
headers = {
|
| 25 |
"Ocp-Apim-Subscription-Key": AZURE_KEY,
|
|
@@ -27,33 +34,20 @@ def submit_read_api(file_path, max_retries=3, backoff=3):
|
|
| 27 |
}
|
| 28 |
data = read_file_bytes(file_path)
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
if attempt == max_retries:
|
| 41 |
-
raise
|
| 42 |
-
wait = backoff * attempt
|
| 43 |
-
print(f"⚠️ Request failed (attempt {attempt}), retrying in {wait}s")
|
| 44 |
-
time.sleep(wait)
|
| 45 |
-
continue
|
| 46 |
-
|
| 47 |
-
op_location = resp.headers.get("Operation-Location")
|
| 48 |
-
if not op_location:
|
| 49 |
-
raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
|
| 50 |
-
return op_location
|
| 51 |
-
|
| 52 |
-
raise RuntimeError("Failed to submit OCR after retries")
|
| 53 |
|
| 54 |
|
| 55 |
def poll_read_result(operation_location, timeout=180, interval=2.0):
|
| 56 |
-
"""Poll until Computer Vision OCR completes
|
| 57 |
headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
|
| 58 |
deadline = time.time() + timeout
|
| 59 |
|
|
@@ -79,8 +73,8 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
|
|
| 79 |
return "\n".join(lines)
|
| 80 |
|
| 81 |
|
|
|
|
| 82 |
def split_pdf_into_chunks(pdf_path, chunk_size=2):
|
| 83 |
-
"""Split a PDF into smaller files of N pages"""
|
| 84 |
reader = PdfReader(pdf_path)
|
| 85 |
total_pages = len(reader.pages)
|
| 86 |
chunk_files = []
|
|
@@ -97,12 +91,49 @@ def split_pdf_into_chunks(pdf_path, chunk_size=2):
|
|
| 97 |
|
| 98 |
|
| 99 |
def clean_extracted_text(text: str) -> str:
|
| 100 |
-
|
| 101 |
text = re.sub(r"--- Page.*?---", "", text)
|
|
|
|
| 102 |
text = re.sub(r"\(chunk\)", "", text)
|
|
|
|
| 103 |
text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
|
|
|
|
| 104 |
text = re.sub(r"Z-\d+", "", text)
|
|
|
|
| 105 |
text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
|
| 106 |
|
|
|
|
| 107 |
lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
|
| 108 |
return "\n".join([l for l in lines if l])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import time
import os
import requests
import mimetypes
from PyPDF2 import PdfReader, PdfWriter
import tempfile
import re

# Azure Computer Vision credentials, read from the process environment.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")

# Fail fast at import time if the credentials are missing.
# NOTE(review): the message mentions .env, but load_dotenv() is no longer
# called in this file — confirm the environment is populated by the host
# (e.g. Spaces secrets) instead.
if not AZURE_ENDPOINT or not AZURE_KEY:
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Normalize the endpoint so URL building can always append "/vision/...".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
|
| 18 |
def read_file_bytes(path):
    """Return the entire contents of the file at *path* as raw bytes."""
    with open(path, "rb") as handle:
        contents = handle.read()
    return contents
| 21 |
|
| 22 |
|
| 23 |
+
def detect_content_type(file_path: str):
    """Guess the MIME type of *file_path* from its extension.

    Falls back to "application/octet-stream" when the extension is unknown.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed is None:
        return "application/octet-stream"
    return guessed
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def submit_read_api(file_path):
|
| 29 |
+
"""Submit file to Computer Vision Read API"""
|
| 30 |
url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
|
| 31 |
headers = {
|
| 32 |
"Ocp-Apim-Subscription-Key": AZURE_KEY,
|
|
|
|
| 34 |
}
|
| 35 |
data = read_file_bytes(file_path)
|
| 36 |
|
| 37 |
+
resp = requests.post(url, headers=headers, data=data)
|
| 38 |
+
print("Azure OCR request URL:", url)
|
| 39 |
+
print("Azure OCR response status:", resp.status_code)
|
| 40 |
+
print("Azure OCR response headers:", resp.headers)
|
| 41 |
+
|
| 42 |
+
resp.raise_for_status()
|
| 43 |
+
op_location = resp.headers.get("Operation-Location")
|
| 44 |
+
if not op_location:
|
| 45 |
+
raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
|
| 46 |
+
return op_location
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def poll_read_result(operation_location, timeout=180, interval=2.0):
|
| 50 |
+
"""Poll until Computer Vision OCR completes"""
|
| 51 |
headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
|
| 52 |
deadline = time.time() + timeout
|
| 53 |
|
|
|
|
| 73 |
return "\n".join(lines)
|
| 74 |
|
| 75 |
|
| 76 |
+
|
| 77 |
def split_pdf_into_chunks(pdf_path, chunk_size=2):
|
|
|
|
| 78 |
reader = PdfReader(pdf_path)
|
| 79 |
total_pages = len(reader.pages)
|
| 80 |
chunk_files = []
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
def clean_extracted_text(text: str) -> str:
    """Strip OCR boilerplate and collapse per-line whitespace.

    Removes page/chunk markers, known watermark words, roll-number codes,
    and "P.T.O." marks, then squeezes runs of whitespace inside each line
    to a single space while preserving line breaks. Empty lines are dropped.
    """
    removals = (
        (r"--- Page.*?---", 0),                                     # page markers
        (r"\(chunk\)", 0),                                          # chunk markers
        (r"\b(?:stone|Stegaumen|studystone\.in)\b", re.IGNORECASE),  # junk words
        (r"Z-\d+", 0),                                              # roll numbers / codes
        (r"P\.T\.O\.", re.IGNORECASE),                              # "please turn over"
    )
    for pattern, flags in removals:
        text = re.sub(pattern, "", text, flags=flags)

    # Normalize per-line spacing but preserve newlines.
    kept = []
    for raw_line in text.splitlines():
        collapsed = re.sub(r"\s+", " ", raw_line).strip()
        if collapsed:
            kept.append(collapsed)
    return "\n".join(kept)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll *operation_location* until the Azure OCR operation completes.

    Args:
        operation_location: URL from the submit call's Operation-Location header.
        timeout: Maximum seconds to keep polling before giving up.
        interval: Seconds to sleep between polls.

    Returns:
        The recognized text, one "--- Page N ---" section per page,
        sections separated by blank lines.

    Raises:
        RuntimeError: If the operation fails, or the deadline passes
            without the operation succeeding.
        requests.HTTPError: If a poll request returns an HTTP error status.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Fix: initialize before the loop — previously, if the deadline had
    # already passed (timeout <= 0) the loop body never ran and the
    # references below raised NameError instead of a clear RuntimeError.
    status = None
    j = {}
    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    # NOTE(review): this parses the Document Intelligence payload shape
    # (analyzeResult.pages[*].spans indexing into a flat `content` string);
    # the v3.2 Read API submit URL used elsewhere in this file returns
    # `analyzeResult.readResults` instead — confirm which service the
    # operation_location actually points at.
    analyze_result = j.get("analyzeResult", {})
    pages = analyze_result.get("pages", [])
    content = analyze_result.get("content", "")

    pages_text = []
    for page in pages:
        page_num = page.get("pageNumber", "?")
        spans = page.get("spans", [])
        # Each span is an (offset, length) window into the flat content string.
        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
        joined = "\n".join(text_parts).strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")

    print(f"✅ Processed {len(pages)} pages successfully")
    return "\n\n".join(pages_text)