Spaces:

Gagandeep12
/

extopen-src

Sleeping

App Files Community

Gagandeep12 committed on Sep 27, 2025

Commit

d39c442

verified ·

1 Parent(s): a4b4e95

Update azure_ocr.py

Browse files

Files changed (1) hide show

azure_ocr.py +154 -139

azure_ocr.py CHANGED Viewed

@@ -1,139 +1,154 @@
-import time
-import os
-import requests
-import mimetypes
-from PyPDF2 import PdfReader, PdfWriter
-import tempfile
-import re
-AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
-AZURE_KEY = os.environ.get("AZURE_KEY")
-if not AZURE_ENDPOINT or not AZURE_KEY:
-    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
-AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
-def read_file_bytes(path):
-    with open(path, "rb") as f:
-        return f.read()
-def detect_content_type(file_path: str):
-    mime, _ = mimetypes.guess_type(file_path)
-    return mime or "application/octet-stream"
-def submit_read_api(file_path):
-    """Submit file to Computer Vision Read API"""
-    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
-    headers = {
-        "Ocp-Apim-Subscription-Key": AZURE_KEY,
-        "Content-Type": "application/octet-stream"
-    }
-    data = read_file_bytes(file_path)
-    resp = requests.post(url, headers=headers, data=data)
-    print("Azure OCR request URL:", url)
-    print("Azure OCR response status:", resp.status_code)
-    print("Azure OCR response headers:", resp.headers)
-    resp.raise_for_status()
-    op_location = resp.headers.get("Operation-Location")
-    if not op_location:
-        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
-    return op_location
-def poll_read_result(operation_location, timeout=180, interval=2.0):
-    """Poll until Computer Vision OCR completes"""
-    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
-    deadline = time.time() + timeout
-    while time.time() < deadline:
-        r = requests.get(operation_location, headers=headers)
-        r.raise_for_status()
-        j = r.json()
-        status = j.get("status", "").lower()
-        if status in ("succeeded", "failed"):
-            break
-        time.sleep(interval)
-    if status != "succeeded":
-        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
-    analyze_result = j.get("analyzeResult", {})
-    lines = []
-    for read_result in analyze_result.get("readResults", []):
-        for line in read_result.get("lines", []):
-            lines.append(line["text"])
-    print(f"✅ Extracted {len(lines)} lines of text")
-    return "\n".join(lines)
-def split_pdf_into_chunks(pdf_path, chunk_size=2):
-    reader = PdfReader(pdf_path)
-    total_pages = len(reader.pages)
-    chunk_files = []
-    for start in range(0, total_pages, chunk_size):
-        writer = PdfWriter()
-        for p in range(start, min(start + chunk_size, total_pages)):
-            writer.add_page(reader.pages[p])
-        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
-        with open(tmp.name, "wb") as f:
-            writer.write(f)
-        chunk_files.append(tmp.name)
-    return chunk_files
-def clean_extracted_text(text: str) -> str:
-    # Remove page markers
-    text = re.sub(r"--- Page.*?---", "", text)
-    # Remove chunk markers
-    text = re.sub(r"\(chunk\)", "", text)
-    # Remove junk words
-    text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
-    # Remove roll numbers and codes
-    text = re.sub(r"Z-\d+", "", text)
-    # Remove P.T.O
-    text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
-    # Normalize per-line spacing but preserve newlines
-    lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
-    return "\n".join([l for l in lines if l])
-def poll_read_result(operation_location, timeout=180, interval=2.0):
-    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
-    deadline = time.time() + timeout
-    while time.time() < deadline:
-        r = requests.get(operation_location, headers=headers)
-        r.raise_for_status()
-        j = r.json()
-        status = j.get("status", "").lower()
-        if status in ("succeeded", "failed"):
-            break
-        time.sleep(interval)
-    if status != "succeeded":
-        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
-    analyze_result = j.get("analyzeResult", {})
-    pages = analyze_result.get("pages", [])
-    content = analyze_result.get("content", "")
-    pages_text = []
-    for page in pages:
-        page_num = page.get("pageNumber", "?")
-        spans = page.get("spans", [])
-        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
-        joined = "\n".join(text_parts).strip() or "(No text detected)"
-        pages_text.append(f"--- Page {page_num} ---\n{joined}")
-    print(f"✅ Processed {len(pages)} pages successfully")
-    return "\n\n".join(pages_text)

+import time
+import os
+import requests
+import mimetypes
+from PyPDF2 import PdfReader, PdfWriter
+import tempfile
+import re
+import random
+AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
+AZURE_KEY = os.environ.get("AZURE_KEY")
+if not AZURE_ENDPOINT or not AZURE_KEY:
+    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
+AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
+def read_file_bytes(path):
+    with open(path, "rb") as f:
+        return f.read()
+def detect_content_type(file_path: str):
+    mime, _ = mimetypes.guess_type(file_path)
+    return mime or "application/octet-stream"
+def submit_read_api(file_path, max_retries=3, backoff=3):
+    """Submit file to Computer Vision Read API with retry + backoff"""
+    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
+    headers = {
+        "Ocp-Apim-Subscription-Key": AZURE_KEY,
+        "Content-Type": "application/octet-stream"
+    }
+    data = read_file_bytes(file_path)
+    for attempt in range(1, max_retries + 1):
+        resp = requests.post(url, headers=headers, data=data)
+        if resp.status_code == 429:  # throttling
+            wait = backoff * attempt + random.uniform(0, 1)
+            print(f"⚠️ Throttled (429). Waiting {wait:.1f}s before retry...")
+            time.sleep(wait)
+            continue
+        try:
+            resp.raise_for_status()
+        except Exception as e:
+            if attempt == max_retries:
+                raise
+            wait = backoff * attempt
+            print(f"⚠️ Request failed (attempt {attempt}), retrying in {wait}s")
+            time.sleep(wait)
+            continue
+        op_location = resp.headers.get("Operation-Location")
+        if not op_location:
+            raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
+        return op_location
+    raise RuntimeError("Failed to submit OCR after retries")
+def poll_read_result(operation_location, timeout=180, interval=2.0):
+    """Poll until Computer Vision OCR completes"""
+    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        r = requests.get(operation_location, headers=headers)
+        r.raise_for_status()
+        j = r.json()
+        status = j.get("status", "").lower()
+        if status in ("succeeded", "failed"):
+            break
+        time.sleep(interval)
+    if status != "succeeded":
+        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
+    analyze_result = j.get("analyzeResult", {})
+    lines = []
+    for read_result in analyze_result.get("readResults", []):
+        for line in read_result.get("lines", []):
+            lines.append(line["text"])
+    print(f"✅ Extracted {len(lines)} lines of text")
+    return "\n".join(lines)
+def split_pdf_into_chunks(pdf_path, chunk_size=):
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+    chunk_files = []
+    for start in range(0, total_pages, chunk_size):
+        writer = PdfWriter()
+        for p in range(start, min(start + chunk_size, total_pages)):
+            writer.add_page(reader.pages[p])
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+        with open(tmp.name, "wb") as f:
+            writer.write(f)
+        chunk_files.append(tmp.name)
+    return chunk_files
+def clean_extracted_text(text: str) -> str:
+    # Remove page markers
+    text = re.sub(r"--- Page.*?---", "", text)
+    # Remove chunk markers
+    text = re.sub(r"\(chunk\)", "", text)
+    # Remove junk words
+    text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
+    # Remove roll numbers and codes
+    text = re.sub(r"Z-\d+", "", text)
+    # Remove P.T.O
+    text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
+    # Normalize per-line spacing but preserve newlines
+    lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
+    return "\n".join([l for l in lines if l])
+def poll_read_result(operation_location, timeout=180, interval=2.0):
+    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        r = requests.get(operation_location, headers=headers)
+        r.raise_for_status()
+        j = r.json()
+        status = j.get("status", "").lower()
+        if status in ("succeeded", "failed"):
+            break
+        time.sleep(interval)
+    if status != "succeeded":
+        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
+    analyze_result = j.get("analyzeResult", {})
+    pages = analyze_result.get("pages", [])
+    content = analyze_result.get("content", "")
+    pages_text = []
+    for page in pages:
+        page_num = page.get("pageNumber", "?")
+        spans = page.get("spans", [])
+        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
+        joined = "\n".join(text_parts).strip() or "(No text detected)"
+        pages_text.append(f"--- Page {page_num} ---\n{joined}")
+    print(f"✅ Processed {len(pages)} pages successfully")
+    return "\n\n".join(pages_text)