Spaces:
Sleeping
Sleeping
Update azure_ocr.py
Browse files- azure_ocr.py +67 -36
azure_ocr.py
CHANGED
|
@@ -1,25 +1,32 @@
|
|
| 1 |
-
import os
|
| 2 |
import time
|
|
|
|
| 3 |
import requests
|
|
|
|
| 4 |
from PyPDF2 import PdfReader, PdfWriter
|
| 5 |
import tempfile
|
| 6 |
-
from dotenv import load_dotenv
|
| 7 |
import re
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def read_file_bytes(path):
|
| 17 |
with open(path, "rb") as f:
|
| 18 |
return f.read()
|
| 19 |
|
| 20 |
|
| 21 |
-
def
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
|
| 24 |
headers = {
|
| 25 |
"Ocp-Apim-Subscription-Key": AZURE_KEY,
|
|
@@ -27,33 +34,20 @@ def submit_read_api(file_path, max_retries=3, backoff=3):
|
|
| 27 |
}
|
| 28 |
data = read_file_bytes(file_path)
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
if attempt == max_retries:
|
| 41 |
-
raise
|
| 42 |
-
wait = backoff * attempt
|
| 43 |
-
print(f"⚠️ Request failed (attempt {attempt}), retrying in {wait}s")
|
| 44 |
-
time.sleep(wait)
|
| 45 |
-
continue
|
| 46 |
-
|
| 47 |
-
op_location = resp.headers.get("Operation-Location")
|
| 48 |
-
if not op_location:
|
| 49 |
-
raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
|
| 50 |
-
return op_location
|
| 51 |
-
|
| 52 |
-
raise RuntimeError("Failed to submit OCR after retries")
|
| 53 |
|
| 54 |
|
| 55 |
def poll_read_result(operation_location, timeout=180, interval=2.0):
|
| 56 |
-
"""Poll until Computer Vision OCR completes
|
| 57 |
headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
|
| 58 |
deadline = time.time() + timeout
|
| 59 |
|
|
@@ -79,8 +73,8 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
|
|
| 79 |
return "\n".join(lines)
|
| 80 |
|
| 81 |
|
|
|
|
| 82 |
def split_pdf_into_chunks(pdf_path, chunk_size=2):
|
| 83 |
-
"""Split a PDF into smaller files of N pages"""
|
| 84 |
reader = PdfReader(pdf_path)
|
| 85 |
total_pages = len(reader.pages)
|
| 86 |
chunk_files = []
|
|
@@ -97,12 +91,49 @@ def split_pdf_into_chunks(pdf_path, chunk_size=2):
|
|
| 97 |
|
| 98 |
|
| 99 |
def clean_extracted_text(text: str) -> str:
|
| 100 |
-
|
| 101 |
text = re.sub(r"--- Page.*?---", "", text)
|
|
|
|
| 102 |
text = re.sub(r"\(chunk\)", "", text)
|
|
|
|
| 103 |
text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
|
|
|
|
| 104 |
text = re.sub(r"Z-\d+", "", text)
|
|
|
|
| 105 |
text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
|
| 106 |
|
|
|
|
| 107 |
lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
|
| 108 |
return "\n".join([l for l in lines if l])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import time
import os
import requests
import mimetypes
from PyPDF2 import PdfReader, PdfWriter
import tempfile
import re

# Azure Computer Vision credentials, read from the process environment.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")

# Fail fast at import time if the credentials are missing.
# NOTE(review): the message mentions .env, but load_dotenv() is no longer
# called in this file — confirm the environment is populated by the host
# (e.g. Spaces secrets) instead.
if not AZURE_ENDPOINT or not AZURE_KEY:
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Normalize the endpoint so URL building can always append "/vision/...".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
|
| 18 |
def read_file_bytes(path):
    """Return the entire contents of the file at *path* as raw bytes."""
    with open(path, "rb") as handle:
        contents = handle.read()
    return contents
| 21 |
|
| 22 |
|
| 23 |
+
def detect_content_type(file_path: str):
    """Guess the MIME type of *file_path* from its extension.

    Falls back to "application/octet-stream" when the extension is unknown.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed is None:
        return "application/octet-stream"
    return guessed
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def submit_read_api(file_path):
|
| 29 |
+
"""Submit file to Computer Vision Read API"""
|
| 30 |
url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
|
| 31 |
headers = {
|
| 32 |
"Ocp-Apim-Subscription-Key": AZURE_KEY,
|
|
|
|
| 34 |
}
|
| 35 |
data = read_file_bytes(file_path)
|
| 36 |
|
| 37 |
+
resp = requests.post(url, headers=headers, data=data)
|
| 38 |
+
print("Azure OCR request URL:", url)
|
| 39 |
+
print("Azure OCR response status:", resp.status_code)
|
| 40 |
+
print("Azure OCR response headers:", resp.headers)
|
| 41 |
+
|
| 42 |
+
resp.raise_for_status()
|
| 43 |
+
op_location = resp.headers.get("Operation-Location")
|
| 44 |
+
if not op_location:
|
| 45 |
+
raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
|
| 46 |
+
return op_location
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def poll_read_result(operation_location, timeout=180, interval=2.0):
|
| 50 |
+
"""Poll until Computer Vision OCR completes"""
|
| 51 |
headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
|
| 52 |
deadline = time.time() + timeout
|
| 53 |
|
|
|
|
| 73 |
return "\n".join(lines)
|
| 74 |
|
| 75 |
|
| 76 |
+
|
| 77 |
def split_pdf_into_chunks(pdf_path, chunk_size=2):
|
|
|
|
| 78 |
reader = PdfReader(pdf_path)
|
| 79 |
total_pages = len(reader.pages)
|
| 80 |
chunk_files = []
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
def clean_extracted_text(text: str) -> str:
    """Strip OCR boilerplate and collapse per-line whitespace.

    Removes page/chunk markers, known watermark words, roll-number codes,
    and "P.T.O." marks, then squeezes runs of whitespace inside each line
    to a single space while preserving line breaks. Empty lines are dropped.
    """
    removals = (
        (r"--- Page.*?---", 0),                                     # page markers
        (r"\(chunk\)", 0),                                          # chunk markers
        (r"\b(?:stone|Stegaumen|studystone\.in)\b", re.IGNORECASE),  # junk words
        (r"Z-\d+", 0),                                              # roll numbers / codes
        (r"P\.T\.O\.", re.IGNORECASE),                              # "please turn over"
    )
    for pattern, flags in removals:
        text = re.sub(pattern, "", text, flags=flags)

    # Normalize per-line spacing but preserve newlines.
    kept = []
    for raw_line in text.splitlines():
        collapsed = re.sub(r"\s+", " ", raw_line).strip()
        if collapsed:
            kept.append(collapsed)
    return "\n".join(kept)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll *operation_location* until the Azure OCR operation completes.

    Args:
        operation_location: URL from the submit call's Operation-Location header.
        timeout: Maximum seconds to keep polling before giving up.
        interval: Seconds to sleep between polls.

    Returns:
        The recognized text, one "--- Page N ---" section per page,
        sections separated by blank lines.

    Raises:
        RuntimeError: If the operation fails, or the deadline passes
            without the operation succeeding.
        requests.HTTPError: If a poll request returns an HTTP error status.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Fix: initialize before the loop — previously, if the deadline had
    # already passed (timeout <= 0) the loop body never ran and the
    # references below raised NameError instead of a clear RuntimeError.
    status = None
    j = {}
    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    # NOTE(review): this parses the Document Intelligence payload shape
    # (analyzeResult.pages[*].spans indexing into a flat `content` string);
    # the v3.2 Read API submit URL used elsewhere in this file returns
    # `analyzeResult.readResults` instead — confirm which service the
    # operation_location actually points at.
    analyze_result = j.get("analyzeResult", {})
    pages = analyze_result.get("pages", [])
    content = analyze_result.get("content", "")

    pages_text = []
    for page in pages:
        page_num = page.get("pageNumber", "?")
        spans = page.get("spans", [])
        # Each span is an (offset, length) window into the flat content string.
        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
        joined = "\n".join(text_parts).strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")

    print(f"✅ Processed {len(pages)} pages successfully")
    return "\n\n".join(pages_text)