extopen-src / azure_ocr.py
Gagandeep12's picture
Update azure_ocr.py
396abf9 verified
import time
import os
import requests
import mimetypes
from PyPDF2 import PdfReader, PdfWriter
import tempfile
import re
# Azure credentials are taken from the process environment at import time.
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_KEY = os.getenv("AZURE_KEY")

# Fail fast when either setting is missing or empty.
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Strip trailing slashes so request paths can be appended with a single "/".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
def read_file_bytes(path):
    """Return the full contents of the file at *path* as a bytes object."""
    with open(path, mode="rb") as source:
        payload = source.read()
    return payload
def detect_content_type(file_path: str):
    """Guess the MIME type of *file_path* from its name.

    Returns "application/octet-stream" when ``mimetypes`` cannot produce a
    (non-empty) guess.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed:
        return guessed
    return "application/octet-stream"
def submit_read_api(file_path, timeout=60):
    """Submit a file to the Computer Vision Read API (v3.2) for analysis.

    The file is uploaded as raw bytes with an ``application/octet-stream``
    content type. The request URL, response status and response headers are
    printed for debugging.

    Args:
        file_path: Path of the file to upload.
        timeout: Seconds handed to ``requests.post`` as its ``timeout`` so a
            stalled connection raises instead of blocking forever.

    Returns:
        The value of the ``Operation-Location`` response header.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not respond within ``timeout``.
        RuntimeError: If the response carries no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)

    # Without an explicit timeout, requests waits indefinitely on a dead peer.
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    print("Azure OCR request URL:", url)
    print("Azure OCR response status:", resp.status_code)
    print("Azure OCR response headers:", resp.headers)
    resp.raise_for_status()

    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll a Computer Vision Read operation and return its recognised text.

    NOTE: this module defines ``poll_read_result`` a second time further down;
    that later definition replaces this one when the module is imported.

    Args:
        operation_location: URL to GET for the operation status/result.
        timeout: Overall number of seconds to keep polling.
        interval: Seconds to sleep between polls.

    Returns:
        The ``text`` of every line in ``analyzeResult.readResults``, joined
        with newlines.

    Raises:
        requests.HTTPError: If a poll request returns an error status.
        requests.Timeout: If a single poll request stalls.
        RuntimeError: If the operation reports ``failed`` or has not
            succeeded by the time the deadline is reached.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # Bound each GET so one stalled request cannot defeat the overall deadline.
    request_timeout = 30
    deadline = time.time() + timeout

    # Always poll at least once: this keeps `status` and `j` bound even when
    # `timeout` is zero or negative.
    while True:
        r = requests.get(operation_location, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        j = r.json()
        # Tolerate a missing or null "status" field.
        status = (j.get("status") or "").lower()
        if status in ("succeeded", "failed"):
            break
        # Stop without a pointless final sleep when the next poll would fall
        # on or after the deadline.
        if time.time() + interval >= deadline:
            break
        time.sleep(interval)

    if status == "failed":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    if status != "succeeded":
        # Still pending at the deadline: report a timeout, not a failure.
        raise RuntimeError(
            f"OCR did not finish within {timeout}s. Status={status}, Response={j}"
        )

    analyze_result = j.get("analyzeResult") or {}
    lines = [
        line["text"]
        for read_result in analyze_result.get("readResults", [])
        for line in read_result.get("lines", [])
    ]
    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temporary PDF files of at most ``chunk_size`` pages.

    Args:
        pdf_path: Path of the PDF to split.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        A list of paths to the chunk files, in page order. The files are
        created with ``delete=False``, so they stay on disk until the caller
        removes them.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # Validate before parsing the PDF: zero would make range() fail later and
    # a negative step would silently produce no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for p in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[p])
        # Write through the temp-file handle and close it via the context
        # manager, rather than leaving it open and reopening the path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            writer.write(tmp)
        chunk_files.append(tmp.name)
    return chunk_files
def clean_extracted_text(text: str) -> str:
    """Delete known noise from OCR output and tidy whitespace line by line.

    Each removal pattern is applied in order; afterwards every line has its
    whitespace runs collapsed to single spaces and is stripped, and lines
    that end up empty are dropped. Newlines between surviving lines are kept.
    """
    # (pattern, flags) pairs; every match is replaced with the empty string.
    removals = (
        (r"--- Page.*?---", 0),  # page markers
        (r"\(chunk\)", 0),  # chunk markers
        (r"\b(?:stone|Stegaumen|studystone\.in)\b", re.IGNORECASE),  # junk words
        (r"Z-\d+", 0),  # roll numbers and codes
        (r"P\.T\.O\.", re.IGNORECASE),  # P.T.O
    )
    for pattern, flags in removals:
        text = re.sub(pattern, "", text, flags=flags)

    kept = []
    for raw_line in text.splitlines():
        squeezed = re.sub(r"\s+", " ", raw_line).strip()
        if squeezed:
            kept.append(squeezed)
    return "\n".join(kept)
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll an OCR operation until it finishes and return its text per page.

    Two result layouts are understood:

    * ``analyzeResult.pages`` with ``spans`` indexing into
      ``analyzeResult.content``;
    * ``analyzeResult.readResults`` with ``lines[].text``, used only when no
      ``pages`` are present. This appears to be the layout produced by the
      ``/vision/v3.2/read/analyze`` endpoint that ``submit_read_api`` posts
      to; confirm against the service reference.

    Args:
        operation_location: URL to GET for the operation status/result.
        timeout: Overall number of seconds to keep polling.
        interval: Seconds to sleep between polls.

    Returns:
        One ``--- Page N ---`` section per page, separated by blank lines.
        Pages without text contain ``(No text detected)``.

    Raises:
        requests.HTTPError: If a poll request returns an error status.
        requests.Timeout: If a single poll request stalls.
        RuntimeError: If the operation reports ``failed`` or has not
            succeeded by the time the deadline is reached.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # Bound each GET so one stalled request cannot defeat the overall deadline.
    request_timeout = 30
    deadline = time.time() + timeout

    # Always poll at least once: this keeps `status` and `j` bound even when
    # `timeout` is zero or negative.
    while True:
        r = requests.get(operation_location, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        j = r.json()
        # Tolerate a missing or null "status" field.
        status = (j.get("status") or "").lower()
        if status in ("succeeded", "failed"):
            break
        # Stop without a pointless final sleep when the next poll would fall
        # on or after the deadline.
        if time.time() + interval >= deadline:
            break
        time.sleep(interval)

    if status == "failed":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    if status != "succeeded":
        # Still pending at the deadline: report a timeout, not a failure.
        raise RuntimeError(
            f"OCR did not finish within {timeout}s. Status={status}, Response={j}"
        )

    analyze_result = j.get("analyzeResult") or {}
    pages = analyze_result.get("pages", [])
    content = analyze_result.get("content", "")
    pages_text = []

    # Layout 1: each page lists spans (offset/length) into the shared content.
    for page in pages:
        page_num = page.get("pageNumber", "?")
        spans = page.get("spans", [])
        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
        joined = "\n".join(text_parts).strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")

    # Layout 2: without this fallback a `readResults` response yields an empty
    # string even though text was recognised.
    if not pages:
        for read_result in analyze_result.get("readResults", []):
            page_num = read_result.get("page", "?")
            line_texts = [line.get("text", "") for line in read_result.get("lines", [])]
            joined = "\n".join(line_texts).strip() or "(No text detected)"
            pages_text.append(f"--- Page {page_num} ---\n{joined}")

    print(f"✅ Processed {len(pages_text)} pages successfully")
    return "\n\n".join(pages_text)