# Azure OCR helper module.
# (Original file was scraped from a Hugging Face Space file viewer; the page
# chrome — commit hashes and line-number gutter — has been removed so the
# module parses.)
import time
import os
import requests
import mimetypes
from PyPDF2 import PdfReader, PdfWriter
import tempfile
import re
# Azure Cognitive Services credentials, read from the environment.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")
# Fail fast at import time if the credentials are not configured.
if not AZURE_ENDPOINT or not AZURE_KEY:
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
# Normalize the endpoint so URL construction below can append path segments.
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
def read_file_bytes(path):
    """Return the entire contents of *path* as a bytes object."""
    with open(path, "rb") as fh:
        contents = fh.read()
    return contents
def detect_content_type(file_path: str):
    """Guess the MIME type of *file_path* from its extension.

    Falls back to the generic binary type when no guess is possible.
    """
    guessed, _encoding = mimetypes.guess_type(file_path)
    if guessed is None:
        return "application/octet-stream"
    return guessed
def submit_read_api(file_path):
    """Submit *file_path* to the Azure Computer Vision Read API for OCR.

    Returns the Operation-Location URL that must be polled for the result.
    Raises requests.HTTPError on a failed request, or RuntimeError when the
    service does not return the expected header.
    """
    analyze_url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    request_headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    payload = read_file_bytes(file_path)
    response = requests.post(analyze_url, headers=request_headers, data=payload)
    # Debug output for troubleshooting failed submissions.
    print("Azure OCR request URL:", analyze_url)
    print("Azure OCR response status:", response.status_code)
    print("Azure OCR response headers:", response.headers)
    response.raise_for_status()
    operation_url = response.headers.get("Operation-Location")
    if operation_url is None:
        raise RuntimeError(f"No Operation-Location header. Response: {response.text}")
    return operation_url
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll a Computer Vision Read operation until it completes.

    NOTE(review): this function is redefined later in this file; the later
    definition shadows this one at import time, so this version is dead code.
    The two versions parse different response schemas — consolidate or rename.

    Args:
        operation_location: URL from the Operation-Location response header.
        timeout: Maximum number of seconds to keep polling.
        interval: Seconds to sleep between polls.

    Returns:
        All recognized lines joined with newlines.

    Raises:
        RuntimeError: If the operation fails, or if it has not succeeded
            before the timeout expires.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout
    # Initialize before the loop: with a non-positive timeout the loop body
    # never runs, and the original code raised NameError on `status`/`j`.
    status = "timeout"
    j = None
    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)
    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    analyze_result = j.get("analyzeResult", {})
    # Read API v3.2 payload: analyzeResult.readResults[].lines[].text
    lines = []
    for read_result in analyze_result.get("readResults", []):
        for line in read_result.get("lines", []):
            lines.append(line["text"])
    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split the PDF at *pdf_path* into temporary files of *chunk_size* pages.

    Args:
        pdf_path: Path to the source PDF.
        chunk_size: Maximum number of pages per chunk file.

    Returns:
        List of paths to the chunk PDFs. The files are created with
        delete=False, so the caller is responsible for removing them.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for page_index in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[page_index])
        # Write through the NamedTemporaryFile handle and close it via the
        # context manager. The original reopened tmp.name while the handle
        # was still open, leaking a file descriptor per chunk and failing
        # on Windows (where an open temp file cannot be opened again).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            writer.write(tmp)
        chunk_files.append(tmp.name)
    return chunk_files
def clean_extracted_text(text: str) -> str:
    """Strip OCR boilerplate from *text* and normalize whitespace per line.

    Removes page separators, chunk markers, known watermark words,
    "Z-<digits>" roll-number codes and "P.T.O." marks, then collapses runs
    of whitespace inside each line while preserving line breaks. Lines that
    end up empty are dropped.
    """
    removals = (
        (r"--- Page.*?---", 0),                                       # page separators
        (r"\(chunk\)", 0),                                            # chunk markers
        (r"\b(?:stone|Stegaumen|studystone\.in)\b", re.IGNORECASE),   # watermark words
        (r"Z-\d+", 0),                                                # roll numbers / codes
        (r"P\.T\.O\.", re.IGNORECASE),                                # "please turn over"
    )
    for pattern, flags in removals:
        text = re.sub(pattern, "", text, flags=flags)
    normalized = (re.sub(r"\s+", " ", raw_line).strip() for raw_line in text.splitlines())
    return "\n".join(line for line in normalized if line)
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll an Azure OCR operation and return per-page text with page markers.

    NOTE(review): this redefines poll_read_result from earlier in this file
    and shadows it — only this version is in effect at import time. The two
    parse different response schemas; consolidate or rename one of them.

    This variant reads a Document Intelligence-style payload
    (analyzeResult.pages[].spans slicing into analyzeResult.content) —
    presumably operation_location comes from that service rather than the
    v3.2 Read API; verify against the caller.

    Args:
        operation_location: URL from the Operation-Location response header.
        timeout: Maximum number of seconds to keep polling.
        interval: Seconds to sleep between polls.

    Returns:
        Pages joined by blank lines, each prefixed with "--- Page N ---".

    Raises:
        RuntimeError: If the operation fails, or if it has not succeeded
            before the timeout expires.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout
    # Initialize before the loop: with a non-positive timeout the loop body
    # never runs, and the original code raised NameError on `status`/`j`.
    status = "timeout"
    j = None
    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)
    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    analyze_result = j.get("analyzeResult", {})
    pages = analyze_result.get("pages", [])
    content = analyze_result.get("content", "")
    pages_text = []
    for page in pages:
        page_num = page.get("pageNumber", "?")
        spans = page.get("spans", [])
        # Each span is an (offset, length) slice into the flat `content` string.
        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
        joined = "\n".join(text_parts).strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")
    print(f"✅ Processed {len(pages)} pages successfully")
    return "\n\n".join(pages_text)
|