Gagandeep12 committed on
Commit
396abf9
·
verified ·
1 Parent(s): 05a982f

Update azure_ocr.py

Browse files
Files changed (1) hide show
  1. azure_ocr.py +67 -36
azure_ocr.py CHANGED
@@ -1,25 +1,32 @@
1
- import os
2
  import time
 
3
  import requests
 
4
  from PyPDF2 import PdfReader, PdfWriter
5
  import tempfile
6
- from dotenv import load_dotenv
7
  import re
8
 
9
- def get_azure_config():
10
- endpoint = os.environ.get("AZURE_ENDPOINT")
11
- key = os.environ.get("AZURE_KEY")
12
- if not endpoint or not key:
13
- raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env or environment")
14
- return endpoint.rstrip("/"), key
 
 
15
 
16
  def read_file_bytes(path):
17
  with open(path, "rb") as f:
18
  return f.read()
19
 
20
 
21
- def submit_read_api(file_path, max_retries=3, backoff=3):
22
- """Submit file to Computer Vision Read API with retry + backoff"""
 
 
 
 
 
23
  url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
24
  headers = {
25
  "Ocp-Apim-Subscription-Key": AZURE_KEY,
@@ -27,33 +34,20 @@ def submit_read_api(file_path, max_retries=3, backoff=3):
27
  }
28
  data = read_file_bytes(file_path)
29
 
30
- for attempt in range(1, max_retries + 1):
31
- resp = requests.post(url, headers=headers, data=data)
32
- if resp.status_code == 429: # throttling
33
- wait = backoff * attempt
34
- print(f"⚠️ Throttled (429). Waiting {wait}s before retry...")
35
- time.sleep(wait)
36
- continue
37
- try:
38
- resp.raise_for_status()
39
- except Exception as e:
40
- if attempt == max_retries:
41
- raise
42
- wait = backoff * attempt
43
- print(f"⚠️ Request failed (attempt {attempt}), retrying in {wait}s")
44
- time.sleep(wait)
45
- continue
46
-
47
- op_location = resp.headers.get("Operation-Location")
48
- if not op_location:
49
- raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
50
- return op_location
51
-
52
- raise RuntimeError("Failed to submit OCR after retries")
53
 
54
 
55
  def poll_read_result(operation_location, timeout=180, interval=2.0):
56
- """Poll until Computer Vision OCR completes (v3.2 Read API)"""
57
  headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
58
  deadline = time.time() + timeout
59
 
@@ -79,8 +73,8 @@ def poll_read_result(operation_location, timeout=180, interval=2.0):
79
  return "\n".join(lines)
80
 
81
 
 
82
  def split_pdf_into_chunks(pdf_path, chunk_size=2):
83
- """Split a PDF into smaller files of N pages"""
84
  reader = PdfReader(pdf_path)
85
  total_pages = len(reader.pages)
86
  chunk_files = []
@@ -97,12 +91,49 @@ def split_pdf_into_chunks(pdf_path, chunk_size=2):
97
 
98
 
99
  def clean_extracted_text(text: str) -> str:
100
- """Optional cleanup for junk words, roll numbers, etc."""
101
  text = re.sub(r"--- Page.*?---", "", text)
 
102
  text = re.sub(r"\(chunk\)", "", text)
 
103
  text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
 
104
  text = re.sub(r"Z-\d+", "", text)
 
105
  text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
106
 
 
107
  lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
108
  return "\n".join([l for l in lines if l])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import time
2
+ import os
3
  import requests
4
+ import mimetypes
5
  from PyPDF2 import PdfReader, PdfWriter
6
  import tempfile
 
7
  import re
8
 
9
+ AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
10
+ AZURE_KEY = os.environ.get("AZURE_KEY")
11
+
12
+ if not AZURE_ENDPOINT or not AZURE_KEY:
13
+ raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
14
+
15
+ AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
16
+
17
 
18
  def read_file_bytes(path):
19
  with open(path, "rb") as f:
20
  return f.read()
21
 
22
 
23
+ def detect_content_type(file_path: str):
24
+ mime, _ = mimetypes.guess_type(file_path)
25
+ return mime or "application/octet-stream"
26
+
27
+
28
+ def submit_read_api(file_path):
29
+ """Submit file to Computer Vision Read API"""
30
  url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
31
  headers = {
32
  "Ocp-Apim-Subscription-Key": AZURE_KEY,
 
34
  }
35
  data = read_file_bytes(file_path)
36
 
37
+ resp = requests.post(url, headers=headers, data=data)
38
+ print("Azure OCR request URL:", url)
39
+ print("Azure OCR response status:", resp.status_code)
40
+ print("Azure OCR response headers:", resp.headers)
41
+
42
+ resp.raise_for_status()
43
+ op_location = resp.headers.get("Operation-Location")
44
+ if not op_location:
45
+ raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
46
+ return op_location
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
 
49
  def poll_read_result(operation_location, timeout=180, interval=2.0):
50
+ """Poll until Computer Vision OCR completes"""
51
  headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
52
  deadline = time.time() + timeout
53
 
 
73
  return "\n".join(lines)
74
 
75
 
76
+
77
  def split_pdf_into_chunks(pdf_path, chunk_size=2):
 
78
  reader = PdfReader(pdf_path)
79
  total_pages = len(reader.pages)
80
  chunk_files = []
 
91
 
92
 
93
  def clean_extracted_text(text: str) -> str:
94
+ # Remove page markers
95
  text = re.sub(r"--- Page.*?---", "", text)
96
+ # Remove chunk markers
97
  text = re.sub(r"\(chunk\)", "", text)
98
+ # Remove junk words
99
  text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
100
+ # Remove roll numbers and codes
101
  text = re.sub(r"Z-\d+", "", text)
102
+ # Remove P.T.O
103
  text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
104
 
105
+ # Normalize per-line spacing but preserve newlines
106
  lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
107
  return "\n".join([l for l in lines if l])
108
+
109
+
110
+ def poll_read_result(operation_location, timeout=180, interval=2.0):
111
+ headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
112
+ deadline = time.time() + timeout
113
+
114
+ while time.time() < deadline:
115
+ r = requests.get(operation_location, headers=headers)
116
+ r.raise_for_status()
117
+ j = r.json()
118
+ status = j.get("status", "").lower()
119
+ if status in ("succeeded", "failed"):
120
+ break
121
+ time.sleep(interval)
122
+
123
+ if status != "succeeded":
124
+ raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
125
+
126
+ analyze_result = j.get("analyzeResult", {})
127
+ pages = analyze_result.get("pages", [])
128
+ content = analyze_result.get("content", "")
129
+
130
+ pages_text = []
131
+ for page in pages:
132
+ page_num = page.get("pageNumber", "?")
133
+ spans = page.get("spans", [])
134
+ text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
135
+ joined = "\n".join(text_parts).strip() or "(No text detected)"
136
+ pages_text.append(f"--- Page {page_num} ---\n{joined}")
137
+
138
+ print(f"✅ Processed {len(pages)} pages successfully")
139
+ return "\n\n".join(pages_text)