Gagandeep12 committed on
Commit
d39c442
·
verified ·
1 Parent(s): a4b4e95

Update azure_ocr.py

Browse files
Files changed (1) hide show
  1. azure_ocr.py +154 -139
azure_ocr.py CHANGED
@@ -1,139 +1,154 @@
1
- import time
2
- import os
3
- import requests
4
- import mimetypes
5
- from PyPDF2 import PdfReader, PdfWriter
6
- import tempfile
7
- import re
8
-
9
- AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
10
- AZURE_KEY = os.environ.get("AZURE_KEY")
11
-
12
- if not AZURE_ENDPOINT or not AZURE_KEY:
13
- raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
14
-
15
- AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
16
-
17
-
18
- def read_file_bytes(path):
19
- with open(path, "rb") as f:
20
- return f.read()
21
-
22
-
23
- def detect_content_type(file_path: str):
24
- mime, _ = mimetypes.guess_type(file_path)
25
- return mime or "application/octet-stream"
26
-
27
-
28
- def submit_read_api(file_path):
29
- """Submit file to Computer Vision Read API"""
30
- url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
31
- headers = {
32
- "Ocp-Apim-Subscription-Key": AZURE_KEY,
33
- "Content-Type": "application/octet-stream"
34
- }
35
- data = read_file_bytes(file_path)
36
-
37
- resp = requests.post(url, headers=headers, data=data)
38
- print("Azure OCR request URL:", url)
39
- print("Azure OCR response status:", resp.status_code)
40
- print("Azure OCR response headers:", resp.headers)
41
-
42
- resp.raise_for_status()
43
- op_location = resp.headers.get("Operation-Location")
44
- if not op_location:
45
- raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
46
- return op_location
47
-
48
-
49
- def poll_read_result(operation_location, timeout=180, interval=2.0):
50
- """Poll until Computer Vision OCR completes"""
51
- headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
52
- deadline = time.time() + timeout
53
-
54
- while time.time() < deadline:
55
- r = requests.get(operation_location, headers=headers)
56
- r.raise_for_status()
57
- j = r.json()
58
- status = j.get("status", "").lower()
59
- if status in ("succeeded", "failed"):
60
- break
61
- time.sleep(interval)
62
-
63
- if status != "succeeded":
64
- raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
65
-
66
- analyze_result = j.get("analyzeResult", {})
67
- lines = []
68
- for read_result in analyze_result.get("readResults", []):
69
- for line in read_result.get("lines", []):
70
- lines.append(line["text"])
71
-
72
- print(f"✅ Extracted {len(lines)} lines of text")
73
- return "\n".join(lines)
74
-
75
-
76
-
77
- def split_pdf_into_chunks(pdf_path, chunk_size=2):
78
- reader = PdfReader(pdf_path)
79
- total_pages = len(reader.pages)
80
- chunk_files = []
81
-
82
- for start in range(0, total_pages, chunk_size):
83
- writer = PdfWriter()
84
- for p in range(start, min(start + chunk_size, total_pages)):
85
- writer.add_page(reader.pages[p])
86
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
87
- with open(tmp.name, "wb") as f:
88
- writer.write(f)
89
- chunk_files.append(tmp.name)
90
- return chunk_files
91
-
92
-
93
- def clean_extracted_text(text: str) -> str:
94
- # Remove page markers
95
- text = re.sub(r"--- Page.*?---", "", text)
96
- # Remove chunk markers
97
- text = re.sub(r"\(chunk\)", "", text)
98
- # Remove junk words
99
- text = re.sub(r"\b(?:stone|Stegaumen|studystone\.in)\b", "", text, flags=re.IGNORECASE)
100
- # Remove roll numbers and codes
101
- text = re.sub(r"Z-\d+", "", text)
102
- # Remove P.T.O
103
- text = re.sub(r"P\.T\.O\.", "", text, flags=re.IGNORECASE)
104
-
105
- # Normalize per-line spacing but preserve newlines
106
- lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
107
- return "\n".join([l for l in lines if l])
108
-
109
-
110
- def poll_read_result(operation_location, timeout=180, interval=2.0):
111
- headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
112
- deadline = time.time() + timeout
113
-
114
- while time.time() < deadline:
115
- r = requests.get(operation_location, headers=headers)
116
- r.raise_for_status()
117
- j = r.json()
118
- status = j.get("status", "").lower()
119
- if status in ("succeeded", "failed"):
120
- break
121
- time.sleep(interval)
122
-
123
- if status != "succeeded":
124
- raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
125
-
126
- analyze_result = j.get("analyzeResult", {})
127
- pages = analyze_result.get("pages", [])
128
- content = analyze_result.get("content", "")
129
-
130
- pages_text = []
131
- for page in pages:
132
- page_num = page.get("pageNumber", "?")
133
- spans = page.get("spans", [])
134
- text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
135
- joined = "\n".join(text_parts).strip() or "(No text detected)"
136
- pages_text.append(f"--- Page {page_num} ---\n{joined}")
137
-
138
- print(f"✅ Processed {len(pages)} pages successfully")
139
- return "\n\n".join(pages_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import os
3
+ import requests
4
+ import mimetypes
5
+ from PyPDF2 import PdfReader, PdfWriter
6
+ import tempfile
7
+ import re
8
+ import random
9
+
10
# Azure credentials are read from the process environment at import time.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")

# Fail fast: nothing in this module can work without both settings.
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Drop trailing slashes so request URLs built from the endpoint never contain "//".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
17
+
18
+
19
def read_file_bytes(path):
    """Return the full contents of the file at *path* as ``bytes``."""
    with open(path, mode="rb") as handle:
        payload = handle.read()
    return payload
22
+
23
+
24
def detect_content_type(file_path: str):
    """Guess the MIME type of *file_path* from its name.

    Falls back to ``"application/octet-stream"`` when ``mimetypes`` cannot
    determine a type.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed:
        return guessed
    return "application/octet-stream"
27
+
28
+
29
def submit_read_api(file_path, max_retries=3, backoff=3, timeout=60):
    """Submit a file to the Computer Vision Read API and return the polling URL.

    The file is uploaded as ``application/octet-stream``. Transient failures
    (network errors, HTTP 408, 429 and 5xx) are retried with a linearly
    growing delay; any other HTTP error is raised immediately because
    repeating the same request cannot fix it.

    Args:
        file_path: Path of the file to upload.
        max_retries: Maximum number of POST attempts.
        backoff: Base delay in seconds; attempt *n* waits ``backoff * n``.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The value of the ``Operation-Location`` response header.

    Raises:
        requests.HTTPError: The service answered with a non-retryable error,
            or with a retryable one on the last attempt.
        requests.ConnectionError, requests.Timeout: The last attempt failed
            at the network level.
        RuntimeError: The response had no ``Operation-Location`` header, or
            every attempt was throttled (429).
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)

    for attempt in range(1, max_retries + 1):
        is_last = attempt == max_retries

        try:
            resp = requests.post(url, headers=headers, data=data, timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            if is_last:
                raise
            wait = backoff * attempt
            print(f"⚠️ Request failed (attempt {attempt}), retrying in {wait}s")
            time.sleep(wait)
            continue

        if resp.status_code == 429:  # throttling
            if is_last:
                # No further attempt will be made, so do not sleep first.
                break
            # Prefer the server's hint when it is a plain number of seconds.
            retry_after = resp.headers.get("Retry-After", "")
            base_wait = float(retry_after) if retry_after.isdigit() else backoff * attempt
            # Jitter keeps concurrent clients from retrying in lockstep.
            wait = base_wait + random.uniform(0, 1)
            print(f"⚠️ Throttled (429). Waiting {wait:.1f}s before retry...")
            time.sleep(wait)
            continue

        transient = resp.status_code == 408 or resp.status_code >= 500
        if transient and not is_last:
            wait = backoff * attempt
            print(f"⚠️ Request failed (attempt {attempt}), retrying in {wait}s")
            time.sleep(wait)
            continue

        # Raises for any remaining 4xx/5xx; a no-op on success.
        resp.raise_for_status()

        op_location = resp.headers.get("Operation-Location")
        if not op_location:
            raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
        return op_location

    raise RuntimeError("Failed to submit OCR after retries")
61
+
62
+
63
+
64
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll a Read operation until it finishes and return the recognized text.

    NOTE(review): a second ``poll_read_result`` is defined later in this
    module. Because it comes later, that definition is the one bound to the
    name after import, so this one is currently unreachable by name.

    Args:
        operation_location: URL returned in the ``Operation-Location`` header.
        timeout: Overall time budget for polling, in seconds.
        interval: Pause between polls, in seconds.

    Returns:
        Every ``readResults[].lines[].text`` value joined with newlines.

    Raises:
        RuntimeError: The operation reported ``failed`` or did not finish
            within *timeout*.
        requests.HTTPError: A polling request returned an error status.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Bound up front so the timeout error below is well-formed even when the
    # loop body never runs (non-positive timeout).
    status = ""
    j = {}

    while time.time() < deadline:
        # A per-request timeout stops one stalled connection from outliving
        # the overall deadline indefinitely.
        r = requests.get(operation_location, headers=headers, timeout=30)
        r.raise_for_status()
        j = r.json()
        status = (j.get("status") or "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)
    else:
        # Loop ended without break: the deadline passed before a final status.
        raise RuntimeError(f"OCR timed out after {timeout}s. Status={status}, Response={j}")

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    analyze_result = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in analyze_result.get("readResults", [])
        for line in read_result.get("lines", [])
    ]

    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)
89
+
90
+
91
+
92
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into consecutive temporary PDFs of at most *chunk_size* pages.

    Args:
        pdf_path: Path of the source PDF.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        Paths of the chunk files, in page order. The files are created with
        ``delete=False``, so the caller is responsible for removing them.

    Raises:
        ValueError: *chunk_size* is smaller than 1.
    """
    # Validate first: 0 would only fail later inside range(), and a negative
    # step would silently produce no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []

    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for page in reader.pages[start:start + chunk_size]:
            writer.add_page(page)
        # Write through the temp file's own handle so it is closed on exit
        # instead of leaking while the path is reopened a second time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            writer.write(tmp)
        chunk_files.append(tmp.name)
    return chunk_files
105
+ return chunk_files
106
+
107
+
108
def clean_extracted_text(text: str) -> str:
    """Strip known noise from OCR output and tidy its whitespace.

    Removes page markers, chunk markers, a fixed set of junk words, ``Z-``
    codes and ``P.T.O.`` notices, then collapses whitespace runs inside each
    line and drops lines that end up empty. Line breaks are preserved.
    """
    # (pattern, flags) pairs, applied in this order.
    removals = (
        (r"--- Page.*?---", 0),  # page markers
        (r"\(chunk\)", 0),  # chunk markers
        (r"\b(?:stone|Stegaumen|studystone\.in)\b", re.IGNORECASE),  # junk words
        (r"Z-\d+", 0),  # roll numbers and codes
        (r"P\.T\.O\.", re.IGNORECASE),  # P.T.O
    )
    for pattern, flags in removals:
        text = re.sub(pattern, "", text, flags=flags)

    # Normalize spacing one line at a time so newlines survive.
    cleaned = []
    for raw_line in text.splitlines():
        collapsed = re.sub(r"\s+", " ", raw_line).strip()
        if collapsed:
            cleaned.append(collapsed)
    return "\n".join(cleaned)
123
+
124
+
125
def _collect_page_texts(analyze_result):
    """Return ``(page_number, text)`` pairs for either supported result layout."""
    read_results = analyze_result.get("readResults")
    if read_results:
        # Layout with per-page line lists: readResults[].lines[].text.
        return [
            (
                page.get("page", "?"),
                "\n".join(line["text"] for line in page.get("lines", [])),
            )
            for page in read_results
        ]

    # Layout with one shared content string sliced by per-page spans.
    content = analyze_result.get("content", "")
    collected = []
    for page in analyze_result.get("pages", []):
        parts = [
            content[s["offset"]: s["offset"] + s["length"]]
            for s in page.get("spans", [])
        ]
        collected.append((page.get("pageNumber", "?"), "\n".join(parts)))
    return collected


def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll an OCR operation until it finishes and return per-page text.

    Two result layouts are understood: ``readResults`` with per-page
    ``lines``, and ``pages``/``content`` where each page's ``spans`` slice a
    shared ``content`` string. ``readResults`` is used when it is present and
    non-empty.

    Args:
        operation_location: URL returned in the ``Operation-Location`` header.
        timeout: Overall time budget for polling, in seconds.
        interval: Pause between polls, in seconds.

    Returns:
        One ``"--- Page N ---"`` header plus text per page, with pages
        separated by a blank line. Pages without text read
        ``"(No text detected)"``.

    Raises:
        RuntimeError: The operation reported ``failed`` or did not finish
            within *timeout*.
        requests.HTTPError: A polling request returned an error status.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Bound up front so the timeout error below is well-formed even when the
    # loop body never runs (non-positive timeout).
    status = ""
    j = {}

    while time.time() < deadline:
        # A per-request timeout stops one stalled connection from outliving
        # the overall deadline indefinitely.
        r = requests.get(operation_location, headers=headers, timeout=30)
        r.raise_for_status()
        j = r.json()
        status = (j.get("status") or "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)
    else:
        # Loop ended without break: the deadline passed before a final status.
        raise RuntimeError(f"OCR timed out after {timeout}s. Status={status}, Response={j}")

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    pages = _collect_page_texts(j.get("analyzeResult", {}))

    pages_text = []
    for page_num, text in pages:
        joined = text.strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")

    print(f"✅ Processed {len(pages)} pages successfully")
    return "\n\n".join(pages_text)