File size: 4,532 Bytes
e550287
396abf9
d39c442
396abf9
d39c442
 
 
 
396abf9
 
 
 
 
 
 
 
d39c442
 
 
 
 
 
396abf9
 
 
 
 
 
 
d39c442
 
 
 
 
 
 
396abf9
 
 
 
 
 
 
 
 
 
d39c442
 
e550287
396abf9
e550287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396abf9
660d5f6
d39c442
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396abf9
d39c442
396abf9
d39c442
396abf9
d39c442
396abf9
d39c442
396abf9
d39c442
 
396abf9
d39c442
 
396abf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import time
import os
import requests
import mimetypes
from PyPDF2 import PdfReader, PdfWriter
import tempfile
import re

# Azure credentials come from the environment. The error message below
# references .env, but no dotenv loader is visible in this file — presumably
# one runs before this module is imported; TODO confirm.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")

# Fail fast at import time: every API helper below needs both values.
if not AZURE_ENDPOINT or not AZURE_KEY:
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Normalize so f"{AZURE_ENDPOINT}/vision/..." never produces a double slash.
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")


def read_file_bytes(path):
    """Return the entire contents of the file at *path* as raw bytes."""
    with open(path, "rb") as handle:
        contents = handle.read()
    return contents


def detect_content_type(file_path: str):
    """Best-effort MIME type for *file_path*; octet-stream when unknown."""
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed is None:
        return "application/octet-stream"
    return guessed


def submit_read_api(file_path):
    """Upload *file_path* to the Computer Vision Read API (v3.2).

    Returns the Operation-Location URL that must be polled for the OCR
    result. Raises on HTTP errors or when the header is missing.
    """
    analyze_url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    payload = read_file_bytes(file_path)

    resp = requests.post(
        analyze_url,
        headers={
            "Ocp-Apim-Subscription-Key": AZURE_KEY,
            "Content-Type": "application/octet-stream",
        },
        data=payload,
    )
    # Debug trace of the round trip (kept verbatim from the original flow).
    print("Azure OCR request URL:", analyze_url)
    print("Azure OCR response status:", resp.status_code)
    print("Azure OCR response headers:", resp.headers)

    resp.raise_for_status()
    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location


def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll a Computer Vision Read (v3.2) operation until it completes.

    Parameters:
        operation_location: Operation-Location URL from submit_read_api().
        timeout: maximum seconds to keep polling.
        interval: seconds to sleep between polls.

    Returns:
        All recognized text lines joined with newlines.

    Raises:
        RuntimeError: if the operation fails or does not succeed in time.

    NOTE(review): a second ``poll_read_result`` defined later in this module
    shadows this one, so this definition is unreachable by name as the file
    stands — deduplicate or rename.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Initialize before the loop: with timeout <= 0 the loop body never runs
    # and the original raised UnboundLocalError instead of RuntimeError.
    status = "notstarted"
    j = {}

    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    # v3.2 Read responses nest text as analyzeResult.readResults[].lines[].text.
    analyze_result = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in analyze_result.get("readResults", [])
        for line in read_result.get("lines", [])
    ]

    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)



def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temp files of up to *chunk_size* pages each.

    Parameters:
        pdf_path: path to the source PDF.
        chunk_size: pages per chunk (must be >= 1).

    Returns:
        List of paths to the chunk PDFs. Callers own these files and must
        delete them when done.

    Raises:
        ValueError: if chunk_size < 1 (range() would silently loop forever
        or skip pages otherwise).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []

    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for page_index in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[page_index])
        # mkstemp + fdopen instead of NamedTemporaryFile(delete=False): the
        # original left the NamedTemporaryFile handle open while opening a
        # second handle on tmp.name — an fd leak, and a failure on Windows.
        fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
        with os.fdopen(fd, "wb") as out_file:
            writer.write(out_file)
        chunk_files.append(tmp_path)
    return chunk_files


def clean_extracted_text(text: str) -> str:
    """Strip OCR noise from *text* and normalize whitespace.

    Removes page/chunk markers, known junk words, "Z-<digits>" roll codes,
    and "P.T.O." marks, then collapses runs of whitespace within each line
    and drops lines that end up empty. Newline structure is preserved.
    """
    # Case-sensitive noise: page markers, chunk markers, roll-number codes.
    for pattern in (r"--- Page.*?---", r"\(chunk\)", r"Z-\d+"):
        text = re.sub(pattern, "", text)

    # Case-insensitive noise: watermark junk words and P.T.O. marks.
    for pattern in (r"\b(?:stone|Stegaumen|studystone\.in)\b", r"P\.T\.O\."):
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    kept = []
    for raw_line in text.splitlines():
        collapsed = re.sub(r"\s+", " ", raw_line).strip()
        if collapsed:
            kept.append(collapsed)
    return "\n".join(kept)


def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll an Azure OCR operation and return text grouped by page.

    NOTE(review): this redefinition shadows the earlier ``poll_read_result``
    in this module. It parses Document-Intelligence-shaped responses
    (``analyzeResult.pages``/``content`` with character spans), while the
    v3.2 Read endpoint used by submit_read_api() returns
    ``analyzeResult.readResults`` — which this version previously ignored,
    yielding 0 pages. A fallback for the readResults shape is now included.

    Parameters:
        operation_location: Operation-Location URL to poll.
        timeout: maximum seconds to keep polling.
        interval: seconds to sleep between polls.

    Returns:
        "--- Page N ---" sections joined by blank lines, one per page.

    Raises:
        RuntimeError: if the operation fails or does not succeed in time.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Initialize before the loop so a non-positive timeout raises the clean
    # RuntimeError below instead of UnboundLocalError.
    status = "notstarted"
    j = {}

    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    analyze_result = j.get("analyzeResult", {})
    pages = analyze_result.get("pages", [])
    content = analyze_result.get("content", "")

    pages_text = []
    for page in pages:
        page_num = page.get("pageNumber", "?")
        spans = page.get("spans", [])
        # Each span is an (offset, length) window into the flat content string.
        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
        joined = "\n".join(text_parts).strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")

    # Fallback for v3.2 Read responses: readResults[].lines[].text instead of
    # pages/spans. Only taken when the pages-based parse produced nothing, so
    # existing Document Intelligence callers are unaffected.
    if not pages_text:
        for read_result in analyze_result.get("readResults", []):
            page_num = read_result.get("page", "?")
            line_text = "\n".join(
                line["text"] for line in read_result.get("lines", [])
            ).strip() or "(No text detected)"
            pages_text.append(f"--- Page {page_num} ---\n{line_text}")

    print(f"✅ Processed {len(pages_text)} pages successfully")
    return "\n\n".join(pages_text)