File size: 4,532 Bytes
e550287
396abf9
d39c442
396abf9
d39c442
 
 
 
396abf9
 
 
 
 
 
 
 
d39c442
 
 
 
 
 
396abf9
 
 
 
 
 
 
d39c442
 
 
 
 
 
 
396abf9
 
 
 
 
 
 
 
 
 
d39c442
 
e550287
396abf9
e550287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396abf9
660d5f6
d39c442
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396abf9
d39c442
396abf9
d39c442
396abf9
d39c442
396abf9
d39c442
396abf9
d39c442
 
396abf9
d39c442
 
396abf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import time
import os
import requests
import mimetypes
from PyPDF2 import PdfReader, PdfWriter
import tempfile
import re

# Azure credentials come from the environment. The error message below
# references .env, but no dotenv loader is visible in this file — presumably
# one runs before this module is imported; TODO confirm.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")

# Fail fast at import time: every API helper below needs both values.
if not AZURE_ENDPOINT or not AZURE_KEY:
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")

# Normalize so f"{AZURE_ENDPOINT}/vision/..." never produces a double slash.
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")


def read_file_bytes(path):
    """Return the entire contents of the file at *path* as raw bytes."""
    with open(path, "rb") as handle:
        contents = handle.read()
    return contents


def detect_content_type(file_path: str):
    """Best-effort MIME type for *file_path*; octet-stream when unknown."""
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed is None:
        return "application/octet-stream"
    return guessed


def submit_read_api(file_path):
    """Upload *file_path* to the Computer Vision Read API (v3.2).

    Returns the Operation-Location URL that must be polled for the OCR
    result. Raises on HTTP errors or when the header is missing.
    """
    analyze_url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    payload = read_file_bytes(file_path)

    resp = requests.post(
        analyze_url,
        headers={
            "Ocp-Apim-Subscription-Key": AZURE_KEY,
            "Content-Type": "application/octet-stream",
        },
        data=payload,
    )
    # Debug trace of the round trip (kept verbatim from the original flow).
    print("Azure OCR request URL:", analyze_url)
    print("Azure OCR response status:", resp.status_code)
    print("Azure OCR response headers:", resp.headers)

    resp.raise_for_status()
    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location


def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll a Computer Vision Read (v3.2) operation until it completes.

    Parameters:
        operation_location: Operation-Location URL from submit_read_api().
        timeout: maximum seconds to keep polling.
        interval: seconds to sleep between polls.

    Returns:
        All recognized text lines joined with newlines.

    Raises:
        RuntimeError: if the operation fails or does not succeed in time.

    NOTE(review): a second ``poll_read_result`` defined later in this module
    shadows this one, so this definition is unreachable by name as the file
    stands — deduplicate or rename.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Initialize before the loop: with timeout <= 0 the loop body never runs
    # and the original raised UnboundLocalError instead of RuntimeError.
    status = "notstarted"
    j = {}

    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    # v3.2 Read responses nest text as analyzeResult.readResults[].lines[].text.
    analyze_result = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in analyze_result.get("readResults", [])
        for line in read_result.get("lines", [])
    ]

    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)



def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temp files of up to *chunk_size* pages each.

    Parameters:
        pdf_path: path to the source PDF.
        chunk_size: pages per chunk (must be >= 1).

    Returns:
        List of paths to the chunk PDFs. Callers own these files and must
        delete them when done.

    Raises:
        ValueError: if chunk_size < 1 (range() would silently loop forever
        or skip pages otherwise).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []

    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for page_index in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[page_index])
        # mkstemp + fdopen instead of NamedTemporaryFile(delete=False): the
        # original left the NamedTemporaryFile handle open while opening a
        # second handle on tmp.name — an fd leak, and a failure on Windows.
        fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
        with os.fdopen(fd, "wb") as out_file:
            writer.write(out_file)
        chunk_files.append(tmp_path)
    return chunk_files


def clean_extracted_text(text: str) -> str:
    """Strip OCR noise from *text* and normalize whitespace.

    Removes page/chunk markers, known junk words, "Z-<digits>" roll codes,
    and "P.T.O." marks, then collapses runs of whitespace within each line
    and drops lines that end up empty. Newline structure is preserved.
    """
    # Case-sensitive noise: page markers, chunk markers, roll-number codes.
    for pattern in (r"--- Page.*?---", r"\(chunk\)", r"Z-\d+"):
        text = re.sub(pattern, "", text)

    # Case-insensitive noise: watermark junk words and P.T.O. marks.
    for pattern in (r"\b(?:stone|Stegaumen|studystone\.in)\b", r"P\.T\.O\."):
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    kept = []
    for raw_line in text.splitlines():
        collapsed = re.sub(r"\s+", " ", raw_line).strip()
        if collapsed:
            kept.append(collapsed)
    return "\n".join(kept)


def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll an Azure OCR operation and return text grouped by page.

    NOTE(review): this redefinition shadows the earlier ``poll_read_result``
    in this module. It parses Document-Intelligence-shaped responses
    (``analyzeResult.pages``/``content`` with character spans), while the
    v3.2 Read endpoint used by submit_read_api() returns
    ``analyzeResult.readResults`` — which this version previously ignored,
    yielding 0 pages. A fallback for the readResults shape is now included.

    Parameters:
        operation_location: Operation-Location URL to poll.
        timeout: maximum seconds to keep polling.
        interval: seconds to sleep between polls.

    Returns:
        "--- Page N ---" sections joined by blank lines, one per page.

    Raises:
        RuntimeError: if the operation fails or does not succeed in time.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout

    # Initialize before the loop so a non-positive timeout raises the clean
    # RuntimeError below instead of UnboundLocalError.
    status = "notstarted"
    j = {}

    while time.time() < deadline:
        r = requests.get(operation_location, headers=headers)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    analyze_result = j.get("analyzeResult", {})
    pages = analyze_result.get("pages", [])
    content = analyze_result.get("content", "")

    pages_text = []
    for page in pages:
        page_num = page.get("pageNumber", "?")
        spans = page.get("spans", [])
        # Each span is an (offset, length) window into the flat content string.
        text_parts = [content[s["offset"]: s["offset"] + s["length"]] for s in spans]
        joined = "\n".join(text_parts).strip() or "(No text detected)"
        pages_text.append(f"--- Page {page_num} ---\n{joined}")

    # Fallback for v3.2 Read responses: readResults[].lines[].text instead of
    # pages/spans. Only taken when the pages-based parse produced nothing, so
    # existing Document Intelligence callers are unaffected.
    if not pages_text:
        for read_result in analyze_result.get("readResults", []):
            page_num = read_result.get("page", "?")
            line_text = "\n".join(
                line["text"] for line in read_result.get("lines", [])
            ).strip() or "(No text detected)"
            pages_text.append(f"--- Page {page_num} ---\n{line_text}")

    print(f"✅ Processed {len(pages_text)} pages successfully")
    return "\n\n".join(pages_text)