File size: 5,539 Bytes
a62e97e
05a982f
1e39f6a
05a982f
1e39f6a
 
05a982f
1e39f6a
74adb16
05a982f
 
1e39f6a
05a982f
74adb16
05a982f
 
1e39f6a
 
 
 
 
 
 
 
 
 
 
 
74adb16
05a982f
 
 
 
74adb16
05a982f
74adb16
05a982f
 
 
74adb16
05a982f
 
 
 
74adb16
 
 
05a982f
74adb16
05a982f
 
 
 
 
74adb16
ffdd6a7
 
05a982f
 
ffdd6a7
05a982f
 
ffdd6a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05a982f
 
 
 
 
 
 
 
 
 
 
74adb16
05a982f
 
74adb16
ffdd6a7
05a982f
74adb16
05a982f
 
 
 
 
 
 
 
 
 
 
 
 
1e39f6a
 
 
 
 
 
74adb16
1e39f6a
 
 
 
 
 
 
 
 
 
 
 
 
a62e97e
 
1e39f6a
 
 
b56f8a6
 
1e39f6a
 
 
b56f8a6
fa1c489
 
b56f8a6
ffdd6a7
1e39f6a
05a982f
1e39f6a
 
a62e97e
74adb16
 
 
 
1e39f6a
 
 
 
 
b56f8a6
74adb16
 
 
 
 
 
 
 
 
 
1e39f6a
a62e97e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import os
import tempfile
import time
import requests
from flask import Flask, request, jsonify, send_from_directory
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader, PdfWriter

# --- Configuration, read from the environment (Hugging Face Secrets) ---
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_KEY = os.getenv("AZURE_KEY")

# Fail fast at import time when either setting is missing or empty.
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")

# Drop trailing slashes so request URLs can be built by appending "/...".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

# Directory that receives uploaded files; created if it does not exist yet.
UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Lower-case file extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

app = Flask(__name__, static_folder="static", static_url_path="/static")


# --- Helpers ---
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared
    case-insensitively. Names without any dot are rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS


def read_file_bytes(path):
    """Return the entire content of the file at *path* as bytes."""
    with open(path, mode="rb") as handle:
        payload = handle.read()
    return payload


def submit_read_api(file_path, timeout=60):
    """Submit a file to the Azure Computer Vision Read (OCR) API.

    The file content is POSTed as ``application/octet-stream`` to
    ``{AZURE_ENDPOINT}/vision/v3.2/read/analyze``.

    Args:
        file_path: Path of the file to upload.
        timeout: Value passed to ``requests.post(timeout=...)``, in seconds.
            Without it a stalled connection would block the caller forever.

    Returns:
        The ``Operation-Location`` response header value, i.e. the URL to
        poll for the OCR result.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a non-success HTTP status (via ``raise_for_status``).
        RuntimeError: If the response carries no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)

    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    print("➡️ Azure OCR request:", url)
    print("➡️ Status:", resp.status_code)
    print("➡️ Headers:", resp.headers)

    resp.raise_for_status()
    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location


def poll_read_result(operation_location, timeout=180, interval=5.0,
                     request_timeout=30):
    """Poll an Azure Read operation until it finishes and return its text.

    Args:
        operation_location: URL returned in the ``Operation-Location`` header
            of the submit request.
        timeout: Overall polling budget in seconds.
        interval: Seconds to wait between polls, and after a failed poll.
        request_timeout: Value passed to ``requests.get(timeout=...)`` for
            each individual poll, in seconds.

    Returns:
        All recognised lines from every ``readResults`` entry, joined with
        newlines (an empty string when nothing was recognised).

    Raises:
        RuntimeError: If the operation did not reach ``succeeded`` before the
            deadline. This includes a ``failed`` status and the case where no
            poll ever returned a usable response.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout
    attempt = 0
    # Bound up front so the check after the loop works even when every poll
    # was throttled or errored, or the loop body never ran.
    status = None
    j = None

    while time.time() < deadline:
        wait = interval
        try:
            r = requests.get(
                operation_location, headers=headers, timeout=request_timeout
            )
            if r.status_code == 429:
                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
                attempt += 1
            else:
                r.raise_for_status()
                j = r.json()
                status = (j.get("status") or "").lower()
                print("📡 Polling Azure OCR:", status)
                if status in ("succeeded", "failed"):
                    break
        except requests.exceptions.RequestException as e:
            # Transient failure: log it and retry after a single interval.
            print("⚠️ Polling error:", e)

        # Never sleep past the deadline.
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        time.sleep(min(wait, remaining))

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    results = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in results.get("readResults", [])
        for line in read_result.get("lines", [])
    ]

    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)



def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temporary PDF files of at most *chunk_size* pages.

    Args:
        pdf_path: Path of the PDF to split.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        Paths of the chunk files, in page order. The files are created with
        ``delete=False``, so the caller is responsible for removing them.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    try:
        for start in range(0, total_pages, chunk_size):
            writer = PdfWriter()
            for p in range(start, min(start + chunk_size, total_pages)):
                writer.add_page(reader.pages[p])
            # Write through the temp file's own handle and close it via the
            # context manager, instead of leaking it and reopening by name.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                chunk_files.append(tmp.name)
                writer.write(tmp)
    except Exception:
        # Do not leave partial output behind when a later chunk fails.
        for leftover in chunk_files:
            try:
                os.remove(leftover)
            except OSError:
                pass  # best-effort cleanup; the original error matters more
        raise
    return chunk_files


# --- Routes ---
@app.route("/")
def index():
    """Serve ``index.html`` from the ``static`` directory."""
    page = "index.html"
    return send_from_directory("static", page)


@app.route("/upload", methods=["POST"])
def upload():
    """Accept an uploaded image or PDF, run Azure OCR on it, return the text.

    The file is expected in the ``file`` form field. PDFs are split into
    2-page chunks that are OCR'd one after another and joined with blank
    lines; other allowed files are submitted as-is.

    Returns:
        ``{"text": ...}`` on success; ``{"error": ...}`` with status 400 for
        a missing, unnamed, or disallowed file; ``{"error": "OCR failed",
        "details": ...}`` with status 500 when OCR raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    uploaded = request.files["file"]
    if uploaded.filename == "":
        return jsonify({"error": "Empty filename"}), 400
    if not allowed_file(uploaded.filename):
        return jsonify({"error": "File type not allowed"}), 400

    # Decide the type from the client-supplied name: secure_filename() can
    # strip the extension (e.g. a non-ASCII stem leaves just "pdf"), which
    # would make a PDF skip the chunking path.
    is_pdf = uploaded.filename.lower().endswith(".pdf")

    filename = secure_filename(uploaded.filename)
    path = os.path.join(UPLOAD_DIR, filename)
    uploaded.save(path)

    chunks = []
    try:
        if is_pdf:
            chunks = split_pdf_into_chunks(path, chunk_size=2)
            merged_results = []
            for i, chunk_file in enumerate(chunks):
                print(f"📄 Processing chunk {i+1}/{len(chunks)}")
                op_location = submit_read_api(chunk_file)
                chunk_text = poll_read_result(op_location)
                merged_results.append(chunk_text)
                if i < len(chunks) - 1:
                    print("⏳ Sleeping 1s before next chunk...")
                    time.sleep(1)

            extracted_text = "\n\n".join(merged_results)
        else:
            op_location = submit_read_api(path)
            extracted_text = poll_read_result(op_location)
    except Exception as e:
        import traceback

        print("❌ OCR Error:", e)
        traceback.print_exc()
        return jsonify({"error": "OCR failed", "details": str(e)}), 500
    finally:
        # The chunk PDFs are temp files created with delete=False; remove
        # them on success and failure alike so they do not pile up on disk.
        # NOTE(review): the uploaded file itself is still left in UPLOAD_DIR.
        for chunk_file in chunks:
            try:
                os.remove(chunk_file)
            except OSError as cleanup_err:
                print("⚠️ Could not remove temp chunk:", chunk_file, cleanup_err)

    return jsonify({"text": extracted_text})



# Health check
@app.route("/ping-azure")
def ping_azure():
    """Probe the Azure endpoint with a GET request (5 second timeout).

    Returns ``{"status": <HTTP status code>}`` when a response arrives, or
    ``{"error": <message>}`` when the request raises.
    """
    try:
        status_code = requests.get(AZURE_ENDPOINT, timeout=5).status_code
    except Exception as exc:
        return {"error": str(exc)}
    return {"status": status_code}


if __name__ == "__main__":
    # The interactive Werkzeug debugger lets anyone who can reach the server
    # execute arbitrary code, so it must not be on by default while binding
    # to all interfaces. Opt in explicitly with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in (
        "1",
        "true",
        "yes",
        "on",
    )
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)