"""Flask service that OCRs uploaded images and PDFs via the Azure Read API."""

import os
import shutil
import tempfile
import time
import traceback

import requests
from flask import Flask, request, jsonify, send_from_directory
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader, PdfWriter

# --- Load env from Hugging Face Secrets ---
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")

if not AZURE_ENDPOINT or not AZURE_KEY:
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")

AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

# Seconds to wait on any single HTTP call to Azure. Without a timeout,
# `requests` blocks forever on a stalled connection and pins the worker.
REQUEST_TIMEOUT = 60

app = Flask(__name__, static_folder="static", static_url_path="/static")


# --- Helpers ---
def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


def read_file_bytes(path):
    """Return the full contents of the file at *path* as bytes."""
    with open(path, "rb") as f:
        return f.read()


def submit_read_api(file_path):
    """Submit file to Azure Computer Vision OCR Read API.

    Args:
        file_path: Path of the image or PDF to send as the request body.

    Returns:
        The value of the ``Operation-Location`` response header, which is the
        URL later passed to ``poll_read_result``.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout, or
            a non-2xx response.
        RuntimeError: If the response carries no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)

    resp = requests.post(url, headers=headers, data=data, timeout=REQUEST_TIMEOUT)
    print("➡️ Azure OCR request:", url)
    print("➡️ Status:", resp.status_code)
    print("➡️ Headers:", resp.headers)
    resp.raise_for_status()

    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location


def poll_read_result(operation_location, timeout=180, interval=5.0):
    """Poll until OCR is finished, with retry/backoff on 429.

    Args:
        operation_location: URL returned by ``submit_read_api``.
        timeout: Overall budget in seconds for the whole polling loop.
        interval: Seconds to sleep between polls (also after a failed poll).

    Returns:
        The recognized text, one OCR line per text line, joined with ``"\\n"``.

    Raises:
        RuntimeError: If the operation reports ``failed`` or does not reach
            ``succeeded`` before *timeout* elapses.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    attempt = 0
    # Pre-bind so the post-loop check is well-defined even when no poll ever
    # produced a parsed response (all 429s, all errors, or timeout <= 0).
    status = None
    payload = None

    while time.monotonic() < deadline:
        try:
            r = requests.get(
                operation_location, headers=headers, timeout=REQUEST_TIMEOUT
            )
            if r.status_code == 429:
                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
                time.sleep(wait)
                attempt += 1
                continue

            r.raise_for_status()
            payload = r.json()
            status = payload.get("status", "").lower()
            print("📡 Polling Azure OCR:", status)

            if status in ("succeeded", "failed"):
                break
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an unparsable JSON body on requests versions
            # whose JSONDecodeError is not a RequestException subclass.
            print("⚠️ Polling error:", e)

        # Single sleep per iteration, on both the success and the error path.
        time.sleep(interval)

    if status == "failed":
        raise RuntimeError(f"OCR failed. Status={status}, Response={payload}")
    if status != "succeeded":
        raise RuntimeError(
            f"OCR did not finish within {timeout}s. "
            f"Status={status}, Response={payload}"
        )

    results = payload.get("analyzeResult", {})
    lines = []
    for read_result in results.get("readResults", []):
        lines.extend(line["text"] for line in read_result.get("lines", []))

    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)


def split_pdf_into_chunks(pdf_path, chunk_size=2, output_dir=None):
    """Split large PDF into smaller chunks for OCR.

    Args:
        pdf_path: Path of the source PDF.
        chunk_size: Maximum number of pages per chunk; must be at least 1.
        output_dir: Directory in which to create the chunk files. ``None``
            uses the default temporary directory.

    Returns:
        A list of paths to the chunk PDFs, in page order. The caller owns the
        files and is responsible for deleting them.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []

    try:
        for start in range(0, total_pages, chunk_size):
            writer = PdfWriter()
            for p in range(start, min(start + chunk_size, total_pages)):
                writer.add_page(reader.pages[p])

            # Write through the handle we already hold and let `with` close
            # it; delete=False keeps the file on disk for the caller.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=".pdf", dir=output_dir
            ) as tmp:
                chunk_files.append(tmp.name)
                writer.write(tmp)
    except Exception:
        # Don't orphan chunks already written if a later one fails.
        for chunk_path in chunk_files:
            try:
                os.remove(chunk_path)
            except OSError:
                pass  # best-effort cleanup; the original error matters more
        raise

    return chunk_files


def _ocr_file(path, is_pdf, work_dir):
    """Run OCR on the file at *path* and return the extracted text.

    PDFs are split into 2-page chunks (written under *work_dir*) that are
    submitted one after another; their texts are joined with blank lines.
    """
    if not is_pdf:
        return poll_read_result(submit_read_api(path))

    chunks = split_pdf_into_chunks(path, chunk_size=2, output_dir=work_dir)
    merged_results = []

    for i, chunk_file in enumerate(chunks):
        print(f"📄 Processing chunk {i+1}/{len(chunks)}")
        op_location = submit_read_api(chunk_file)
        merged_results.append(poll_read_result(op_location))

        if i < len(chunks) - 1:
            print("⏳ Sleeping 1s before next chunk...")
            time.sleep(1)

    return "\n\n".join(merged_results)


# --- Routes ---
@app.route("/")
def index():
    """Serve the static front-end page."""
    return send_from_directory("static", "index.html")


@app.route("/upload", methods=["POST"])
def upload():
    """Accept a file upload, OCR it, and return the text as JSON.

    Responds with ``{"text": ...}`` on success, a 400 JSON error for a
    missing/empty/disallowed file, and a 500 JSON error if OCR fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "Empty filename"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "File type not allowed"}), 400

    # Take the extension from the validated client name: secure_filename()
    # drops non-ASCII characters and leading dots, which can swallow the stem
    # and the dot (e.g. ".pdf" -> "pdf") and would defeat PDF detection.
    extension = file.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(file.filename)
    if not allowed_file(filename):
        filename = f"upload.{extension}"

    # Per-request directory: concurrent uploads with the same name cannot
    # overwrite each other, and a single rmtree removes the upload together
    # with every PDF chunk derived from it.
    work_dir = tempfile.mkdtemp(dir=UPLOAD_DIR)
    try:
        path = os.path.join(work_dir, filename)
        file.save(path)

        try:
            extracted_text = _ocr_file(path, extension == "pdf", work_dir)
        except Exception as e:
            print("❌ OCR Error:", e)
            traceback.print_exc()
            return jsonify({"error": "OCR failed", "details": str(e)}), 500
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    return jsonify({"text": extracted_text})


# Health check
@app.route("/ping-azure")
def ping_azure():
    """Issue a GET to the Azure endpoint and report the HTTP status or error."""
    try:
        r = requests.get(AZURE_ENDPOINT, timeout=5)
        return {"status": r.status_code}
    except Exception as e:
        return {"error": str(e)}


if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution, so it must never
    # be on by default for a server bound to all interfaces. Opt in explicitly
    # with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)