Spaces:
Sleeping
Sleeping
| import os | |
| import tempfile | |
| import time | |
| import requests | |
| from flask import Flask, request, jsonify, send_from_directory | |
| from werkzeug.utils import secure_filename | |
| from PyPDF2 import PdfReader, PdfWriter | |
# --- Configuration: Azure credentials are read from environment variables
# --- (on Hugging Face these are supplied through Secrets) ---
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_KEY = os.getenv("AZURE_KEY")
if not (AZURE_ENDPOINT and AZURE_KEY):
    # Fail at import time: nothing below can work without both values.
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")

# Strip trailing slashes so request paths can be appended with a single "/".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

# Scratch directory that receives uploaded files; created on start-up.
UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Lower-case extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

app = Flask(__name__, static_folder="static", static_url_path="/static")
| # --- Helpers --- | |
def allowed_file(filename):
    """Return True when *filename* carries an extension in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last "." and is compared
    case-insensitively. A name that contains no "." is rejected.
    """
    if "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
def read_file_bytes(path):
    """Return the complete contents of the file at *path* as bytes."""
    with open(path, mode="rb") as source:
        payload = source.read()
    return payload
def submit_read_api(file_path, timeout=60):
    """Submit a file to the Azure Computer Vision OCR Read API.

    Args:
        file_path: Path of the file to upload. It is read fully into memory
            and sent as an ``application/octet-stream`` body.
        timeout: Seconds to wait for Azure before giving up on the request.
            Without a timeout ``requests`` waits indefinitely, so a stalled
            connection would block the calling worker forever.

    Returns:
        The value of the ``Operation-Location`` response header, i.e. the URL
        that is later passed to ``poll_read_result``.

    Raises:
        requests.exceptions.RequestException: On a network error, a timeout,
            or an HTTP error status (via ``raise_for_status``).
        RuntimeError: If the response has no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    # Only response metadata is logged; the subscription key lives in the
    # request headers and is never printed.
    print("➡️ Azure OCR request:", url)
    print("➡️ Status:", resp.status_code)
    print("➡️ Headers:", resp.headers)
    resp.raise_for_status()
    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
def poll_read_result(operation_location, timeout=180, interval=5.0):
    """Poll an Azure Read operation until it finishes and return its text.

    Args:
        operation_location: URL to poll (the ``Operation-Location`` value
            returned when the file was submitted).
        timeout: Overall number of seconds to keep polling.
        interval: Seconds to sleep between polls.

    Returns:
        All recognised lines, in the order they appear in the response,
        joined with newlines.

    Raises:
        RuntimeError: If the operation does not reach the ``succeeded``
            status before *timeout* elapses (including the case where no
            poll ever returned a usable response).
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    deadline = time.time() + timeout
    attempt = 0
    # Bound before the loop so the check after it always has values to
    # report, even when every poll was throttled or failed.
    status = "unknown"
    j = None
    while time.time() < deadline:
        try:
            # Cap each request so one hung connection cannot outlive the
            # overall deadline by an unbounded amount.
            remaining = deadline - time.time()
            r = requests.get(
                operation_location,
                headers=headers,
                timeout=max(1.0, min(30.0, remaining)),
            )
            if r.status_code == 429:
                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
                time.sleep(wait)
                attempt += 1
                continue
            r.raise_for_status()
            j = r.json()
            status = j.get("status", "").lower()
            print("📡 Polling Azure OCR:", status)
            if status in ("succeeded", "failed"):
                break
        except requests.exceptions.RequestException as e:
            # Transient network/HTTP problems are logged and retried until
            # the deadline passes.
            print("⚠️ Polling error:", e)
        # Exactly one pause per iteration, on both the normal and error path.
        time.sleep(interval)
    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    results = j.get("analyzeResult", {})
    lines = []
    for read_result in results.get("readResults", []):
        lines.extend(line["text"] for line in read_result.get("lines", []))
    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into consecutive smaller PDFs for OCR.

    Args:
        pdf_path: Path of the source PDF.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        A list of paths to newly created temporary ``.pdf`` files, in page
        order. The files are not deleted automatically; the caller is
        responsible for removing them.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    try:
        for start in range(0, total_pages, chunk_size):
            writer = PdfWriter()
            for p in range(start, min(start + chunk_size, total_pages)):
                writer.add_page(reader.pages[p])
            # Write through the temp-file handle itself and let the context
            # manager close it; delete=False keeps the file on disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                chunk_files.append(tmp.name)
                writer.write(tmp)
    except Exception:
        # Do not leave partial output behind when splitting fails midway.
        for leftover in chunk_files:
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise
    return chunk_files
| # --- Routes --- | |
def index():
    """Serve the front-end page ``index.html`` from the ``static`` directory."""
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file — confirm how it is registered with `app`.
    page_name = "index.html"
    return send_from_directory("static", page_name)
def _remove_quietly(path):
    """Delete *path*, ignoring errors such as the file already being gone."""
    try:
        os.remove(path)
    except OSError:
        pass


def upload():
    """Accept an uploaded image or PDF, run Azure OCR on it, return the text.

    Reads the ``file`` part of the current request. PDFs are split into
    2-page chunks that are OCR'd one after another and joined with blank
    lines; other allowed files are sent as-is.

    Returns:
        ``{"text": ...}`` on success; ``{"error": ...}`` with status 400 for
        a missing part, empty filename, or disallowed extension; and
        ``{"error": "OCR failed", "details": ...}`` with status 500 when OCR
        raises.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file — confirm how it is registered with `app`.
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "Empty filename"}), 400
    if not allowed_file(file.filename):
        return jsonify({"error": "File type not allowed"}), 400

    # allowed_file() has verified that a "." and an accepted extension exist.
    # Take the extension from the original name: sanitising the name first can
    # drop the stem and the dot together, which would hide the ".pdf" suffix.
    extension = file.filename.rsplit(".", 1)[1].lower()

    # Save under a server-generated unique name so the client-supplied name
    # never reaches the filesystem and simultaneous uploads cannot overwrite
    # each other.
    with tempfile.NamedTemporaryFile(
        delete=False, dir=UPLOAD_DIR, suffix=f".{extension}"
    ) as tmp:
        path = tmp.name

    chunks = []
    try:
        file.save(path)
        try:
            if extension == "pdf":
                chunks = split_pdf_into_chunks(path, chunk_size=2)
                merged_results = []
                for i, chunk_file in enumerate(chunks):
                    print(f"📄 Processing chunk {i+1}/{len(chunks)}")
                    op_location = submit_read_api(chunk_file)
                    chunk_text = poll_read_result(op_location)
                    merged_results.append(chunk_text)
                    if i < len(chunks) - 1:
                        # Short pause between submissions to ease rate limits.
                        print("⏳ Sleeping 1s before next chunk...")
                        time.sleep(1)
                extracted_text = "\n\n".join(merged_results)
            else:
                op_location = submit_read_api(path)
                extracted_text = poll_read_result(op_location)
        except Exception as e:
            # Request boundary: log the full traceback and report a JSON error
            # instead of letting the exception escape.
            import traceback
            print("❌ OCR Error:", e)
            traceback.print_exc()
            return jsonify({"error": "OCR failed", "details": str(e)}), 500
        return jsonify({"text": extracted_text})
    finally:
        # The upload and its chunk files are scratch data; remove them on
        # every exit path so the upload directory does not grow without bound.
        for leftover in [path, *chunks]:
            _remove_quietly(leftover)
| # Health check | |
def ping_azure():
    """Health check: issue a GET to AZURE_ENDPOINT with a 5-second timeout.

    Returns ``{"status": <HTTP status code>}`` when a response arrives, or
    ``{"error": <message>}`` when the request raises.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file — confirm how (or whether) it is exposed.
    try:
        response = requests.get(AZURE_ENDPOINT, timeout=5)
    except Exception as exc:
        return {"error": str(exc)}
    else:
        return {"status": response.status_code}
if __name__ == "__main__":
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Because the app binds
    # to all interfaces, debug is off unless explicitly requested through the
    # FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)