Spaces:
Sleeping
Sleeping
File size: 5,539 Bytes
a62e97e 05a982f 1e39f6a 05a982f 1e39f6a 05a982f 1e39f6a 74adb16 05a982f 1e39f6a 05a982f 74adb16 05a982f 1e39f6a 74adb16 05a982f 74adb16 05a982f 74adb16 05a982f 74adb16 05a982f 74adb16 05a982f 74adb16 05a982f 74adb16 ffdd6a7 05a982f ffdd6a7 05a982f ffdd6a7 05a982f 74adb16 05a982f 74adb16 ffdd6a7 05a982f 74adb16 05a982f 1e39f6a 74adb16 1e39f6a a62e97e 1e39f6a b56f8a6 1e39f6a b56f8a6 fa1c489 b56f8a6 ffdd6a7 1e39f6a 05a982f 1e39f6a a62e97e 74adb16 1e39f6a b56f8a6 74adb16 1e39f6a a62e97e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import os
import tempfile
import time
import requests
from flask import Flask, request, jsonify, send_from_directory
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader, PdfWriter
# --- Load env from Hugging Face Secrets ---
# Both Azure settings are required; abort at import time when either is absent
# so a misconfigured deployment fails immediately instead of on first upload.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_KEY = os.environ.get("AZURE_KEY")
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")
# Drop trailing slashes so API paths can be appended with a single "/".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

# Directory where uploaded files are written before being sent to OCR.
UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Lower-case file extensions accepted by the upload endpoint.
ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

app = Flask(__name__, static_folder="static", static_url_path="/static")
# --- Helpers ---
def allowed_file(filename):
    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition(".")
    # rpartition yields an empty separator when the name contains no dot.
    return dot == "." and extension.lower() in ALLOWED_EXTENSIONS
def read_file_bytes(path):
    """Return the full contents of the file at ``path`` as bytes."""
    with open(path, mode="rb") as handle:
        payload = handle.read()
    return payload
def submit_read_api(file_path, timeout=60):
    """Submit a file to the Azure Computer Vision Read (OCR) API.

    Args:
        file_path: Path of the image or PDF to upload.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, a stalled connection would block the worker
            indefinitely.

    Returns:
        The URL from the ``Operation-Location`` response header, which is
        polled to obtain the OCR result.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a non-2xx response.
        RuntimeError: If the response lacks an ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    print("➡️ Azure OCR request:", url)
    print("➡️ Status:", resp.status_code)
    print("➡️ Headers:", resp.headers)
    resp.raise_for_status()
    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
def poll_read_result(operation_location, timeout=180, interval=5.0, request_timeout=30):
    """Poll an OCR operation until it finishes and return the recognized text.

    HTTP 429 responses are retried with exponential backoff (capped at 30s);
    other request errors are logged and retried after ``interval`` seconds.

    Args:
        operation_location: URL of the operation to poll.
        timeout: Overall number of seconds to keep polling.
        interval: Seconds to wait between polls.
        request_timeout: Seconds to wait for each individual HTTP request.

    Returns:
        The text of every recognized line, joined with newlines.

    Raises:
        RuntimeError: If the operation reports ``failed`` or does not reach
            ``succeeded`` before ``timeout`` elapses.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    attempt = 0
    # Pre-bind these so the check after the loop cannot hit an unbound name
    # when every poll errored out or the deadline passed immediately.
    status = None
    j = None
    while time.monotonic() < deadline:
        try:
            r = requests.get(
                operation_location, headers=headers, timeout=request_timeout
            )
            if r.status_code == 429:
                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
                time.sleep(wait)
                attempt += 1
                continue
            r.raise_for_status()
            j = r.json()
            status = j.get("status", "").lower()
            print("📡 Polling Azure OCR:", status)
            if status in ("succeeded", "failed"):
                break
        except requests.exceptions.RequestException as e:
            print("⚠️ Polling error:", e)
        # Single pause per iteration, on both the success and the error path.
        time.sleep(interval)

    if status == "failed":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    if status != "succeeded":
        raise RuntimeError(
            f"OCR did not finish within {timeout}s. Status={status}, Response={j}"
        )

    results = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in results.get("readResults", [])
        for line in read_result.get("lines", [])
    ]
    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temporary PDF files of at most ``chunk_size`` pages.

    Args:
        pdf_path: Path of the source PDF.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        A list of paths to the chunk files, in page order. The files are
        created with ``delete=False``, so the caller is responsible for
        removing them.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    try:
        for start in range(0, total_pages, chunk_size):
            writer = PdfWriter()
            for p in range(start, min(start + chunk_size, total_pages)):
                writer.add_page(reader.pages[p])
            # Write through the temp file's own handle and let the context
            # manager close it, rather than leaking the handle and reopening
            # the file by name.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                chunk_files.append(tmp.name)
                writer.write(tmp)
    except Exception:
        # Do not leave partial output behind when a later chunk fails.
        for chunk_path in chunk_files:
            try:
                os.remove(chunk_path)
            except OSError as cleanup_error:
                print("⚠️ Could not remove temp file:", cleanup_error)
        raise
    return chunk_files
# --- Routes ---
@app.route("/")
def index():
    """Serve the front-end page, ``index.html``, from the ``static`` directory."""
    landing_page = send_from_directory("static", "index.html")
    return landing_page
@app.route("/upload", methods=["POST"])
def upload():
    """Accept an uploaded image or PDF, run OCR on it, and return the text.

    Expects a multipart form field named ``file``. PDFs are split into
    2-page chunks that are submitted one at a time; other files are
    submitted whole.

    Returns:
        ``{"text": ...}`` on success; ``{"error": ...}`` with status 400 for
        an invalid request, or with status 500 (plus ``details``) when saving
        or OCR fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "Empty filename"}), 400
    if not allowed_file(file.filename):
        return jsonify({"error": "File type not allowed"}), 400

    # Take the extension from the original name: secure_filename() can strip
    # the dot (e.g. "é.pdf" -> "pdf"), which would defeat PDF detection.
    extension = file.filename.rsplit(".", 1)[1].lower()

    # A unique server-generated name avoids collisions between concurrent
    # uploads and keeps client-supplied text out of the filesystem path.
    fd, path = tempfile.mkstemp(suffix=f".{extension}", dir=UPLOAD_DIR)
    os.close(fd)
    temp_paths = [path]
    try:
        file.save(path)
        if extension == "pdf":
            chunks = split_pdf_into_chunks(path, chunk_size=2)
            temp_paths.extend(chunks)
            merged_results = []
            for i, chunk_file in enumerate(chunks):
                print(f"📄 Processing chunk {i+1}/{len(chunks)}")
                op_location = submit_read_api(chunk_file)
                chunk_text = poll_read_result(op_location)
                merged_results.append(chunk_text)
                if i < len(chunks) - 1:
                    print("⏳ Sleeping 1s before next chunk...")
                    time.sleep(1)
            extracted_text = "\n\n".join(merged_results)
        else:
            op_location = submit_read_api(path)
            extracted_text = poll_read_result(op_location)
    except Exception as e:
        import traceback
        print("❌ OCR Error:", e)
        traceback.print_exc()
        return jsonify({"error": "OCR failed", "details": str(e)}), 500
    finally:
        # Best-effort cleanup so uploads and chunk files do not pile up.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError as cleanup_error:
                print("⚠️ Could not remove temp file:", cleanup_error)
    return jsonify({"text": extracted_text})
# Health check
@app.route("/ping-azure")
def ping_azure():
    """Check whether the configured Azure endpoint is reachable.

    Returns:
        ``{"status": <HTTP status code from Azure>}`` when the request
        completes, or ``{"error": <message>}`` with status 502 when the
        endpoint cannot be reached within 5 seconds, so callers can tell
        failure from success by status code.
    """
    try:
        r = requests.get(AZURE_ENDPOINT, timeout=5)
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}, 502
    return {"status": r.status_code}
if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution, so it must not be
    # on by default for a server bound to all interfaces. Opt in explicitly
    # with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|