# extopen-src / app.py
# Gagandeep12's picture
# Update app.py
# fa1c489 verified
import os
import tempfile
import time
import requests
from flask import Flask, request, jsonify, send_from_directory
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader, PdfWriter
# --- Configuration (credentials are read from the environment, e.g. Hugging Face Secrets) ---
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_KEY = os.getenv("AZURE_KEY")

# Fail at import time when either credential is missing or empty.
if not (AZURE_ENDPOINT and AZURE_KEY):
    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")

# Drop trailing slashes so API paths can be appended with a single "/".
AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

# Directory that receives uploaded files; created on import if absent.
UPLOAD_DIR = "/tmp/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Lower-case file extensions (without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

app = Flask(__name__, static_folder="static", static_url_path="/static")
# --- Helpers ---
def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered, compared
    case-insensitively.  Names without a dot, names ending in a dot, the
    empty string, and non-string values such as ``None`` are all rejected
    with ``False`` rather than raising.
    """
    if not isinstance(filename, str):
        return False
    # rpartition yields an empty separator when there is no dot at all.
    _, dot, extension = filename.rpartition(".")
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def read_file_bytes(path):
    """Return the entire contents of the file at *path* as ``bytes``."""
    with open(path, mode="rb") as stream:
        payload = stream.read()
    return payload
def submit_read_api(file_path, timeout=60):
    """Submit a file to the Azure Computer Vision OCR Read API.

    Args:
        file_path: Path of the file whose bytes are posted as the request
            body.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The value of the ``Operation-Location`` response header, which is
        the URL handed to ``poll_read_result``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or ``raise_for_status`` rejects the response.
        RuntimeError: If the response has no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    print("➡️ Azure OCR request:", url)
    print("➡️ Status:", resp.status_code)
    print("➡️ Headers:", resp.headers)
    resp.raise_for_status()

    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
def poll_read_result(operation_location, timeout=180, interval=5.0, request_timeout=30):
    """Poll an OCR operation until it finishes and return the extracted text.

    Args:
        operation_location: URL to poll (the ``Operation-Location`` value
            returned when the file was submitted).
        timeout: Overall budget in seconds for the polling loop.
        interval: Seconds to sleep between polls.
        request_timeout: Seconds allowed for each individual HTTP request,
            so one stalled request cannot outlive the overall budget
            unnoticed.

    Returns:
        All recognised lines from every entry of ``readResults``, joined
        with newlines.

    Raises:
        RuntimeError: If the operation reports ``failed``, or if no
            terminal status was observed before *timeout* elapsed.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # Monotonic clock: immune to wall-clock adjustments while waiting.
    deadline = time.monotonic() + timeout
    attempt = 0
    # Defined up front so the checks after the loop are valid even when no
    # poll ever completed (all 429s / errors, or a non-positive timeout).
    status = "pending"
    j = None

    while time.monotonic() < deadline:
        try:
            r = requests.get(operation_location, headers=headers, timeout=request_timeout)
            if r.status_code == 429:
                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
                time.sleep(wait)
                attempt += 1
                continue
            r.raise_for_status()
            j = r.json()
            status = j.get("status", "").lower()
            print("📡 Polling Azure OCR:", status)
            if status in ("succeeded", "failed"):
                break
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print("⚠️ Polling error:", e)
            # Errors wait one extra interval on top of the regular sleep.
            time.sleep(interval)
        time.sleep(interval)

    if status == "failed":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
    if status != "succeeded":
        raise RuntimeError(
            f"OCR polling timed out after {timeout}s. Status={status}, Response={j}"
        )

    results = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in results.get("readResults", [])
        for line in read_result.get("lines", [])
    ]
    print(f"✅ Extracted {len(lines)} lines of text")
    return "\n".join(lines)
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into temporary PDF files of at most *chunk_size* pages.

    Args:
        pdf_path: Path of the PDF to split.
        chunk_size: Maximum number of pages per output file; must be >= 1.

    Returns:
        List of paths to the chunk files, in page order.  The files are
        created with ``delete=False``, so the caller is responsible for
        removing them.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    try:
        for start in range(0, total_pages, chunk_size):
            writer = PdfWriter()
            for p in range(start, min(start + chunk_size, total_pages)):
                writer.add_page(reader.pages[p])
            # Write through the handle we already hold and let the context
            # manager close it, instead of leaking it and reopening by name.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                chunk_files.append(tmp.name)
                writer.write(tmp)
    except Exception:
        # Do not leave partial output behind when splitting fails midway.
        for leftover in chunk_files:
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise
    return chunk_files
# --- Routes ---
@app.route("/")
def index():
    """Serve ``index.html`` from the ``static`` directory at the site root."""
    landing_page = "index.html"
    return send_from_directory("static", landing_page)
@app.route("/upload", methods=["POST"])
def upload():
    """Run OCR on an uploaded image or PDF and return the extracted text.

    Expects a multipart form with a ``file`` part.  PDFs are split into
    two-page chunks that are submitted one after another; other files are
    submitted as-is.

    Returns:
        ``{"text": ...}`` on success; ``{"error": ...}`` with status 400 for
        a missing part, empty filename or disallowed type; and
        ``{"error": "OCR failed", "details": ...}`` with status 500 when
        OCR raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    uploaded = request.files["file"]
    # `not` also rejects a filename of None, not only "".
    if not uploaded.filename:
        return jsonify({"error": "Empty filename"}), 400
    if not allowed_file(uploaded.filename):
        return jsonify({"error": "File type not allowed"}), 400

    # allowed_file() guarantees a dot followed by an accepted extension, so
    # the PDF decision below is made on exactly what was validated.
    extension = uploaded.filename.rsplit(".", 1)[1].lower()

    # Store under a unique server-chosen name: the client-supplied name is
    # never used as a path, and simultaneous uploads that share a filename
    # cannot overwrite each other.
    fd, path = tempfile.mkstemp(suffix=f".{extension}", dir=UPLOAD_DIR)
    os.close(fd)
    chunks = []
    try:
        uploaded.save(path)
        try:
            if extension == "pdf":
                chunks = split_pdf_into_chunks(path, chunk_size=2)
                merged_results = []
                for i, chunk_file in enumerate(chunks):
                    print(f"📄 Processing chunk {i+1}/{len(chunks)}")
                    op_location = submit_read_api(chunk_file)
                    chunk_text = poll_read_result(op_location)
                    merged_results.append(chunk_text)
                    if i < len(chunks) - 1:
                        print("⏳ Sleeping 1s before next chunk...")
                        time.sleep(1)
                extracted_text = "\n\n".join(merged_results)
            else:
                op_location = submit_read_api(path)
                extracted_text = poll_read_result(op_location)
        except Exception as e:
            import traceback
            print("❌ OCR Error:", e)
            traceback.print_exc()
            return jsonify({"error": "OCR failed", "details": str(e)}), 500
        return jsonify({"text": extracted_text})
    finally:
        # Remove the upload and any chunk files whether or not OCR succeeded,
        # so repeated requests do not fill the disk.
        for leftover in [path, *chunks]:
            try:
                os.remove(leftover)
            except OSError:
                pass
# Health check
@app.route("/ping-azure")
def ping_azure():
    """Report whether the configured Azure endpoint answers HTTP requests.

    Returns:
        ``{"status": <code>}`` with the HTTP status code Azure replied with
        when any response was received.  When the request itself fails
        (connection error, timeout, ...), ``{"error": <message>}`` with HTTP
        status 502, so status-code-based monitors do not mistake the failure
        for a healthy reply.
    """
    try:
        r = requests.get(AZURE_ENDPOINT, timeout=5)
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}, 502
    return {"status": r.status_code}
if __name__ == "__main__":
    # Debug mode turns on the interactive debugger, which lets anyone who can
    # reach the server execute code.  Because the app binds to 0.0.0.0, keep
    # it off unless explicitly requested through FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)