Spaces:

Gagandeep12
/

extopen-src

Sleeping

App Files Files Community

extopen-src / app.py

Gagandeep12

Update app.py

fa1c489 verified 3 months ago

raw

history blame contribute delete

5.54 kB

	import os
	import tempfile
	import time
	import requests
	from flask import Flask, request, jsonify, send_from_directory
	from werkzeug.utils import secure_filename
	from PyPDF2 import PdfReader, PdfWriter

	# --- Configuration (supplied through Hugging Face Secrets) ---
	AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
	AZURE_KEY = os.getenv("AZURE_KEY")

	# Fail at import time: nothing below can work without both settings.
	if not (AZURE_ENDPOINT and AZURE_KEY):
	    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY as environment variables")

	# Trailing slashes are dropped so request paths can be appended with "/".
	AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")

	# Directory that receives uploaded files; created on import if missing.
	UPLOAD_DIR = "/tmp/uploads"
	os.makedirs(UPLOAD_DIR, exist_ok=True)

	# Lower-case file extensions accepted by allowed_file().
	ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}

	# Flask application; files under ./static are served at /static.
	app = Flask(__name__, static_folder="static", static_url_path="/static")


	# --- Helpers ---
	def allowed_file(filename):
	    """Tell whether *filename* carries one of the accepted extensions.

	    The text after the last "." is lower-cased and looked up in
	    ``ALLOWED_EXTENSIONS``; a name without any "." is rejected.
	    """
	    if "." not in filename:
	        return False
	    _, extension = filename.rsplit(".", 1)
	    return extension.lower() in ALLOWED_EXTENSIONS


	def read_file_bytes(path):
	    """Return the complete contents of the file at *path* as bytes."""
	    with open(path, mode="rb") as stream:
	        payload = stream.read()
	    return payload


	def submit_read_api(file_path, timeout=60):
	    """Submit a file to the Azure Computer Vision OCR Read API.

	    Args:
	        file_path: Path of the file whose raw bytes are posted to Azure.
	        timeout: Seconds ``requests`` waits on the connection before
	            giving up.

	    Returns:
	        The value of the ``Operation-Location`` response header, i.e. the
	        URL to poll for the OCR result.

	    Raises:
	        requests.exceptions.RequestException: On network failure, timeout,
	            or an HTTP error status.
	        RuntimeError: If the response has no ``Operation-Location`` header.
	    """
	    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
	    headers = {
	        "Ocp-Apim-Subscription-Key": AZURE_KEY,
	        "Content-Type": "application/octet-stream",
	    }
	    data = read_file_bytes(file_path)

	    # requests never times out on its own; without an explicit limit a
	    # stalled connection would block this worker indefinitely.
	    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
	    print("➡️ Azure OCR request:", url)
	    print("➡️ Status:", resp.status_code)
	    print("➡️ Headers:", resp.headers)

	    resp.raise_for_status()
	    op_location = resp.headers.get("Operation-Location")
	    if not op_location:
	        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
	    return op_location


	def poll_read_result(operation_location, timeout=180, interval=5.0):
	    """Poll an Azure Read operation until it finishes and return its text.

	    Args:
	        operation_location: URL returned by ``submit_read_api``.
	        timeout: Overall number of seconds to keep polling.
	        interval: Seconds to sleep between polls.

	    Returns:
	        The recognised lines of every page, joined with newlines.

	    Raises:
	        RuntimeError: If the operation reports ``failed`` or has not
	            reported ``succeeded`` before the deadline (including the case
	            where no poll ever completed).
	    """
	    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
	    deadline = time.time() + timeout
	    attempt = 0
	    # Pre-set so the failure report below works even when no poll completed
	    # (deadline already passed, or every request raised).
	    status = "unknown"
	    j = None

	    while time.time() < deadline:
	        try:
	            # Per-request timeout so a stalled connection cannot outlive
	            # the overall deadline indefinitely.
	            r = requests.get(operation_location, headers=headers, timeout=30)
	            if r.status_code == 429:
	                wait = min(2 ** attempt, 30)  # exponential backoff, max 30s
	                print(f"⚠️ Got 429 Too Many Requests. Waiting {wait}s...")
	                time.sleep(wait)
	                attempt += 1
	                continue

	            # Any non-throttled response ends the current backoff streak.
	            attempt = 0
	            r.raise_for_status()
	            j = r.json()
	            # "or" guards against an explicit null status in the payload.
	            status = (j.get("status") or "").lower()
	            print("📡 Polling Azure OCR:", status)
	            if status in ("succeeded", "failed"):
	                break

	        except requests.exceptions.RequestException as e:
	            # Treated as transient: retry after the regular interval below
	            # (a single sleep, not one here plus one at the loop bottom).
	            print("⚠️ Polling error:", e)

	        time.sleep(interval)

	    if status != "succeeded":
	        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

	    results = j.get("analyzeResult", {})
	    lines = [
	        line["text"]
	        for read_result in results.get("readResults", [])
	        for line in read_result.get("lines", [])
	    ]

	    print(f"✅ Extracted {len(lines)} lines of text")
	    return "\n".join(lines)



	def split_pdf_into_chunks(pdf_path, chunk_size=2):
	    """Split a PDF into temporary files of at most *chunk_size* pages each.

	    Args:
	        pdf_path: Path of the source PDF.
	        chunk_size: Maximum number of pages per chunk; must be >= 1.

	    Returns:
	        Paths of the chunk files in page order. The files are created with
	        ``delete=False``, so the caller is responsible for removing them.

	    Raises:
	        ValueError: If *chunk_size* is smaller than 1.
	    """
	    # A negative step would make range() yield nothing and silently return
	    # no chunks; reject it explicitly (0 already raised ValueError).
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be a positive integer")

	    reader = PdfReader(pdf_path)
	    total_pages = len(reader.pages)
	    chunk_files = []
	    for start in range(0, total_pages, chunk_size):
	        writer = PdfWriter()
	        for p in range(start, min(start + chunk_size, total_pages)):
	            writer.add_page(reader.pages[p])
	        # Write through the temp-file handle itself and let "with" close it;
	        # reopening by name left the original descriptor open (one leaked
	        # descriptor per chunk).
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
	            writer.write(tmp)
	        chunk_files.append(tmp.name)
	    return chunk_files


	# --- Routes ---
	@app.route("/")
	def index():
	    """Serve ``index.html`` from the ``static`` directory as the landing page."""
	    landing_page = send_from_directory("static", "index.html")
	    return landing_page


	def _remove_files(paths):
	    """Best-effort removal of temporary files; failures are only logged."""
	    for leftover in paths:
	        try:
	            os.remove(leftover)
	        except OSError as e:
	            print("⚠️ Could not remove temp file:", leftover, e)


	@app.route("/upload", methods=["POST"])
	def upload():
	    """Run OCR on the uploaded file and return the extracted text as JSON.

	    Expects a multipart form field named ``file``. PDFs are split into
	    2-page chunks that are submitted to Azure one after another; any other
	    allowed file is submitted whole.

	    Returns:
	        ``{"text": ...}`` on success, ``{"error": ...}`` with status 400 for
	        a rejected upload, or ``{"error": "OCR failed", "details": ...}``
	        with status 500 when OCR raises.
	    """
	    if "file" not in request.files:
	        return jsonify({"error": "No file part"}), 400
	    file = request.files["file"]
	    # "not" also covers a missing (None) filename, which would otherwise
	    # crash allowed_file().
	    if not file.filename:
	        return jsonify({"error": "Empty filename"}), 400
	    if not allowed_file(file.filename):
	        return jsonify({"error": "File type not allowed"}), 400

	    # allowed_file() has verified that a "." exists and that the extension is
	    # whitelisted, so it is safe to reuse as the temp-file suffix.
	    extension = file.filename.rsplit(".", 1)[1].lower()

	    # Save under a generated unique name instead of the client-supplied one:
	    # a sanitised name can come back empty or without its extension, and
	    # identical names from concurrent requests would overwrite each other.
	    fd, path = tempfile.mkstemp(dir=UPLOAD_DIR, suffix=f".{extension}")
	    os.close(fd)
	    temp_files = [path]

	    try:
	        file.save(path)

	        try:
	            if extension == "pdf":
	                chunks = split_pdf_into_chunks(path, chunk_size=2)
	                temp_files.extend(chunks)
	                merged_results = []
	                for i, chunk_file in enumerate(chunks):
	                    print(f"📄 Processing chunk {i+1}/{len(chunks)}")
	                    op_location = submit_read_api(chunk_file)
	                    chunk_text = poll_read_result(op_location)
	                    merged_results.append(chunk_text)
	                    if i < len(chunks) - 1:
	                        print("⏳ Sleeping 1s before next chunk...")
	                        time.sleep(1)

	                extracted_text = "\n\n".join(merged_results)
	            else:
	                op_location = submit_read_api(path)
	                extracted_text = poll_read_result(op_location)
	        except Exception as e:
	            import traceback

	            print("❌ OCR Error:", e)
	            traceback.print_exc()
	            return jsonify({"error": "OCR failed", "details": str(e)}), 500

	        return jsonify({"text": extracted_text})
	    finally:
	        # The upload and its PDF chunks are only needed for this request;
	        # leaving them behind would fill the temp directory over time.
	        _remove_files(temp_files)



	# Health check
	@app.route("/ping-azure")
	def ping_azure():
	    """Report whether the configured Azure endpoint answers a GET request.

	    Returns ``{"status": <HTTP status code>}`` when a response arrives within
	    5 seconds, otherwise ``{"error": <exception text>}``.
	    """
	    try:
	        response = requests.get(AZURE_ENDPOINT, timeout=5)
	    except Exception as exc:
	        return {"error": str(exc)}
	    return {"status": response.status_code}


	if __name__ == "__main__":
	    # The interactive debugger lets anyone who can reach the server execute
	    # arbitrary Python, and this server binds to all interfaces. Keep it off
	    # unless explicitly requested (e.g. FLASK_DEBUG=1 for local development).
	    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
	        "1",
	        "true",
	        "yes",
	        "on",
	    }
	    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)