Spaces:

iplotnor
/

agent-bya

Sleeping

App Files Files Community

agent-bya / app.py

iplotnor

Update app.py

27b9862 verified 8 months ago

raw

history blame contribute delete

21.9 kB

	import html
	import json
	import math
	import os
	from datetime import datetime
	from typing import Dict, Any, Tuple, Optional

	import fitz # PyMuPDF
	import gradio as gr
	import requests


	# The OpenAI key is taken from the process environment and nowhere else;
	# it is None when the variable is not set.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


	class NorwayPlanExtractor:
	    """Extract numeric planning rules from a Norwegian plan PDF via the OpenAI chat API.

	    The PDF text is pulled out page by page with PyMuPDF, sent to the chat
	    completions endpoint together with a fixed system prompt, and the JSON
	    answer is optionally enriched with plot-utilization percentages.
	    """

	    # Single source of truth for the model name (request payload and processing info).
	    MODEL = "gpt-4o-mini"

	    def __init__(self, openai_api_key: str):
	        """Store the API key, the endpoint URL and the extraction system prompt."""
	        self.openai_api_key = openai_api_key
	        self.openai_url = "https://api.openai.com/v1/chat/completions"

	        self.system_prompt = """You are a Norwegian planning document expert. Extract ONLY building/planning rules with numeric values.
	Return ONLY valid JSON. Convert decimal commas to dots (6,5 → 6.5). Use null for missing fields.

	FOCUS ONLY ON: BYA, building heights, setbacks, parking, and area limits. IGNORE: procedures, general text, background info.

	JSON Structure:
	{
	"plan_metadata": {
	"plan_id": "string or null",
	"plan_name": "string or null",
	"municipality": "string or null"
	},
	"zones": [
	{
	"name": "string",
	"rules": {
	"bya": {
	"type": "absolute_m2",
	"total_value": "number",
	"breakdown": [
	{"component": "main_building", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"},
	{"component": "parking", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
	]
	},
	"heights": {
	"rules": [
	{"type": "max_height", "value": "number", "unit": "m", "applies_to": "main_building", "page": "number", "quote": "exact Norwegian text"}
	]
	},
	"parking": {
	"rules": [
	{"type": "required_spaces", "value": "number", "unit": "spaces", "page": "number", "quote": "exact Norwegian text"},
	{"type": "space_size", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
	]
	},
	"setbacks": {
	"rules": [
	{"type": "property_line", "value": "number", "unit": "m", "page": "number", "quote": "exact Norwegian text"}
	]
	},
	"areas": {
	"rules": [
	{"type": "max_fenced", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
	]
	}
	},
	"derived": {
	"plot_utilization_percent_gross": "number or null",
	"plot_utilization_percent_net_buildings": "number or null"
	}
	}
	]
	}

	Extract EVERY numeric rule with its exact Norwegian quote and page number. Group related rules together."""

	    # ---- PDF helpers ----

	    @staticmethod
	    def _document_text(doc) -> str:
	        """Join the text of every page of an open document, each preceded by a "--- PAGE n ---" marker."""
	        return "".join(
	            f"\n--- PAGE {number} ---\n{page.get_text()}"
	            for number, page in enumerate(doc, start=1)
	        )

	    @staticmethod
	    def _has_extractable_text(text: str) -> bool:
	        """Return True when text holds anything besides whitespace and page markers.

	        The page markers are always emitted, so a plain emptiness test on the
	        combined text never detects an image-only (scanned) PDF.
	        """
	        return any(
	            line.strip() and not (line.startswith("--- PAGE ") and line.endswith(" ---"))
	            for line in text.splitlines()
	        )

	    @staticmethod
	    def _as_number(value: Any) -> Optional[float]:
	        """Return value when it is an int/float (bool excluded), otherwise None."""
	        if isinstance(value, bool) or not isinstance(value, (int, float)):
	            return None
	        return value

	    def extract_text_from_pdf_url(self, pdf_url: str) -> str:
	        """Download the PDF at pdf_url and return its text with page markers.

	        Raises:
	            Exception: on any download or parsing failure (original error chained).
	        """
	        try:
	            response = requests.get(pdf_url, timeout=30)
	            response.raise_for_status()
	            # The context manager closes the document even if a page fails to parse.
	            with fitz.open(stream=response.content, filetype="pdf") as doc:
	                return self._document_text(doc)
	        except Exception as e:
	            raise Exception(f"Failed to extract PDF text: {str(e)}") from e

	    def extract_text_from_pdf_file(self, pdf_file_path: str) -> str:
	        """Open the PDF at pdf_file_path and return its text with page markers.

	        Raises:
	            Exception: on any open or parsing failure (original error chained).
	        """
	        try:
	            with fitz.open(pdf_file_path) as doc:
	                return self._document_text(doc)
	        except Exception as e:
	            raise Exception(f"Failed to extract PDF text: {str(e)}") from e

	    # ---- LLM call ----

	    def call_openai_api(self, text: str, plot_size_m2: Optional[float] = None) -> Dict[str, Any]:
	        """Send the document text to the chat completions endpoint and parse the JSON reply.

	        Args:
	            text: Extracted PDF text.
	            plot_size_m2: Optional plot size; when given it is mentioned in the
	                request and the reply is post-processed with utilization percentages.

	        Returns:
	            The decoded JSON object produced by the model.

	        Raises:
	            Exception: on HTTP, decoding or post-processing failure (original error chained).
	        """
	        headers = {
	            "Authorization": f"Bearer {self.openai_api_key}",
	            "Content-Type": "application/json",
	        }

	        user_content = (
	            "Extract ONLY numeric planning rules from this Norwegian document. "
	            "Focus on BYA, heights, parking, setbacks, area limits:\n\n" + text
	        )

	        if plot_size_m2:
	            user_content += f"\n\nPLOT SIZE: {plot_size_m2} m² - Calculate utilization percentages."

	        payload = {
	            "model": self.MODEL,
	            "messages": [
	                {"role": "system", "content": self.system_prompt},
	                {"role": "user", "content": user_content},
	            ],
	            "temperature": 0,
	            "max_tokens": 3000,
	            "response_format": {"type": "json_object"},
	        }

	        try:
	            response = requests.post(self.openai_url, headers=headers, json=payload, timeout=60)
	            response.raise_for_status()
	            result = response.json()
	            content = result["choices"][0]["message"]["content"]
	            extracted_data = json.loads(content)

	            if plot_size_m2:
	                extracted_data = self._post_process_calculations(extracted_data, plot_size_m2)

	            return extracted_data
	        except Exception as e:
	            raise Exception(f"OpenAI API call failed: {str(e)}") from e

	    def _post_process_calculations(self, data: Dict[str, Any], plot_size_m2: float) -> Dict[str, Any]:
	        """Fill each zone's "derived" percentages from its BYA total and the plot size.

	        The gross figure uses the total BYA; the net figure subtracts the first
	        breakdown component whose name contains "parking". The prompt tells the
	        model to use null for missing fields, so null or non-numeric entries
	        are tolerated: such zones are simply left without percentages.

	        Returns:
	            The same data object, mutated in place.
	        """
	        for zone in data.get("zones") or []:
	            if not isinstance(zone, dict):
	                continue

	            derived = zone.get("derived")
	            if not isinstance(derived, dict):
	                derived = {}
	                zone["derived"] = derived

	            rules = zone.get("rules")
	            bya = rules.get("bya") if isinstance(rules, dict) else None
	            if not isinstance(bya, dict):
	                continue

	            total_bya = self._as_number(bya.get("total_value"))
	            if not total_bya or not plot_size_m2:
	                continue

	            parking_area = 0
	            for item in bya.get("breakdown") or []:
	                if not isinstance(item, dict):
	                    continue
	                if "parking" in str(item.get("component") or "").lower():
	                    parking_area = self._as_number(item.get("value")) or 0
	                    break

	            net_building_area = total_bya - parking_area
	            gross_percent = (total_bya / plot_size_m2) * 100
	            net_percent = (net_building_area / plot_size_m2) * 100

	            derived["plot_utilization_percent_gross"] = round(gross_percent, 1)
	            derived["plot_utilization_percent_net_buildings"] = round(net_percent, 1)
	        return data

	    def process_document(self, pdf_input: str, input_type: str, plot_size_m2: Optional[float] = None) -> Tuple[Dict[str, Any], str]:
	        """Run the full pipeline: extract text, call the model, attach processing info.

	        Args:
	            pdf_input: A URL when input_type is "url", otherwise a local file path.
	            input_type: "url" selects the download path; any other value reads a file.
	            plot_size_m2: Optional plot size forwarded to the model call.

	        Returns:
	            (data, "Success") on success, or ({}, "Processing failed: ...") on any
	            error. This method never raises.
	        """
	        try:
	            if input_type == "url":
	                pdf_text = self.extract_text_from_pdf_url(pdf_input)
	                source = f"URL: {pdf_input}"
	            else:
	                pdf_text = self.extract_text_from_pdf_file(pdf_input)
	                source = f"File: {pdf_input}"

	            # Ignore the page markers: they are present even when no page had any text.
	            if not self._has_extractable_text(pdf_text):
	                raise Exception("No text could be extracted from PDF")

	            extracted_data = self.call_openai_api(pdf_text, plot_size_m2)
	            extracted_data["_processing_info"] = {
	                "processed_at": datetime.now().isoformat(),
	                "source": source,
	                "text_length": len(pdf_text),
	                "model_used": self.MODEL,
	                "plot_size_m2": plot_size_m2,
	            }
	            return extracted_data, "Success"
	        except Exception as e:
	            return {}, f"Processing failed: {str(e)}"


	# ---------------- UI summary helper (unchanged) ----------------
	def _summary_text(value: Any, default: str = "N/A") -> str:
	    """Return value as HTML-escaped text, or default when the value is None."""
	    return default if value is None else html.escape(str(value))


	def _summary_label(value: Any) -> str:
	    """Turn a snake_case identifier into an escaped Title Case label ("N/A" when None)."""
	    if value is None:
	        return "N/A"
	    return html.escape(str(value).replace("_", " ").title())


	def _summary_amount(value: Any, unit: Any) -> str:
	    """Format "value unit" as escaped text; a missing value renders as "N/A"."""
	    if value is None or value == "N/A":
	        return "N/A"
	    return html.escape(f"{value} {unit or ''}".rstrip())


	def _summary_source(rule: Dict[str, Any]) -> str:
	    """Build the "Page n: quote" source cell for one rule, with both parts escaped."""
	    page = _summary_text(rule.get("page"))
	    quote = _summary_text(rule.get("quote"))
	    return f"<b>Page {page}:</b> \"{quote}\""


	def _summary_rules(section: Any) -> list:
	    """Return the dict entries of section["rules"], tolerating missing or malformed data."""
	    if not isinstance(section, dict):
	        return []
	    return [rule for rule in (section.get("rules") or []) if isinstance(rule, dict)]


	def _summary_table(headers, body_rows, border: str, head_bg: str, table_extra: str = "") -> str:
	    """Render one rules table.

	    headers is a sequence of (title, text-align) pairs; body_rows is a sequence
	    of rows, each a sequence of (extra cell CSS, already-escaped cell HTML) pairs.
	    """
	    head_cells = "".join(
	        f"<th style='padding: 10px; text-align: {align}; border: 1px solid {border};'>{title}</th>"
	        for title, align in headers
	    )
	    out = [
	        f"<table style='width: 100%; border-collapse: collapse;{table_extra}'>",
	        f"<thead><tr style='background-color: {head_bg};'>{head_cells}</tr></thead>",
	        "<tbody>",
	    ]
	    for cells in body_rows:
	        row_cells = "".join(
	            f"<td style='padding: 8px; border: 1px solid {border};{css}'>{cell}</td>"
	            for css, cell in cells
	        )
	        out.append(f"<tr style='border-bottom: 1px solid {border};'>{row_cells}</tr>")
	    out.append("</tbody></table>")
	    return "".join(out)


	def create_summary_html(data: Dict[str, Any]) -> str:
	    """Render the extraction result as an HTML summary.

	    Shows the plot size (when present in "_processing_info"), the plan
	    metadata, and per zone the BYA, parking and height rules, followed by a
	    processing footer. Every value taken from data is HTML-escaped because it
	    originates from model output over an arbitrary PDF; null fields render as
	    "N/A" instead of raising.

	    Returns:
	        The HTML string, or "<p>No data to display</p>" when data has no zones.
	    """
	    if not data or not data.get("zones"):
	        return "<p>No data to display</p>"

	    parts = ["<div style='font-family: Arial, sans-serif;'>"]

	    proc_info = data.get("_processing_info")
	    if not isinstance(proc_info, dict):
	        proc_info = {}
	    plot_size = proc_info.get("plot_size_m2")
	    # Only genuine, non-zero numbers can go through the ",.0f" format below.
	    has_plot_size = (
	        isinstance(plot_size, (int, float))
	        and not isinstance(plot_size, bool)
	        and bool(plot_size)
	    )

	    if has_plot_size:
	        parts.append("<h3>📐 Plot Information</h3>")
	        parts.append("<div style='background-color: #e8f4fd; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>")
	        parts.append(f"<p><b>Plot Size:</b> {plot_size:,.0f} m²</p>")
	        parts.append("</div>")

	    plan_meta = data.get("plan_metadata")
	    if isinstance(plan_meta, dict) and any(plan_meta.values()):
	        parts.append("<h3>📋 Plan Information</h3>")
	        parts.append("<div style='background-color: #f8f9fa; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>")
	        for key, value in plan_meta.items():
	            if value:
	                parts.append(f"<p><b>{_summary_label(key)}:</b> {_summary_text(value)}</p>")
	        parts.append("</div>")

	    zones = [zone for zone in data.get("zones") if isinstance(zone, dict)]
	    for i, zone in enumerate(zones, 1):
	        parts.append("<div style='border: 2px solid #2c5282; margin: 20px 0; padding: 20px; border-radius: 10px; background-color: #fafafa;'>")
	        parts.append(f"<h3 style='margin-top: 0; color: #2c5282;'>🏗️ Zone {i}: {_summary_text(zone.get('name'), 'Unnamed')}</h3>")

	        rules = zone.get("rules")
	        if not isinstance(rules, dict):
	            rules = {}

	        # BYA
	        bya = rules.get("bya")
	        if isinstance(bya, dict) and bya:
	            parts.append("<div style='background-color: #e8f5e8; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
	            parts.append("<h4 style='margin-top: 0; color: #2d5a2d;'>🏠 Building Coverage (BYA) Rules</h4>")
	            if bya.get("total_value"):
	                parts.append(f"<p style='font-size: 1.1em;'><b>Total Allowed BYA:</b> <span style='color: #2d5a2d; font-weight: bold;'>{_summary_text(bya['total_value'])} m²</span></p>")
	            breakdown = [item for item in (bya.get("breakdown") or []) if isinstance(item, dict)]
	            if breakdown:
	                parts.append(_summary_table(
	                    [("Component", "left"), ("Value", "center"), ("Source", "left")],
	                    [
	                        [
	                            (" font-weight: 500;", _summary_label(item.get("component"))),
	                            (" text-align: center; font-weight: bold; color: #2d5a2d;", _summary_amount(item.get("value"), item.get("unit"))),
	                            (" font-size: 0.9em;", _summary_source(item)),
	                        ]
	                        for item in breakdown
	                    ],
	                    border="#c3e6cb",
	                    head_bg="#d4edda",
	                    table_extra=" margin-top: 10px;",
	                ))
	            parts.append("</div>")

	        # Parking
	        parking_rules = _summary_rules(rules.get("parking"))
	        if parking_rules:
	            parts.append("<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
	            parts.append("<h4 style='margin-top: 0; color: #856404;'>🚗 Parking Requirements</h4>")
	            parts.append(_summary_table(
	                [("Requirement", "left"), ("Value", "center"), ("Source", "left")],
	                [
	                    [
	                        (" font-weight: 500;", _summary_label(rule.get("type"))),
	                        (" text-align: center; font-weight: bold; color: #856404;", _summary_amount(rule.get("value"), rule.get("unit"))),
	                        (" font-size: 0.9em;", _summary_source(rule)),
	                    ]
	                    for rule in parking_rules
	                ],
	                border="#fdcb6e",
	                head_bg="#ffeaa7",
	            ))
	            parts.append("</div>")

	        # Heights
	        height_rules = _summary_rules(rules.get("heights"))
	        if height_rules:
	            parts.append("<div style='background-color: #f0f8ff; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
	            parts.append("<h4 style='margin-top: 0; color: #4c51bf;'>📏 Height Restrictions</h4>")
	            parts.append(_summary_table(
	                [("Type", "left"), ("Applies To", "center"), ("Max Height", "center"), ("Source", "left")],
	                [
	                    [
	                        (" font-weight: 500;", _summary_label(rule.get("type"))),
	                        (" text-align: center;", _summary_label(rule.get("applies_to"))),
	                        (" text-align: center; font-weight: bold; color: #4c51bf;", _summary_amount(rule.get("value"), rule.get("unit"))),
	                        (" font-size: 0.9em;", _summary_source(rule)),
	                    ]
	                    for rule in height_rules
	                ],
	                border="#a3c2fd",
	                head_bg="#c6d5fd",
	            ))
	            parts.append("</div>")

	        parts.append("</div>")

	    parts.append("<div style='margin-top: 30px; padding: 15px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #28a745;'>")
	    parts.append("<h4 style='margin-top: 0; color: #28a745;'>✅ Processing Complete</h4>")
	    parts.append(f"<p><b>Processed:</b> {_summary_text(proc_info.get('processed_at'))}</p>")
	    if has_plot_size:
	        parts.append(f"<p><b>Plot Size Used:</b> {plot_size:,.0f} m²</p>")
	    parts.append("</div>")

	    parts.append("</div>")
	    return "".join(parts)


	# ---------------- UI glue (unchanged UI function) ----------------
	def process_pdf_interface(pdf_url: str, pdf_file_path, plot_size_m2: str) -> Tuple[str, str, str]:
	    """Handle the UI button: validate inputs, run the extraction, format the outputs.

	    Args:
	        pdf_url: URL of the PDF, or empty when a file is supplied instead.
	        pdf_file_path: Path of the uploaded PDF, or empty/None when a URL is supplied.
	        plot_size_m2: Optional plot size as text; a decimal comma is accepted.

	    Returns:
	        (status message, summary HTML, pretty-printed JSON). The last two are
	        empty strings whenever validation or processing fails.
	    """
	    if not OPENAI_API_KEY:
	        return "❌ Please set the OPENAI_API_KEY environment variable", "", ""

	    if not pdf_url and not pdf_file_path:
	        return "❌ Please provide either a PDF URL or upload a PDF file", "", ""

	    if pdf_url and pdf_file_path:
	        return "❌ Please provide either a PDF URL OR upload a file, not both", "", ""

	    plot_size = None
	    if plot_size_m2 and plot_size_m2.strip():
	        try:
	            plot_size = float(plot_size_m2.replace(",", ".").strip())
	        except ValueError:
	            return "❌ Please enter a valid plot size (numbers only)", "", ""
	        # float() also accepts "nan" and "inf", which would yield meaningless
	        # percentages and non-standard JSON (NaN/Infinity) in the output.
	        if not math.isfinite(plot_size):
	            return "❌ Please enter a valid plot size (numbers only)", "", ""
	        if plot_size <= 0:
	            return "❌ Plot size must be greater than 0", "", ""

	    try:
	        extractor = NorwayPlanExtractor(OPENAI_API_KEY)
	        if pdf_url:
	            results, status = extractor.process_document(pdf_url, "url", plot_size)
	        else:
	            results, status = extractor.process_document(pdf_file_path, "file", plot_size)

	        # Exact match: failure messages embed the underlying error text (which can
	        # include the URL), so a substring test could mistake a failure for success.
	        if status != "Success":
	            return f"❌ {status}", "", ""

	        summary_html = create_summary_html(results)
	        json_output = json.dumps(results, indent=2, ensure_ascii=False)
	        plot_info = f" (Plot: {plot_size:,.0f} m²)" if plot_size else ""
	        return f"✅ Processing completed successfully!{plot_info}", summary_html, json_output
	    except Exception as e:
	        return f"❌ Error: {str(e)}", "", ""


	# ---------------- API-ONLY function (Option 1) ----------------
	def process_pdf_api(pdf_url: str, plot_size_m2: str = "") -> Dict[str, Any]:
	    """
	    Clean JSON API: pass a PDF URL and optional plot_size_m2 (string or empty).
	    Returns the exact structured JSON, or {"error": "..."} on failure.

	    Args:
	        pdf_url: URL of the PDF to process (required).
	        plot_size_m2: Optional plot size; a decimal comma is accepted. None or
	            an empty/blank string means "unknown".
	    """
	    if not OPENAI_API_KEY:
	        return {"error": "OPENAI_API_KEY not set"}
	    if not pdf_url:
	        return {"error": "pdf_url is required"}

	    plot_size = None
	    # Normalise through str() so a non-string payload value (e.g. a bare number,
	    # should a client send one) does not crash on .strip().
	    raw_plot_size = "" if plot_size_m2 is None else str(plot_size_m2).strip()
	    if raw_plot_size:
	        try:
	            plot_size = float(raw_plot_size.replace(",", "."))
	        except ValueError:
	            return {"error": "plot_size_m2 must be a number"}
	        # float() also accepts "nan" and "inf"; neither is a usable plot size.
	        if not math.isfinite(plot_size):
	            return {"error": "plot_size_m2 must be a number"}
	        if plot_size <= 0:
	            return {"error": "plot_size_m2 must be > 0"}

	    extractor = NorwayPlanExtractor(OPENAI_API_KEY)
	    results, status = extractor.process_document(pdf_url, "url", plot_size)
	    # Exact match: failure messages embed the underlying error text (which can
	    # include the URL), so a substring test could mistake a failure for success.
	    if status == "Success":
	        return results
	    return {"error": status}


	# ---------------- Build Gradio app with named API route ----------------
	def create_interface():
	    """Build and return the Gradio Blocks app.

	    Layout: a header, an input column (plot size, PDF URL or file upload,
	    extract button) and a results column (status, HTML summary, raw JSON).
	    The button is wired to process_pdf_interface. A second event registers
	    process_pdf_api under the API name "extract_json", and a help panel
	    documents how to call it.
	    """
	    with gr.Blocks(title="Norwegian Municipal Plan Extractor") as interface:
	        gr.HTML("""
	<div style="text-align: center; margin-bottom: 30px;">
	<h1 style="color: #2E86AB;">🏛️ Norwegian Municipal Plan Extractor</h1>
	<p style="color: #666;">Fast extraction of structured data from Norwegian municipal planning documents</p>
	</div>
	""")

	        with gr.Row():
	            with gr.Column(scale=1):
	                gr.HTML("<h3>📝 Input</h3>")

	                plot_size_input = gr.Textbox(
	                    label="📐 Plot Size (m²) - Optional, needed for BYA %",
	                    placeholder="e.g., 1000",
	                    value=""
	                )

	                gr.HTML("<p><b>Choose input method:</b></p>")

	                pdf_url = gr.Textbox(
	                    label="📎 PDF URL",
	                    placeholder="https://example.com/plan.pdf"
	                )

	                gr.HTML("<p style='text-align: center;'><b>— OR —</b></p>")

	                pdf_file = gr.File(
	                    label="📄 Upload PDF File",
	                    file_types=[".pdf"],
	                    type="filepath", # returns a string path
	                )

	                process_btn = gr.Button(
	                    "🚀 Extract Plan Data",
	                    variant="primary"
	                )

	            with gr.Column(scale=2):
	                gr.HTML("<h3>📊 Results</h3>")

	                status_output = gr.Textbox(label="Status", interactive=False)
	                summary_output = gr.HTML(
	                    label="Summary",
	                    value="<p>No data processed yet. Please provide a PDF and click Extract Plan Data.</p>"
	                )
	                json_output = gr.Code(label="Raw JSON Output", language="json", interactive=False)

	        # UI action
	        process_btn.click(
	            fn=process_pdf_interface,
	            inputs=[pdf_url, pdf_file, plot_size_input],
	            outputs=[status_output, summary_output, json_output]
	        )

	        # --------- Named API endpoint: /run/extract_json ----------
	        # We define hidden inputs purely to attach a stable API route.
	        # Clients will POST to /run/extract_json with:
	        # {"data": ["<pdf_url>", "<plot_size_m2 or ''>"]}
	        # NOTE(review): the exact route prefix depends on the installed Gradio
	        # version - confirm against the deployed Space.
	        with gr.Row(visible=False):
	            api_pdf_url = gr.Textbox()
	            api_plot_size = gr.Textbox()

	        interface.load(
	            fn=process_pdf_api,
	            inputs=[api_pdf_url, api_plot_size],
	            outputs=gr.JSON(label="result"),
	            api_name="extract_json", # <-- your public API route
	        )

	        # The angle-bracket placeholders are written as HTML entities; as raw
	        # "<username>" / "<space>" the browser would parse them as unknown tags
	        # and drop them from the displayed URL.
	        gr.HTML("""
	<div style="margin-top: 20px; padding: 20px; background-color: #f0f8ff; border-radius: 10px;">
	<h4>⚡ JSON API Usage:</h4>
	<pre style="white-space: pre-wrap; background:#fff; padding:12px; border-radius:8px;">
	POST https://&lt;username&gt;-&lt;space&gt;.hf.space/run/extract_json
	Content-Type: application/json

	{
	"data": [
	"https://example.com/plan.pdf",
	"1200" // optional plot_size_m2; use "" if unknown
	]
	}
	</pre>
	</div>
	""")

	    return interface


	# Script entry point: when the file is executed directly (not imported),
	# build the Blocks app and start the Gradio server with default settings.
	if __name__ == "__main__":
	demo = create_interface()
	demo.launch()