# agent-bya / app.py
# iplotnor's picture
# Update app.py
# 27b9862 verified
import html
import json
import math
import os
from datetime import datetime
from typing import Dict, Any, Tuple, Optional

import fitz  # PyMuPDF
import gradio as gr
import requests
# Read the API key only from the environment; the value is None when the
# variable is unset, which the UI/API entry points check before doing any work.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
class NorwayPlanExtractor:
    """Extract numeric planning rules from Norwegian plan PDFs with an OpenAI chat model.

    Workflow: read the PDF text page by page (PyMuPDF), send it together with
    ``SYSTEM_PROMPT`` to the chat-completions endpoint, parse the JSON reply and,
    when a plot size is supplied, derive plot-utilization percentages.
    """

    DEFAULT_MODEL = "gpt-4o-mini"

    # The prompt lines stay flush-left so the text sent to the model carries no
    # incidental source indentation.
    SYSTEM_PROMPT = """You are a Norwegian planning document expert. Extract ONLY building/planning rules with numeric values.
Return ONLY valid JSON. Convert decimal commas to dots (6,5 → 6.5). Use null for missing fields.
FOCUS ONLY ON: BYA, building heights, setbacks, parking, and area limits. IGNORE: procedures, general text, background info.
JSON Structure:
{
"plan_metadata": {
"plan_id": "string or null",
"plan_name": "string or null",
"municipality": "string or null"
},
"zones": [
{
"name": "string",
"rules": {
"bya": {
"type": "absolute_m2",
"total_value": "number",
"breakdown": [
{"component": "main_building", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"},
{"component": "parking", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
]
},
"heights": {
"rules": [
{"type": "max_height", "value": "number", "unit": "m", "applies_to": "main_building", "page": "number", "quote": "exact Norwegian text"}
]
},
"parking": {
"rules": [
{"type": "required_spaces", "value": "number", "unit": "spaces", "page": "number", "quote": "exact Norwegian text"},
{"type": "space_size", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
]
},
"setbacks": {
"rules": [
{"type": "property_line", "value": "number", "unit": "m", "page": "number", "quote": "exact Norwegian text"}
]
},
"areas": {
"rules": [
{"type": "max_fenced", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
]
}
},
"derived": {
"plot_utilization_percent_gross": "number or null",
"plot_utilization_percent_net_buildings": "number or null"
}
}
]
}
Extract EVERY numeric rule with its exact Norwegian quote and page number. Group related rules together."""

    def __init__(self, openai_api_key: str, model: str = DEFAULT_MODEL):
        """Store the API key and the chat model to use.

        Args:
            openai_api_key: Bearer token sent to the OpenAI endpoint.
            model: Chat model name; defaults to ``DEFAULT_MODEL``.
        """
        self.openai_api_key = openai_api_key
        self.openai_url = "https://api.openai.com/v1/chat/completions"
        self.model = model
        self.system_prompt = self.SYSTEM_PROMPT

    # ---- PDF helpers ----
    @staticmethod
    def _join_pages(page_texts: list) -> str:
        """Concatenate per-page text, prefixing each page with ``--- PAGE n ---``.

        Returns an empty string when every page is blank. Without this, the
        page markers alone would make a text-less (e.g. scanned) PDF look
        non-empty and the "no text" check in ``process_document`` could never
        trigger.
        """
        if not any(text.strip() for text in page_texts):
            return ""
        return "".join(
            f"\n--- PAGE {number} ---\n{text}"
            for number, text in enumerate(page_texts, start=1)
        )

    def extract_text_from_pdf_url(self, pdf_url: str) -> str:
        """Download the PDF at *pdf_url* and return its page-marked text.

        Raises:
            RuntimeError: if the download or the PDF parsing fails.
        """
        try:
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()
            # The context manager closes the document even if a page fails.
            with fitz.open(stream=response.content, filetype="pdf") as doc:
                return self._join_pages([page.get_text() for page in doc])
        except Exception as e:
            raise RuntimeError(f"Failed to extract PDF text: {e}") from e

    def extract_text_from_pdf_file(self, pdf_file_path: str) -> str:
        """Return the page-marked text of the PDF stored at *pdf_file_path*.

        Raises:
            RuntimeError: if the file cannot be opened or parsed.
        """
        try:
            with fitz.open(pdf_file_path) as doc:
                return self._join_pages([page.get_text() for page in doc])
        except Exception as e:
            raise RuntimeError(f"Failed to extract PDF text: {e}") from e

    # ---- LLM call ----
    def call_openai_api(self, text: str, plot_size_m2: Optional[float] = None) -> Dict[str, Any]:
        """Send *text* to the chat model and return the parsed JSON reply.

        Args:
            text: Page-marked document text.
            plot_size_m2: Optional plot size; when given it is mentioned in the
                prompt and utilization percentages are computed locally.

        Raises:
            RuntimeError: if the HTTP call fails or the reply is not valid JSON.
        """
        headers = {
            "Authorization": f"Bearer {self.openai_api_key}",
            "Content-Type": "application/json",
        }
        user_content = (
            "Extract ONLY numeric planning rules from this Norwegian document. "
            "Focus on BYA, heights, parking, setbacks, area limits:\n\n" + text
        )
        if plot_size_m2:
            user_content += f"\n\nPLOT SIZE: {plot_size_m2} m² - Calculate utilization percentages."
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_content},
            ],
            "temperature": 0,
            "max_tokens": 3000,
            "response_format": {"type": "json_object"},
        }
        try:
            response = requests.post(self.openai_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            content = response.json()["choices"][0]["message"]["content"]
            extracted_data = json.loads(content)
            if plot_size_m2:
                extracted_data = self._post_process_calculations(extracted_data, plot_size_m2)
            return extracted_data
        except Exception as e:
            raise RuntimeError(f"OpenAI API call failed: {e}") from e

    @staticmethod
    def _as_number(value: Any) -> Optional[float]:
        """Coerce a model-supplied value to float, or return None if it is not numeric.

        Accepts ints, floats and numeric strings (decimal comma allowed);
        booleans and everything else yield None.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value.replace(",", ".").strip())
            except ValueError:
                return None
        return None

    def _post_process_calculations(self, data: Dict[str, Any], plot_size_m2: float) -> Dict[str, Any]:
        """Fill each zone's ``derived`` utilization percentages in place.

        Gross utilization is total BYA / plot size; net utilization subtracts
        the first breakdown component whose name contains "parking". The prompt
        tells the model to use null for missing fields, so null or non-numeric
        values are tolerated rather than raising.

        Returns:
            The same *data* dict, mutated.
        """
        for zone in data.get("zones") or []:
            if not isinstance(zone, dict):
                continue
            rules = zone.get("rules")
            bya = rules.get("bya") if isinstance(rules, dict) else None
            if not isinstance(bya, dict):
                bya = {}
            derived = zone.get("derived")
            if not isinstance(derived, dict):
                # Covers both a missing key and an explicit null from the model.
                derived = zone["derived"] = {}

            total_bya = self._as_number(bya.get("total_value"))
            if not total_bya or not plot_size_m2:
                continue

            parking_area = 0.0
            for item in bya.get("breakdown") or []:
                if not isinstance(item, dict):
                    continue
                if "parking" in str(item.get("component") or "").lower():
                    parking_area = self._as_number(item.get("value")) or 0.0
                    break

            gross_percent = (total_bya / plot_size_m2) * 100
            net_percent = ((total_bya - parking_area) / plot_size_m2) * 100
            derived["plot_utilization_percent_gross"] = round(gross_percent, 1)
            derived["plot_utilization_percent_net_buildings"] = round(net_percent, 1)
        return data

    def process_document(self, pdf_input: str, input_type: str, plot_size_m2: Optional[float] = None) -> Tuple[Dict[str, Any], str]:
        """Run the full pipeline for one PDF.

        Args:
            pdf_input: A URL when *input_type* is ``"url"``, otherwise a file path.
            input_type: ``"url"`` or anything else for a local file.
            plot_size_m2: Optional plot size for the derived percentages.

        Returns:
            ``(data, "Success")`` on success, or ``({}, "Processing failed: ...")``
            on any error; this method never raises.
        """
        try:
            if input_type == "url":
                pdf_text = self.extract_text_from_pdf_url(pdf_input)
                source = f"URL: {pdf_input}"
            else:
                pdf_text = self.extract_text_from_pdf_file(pdf_input)
                source = f"File: {pdf_input}"
            if not pdf_text.strip():
                raise RuntimeError("No text could be extracted from PDF")
            extracted_data = self.call_openai_api(pdf_text, plot_size_m2)
            extracted_data["_processing_info"] = {
                "processed_at": datetime.now().isoformat(),
                "source": source,
                "text_length": len(pdf_text),
                "model_used": self.model,
                "plot_size_m2": plot_size_m2,
            }
            return extracted_data, "Success"
        except Exception as e:
            # Top-level boundary: callers expect a status string, not an exception.
            return {}, f"Processing failed: {e}"
# ---------------- UI summary helper (unchanged) ----------------
def _summary_text(value: Any, default: str = "N/A") -> str:
    """Return *value* as HTML-escaped text, or *default* when it is None."""
    return html.escape(default if value is None else str(value))


def _summary_label(value: Any) -> str:
    """Turn an identifier such as ``main_building`` into an escaped display label."""
    if value is None:
        return "N/A"
    return html.escape(str(value).replace("_", " ").title())


def _summary_rule_table(items, headers, label_keys, header_bg, border, accent, table_style) -> str:
    """Render a list of rule dicts as one HTML table.

    Each row shows one label cell per key in *label_keys*, then the value with
    its unit, then the page/quote source. All rule-derived text is escaped.
    """
    last = len(headers) - 1
    head_cells = []
    for index, title in enumerate(headers):
        # First and last header cells are left-aligned, the ones between centred.
        align = "center" if 0 < index < last else "left"
        head_cells.append(
            f"<th style='padding: 10px; text-align: {align}; border: 1px solid {border};'>{title}</th>"
        )
    parts = [
        f"<table style='{table_style}'>",
        f"<thead><tr style='background-color: {header_bg};'>{''.join(head_cells)}</tr></thead>",
        "<tbody>",
    ]
    cell = f"padding: 8px; border: 1px solid {border};"
    for item in items:
        if not isinstance(item, dict):
            continue
        value = item.get("value")
        if value is None or value == "N/A":
            display = "N/A"
        else:
            unit = item.get("unit") or ""
            display = html.escape(f"{value} {unit}")
        page = _summary_text(item.get("page"))
        quote = _summary_text(item.get("quote"))
        cells = []
        for position, key in enumerate(label_keys):
            extra = "font-weight: 500;" if position == 0 else "text-align: center;"
            cells.append(f"<td style='{cell} {extra}'>{_summary_label(item.get(key))}</td>")
        cells.append(
            f"<td style='{cell} text-align: center; font-weight: bold; color: {accent};'>{display}</td>"
        )
        cells.append(f"<td style='{cell} font-size: 0.9em;'><b>Page {page}:</b> \"{quote}\"</td>")
        parts.append(f"<tr style='border-bottom: 1px solid {border};'>{''.join(cells)}</tr>")
    parts.append("</tbody></table>")
    return "".join(parts)


def create_summary_html(data: Dict[str, Any]) -> str:
    """Build the HTML summary shown in the UI from an extraction result.

    Renders plot info, plan metadata and, per zone, the BYA, parking and height
    rule tables, followed by a processing footer. Text that originates from the
    model or the PDF (names, quotes, values) is HTML-escaped, and null fields
    are shown as "N/A" instead of raising or printing "None".

    Args:
        data: Result dict as produced by the extractor; may be empty.

    Returns:
        An HTML fragment, or a "No data to display" paragraph when there are no zones.
    """
    if not data or not data.get("zones"):
        return "<p>No data to display</p>"

    parts = ["<div style='font-family: Arial, sans-serif;'>"]

    proc_info = data.get("_processing_info") or {}
    plot_size = proc_info.get("plot_size_m2")
    if plot_size:
        # NOTE(review): the 📐/📏 icons below were reconstructed from garbled
        # byte sequences in the source — confirm against the intended emoji.
        parts.append("<h3>📐 Plot Information</h3>")
        parts.append("<div style='background-color: #e8f4fd; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>")
        parts.append(f"<p><b>Plot Size:</b> {plot_size:,.0f} m²</p>")
        parts.append("</div>")

    plan_meta = data.get("plan_metadata") or {}
    if any(plan_meta.values()):
        parts.append("<h3>📋 Plan Information</h3>")
        parts.append("<div style='background-color: #f8f9fa; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>")
        for key, value in plan_meta.items():
            if value:
                parts.append(f"<p><b>{_summary_label(key)}:</b> {_summary_text(value)}</p>")
        parts.append("</div>")

    for number, zone in enumerate(data.get("zones") or [], 1):
        if not isinstance(zone, dict):
            continue
        zone_name = _summary_text(zone.get("name") or "Unnamed")
        parts.append("<div style='border: 2px solid #2c5282; margin: 20px 0; padding: 20px; border-radius: 10px; background-color: #fafafa;'>")
        parts.append(f"<h3 style='margin-top: 0; color: #2c5282;'>🏗️ Zone {number}: {zone_name}</h3>")
        rules = zone.get("rules")
        if not isinstance(rules, dict):
            rules = {}

        # BYA
        bya = rules.get("bya")
        if isinstance(bya, dict) and bya:
            parts.append("<div style='background-color: #e8f5e8; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
            parts.append("<h4 style='margin-top: 0; color: #2d5a2d;'>🏠 Building Coverage (BYA) Rules</h4>")
            if bya.get("total_value"):
                total = _summary_text(bya["total_value"])
                parts.append(f"<p style='font-size: 1.1em;'><b>Total Allowed BYA:</b> <span style='color: #2d5a2d; font-weight: bold;'>{total} m²</span></p>")
            breakdown = bya.get("breakdown") or []
            if breakdown:
                parts.append(_summary_rule_table(
                    breakdown,
                    headers=("Component", "Value", "Source"),
                    label_keys=("component",),
                    header_bg="#d4edda",
                    border="#c3e6cb",
                    accent="#2d5a2d",
                    table_style="width: 100%; border-collapse: collapse; margin-top: 10px;",
                ))
            parts.append("</div>")

        # Parking
        parking = rules.get("parking")
        parking_rules = (parking.get("rules") or []) if isinstance(parking, dict) else []
        if parking_rules:
            parts.append("<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
            parts.append("<h4 style='margin-top: 0; color: #856404;'>🚗 Parking Requirements</h4>")
            parts.append(_summary_rule_table(
                parking_rules,
                headers=("Requirement", "Value", "Source"),
                label_keys=("type",),
                header_bg="#ffeaa7",
                border="#fdcb6e",
                accent="#856404",
                table_style="width: 100%; border-collapse: collapse;",
            ))
            parts.append("</div>")

        # Heights
        heights = rules.get("heights")
        height_rules = (heights.get("rules") or []) if isinstance(heights, dict) else []
        if height_rules:
            parts.append("<div style='background-color: #f0f8ff; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
            parts.append("<h4 style='margin-top: 0; color: #4c51bf;'>📏 Height Restrictions</h4>")
            parts.append(_summary_rule_table(
                height_rules,
                headers=("Type", "Applies To", "Max Height", "Source"),
                label_keys=("type", "applies_to"),
                header_bg="#c6d5fd",
                border="#a3c2fd",
                accent="#4c51bf",
                table_style="width: 100%; border-collapse: collapse;",
            ))
            parts.append("</div>")

        parts.append("</div>")

    parts.append("<div style='margin-top: 30px; padding: 15px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #28a745;'>")
    parts.append("<h4 style='margin-top: 0; color: #28a745;'>✅ Processing Complete</h4>")
    parts.append(f"<p><b>Processed:</b> {_summary_text(proc_info.get('processed_at'))}</p>")
    if plot_size:
        parts.append(f"<p><b>Plot Size Used:</b> {plot_size:,.0f} m²</p>")
    parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)
# ---------------- UI glue (unchanged UI function) ----------------
def process_pdf_interface(pdf_url: str, pdf_file_path, plot_size_m2: str) -> Tuple[str, str, str]:
    """Handle the "Extract Plan Data" button: validate input, run extraction, format output.

    Args:
        pdf_url: URL typed by the user; blank means "not provided".
        pdf_file_path: Path of the uploaded file, or a falsy value when none was uploaded.
        plot_size_m2: Optional plot size as entered (decimal comma accepted).

    Returns:
        ``(status message, summary HTML, pretty-printed JSON)``; the last two
        are empty strings whenever validation or processing fails.
    """
    if not OPENAI_API_KEY:
        return "❌ Please set the OPENAI_API_KEY environment variable", "", ""

    # A whitespace-only URL counts as "no URL" so it cannot mask an uploaded file.
    pdf_url = (pdf_url or "").strip()
    if not pdf_url and not pdf_file_path:
        return "❌ Please provide either a PDF URL or upload a PDF file", "", ""
    if pdf_url and pdf_file_path:
        return "❌ Please provide either a PDF URL OR upload a file, not both", "", ""

    plot_size = None
    raw_size = "" if plot_size_m2 is None else str(plot_size_m2).strip()
    if raw_size:
        try:
            plot_size = float(raw_size.replace(",", "."))
        except ValueError:
            return "❌ Please enter a valid plot size (numbers only)", "", ""
        # float() also accepts "nan"/"inf", which would slip past the > 0 check
        # and produce meaningless percentages.
        if not math.isfinite(plot_size):
            return "❌ Please enter a valid plot size (numbers only)", "", ""
        if plot_size <= 0:
            return "❌ Plot size must be greater than 0", "", ""

    try:
        extractor = NorwayPlanExtractor(OPENAI_API_KEY)
        if pdf_url:
            results, status = extractor.process_document(pdf_url, "url", plot_size)
        else:
            results, status = extractor.process_document(pdf_file_path, "file", plot_size)

        # Exact match: a failure message may itself contain the word "Success"
        # (for example inside a URL) and must not be treated as a success.
        if status != "Success":
            return f"❌ {status}", "", ""

        summary_html = create_summary_html(results)
        json_output = json.dumps(results, indent=2, ensure_ascii=False)
        plot_info = f" (Plot: {plot_size:,.0f} m²)" if plot_size else ""
        return f"✅ Processing completed successfully!{plot_info}", summary_html, json_output
    except Exception as e:
        # UI boundary: surface the error as a status message instead of raising.
        return f"❌ Error: {str(e)}", "", ""
# ---------------- API-ONLY function (Option 1) ----------------
def process_pdf_api(pdf_url: str, plot_size_m2: str = "") -> Dict[str, Any]:
    """
    Clean JSON API: pass a PDF URL and optional plot_size_m2.

    Args:
        pdf_url: URL of the PDF to process (required).
        plot_size_m2: Plot size as a string (decimal comma accepted), a number,
            or empty/None when unknown.

    Returns:
        The structured extraction result, or {"error": "..."} on failure.
    """
    if not OPENAI_API_KEY:
        return {"error": "OPENAI_API_KEY not set"}

    pdf_url = (pdf_url or "").strip()
    if not pdf_url:
        return {"error": "pdf_url is required"}

    plot_size = None
    # API clients may send a JSON number instead of a string; normalise first.
    raw_size = "" if plot_size_m2 is None else str(plot_size_m2).strip()
    if raw_size:
        try:
            plot_size = float(raw_size.replace(",", "."))
        except ValueError:
            return {"error": "plot_size_m2 must be a number"}
        # float() accepts "nan"/"inf"; neither is a usable plot size.
        if not math.isfinite(plot_size):
            return {"error": "plot_size_m2 must be a number"}
        if plot_size <= 0:
            return {"error": "plot_size_m2 must be > 0"}

    extractor = NorwayPlanExtractor(OPENAI_API_KEY)
    results, status = extractor.process_document(pdf_url, "url", plot_size)
    # Exact match: failure messages can contain "Success" (e.g. inside the URL).
    if status == "Success":
        return results
    return {"error": status}
# ---------------- Build Gradio app with named API route ----------------
def create_interface():
    """Build the Gradio Blocks app and return it without launching.

    The layout has an input column (plot size, PDF URL, file upload, button)
    and a results column (status, HTML summary, raw JSON). The button runs
    ``process_pdf_interface``; ``process_pdf_api`` is additionally registered
    under the API name ``extract_json`` through hidden input components.
    """
    # NOTE(review): several emoji in the labels below were reconstructed from
    # garbled byte sequences; the icons for "Input" and "Plot Size" are a best
    # guess — confirm against the intended originals.
    with gr.Blocks(title="Norwegian Municipal Plan Extractor") as interface:
        gr.HTML("""
        <div style="text-align: center; margin-bottom: 30px;">
        <h1 style="color: #2E86AB;">🏛️ Norwegian Municipal Plan Extractor</h1>
        <p style="color: #666;">Fast extraction of structured data from Norwegian municipal planning documents</p>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>📝 Input</h3>")
                plot_size_input = gr.Textbox(
                    label="📐 Plot Size (m²) - Optional, needed for BYA %",
                    placeholder="e.g., 1000",
                    value="",
                )
                gr.HTML("<p><b>Choose input method:</b></p>")
                pdf_url = gr.Textbox(
                    label="📎 PDF URL",
                    placeholder="https://example.com/plan.pdf",
                )
                gr.HTML("<p style='text-align: center;'><b>— OR —</b></p>")
                pdf_file = gr.File(
                    label="📄 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath",  # returns a string path
                )
                process_btn = gr.Button(
                    "🚀 Extract Plan Data",
                    variant="primary",
                )
            with gr.Column(scale=2):
                gr.HTML("<h3>📊 Results</h3>")
                status_output = gr.Textbox(label="Status", interactive=False)
                summary_output = gr.HTML(
                    label="Summary",
                    value="<p>No data processed yet. Please provide a PDF and click Extract Plan Data.</p>",
                )
                json_output = gr.Code(label="Raw JSON Output", language="json", interactive=False)

        # UI action
        process_btn.click(
            fn=process_pdf_interface,
            inputs=[pdf_url, pdf_file, plot_size_input],
            outputs=[status_output, summary_output, json_output],
        )

        # --------- Named API endpoint: /run/extract_json ----------
        # Hidden inputs exist purely to attach a stable API route.
        # Clients POST to /run/extract_json with:
        # {"data": ["<pdf_url>", "<plot_size_m2 or ''>"]}
        with gr.Row(visible=False):
            api_pdf_url = gr.Textbox()
            api_plot_size = gr.Textbox()
        interface.load(
            fn=process_pdf_api,
            inputs=[api_pdf_url, api_plot_size],
            outputs=gr.JSON(label="result"),
            api_name="extract_json",  # public API route name
        )

        # The <pre> content is kept flush-left because its whitespace is rendered verbatim.
        gr.HTML("""
        <div style="margin-top: 20px; padding: 20px; background-color: #f0f8ff; border-radius: 10px;">
        <h4>⚡ JSON API Usage:</h4>
<pre style="white-space: pre-wrap; background:#fff; padding:12px; border-radius:8px;">
POST https://&lt;username&gt;-&lt;space&gt;.hf.space/run/extract_json
Content-Type: application/json
{
"data": [
"https://example.com/plan.pdf",
"1200" // optional plot_size_m2; use "" if unknown
]
}
</pre>
        </div>
        """)
    return interface
if __name__ == "__main__":
    # Script entry point: build the Blocks app and start the Gradio server
    # with default launch settings.
    demo = create_interface()
    demo.launch()