import html
import json
import math
import os
from datetime import datetime
from typing import Dict, Any, Tuple, Optional

import fitz
import gradio as gr
import requests
# OpenAI API key, read once at import time from the environment. It is None
# when the variable is unset; the request handlers in this file test for that
# before building an extractor.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
class NorwayPlanExtractor:
    """Extract numeric planning rules from Norwegian plan PDFs via OpenAI.

    The PDF text is read page by page with PyMuPDF, sent to the chat
    completions endpoint together with a fixed JSON-schema prompt, and the
    returned JSON is optionally enriched with plot-utilisation percentages.
    """

    # Model used for every request; also reported in ``_processing_info``.
    MODEL_NAME = "gpt-4o-mini"

    def __init__(self, openai_api_key: str):
        """Store the API key, the endpoint URL and the extraction prompt.

        Args:
            openai_api_key: Bearer token sent with every OpenAI request.
        """
        self.openai_api_key = openai_api_key
        self.openai_url = "https://api.openai.com/v1/chat/completions"

        # The prompt lines are flush-left so no source indentation leaks into
        # the text sent to the model.
        self.system_prompt = """You are a Norwegian planning document expert. Extract ONLY building/planning rules with numeric values.
Return ONLY valid JSON. Convert decimal commas to dots (6,5 → 6.5). Use null for missing fields.

FOCUS ONLY ON: BYA, building heights, setbacks, parking, and area limits. IGNORE: procedures, general text, background info.

JSON Structure:
{
  "plan_metadata": {
    "plan_id": "string or null",
    "plan_name": "string or null",
    "municipality": "string or null"
  },
  "zones": [
    {
      "name": "string",
      "rules": {
        "bya": {
          "type": "absolute_m2",
          "total_value": "number",
          "breakdown": [
            {"component": "main_building", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"},
            {"component": "parking", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
          ]
        },
        "heights": {
          "rules": [
            {"type": "max_height", "value": "number", "unit": "m", "applies_to": "main_building", "page": "number", "quote": "exact Norwegian text"}
          ]
        },
        "parking": {
          "rules": [
            {"type": "required_spaces", "value": "number", "unit": "spaces", "page": "number", "quote": "exact Norwegian text"},
            {"type": "space_size", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
          ]
        },
        "setbacks": {
          "rules": [
            {"type": "property_line", "value": "number", "unit": "m", "page": "number", "quote": "exact Norwegian text"}
          ]
        },
        "areas": {
          "rules": [
            {"type": "max_fenced", "value": "number", "unit": "m²", "page": "number", "quote": "exact Norwegian text"}
          ]
        }
      },
      "derived": {
        "plot_utilization_percent_gross": "number or null",
        "plot_utilization_percent_net_buildings": "number or null"
      }
    }
  ]
}

Extract EVERY numeric rule with its exact Norwegian quote and page number. Group related rules together."""

    @staticmethod
    def _collect_page_text(doc) -> str:
        """Join the text of every page of *doc* and always close the document.

        Each page is preceded by a ``--- PAGE n ---`` marker (1-based) so the
        model can report page numbers for the rules it finds.
        """
        try:
            return "".join(
                f"\n--- PAGE {number} ---\n{page.get_text()}"
                for number, page in enumerate(doc, start=1)
            )
        finally:
            # Close even when text extraction fails part-way through.
            doc.close()

    def extract_text_from_pdf_url(self, pdf_url: str) -> str:
        """Download a PDF and return its page-marked text.

        Args:
            pdf_url: HTTP(S) address of the PDF.

        Returns:
            The concatenated page texts with ``--- PAGE n ---`` markers.

        Raises:
            Exception: On any download or parsing failure; the original
                error is chained as the cause.
        """
        try:
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()
            doc = fitz.open(stream=response.content, filetype="pdf")
            return self._collect_page_text(doc)
        except Exception as e:
            raise Exception(f"Failed to extract PDF text: {str(e)}") from e

    def extract_text_from_pdf_file(self, pdf_file_path: str) -> str:
        """Read a local PDF and return its page-marked text.

        Args:
            pdf_file_path: Path of the PDF on disk.

        Returns:
            The concatenated page texts with ``--- PAGE n ---`` markers.

        Raises:
            Exception: On any open or parsing failure; the original error is
                chained as the cause.
        """
        try:
            doc = fitz.open(pdf_file_path)
            return self._collect_page_text(doc)
        except Exception as e:
            raise Exception(f"Failed to extract PDF text: {str(e)}") from e

    def call_openai_api(self, text: str, plot_size_m2: Optional[float] = None) -> Dict[str, Any]:
        """Send the document text to OpenAI and return the parsed JSON.

        Args:
            text: Page-marked document text.
            plot_size_m2: Optional plot area; when truthy it is mentioned in
                the request and utilisation percentages are computed locally.

        Returns:
            The JSON object produced by the model, post-processed when a plot
            size was supplied.

        Raises:
            Exception: On HTTP errors, malformed responses or invalid JSON;
                the original error is chained as the cause.
        """
        headers = {
            "Authorization": f"Bearer {self.openai_api_key}",
            "Content-Type": "application/json",
        }

        user_content = (
            "Extract ONLY numeric planning rules from this Norwegian document. "
            "Focus on BYA, heights, parking, setbacks, area limits:\n\n" + text
        )
        if plot_size_m2:
            user_content += f"\n\nPLOT SIZE: {plot_size_m2} m² - Calculate utilization percentages."

        payload = {
            "model": self.MODEL_NAME,
            "messages": [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_content},
            ],
            "temperature": 0,
            "max_tokens": 3000,
            "response_format": {"type": "json_object"},
        }

        try:
            response = requests.post(self.openai_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            content = response.json()["choices"][0]["message"]["content"]
            extracted_data = json.loads(content)
            if not isinstance(extracted_data, dict):
                raise ValueError("model response is not a JSON object")

            if plot_size_m2:
                extracted_data = self._post_process_calculations(extracted_data, plot_size_m2)
            return extracted_data
        except Exception as e:
            raise Exception(f"OpenAI API call failed: {str(e)}") from e

    @staticmethod
    def _to_number(value: Any) -> Optional[float]:
        """Return *value* as a number, or None when it is not numeric.

        Numeric strings (including decimal commas) are accepted because the
        model does not always honour the "number" type in the schema.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            try:
                return float(value.replace(",", ".").strip())
            except ValueError:
                return None
        return None

    def _post_process_calculations(self, data: Dict[str, Any], plot_size_m2: float) -> Dict[str, Any]:
        """Fill each zone's ``derived`` utilisation percentages in place.

        Gross utilisation is total BYA over plot size; the net figure first
        subtracts the first breakdown component whose name contains
        "parking". Null or non-numeric fields are tolerated: the prompt asks
        the model to use null for missing values.

        Args:
            data: Parsed model output; mutated and returned.
            plot_size_m2: Plot area in square metres.

        Returns:
            The same ``data`` object.
        """
        for zone in data.get("zones") or []:
            if not isinstance(zone, dict):
                continue

            rules = zone.get("rules")
            bya = rules.get("bya") if isinstance(rules, dict) else None
            if not isinstance(bya, dict):
                bya = {}

            # Every zone ends up with a dict under "derived", even if empty.
            derived = zone.get("derived")
            if not isinstance(derived, dict):
                derived = zone["derived"] = {}

            total_bya = self._to_number(bya.get("total_value"))
            if not total_bya or not plot_size_m2:
                continue

            parking_area = 0
            for item in bya.get("breakdown") or []:
                if not isinstance(item, dict):
                    continue
                if "parking" in str(item.get("component") or "").lower():
                    parking_area = self._to_number(item.get("value")) or 0
                    break

            gross_percent = (total_bya / plot_size_m2) * 100
            net_percent = ((total_bya - parking_area) / plot_size_m2) * 100
            derived["plot_utilization_percent_gross"] = round(gross_percent, 1)
            derived["plot_utilization_percent_net_buildings"] = round(net_percent, 1)
        return data

    def process_document(self, pdf_input: str, input_type: str, plot_size_m2: Optional[float] = None) -> Tuple[Dict[str, Any], str]:
        """Run the full pipeline: read the PDF, query the model, add metadata.

        Args:
            pdf_input: URL or local path of the PDF.
            input_type: ``"url"`` to download; anything else reads a file.
            plot_size_m2: Optional plot area in square metres.

        Returns:
            ``(data, "Success")`` on success, otherwise
            ``({}, "Processing failed: <reason>")``. Never raises.
        """
        try:
            if input_type == "url":
                pdf_text = self.extract_text_from_pdf_url(pdf_input)
                source = f"URL: {pdf_input}"
            else:
                pdf_text = self.extract_text_from_pdf_file(pdf_input)
                source = f"File: {pdf_input}"

            if not pdf_text.strip():
                raise Exception("No text could be extracted from PDF")

            extracted_data = self.call_openai_api(pdf_text, plot_size_m2)
            extracted_data["_processing_info"] = {
                "processed_at": datetime.now().isoformat(),
                "source": source,
                "text_length": len(pdf_text),
                "model_used": self.MODEL_NAME,
                "plot_size_m2": plot_size_m2,
            }
            return extracted_data, "Success"
        except Exception as e:
            return {}, f"Processing failed: {str(e)}"
def _esc(value: Any) -> str:
    """HTML-escape any value so it can be placed inside markup safely."""
    return html.escape(str(value))


def _label(raw: Any) -> str:
    """Turn a snake_case identifier into an escaped Title Case label."""
    if raw is None or raw == "":
        return "N/A"
    return _esc(str(raw).replace("_", " ").title())


def _or_na(raw: Any) -> str:
    """Return the escaped value, or "N/A" when it is missing."""
    return "N/A" if raw is None or raw == "" else _esc(raw)


def _section_rules(section: Any) -> list:
    """Return the ``rules`` list of a rule section, or [] when unusable."""
    if isinstance(section, dict) and isinstance(section.get("rules"), list):
        return section["rules"]
    return []


def _rule_table(
    rules: list,
    label_key: str,
    headers: Tuple[str, ...],
    header_bg: str,
    border: str,
    accent: str,
    table_style: str = "width: 100%; border-collapse: collapse;",
    applies_to: bool = False,
) -> str:
    """Render rule dicts as one HTML table (first/last column left-aligned)."""
    cell = f"padding: 8px; border: 1px solid {border};"
    last = len(headers) - 1
    out = [f"<table style='{table_style}'>", f"<thead><tr style='background-color: {header_bg};'>"]
    for idx, title in enumerate(headers):
        align = "left" if idx in (0, last) else "center"
        out.append(f"<th style='padding: 10px; text-align: {align}; border: 1px solid {border};'>{title}</th>")
    out.append("</tr></thead>")
    out.append("<tbody>")

    for rule in rules:
        if not isinstance(rule, dict):
            continue
        value = rule.get("value")
        if value in (None, "", "N/A"):
            shown = "N/A"
        else:
            unit = rule.get("unit") or ""
            shown = _esc(f"{value} {unit}".strip())
        name = _label(rule.get(label_key))
        page = _or_na(rule.get("page"))
        quote = _or_na(rule.get("quote"))

        out.append(f"<tr style='border-bottom: 1px solid {border};'>")
        out.append(f"<td style='{cell} font-weight: 500;'>{name}</td>")
        if applies_to:
            target = _label(rule.get("applies_to"))
            out.append(f"<td style='{cell} text-align: center;'>{target}</td>")
        out.append(f"<td style='{cell} text-align: center; font-weight: bold; color: {accent};'>{shown}</td>")
        out.append(f"<td style='{cell} font-size: 0.9em;'><b>Page {page}:</b> \"{quote}\"</td>")
        out.append("</tr>")

    out.append("</tbody></table>")
    return "".join(out)


def create_summary_html(data: Dict[str, Any]) -> str:
    """Build the HTML summary shown next to the raw JSON.

    Renders plot and plan information, then for every zone the BYA breakdown,
    parking rules and height rules, followed by a processing footer. All text
    that originates from the model or the PDF is HTML-escaped, and null
    fields are shown as "N/A" instead of raising.

    Args:
        data: Extraction result, expected to carry ``zones`` and optionally
            ``plan_metadata`` and ``_processing_info``.

    Returns:
        An HTML fragment; ``<p>No data to display</p>`` when there are no
        zones.
    """
    # NOTE(review): the heading icons below replace unrecoverable mojibake
    # residue in the damaged source; only m² and ✅ could be decoded exactly.
    if not data or not data.get("zones"):
        return "<p>No data to display</p>"

    parts = ["<div style='font-family: Arial, sans-serif;'>"]

    proc_info = data.get("_processing_info") or {}
    plot_size = proc_info.get("plot_size_m2")

    if plot_size:
        parts.append("<h3>📐 Plot Information</h3>")
        parts.append("<div style='background-color: #e8f4fd; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>")
        parts.append(f"<p><b>Plot Size:</b> {plot_size:,.0f} m²</p>")
        parts.append("</div>")

    plan_meta = data.get("plan_metadata") or {}
    if isinstance(plan_meta, dict) and any(plan_meta.values()):
        parts.append("<h3>📋 Plan Information</h3>")
        parts.append("<div style='background-color: #f8f9fa; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>")
        for key, value in plan_meta.items():
            if value:
                parts.append(f"<p><b>{_label(key)}:</b> {_esc(value)}</p>")
        parts.append("</div>")

    for i, zone in enumerate(data.get("zones") or [], 1):
        if not isinstance(zone, dict):
            continue
        zone_name = _esc(zone.get("name") or "Unnamed")
        parts.append("<div style='border: 2px solid #2c5282; margin: 20px 0; padding: 20px; border-radius: 10px; background-color: #fafafa;'>")
        parts.append(f"<h3 style='margin-top: 0; color: #2c5282;'>🏗️ Zone {i}: {zone_name}</h3>")

        rules = zone.get("rules")
        if not isinstance(rules, dict):
            rules = {}

        bya = rules.get("bya")
        if isinstance(bya, dict) and bya:
            parts.append("<div style='background-color: #e8f5e8; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
            parts.append("<h4 style='margin-top: 0; color: #2d5a2d;'>📊 Building Coverage (BYA) Rules</h4>")
            if bya.get("total_value"):
                total = _esc(bya["total_value"])
                parts.append(f"<p style='font-size: 1.1em;'><b>Total Allowed BYA:</b> <span style='color: #2d5a2d; font-weight: bold;'>{total} m²</span></p>")
            breakdown = bya.get("breakdown")
            if isinstance(breakdown, list) and breakdown:
                parts.append(_rule_table(
                    breakdown,
                    "component",
                    ("Component", "Value", "Source"),
                    "#d4edda",
                    "#c3e6cb",
                    "#2d5a2d",
                    table_style="width: 100%; border-collapse: collapse; margin-top: 10px;",
                ))
            parts.append("</div>")

        parking_rules = _section_rules(rules.get("parking"))
        if parking_rules:
            parts.append("<div style='background-color: #fff3cd; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
            parts.append("<h4 style='margin-top: 0; color: #856404;'>🚗 Parking Requirements</h4>")
            parts.append(_rule_table(
                parking_rules,
                "type",
                ("Requirement", "Value", "Source"),
                "#ffeaa7",
                "#fdcb6e",
                "#856404",
            ))
            parts.append("</div>")

        height_rules = _section_rules(rules.get("heights"))
        if height_rules:
            parts.append("<div style='background-color: #f0f8ff; padding: 15px; border-radius: 8px; margin: 15px 0;'>")
            parts.append("<h4 style='margin-top: 0; color: #4c51bf;'>📏 Height Restrictions</h4>")
            parts.append(_rule_table(
                height_rules,
                "type",
                ("Type", "Applies To", "Max Height", "Source"),
                "#c6d5fd",
                "#a3c2fd",
                "#4c51bf",
                applies_to=True,
            ))
            parts.append("</div>")

        parts.append("</div>")

    parts.append("<div style='margin-top: 30px; padding: 15px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #28a745;'>")
    parts.append("<h4 style='margin-top: 0; color: #28a745;'>✅ Processing Complete</h4>")
    parts.append(f"<p><b>Processed:</b> {_or_na(proc_info.get('processed_at'))}</p>")
    if plot_size:
        parts.append(f"<p><b>Plot Size Used:</b> {plot_size:,.0f} m²</p>")
    parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)
def _parse_plot_size(raw: Any) -> Tuple[Optional[float], Optional[str]]:
    """Parse a user-supplied plot size.

    Accepts strings (decimal comma or dot) as well as plain numbers, since
    JSON API clients may send either.

    Returns:
        ``(value, None)`` for a valid positive finite size, ``(None, None)``
        when nothing was supplied, ``(None, "invalid")`` when the input is
        not a finite number and ``(None, "non_positive")`` when it is <= 0.
    """
    if not raw:
        return None, None
    if isinstance(raw, bool):
        return None, "invalid"
    if isinstance(raw, (int, float)):
        value = float(raw)
    else:
        text = str(raw).strip()
        if not text:
            return None, None
        try:
            value = float(text.replace(",", "."))
        except ValueError:
            return None, "invalid"
    # float() accepts "nan"/"inf"; neither is a usable plot area.
    if not math.isfinite(value):
        return None, "invalid"
    if value <= 0:
        return None, "non_positive"
    return value, None


def process_pdf_interface(pdf_url: str, pdf_file_path, plot_size_m2: str) -> Tuple[str, str, str]:
    """Gradio click handler: validate the inputs and run the extraction.

    Args:
        pdf_url: URL of the PDF, or empty.
        pdf_file_path: Path of an uploaded PDF, or None. Exactly one of
            ``pdf_url`` and ``pdf_file_path`` must be given.
        plot_size_m2: Optional plot size as typed by the user.

    Returns:
        ``(status message, summary HTML, pretty-printed JSON)``; the last two
        are empty strings on any failure.
    """
    if not OPENAI_API_KEY:
        return "❌ Please set the OPENAI_API_KEY environment variable", "", ""

    if not pdf_url and not pdf_file_path:
        return "❌ Please provide either a PDF URL or upload a PDF file", "", ""

    if pdf_url and pdf_file_path:
        return "❌ Please provide either a PDF URL OR upload a file, not both", "", ""

    plot_size, problem = _parse_plot_size(plot_size_m2)
    if problem == "non_positive":
        return "❌ Plot size must be greater than 0", "", ""
    if problem == "invalid":
        return "❌ Please enter a valid plot size (numbers only)", "", ""

    try:
        extractor = NorwayPlanExtractor(OPENAI_API_KEY)
        if pdf_url:
            results, status = extractor.process_document(pdf_url, "url", plot_size)
        else:
            results, status = extractor.process_document(pdf_file_path, "file", plot_size)

        # Exact match: a failure message may itself contain the word "Success".
        if status != "Success":
            return f"❌ {status}", "", ""

        summary_html = create_summary_html(results)
        json_output = json.dumps(results, indent=2, ensure_ascii=False)
        plot_info = f" (Plot: {plot_size:,.0f} m²)" if plot_size else ""
        return f"✅ Processing completed successfully!{plot_info}", summary_html, json_output
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""


def process_pdf_api(pdf_url: str, plot_size_m2: str = "") -> Dict[str, Any]:
    """
    Clean JSON API: pass a PDF URL and optional plot_size_m2 (string, number
    or empty). Returns the exact structured JSON, or {"error": "..."} on
    failure.
    """
    if not OPENAI_API_KEY:
        return {"error": "OPENAI_API_KEY not set"}
    if not pdf_url:
        return {"error": "pdf_url is required"}

    plot_size, problem = _parse_plot_size(plot_size_m2)
    if problem == "non_positive":
        return {"error": "plot_size_m2 must be > 0"}
    if problem == "invalid":
        return {"error": "plot_size_m2 must be a number"}

    extractor = NorwayPlanExtractor(OPENAI_API_KEY)
    results, status = extractor.process_document(pdf_url, "url", plot_size)
    if status == "Success":
        return results
    return {"error": status}
def create_interface():
    """Build the Gradio Blocks app.

    The page has an input column (plot size, PDF URL or upload, run button)
    and a results column (status, HTML summary, raw JSON). A second, hidden
    set of components exposes ``process_pdf_api`` as the named API endpoint
    ``extract_json``.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """
    # NOTE(review): the label icons replace unrecoverable mojibake residue in
    # the damaged source; they are decorative only.
    with gr.Blocks(title="Norwegian Municipal Plan Extractor") as interface:
        gr.HTML("""
        <div style="text-align: center; margin-bottom: 30px;">
            <h1 style="color: #2E86AB;">🏗️ Norwegian Municipal Plan Extractor</h1>
            <p style="color: #666;">Fast extraction of structured data from Norwegian municipal planning documents</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>📄 Input</h3>")

                plot_size_input = gr.Textbox(
                    label="📐 Plot Size (m²) - Optional, needed for BYA %",
                    placeholder="e.g., 1000",
                    value="",
                )

                gr.HTML("<p><b>Choose input method:</b></p>")

                pdf_url = gr.Textbox(
                    label="🔗 PDF URL",
                    placeholder="https://example.com/plan.pdf",
                )

                gr.HTML("<p style='text-align: center;'><b>— OR —</b></p>")

                pdf_file = gr.File(
                    label="📁 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath",
                )

                process_btn = gr.Button(
                    "🚀 Extract Plan Data",
                    variant="primary",
                )

            with gr.Column(scale=2):
                gr.HTML("<h3>📊 Results</h3>")

                status_output = gr.Textbox(label="Status", interactive=False)
                summary_output = gr.HTML(
                    label="Summary",
                    value="<p>No data processed yet. Please provide a PDF and click Extract Plan Data.</p>",
                )
                json_output = gr.Code(label="Raw JSON Output", language="json", interactive=False)

        process_btn.click(
            fn=process_pdf_interface,
            inputs=[pdf_url, pdf_file, plot_size_input],
            outputs=[status_output, summary_output, json_output],
        )

        # Hidden components that exist only to publish the JSON endpoint.
        # The endpoint is bound to a hidden button's click rather than to the
        # page-load event, so nothing runs (and no error JSON is rendered)
        # when a browser merely opens the page.
        with gr.Row(visible=False):
            api_pdf_url = gr.Textbox()
            api_plot_size = gr.Textbox()
            api_result = gr.JSON(label="result")
            api_trigger = gr.Button()

        api_trigger.click(
            fn=process_pdf_api,
            inputs=[api_pdf_url, api_plot_size],
            outputs=api_result,
            api_name="extract_json",
        )

        # Angle brackets are written as entities; as raw "<username>" the
        # browser would treat the placeholders as unknown tags and hide them.
        gr.HTML("""
        <div style="margin-top: 20px; padding: 20px; background-color: #f0f8ff; border-radius: 10px;">
            <h4>⚡ JSON API Usage:</h4>
            <pre style="white-space: pre-wrap; background:#fff; padding:12px; border-radius:8px;">
POST https://&lt;username&gt;-&lt;space&gt;.hf.space/run/extract_json
Content-Type: application/json

{
  "data": [
    "https://example.com/plan.pdf",
    "1200"  // optional plot_size_m2; use "" if unknown
  ]
}
            </pre>
        </div>
        """)

    return interface
if __name__ == "__main__":
    # Build the Gradio app and start its web server when run as a script.
    demo = create_interface()
    demo.launch()