Commit
·
4851501
0
Parent(s):
Deploy to Spaces (Final Clean)
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +65 -0
- ARCHITECTURE.md +426 -0
- DEPLOYMENT.md +58 -0
- Dockerfile +58 -0
- README.md +277 -0
- SETUP.md +455 -0
- backend/__init__.py +0 -0
- backend/api/api.py +8 -0
- backend/api/endpoints/catalog.py +235 -0
- backend/api/endpoints/chat.py +90 -0
- backend/api/endpoints/schema.py +84 -0
- backend/core/catalog_enricher.py +221 -0
- backend/core/data_catalog.py +445 -0
- backend/core/database.py +34 -0
- backend/core/geo_engine.py +244 -0
- backend/core/llm_gateway.py +500 -0
- backend/core/prompts.py +279 -0
- backend/core/query_planner.py +291 -0
- backend/core/semantic_search.py +259 -0
- backend/core/session_store.py +179 -0
- backend/data/catalog.json +1290 -0
- backend/data/catalog_schema.json +145 -0
- backend/data/censo/censo_2023_enriched.csv +0 -0
- backend/data/censo/censo_panama_2023_unificado.csv +0 -0
- backend/data/global/airports/panama_airports.geojson +98 -0
- backend/main.py +68 -0
- backend/pyproject.toml +31 -0
- backend/requirements.txt +18 -0
- backend/scripts/create_province_layer.py +196 -0
- backend/scripts/download_geofabrik.py +192 -0
- backend/scripts/download_global_datasets.py +133 -0
- backend/scripts/download_hdx.py +72 -0
- backend/scripts/download_hdx_panama.py +102 -0
- backend/scripts/download_kontur.py +239 -0
- backend/scripts/download_overture.py +133 -0
- backend/scripts/download_stri_data.py +79 -0
- backend/scripts/download_worldbank.py +141 -0
- backend/scripts/enrich_censo.py +115 -0
- backend/scripts/extract_overture_features.py +134 -0
- backend/scripts/ingest_hdx.py +110 -0
- backend/scripts/process_worldbank.py +150 -0
- backend/scripts/register_global_datasets.py +51 -0
- backend/scripts/stri_catalog_scraper.py +348 -0
- backend/scripts/update_embeddings.py +37 -0
- backend/scripts/validate_censo.py +155 -0
- backend/services/data_loader.py +271 -0
- backend/services/executor.py +860 -0
- backend/services/orchestrator.py +13 -0
- backend/services/response_formatter.py +287 -0
- docker-compose.yml +14 -0
.gitignore
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
venv/
|
| 8 |
+
ENV/
|
| 9 |
+
env/
|
| 10 |
+
.venv/
|
| 11 |
+
|
| 12 |
+
# Node
|
| 13 |
+
node_modules/
|
| 14 |
+
.next/
|
| 15 |
+
out/
|
| 16 |
+
|
| 17 |
+
# IDE
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
|
| 23 |
+
# OS
|
| 24 |
+
.DS_Store
|
| 25 |
+
Thumbs.db
|
| 26 |
+
|
| 27 |
+
# Environment
|
| 28 |
+
.env
|
| 29 |
+
.env.local
|
| 30 |
+
.env.*.local
|
| 31 |
+
|
| 32 |
+
# Data files (keep structure, not data)
|
| 33 |
+
*.parquet
|
| 34 |
+
*.duckdb
|
| 35 |
+
*.duckdb.wal
|
| 36 |
+
|
| 37 |
+
# Large data files and binaries (downloaded at build time)
|
| 38 |
+
backend/data/embeddings.json
|
| 39 |
+
backend/data/*.geojson
|
| 40 |
+
backend/data/*.gz
|
| 41 |
+
backend/data/*.xlsx
|
| 42 |
+
backend/data/global/airports/airports_global.csv
|
| 43 |
+
backend/data/*.gpkg
|
| 44 |
+
backend/data/osm/
|
| 45 |
+
backend/data/overture/
|
| 46 |
+
backend/data/kontur/
|
| 47 |
+
backend/data/hdx/
|
| 48 |
+
backend/data/base/
|
| 49 |
+
backend/data/inec/
|
| 50 |
+
backend/data/temp/
|
| 51 |
+
backend/data/climate/
|
| 52 |
+
backend/data/ms_buildings/
|
| 53 |
+
backend/data/stri/
|
| 54 |
+
backend/data/socioeconomic/
|
| 55 |
+
backend/data/terrain/
|
| 56 |
+
backend/data/worldbank/
|
| 57 |
+
|
| 58 |
+
# Logs
|
| 59 |
+
*.log
|
| 60 |
+
npm-debug.log*
|
| 61 |
+
|
| 62 |
+
# Build
|
| 63 |
+
dist/
|
| 64 |
+
build/
|
| 65 |
+
*.egg-info/
|
ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GeoQuery Architecture
|
| 2 |
+
|
| 3 |
+
## System Overview
|
| 4 |
+
|
| 5 |
+
GeoQuery is a **Territorial Intelligence Platform** that combines Large Language Models (LLMs) with geospatial analysis to enable natural language querying of geographic datasets. The system translates conversational queries into SQL, executes spatial operations, and presents results through interactive maps and data visualizations.
|
| 6 |
+
|
| 7 |
+
### Design Philosophy
|
| 8 |
+
|
| 9 |
+
1. **Natural Language First**: Users interact through conversational queries, not SQL or GIS interfaces
|
| 10 |
+
2. **Dynamic Data Discovery**: No fixed schema—the system adapts to any GeoJSON dataset added to the catalog
|
| 11 |
+
3. **Streaming Intelligence**: Real-time thought processes and incremental results via Server-Sent Events
|
| 12 |
+
4. **Spatial Native**: PostGIS-compatible spatial operations in DuckDB for performant geospatial analysis
|
| 13 |
+
5. **Visual by Default**: Automatic map visualization, choropleth generation, and data presentation
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## High-Level Architecture
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 21 |
+
│ Frontend │
|
| 22 |
+
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
| 23 |
+
│ │ ChatPanel │ │ MapViewer │ │ DataExplorer │ │
|
| 24 |
+
│ │ (React) │ │ (Leaflet) │ │ (Table) │ │
|
| 25 |
+
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
| 26 |
+
│ │ │ │ │
|
| 27 |
+
│ └──────────────────┴──────────────────┘ │
|
| 28 |
+
│ │ (SSE/HTTP) │
|
| 29 |
+
└───────────────────────────┼─────────────────────────────────┘
|
| 30 |
+
│
|
| 31 |
+
┌───────────────────────────┼─────────────────────────────────┐
|
| 32 |
+
│ API Layer │
|
| 33 |
+
│ ┌──────────────────────────────────────────────────┐ │
|
| 34 |
+
│ │ FastAPI Endpoints │ │
|
| 35 |
+
│ │ /api/chat (SSE) │ /api/catalog │ /api/schema │ │
|
| 36 |
+
│ └──────────────────────────────────────────────────┘ │
|
| 37 |
+
│ │ │
|
| 38 |
+
└───────────────────────────┼─────────────────────────────────┘
|
| 39 |
+
│
|
| 40 |
+
┌───────────────────────────┼─────────────────────────────────┐
|
| 41 |
+
│ Service Layer │
|
| 42 |
+
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
| 43 |
+
│ │ QueryExecutor│ │ LLMGateway │ │ GeoEngine │ │
|
| 44 |
+
│ │ │ │ (Gemini) │ │ (DuckDB) │ │
|
| 45 |
+
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
| 46 |
+
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
| 47 |
+
│ │ DataCatalog │ │SemanticSearch│ │ SessionStore │ │
|
| 48 |
+
│ │ (Embeddings) │ │ (Vectors) │ │ (Layers) │ │
|
| 49 |
+
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
| 50 |
+
└─────────────────────────────────────────────────────────────┘
|
| 51 |
+
│
|
| 52 |
+
┌───────────────────────────┼─────────────────────────────────┐
|
| 53 |
+
│ Data Layer │
|
| 54 |
+
│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐     │
|
| 55 |
+
│ │ catalog.json │ │ GeoJSON │ │ embeddings │ │
|
| 56 |
+
│ │ (Metadata) │ │ (Datasets) │ │ (.npy) │ │
|
| 57 |
+
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
| 58 |
+
│ ┌──────────────────────────────────────────────────┐ │
|
| 59 |
+
│ │ DuckDB In-Memory Database │ │
|
| 60 |
+
│ │ (Spatial Tables, Temporary Layers, Indexes) │ │
|
| 61 |
+
│ └──────────────────────────────────────────────────┘ │
|
| 62 |
+
└─────────────────────────────────────────────────────────────┘
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## Core Components
|
| 68 |
+
|
| 69 |
+
### 1. Frontend (Next.js + React)
|
| 70 |
+
|
| 71 |
+
**Location**: `frontend/src/`
|
| 72 |
+
|
| 73 |
+
The frontend is a single-page application built with Next.js that provides:
|
| 74 |
+
- **ChatPanel**: Conversational interface with streaming responses
|
| 75 |
+
- **MapViewer**: Interactive Leaflet map with layer management
|
| 76 |
+
- **DataExplorer**: Tabular data view with export capabilities
|
| 77 |
+
|
| 78 |
+
**Key Technologies**:
|
| 79 |
+
- Next.js 14 (App Router)
|
| 80 |
+
- React 18 with hooks
|
| 81 |
+
- Leaflet for map rendering
|
| 82 |
+
- Server-Sent Events (SSE) for streaming
|
| 83 |
+
- dnd-kit for drag-and-drop layer reordering
|
| 84 |
+
|
| 85 |
+
### 2. API Layer (FastAPI)
|
| 86 |
+
|
| 87 |
+
**Location**: `backend/api/`
|
| 88 |
+
|
| 89 |
+
RESTful API with streaming support:
|
| 90 |
+
- **`/api/chat`** (POST): Main query endpoint with SSE streaming
|
| 91 |
+
- **`/api/catalog`** (GET): Returns available datasets
|
| 92 |
+
- **`/api/schema`** (GET): Returns database schema
|
| 93 |
+
|
| 94 |
+
**Key Technologies**:
|
| 95 |
+
- FastAPI for async HTTP
|
| 96 |
+
- Starlette for SSE streaming
|
| 97 |
+
- CORS middleware for cross-origin requests
|
| 98 |
+
|
| 99 |
+
### 3. Service Layer
|
| 100 |
+
|
| 101 |
+
#### QueryExecutor (`backend/services/executor.py`)
|
| 102 |
+
Orchestrates the entire query pipeline:
|
| 103 |
+
1. Intent detection
|
| 104 |
+
2. Data discovery
|
| 105 |
+
3. SQL generation
|
| 106 |
+
4. Query execution
|
| 107 |
+
5. Response formatting
|
| 108 |
+
6. Explanation generation
|
| 109 |
+
|
| 110 |
+
#### LLMGateway (`backend/core/llm_gateway.py`)
|
| 111 |
+
Interfaces with Gemini API:
|
| 112 |
+
- Intent detection with thinking
|
| 113 |
+
- Text-to-SQL generation
|
| 114 |
+
- Natural language explanations
|
| 115 |
+
- Layer naming and styling
|
| 116 |
+
- Error correction
|
| 117 |
+
- Streaming support
|
| 118 |
+
|
| 119 |
+
#### GeoEngine (`backend/core/geo_engine.py`)
|
| 120 |
+
Manages spatial database:
|
| 121 |
+
- DuckDB connection with Spatial extension
|
| 122 |
+
- Lazy table loading from GeoJSON
|
| 123 |
+
- SQL query execution
|
| 124 |
+
- Result formatting to GeoJSON
|
| 125 |
+
- Temporary layer registration
|
| 126 |
+
|
| 127 |
+
#### DataCatalog (`backend/core/data_catalog.py`)
|
| 128 |
+
Dataset discovery system:
|
| 129 |
+
- Loads `catalog.json` metadata
|
| 130 |
+
- Generates table summaries for LLM context
|
| 131 |
+
- Provides schema information
|
| 132 |
+
- Manages dataset metadata
|
| 133 |
+
|
| 134 |
+
#### SemanticSearch (`backend/core/semantic_search.py`)
|
| 135 |
+
Vector-based dataset discovery:
|
| 136 |
+
- Generates embeddings for dataset descriptions
|
| 137 |
+
- Performs cosine similarity search
|
| 138 |
+
- Returns top-k relevant datasets
|
| 139 |
+
- Scales to large catalogs (100+ datasets)
|
| 140 |
+
|
| 141 |
+
#### SessionStore (`backend/core/session_store.py`)
|
| 142 |
+
User session management:
|
| 143 |
+
- Tracks created map layers per session
|
| 144 |
+
- Enables spatial operations on user layers
|
| 145 |
+
- Maintains layer metadata
|
| 146 |
+
|
| 147 |
+
### 4. Data Layer
|
| 148 |
+
|
| 149 |
+
#### Catalog System (`backend/data/catalog.json`)
|
| 150 |
+
Central metadata registry:
|
| 151 |
+
- Dataset paths and descriptions
|
| 152 |
+
- Semantic descriptions for AI discovery
|
| 153 |
+
- Categories and tags
|
| 154 |
+
- Schema information
|
| 155 |
+
- Data provenance
|
| 156 |
+
|
| 157 |
+
#### GeoJSON Datasets (`backend/data/`)
|
| 158 |
+
Organized by source:
|
| 159 |
+
- `osm/` - OpenStreetMap data (roads, buildings, POI)
|
| 160 |
+
- `admin/` - Administrative boundaries (HDX)
|
| 161 |
+
- `global/` - Global datasets (Kontur, Natural Earth)
|
| 162 |
+
- `socioeconomic/` - World Bank, MPI data
|
| 163 |
+
- `stri/` - STRI GIS Portal datasets
|
| 164 |
+
|
| 165 |
+
#### Vector Embeddings (`backend/data/embeddings.npy`)
|
| 166 |
+
Sentence transformer embeddings for semantic search
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
## Data Flow: User Query to Response
|
| 171 |
+
|
| 172 |
+
### Step 1: User Input
|
| 173 |
+
```
|
| 174 |
+
User: "Show me hospitals in Panama City"
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
### Step 2: Frontend → Backend
|
| 178 |
+
```
|
| 179 |
+
POST /api/chat
|
| 180 |
+
{
|
| 181 |
+
"message": "Show me hospitals in Panama City",
|
| 182 |
+
"history": []
|
| 183 |
+
}
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### Step 3: Intent Detection (LLM)
|
| 187 |
+
```python
|
| 188 |
+
# QueryExecutor calls LLMGateway.detect_intent()
|
| 189 |
+
intent = await llm.detect_intent(query, history)
|
| 190 |
+
# Returns: "MAP_REQUEST"
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
### Step 4: Semantic Discovery
|
| 194 |
+
```python
|
| 195 |
+
# SemanticSearch finds relevant tables
|
| 196 |
+
candidates = semantic_search.search_table_names(query, top_k=15)
|
| 197 |
+
# Returns: ["panama_healthsites_geojson", "osm_amenities", ...]
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
### Step 5: Table Schema Retrieval
|
| 201 |
+
```python
|
| 202 |
+
# GeoEngine loads relevant tables
|
| 203 |
+
geo_engine.ensure_table_loaded("panama_healthsites_geojson")
|
| 204 |
+
schema = geo_engine.get_table_schemas()
|
| 205 |
+
# Returns: "Table: panama_healthsites_geojson\nColumns: name, amenity, geom..."
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
### Step 6: SQL Generation (LLM)
|
| 209 |
+
```python
|
| 210 |
+
# LLMGateway generates SQL
|
| 211 |
+
sql = await llm.generate_analytical_sql(query, schema, history)
|
| 212 |
+
# Returns: "SELECT name, amenity, geom FROM panama_healthsites_geojson
|
| 213 |
+
# WHERE amenity = 'hospital' AND ST_Intersects(geom, ...)"
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### Step 7: Query Execution
|
| 217 |
+
```python
|
| 218 |
+
# GeoEngine executes spatial query
|
| 219 |
+
geojson = geo_engine.execute_spatial_query(sql)
|
| 220 |
+
# Returns: GeoJSON with 45 hospital features
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
### Step 8: Response Formatting
|
| 224 |
+
```python
|
| 225 |
+
# Add layer metadata, generate name, configure visualization
|
| 226 |
+
layer_info = await llm.generate_layer_name(query, sql)
|
| 227 |
+
# Returns: {"name": "Hospitals in Panama City", "emoji": "🏥", "pointStyle": "icon"}
|
| 228 |
+
|
| 229 |
+
geojson = format_geojson_layer(query, geojson, features,
|
| 230 |
+
layer_info["name"],
|
| 231 |
+
layer_info["emoji"],
|
| 232 |
+
layer_info["pointStyle"])
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
### Step 9: Explanation Generation (Streaming)
|
| 236 |
+
```python
|
| 237 |
+
# LLMGateway generates explanation with streaming
|
| 238 |
+
async for chunk in llm.stream_explanation(query, sql, data_summary, history):
|
| 239 |
+
if chunk["type"] == "thought":
|
| 240 |
+
# Stream thinking process to frontend
|
| 241 |
+
elif chunk["type"] == "content":
|
| 242 |
+
# Stream actual response text
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
### Step 10: Frontend Rendering
|
| 246 |
+
- ChatPanel displays streamed explanation
|
| 247 |
+
- MapViewer renders GeoJSON layer with hospital icons
|
| 248 |
+
- DataExplorer shows tabular data
|
| 249 |
+
|
| 250 |
+
---
|
| 251 |
+
|
| 252 |
+
## Key Design Decisions
|
| 253 |
+
|
| 254 |
+
### 1. Why DuckDB Instead of PostgreSQL?
|
| 255 |
+
|
| 256 |
+
**Chosen**: DuckDB with Spatial extension
|
| 257 |
+
|
| 258 |
+
**Rationale**:
|
| 259 |
+
- **Zero Configuration**: Embedded database, no separate server
|
| 260 |
+
- **Fast Analytics**: Columnar storage optimized for analytical queries
|
| 261 |
+
- **Spatial Support**: Full PostGIS compatibility via spatial extension
|
| 262 |
+
- **GeoJSON Native**: Direct GeoJSON import/export
|
| 263 |
+
- **Lightweight**: Perfect for development and small deployments
|
| 264 |
+
|
| 265 |
+
**Trade-off**: Limited concurrency compared to PostgreSQL (acceptable for our use case)
|
| 266 |
+
|
| 267 |
+
### 2. Why Semantic Search for Dataset Discovery?
|
| 268 |
+
|
| 269 |
+
**Chosen**: Sentence transformer embeddings + cosine similarity
|
| 270 |
+
|
| 271 |
+
**Rationale**:
|
| 272 |
+
- **Scalability**: Works with 100+ datasets without overwhelming LLM context
|
| 273 |
+
- **Accuracy**: Better matches than keyword search
|
| 274 |
+
- **Token Efficiency**: Only sends relevant table schemas to LLM
|
| 275 |
+
|
| 276 |
+
**Example**:
|
| 277 |
+
- Query: "Where can I find doctors?"
|
| 278 |
+
- Semantic search finds: `panama_healthsites_geojson` (closest match)
|
| 279 |
+
- LLM then generates SQL using only relevant schema
|
| 280 |
+
|
| 281 |
+
### 3. Why Server-Sent Events for Streaming?
|
| 282 |
+
|
| 283 |
+
**Chosen**: SSE instead of WebSockets
|
| 284 |
+
|
| 285 |
+
**Rationale**:
|
| 286 |
+
- **Simpler Protocol**: One-way communication (server → client)
|
| 287 |
+
- **HTTP Compatible**: Works through firewalls and proxies
|
| 288 |
+
- **Auto Reconnect**: Built-in browser support
|
| 289 |
+
- **Event Types**: Named events for different message types
|
| 290 |
+
|
| 291 |
+
**Trade-off**: No client → server streaming (not needed for our use case)
|
| 292 |
+
|
| 293 |
+
### 4. Why Lazy Table Loading?
|
| 294 |
+
|
| 295 |
+
**Chosen**: Load GeoJSON only when needed
|
| 296 |
+
|
| 297 |
+
**Rationale**:
|
| 298 |
+
- **Fast Startup**: Don't load all datasets on initialization
|
| 299 |
+
- **Memory Efficient**: Only keep active tables in memory
|
| 300 |
+
- **Flexible**: Easy to add new datasets without restart
|
| 301 |
+
|
| 302 |
+
**Implementation**:
|
| 303 |
+
```python
|
| 304 |
+
def ensure_table_loaded(self, table_name: str) -> bool:
|
| 305 |
+
if table_name not in self.loaded_tables:
|
| 306 |
+
self.load_geojson_to_table(table_name)
|
| 307 |
+
return table_name in self.loaded_tables
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
### 5. Why Choropleth Auto-Detection?
|
| 311 |
+
|
| 312 |
+
**Chosen**: Automatic choropleth configuration based on data
|
| 313 |
+
|
| 314 |
+
**Rationale**:
|
| 315 |
+
- **User Friendly**: No manual configuration needed
|
| 316 |
+
- **Intelligent**: Prioritizes meaningful columns (population, area, density)
|
| 317 |
+
- **Adaptive**: Works with any numeric column
|
| 318 |
+
|
| 319 |
+
**Logic**:
|
| 320 |
+
1. Find numeric columns
|
| 321 |
+
2. Prioritize keywords (population, area, count)
|
| 322 |
+
3. Check value variance (skip if all same)
|
| 323 |
+
4. Enable choropleth with appropriate scale (linear/log)
|
| 324 |
+
|
| 325 |
+
---
|
| 326 |
+
|
| 327 |
+
## Error Handling & Resilience
|
| 328 |
+
|
| 329 |
+
### SQL Error Correction
|
| 330 |
+
When a generated SQL query fails:
|
| 331 |
+
1. Extract error message
|
| 332 |
+
2. Send to LLM with original query and schema
|
| 333 |
+
3. LLM generates corrected SQL
|
| 334 |
+
4. Execute repaired query
|
| 335 |
+
5. If still fails, return error to user
|
| 336 |
+
|
| 337 |
+
### Data Unavailable Handling
|
| 338 |
+
When requested data doesn't exist:
|
| 339 |
+
1. LLM returns special error marker: `-- ERROR: DATA_UNAVAILABLE`
|
| 340 |
+
2. System extracts "Requested" and "Available" from response
|
| 341 |
+
3. Returns helpful message to user with alternatives
|
| 342 |
+
|
| 343 |
+
### Missing Tables
|
| 344 |
+
- Catalog lists all datasets, but not all of them are loaded
|
| 345 |
+
- Lazy loading attempts to load on demand
|
| 346 |
+
- If file missing, logs warning and continues
|
| 347 |
+
|
| 348 |
+
---
|
| 349 |
+
|
| 350 |
+
## Performance Considerations
|
| 351 |
+
|
| 352 |
+
### Query Optimization
|
| 353 |
+
- **Spatial Indexes**: DuckDB automatically indexes geometry columns
|
| 354 |
+
- **Top-K Limits**: Large result sets limited to prevent memory issues
|
| 355 |
+
- **Lazy Evaluation**: Stream results when possible
|
| 356 |
+
|
| 357 |
+
### Embedding Cache
|
| 358 |
+
- Embeddings pre-computed and stored in `.npy` file
|
| 359 |
+
- Only regenerated when catalog changes
|
| 360 |
+
- Fast cosine similarity via NumPy vectorization
|
| 361 |
+
|
| 362 |
+
### Frontend Rendering
|
| 363 |
+
- **Layer Virtualization**: Large point datasets use circle markers for performance
|
| 364 |
+
- **Choropleth Colors**: Pre-computed color palettes
|
| 365 |
+
- **Lazy Map Loading**: Only render visible layers
|
| 366 |
+
|
| 367 |
+
---
|
| 368 |
+
|
| 369 |
+
## Security Considerations
|
| 370 |
+
|
| 371 |
+
### LLM Prompt Injection
|
| 372 |
+
- **Mitigation**: Clear separation of user query and system instructions
|
| 373 |
+
- **Validation**: SQL parsing and column name verification
|
| 374 |
+
- **Sandboxing**: Read-only queries (no INSERT/UPDATE/DELETE)
|
| 375 |
+
|
| 376 |
+
### API Access
|
| 377 |
+
- **CORS**: Configured allowed origins
|
| 378 |
+
- **Rate Limiting**: Can be added via middleware (not currently implemented)
|
| 379 |
+
- **Authentication**: Not implemented (suitable for internal/demo deployments)
|
| 380 |
+
|
| 381 |
+
### Data Privacy
|
| 382 |
+
- No user data stored (stateless queries)
|
| 383 |
+
- Session layers stored in-memory only
|
| 384 |
+
- No query logging by default
|
| 385 |
+
|
| 386 |
+
---
|
| 387 |
+
|
| 388 |
+
## Scalability Path
|
| 389 |
+
|
| 390 |
+
### Current Limitations
|
| 391 |
+
- **Single Process**: No horizontal scaling
|
| 392 |
+
- **In-Memory Database**: Limited by RAM
|
| 393 |
+
- **No Caching**: Repeated queries re-execute
|
| 394 |
+
|
| 395 |
+
### Future Enhancements
|
| 396 |
+
1. **Add PostgreSQL/PostGIS**: For production deployments with persistence
|
| 397 |
+
2. **Redis Cache**: Cache query results and embeddings
|
| 398 |
+
3. **Load Balancer**: Multiple FastAPI instances
|
| 399 |
+
4. **Background Workers**: Async data ingestion with Celery
|
| 400 |
+
5. **CDN**: Serve GeoJSON datasets from cloud storage
|
| 401 |
+
|
| 402 |
+
---
|
| 403 |
+
|
| 404 |
+
## Technology Choices Summary
|
| 405 |
+
|
| 406 |
+
| Component | Technology | Why? |
|
| 407 |
+
|-----------|-----------|------|
|
| 408 |
+
| **Backend Language** | Python 3.11+ | Rich geospatial ecosystem, LLM SDKs |
|
| 409 |
+
| **Web Framework** | FastAPI | Async support, OpenAPI docs, SSE |
|
| 410 |
+
| **Database** | DuckDB | Embedded, fast analytics, spatial support |
|
| 411 |
+
| **LLM** | Google Gemini | Thinking mode, streaming, JSON output |
|
| 412 |
+
| **Frontend Framework** | Next.js 14 | React, SSR, App Router, TypeScript |
|
| 413 |
+
| **Map Library** | Leaflet | Lightweight, flexible, plugin ecosystem |
|
| 414 |
+
| **Embeddings** | sentence-transformers | Multilingual, semantic similarity |
|
| 415 |
+
| **Data Format** | GeoJSON | Standard, human-readable, LLM-friendly |
|
| 416 |
+
|
| 417 |
+
---
|
| 418 |
+
|
| 419 |
+
## Next Steps
|
| 420 |
+
|
| 421 |
+
For detailed information on specific components:
|
| 422 |
+
- [Backend Services](docs/backend/CORE_SERVICES.md)
|
| 423 |
+
- [API Reference](docs/backend/API_ENDPOINTS.md)
|
| 424 |
+
- [Frontend Components](docs/frontend/COMPONENTS.md)
|
| 425 |
+
- [Data Flow](docs/DATA_FLOW.md)
|
| 426 |
+
- [Setup Guide](SETUP.md)
|
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deployment Guide
|
| 2 |
+
|
| 3 |
+
This guide describes how to deploy the GeoQuery platform for public access.
|
| 4 |
+
|
| 5 |
+
## Strategy
|
| 6 |
+
We use a **single-container** approach where the backend (FastAPI) serves the frontend (Next.js) as static files. This simplifies deployment to PaaS providers like Railway, Render, or Hugging Face Spaces.
|
| 7 |
+
|
| 8 |
+
### Architecture
|
| 9 |
+
- **Build Stage**: Node.js builder compiles the Next.js frontend into static HTML/CSS/JS (`frontend/out`).
|
| 10 |
+
- **Runtime Stage**: Python 3.11 image installs backend dependencies.
|
| 11 |
+
- **Serving**: FastAPI mounts the static build at `/` and serves the API at `/api`.
|
| 12 |
+
- **Data**: Geospatial data (`backend/data`) is included in the image (~2GB).
|
| 13 |
+
|
| 14 |
+
## Prerequisites
|
| 15 |
+
- Docker
|
| 16 |
+
- ~5GB Free disk space (for image build)
|
| 17 |
+
- 4GB+ RAM on host machine (for DuckDB in-memory analytics)
|
| 18 |
+
|
| 19 |
+
## Local Build & Run
|
| 20 |
+
```bash
|
| 21 |
+
# Build the image
|
| 22 |
+
docker build -t geoquery .
|
| 23 |
+
|
| 24 |
+
# Run the container (Mapping 7860 to 7860 to match standard Space config)
|
| 25 |
+
docker run -p 7860:7860 -e GEMINI_API_KEY=your_key_here geoquery
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Hosting Options (Getting a Public URL)
|
| 29 |
+
|
| 30 |
+
To share this demo with others, you need to host the Docker container on a cloud provider.
|
| 31 |
+
|
| 32 |
+
### Option A: Hugging Face Spaces (Easiest & Free)
|
| 33 |
+
This will give you a public URL like `https://huggingface.co/spaces/username/geoquery`.
|
| 34 |
+
|
| 35 |
+
1. **Create Space**: Go to [huggingface.co/spaces](https://huggingface.co/spaces) -> "Create new Space".
|
| 36 |
+
- SDK: **Docker**
|
| 37 |
+
- Template: **Blank**
|
| 38 |
+
2. **Push Code**:
|
| 39 |
+
```bash
|
| 40 |
+
git remote add space https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME
|
| 41 |
+
git push space main
|
| 42 |
+
```
|
| 43 |
+
3. **Configure Secrets**: In the Space "Settings" tab, add a "Repository Secret" named `GEMINI_API_KEY` with your key.
|
| 44 |
+
|
| 45 |
+
### Option B: Railway / Render
|
| 46 |
+
1. Connect your GitHub repository.
|
| 47 |
+
2. Railway/Render will detect the `Dockerfile`.
|
| 48 |
+
3. Set the environment variable `GEMINI_API_KEY`.
|
| 49 |
+
4. The deployed app will be available at a URL like `https://geoquery-production.up.railway.app`.
|
| 50 |
+
|
| 51 |
+
### Option C: Google Cloud Run
|
| 52 |
+
1. Build: `gcloud builds submit --tag gcr.io/PROJECT_ID/geoquery`
|
| 53 |
+
2. Deploy: `gcloud run deploy geoquery --image gcr.io/PROJECT_ID/geoquery --platform managed`
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
## Notes
|
| 57 |
+
- **Data Persistence**: The current setup uses read-only data baked into the image. User uploads will be lost on restart unless a volume is mounted to `/app/backend/data/custom`.
|
| 58 |
+
- **Memory Usage**: DuckDB processes data in-memory. For large queries, ensure the host has sufficient RAM.
|
Dockerfile
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==========================================
|
| 2 |
+
# Stage 1: Build Frontend (Next.js)
|
| 3 |
+
# ==========================================
|
| 4 |
+
FROM node:20-alpine AS frontend
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install dependencies
|
| 8 |
+
COPY frontend/package*.json ./
|
| 9 |
+
RUN npm ci
|
| 10 |
+
|
| 11 |
+
# Copy source code
|
| 12 |
+
COPY frontend/ ./
|
| 13 |
+
|
| 14 |
+
# Configure for static export
|
| 15 |
+
ENV NEXT_PUBLIC_API_URL=/api/v1
|
| 16 |
+
# Run build (creates /app/out)
|
| 17 |
+
RUN npm run build
|
| 18 |
+
|
| 19 |
+
# ==========================================
|
| 20 |
+
# Stage 2: Runtime (Python + FastAPI)
|
| 21 |
+
# ==========================================
|
| 22 |
+
FROM python:3.11-slim
|
| 23 |
+
|
| 24 |
+
# Create a non-root user (Recommended for HF Spaces)
|
| 25 |
+
RUN useradd -m -u 1000 user
|
| 26 |
+
USER user
|
| 27 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 28 |
+
|
| 29 |
+
WORKDIR /app
|
| 30 |
+
|
| 31 |
+
# Install system dependencies (as root before switching user)
|
| 32 |
+
USER root
|
| 33 |
+
RUN apt-get update && apt-get install -y \
|
| 34 |
+
build-essential \
|
| 35 |
+
libgeos-dev \
|
| 36 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 37 |
+
USER user
|
| 38 |
+
|
| 39 |
+
# Install Python dependencies
|
| 40 |
+
COPY --chown=user backend/requirements.txt .
|
| 41 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 42 |
+
|
| 43 |
+
# Copy download script and execute data fetch
|
| 44 |
+
COPY backend/scripts/download_hdx_panama.py backend/scripts/
|
| 45 |
+
RUN python backend/scripts/download_hdx_panama.py
|
| 46 |
+
|
| 47 |
+
# Copy Backend Code
|
| 48 |
+
COPY --chown=user backend/ backend/
|
| 49 |
+
|
| 50 |
+
# Copy Built Frontend to Backend Static Directory
|
| 51 |
+
# ensure strict permissions
|
| 52 |
+
COPY --from=frontend --chown=user /app/out /app/backend/static
|
| 53 |
+
|
| 54 |
+
# Expose port 7860 (Standard for HF Spaces)
|
| 55 |
+
EXPOSE 7860
|
| 56 |
+
|
| 57 |
+
# Run Application
|
| 58 |
+
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: GeoQuery
|
| 3 |
+
emoji: 🌍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
app_port: 7860
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# GeoQuery
|
| 13 |
+
🌍🤖
|
| 14 |
+
|
| 15 |
+
**Territorial Intelligence Platform** - Natural language interface for geospatial data analysis powered by LLMs and DuckDB Spatial.
|
| 16 |
+
|
| 17 |
+
   
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## ✨ What is GeoQuery?
|
| 22 |
+
|
| 23 |
+
GeoQuery transforms geographic data analysis by combining **Large Language Models** with **spatial databases**. Simply ask questions in natural language and get instant maps, charts, and insights.
|
| 24 |
+
|
| 25 |
+
**Example**: *"Show me hospitals in Panama City"* → Interactive map with 45 hospital locations, automatically styled with 🏥 icons.
|
| 26 |
+
|
| 27 |
+
### Key Capabilities
|
| 28 |
+
|
| 29 |
+
- 🗣️ **Conversational Queries** - Natural language instead of SQL or GIS interfaces
|
| 30 |
+
- 🗺️ **Auto-Visualization** - Smart choropleth maps, point markers, and heatmaps
|
| 31 |
+
- 📊 **Dynamic Charts** - Automatic bar, pie, and line chart generation
|
| 32 |
+
- 🔍 **Semantic Discovery** - Finds relevant datasets from 100+ options using AI embeddings
|
| 33 |
+
- 🧩 **Multi-Step Analysis** - Complex queries automatically decomposed and executed
|
| 34 |
+
- 💡 **Thinking Transparency** - See the LLM's reasoning process in real-time
|
| 35 |
+
- 🎨 **Custom Point Styles** - Icon markers for POI, circle points for large datasets
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## 🎬 Quick Demo
|
| 40 |
+
|
| 41 |
+
### Try These Queries
|
| 42 |
+
|
| 43 |
+
| Query | What You Get |
|
| 44 |
+
|-------|--------------|
|
| 45 |
+
| "Show me all provinces colored by area" | Choropleth map with size-based gradient |
|
| 46 |
+
| "Where are the universities?" | Point map with 🎓 icons |
|
| 47 |
+
| "Compare hospital count vs school count by province" | Multi-step analysis with side-by-side bar charts |
|
| 48 |
+
| "Show intersections in David as circle points" | 1,288 traffic intersections as simple colored circles |
|
| 49 |
+
| "Population density in Veraguas" | H3 hexagon heatmap (33K cells) |
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## 🏗️ Architecture
|
| 54 |
+
|
| 55 |
+
```
|
| 56 |
+
┌──────────────────────────────────────────────────────────┐
|
| 57 |
+
│ Frontend (Next.js) │
|
| 58 |
+
│ Chat Interface │ Leaflet Maps │ Data Explorer │
|
| 59 |
+
└────────────────────────┬─────────────────────────────────┘
|
| 60 |
+
│ (SSE Streaming)
|
| 61 |
+
┌────────────────────────┴─────────────────────────────────┐
|
| 62 |
+
│ Backend (FastAPI) │
|
| 63 |
+
│ Intent Detection → Semantic Search → SQL Generation │
|
| 64 |
+
│ ↓ ↓ ↓ │
|
| 65 |
+
│ Gemini LLM DataCatalog (Embeddings) DuckDB Spatial │
|
| 66 |
+
└──────────────────────────────────────────────────────────┘
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
It supports dynamic dataset discovery via semantic embeddings + LLM-generated spatial SQL.
|
| 70 |
+
|
| 71 |
+
📖 **[Detailed Architecture](ARCHITECTURE.md)**
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## 🚀 Quick Start
|
| 76 |
+
|
| 77 |
+
### Prerequisites
|
| 78 |
+
|
| 79 |
+
- **Python 3.11+**
|
| 80 |
+
- **Node.js 18+**
|
| 81 |
+
- **Google AI API Key** ([Get one free](https://aistudio.google.com/app/apikey))
|
| 82 |
+
|
| 83 |
+
### Installation
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
# 1. Clone repository
|
| 87 |
+
git clone https://github.com/GerardCB/GeoQuery.git
|
| 88 |
+
cd GeoQuery
|
| 89 |
+
|
| 90 |
+
# 2. Backend setup
|
| 91 |
+
cd backend
|
| 92 |
+
python -m venv venv
|
| 93 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 94 |
+
pip install -e .
|
| 95 |
+
|
| 96 |
+
# 3. Configure API key
|
| 97 |
+
export GEMINI_API_KEY="your-api-key-here"
|
| 98 |
+
|
| 99 |
+
# 4. Start backend
|
| 100 |
+
uvicorn backend.main:app --reload --host 0.0.0.0 --port 8000
|
| 101 |
+
|
| 102 |
+
# 5. Frontend setup (new terminal)
|
| 103 |
+
cd frontend
|
| 104 |
+
npm install
|
| 105 |
+
npm run dev
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
### 🎉 Done!
|
| 109 |
+
|
| 110 |
+
Open **http://localhost:3000** and start querying!
|
| 111 |
+
|
| 112 |
+
📘 **[Detailed Setup Guide](SETUP.md)**
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## 📂 Project Structure
|
| 117 |
+
|
| 118 |
+
```
|
| 119 |
+
GeoQuery/
|
| 120 |
+
├── backend/
|
| 121 |
+
│ ├── api/ # FastAPI endpoints
|
| 122 |
+
│ │ └── endpoints/ # /chat, /catalog, /schema
|
| 123 |
+
│ ├── core/ # Core services
|
| 124 |
+
│ │ ├── llm_gateway.py # Gemini API integration
|
| 125 |
+
│ │ ├── geo_engine.py # DuckDB Spatial wrapper
|
| 126 |
+
│ │ ├── semantic_search.py # Embedding-based discovery
|
| 127 |
+
│ │ ├── data_catalog.py # Dataset metadata management
|
| 128 |
+
│ │ ├── query_planner.py # Multi-step query orchestration
|
| 129 |
+
│ │ └── prompts.py # LLM system instructions
|
| 130 |
+
│ ├── services/ # Business logic
|
| 131 |
+
│ │ ├── executor.py # Query pipeline orchestrator
|
| 132 |
+
│ │ └── response_formatter.py # GeoJSON/chart formatting
|
| 133 |
+
│ ├── data/ # Datasets and metadata
|
| 134 |
+
│ │ ├── catalog.json # Dataset registry
|
| 135 |
+
│ │ ├── embeddings.npy # Vector embeddings
|
| 136 |
+
│ │ ├── osm/ # OpenStreetMap data
|
| 137 |
+
│ │ ├── admin/ # Administrative boundaries
|
| 138 |
+
│ │ ├── global/ # Global datasets (Kontur, etc.)
|
| 139 |
+
│ │ └── socioeconomic/ # World Bank, poverty data
|
| 140 |
+
│ └── scripts/ # Data ingestion scripts
|
| 141 |
+
│ ├── download_geofabrik.py
|
| 142 |
+
│ ├── download_hdx_panama.py
|
| 143 |
+
│ └── stri_catalog_scraper.py
|
| 144 |
+
├── frontend/
|
| 145 |
+
│ └── src/
|
| 146 |
+
│ ├── app/ # Next.js App Router pages
|
| 147 |
+
│ └── components/
|
| 148 |
+
│ ├── ChatPanel.tsx # Chat interface with SSE
|
| 149 |
+
│ ├── MapViewer.tsx # Leaflet map with layers
|
| 150 |
+
│ └── DataExplorer.tsx # Tabular data view
|
| 151 |
+
└── docs/ # Technical documentation
|
| 152 |
+
├── backend/ # Backend deep-dives
|
| 153 |
+
├── frontend/ # Frontend architecture
|
| 154 |
+
└── data/ # Data system docs
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
---
|
| 158 |
+
|
| 159 |
+
## 🔧 Technology Stack
|
| 160 |
+
|
| 161 |
+
| Layer | Technology | Purpose |
|
| 162 |
+
|-------|-----------|---------|
|
| 163 |
+
| **LLM** | Google Gemini 2.0 | Intent detection, SQL generation, explanations |
|
| 164 |
+
| **Backend** | Python 3.11 + FastAPI | Async HTTP server with SSE streaming |
|
| 165 |
+
| **Database** | DuckDB with Spatial | In-memory spatial analytics |
|
| 166 |
+
| **Frontend** | Next.js 15 + React 18 | Server-side rendering + interactive UI |
|
| 167 |
+
| **Maps** | Leaflet 1.9 | Interactive web maps |
|
| 168 |
+
| **Embeddings** | sentence-transformers | Semantic dataset search |
|
| 169 |
+
| **Data** | GeoJSON + Parquet | Standardized geospatial formats |
|
| 170 |
+
|
| 171 |
+
---
|
| 172 |
+
|
| 173 |
+
## 📊 Available Datasets
|
| 174 |
+
|
| 175 |
+
GeoQuery currently includes 100+ datasets across multiple categories:
|
| 176 |
+
|
| 177 |
+
### Administrative
|
| 178 |
+
- Panama provinces, districts, corregimientos (HDX 2021)
|
| 179 |
+
- Comarca boundaries
|
| 180 |
+
- Electoral districts
|
| 181 |
+
|
| 182 |
+
### Infrastructure
|
| 183 |
+
- Roads and highways (OpenStreetMap)
|
| 184 |
+
- Hospitals and health facilities (986 locations)
|
| 185 |
+
- Universities and schools (200+ institutions)
|
| 186 |
+
- Airports, ports, power plants
|
| 187 |
+
|
| 188 |
+
### Socioeconomic
|
| 189 |
+
- World Bank development indicators
|
| 190 |
+
- Multidimensional poverty index (MPI)
|
| 191 |
+
- Population density (Kontur H3 hexagons - 33K cells)
|
| 192 |
+
|
| 193 |
+
### Natural Environment
|
| 194 |
+
- Protected areas (STRI GIS Portal)
|
| 195 |
+
- Forest cover and land use
|
| 196 |
+
- Rivers and water bodies
|
| 197 |
+
|
| 198 |
+
📖 **[Full Dataset List](docs/data/DATASET_SOURCES.md)** | **[Adding New Data](docs/backend/SCRIPTS.md)**
|
| 199 |
+
|
| 200 |
+
---
|
| 201 |
+
|
| 202 |
+
## 💡 How It Works
|
| 203 |
+
|
| 204 |
+
1. **User Query**: "Show me hospitals in Panama City"
|
| 205 |
+
2. **Intent Detection**: LLM classifies as MAP_REQUEST
|
| 206 |
+
3. **Semantic Search**: Finds `panama_healthsites_geojson` via embeddings
|
| 207 |
+
4. **SQL Generation**: LLM creates: `SELECT name, geom FROM panama_healthsites_geojson WHERE ST_Intersects(geom, (SELECT geom FROM pan_admin2 WHERE adm2_name = 'Panamá'))`
|
| 208 |
+
5. **Execution**: DuckDB Spatial runs query → 45 features
|
| 209 |
+
6. **Visualization**: Auto-styled map with 🏥 icons
|
| 210 |
+
7. **Explanation**: LLM streams natural language summary
|
| 211 |
+
|
| 212 |
+
**Streaming**: See the LLM's thinking process in real-time via Server-Sent Events.
|
| 213 |
+
|
| 214 |
+
📖 **[Detailed Data Flow](docs/DATA_FLOW.md)** | **[LLM Integration](docs/backend/LLM_INTEGRATION.md)**
|
| 215 |
+
|
| 216 |
+
---
|
| 217 |
+
|
| 218 |
+
## 🗺️ Advanced Features
|
| 219 |
+
|
| 220 |
+
### Choropleth Maps
|
| 221 |
+
Automatically detects numeric columns and creates color gradients:
|
| 222 |
+
- **Linear scale**: For area, count
|
| 223 |
+
- **Logarithmic scale**: For population, density
|
| 224 |
+
|
| 225 |
+
### Point Visualization Modes
|
| 226 |
+
- **Icon markers** 🏥🎓⛪: For categorical POI (<500 points)
|
| 227 |
+
- **Circle points** ⭕: For large datasets like intersections (>500 points)
|
| 228 |
+
|
| 229 |
+
### Spatial Operations
|
| 230 |
+
- Intersection: "Find hospitals within protected areas"
|
| 231 |
+
- Difference: "Show me areas outside national parks"
|
| 232 |
+
- Buffer: "Show 5km radius around hospitals"
|
| 233 |
+
|
| 234 |
+
### Multi-Step Queries
|
| 235 |
+
Complex questions automatically decomposed:
|
| 236 |
+
- "Compare population density with hospital coverage by province"
|
| 237 |
+
1. Calculate population per province
|
| 238 |
+
2. Count hospitals per province
|
| 239 |
+
3. Compute ratios
|
| 240 |
+
4. Generate comparison chart
|
| 241 |
+
|
| 242 |
+
---
|
| 243 |
+
|
| 244 |
+
## 📚 Documentation
|
| 245 |
+
|
| 246 |
+
| Document | Description |
|
| 247 |
+
|----------|-------------|
|
| 248 |
+
| **[ARCHITECTURE.md](ARCHITECTURE.md)** | System design, components, decisions |
|
| 249 |
+
| **[SETUP.md](SETUP.md)** | Development environment setup |
|
| 250 |
+
| **[docs/backend/CORE_SERVICES.md](docs/backend/CORE_SERVICES.md)** | Backend services reference |
|
| 251 |
+
| **[docs/backend/API_ENDPOINTS.md](docs/backend/API_ENDPOINTS.md)** | API endpoint documentation |
|
| 252 |
+
| **[docs/frontend/COMPONENTS.md](docs/frontend/COMPONENTS.md)** | React component architecture |
|
| 253 |
+
| **[docs/DATA_FLOW.md](docs/DATA_FLOW.md)** | End-to-end request walkthrough |
|
| 254 |
+
|
| 255 |
+
---
|
| 256 |
+
|
| 257 |
+
## 📄 License
|
| 258 |
+
|
| 259 |
+
MIT License - see **[LICENSE](LICENSE)** for details.
|
| 260 |
+
|
| 261 |
+
---
|
| 262 |
+
|
| 263 |
+
## 🙏 Acknowledgments
|
| 264 |
+
|
| 265 |
+
**Data Sources**:
|
| 266 |
+
- [OpenStreetMap](https://www.openstreetmap.org/) - Infrastructure and POI data
|
| 267 |
+
- [Humanitarian Data Exchange (HDX)](https://data.humdata.org/) - Administrative boundaries
|
| 268 |
+
- [World Bank Open Data](https://data.worldbank.org/) - Socioeconomic indicators
|
| 269 |
+
- [Kontur Population Dataset](https://data.humdata.org/organization/kontur) - H3 population grid
|
| 270 |
+
- [STRI GIS Portal](https://stridata-si.opendata.arcgis.com/) - Environmental datasets
|
| 271 |
+
|
| 272 |
+
**Technologies**:
|
| 273 |
+
- [Google Gemini](https://ai.google.dev/) - LLM API
|
| 274 |
+
- [DuckDB](https://duckdb.org/) - Fast in-process analytics
|
| 275 |
+
- [Leaflet](https://leafletjs.com/) - Interactive maps
|
| 276 |
+
- [Next.js](https://nextjs.org/) - React framework
|
| 277 |
+
|
SETUP.md
ADDED
|
@@ -0,0 +1,455 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GeoQuery Setup Guide
|
| 2 |
+
|
| 3 |
+
Complete guide for setting up the GeoQuery development environment.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Prerequisites
|
| 8 |
+
|
| 9 |
+
### Required Software
|
| 10 |
+
|
| 11 |
+
| Requirement | Minimum Version | Purpose |
|
| 12 |
+
|------------|----------------|---------|
|
| 13 |
+
| **Python** | 3.11+ | Backend runtime |
|
| 14 |
+
| **Node.js** | 18+ | Frontend runtime |
|
| 15 |
+
| **npm** | 9+ | Package management |
|
| 16 |
+
| **Git** | 2.0+ | Version control |
|
| 17 |
+
|
| 18 |
+
### API Keys
|
| 19 |
+
|
| 20 |
+
- **Google AI API Key (Gemini)**: Required for LLM functionality
|
| 21 |
+
- Get one free at: https://aistudio.google.com/app/apikey
|
| 22 |
+
- Free tier: 15 requests/minute, 1500/day
|
| 23 |
+
|
| 24 |
+
### System Requirements
|
| 25 |
+
|
| 26 |
+
- **RAM**: 4GB minimum, 8GB recommended (for DuckDB in-memory database)
|
| 27 |
+
- **Disk**: 2GB for datasets
|
| 28 |
+
- **OS**: macOS, Linux, or Windows (WSL recommended)
|
| 29 |
+
|
| 30 |
+
---
|
| 31 |
+
|
| 32 |
+
## Installation
|
| 33 |
+
|
| 34 |
+
### 1. Clone Repository
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
git clone https://github.com/GerardCB/GeoQuery.git
|
| 38 |
+
cd GeoQuery
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
### 2. Backend Setup
|
| 42 |
+
|
| 43 |
+
#### Create Virtual Environment
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
cd backend
|
| 47 |
+
python3 -m venv venv
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
#### Activate Virtual Environment
|
| 51 |
+
|
| 52 |
+
**macOS/Linux**:
|
| 53 |
+
```bash
|
| 54 |
+
source venv/bin/activate
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
**Windows** (PowerShell):
|
| 58 |
+
```powershell
|
| 59 |
+
venv\Scripts\Activate.ps1
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
**Windows** (CMD):
|
| 63 |
+
```cmd
|
| 64 |
+
venv\Scripts\activate.bat
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
#### Install Dependencies
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
pip install --upgrade pip
|
| 71 |
+
pip install -e .
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
This installs the package in editable mode, including all dependencies from `setup.py`.
|
| 75 |
+
|
| 76 |
+
**Key Dependencies**:
|
| 77 |
+
- `fastapi` - Web framework
|
| 78 |
+
- `uvicorn` - ASGI server
|
| 79 |
+
- `duckdb` - Embedded database
|
| 80 |
+
- `geopandas` - Geospatial data processing
|
| 81 |
+
- `sentence-transformers` - Embeddings
|
| 82 |
+
- `google-generativeai` - Gemini SDK
|
| 83 |
+
|
| 84 |
+
#### Configure Environment Variables
|
| 85 |
+
|
| 86 |
+
Create `.env` file in `backend/` directory:
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
# Required
|
| 90 |
+
GEMINI_API_KEY=your-api-key-here
|
| 91 |
+
|
| 92 |
+
# Optional (defaults shown)
|
| 93 |
+
PORT=8000
|
| 94 |
+
HOST=0.0.0.0
|
| 95 |
+
LOG_LEVEL=INFO
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
**Alternative**: Export directly in terminal:
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
export GEMINI_API_KEY="your-api-key-here"
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Windows**:
|
| 105 |
+
```powershell
|
| 106 |
+
$env:GEMINI_API_KEY="your-api-key-here"
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
#### Verify Backend Installation
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
python -c "import backend; print('Backend installed successfully')"
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
### 3. Frontend Setup
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
cd ../frontend # From backend directory
|
| 119 |
+
npm install
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
**Key Dependencies**:
|
| 123 |
+
- `next` - React framework
|
| 124 |
+
- `react` - UI library
|
| 125 |
+
- `leaflet` - Map library
|
| 126 |
+
- `react-leaflet` - React bindings for Leaflet
|
| 127 |
+
- `@dnd-kit/core` - Drag and drop
|
| 128 |
+
|
| 129 |
+
#### Configure Frontend (Optional)
|
| 130 |
+
|
| 131 |
+
Edit `frontend/.env.local` if backend is not on default port:
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
NEXT_PUBLIC_API_URL=http://localhost:8000
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
---
|
| 138 |
+
|
| 139 |
+
## Running Locally
|
| 140 |
+
|
| 141 |
+
### Start Backend
|
| 142 |
+
|
| 143 |
+
From `backend/` directory with venv activated:
|
| 144 |
+
|
| 145 |
+
```bash
|
| 146 |
+
uvicorn backend.main:app --reload --host 0.0.0.0 --port 8000
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
**Flags**:
|
| 150 |
+
- `--reload`: Auto-restart on code changes
|
| 151 |
+
- `--host 0.0.0.0`: Allow external connections
|
| 152 |
+
- `--port 8000`: Port number
|
| 153 |
+
|
| 154 |
+
**Expected Output**:
|
| 155 |
+
```
|
| 156 |
+
INFO: Uvicorn running on http://0.0.0.0:8000
|
| 157 |
+
INFO: Application startup complete.
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
**Verify**:
|
| 161 |
+
- Open http://localhost:8000/docs → Should show FastAPI Swagger UI
|
| 162 |
+
- Check http://localhost:8000/api/catalog → Should return GeoJSON catalog
|
| 163 |
+
|
| 164 |
+
### Start Frontend
|
| 165 |
+
|
| 166 |
+
From `frontend/` directory:
|
| 167 |
+
|
| 168 |
+
```bash
|
| 169 |
+
npm run dev
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
**Expected Output**:
|
| 173 |
+
```
|
| 174 |
+
▲ Next.js 15.1.3
|
| 175 |
+
- Local: http://localhost:3000
|
| 176 |
+
- Ready in 2.1s
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
**Verify**:
|
| 180 |
+
- Open http://localhost:3000 → Should show GeoQuery chat interface
|
| 181 |
+
|
| 182 |
+
---
|
| 183 |
+
|
| 184 |
+
## Database Setup
|
| 185 |
+
|
| 186 |
+
### DuckDB Initialization
|
| 187 |
+
|
| 188 |
+
**Automatic**: Database is created in-memory on first query.
|
| 189 |
+
|
| 190 |
+
**Manual Test**:
|
| 191 |
+
|
| 192 |
+
```python
|
| 193 |
+
from backend.core.geo_engine import get_geo_engine
|
| 194 |
+
|
| 195 |
+
engine = get_geo_engine()
|
| 196 |
+
print(f"Loaded tables: {list(engine.loaded_tables.keys())}")
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
### Load Initial Datasets
|
| 200 |
+
|
| 201 |
+
Datasets are loaded lazily (on-demand). To pre-load common datasets:
|
| 202 |
+
|
| 203 |
+
```python
|
| 204 |
+
from backend.core.geo_engine import get_geo_engine
|
| 205 |
+
|
| 206 |
+
engine = get_geo_engine()
|
| 207 |
+
engine.ensure_table_loaded("pan_admin1") # Provinces
|
| 208 |
+
engine.ensure_table_loaded("panama_healthsites_geojson") # Hospitals
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### Generate Embeddings
|
| 212 |
+
|
| 213 |
+
Required for semantic search:
|
| 214 |
+
|
| 215 |
+
```bash
|
| 216 |
+
cd backend
|
| 217 |
+
python -c "from backend.core.semantic_search import get_semantic_search; get_semantic_search()"
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
This generates `backend/data/embeddings.npy` (cached for future use).
|
| 221 |
+
|
| 222 |
+
---
|
| 223 |
+
|
| 224 |
+
## Directory Structure After Setup
|
| 225 |
+
|
| 226 |
+
```
|
| 227 |
+
GeoQuery/
|
| 228 |
+
├── backend/
|
| 229 |
+
│ ├── venv/ # Virtual environment (created)
|
| 230 |
+
│ ├── .env # Environment variables (created)
|
| 231 |
+
│ ├── data/
|
| 232 |
+
│ │ ├── embeddings.npy # Generated embeddings (created)
|
| 233 |
+
│ │ ├── catalog.json # Dataset registry (existing)
|
| 234 |
+
│ │ └── osm/ # GeoJSON datasets (existing)
|
| 235 |
+
│ └── <source files>
|
| 236 |
+
├── frontend/
|
| 237 |
+
│ ├── node_modules/ # npm packages (created)
|
| 238 |
+
│ ├── .next/ # Build output (created)
|
| 239 |
+
│ └── <source files>
|
| 240 |
+
└── <other files>
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
---
|
| 244 |
+
|
| 245 |
+
## Common Issues & Troubleshooting
|
| 246 |
+
|
| 247 |
+
### Backend Issues
|
| 248 |
+
|
| 249 |
+
#### Issue: "ModuleNotFoundError: No module named 'backend'"
|
| 250 |
+
|
| 251 |
+
**Cause**: Virtual environment not activated or package not installed.
|
| 252 |
+
|
| 253 |
+
**Solution**:
|
| 254 |
+
```bash
|
| 255 |
+
source venv/bin/activate # Activate venv
|
| 256 |
+
pip install -e . # Install package
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
#### Issue: "duckdb.IOException: No files found that match the pattern"
|
| 260 |
+
|
| 261 |
+
**Cause**: GeoJSON file missing or incorrect path in catalog.json.
|
| 262 |
+
|
| 263 |
+
**Solution**:
|
| 264 |
+
1. Check file exists: `ls backend/data/osm/hospitals.geojson`
|
| 265 |
+
2. Verify path in `catalog.json`
|
| 266 |
+
3. Download missing data: `python backend/scripts/download_geofabrik.py`
|
| 267 |
+
|
| 268 |
+
#### Issue: "google.api_core.exceptions.PermissionDenied: API key not valid"
|
| 269 |
+
|
| 270 |
+
**Cause**: Invalid or missing GEMINI_API_KEY.
|
| 271 |
+
|
| 272 |
+
**Solution**:
|
| 273 |
+
```bash
|
| 274 |
+
export GEMINI_API_KEY="your-actual-api-key"
|
| 275 |
+
# Restart backend
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
#### Issue: "Module 'sentence_transformers' has no attribute 'SentenceTransformer'"
|
| 279 |
+
|
| 280 |
+
**Cause**: Corrupted installation.
|
| 281 |
+
|
| 282 |
+
**Solution**:
|
| 283 |
+
```bash
|
| 284 |
+
pip uninstall sentence-transformers
|
| 285 |
+
pip install sentence-transformers --no-cache-dir
|
| 286 |
+
```
|
| 287 |
+
|
| 288 |
+
### Frontend Issues
|
| 289 |
+
|
| 290 |
+
#### Issue: "Error: Cannot find module 'next'"
|
| 291 |
+
|
| 292 |
+
**Cause**: npm packages not installed.
|
| 293 |
+
|
| 294 |
+
**Solution**:
|
| 295 |
+
```bash
|
| 296 |
+
cd frontend
|
| 297 |
+
rm -rf node_modules package-lock.json
|
| 298 |
+
npm install
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
#### Issue: "Failed to fetch from localhost:8000"
|
| 302 |
+
|
| 303 |
+
**Cause**: Backend not running or CORS issue.
|
| 304 |
+
|
| 305 |
+
**Solution**:
|
| 306 |
+
1. Verify backend is running: `curl http://localhost:8000/api/catalog`
|
| 307 |
+
2. Check CORS settings in `backend/main.py`
|
| 308 |
+
3. Verify `NEXT_PUBLIC_API_URL` in frontend `.env.local`
|
| 309 |
+
|
| 310 |
+
#### Issue: "Map tiles not loading"
|
| 311 |
+
|
| 312 |
+
**Cause**: Network issue or ad blocker.
|
| 313 |
+
|
| 314 |
+
**Solution**:
|
| 315 |
+
1. Check internet connection
|
| 316 |
+
2. Disable ad blocker for localhost
|
| 317 |
+
3. Alternative tile server in `MapViewer.tsx`:
|
| 318 |
+
```typescript
|
| 319 |
+
url="https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png"
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
### General Issues
|
| 323 |
+
|
| 324 |
+
#### Issue: Port 8000 already in use
|
| 325 |
+
|
| 326 |
+
**Solution**:
|
| 327 |
+
```bash
|
| 328 |
+
# Find process using port
|
| 329 |
+
lsof -ti:8000
|
| 330 |
+
|
| 331 |
+
# Kill process
|
| 332 |
+
kill -9 $(lsof -ti:8000)
|
| 333 |
+
|
| 334 |
+
# Or use different port
|
| 335 |
+
uvicorn backend.main:app --port 8001
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
#### Issue: Out of memory errors
|
| 339 |
+
|
| 340 |
+
**Cause**: Loading too many large datasets.
|
| 341 |
+
|
| 342 |
+
**Solution**:
|
| 343 |
+
1. Reduce dataset size (filter before loading)
|
| 344 |
+
2. Increase system RAM
|
| 345 |
+
3. Use query limits: `LIMIT 10000`
|
| 346 |
+
|
| 347 |
+
---
|
| 348 |
+
|
| 349 |
+
## Development Workflow
|
| 350 |
+
|
| 351 |
+
### Code Changes
|
| 352 |
+
|
| 353 |
+
**Backend**:
|
| 354 |
+
- Python files auto-reload with `--reload` flag
|
| 355 |
+
- Changes in `core/`, `services/`, `api/` take effect immediately
|
| 356 |
+
|
| 357 |
+
**Frontend**:
|
| 358 |
+
- Hot Module Replacement (HMR) enabled
|
| 359 |
+
- Changes in `components/`, `app/` reload automatically
|
| 360 |
+
|
| 361 |
+
### Adding New Datasets
|
| 362 |
+
|
| 363 |
+
1. **Add GeoJSON file** to appropriate directory (e.g., `backend/data/osm/`)
|
| 364 |
+
|
| 365 |
+
2. **Update catalog.json**:
|
| 366 |
+
```json
|
| 367 |
+
"my_new_dataset": {
|
| 368 |
+
"path": "osm/my_new_dataset.geojson",
|
| 369 |
+
"description": "Description for display",
|
| 370 |
+
"semantic_description": "Detailed description for AI",
|
| 371 |
+
"categories": ["infrastructure"],
|
| 372 |
+
"tags": ["roads", "transport"]
|
| 373 |
+
}
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
3. **Regenerate embeddings**:
|
| 377 |
+
```bash
|
| 378 |
+
rm backend/data/embeddings.npy
|
| 379 |
+
python -c "from backend.core.semantic_search import get_semantic_search; get_semantic_search()"
|
| 380 |
+
```
|
| 381 |
+
|
| 382 |
+
4. **Test**: Query for the new dataset
|
| 383 |
+
|
| 384 |
+
See [docs/backend/SCRIPTS.md](docs/backend/SCRIPTS.md) for data ingestion scripts.
|
| 385 |
+
|
| 386 |
+
### Testing API Endpoints
|
| 387 |
+
|
| 388 |
+
**Using curl**:
|
| 389 |
+
```bash
|
| 390 |
+
# Get catalog
|
| 391 |
+
curl http://localhost:8000/api/catalog
|
| 392 |
+
|
| 393 |
+
# Query chat endpoint
|
| 394 |
+
curl -X POST http://localhost:8000/api/chat \
|
| 395 |
+
-H "Content-Type: application/json" \
|
| 396 |
+
-d '{"message": "Show me provinces", "history": []}'
|
| 397 |
+
```
|
| 398 |
+
|
| 399 |
+
**Using Swagger UI**:
|
| 400 |
+
- Open http://localhost:8000/docs
|
| 401 |
+
- Try endpoints interactively
|
| 402 |
+
|
| 403 |
+
---
|
| 404 |
+
|
| 405 |
+
## Environment Variables Reference
|
| 406 |
+
|
| 407 |
+
| Variable | Required | Default | Description |
|
| 408 |
+
|----------|----------|---------|-------------|
|
| 409 |
+
| `GEMINI_API_KEY` | ✅ Yes | - | Google AI API key |
|
| 410 |
+
| `PORT` | ❌ No | 8000 | Backend server port |
|
| 411 |
+
| `HOST` | ❌ No | 0.0.0.0 | Backend host |
|
| 412 |
+
| `LOG_LEVEL` | ❌ No | INFO | Logging level (DEBUG, INFO, WARNING, ERROR) |
|
| 413 |
+
| `DATABASE_PATH` | ❌ No | :memory: | DuckDB database path (use for persistence) |
|
| 414 |
+
|
| 415 |
+
---
|
| 416 |
+
|
| 417 |
+
## IDE Setup
|
| 418 |
+
|
| 419 |
+
### VS Code
|
| 420 |
+
|
| 421 |
+
**Recommended Extensions**:
|
| 422 |
+
- Python (`ms-python.python`)
|
| 423 |
+
- Pylance (`ms-python.vscode-pylance`)
|
| 424 |
+
- ESLint (`dbaeumer.vscode-eslint`)
|
| 425 |
+
- Prettier (`esbenp.prettier-vscode`)
|
| 426 |
+
|
| 427 |
+
**Settings** (`.vscode/settings.json`):
|
| 428 |
+
```json
|
| 429 |
+
{
|
| 430 |
+
"python.defaultInterpreterPath": "./backend/venv/bin/python",
|
| 431 |
+
"python.linting.enabled": true,
|
| 432 |
+
"python.formatting.provider": "black",
|
| 433 |
+
"editor.formatOnSave": true,
|
| 434 |
+
"[typescript]": {
|
| 435 |
+
"editor.defaultFormatter": "esbenp.prettier-vscode"
|
| 436 |
+
}
|
| 437 |
+
}
|
| 438 |
+
```
|
| 439 |
+
|
| 440 |
+
### PyCharm
|
| 441 |
+
|
| 442 |
+
1. **Set Python Interpreter**: Settings → Project → Python Interpreter → Add → Existing Environment → `backend/venv/bin/python`
|
| 443 |
+
2. **Enable FastAPI**: Settings → Languages & Frameworks → FastAPI
|
| 444 |
+
3. **Configure Run**: Run → Edit Configurations → Add → Python → Script path: `backend/main.py`
|
| 445 |
+
|
| 446 |
+
---
|
| 447 |
+
|
| 448 |
+
## Next Steps
|
| 449 |
+
|
| 450 |
+
- ✅ **Verify installation** by running a test query
|
| 451 |
+
- 📖 **Read [ARCHITECTURE.md](../ARCHITECTURE.md)** to understand the system
|
| 452 |
+
- 🔧 **Explore [docs/backend/CORE_SERVICES.md](docs/backend/CORE_SERVICES.md)** for component details
|
| 453 |
+
- 📊 **Review [docs/data/DATASET_SOURCES.md](docs/data/DATASET_SOURCES.md)** for available data
|
| 454 |
+
|
| 455 |
+
|
backend/__init__.py
ADDED
|
File without changes
|
backend/api/api.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
from backend.api.endpoints import chat, schema, catalog
|
| 3 |
+
|
| 4 |
+
api_router = APIRouter()
|
| 5 |
+
api_router.include_router(chat.router, prefix="/chat", tags=["chat"])
|
| 6 |
+
api_router.include_router(schema.router, prefix="/schema", tags=["schema"])
|
| 7 |
+
api_router.include_router(catalog.router, prefix="/catalog", tags=["catalog"])
|
| 8 |
+
|
backend/api/endpoints/catalog.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Catalog Management Endpoints
|
| 3 |
+
|
| 4 |
+
Provides API for viewing and enriching the data catalog.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from typing import List, Optional, Dict, Any
|
| 10 |
+
|
| 11 |
+
router = APIRouter()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class CatalogStatsResponse(BaseModel):
    """Aggregate statistics about the data catalog."""

    total_datasets: int  # number of datasets registered in the catalog
    enriched_datasets: int  # subset that carry LLM-generated metadata
    by_category: Dict[str, int]  # dataset count per category
    by_tag: Dict[str, int]  # dataset count per tag
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class TableMetadataResponse(BaseModel):
    """Full metadata record for a single catalog table."""

    name: str  # table identifier (catalog key)
    path: str  # file path of the underlying dataset
    description: str  # static, human-authored description
    semantic_description: Optional[str]  # LLM-generated description, if enriched
    tags: List[str]  # search/browse tags
    data_type: str  # e.g. "static"
    columns: List[str]  # column names (no types in the catalog)
    row_count: Optional[int]  # rows in the dataset, when known
    category: str  # coarse grouping (health, admin, ...)
    last_indexed: Optional[str]  # timestamp of last indexing pass
    last_enriched: Optional[str]  # timestamp of last LLM enrichment
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class EnrichmentRequest(BaseModel):
    """Request body for the bulk /enrich endpoint."""

    table_names: Optional[List[str]] = None  # None = all tables
    force_refresh: bool = False  # regenerate even if already enriched
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class EnrichmentResponse(BaseModel):
    """Acknowledgement returned when an enrichment run is queued."""

    status: str  # "queued"
    message: str  # human-readable summary
    tables_queued: int  # how many tables were scheduled
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@router.get("/stats", response_model=CatalogStatsResponse)
|
| 47 |
+
async def get_catalog_stats():
|
| 48 |
+
"""Get statistics about the data catalog."""
|
| 49 |
+
from backend.core.data_catalog import get_data_catalog
|
| 50 |
+
|
| 51 |
+
catalog = get_data_catalog()
|
| 52 |
+
stats = catalog.get_stats()
|
| 53 |
+
|
| 54 |
+
return CatalogStatsResponse(
|
| 55 |
+
total_datasets=stats["total_datasets"],
|
| 56 |
+
enriched_datasets=stats.get("enriched_datasets", 0),
|
| 57 |
+
by_category=stats["by_category"],
|
| 58 |
+
by_tag=stats["by_tag"]
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@router.get("/tables", response_model=List[TableMetadataResponse])
|
| 63 |
+
async def list_catalog_tables():
|
| 64 |
+
"""List all tables in the catalog with their metadata."""
|
| 65 |
+
from backend.core.data_catalog import get_data_catalog
|
| 66 |
+
|
| 67 |
+
catalog = get_data_catalog()
|
| 68 |
+
tables = []
|
| 69 |
+
|
| 70 |
+
for name, meta in catalog.catalog.items():
|
| 71 |
+
tables.append(TableMetadataResponse(
|
| 72 |
+
name=name,
|
| 73 |
+
path=meta.get("path", ""),
|
| 74 |
+
description=meta.get("description", ""),
|
| 75 |
+
semantic_description=meta.get("semantic_description"),
|
| 76 |
+
tags=meta.get("tags", []),
|
| 77 |
+
data_type=meta.get("data_type", "static"),
|
| 78 |
+
columns=meta.get("columns", []),
|
| 79 |
+
row_count=meta.get("row_count"),
|
| 80 |
+
category=meta.get("category", "unknown"),
|
| 81 |
+
last_indexed=meta.get("last_indexed"),
|
| 82 |
+
last_enriched=meta.get("last_enriched")
|
| 83 |
+
))
|
| 84 |
+
|
| 85 |
+
return tables
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@router.get("/tables/{table_name}", response_model=TableMetadataResponse)
|
| 89 |
+
async def get_table_metadata(table_name: str):
|
| 90 |
+
"""Get metadata for a specific table."""
|
| 91 |
+
from backend.core.data_catalog import get_data_catalog
|
| 92 |
+
|
| 93 |
+
catalog = get_data_catalog()
|
| 94 |
+
meta = catalog.get_table_metadata(table_name)
|
| 95 |
+
|
| 96 |
+
if not meta:
|
| 97 |
+
raise HTTPException(status_code=404, detail=f"Table '{table_name}' not found")
|
| 98 |
+
|
| 99 |
+
return TableMetadataResponse(
|
| 100 |
+
name=table_name,
|
| 101 |
+
path=meta.get("path", ""),
|
| 102 |
+
description=meta.get("description", ""),
|
| 103 |
+
semantic_description=meta.get("semantic_description"),
|
| 104 |
+
tags=meta.get("tags", []),
|
| 105 |
+
data_type=meta.get("data_type", "static"),
|
| 106 |
+
columns=meta.get("columns", []),
|
| 107 |
+
row_count=meta.get("row_count"),
|
| 108 |
+
category=meta.get("category", "unknown"),
|
| 109 |
+
last_indexed=meta.get("last_indexed"),
|
| 110 |
+
last_enriched=meta.get("last_enriched")
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@router.post("/enrich", response_model=EnrichmentResponse)
|
| 115 |
+
async def enrich_catalog(request: EnrichmentRequest, background_tasks: BackgroundTasks):
|
| 116 |
+
"""
|
| 117 |
+
Trigger LLM enrichment for catalog tables.
|
| 118 |
+
|
| 119 |
+
Enrichment generates semantic descriptions and refined tags.
|
| 120 |
+
Runs in the background to avoid blocking.
|
| 121 |
+
"""
|
| 122 |
+
from backend.core.data_catalog import get_data_catalog
|
| 123 |
+
|
| 124 |
+
catalog = get_data_catalog()
|
| 125 |
+
|
| 126 |
+
if request.table_names:
|
| 127 |
+
# Validate table names
|
| 128 |
+
invalid = [t for t in request.table_names if t not in catalog.catalog]
|
| 129 |
+
if invalid:
|
| 130 |
+
raise HTTPException(
|
| 131 |
+
status_code=400,
|
| 132 |
+
detail=f"Unknown tables: {invalid}"
|
| 133 |
+
)
|
| 134 |
+
tables_to_enrich = request.table_names
|
| 135 |
+
else:
|
| 136 |
+
tables_to_enrich = list(catalog.catalog.keys())
|
| 137 |
+
|
| 138 |
+
# Queue enrichment in background
|
| 139 |
+
async def run_enrichment():
|
| 140 |
+
for table_name in tables_to_enrich:
|
| 141 |
+
await catalog.enrich_table(table_name, request.force_refresh)
|
| 142 |
+
|
| 143 |
+
background_tasks.add_task(run_enrichment)
|
| 144 |
+
|
| 145 |
+
return EnrichmentResponse(
|
| 146 |
+
status="queued",
|
| 147 |
+
message=f"Enrichment started for {len(tables_to_enrich)} tables",
|
| 148 |
+
tables_queued=len(tables_to_enrich)
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
@router.post("/enrich/{table_name}")
|
| 153 |
+
async def enrich_single_table(table_name: str, force: bool = False):
|
| 154 |
+
"""
|
| 155 |
+
Immediately enrich a single table (synchronous).
|
| 156 |
+
|
| 157 |
+
Use for testing or when you need the result right away.
|
| 158 |
+
"""
|
| 159 |
+
from backend.core.data_catalog import get_data_catalog
|
| 160 |
+
|
| 161 |
+
catalog = get_data_catalog()
|
| 162 |
+
|
| 163 |
+
if table_name not in catalog.catalog:
|
| 164 |
+
raise HTTPException(status_code=404, detail=f"Table '{table_name}' not found")
|
| 165 |
+
|
| 166 |
+
success = await catalog.enrich_table(table_name, force)
|
| 167 |
+
|
| 168 |
+
if success:
|
| 169 |
+
meta = catalog.get_table_metadata(table_name)
|
| 170 |
+
return {
|
| 171 |
+
"status": "success",
|
| 172 |
+
"table": table_name,
|
| 173 |
+
"semantic_description": meta.get("semantic_description"),
|
| 174 |
+
"tags": meta.get("tags", [])
|
| 175 |
+
}
|
| 176 |
+
else:
|
| 177 |
+
raise HTTPException(status_code=500, detail=f"Failed to enrich table '{table_name}'")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
@router.get("/search")
|
| 181 |
+
async def search_tables(query: str, top_k: int = 10):
|
| 182 |
+
"""
|
| 183 |
+
Search for tables using semantic search.
|
| 184 |
+
|
| 185 |
+
Returns the most relevant tables for a natural language query.
|
| 186 |
+
"""
|
| 187 |
+
from backend.core.semantic_search import get_semantic_search
|
| 188 |
+
from backend.core.data_catalog import get_data_catalog
|
| 189 |
+
|
| 190 |
+
semantic = get_semantic_search()
|
| 191 |
+
catalog = get_data_catalog()
|
| 192 |
+
|
| 193 |
+
results = semantic.search(query, top_k=top_k)
|
| 194 |
+
|
| 195 |
+
response = []
|
| 196 |
+
for table_name, score in results:
|
| 197 |
+
meta = catalog.get_table_metadata(table_name)
|
| 198 |
+
if meta:
|
| 199 |
+
response.append({
|
| 200 |
+
"table": table_name,
|
| 201 |
+
"score": round(score, 4),
|
| 202 |
+
"description": meta.get("semantic_description") or meta.get("description"),
|
| 203 |
+
"tags": meta.get("tags", [])
|
| 204 |
+
})
|
| 205 |
+
|
| 206 |
+
return {"query": query, "results": response}
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@router.post("/rebuild-embeddings")
|
| 210 |
+
async def rebuild_embeddings():
|
| 211 |
+
"""
|
| 212 |
+
Rebuild all semantic search embeddings from current catalog.
|
| 213 |
+
|
| 214 |
+
Use after bulk enrichment or catalog updates.
|
| 215 |
+
"""
|
| 216 |
+
from backend.core.semantic_search import get_semantic_search
|
| 217 |
+
from backend.core.data_catalog import get_data_catalog
|
| 218 |
+
|
| 219 |
+
semantic = get_semantic_search()
|
| 220 |
+
catalog = get_data_catalog()
|
| 221 |
+
|
| 222 |
+
# Force re-embed all tables
|
| 223 |
+
count = 0
|
| 224 |
+
for table_name, metadata in catalog.catalog.items():
|
| 225 |
+
if semantic.embed_table(table_name, metadata):
|
| 226 |
+
count += 1
|
| 227 |
+
|
| 228 |
+
semantic._save_embeddings()
|
| 229 |
+
|
| 230 |
+
return {
|
| 231 |
+
"status": "success",
|
| 232 |
+
"message": f"Rebuilt embeddings for {count} tables",
|
| 233 |
+
"total_embeddings": len(semantic.embeddings)
|
| 234 |
+
}
|
| 235 |
+
|
backend/api/endpoints/chat.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from backend.services.executor import QueryExecutor
|
| 5 |
+
|
| 6 |
+
router = APIRouter()
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MessageHistory(BaseModel):
    """One prior turn of the conversation."""

    role: str  # "user" or "assistant"
    content: str  # message text
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ChatRequest(BaseModel):
    """Incoming chat request: the new message plus prior turns."""

    message: str  # current user message
    history: list[MessageHistory] = []  # earlier turns, oldest first
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ChartData(BaseModel):
    """Chart payload attached to statistical query responses."""

    type: str  # 'bar', 'line', 'pie', 'donut'
    title: Optional[str] = None  # chart title, if any
    data: list[dict] = []  # row dicts to plot
    xKey: Optional[str] = None  # key in `data` used for the x axis
    yKey: Optional[str] = None  # key in `data` used for the y axis
    lines: Optional[list[dict]] = None  # multi-series config for line charts
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class ChatResponse(BaseModel):
    """Outgoing chat response with optional SQL, geo and chart artifacts."""

    response: str  # natural-language answer
    sql_query: Optional[str] = None  # SQL the executor ran, if any
    geojson: Optional[dict] = None  # map layer for geographic answers
    data_citations: list[str] = []  # datasets cited by the answer
    intent: Optional[str] = None  # detected query intent
    chart_data: Optional[ChartData] = None  # NEW: For STAT_QUERY responses
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@router.post("/", response_model=ChatResponse)
|
| 38 |
+
async def chat(request: ChatRequest):
|
| 39 |
+
"""
|
| 40 |
+
Main chat endpoint that handles conversation with context.
|
| 41 |
+
Routes to appropriate handler based on detected intent.
|
| 42 |
+
"""
|
| 43 |
+
executor = QueryExecutor()
|
| 44 |
+
|
| 45 |
+
# Convert history to dict format for the executor
|
| 46 |
+
history = [{"role": h.role, "content": h.content} for h in request.history]
|
| 47 |
+
|
| 48 |
+
# Process the query with full context
|
| 49 |
+
result = await executor.process_query_with_context(
|
| 50 |
+
query=request.message,
|
| 51 |
+
history=history
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
return ChatResponse(
|
| 55 |
+
response=result.get("response", "I processed your request."),
|
| 56 |
+
sql_query=result.get("sql_query"),
|
| 57 |
+
geojson=result.get("geojson"),
|
| 58 |
+
data_citations=result.get("data_citations", []),
|
| 59 |
+
intent=result.get("intent"),
|
| 60 |
+
chart_data=result.get("chart_data"),
|
| 61 |
+
raw_data=result.get("raw_data")
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
from sse_starlette.sse import EventSourceResponse
|
| 66 |
+
import json
|
| 67 |
+
import asyncio
|
| 68 |
+
|
| 69 |
+
@router.post("/stream")
|
| 70 |
+
async def chat_stream(request: ChatRequest):
|
| 71 |
+
"""
|
| 72 |
+
Streaming chat endpoint that returns Server-Sent Events (SSE).
|
| 73 |
+
"""
|
| 74 |
+
executor = QueryExecutor()
|
| 75 |
+
history = [{"role": h.role, "content": h.content} for h in request.history]
|
| 76 |
+
|
| 77 |
+
async def event_generator():
|
| 78 |
+
try:
|
| 79 |
+
# Delegate entirely to the executor's streaming process
|
| 80 |
+
async for event in executor.process_query_stream(request.message, history):
|
| 81 |
+
yield event
|
| 82 |
+
|
| 83 |
+
except Exception as e:
|
| 84 |
+
print(f"Stream error: {e}")
|
| 85 |
+
yield {
|
| 86 |
+
"event": "chunk",
|
| 87 |
+
"data": json.dumps({"type": "text", "content": f"\n\nError: {str(e)}"})
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
return EventSourceResponse(event_generator())
|
backend/api/endpoints/schema.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Schema endpoint - Provides data catalog information to users.
|
| 3 |
+
Shows available tables, columns, and data descriptions.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
from typing import Optional, List, Any
|
| 9 |
+
from backend.core.data_catalog import get_data_catalog
|
| 10 |
+
|
| 11 |
+
router = APIRouter()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ColumnInfo(BaseModel):
    """One column of a table, with a heuristically guessed type."""

    name: str  # column name
    type: str  # guessed type label ("text", "integer", "geometry")
    description: Optional[str] = None  # not populated by the catalog yet
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class TableInfo(BaseModel):
    """Schema summary for one catalog table."""

    name: str  # table identifier
    description: str  # semantic description when enriched, else static one
    row_count: int  # 0 when unknown
    columns: List[ColumnInfo]  # column summaries
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class SchemaResponse(BaseModel):
    """Top-level schema payload describing all available tables."""

    tables: List[TableInfo]  # one entry per catalog table
    last_updated: str  # "Dynamic" — catalog is rebuilt at runtime
    data_source: str  # human-readable provenance string
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@router.get("/", response_model=SchemaResponse)
|
| 34 |
+
async def get_schema():
|
| 35 |
+
"""
|
| 36 |
+
Returns the dynamic data catalog with all available tables and their schemas.
|
| 37 |
+
"""
|
| 38 |
+
catalog = get_data_catalog()
|
| 39 |
+
tables = []
|
| 40 |
+
|
| 41 |
+
for table_name, meta in catalog.catalog.items():
|
| 42 |
+
# Map catalog columns to Schema columns
|
| 43 |
+
# Catalog columns are just a list of strings usually
|
| 44 |
+
cols = []
|
| 45 |
+
raw_cols = meta.get("columns", [])
|
| 46 |
+
|
| 47 |
+
# Helper to guess type
|
| 48 |
+
def guess_type(col_name):
|
| 49 |
+
if col_name == "geom": return "geometry"
|
| 50 |
+
if "id" in col_name: return "integer"
|
| 51 |
+
if "name" in col_name: return "text"
|
| 52 |
+
return "text" # Default
|
| 53 |
+
|
| 54 |
+
for col in raw_cols:
|
| 55 |
+
cols.append(ColumnInfo(
|
| 56 |
+
name=col,
|
| 57 |
+
type=guess_type(col),
|
| 58 |
+
description=None
|
| 59 |
+
))
|
| 60 |
+
|
| 61 |
+
tables.append(TableInfo(
|
| 62 |
+
name=table_name,
|
| 63 |
+
description=meta.get("semantic_description") or meta.get("description", ""),
|
| 64 |
+
row_count=meta.get("row_count") or 0,
|
| 65 |
+
columns=cols
|
| 66 |
+
))
|
| 67 |
+
|
| 68 |
+
return SchemaResponse(
|
| 69 |
+
tables=tables,
|
| 70 |
+
last_updated="Dynamic",
|
| 71 |
+
data_source="GeoQuery Data Catalog (OSM, Overture, HDX, INEC)"
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@router.get("/tables")
|
| 76 |
+
async def list_tables():
|
| 77 |
+
"""
|
| 78 |
+
Returns a simple list of available table names.
|
| 79 |
+
"""
|
| 80 |
+
catalog = get_data_catalog()
|
| 81 |
+
return {
|
| 82 |
+
"tables": list(catalog.catalog.keys()),
|
| 83 |
+
"count": len(catalog.catalog)
|
| 84 |
+
}
|
backend/core/catalog_enricher.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Catalog Enricher Service
|
| 3 |
+
|
| 4 |
+
Automatically generates rich metadata for datasets using LLM.
|
| 5 |
+
Enhances table descriptions and tags for better semantic search.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from typing import Dict, List, Any, Optional
|
| 10 |
+
from backend.core.llm_gateway import LLMGateway
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Prompt for generating semantic descriptions
|
| 16 |
+
DESCRIPTION_PROMPT = """Generate a concise 2-3 sentence description for this geographic dataset.
|
| 17 |
+
|
| 18 |
+
Table Name: {table_name}
|
| 19 |
+
Category: {category}
|
| 20 |
+
Columns: {columns}
|
| 21 |
+
Sample Column Values: {sample_values}
|
| 22 |
+
Row Count: {row_count}
|
| 23 |
+
|
| 24 |
+
Focus on:
|
| 25 |
+
1. What geographic entities it contains (districts, health facilities, roads, etc.)
|
| 26 |
+
2. The geographic scope (Panama, specific province, etc.)
|
| 27 |
+
3. Common use cases (administrative analysis, health coverage, etc.)
|
| 28 |
+
|
| 29 |
+
Return ONLY the description, no formatting or labels."""
|
| 30 |
+
|
| 31 |
+
# Prompt for generating/refining tags
|
| 32 |
+
TAG_PROMPT = """Suggest 5-8 relevant tags for this geographic dataset.
|
| 33 |
+
|
| 34 |
+
Table Name: {table_name}
|
| 35 |
+
Description: {description}
|
| 36 |
+
Columns: {columns}
|
| 37 |
+
Current Tags: {current_tags}
|
| 38 |
+
|
| 39 |
+
Rules:
|
| 40 |
+
1. Tags should be lowercase, single words or hyphenated
|
| 41 |
+
2. Include domain tags (health, education, infrastructure)
|
| 42 |
+
3. Include geographic tags (administrative, boundaries, points)
|
| 43 |
+
4. Include data type tags (census, osm, government)
|
| 44 |
+
|
| 45 |
+
Return ONLY a JSON array of strings, e.g. ["health", "facilities", "infrastructure"]"""
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class CatalogEnricher:
    """
    Enriches catalog metadata with LLM-generated descriptions and tags.

    Can be run on-demand for new datasets or batch-run for existing ones.
    Implemented as a singleton so all callers share one LLMGateway.
    """

    _instance = None  # singleton slot

    def __new__(cls):
        # Classic singleton: allocate once and flag the instance so __init__
        # can tell first-time construction from repeat calls.
        if cls._instance is None:
            cls._instance = super(CatalogEnricher, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every CatalogEnricher() call; skip repeat setup.
        if self.initialized:
            return

        self.llm = LLMGateway()
        self.initialized = True

    async def generate_description(
        self,
        table_name: str,
        metadata: Dict[str, Any],
        sample_values: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Generate a semantic description for a dataset using LLM.

        Args:
            table_name: Name of the table
            metadata: Catalog metadata dict
            sample_values: Optional dict of column -> sample value

        Returns:
            Generated description string; falls back to the existing static
            description if the LLM call fails.
        """
        columns = metadata.get("columns", [])
        category = metadata.get("category", "unknown")
        row_count = metadata.get("row_count", "unknown")

        # Format at most five sample values for prompt context.
        sample_str = "Not available"
        if sample_values:
            sample_str = ", ".join(f"{k}: {v}" for k, v in list(sample_values.items())[:5])

        prompt = DESCRIPTION_PROMPT.format(
            table_name=table_name,
            category=category,
            columns=", ".join(columns[:15]),  # Limit columns to keep prompt short
            sample_values=sample_str,
            row_count=row_count
        )

        try:
            response = await self.llm.generate_response(prompt)
            description = response.strip()

            # Basic validation: warn (but still return) on suspicious lengths.
            if len(description) < 20 or len(description) > 500:
                logger.warning(f"Generated description for {table_name} seems unusual: {len(description)} chars")

            return description

        except Exception as e:
            logger.error(f"Failed to generate description for {table_name}: {e}")
            return metadata.get("description", f"Geographic data from {category}")

    async def generate_tags(
        self,
        table_name: str,
        metadata: Dict[str, Any]
    ) -> List[str]:
        """
        Generate or refine tags for a dataset using LLM.

        Args:
            table_name: Name of the table
            metadata: Catalog metadata dict

        Returns:
            List of cleaned tag strings; returns the existing tags unchanged
            when the LLM call or JSON parsing fails.
        """
        columns = metadata.get("columns", [])
        description = metadata.get("semantic_description") or metadata.get("description", "")
        current_tags = metadata.get("tags", [])

        prompt = TAG_PROMPT.format(
            table_name=table_name,
            description=description,
            columns=", ".join(columns[:15]),
            current_tags=current_tags
        )

        try:
            import json
            response = await self.llm.generate_response(prompt)

            # Parse JSON array; strip a leading markdown code fence if the
            # model wrapped its answer in ```json ... ```.
            response = response.strip()
            if response.startswith("```"):
                response = response.split("```")[1]
                if response.startswith("json"):
                    response = response[4:]

            tags = json.loads(response)

            if isinstance(tags, list):
                # Validate and clean tags: strings only, lowercased, trimmed,
                # 2-30 characters.
                clean_tags = []
                for tag in tags:
                    if isinstance(tag, str):
                        tag = tag.lower().strip()
                        if 2 <= len(tag) <= 30:
                            clean_tags.append(tag)

                return clean_tags

        except Exception as e:
            logger.error(f"Failed to generate tags for {table_name}: {e}")

        # Fallback for failures and non-list JSON payloads.
        return current_tags

    async def enrich_table(
        self,
        table_name: str,
        metadata: Dict[str, Any],
        sample_values: Optional[Dict[str, str]] = None,
        force_refresh: bool = False
    ) -> Dict[str, Any]:
        """
        Fully enrich a table's metadata with description and tags.

        Args:
            table_name: Name of the table
            metadata: Current catalog metadata
            sample_values: Optional sample data for context
            force_refresh: If True, regenerate even if already enriched

        Returns:
            Updated metadata dict (a copy; the input dict is not mutated)
        """
        updated = metadata.copy()

        # Generate description if missing or forced.
        if force_refresh or not metadata.get("semantic_description"):
            logger.info(f"Generating semantic description for {table_name}...")
            description = await self.generate_description(table_name, metadata, sample_values)
            updated["semantic_description"] = description

        # Generate/refine tags when forced or when the table has few tags.
        if force_refresh or len(metadata.get("tags", [])) < 3:
            logger.info(f"Generating tags for {table_name}...")
            tags = await self.generate_tags(table_name, updated)
            # Merge with existing, deduplicate (set union; order not preserved).
            existing_tags = set(metadata.get("tags", []))
            new_tags = set(tags)
            updated["tags"] = list(existing_tags | new_tags)

        return updated
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# Singleton accessor
|
| 213 |
+
_catalog_enricher: Optional[CatalogEnricher] = None
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def get_catalog_enricher() -> CatalogEnricher:
    """Return the process-wide catalog enricher, creating it lazily."""
    global _catalog_enricher
    if _catalog_enricher is None:
        # CatalogEnricher.__new__ is already a singleton; this module-level
        # cache merely avoids repeating the constructor call.
        _catalog_enricher = CatalogEnricher()
    return _catalog_enricher
|
backend/core/data_catalog.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Catalog Service
|
| 3 |
+
|
| 4 |
+
Manages metadata for all datasets available in the platform.
|
| 5 |
+
Supports semantic search integration for scalable discovery.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import duckdb
|
| 10 |
+
import logging
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import List, Dict, Any, Optional
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Tag inference rules for auto-tagging datasets
|
| 19 |
+
TAG_RULES = {
|
| 20 |
+
# Keywords in table name -> tags
|
| 21 |
+
"health": ["health", "facilities", "infrastructure"],
|
| 22 |
+
"hospital": ["health", "facilities", "medical"],
|
| 23 |
+
"clinic": ["health", "facilities", "medical"],
|
| 24 |
+
"school": ["education", "facilities", "infrastructure"],
|
| 25 |
+
"university": ["education", "facilities", "higher-education"],
|
| 26 |
+
"education": ["education", "facilities"],
|
| 27 |
+
"road": ["transportation", "infrastructure", "roads"],
|
| 28 |
+
"street": ["transportation", "infrastructure", "roads"],
|
| 29 |
+
"highway": ["transportation", "infrastructure", "roads"],
|
| 30 |
+
"airport": ["transportation", "infrastructure", "aviation"],
|
| 31 |
+
"port": ["transportation", "infrastructure", "maritime"],
|
| 32 |
+
"population": ["demographics", "census", "population"],
|
| 33 |
+
"census": ["demographics", "census", "statistics"],
|
| 34 |
+
"admin": ["administrative", "boundaries", "government"],
|
| 35 |
+
"district": ["administrative", "boundaries"],
|
| 36 |
+
"province": ["administrative", "boundaries"],
|
| 37 |
+
"corregimiento": ["administrative", "boundaries"],
|
| 38 |
+
"park": ["recreation", "green-space", "amenities"],
|
| 39 |
+
"water": ["hydrology", "natural-resources"],
|
| 40 |
+
"river": ["hydrology", "water"],
|
| 41 |
+
"forest": ["environment", "natural-resources", "land-cover"],
|
| 42 |
+
"building": ["infrastructure", "built-environment"],
|
| 43 |
+
"poi": ["points-of-interest", "amenities"],
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class DataCatalog:
    """
    Singleton service managing dataset metadata.

    Features:
    - Auto-discovery of GeoJSON files in data directories
    - Schema inference from first record
    - Auto-tagging based on naming conventions
    - Integration with semantic search for scalable discovery
    """

    _instance = None

    # Data root lives beside this package; catalog.json persists the index.
    DATA_DIR = Path(__file__).parent.parent / "data"
    CATALOG_FILE = DATA_DIR / "catalog.json"

    def __new__(cls):
        # Classic singleton: the first construction creates the instance and
        # flags it un-initialized so __init__ does its work exactly once.
        if cls._instance is None:
            cls._instance = super(DataCatalog, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return

        self.catalog: Dict[str, Any] = {}
        self.load_catalog()
        self.scan_and_update()
        self._init_semantic_search()
        self.initialized = True

    def load_catalog(self):
        """Load catalog from JSON file; fall back to an empty catalog on error."""
        if self.CATALOG_FILE.exists():
            try:
                with open(self.CATALOG_FILE, 'r') as f:
                    self.catalog = json.load(f)
            except Exception as e:
                logger.error(f"Failed to load catalog: {e}")
                self.catalog = {}
        else:
            self.catalog = {}

    def save_catalog(self):
        """Persist the in-memory catalog to the JSON file (best-effort)."""
        try:
            with open(self.CATALOG_FILE, 'w') as f:
                json.dump(self.catalog, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to save catalog: {e}")

    def _infer_tags(self, table_name: str, columns: List[str]) -> List[str]:
        """Auto-generate tags based on table name and columns."""
        tags = set()
        name_lower = table_name.lower()

        # Check table name against keyword rules
        for keyword, keyword_tags in TAG_RULES.items():
            if keyword in name_lower:
                tags.update(keyword_tags)

        # Check columns for additional hints
        columns_lower = [c.lower() for c in columns]
        if any('pop' in c for c in columns_lower):
            tags.add("population")
        if any('area' in c for c in columns_lower):
            tags.add("geographic")
        if 'geom' in columns_lower or 'geometry' in columns_lower:
            tags.add("spatial")

        return list(tags)

    def _infer_data_type(self, category: str, table_name: str) -> str:
        """Infer data type (static, semi-static, realtime) from category/name."""
        # Base admin data is static
        if category == "base":
            return "static"

        # OSM data is semi-static (updated periodically)
        if category == "osm":
            return "semi-static"

        # HDX humanitarian data - varies
        if category == "hdx":
            return "semi-static"

        # Census data is static
        if "census" in table_name.lower():
            return "static"

        return "static"

    def scan_and_update(self):
        """
        Scan data directories and update catalog with new files.

        Uses a throwaway in-memory DuckDB connection (with the spatial
        extension) to infer schemas; the connection is always closed, even
        when indexing raises.
        """
        logger.info("Scanning data directories...")

        # Define directories to scan
        subdirs = ['base', 'osm', 'inec', 'hdx', 'custom', 'overture', 'ms_buildings']

        # Temporary connection for schema inference
        con = duckdb.connect(':memory:')
        updated = False
        try:
            con.install_extension('spatial')
            con.load_extension('spatial')

            for subdir in subdirs:
                dir_path = self.DATA_DIR / subdir
                if not dir_path.exists():
                    continue

                # Scan for both .geojson and .geojson.gz
                for file_path in list(dir_path.glob('**/*.geojson')) + list(dir_path.glob('**/*.geojson.gz')):
                    table_name = file_path.name.replace('.geojson.gz', '').replace('.geojson', '').lower().replace('-', '_').replace(' ', '_')

                    # Check if file path changed (file moved/renamed)
                    existing = self.catalog.get(table_name)
                    rel_path = str(file_path.relative_to(self.DATA_DIR))

                    if existing and existing.get('path') == rel_path:
                        # Already indexed with same path, skip unless missing new fields
                        if 'tags' in existing and 'data_type' in existing:
                            continue

                    try:
                        logger.info(f"Indexing {table_name}...")

                        # Read first row to get columns
                        query = f"SELECT * FROM ST_Read('{file_path}') LIMIT 1"
                        df = con.execute(query).fetchdf()
                        columns = list(df.columns)

                        # Count rows (for metadata)
                        row_count_query = f"SELECT COUNT(*) FROM ST_Read('{file_path}')"
                        row_count = con.execute(row_count_query).fetchone()[0]

                        # Auto-generate tags
                        tags = self._infer_tags(table_name, columns)

                        # Infer data type
                        data_type = self._infer_data_type(subdir, table_name)

                        # Build catalog entry
                        self.catalog[table_name] = {
                            "path": rel_path,
                            "description": f"Data from {subdir}/{file_path.name}",
                            "semantic_description": None,  # LLM-generated on demand
                            "tags": tags,
                            "data_type": data_type,
                            "update_frequency": None,
                            "columns": columns,
                            "row_count": row_count,
                            "category": subdir,
                            "format": "geojson",
                            "last_indexed": datetime.now().isoformat()
                        }
                        updated = True

                    except Exception as e:
                        logger.warning(f"Failed to index {file_path}: {e}")
        finally:
            # Previously the connection leaked if anything above raised
            # (e.g. extension install failure); always release it.
            con.close()

        if updated:
            self.save_catalog()
            logger.info("Catalog updated.")

    def _init_semantic_search(self):
        """Initialize semantic search with current catalog (best-effort)."""
        try:
            from backend.core.semantic_search import get_semantic_search
            semantic = get_semantic_search()

            # Embed all tables
            new_embeddings = semantic.embed_all_tables(self.catalog)
            if new_embeddings > 0:
                logger.info(f"Created {new_embeddings} new semantic embeddings.")
        except Exception as e:
            logger.warning(f"Semantic search initialization failed: {e}")

    def get_table_metadata(self, table_name: str) -> Optional[Dict]:
        """Get metadata for a specific table, or None if unknown."""
        return self.catalog.get(table_name)

    def get_all_table_summaries(self) -> str:
        """
        Returns a concise summary of all tables, grouped by category.

        WARNING: This can be very large with many datasets.
        Prefer using semantic_search.search() for discovery.
        """
        summary = "Available Data Tables:\n"

        # Group by category
        by_category: Dict[str, List] = {}
        for name, meta in self.catalog.items():
            cat = meta.get('category', 'other')
            if cat not in by_category:
                by_category[cat] = []
            by_category[cat].append((name, meta))

        for cat, items in by_category.items():
            summary += f"\n## {cat.upper()}\n"
            for name, meta in items:
                desc = meta.get('semantic_description') or meta.get('description', 'No description')
                tags = meta.get('tags', [])
                tag_str = f" [{', '.join(tags[:3])}]" if tags else ""
                summary += f"- {name}: {desc}{tag_str}\n"

        return summary

    def get_summaries_for_tables(self, table_names: List[str]) -> str:
        """
        Get summaries only for specified tables.

        Used after semantic pre-filtering to build focused LLM context.
        Unknown table names are silently skipped.
        """
        summary = "Relevant Data Tables:\n\n"

        for name in table_names:
            meta = self.catalog.get(name)
            if not meta:
                continue

            desc = meta.get('semantic_description') or meta.get('description', 'No description')
            tags = meta.get('tags', [])
            columns = meta.get('columns', [])[:10]  # Limit columns to keep context small
            row_count = meta.get('row_count', 'unknown')

            summary += f"### {name}\n"
            summary += f"Description: {desc}\n"
            if tags:
                summary += f"Tags: {', '.join(tags)}\n"
            summary += f"Columns: {', '.join(columns)}\n"
            summary += f"Rows: {row_count}\n\n"

        return summary

    def get_specific_table_schemas(self, table_names: List[str]) -> str:
        """Returns detailed schema for specific tables (unknown names skipped)."""
        output = ""
        for name in table_names:
            meta = self.catalog.get(name)
            if not meta:
                continue

            output += f"### {name}\n"
            output += f"Description: {meta.get('description')}\n"
            output += "Columns: " + ", ".join(meta.get('columns', [])) + "\n\n"
        return output

    def get_file_path(self, table_name: str) -> Optional[Path]:
        """Get absolute path for a table's data file, or None if unknown."""
        meta = self.catalog.get(table_name)
        if meta and 'path' in meta:
            return self.DATA_DIR / meta['path']
        return None

    def get_tables_by_tag(self, tag: str) -> List[str]:
        """Get all table names that have a specific tag."""
        return [
            name for name, meta in self.catalog.items()
            if tag in meta.get('tags', [])
        ]

    def get_tables_by_category(self, category: str) -> List[str]:
        """Get all table names in a specific category."""
        return [
            name for name, meta in self.catalog.items()
            if meta.get('category') == category
        ]

    def get_stats(self) -> dict:
        """Return statistics about the catalog (counts by category and tag)."""
        categories = {}
        tags = {}
        enriched_count = 0

        for meta in self.catalog.values():
            cat = meta.get('category', 'other')
            categories[cat] = categories.get(cat, 0) + 1

            if meta.get('semantic_description'):
                enriched_count += 1

            for tag in meta.get('tags', []):
                tags[tag] = tags.get(tag, 0) + 1

        return {
            "total_datasets": len(self.catalog),
            "enriched_datasets": enriched_count,
            "by_category": categories,
            # Top 20 tags by frequency
            "by_tag": dict(sorted(tags.items(), key=lambda x: -x[1])[:20]),
            "catalog_file": str(self.CATALOG_FILE)
        }

    async def enrich_table(self, table_name: str, force_refresh: bool = False) -> bool:
        """
        Enrich a single table with LLM-generated metadata.

        Returns True if enrichment was successful (or already done and not
        forced), False on unknown table or enrichment failure.
        """
        if table_name not in self.catalog:
            logger.warning(f"Table {table_name} not found in catalog")
            return False

        metadata = self.catalog[table_name]

        # Skip if already enriched (unless forced)
        if not force_refresh and metadata.get('semantic_description'):
            logger.info(f"Table {table_name} already enriched, skipping")
            return True

        try:
            from backend.core.catalog_enricher import get_catalog_enricher
            enricher = get_catalog_enricher()

            # Get sample values for context
            sample_values = await self._get_sample_values(table_name)

            # Enrich
            enriched = await enricher.enrich_table(table_name, metadata, sample_values, force_refresh)

            # Update catalog
            enriched['last_enriched'] = datetime.now().isoformat()
            self.catalog[table_name] = enriched
            self.save_catalog()

            # Re-embed with new description
            self._update_embedding(table_name, enriched)

            logger.info(f"Successfully enriched {table_name}")
            return True

        except Exception as e:
            logger.error(f"Failed to enrich {table_name}: {e}")
            return False

    async def enrich_all_tables(self, force_refresh: bool = False) -> Dict[str, bool]:
        """
        Enrich all tables in the catalog.

        Returns dict of table_name -> success status.
        """
        results = {}

        for table_name in self.catalog.keys():
            success = await self.enrich_table(table_name, force_refresh)
            results[table_name] = success

        return results

    async def _get_sample_values(self, table_name: str) -> Optional[Dict[str, str]]:
        """Get sample values from a table for enrichment context (best-effort)."""
        try:
            from backend.core.geo_engine import get_geo_engine
            geo_engine = get_geo_engine()

            # Ensure table is loaded
            geo_engine.ensure_table_loaded(table_name)

            # Get one row
            result = geo_engine.con.execute(f"SELECT * FROM {table_name} LIMIT 1").fetchdf()

            if len(result) > 0:
                sample = {}
                for col in result.columns:
                    if col != 'geom':
                        val = result[col].iloc[0]
                        if val is not None:
                            sample[col] = str(val)[:50]  # Limit value length
                return sample

        except Exception as e:
            logger.debug(f"Could not get sample values for {table_name}: {e}")

        return None

    def _update_embedding(self, table_name: str, metadata: Dict[str, Any]) -> None:
        """Update semantic search embedding for a table (best-effort)."""
        try:
            from backend.core.semantic_search import get_semantic_search
            semantic = get_semantic_search()
            semantic.embed_table(table_name, metadata)
            semantic._save_embeddings()
        except Exception as e:
            logger.warning(f"Could not update embedding for {table_name}: {e}")
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
_data_catalog = None


def get_data_catalog() -> DataCatalog:
    """Return the process-wide DataCatalog, instantiating it lazily."""
    global _data_catalog
    if _data_catalog is not None:
        return _data_catalog
    _data_catalog = DataCatalog()
    return _data_catalog
|
backend/core/database.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlmodel import SQLModel, create_engine
|
| 2 |
+
from sqlmodel.ext.asyncio.session import AsyncSession
|
| 3 |
+
from sqlalchemy.orm import sessionmaker
|
| 4 |
+
from sqlalchemy.pool import NullPool
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Database connection configuration.
#
# Defaults target a local Postgres.app instance, which usually runs as the
# current OS user with no password.  For deployments, set DATABASE_URL to
# override the whole connection string.
# Format: postgresql+asyncpg://user:password@host/dbname
user = os.getenv("USER", "postgres")
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    f"postgresql+asyncpg://{user}:@localhost/geoquery",
)

# SQL statement echoing is handy in development but very noisy in production;
# disable it with SQL_ECHO=0 (default preserves the previous echo=True).
_SQL_ECHO = os.getenv("SQL_ECHO", "1").lower() not in ("0", "false", "no")

engine = create_engine(
    DATABASE_URL,
    echo=_SQL_ECHO,
    future=True,
    poolclass=NullPool  # Disable pooling for asyncpg if needed, or adjust
)

# Async Engine for AsyncPG
from sqlalchemy.ext.asyncio import create_async_engine
async_engine = create_async_engine(DATABASE_URL, echo=_SQL_ECHO, future=True)
|
| 23 |
+
|
| 24 |
+
async def get_session() -> AsyncSession:
    """Async-generator dependency yielding one DB session per request."""
    session_factory = sessionmaker(async_engine, class_=AsyncSession, expire_on_commit=False)
    async with session_factory() as session:
        yield session
|
| 30 |
+
|
| 31 |
+
async def init_db():
    """Create all SQLModel-declared tables (no-op for tables that already exist)."""
    async with async_engine.begin() as conn:
        # Destructive reset — uncomment to drop the schema before recreating:
        # await conn.run_sync(SQLModel.metadata.drop_all)
        await conn.run_sync(SQLModel.metadata.create_all)
|
backend/core/geo_engine.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, Any, Optional, List
|
| 6 |
+
from backend.core.data_catalog import get_data_catalog
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
class GeoEngine:
    """
    Singleton wrapper around an in-memory DuckDB connection with the spatial
    extension.  Lazily loads catalog datasets as tables and executes spatial
    SQL, returning GeoJSON FeatureCollections.
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(GeoEngine, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return

        logger.info("Initializing GeoEngine (DuckDB)...")
        try:
            self.con = duckdb.connect(database=':memory:')
            self.con.install_extension('spatial')
            self.con.load_extension('spatial')
            logger.info("GeoEngine initialized with Spatial extension.")
        except Exception as e:
            logger.error(f"Failed to initialize GeoEngine: {e}")
            raise e

        self.layers = {}  # layer_id -> table_name
        self.catalog = get_data_catalog()
        self.base_tables_loaded = False
        self.initialized = True

        # Automatically load base tables
        self.initialize_base_tables()

    def initialize_base_tables(self):
        """
        Load essential administrative boundary files into DuckDB tables.
        """
        if self.base_tables_loaded:
            return

        logger.info("Loading base tables into DuckDB...")

        # Load core admin tables from catalog: everything in the 'base' category.
        base_tables = [
            name for name, meta in self.catalog.catalog.items()
            if meta.get('category') == 'base'
        ]

        for table_name in base_tables:
            self.ensure_table_loaded(table_name)

        self.base_tables_loaded = True
        logger.info("Base tables loaded.")

    def ensure_table_loaded(self, table_name: str) -> bool:
        """
        Ensure a table is loaded in DuckDB. If not, load it from the catalog.
        Returns True if successful, False otherwise.
        """
        # Check if already loaded; DESCRIBE raising is our "not loaded" signal.
        try:
            self.con.execute(f"DESCRIBE {table_name}")
            return True
        except Exception:
            pass  # Not loaded

        # Look up in catalog
        file_path = self.catalog.get_file_path(table_name)
        if not file_path or not file_path.exists():
            logger.warning(f"Table {table_name} not found in catalog or file missing.")
            return False

        try:
            logger.info(f"Lazy loading table: {table_name}")
            self.con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM ST_Read('{file_path}')")
            return True
        except Exception as e:
            logger.error(f"Failed to load {table_name}: {e}")
            return False

    def get_table_schemas(self) -> str:
        """
        Get schema of currently loaded tables for LLM context.
        """
        result = "Currently Loaded Tables:\n\n"

        try:
            # Get all tables
            tables = self.con.execute("SHOW TABLES").fetchall()
            for table in tables:
                table_name = table[0]
                try:
                    columns = self.con.execute(f"DESCRIBE {table_name}").fetchall()
                    row_count = self.con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

                    result += f"### {table_name} ({row_count} rows)\n"
                    result += "Columns:\n"

                    for col in columns:
                        col_name, col_type = col[0], col[1]
                        if col_name == 'geom':
                            result += f"  - geom: GEOMETRY (spatial data)\n"
                        else:
                            result += f"  - {col_name}: {col_type}\n"
                    result += "\n"
                except Exception:
                    # Skip tables that cannot be described (e.g. dropped concurrently)
                    pass
        except Exception as e:
            logger.error(f"Error getting schemas: {e}")

        return result

    def get_table_list(self) -> List[str]:
        """Return list of all available table names (catalog datasets + ad-hoc layers)."""
        # Fix: previously referenced self.BASE_TABLES, which never existed and
        # raised AttributeError.  Catalog datasets are the base inventory.
        tables = list(self.catalog.catalog.keys())
        tables.extend(self.layers.values())
        return tables

    def register_layer(self, layer_id: str, geojson: Dict[str, Any]) -> str:
        """
        Registers a GeoJSON object as a table in DuckDB.
        Returns the table name.

        Implementation: ST_Read only reads from files, so the GeoJSON is
        written to a temporary file and loaded from there.
        """
        table_name = f"layer_{layer_id.replace('-', '_')}"

        # If table exists, drop it
        self.con.execute(f"DROP TABLE IF EXISTS {table_name}")

        try:
            import tempfile
            import os

            def json_serial(obj):
                """JSON serializer for objects not serializable by default json code"""
                if hasattr(obj, 'isoformat'):
                    return obj.isoformat()
                raise TypeError(f"Type {type(obj)} not serializable")

            tmp_path = None
            try:
                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
                    json.dump(geojson, tmp, default=json_serial)
                    tmp_path = tmp.name

                self.con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM ST_Read('{tmp_path}')")
            finally:
                # Previously the temp file leaked when ST_Read failed.
                if tmp_path:
                    os.unlink(tmp_path)

            self.layers[layer_id] = table_name
            logger.info(f"Registered layer {layer_id} as table {table_name}")
            return table_name

        except Exception as e:
            logger.error(f"Error registering layer {layer_id}: {e}")
            raise e

    def execute_spatial_query(self, sql: str) -> Dict[str, Any]:
        """
        Executes a SQL query and returns the result as a GeoJSON FeatureCollection.
        Expects the query to return a geometry column named 'geom' or 'geometry'.

        Raises ValueError if the result has no geometry column.
        """
        try:
            logger.info(f"Executing Spatial SQL: {sql}")

            # Materialize the result so its schema can be inspected before
            # deciding how to serialize the geometry column.
            self.con.execute(f"CREATE OR REPLACE TEMP TABLE query_result AS {sql}")

            # Check columns to find geometry
            columns = self.con.execute("DESCRIBE query_result").fetchall()
            geom_col = next((c[0] for c in columns if c[0] in ['geom', 'geometry']), None)

            if not geom_col:
                # Fail fast with a clear message instead of generating invalid
                # SQL like ST_AsGeoJSON(None).
                raise ValueError("Query result has no geometry column ('geom' or 'geometry').")

            # Construct GeoJSON manually: select the geometry as GeoJSON text
            # plus every other column as feature properties.
            other_cols = [c[0] for c in columns if c[0] != geom_col]
            other_cols_select = ", ".join(other_cols) if other_cols else ""

            select_clause = f"ST_AsGeoJSON({geom_col})"
            if other_cols_select:
                select_clause += f", {other_cols_select}"

            rows = self.con.execute(f"SELECT {select_clause} FROM query_result").fetchall()

            features = []
            for row in rows:
                geometry = json.loads(row[0])
                properties = {}
                for i, col_name in enumerate(other_cols):
                    properties[col_name] = row[i+1]

                features.append({
                    "type": "Feature",
                    "geometry": geometry,
                    "properties": properties
                })

            return {
                "type": "FeatureCollection",
                "features": features,
                "properties": {}
            }

        except Exception as e:
            logger.error(f"Spatial query failed: {e}")
            raise e

    def get_table_name(self, layer_id: str) -> Optional[str]:
        """Return the DuckDB table name registered for a layer id, or None."""
        return self.layers.get(layer_id)
|
| 237 |
+
|
| 238 |
+
_geo_engine = None


def get_geo_engine() -> GeoEngine:
    """Return the shared GeoEngine singleton, instantiating it lazily."""
    global _geo_engine
    if _geo_engine is not None:
        return _geo_engine
    _geo_engine = GeoEngine()
    return _geo_engine
|
backend/core/llm_gateway.py
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import asyncio
|
| 3 |
+
import json
|
| 4 |
+
from google import genai
|
| 5 |
+
from google.genai import types
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
from backend.core.prompts import (
|
| 9 |
+
SYSTEM_INSTRUCTION,
|
| 10 |
+
INTENT_DETECTION_PROMPT,
|
| 11 |
+
DATA_DISCOVERY_PROMPT,
|
| 12 |
+
SQL_GENERATION_PROMPT,
|
| 13 |
+
EXPLANATION_PROMPT,
|
| 14 |
+
SPATIAL_SQL_PROMPT,
|
| 15 |
+
SPATIAL_SQL_PROMPT,
|
| 16 |
+
SQL_CORRECTION_PROMPT,
|
| 17 |
+
LAYER_NAME_PROMPT
|
| 18 |
+
)
|
class LLMGateway:
    """Async wrapper around the Google Gemini SDK.

    Centralizes every LLM call the backend makes: chat, intent detection,
    data discovery, SQL generation/correction, explanations, and layer naming.
    If no API key is found, ``self.client`` is ``None`` and every method
    degrades to a safe fallback instead of raising.
    """

    def __init__(self, model_name: str = "gemini-3-flash-preview"):
        # Load environment variables if not already loaded
        load_dotenv()

        # Accept either variable name; GEMINI_API_KEY takes precedence.
        self.api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            # Degraded mode: keep running, but LLM-backed features return fallbacks.
            print("WARNING: GEMINI_API_KEY/GOOGLE_API_KEY not found. LLM features will not work.")
            self.client = None
        else:
            # Explicitly setting the environment variable for the SDK if it's not set
            # (covers the case where only GOOGLE_API_KEY was provided).
            if "GEMINI_API_KEY" not in os.environ and self.api_key:
                os.environ["GEMINI_API_KEY"] = self.api_key

            # The SDK automatically picks up GEMINI_API_KEY
            self.client = genai.Client()

        # Model identifier passed to every generate_content call.
        self.model = model_name
| 38 |
+
|
| 39 |
+
def _build_contents_from_history(self, history: list[dict], current_message: str) -> list:
|
| 40 |
+
"""
|
| 41 |
+
Converts conversation history to the format expected by the Gemini API.
|
| 42 |
+
History format: [{"role": "user"|"assistant", "content": "..."}]
|
| 43 |
+
"""
|
| 44 |
+
contents = []
|
| 45 |
+
for msg in history:
|
| 46 |
+
# Map 'assistant' to 'model' for Gemini API
|
| 47 |
+
role = "model" if msg["role"] == "assistant" else "user"
|
| 48 |
+
contents.append(
|
| 49 |
+
types.Content(
|
| 50 |
+
role=role,
|
| 51 |
+
parts=[types.Part.from_text(text=msg["content"])]
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
# Add the current message
|
| 56 |
+
contents.append(
|
| 57 |
+
types.Content(
|
| 58 |
+
role="user",
|
| 59 |
+
parts=[types.Part.from_text(text=current_message)]
|
| 60 |
+
)
|
| 61 |
+
)
|
| 62 |
+
return contents
|
| 63 |
+
|
| 64 |
+
async def generate_response_stream(self, user_query: str, history: list[dict] = None):
|
| 65 |
+
"""
|
| 66 |
+
Generates a streaming response using conversation history for context.
|
| 67 |
+
Yields chunks of text and thought summaries.
|
| 68 |
+
"""
|
| 69 |
+
if not self.client:
|
| 70 |
+
yield "I couldn't generate a response because the API key is missing."
|
| 71 |
+
return
|
| 72 |
+
|
| 73 |
+
if history is None:
|
| 74 |
+
history = []
|
| 75 |
+
|
| 76 |
+
try:
|
| 77 |
+
contents = self._build_contents_from_history(history, user_query)
|
| 78 |
+
|
| 79 |
+
# Enable thinking mode for general chat as well
|
| 80 |
+
config = types.GenerateContentConfig(
|
| 81 |
+
system_instruction=SYSTEM_INSTRUCTION,
|
| 82 |
+
thinking_config=types.ThinkingConfig(
|
| 83 |
+
include_thoughts=True # Enable thought summaries
|
| 84 |
+
)
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
stream = await asyncio.to_thread(
|
| 88 |
+
self.client.models.generate_content_stream,
|
| 89 |
+
model=self.model,
|
| 90 |
+
contents=contents,
|
| 91 |
+
config=config,
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
for chunk in stream:
|
| 95 |
+
for part in chunk.candidates[0].content.parts:
|
| 96 |
+
if part.thought:
|
| 97 |
+
yield {"type": "thought", "content": part.text}
|
| 98 |
+
elif part.text:
|
| 99 |
+
yield {"type": "content", "text": part.text}
|
| 100 |
+
|
| 101 |
+
except Exception as e:
|
| 102 |
+
print(f"Error calling Gemini stream: {e}")
|
| 103 |
+
yield f"Error: {str(e)}"
|
| 104 |
+
|
| 105 |
+
async def generate_response(self, user_query: str, history: list[dict] = None) -> str:
|
| 106 |
+
"""
|
| 107 |
+
Generates a response using conversation history for context.
|
| 108 |
+
"""
|
| 109 |
+
if not self.client:
|
| 110 |
+
return "I couldn't generate a response because the API key is missing."
|
| 111 |
+
|
| 112 |
+
if history is None:
|
| 113 |
+
history = []
|
| 114 |
+
|
| 115 |
+
try:
|
| 116 |
+
contents = self._build_contents_from_history(history, user_query)
|
| 117 |
+
|
| 118 |
+
config = types.GenerateContentConfig(
|
| 119 |
+
system_instruction=SYSTEM_INSTRUCTION,
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
response = await asyncio.to_thread(
|
| 123 |
+
self.client.models.generate_content,
|
| 124 |
+
model=self.model,
|
| 125 |
+
contents=contents,
|
| 126 |
+
config=config,
|
| 127 |
+
)
|
| 128 |
+
return response.text
|
| 129 |
+
except Exception as e:
|
| 130 |
+
print(f"Error calling Gemini: {e}")
|
| 131 |
+
return f"I encountered an error: {e}"
|
| 132 |
+
|
| 133 |
+
async def detect_intent(self, user_query: str, history: list[dict] = None) -> str:
|
| 134 |
+
"""
|
| 135 |
+
Detects the intent of the user's query using Gemini thinking mode.
|
| 136 |
+
Returns: GENERAL_CHAT, DATA_QUERY, MAP_REQUEST, SPATIAL_OP, or STAT_QUERY
|
| 137 |
+
"""
|
| 138 |
+
if not self.client:
|
| 139 |
+
return "GENERAL_CHAT"
|
| 140 |
+
|
| 141 |
+
intent_prompt = INTENT_DETECTION_PROMPT.format(user_query=user_query)
|
| 142 |
+
|
| 143 |
+
try:
|
| 144 |
+
# Use thinking mode for better intent classification
|
| 145 |
+
config = types.GenerateContentConfig(
|
| 146 |
+
thinking_config=types.ThinkingConfig(
|
| 147 |
+
thinking_level="medium" # Balanced thinking for intent detection
|
| 148 |
+
)
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
response = await asyncio.to_thread(
|
| 152 |
+
self.client.models.generate_content,
|
| 153 |
+
model=self.model,
|
| 154 |
+
contents=intent_prompt,
|
| 155 |
+
config=config,
|
| 156 |
+
)
|
| 157 |
+
intent = response.text.strip().upper()
|
| 158 |
+
|
| 159 |
+
# Validate the intent
|
| 160 |
+
if intent in ["GENERAL_CHAT", "DATA_QUERY", "MAP_REQUEST", "SPATIAL_OP", "STAT_QUERY"]:
|
| 161 |
+
return intent
|
| 162 |
+
|
| 163 |
+
# Default fallback
|
| 164 |
+
return "GENERAL_CHAT"
|
| 165 |
+
except Exception as e:
|
| 166 |
+
print(f"Error detecting intent: {e}")
|
| 167 |
+
return "GENERAL_CHAT"
|
    async def stream_intent(self, user_query: str, history: list[dict] = None):
        """
        Streams intent detection, yielding thoughts.

        Yields dicts:
            {"type": "thought", "text": ...}  -- model reasoning summaries
            {"type": "content", "text": ...}  -- the intent label text
            {"type": "error", "text": ...}    -- on missing key or API failure

        Note: unlike detect_intent(), the emitted label is NOT validated here;
        callers must check it. `history` is currently unused.
        """
        if not self.client:
            yield {"type": "error", "text": "API Key missing"}
            return

        intent_prompt = INTENT_DETECTION_PROMPT.format(user_query=user_query)

        try:
            config = types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(
                    thinking_level="medium",   # balanced reasoning for classification
                    include_thoughts=True      # surface thought summaries to the UI
                )
            )

            # The SDK stream call is blocking; run it on a worker thread.
            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=intent_prompt,
                config=config,
            )

            for chunk in stream:
                for part in chunk.candidates[0].content.parts:
                    if part.thought:
                        yield {"type": "thought", "text": part.text}
                    elif part.text:
                        yield {"type": "content", "text": part.text}

        except Exception as e:
            print(f"Error detecting intent: {e}")
            yield {"type": "error", "text": str(e)}
| 204 |
+
|
| 205 |
+
# Legacy generate_sql removed.
|
| 206 |
+
|
| 207 |
+
async def identify_relevant_tables(self, user_query: str, table_summaries: str) -> list[str]:
|
| 208 |
+
"""
|
| 209 |
+
Identifies which tables are relevant for the user's query from the catalog summary.
|
| 210 |
+
Returns a JSON list of table names.
|
| 211 |
+
"""
|
| 212 |
+
if not self.client:
|
| 213 |
+
return []
|
| 214 |
+
|
| 215 |
+
prompt = DATA_DISCOVERY_PROMPT.format(user_query=user_query, table_summaries=table_summaries)
|
| 216 |
+
|
| 217 |
+
try:
|
| 218 |
+
config = types.GenerateContentConfig(
|
| 219 |
+
response_mime_type="application/json"
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
response = await asyncio.to_thread(
|
| 223 |
+
self.client.models.generate_content,
|
| 224 |
+
model=self.model,
|
| 225 |
+
contents=prompt,
|
| 226 |
+
config=config,
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
text = response.text.replace("```json", "").replace("```", "").strip()
|
| 230 |
+
tables = json.loads(text)
|
| 231 |
+
return tables if isinstance(tables, list) else []
|
| 232 |
+
|
| 233 |
+
except Exception as e:
|
| 234 |
+
print(f"Error identifying tables: {e}")
|
| 235 |
+
return []
|
| 236 |
+
|
| 237 |
+
async def generate_analytical_sql(self, user_query: str, table_schema: str, history: list[dict] = None) -> str:
|
| 238 |
+
"""
|
| 239 |
+
Generates a DuckDB SQL query for analytical/statistical questions about geographic data.
|
| 240 |
+
This is the core of the text-to-SQL system.
|
| 241 |
+
"""
|
| 242 |
+
if not self.client:
|
| 243 |
+
return "-- Error: API Key missing"
|
| 244 |
+
|
| 245 |
+
prompt = SQL_GENERATION_PROMPT.format(table_schema=table_schema, user_query=user_query)
|
| 246 |
+
|
| 247 |
+
try:
|
| 248 |
+
# Use thinking mode for complex SQL generation
|
| 249 |
+
config = types.GenerateContentConfig(
|
| 250 |
+
temperature=1,
|
| 251 |
+
thinking_config=types.ThinkingConfig(
|
| 252 |
+
thinking_level="high" # Maximum reasoning for SQL generation
|
| 253 |
+
)
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
response = await asyncio.wait_for(
|
| 257 |
+
asyncio.to_thread(
|
| 258 |
+
self.client.models.generate_content,
|
| 259 |
+
model=self.model,
|
| 260 |
+
contents=prompt,
|
| 261 |
+
config=config,
|
| 262 |
+
),
|
| 263 |
+
timeout=120.0
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
sql = response.text.replace("```sql", "").replace("```", "").strip()
|
| 267 |
+
|
| 268 |
+
# Basic validation: must start with SELECT
|
| 269 |
+
if not sql.upper().strip().startswith("SELECT") and "-- ERROR" not in sql:
|
| 270 |
+
print(f"Warning: Generated SQL doesn't start with SELECT: {sql[:100]}")
|
| 271 |
+
if "SELECT" in sql.upper():
|
| 272 |
+
start_idx = sql.upper().find("SELECT")
|
| 273 |
+
sql = sql[start_idx:]
|
| 274 |
+
|
| 275 |
+
return sql
|
| 276 |
+
|
| 277 |
+
except asyncio.TimeoutError:
|
| 278 |
+
print("Gemini API call timed out after 30 seconds")
|
| 279 |
+
return "-- Error: API call timed out. Please try again."
|
| 280 |
+
except Exception as e:
|
| 281 |
+
print(f"Error calling Gemini for analytical SQL: {e}")
|
| 282 |
+
return f"-- Error generating SQL: {str(e)}"
|
    async def stream_analytical_sql(self, user_query: str, table_schema: str, history: list[dict] = None):
        """
        Streams the generation of DuckDB SQL, yielding thoughts and chunks.

        Yields dicts:
            {"type": "thought", "text": ...}  -- model reasoning summaries
            {"type": "content", "text": ...}  -- raw SQL text chunks (may include
                                                 markdown fences; callers strip them)
            {"type": "error", "text": ...}    -- on missing key or API failure

        Note: `history` is currently unused by this method.
        """
        if not self.client:
            yield {"type": "error", "text": "API Key missing"}
            return

        prompt = SQL_GENERATION_PROMPT.format(table_schema=table_schema, user_query=user_query)

        try:
            config = types.GenerateContentConfig(
                temperature=1,
                thinking_config=types.ThinkingConfig(
                    thinking_level="high",   # deepest reasoning: SQL generation is the hardest task
                    include_thoughts=True    # surface thought summaries to the client UI
                )
            )

            # The SDK stream call is blocking; run it on a worker thread.
            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=prompt,
                config=config,
            )

            for chunk in stream:
                for part in chunk.candidates[0].content.parts:
                    if part.thought:
                        yield {"type": "thought", "text": part.text}
                    elif part.text:
                        yield {"type": "content", "text": part.text}

        except Exception as e:
            print(f"Error streaming SQL: {e}")
            yield {"type": "error", "text": str(e)}
    async def stream_explanation(self, user_query: str, sql_query: str, data_summary: str, history: list[dict] = None):
        """
        Streams the explanation of query results back to the user.

        Yields dicts:
            {"type": "thought", "text": ...}  -- model reasoning summaries
            {"type": "content", "text": ...}  -- explanation text chunks
            {"type": "error", "text": ...}    -- on missing key or API failure
        """
        if not self.client:
            yield {"type": "error", "text": "API Key missing"}
            return

        # Build context from history if available
        context_str = ""
        if history:
            context_str = "Previous conversation context:\n"
            for msg in history[-4:]:  # Last 4 messages for context
                # Truncate each message to keep the prompt small.
                context_str += f"- {msg['role']}: {msg['content'][:100]}...\n"

        prompt = EXPLANATION_PROMPT.format(context_str=context_str, user_query=user_query, sql_query=sql_query, data_summary=data_summary)

        try:
            config = types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                thinking_config=types.ThinkingConfig(
                    thinking_level="low",    # fast responses: explanations are low-stakes
                    include_thoughts=True
                )
            )

            # The SDK stream call is blocking; run it on a worker thread.
            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=prompt,
                config=config,
            )

            for chunk in stream:
                for part in chunk.candidates[0].content.parts:
                    if part.thought:
                        yield {"type": "thought", "text": part.text}
                    elif part.text:
                        yield {"type": "content", "text": part.text}

        except Exception as e:
            print(f"Error generating explanation: {e}")
            yield {"type": "error", "text": str(e)}
    async def generate_explanation(self, user_query: str, sql_query: str, data_summary: str, history: list[dict] = None) -> str:
        """
        Explains the results of the query to the user, maintaining conversation context.

        Non-streaming counterpart of stream_explanation(). Returns a generic
        fallback sentence ("Here are the results from the query.") on failure
        rather than raising.
        """
        if not self.client:
            return "I couldn't generate an explanation because the API key is missing."

        # Build context from history if available
        context_str = ""
        if history:
            context_str = "Previous conversation context:\n"
            for msg in history[-4:]:  # Last 4 messages for context
                # Truncate each message to keep the prompt small.
                context_str += f"- {msg['role']}: {msg['content'][:100]}...\n"

        prompt = EXPLANATION_PROMPT.format(context_str=context_str, user_query=user_query, sql_query=sql_query, data_summary=data_summary)

        try:
            config = types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                thinking_config=types.ThinkingConfig(
                    thinking_level="low"  # Fast response for explanations
                )
            )

            # The SDK call is blocking; run it on a worker thread.
            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=prompt,
                config=config,
            )
            return response.text
        except Exception as e:
            print(f"Error generating explanation: {e}")
            return "Here are the results from the query."
| 399 |
+
|
| 400 |
+
async def generate_spatial_sql(self, user_query: str, layer_context: str, history: list[dict] = None) -> str:
|
| 401 |
+
"""
|
| 402 |
+
Generates a DuckDB Spatial SQL query for geometric operations on layers.
|
| 403 |
+
"""
|
| 404 |
+
if not self.client:
|
| 405 |
+
return "-- Error: API Key missing"
|
| 406 |
+
|
| 407 |
+
prompt = SPATIAL_SQL_PROMPT.format(layer_context=layer_context, user_query=user_query)
|
| 408 |
+
|
| 409 |
+
try:
|
| 410 |
+
config = types.GenerateContentConfig(
|
| 411 |
+
temperature=1,
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
# Add timeout to prevent indefinite hangs
|
| 415 |
+
response = await asyncio.wait_for(
|
| 416 |
+
asyncio.to_thread(
|
| 417 |
+
self.client.models.generate_content,
|
| 418 |
+
model=self.model,
|
| 419 |
+
contents=prompt,
|
| 420 |
+
config=config,
|
| 421 |
+
),
|
| 422 |
+
timeout=120.0
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
sql = response.text.replace("```sql", "").replace("```", "").strip()
|
| 426 |
+
return sql
|
| 427 |
+
|
| 428 |
+
except asyncio.TimeoutError:
|
| 429 |
+
print("Gemini API call timed out after 30 seconds")
|
| 430 |
+
return "-- Error: API call timed out. Please try again."
|
| 431 |
+
except Exception as e:
|
| 432 |
+
print(f"Error calling Gemini: {e}")
|
| 433 |
+
return f"-- Error generating SQL: {str(e)}"
|
| 434 |
+
|
| 435 |
+
async def correct_sql(self, user_query: str, incorrect_sql: str, error_message: str, schema_context: str) -> str:
|
| 436 |
+
"""
|
| 437 |
+
Corrects a failed SQL query based on the error message.
|
| 438 |
+
"""
|
| 439 |
+
if not self.client:
|
| 440 |
+
return "-- Error: API Key missing"
|
| 441 |
+
|
| 442 |
+
prompt = SQL_CORRECTION_PROMPT.format(
|
| 443 |
+
error_message=error_message,
|
| 444 |
+
incorrect_sql=incorrect_sql,
|
| 445 |
+
user_query=user_query,
|
| 446 |
+
schema_context=schema_context
|
| 447 |
+
)
|
| 448 |
+
|
| 449 |
+
try:
|
| 450 |
+
config = types.GenerateContentConfig(
|
| 451 |
+
temperature=1,
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
response = await asyncio.to_thread(
|
| 455 |
+
self.client.models.generate_content,
|
| 456 |
+
model=self.model,
|
| 457 |
+
contents=prompt,
|
| 458 |
+
config=config,
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
sql = response.text.replace("```sql", "").replace("```", "").strip()
|
| 462 |
+
return sql
|
| 463 |
+
|
| 464 |
+
except Exception as e:
|
| 465 |
+
print(f"Error correcting SQL: {e}")
|
| 466 |
+
return incorrect_sql
|
    async def generate_layer_name(self, user_query: str, sql_query: str) -> dict:
        """
        Generates a short, descriptive name, emoji, and point style for a map layer.
        Returns: {"name": str, "emoji": str, "pointStyle": str | None}

        Falls back to a generic name/emoji on any failure (missing key,
        API error, or unparseable JSON reply).
        """
        if not self.client:
            return {"name": "New Layer", "emoji": "📍", "pointStyle": None}

        prompt = LAYER_NAME_PROMPT.format(user_query=user_query, sql_query=sql_query)

        try:
            config = types.GenerateContentConfig(
                temperature=1,
                response_mime_type="application/json"  # force raw JSON output (no markdown fences)
            )

            # Use simple generate content (not streaming)
            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=prompt,
                config=config,
            )

            result = json.loads(response.text)
            # Defensive defaults in case the model omits a key.
            return {
                "name": result.get("name", "Map Layer"),
                "emoji": result.get("emoji", "📍"),
                "pointStyle": result.get("pointStyle", None)
            }
        except Exception as e:
            print(f"Error generating layer name: {e}")
            return {"name": "Map Layer", "emoji": "📍", "pointStyle": None}
backend/core/prompts.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Centralized storage for all LLM system instructions and prompt templates.
"""

# Global persona/behavior rules passed as `system_instruction` on every Gemini
# chat/explanation call (consumed by backend/core/llm_gateway.py).
SYSTEM_INSTRUCTION = """You are GeoQuery, an advanced Territorial Intelligence Agent capable of analyzing diverse geographic datasets.

## Your Capabilities
You are not limited to a fixed schema. You have a **Dynamic Metadata Catalog** that allows you to discover and query any dataset ingested into the system.
- **Administrative Data**: Provinces, districts, corregimientos (always available).
- **Dynamic Data**: You can query *any* table present in the database (e.g., population, health, infrastructure, biodiversity).
- **Spatial Analysis**: You can perform complex spatial joins, intersections, and aggregations using PostGIS/DuckDB Spatial functions.

## Output Guidelines
1. **Be Data-Driven**: Base your answers strictly on the results of the SQL queries.
2. **Be Visual**:
- Use **Choropleth Maps** (color gradients) for comparisons/densities.
- Use **Point Maps** for locating specific facilities or events.
- Use **Charts** (Bar, Pie, Line) for statistical summaries.
3. **Be Transparent**:
- Always **Explain** your reasoning.
- **Cite** the specific table names used (e.g., "Source: `osm/universities.geojson`").
- If data is missing *after* checking the catalog, explain clearly what is available vs. what is missing.

## Interaction Style
- Professional, concise, and helpful.
- "Thinking" is enabled: Use your internal thought process to plan complex queries before generating SQL.
- If a query fails, self-correct by analyzing the error message.
"""

# Template for classifying a user message into one of five intent labels.
# Placeholders: {user_query}. The model must reply with exactly one label word;
# LLMGateway.detect_intent validates the result against the allowed set.
INTENT_DETECTION_PROMPT = """Analyze this user query and determine the best output type.

User Query: "{user_query}"

THINK STEP BY STEP:
1. What is the user asking for?
2. Does this require geographic visualization (map)?
3. Does this require a chart/graph?
4. Is this a general question or conversation?

Then respond with ONLY ONE of these exact words:
- GENERAL_CHAT: General question, greeting, or conversational message
- DATA_QUERY: Wants textual information or data that should be shown on a map
- MAP_REQUEST: Wants to SEE or VISUALIZE geographic data on a map (show, display, plot, color, compare regions)
- SPATIAL_OP: Geometric operation between layers (Intersection, Buffer, Union, Difference)
- STAT_QUERY: EXPLICITLY asks for a CHART or GRAPH (bar chart, pie chart, line graph)

Key rules:
- "color by", "compare regions", "show largest/smallest" → MAP_REQUEST (for choropleth)
- "show me provinces", "display districts" → MAP_REQUEST
- "create a chart", "bar graph" → STAT_QUERY
- Questions about data availability → GENERAL_CHAT

Respond with only the intent category, nothing else."""
|

# Template asking the model to pick relevant catalog tables for a query.
# Placeholders: {user_query}, {table_summaries}. Expects a JSON list of names;
# parsed by LLMGateway.identify_relevant_tables.
DATA_DISCOVERY_PROMPT = """You are a Data Discovery Agent. Convert the user's request into a list of relevant table names from the available data.

User Request: "{user_query}"

Available Data Tables:
{table_summaries}

Rules:
1. Return ONLY a valid JSON list of strings, e.g. ["table1", "table2"].
2. Select tables that might contain the answer.
3. If asking for "colleges" or "education", include 'universities', 'schools', etc.
4. If asking for "health", include 'hospitals'.
5. Always include 'admin1', 'admin2', 'admin3' if the query involves regions.
6. If no specific table matches, return empty list [].
"""

# Core text-to-SQL template. Placeholders: {table_schema}, {user_query}.
# The "-- ERROR: DATA_UNAVAILABLE" contract is detected downstream by callers
# of LLMGateway.generate_analytical_sql / stream_analytical_sql.
SQL_GENERATION_PROMPT = """You are a DuckDB SQL expert for geographic data analysis. Generate a valid DuckDB SQL query for the following request.

{table_schema}

### CRITICAL - Data Availability:
✅ You may ONLY query the tables listed above.
❌ Do NOT invent table names or column names.

**If the requested data is NOT in the schema above, IMMEDIATELY return this exact response and STOP:**
-- ERROR: DATA_UNAVAILABLE
-- Requested: [what the user asked for]
-- Available: [list the tables you DO have]

**Do NOT keep thinking or try alternative approaches. Just return the error and stop.**

### User Request: "{user_query}"

### Rules:
1. Return ONLY the SQL query. No explanation, no markdown formatting.
2. Use DuckDB syntax (ILIKE for case-insensitive matching).
3. ALWAYS include 'geom' in SELECT for map visualization.
4. For "top N" or "largest" queries, use ORDER BY ... DESC LIMIT N.
5. For "per group" queries, use window functions.
6. Do NOT add LIMIT unless the user explicitly asks for a specific count (e.g., "top 10", "first 5"). Return all matching rows by default.
7. NEVER invent columns that don't exist.

### Special Datasets:
- **Population/Demographics**: Use `kontur_population` (H3 hexagons).
- Columns: `population`, `geom`.
- Query: `SELECT population, geom FROM kontur_population ...`
- Visualization: The system detects the `population` column and automatically renders a heatmap (choropleth).
- Note: This dataset is large (33k hexagons). If querying the entire country, use `LIMIT 40000` to ensure full coverage, or filter by specific province/district.

### Example Queries:

-- Largest provinces by area
SELECT adm1_name, area_sqkm, geom FROM admin1 ORDER BY area_sqkm DESC LIMIT 10

-- Population Density Heatmap for a Region (e.g., Veraguas)
SELECT population, geom FROM kontur_population
WHERE ST_Intersects(geom, (SELECT geom FROM pan_admin1 WHERE adm1_name = 'Veraguas'))
LIMIT 5000

-- Largest district in each province
SELECT adm1_name, adm2_name, area_sqkm, geom FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY adm1_name ORDER BY area_sqkm DESC) as rn
FROM admin2
) WHERE rn = 1

Now generate the SQL for the user's request:"""

# Template for narrating query results back to the user.
# Placeholders: {context_str}, {user_query}, {sql_query}, {data_summary}.
EXPLANATION_PROMPT = """Explain the results of this data query to the user.

{context_str}

User Question: "{user_query}"
SQL Query Used: {sql_query}
Data Result Summary: {data_summary}

Instructions:
1. Keep your response concise and helpful
2. Only describe data that was ACTUALLY returned in the query results
3. The available metrics include: area (area_sqkm), population (kontur_population), names, and geographic codes
4. If the user asked for data that doesn't exist, explain that clearly
5. Cite: "Administrative boundary data from HDX/INEC, 2021" or "Population data from Kontur, 2022"
6. Speak as GeoQuery, the platform itself
"""
|

# Template for geometric operations between layers (used by generate_spatial_sql).
# Placeholders: {layer_context}, {user_query}.
SPATIAL_SQL_PROMPT = """You are a GIS expert using DuckDB Spatial. Generate a valid SQL query for the following request.

Available Data:
{layer_context}

User Request: "{user_query}"

Rules:
1. Return ONLY the SQL query. No markdown formatting, no explanation.
2. Use DuckDB Spatial functions (ST_Difference, ST_Intersection, ST_Union, ST_Buffer, ST_Within, ST_Contains).
3. The geometry column is named 'geom'. Use 'geom' for all spatial functions.
4. CRITICAL: Use ONLY the EXACT table names shown above in your FROM clause.
- Base tables are shown with their schema (e.g., panama_healthsites_geojson)
- User-created layers are shown as "Layer N: Name (Table: layer_xxxxx)"
5. IMPORTANT: For operations that aggregate geometries (ST_Union), use CTE pattern, NOT scalar subqueries:
CORRECT (CTE pattern):
```sql
WITH layer_b_union AS (SELECT ST_Union(geom) as geom FROM layer_b)
SELECT a.*, ST_Difference(a.geom, b.geom) as geom FROM layer_a a, layer_b_union b
```
WRONG (scalar subquery - causes syntax errors):
```sql
SELECT ST_Difference(geom, (SELECT ST_Union(geom) FROM layer_b)) FROM layer_a
```
6. For containment queries (points within polygons), use ST_Within(points.geom, polygons.geom).
7. Handle joins properly (e.g., CROSS JOIN or comma-join for combining with CTEs).
8. IMPORTANT: Preserve 'name' properties if possible.
9. OUTPUT: SELECT with geom column included.
"""

# Template for the self-correction retry loop (used by LLMGateway.correct_sql).
# Placeholders: {error_message}, {incorrect_sql}, {user_query}, {schema_context}.
SQL_CORRECTION_PROMPT = """You are a DuckDB SQL expert. Your previous query failed to execute. Fix it.

### Error Message:
{error_message}

### Failed SQL:
{incorrect_sql}

### User Request:
"{user_query}"

### Database Schema:
{schema_context}

### Rules:
1. Fix the error described in the message (e.g., column ambiguity, missing column, syntax error).
2. Return ONLY the valid SQL query. No explanation.
3. Keep the query logic consistent with the User Request.
4. Ensure 'geom' is selected for map visualization if needed.
"""

# Template producing a JSON object {"name", "emoji", "pointStyle"} for a new
# map layer (used by LLMGateway.generate_layer_name). Placeholders:
# {user_query}, {sql_query}. Doubled braces {{...}} are literal JSON examples.
LAYER_NAME_PROMPT = """You are a helpful assistant generating a short, descriptive name for a map layer.

User Request: "{user_query}"
SQL Query: "{sql_query}"

Rules:
1. Return a VALID JSON object with three keys: "name", "emoji", and "pointStyle".
2. "name": A short descriptive name (1-4 words).
3. "emoji": A single emoji representing the data content (e.g., "🏥" for hospitals, "🎓" for schools, "👥" for population).
4. "pointStyle": Determines how POINT geometries should be rendered on the map (ONLY applies to Point geometry types):
- "icon": Use for specific, categorical points of interest (hospitals, schools, parks, landmarks)
* Best for: Small to medium point datasets (<500 points)
* Best for: When each point represents a distinct, identifiable feature
* The emoji will be displayed on the map as the marker icon
- "circle": Use for large point datasets
* Best for: Large point datasets (>500 points) like street intersections, sensor locations
* Renders as simple colored circles for better performance
- NOTE: For polygon data (H3 hexagons, administrative boundaries), the system automatically uses choropleth rendering (colored polygons). Do NOT set pointStyle for polygon data.
5. Examples:
{{"name": "Schools in Panama", "emoji": "🏫", "pointStyle": "icon"}}
{{"name": "Population Density", "emoji": "👥", "pointStyle": null}} # H3 hexagons are POLYGONS, not points
{{"name": "National Parks", "emoji": "🌲", "pointStyle": "icon"}}
{{"name": "Street Intersections", "emoji": "🚦", "pointStyle": "circle"}}
6. Do NOT return markdown formatting (no ```json). Just the raw JSON string.

"""
|

# Template for decomposing a complex query into atomic execution steps
# (consumed by backend/core/query_planner.py). Placeholders: {user_query},
# {available_tables}. Doubled braces {{...}} are literal JSON in the output.
QUERY_PLANNING_PROMPT = """You are a Query Planning Agent. Decompose this complex query into atomic execution steps.

User Query: "{user_query}"

Available Tables:
{available_tables}

TASK: Break down this query into sequential steps that can be executed independently.

RULES:
1. Each step should query a SINGLE dataset or combine results from previous steps.
2. Steps that don't depend on each other can run in parallel.
3. The final step should combine/compare results if needed.
4. Use ONLY the table names listed above.

Return a JSON object with this structure:
{{
"steps": [
{{
"type": "data_query" | "aggregation" | "comparison" | "spatial_join" | "combine",
"description": "Human-readable description of this step",
"tables": ["table_name"],
"sql_hint": "Optional SQL pattern or hint",
"depends_on": [],
"result_name": "descriptive_name_for_result"
}}
],
"combination_logic": "How to combine the step results for the final answer"
}}

EXAMPLE for "Compare hospital count vs school count by province":
{{
"steps": [
{{
"type": "aggregation",
"description": "Count hospitals per province",
"tables": ["panama_healthsites_geojson", "pan_admin1"],
"sql_hint": "SELECT province, COUNT(*) as hospital_count FROM ... GROUP BY province",
"depends_on": [],
"result_name": "hospitals_by_province"
}},
{{
"type": "aggregation",
"description": "Count schools per province",
"tables": ["schools", "pan_admin1"],
"sql_hint": "SELECT province, COUNT(*) as school_count FROM ... GROUP BY province",
"depends_on": [],
"result_name": "schools_by_province"
}},
{{
"type": "combine",
"description": "Join hospital and school counts by province for comparison",
"tables": [],
"sql_hint": "SELECT * FROM hospitals_by_province h JOIN schools_by_province s ON h.province = s.province",
"depends_on": ["hospitals_by_province", "schools_by_province"],
"result_name": "comparison_result"
}}
],
"combination_logic": "Display side-by-side comparison with bar chart showing both counts per province"
}}

Now decompose the user's query. Return ONLY the JSON, no markdown formatting.
"""
|
backend/core/query_planner.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multi-Step Query Planner
|
| 3 |
+
|
| 4 |
+
Detects complex queries that require multiple datasets or operations,
|
| 5 |
+
decomposes them into atomic steps, and orchestrates execution.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from typing import List, Dict, Any, Optional
|
| 12 |
+
from enum import Enum
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class StepType(Enum):
    """Kinds of atomic operations a query-plan step can perform.

    The string values match the "type" field emitted by the planning LLM,
    so plan JSON can be parsed via ``StepType(value)`` (see
    ``QueryPlanner.plan_query``).
    """
    DATA_QUERY = "data_query"      # Simple data retrieval
    AGGREGATION = "aggregation"    # COUNT, SUM, GROUP BY
    COMPARISON = "comparison"      # Comparing results from previous steps
    SPATIAL_JOIN = "spatial_join"  # Joining datasets spatially
    COMBINE = "combine"            # Merge/combine step results
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class QueryStep:
    """One atomic unit of work inside a QueryPlan.

    Fields mirror the JSON step objects produced by the planning LLM
    (see ``QueryPlanner.plan_query``); ``depends_on`` lists identifiers
    of prerequisite steps.
    """
    step_id: str
    step_type: StepType
    description: str
    tables_needed: List[str]
    sql_template: Optional[str] = None
    depends_on: List[str] = field(default_factory=list)
    result_name: str = ""  # label under which this step's output is stored

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the step to a plain, JSON-friendly dict."""
        return dict(
            step_id=self.step_id,
            step_type=self.step_type.value,  # enum -> its string value
            description=self.description,
            tables_needed=self.tables_needed,
            sql_template=self.sql_template,
            depends_on=self.depends_on,
            result_name=self.result_name,
        )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class QueryPlan:
    """Complete execution plan for a (possibly complex) user query.

    ``parallel_groups`` holds batches of step_ids that may be executed
    concurrently within each batch.
    """
    original_query: str
    is_complex: bool
    steps: List[QueryStep] = field(default_factory=list)
    parallel_groups: List[List[str]] = field(default_factory=list)  # Steps that can run in parallel
    final_combination_logic: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the plan, including all of its steps, to a JSON-friendly dict."""
        serialized_steps = [step.to_dict() for step in self.steps]
        return dict(
            original_query=self.original_query,
            is_complex=self.is_complex,
            steps=serialized_steps,
            parallel_groups=self.parallel_groups,
            final_combination_logic=self.final_combination_logic,
        )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class QueryPlanner:
    """
    Multi-step query planning service (process-wide singleton).

    Uses lightweight keyword heuristics to decide whether a natural-language
    query needs multi-step planning, and delegates to an LLM to decompose
    complex queries into executable atomic steps (see plan_query).
    """

    _instance = None

    # Keywords that often indicate multi-step queries.
    # NOTE: matched by case-insensitive substring containment (see
    # detect_complexity), so short entries such as "vs" can also match
    # inside longer words.
    COMPLEXITY_INDICATORS = [
        "compare", "comparison", "versus", "vs",
        "more than", "less than", "higher than", "lower than",
        "both", "and also", "as well as",
        "ratio", "percentage", "proportion",
        "correlation", "relationship between",
        "combine", "merge", "together with",
        "relative to", "compared to",
        "difference between", "gap between"
    ]

    # Keywords indicating multiple distinct data types (domains)
    MULTI_DOMAIN_KEYWORDS = {
        "health": ["hospital", "clinic", "healthcare", "health", "medical"],
        "education": ["school", "university", "education", "college", "student"],
        "infrastructure": ["road", "bridge", "infrastructure", "building"],
        "environment": ["forest", "water", "environment", "park", "protected"],
        "population": ["population", "demographic", "census", "people", "resident"]
    }

    def __new__(cls):
        # Classic singleton: allocate once, hand back the same instance after.
        if cls._instance is None:
            cls._instance = super(QueryPlanner, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return
        self.initialized = True

    def detect_complexity(self, query: str) -> Dict[str, Any]:
        """
        Analyze a query to determine if it requires multi-step planning.

        A query is considered complex when comparison/aggregation phrasing
        co-occurs with at least two data domains, or when three or more
        domains are mentioned at once.

        Args:
            query: Raw natural-language user query.

        Returns:
            {
                "is_complex": bool,
                "reason": str,
                "detected_domains": List[str],
                "complexity_indicators": List[str]
            }
        """
        query_lower = query.lower()

        # Comparison/aggregation phrasing present in the query
        found_indicators = [
            ind for ind in self.COMPLEXITY_INDICATORS
            if ind in query_lower
        ]

        # Distinct data domains the query touches
        found_domains = []
        for domain, keywords in self.MULTI_DOMAIN_KEYWORDS.items():
            if any(kw in query_lower for kw in keywords):
                found_domains.append(domain)

        # Complex when: any indicator + >=2 domains, or >=3 domains, or an
        # explicit comparison word + >=2 domains.
        is_complex = (
            len(found_indicators) > 0 and len(found_domains) >= 2
        ) or (
            len(found_domains) >= 3
        ) or (
            any(x in query_lower for x in ["compare", "ratio", "correlation", "versus", " vs "])
            and len(found_domains) >= 2
        )

        reason = ""
        if is_complex:
            if len(found_domains) >= 2:
                reason = f"Query involves multiple data domains: {', '.join(found_domains)}"
            if found_indicators:
                reason += f". Contains comparison/aggregation keywords: {', '.join(found_indicators[:3])}"

        return {
            "is_complex": is_complex,
            "reason": reason,
            "detected_domains": found_domains,
            "complexity_indicators": found_indicators
        }

    @staticmethod
    def _strip_code_fences(response: str) -> str:
        """Strip surrounding ``` / ```json markdown fences from an LLM reply."""
        cleaned = response.strip()
        if cleaned.startswith("```json"):
            cleaned = cleaned[7:]
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @staticmethod
    def _parse_step_type(raw) -> "StepType":
        """
        Map an LLM-provided step "type" string to a StepType.

        Unknown or missing values fall back to DATA_QUERY instead of raising:
        previously a bare ``StepType(...)`` call raised ValueError on any
        unrecognized type, and the broad handler in plan_query then discarded
        the ENTIRE plan rather than just the malformed step.
        """
        try:
            return StepType(raw or "data_query")
        except ValueError:
            logger.warning(f"Unknown step type {raw!r}; defaulting to data_query")
            return StepType.DATA_QUERY

    async def plan_query(
        self,
        query: str,
        available_tables: List[str],
        llm_gateway
    ) -> "QueryPlan":
        """
        Create an execution plan for a complex query.

        Uses the LLM to decompose the query into atomic steps. On any
        failure (LLM error, unparseable JSON) returns a non-complex
        fallback plan with no steps so the caller can degrade gracefully.

        Args:
            query: The user's natural-language query.
            available_tables: Table names the planner may reference.
            llm_gateway: Object exposing ``async generate_response(prompt, history)``.
        """
        from backend.core.prompts import QUERY_PLANNING_PROMPT

        # Build table context for the prompt
        table_list = "\n".join(f"- {t}" for t in available_tables)

        prompt = QUERY_PLANNING_PROMPT.format(
            user_query=query,
            available_tables=table_list
        )

        try:
            response = await llm_gateway.generate_response(prompt, [])
            plan_data = json.loads(self._strip_code_fences(response))

            # Convert raw step dicts into QueryStep objects
            steps = []
            for i, step_data in enumerate(plan_data.get("steps", [])):
                steps.append(QueryStep(
                    step_id=f"step_{i+1}",
                    step_type=self._parse_step_type(step_data.get("type")),
                    description=step_data.get("description", ""),
                    tables_needed=step_data.get("tables", []),
                    sql_template=step_data.get("sql_hint", None),
                    depends_on=step_data.get("depends_on", []),
                    result_name=step_data.get("result_name", f"result_{i+1}")
                ))

            # Steps with no mutual dependencies can run concurrently
            parallel_groups = self._compute_parallel_groups(steps)

            return QueryPlan(
                original_query=query,
                is_complex=True,
                steps=steps,
                parallel_groups=parallel_groups,
                final_combination_logic=plan_data.get("combination_logic", "")
            )

        except Exception as e:
            logger.error(f"Query planning failed: {e}")
            # Return single-step fallback
            return QueryPlan(
                original_query=query,
                is_complex=False,
                steps=[],
                parallel_groups=[],
                final_combination_logic=""
            )

    def _compute_parallel_groups(self, steps: "List[QueryStep]") -> List[List[str]]:
        """
        Group step ids into execution batches that can run in parallel.

        A step is ready once every entry in its depends_on is satisfied.
        Dependencies may name either a step_id or a prior step's result_name
        (the LLM plan format uses result_names -- see the prompt example),
        so BOTH are marked satisfied when a step completes. Previously only
        step_ids were recorded, so result_name dependencies never resolved
        and every dependent step was forced into its own sequential group.

        A cycle or reference to an unknown dependency forces one step
        through per round to guarantee termination.
        """
        if not steps:
            return []

        groups = []
        executed = set()
        remaining = {s.step_id: s for s in steps}

        while remaining:
            # Steps whose dependencies are all satisfied
            ready = [
                step_id for step_id, step in remaining.items()
                if all(dep in executed for dep in step.depends_on)
            ]

            if not ready:
                # Avoid infinite loop - force one remaining step through
                ready = list(remaining.keys())[:1]

            groups.append(ready)

            for step_id in ready:
                step = remaining.pop(step_id)
                executed.add(step_id)
                if step.result_name:
                    # Also satisfy dependencies expressed as result_names
                    executed.add(step.result_name)

        return groups

    def create_simple_plan(self, query: str) -> "QueryPlan":
        """Create a simple single-step plan for non-complex queries."""
        return QueryPlan(
            original_query=query,
            is_complex=False,
            steps=[
                QueryStep(
                    step_id="step_1",
                    step_type=StepType.DATA_QUERY,
                    description="Execute query directly",
                    tables_needed=[],
                    depends_on=[]
                )
            ],
            parallel_groups=[["step_1"]]
        )
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# Module-level singleton handle
_query_planner: Optional[QueryPlanner] = None


def get_query_planner() -> QueryPlanner:
    """Return the process-wide QueryPlanner, creating it on first use."""
    global _query_planner
    planner = _query_planner
    if planner is None:
        planner = QueryPlanner()
        _query_planner = planner
    return planner
|
backend/core/semantic_search.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Semantic Search Service for Dataset Discovery
|
| 3 |
+
|
| 4 |
+
Uses Gemini embeddings to find relevant datasets from a query,
|
| 5 |
+
enabling scalable discovery across 250+ datasets without context overflow.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import numpy as np
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, List, Optional, Tuple
|
| 13 |
+
from google import genai
|
| 14 |
+
from google.genai import types
|
| 15 |
+
import os
|
| 16 |
+
from dotenv import load_dotenv
|
| 17 |
+
|
| 18 |
+
load_dotenv()
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class SemanticSearch:
    """
    Embedding-based semantic search for dataset discovery (singleton).

    Embeds dataset metadata (name, description, tags, columns) with the
    Gemini embedding API and ranks datasets against a user query by cosine
    similarity. Embeddings are cached on disk keyed by the exact metadata
    text, so unchanged tables are never re-embedded. When no API client is
    available, search falls back to keyword matching over the cached texts.
    """

    _instance = None
    # Disk cache location holding {"embeddings": {...}, "metadata": {...}}
    EMBEDDINGS_FILE = Path(__file__).parent.parent / "data" / "embeddings.json"
    EMBEDDING_MODEL = "models/text-embedding-004"

    def __new__(cls):
        # Singleton: one shared embedding index per process.
        if cls._instance is None:
            cls._instance = super(SemanticSearch, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return

        self.embeddings: Dict[str, List[float]] = {}
        self.metadata_cache: Dict[str, str] = {}  # table_name -> embedded text

        # Initialize Gemini client. Pass the key explicitly: the original
        # code looked up GEMINI_API_KEY/GOOGLE_API_KEY but then called
        # genai.Client() bare, relying on the SDK's own env discovery --
        # which can miss GEMINI_API_KEY-only deployments.
        api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
        if api_key:
            self.client = genai.Client(api_key=api_key)
        else:
            self.client = None
            logger.warning("No API key found. Semantic search will use fallback keyword matching.")

        self._load_embeddings()
        self.initialized = True

    def _load_embeddings(self) -> None:
        """Load cached embeddings from disk; reset to empty on any failure."""
        if self.EMBEDDINGS_FILE.exists():
            try:
                with open(self.EMBEDDINGS_FILE, 'r') as f:
                    data = json.load(f)
                    self.embeddings = data.get("embeddings", {})
                    self.metadata_cache = data.get("metadata", {})
                    logger.info(f"Loaded {len(self.embeddings)} cached embeddings.")
            except Exception as e:
                logger.error(f"Failed to load embeddings: {e}")
                self.embeddings = {}
                self.metadata_cache = {}

    def _save_embeddings(self) -> None:
        """Persist the embeddings cache to disk (best-effort; errors are logged)."""
        try:
            self.EMBEDDINGS_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(self.EMBEDDINGS_FILE, 'w') as f:
                json.dump({
                    "embeddings": self.embeddings,
                    "metadata": self.metadata_cache
                }, f)
            logger.info(f"Saved {len(self.embeddings)} embeddings to cache.")
        except Exception as e:
            logger.error(f"Failed to save embeddings: {e}")

    def _build_embedding_text(self, table_name: str, metadata: dict) -> str:
        """Build the canonical text representation of a table for embedding.

        The same text is stored in metadata_cache, so any metadata change
        produces a different text and triggers re-embedding.
        """
        parts = [f"Table: {table_name}"]

        # Description (prefer semantic if available)
        desc = metadata.get("semantic_description") or metadata.get("description", "")
        if desc:
            parts.append(f"Description: {desc}")

        # Tags
        tags = metadata.get("tags", [])
        if tags:
            parts.append(f"Tags: {', '.join(tags)}")

        # Category
        category = metadata.get("category", "")
        if category:
            parts.append(f"Category: {category}")

        # Key columns (limit to first 15 for embedding efficiency)
        columns = metadata.get("columns", [])
        # Filter out generic columns that carry no semantic signal
        meaningful_cols = [c for c in columns[:15] if c not in ['geom', 'geometry', 'id', 'fid']]
        if meaningful_cols:
            parts.append(f"Columns: {', '.join(meaningful_cols)}")

        # Data type
        data_type = metadata.get("data_type", "static")
        parts.append(f"Data type: {data_type}")

        return ". ".join(parts)

    def _embed_text(self, text: str) -> Optional[List[float]]:
        """Return the embedding vector for *text*, or None when unavailable."""
        if not self.client:
            return None

        try:
            result = self.client.models.embed_content(
                model=self.EMBEDDING_MODEL,
                contents=text  # Note: the SDK parameter is 'contents', not 'content'
            )
            return result.embeddings[0].values
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            return None

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Compute cosine similarity between two vectors (0.0 for zero vectors)."""
        a_np = np.array(a)
        b_np = np.array(b)

        dot_product = np.dot(a_np, b_np)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(dot_product / (norm_a * norm_b))

    def embed_table(self, table_name: str, metadata: dict) -> bool:
        """
        Embed a table's metadata for semantic search.

        Returns True if embedding was successful or already cached.
        Note: does NOT persist to disk; callers batch via embed_all_tables.
        """
        text = self._build_embedding_text(table_name, metadata)

        # Check if already embedded with same text
        if table_name in self.metadata_cache and self.metadata_cache[table_name] == text:
            return True

        embedding = self._embed_text(text)
        if embedding:
            self.embeddings[table_name] = embedding
            self.metadata_cache[table_name] = text
            return True

        return False

    def embed_all_tables(self, catalog: Dict[str, dict]) -> int:
        """
        Embed every table in *catalog*, persisting the cache if anything changed.

        Returns number of newly embedded tables.
        """
        new_count = 0

        for table_name, metadata in catalog.items():
            text = self._build_embedding_text(table_name, metadata)

            # Pre-check the cache here even though embed_table also checks:
            # embed_table returns True for already-cached entries, so without
            # this skip the new_count (and the save trigger) would be wrong.
            if table_name in self.metadata_cache and self.metadata_cache[table_name] == text:
                continue

            if self.embed_table(table_name, metadata):
                new_count += 1

        if new_count > 0:
            self._save_embeddings()
            logger.info(f"Embedded {new_count} new tables.")

        return new_count

    def search(self, query: str, top_k: int = 15) -> List[Tuple[str, float]]:
        """
        Find the most relevant tables for a query.

        Returns list of (table_name, similarity_score) tuples, sorted by
        relevance (highest first). Falls back to keyword matching when the
        query cannot be embedded.
        """
        if not self.embeddings:
            logger.warning("No embeddings available. Returning empty results.")
            return []

        # Embed the query
        query_embedding = self._embed_text(query)

        if not query_embedding:
            # Fallback to keyword matching
            return self._keyword_fallback(query, top_k)

        # Compute similarities against every embedded table
        scores = []
        for table_name, table_embedding in self.embeddings.items():
            score = self._cosine_similarity(query_embedding, table_embedding)
            scores.append((table_name, score))

        # Sort by similarity (descending)
        scores.sort(key=lambda x: -x[1])

        return scores[:top_k]

    def search_table_names(self, query: str, top_k: int = 15) -> List[str]:
        """Convenience method that returns just table names."""
        results = self.search(query, top_k)
        return [name for name, _ in results]

    def _keyword_fallback(self, query: str, top_k: int) -> List[Tuple[str, float]]:
        """
        Simple keyword matching over cached metadata texts, used when
        embeddings are unavailable. Score is the fraction of query terms
        found in the table's metadata text; zero-score tables are omitted.
        """
        query_terms = query.lower().split()
        scores = []

        for table_name, text in self.metadata_cache.items():
            text_lower = text.lower()
            score = sum(1 for term in query_terms if term in text_lower)
            if score > 0:
                scores.append((table_name, score / len(query_terms)))

        scores.sort(key=lambda x: -x[1])
        return scores[:top_k]

    def get_stats(self) -> dict:
        """Return statistics about the semantic search index."""
        return {
            "total_tables": len(self.embeddings),
            "cache_file": str(self.EMBEDDINGS_FILE),
            "cache_exists": self.EMBEDDINGS_FILE.exists(),
            "client_available": self.client is not None
        }
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# Module-level singleton handle
_semantic_search: Optional[SemanticSearch] = None


def get_semantic_search() -> SemanticSearch:
    """Return the process-wide SemanticSearch, creating it on first call."""
    global _semantic_search
    instance = _semantic_search
    if instance is None:
        instance = SemanticSearch()
        _semantic_search = instance
    return instance
|
backend/core/session_store.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Session Store Service
|
| 3 |
+
|
| 4 |
+
Thread-safe session-scoped storage for user layers and context.
|
| 5 |
+
Replaces global SESSION_LAYERS with per-session isolation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import threading
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from typing import Dict, List, Optional, Any
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class SessionStore:
    """
    Thread-safe session-scoped storage with TTL expiration.

    Each session keeps its own list of map layers created by the user,
    isolated from every other session. Sessions expire after a
    configurable TTL (default 2 hours) via cleanup_expired().
    """

    _instance = None

    def __new__(cls):
        # Process-wide singleton: constructor args only take effect on the
        # very first instantiation.
        if cls._instance is None:
            cls._instance = super(SessionStore, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self, ttl_hours: int = 2, max_layers_per_session: int = 15):
        if self.initialized:
            return

        self._sessions: Dict[str, dict] = {}
        self._lock = threading.Lock()
        self.ttl = timedelta(hours=ttl_hours)
        self.max_layers = max_layers_per_session
        self.initialized = True

        logger.info(f"SessionStore initialized with TTL={ttl_hours}h, max_layers={max_layers_per_session}")

    def _get_or_create_session(self, session_id: str) -> dict:
        """Return the record for *session_id*, creating one if absent.

        Caller must already hold self._lock.
        """
        record = self._sessions.get(session_id)
        if record is None:
            now = datetime.now()
            record = {"layers": [], "created": now, "accessed": now}
            self._sessions[session_id] = record
        return record

    def get_layers(self, session_id: str) -> List[dict]:
        """Return a shallow copy of the session's layer list."""
        with self._lock:
            record = self._get_or_create_session(session_id)
            record["accessed"] = datetime.now()
            return list(record["layers"])

    def add_layer(self, session_id: str, layer: dict) -> None:
        """
        Append *layer* to the session, evicting the oldest layers once the
        per-session limit is exceeded.
        """
        with self._lock:
            record = self._get_or_create_session(session_id)
            layers = record["layers"]
            layers.append(layer)
            record["accessed"] = datetime.now()

            # Evict from the front (oldest first) until within the limit
            while len(layers) > self.max_layers:
                evicted = layers.pop(0)
                logger.debug(f"Session {session_id[:8]}: removed oldest layer {evicted.get('name', 'unknown')}")

    def update_layer(self, session_id: str, layer_id: str, updates: dict) -> bool:
        """
        Merge *updates* into the layer whose id matches *layer_id*.

        Returns True if layer was found and updated.
        """
        with self._lock:
            record = self._sessions.get(session_id)
            if record is None:
                return False

            target = next((entry for entry in record["layers"] if entry.get("id") == layer_id), None)
            if target is None:
                return False

            target.update(updates)
            record["accessed"] = datetime.now()
            return True

    def remove_layer(self, session_id: str, layer_id: str) -> bool:
        """
        Drop the layer whose id matches *layer_id* from the session.

        Returns True if layer was found and removed.
        """
        with self._lock:
            record = self._sessions.get(session_id)
            if record is None:
                return False

            before = len(record["layers"])
            record["layers"] = [entry for entry in record["layers"] if entry.get("id") != layer_id]
            record["accessed"] = datetime.now()

            return len(record["layers"]) < before

    def clear_session(self, session_id: str) -> None:
        """Discard all stored data for the session, if any."""
        with self._lock:
            self._sessions.pop(session_id, None)

    def get_layer_by_index(self, session_id: str, index: int) -> Optional[dict]:
        """Return a copy of the layer at 1-based *index* (user references like 'Layer 1'), or None."""
        with self._lock:
            record = self._sessions.get(session_id)
            if record is None:
                return None

            layers = record["layers"]
            if not 1 <= index <= len(layers):
                return None
            return dict(layers[index - 1])

    def cleanup_expired(self) -> int:
        """
        Drop sessions that have been idle longer than the TTL.

        Returns number of expired sessions removed.
        """
        with self._lock:
            now = datetime.now()
            stale = [
                sid for sid, record in self._sessions.items()
                if now - record.get("accessed", record["created"]) > self.ttl
            ]

            for sid in stale:
                del self._sessions[sid]

            if stale:
                logger.info(f"Cleaned up {len(stale)} expired sessions.")

            return len(stale)

    def get_stats(self) -> dict:
        """Return statistics about active sessions."""
        with self._lock:
            layer_count = sum(len(record["layers"]) for record in self._sessions.values())

            return {
                "active_sessions": len(self._sessions),
                "total_layers": layer_count,
                "ttl_hours": self.ttl.total_seconds() / 3600,
                "max_layers_per_session": self.max_layers
            }
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# Module-level singleton handle
_session_store: Optional[SessionStore] = None


def get_session_store() -> SessionStore:
    """Return the process-wide SessionStore, creating it on first call."""
    global _session_store
    store = _session_store
    if store is None:
        store = SessionStore()
        _session_store = store
    return store
|
backend/data/catalog.json
ADDED
|
@@ -0,0 +1,1290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"pan_admin3": {
|
| 3 |
+
"path": "base/pan_admin3.geojson",
|
| 4 |
+
"description": "Data from base/pan_admin3.geojson",
|
| 5 |
+
"semantic_description": "This dataset contains the third-level administrative boundaries (corregimientos) of Panama, including their hierarchical relationships to districts and provinces. It provides nationwide coverage of 594 territorial units, making it essential for localized demographic analysis, regional planning, and mapping public service distribution.",
|
| 6 |
+
"tags": [
|
| 7 |
+
"administrative",
|
| 8 |
+
"government",
|
| 9 |
+
"geographic",
|
| 10 |
+
"spatial",
|
| 11 |
+
"boundaries"
|
| 12 |
+
],
|
| 13 |
+
"data_type": "static",
|
| 14 |
+
"update_frequency": null,
|
| 15 |
+
"columns": [
|
| 16 |
+
"adm3_name",
|
| 17 |
+
"adm3_name1",
|
| 18 |
+
"adm3_name2",
|
| 19 |
+
"adm3_name3",
|
| 20 |
+
"adm3_pcode",
|
| 21 |
+
"adm2_name",
|
| 22 |
+
"adm2_name1",
|
| 23 |
+
"adm2_name2",
|
| 24 |
+
"adm2_name3",
|
| 25 |
+
"adm2_pcode",
|
| 26 |
+
"adm1_name",
|
| 27 |
+
"adm1_name1",
|
| 28 |
+
"adm1_name2",
|
| 29 |
+
"adm1_name3",
|
| 30 |
+
"adm1_pcode",
|
| 31 |
+
"adm0_name",
|
| 32 |
+
"adm0_name1",
|
| 33 |
+
"adm0_name2",
|
| 34 |
+
"adm0_name3",
|
| 35 |
+
"adm0_pcode",
|
| 36 |
+
"valid_on",
|
| 37 |
+
"valid_to",
|
| 38 |
+
"area_sqkm",
|
| 39 |
+
"version",
|
| 40 |
+
"lang",
|
| 41 |
+
"lang1",
|
| 42 |
+
"lang2",
|
| 43 |
+
"lang3",
|
| 44 |
+
"adm3_ref_name",
|
| 45 |
+
"center_lat",
|
| 46 |
+
"center_lon",
|
| 47 |
+
"geom"
|
| 48 |
+
],
|
| 49 |
+
"row_count": 594,
|
| 50 |
+
"category": "base",
|
| 51 |
+
"format": "geojson",
|
| 52 |
+
"last_indexed": "2026-01-09T16:15:16.691836",
|
| 53 |
+
"last_enriched": "2026-01-09T16:36:08.469629"
|
| 54 |
+
},
|
| 55 |
+
"pan_adminpoints": {
|
| 56 |
+
"path": "base/pan_adminpoints.geojson",
|
| 57 |
+
"description": "Data from base/pan_adminpoints.geojson",
|
| 58 |
+
"semantic_description": "This dataset provides point locations for administrative centers across Panama, covering hierarchical levels from the national capital down to sub-district seats. It includes geographic coordinates and standardized naming conventions, making it a foundational tool for territorial planning and administrative spatial analysis.",
|
| 59 |
+
"tags": [
|
| 60 |
+
"administrative",
|
| 61 |
+
"points-of-interest",
|
| 62 |
+
"amenities",
|
| 63 |
+
"government",
|
| 64 |
+
"spatial",
|
| 65 |
+
"boundaries"
|
| 66 |
+
],
|
| 67 |
+
"data_type": "static",
|
| 68 |
+
"update_frequency": null,
|
| 69 |
+
"columns": [
|
| 70 |
+
"admin_level",
|
| 71 |
+
"name",
|
| 72 |
+
"name1",
|
| 73 |
+
"name2",
|
| 74 |
+
"name3",
|
| 75 |
+
"x_coord",
|
| 76 |
+
"y_coord",
|
| 77 |
+
"adm4_name",
|
| 78 |
+
"adm4_name1",
|
| 79 |
+
"adm4_name2",
|
| 80 |
+
"adm4_name3",
|
| 81 |
+
"adm4_pcode",
|
| 82 |
+
"adm3_name",
|
| 83 |
+
"adm3_name1",
|
| 84 |
+
"adm3_name2",
|
| 85 |
+
"adm3_name3",
|
| 86 |
+
"adm3_pcode",
|
| 87 |
+
"adm2_name",
|
| 88 |
+
"adm2_name1",
|
| 89 |
+
"adm2_name2",
|
| 90 |
+
"adm2_name3",
|
| 91 |
+
"adm2_pcode",
|
| 92 |
+
"adm1_name",
|
| 93 |
+
"adm1_name1",
|
| 94 |
+
"adm1_name2",
|
| 95 |
+
"adm1_name3",
|
| 96 |
+
"adm1_pcode",
|
| 97 |
+
"adm0_name",
|
| 98 |
+
"adm0_name1",
|
| 99 |
+
"adm0_name2",
|
| 100 |
+
"adm0_name3",
|
| 101 |
+
"adm0_pcode",
|
| 102 |
+
"valid_on",
|
| 103 |
+
"valid_to",
|
| 104 |
+
"version",
|
| 105 |
+
"lang",
|
| 106 |
+
"lang1",
|
| 107 |
+
"lang2",
|
| 108 |
+
"lang3",
|
| 109 |
+
"geom"
|
| 110 |
+
],
|
| 111 |
+
"row_count": 684,
|
| 112 |
+
"category": "base",
|
| 113 |
+
"format": "geojson",
|
| 114 |
+
"last_indexed": "2026-01-09T16:15:16.764000",
|
| 115 |
+
"last_enriched": "2026-01-09T16:36:12.240069"
|
| 116 |
+
},
|
| 117 |
+
"pan_admin2": {
|
| 118 |
+
"path": "base/pan_admin2.geojson",
|
| 119 |
+
"description": "Data from base/pan_admin2.geojson",
|
| 120 |
+
"semantic_description": "This dataset comprises the second-level administrative boundaries for Panama, specifically representing the country's 76 districts. It includes standardized names and hierarchical P-codes for districts, provinces, and the national level to ensure data interoperability. This base layer is primarily used for administrative mapping, regional statistical analysis, and territorial planning.",
|
| 121 |
+
"tags": [
|
| 122 |
+
"administrative",
|
| 123 |
+
"government",
|
| 124 |
+
"geographic",
|
| 125 |
+
"spatial",
|
| 126 |
+
"boundaries"
|
| 127 |
+
],
|
| 128 |
+
"data_type": "static",
|
| 129 |
+
"update_frequency": null,
|
| 130 |
+
"columns": [
|
| 131 |
+
"adm2_name",
|
| 132 |
+
"adm2_name1",
|
| 133 |
+
"adm2_name2",
|
| 134 |
+
"adm2_name3",
|
| 135 |
+
"adm2_pcode",
|
| 136 |
+
"adm1_name",
|
| 137 |
+
"adm1_name1",
|
| 138 |
+
"adm1_name2",
|
| 139 |
+
"adm1_name3",
|
| 140 |
+
"adm1_pcode",
|
| 141 |
+
"adm0_name",
|
| 142 |
+
"adm0_name1",
|
| 143 |
+
"adm0_name2",
|
| 144 |
+
"adm0_name3",
|
| 145 |
+
"adm0_pcode",
|
| 146 |
+
"valid_on",
|
| 147 |
+
"valid_to",
|
| 148 |
+
"area_sqkm",
|
| 149 |
+
"version",
|
| 150 |
+
"lang",
|
| 151 |
+
"lang1",
|
| 152 |
+
"lang2",
|
| 153 |
+
"lang3",
|
| 154 |
+
"adm2_ref_name",
|
| 155 |
+
"center_lat",
|
| 156 |
+
"center_lon",
|
| 157 |
+
"geom"
|
| 158 |
+
],
|
| 159 |
+
"row_count": 76,
|
| 160 |
+
"category": "base",
|
| 161 |
+
"format": "geojson",
|
| 162 |
+
"last_indexed": "2026-01-09T16:15:17.560205",
|
| 163 |
+
"last_enriched": "2026-01-09T16:36:16.456717"
|
| 164 |
+
},
|
| 165 |
+
"pan_admin0": {
|
| 166 |
+
"path": "base/pan_admin0.geojson",
|
| 167 |
+
"description": "Data from base/pan_admin0.geojson",
|
| 168 |
+
"semantic_description": "This dataset defines the national boundary of Panama, representing the country's primary administrative level (ADM0). It is typically used as a foundational layer for national-level spatial analysis, cartographic visualizations, and as a reference for countrywide area calculations and statistical aggregation.",
|
| 169 |
+
"tags": [
|
| 170 |
+
"administrative",
|
| 171 |
+
"government",
|
| 172 |
+
"geographic",
|
| 173 |
+
"spatial",
|
| 174 |
+
"boundaries"
|
| 175 |
+
],
|
| 176 |
+
"data_type": "static",
|
| 177 |
+
"update_frequency": null,
|
| 178 |
+
"columns": [
|
| 179 |
+
"iso2",
|
| 180 |
+
"iso3",
|
| 181 |
+
"adm0_name",
|
| 182 |
+
"adm0_name1",
|
| 183 |
+
"adm0_name2",
|
| 184 |
+
"adm0_name3",
|
| 185 |
+
"adm0_pcode",
|
| 186 |
+
"valid_on",
|
| 187 |
+
"valid_to",
|
| 188 |
+
"version",
|
| 189 |
+
"area_sqkm",
|
| 190 |
+
"lang",
|
| 191 |
+
"lang1",
|
| 192 |
+
"lang2",
|
| 193 |
+
"lang3",
|
| 194 |
+
"adm0_ref_name",
|
| 195 |
+
"center_lat",
|
| 196 |
+
"center_lon",
|
| 197 |
+
"geom"
|
| 198 |
+
],
|
| 199 |
+
"row_count": 1,
|
| 200 |
+
"category": "base",
|
| 201 |
+
"format": "geojson",
|
| 202 |
+
"last_indexed": "2026-01-09T16:15:18.159865",
|
| 203 |
+
"last_enriched": "2026-01-09T16:36:18.989777"
|
| 204 |
+
},
|
| 205 |
+
"pan_admin1": {
|
| 206 |
+
"path": "base/pan_admin1.geojson",
|
| 207 |
+
"description": "Data from base/pan_admin1.geojson",
|
| 208 |
+
"semantic_description": "This dataset defines the 13 primary administrative divisions of Panama, including its provinces and major indigenous territories. It provides standardized names, area measurements, and codes necessary for nationwide spatial analysis and regional reporting. The layer serves as a foundational component for administrative mapping and aggregating statistical data at the provincial level.",
|
| 209 |
+
"tags": [
|
| 210 |
+
"administrative",
|
| 211 |
+
"government",
|
| 212 |
+
"geographic",
|
| 213 |
+
"spatial",
|
| 214 |
+
"boundaries"
|
| 215 |
+
],
|
| 216 |
+
"data_type": "static",
|
| 217 |
+
"update_frequency": null,
|
| 218 |
+
"columns": [
|
| 219 |
+
"adm1_name",
|
| 220 |
+
"adm1_name1",
|
| 221 |
+
"adm1_name2",
|
| 222 |
+
"adm1_name3",
|
| 223 |
+
"adm1_pcode",
|
| 224 |
+
"adm0_name",
|
| 225 |
+
"adm0_name1",
|
| 226 |
+
"adm0_name2",
|
| 227 |
+
"adm0_name3",
|
| 228 |
+
"adm0_pcode",
|
| 229 |
+
"valid_on",
|
| 230 |
+
"valid_to",
|
| 231 |
+
"area_sqkm",
|
| 232 |
+
"version",
|
| 233 |
+
"lang",
|
| 234 |
+
"lang1",
|
| 235 |
+
"lang2",
|
| 236 |
+
"lang3",
|
| 237 |
+
"adm1_ref_name",
|
| 238 |
+
"center_lat",
|
| 239 |
+
"center_lon",
|
| 240 |
+
"geom"
|
| 241 |
+
],
|
| 242 |
+
"row_count": 13,
|
| 243 |
+
"category": "base",
|
| 244 |
+
"format": "geojson",
|
| 245 |
+
"last_indexed": "2026-01-09T16:15:18.936257",
|
| 246 |
+
"last_enriched": "2026-01-09T16:36:22.439266"
|
| 247 |
+
},
|
| 248 |
+
"pan_adminlines": {
|
| 249 |
+
"path": "base/pan_adminlines.geojson",
|
| 250 |
+
"description": "Data from base/pan_adminlines.geojson",
|
| 251 |
+
"semantic_description": "This dataset contains the linear administrative boundaries of Panama across various hierarchical levels, including province and district borders identified by standard P-codes. It serves as a foundational spatial layer for delineating jurisdictional limits and performing territorial analysis. The data is primarily used for cartographic visualization and spatial joins that require precise border definitions for administrative planning.",
|
| 252 |
+
"tags": [
|
| 253 |
+
"administrative",
|
| 254 |
+
"spatial",
|
| 255 |
+
"boundaries",
|
| 256 |
+
"government"
|
| 257 |
+
],
|
| 258 |
+
"data_type": "static",
|
| 259 |
+
"update_frequency": null,
|
| 260 |
+
"columns": [
|
| 261 |
+
"adm_level",
|
| 262 |
+
"name",
|
| 263 |
+
"valid_on",
|
| 264 |
+
"valid_to",
|
| 265 |
+
"version",
|
| 266 |
+
"right_pcod",
|
| 267 |
+
"left_pcod",
|
| 268 |
+
"geom"
|
| 269 |
+
],
|
| 270 |
+
"row_count": 2338,
|
| 271 |
+
"category": "base",
|
| 272 |
+
"format": "geojson",
|
| 273 |
+
"last_indexed": "2026-01-09T16:15:19.846930",
|
| 274 |
+
"last_enriched": "2026-01-09T16:36:26.375260"
|
| 275 |
+
},
|
| 276 |
+
"universities": {
|
| 277 |
+
"path": "osm/universities.geojson",
|
| 278 |
+
"description": "Data from osm/universities.geojson",
|
| 279 |
+
"semantic_description": "This dataset identifies 62 higher education institutions across Panama, including attributes for names, operators, and facility types. It is designed for spatial analysis of educational coverage, urban infrastructure planning, and socio-economic research.",
|
| 280 |
+
"tags": [
|
| 281 |
+
"higher-education",
|
| 282 |
+
"education",
|
| 283 |
+
"infrastructure",
|
| 284 |
+
"osm",
|
| 285 |
+
"spatial",
|
| 286 |
+
"facilities",
|
| 287 |
+
"points",
|
| 288 |
+
"panama"
|
| 289 |
+
],
|
| 290 |
+
"data_type": "semi-static",
|
| 291 |
+
"update_frequency": null,
|
| 292 |
+
"columns": [
|
| 293 |
+
"name",
|
| 294 |
+
"osm_id",
|
| 295 |
+
"feature_type",
|
| 296 |
+
"operator",
|
| 297 |
+
"education_type",
|
| 298 |
+
"icon",
|
| 299 |
+
"geom"
|
| 300 |
+
],
|
| 301 |
+
"row_count": 62,
|
| 302 |
+
"category": "osm",
|
| 303 |
+
"format": "geojson",
|
| 304 |
+
"last_indexed": "2026-01-09T16:15:19.856764",
|
| 305 |
+
"last_enriched": "2026-01-09T16:36:33.114270"
|
| 306 |
+
},
|
| 307 |
+
"panama_healthsites_geojson": {
|
| 308 |
+
"path": "hdx/health_facilities/panama-healthsites-geojson.geojson",
|
| 309 |
+
"description": "Data from hdx/panama-healthsites-geojson.geojson",
|
| 310 |
+
"semantic_description": "This dataset provides location and attribute data for 986 health facilities across Panama, including hospitals, laboratories, and clinics sourced from OpenStreetMap. It includes detailed information on operational status, bed capacity, and staffing levels, making it suitable for analyzing health infrastructure distribution and healthcare accessibility mapping.",
|
| 311 |
+
"tags": [
|
| 312 |
+
"health",
|
| 313 |
+
"facilities",
|
| 314 |
+
"geographic",
|
| 315 |
+
"spatial",
|
| 316 |
+
"infrastructure"
|
| 317 |
+
],
|
| 318 |
+
"data_type": "semi-static",
|
| 319 |
+
"update_frequency": null,
|
| 320 |
+
"columns": [
|
| 321 |
+
"osm_id",
|
| 322 |
+
"osm_type",
|
| 323 |
+
"completeness",
|
| 324 |
+
"amenity",
|
| 325 |
+
"healthcare",
|
| 326 |
+
"name",
|
| 327 |
+
"operator",
|
| 328 |
+
"source",
|
| 329 |
+
"speciality",
|
| 330 |
+
"operator_type",
|
| 331 |
+
"operational_status",
|
| 332 |
+
"opening_hours",
|
| 333 |
+
"beds",
|
| 334 |
+
"staff_doctors",
|
| 335 |
+
"staff_nurses",
|
| 336 |
+
"health_amenity_type",
|
| 337 |
+
"dispensing",
|
| 338 |
+
"wheelchair",
|
| 339 |
+
"emergency",
|
| 340 |
+
"insurance",
|
| 341 |
+
"water_source",
|
| 342 |
+
"electricity",
|
| 343 |
+
"is_in_health_area",
|
| 344 |
+
"is_in_health_zone",
|
| 345 |
+
"url",
|
| 346 |
+
"addr_housenumber",
|
| 347 |
+
"addr_street",
|
| 348 |
+
"addr_postcode",
|
| 349 |
+
"addr_city",
|
| 350 |
+
"changeset_id",
|
| 351 |
+
"changeset_version",
|
| 352 |
+
"changeset_timestamp",
|
| 353 |
+
"uuid",
|
| 354 |
+
"geom"
|
| 355 |
+
],
|
| 356 |
+
"row_count": 986,
|
| 357 |
+
"category": "hdx",
|
| 358 |
+
"format": "geojson",
|
| 359 |
+
"last_indexed": "2026-01-09T16:15:20.039814",
|
| 360 |
+
"last_enriched": "2026-01-09T16:35:58.888442"
|
| 361 |
+
},
|
| 362 |
+
"panama_healthsites_hxl_geojson": {
|
| 363 |
+
"path": "hdx/health_facilities/panama-healthsites-hxl-geojson.geojson",
|
| 364 |
+
"description": "Data from hdx/panama-healthsites-hxl-geojson.geojson",
|
| 365 |
+
"semantic_description": "This dataset comprises 986 health facilities across Panama, detailing hospitals, clinics, and laboratories with information on capacity, specialties, and operational status. It is designed for analyzing healthcare accessibility, resource distribution, and infrastructure coverage at a national level.",
|
| 366 |
+
"tags": [
|
| 367 |
+
"health",
|
| 368 |
+
"facilities",
|
| 369 |
+
"geographic",
|
| 370 |
+
"spatial",
|
| 371 |
+
"infrastructure"
|
| 372 |
+
],
|
| 373 |
+
"data_type": "semi-static",
|
| 374 |
+
"update_frequency": null,
|
| 375 |
+
"columns": [
|
| 376 |
+
"osm_id",
|
| 377 |
+
"osm_type",
|
| 378 |
+
"completeness",
|
| 379 |
+
"#loc+amenity",
|
| 380 |
+
"#meta+healthcare",
|
| 381 |
+
"#loc +name",
|
| 382 |
+
"#meta +operator",
|
| 383 |
+
"#geo+bounds+url",
|
| 384 |
+
"#meta +speciality",
|
| 385 |
+
"#meta +operator_type",
|
| 386 |
+
"#contact +phone",
|
| 387 |
+
"#status+operational_status",
|
| 388 |
+
"#access +hours",
|
| 389 |
+
"#capacity +beds",
|
| 390 |
+
"#capacity +staff",
|
| 391 |
+
"#meta +health_amenity_type",
|
| 392 |
+
"#meta+dispensing",
|
| 393 |
+
"#meta+wheelchair",
|
| 394 |
+
"#meta+emergency",
|
| 395 |
+
"#meta+insurance",
|
| 396 |
+
"#meta+water_source",
|
| 397 |
+
"#meta+electricity",
|
| 398 |
+
"#meta+is_in_health_area",
|
| 399 |
+
"#meta+is_in_health_zone",
|
| 400 |
+
"#contact +url",
|
| 401 |
+
"addr_housenumber",
|
| 402 |
+
"addr_street",
|
| 403 |
+
"addr_postcode",
|
| 404 |
+
"addr_city",
|
| 405 |
+
"changeset_id",
|
| 406 |
+
"changeset_version",
|
| 407 |
+
"changeset_timestamp",
|
| 408 |
+
"#meta +id",
|
| 409 |
+
"geom"
|
| 410 |
+
],
|
| 411 |
+
"row_count": 986,
|
| 412 |
+
"category": "hdx",
|
| 413 |
+
"format": "geojson",
|
| 414 |
+
"last_indexed": "2026-01-09T16:15:20.152069",
|
| 415 |
+
"last_enriched": "2026-01-09T16:36:36.834369"
|
| 416 |
+
},
|
| 417 |
+
"kontur_population": {
|
| 418 |
+
"description": "Population density grid for Panama at 400m H3 hexagon resolution. Based on GHSL, Facebook HRSL, and Microsoft Buildings data.",
|
| 419 |
+
"tags": [
|
| 420 |
+
"population",
|
| 421 |
+
"density",
|
| 422 |
+
"panama",
|
| 423 |
+
"h3",
|
| 424 |
+
"hexagon",
|
| 425 |
+
"kontur",
|
| 426 |
+
"demographics"
|
| 427 |
+
],
|
| 428 |
+
"data_type": "vector",
|
| 429 |
+
"geometry_type": "polygon",
|
| 430 |
+
"semantic_description": "Population count per 400m H3 hexagonal grid cell. Use for population density analysis, demographic studies, and urban/rural classification.",
|
| 431 |
+
"path": "kontur/kontur_population_EPSG4326.gpkg",
|
| 432 |
+
"columns": [
|
| 433 |
+
"h3",
|
| 434 |
+
"population",
|
| 435 |
+
"geom"
|
| 436 |
+
],
|
| 437 |
+
"row_count": 33114
|
| 438 |
+
},
|
| 439 |
+
"osm_roads": {
|
| 440 |
+
"description": "OpenStreetMap Road network with classification for Panama",
|
| 441 |
+
"tags": [
|
| 442 |
+
"osm",
|
| 443 |
+
"panama",
|
| 444 |
+
"roads"
|
| 445 |
+
],
|
| 446 |
+
"data_type": "vector",
|
| 447 |
+
"geometry_type": "auto",
|
| 448 |
+
"path": "osm/roads.geojson"
|
| 449 |
+
},
|
| 450 |
+
"osm_pois": {
|
| 451 |
+
"description": "OpenStreetMap Points of interest (restaurants, shops, etc.) for Panama",
|
| 452 |
+
"tags": [
|
| 453 |
+
"osm",
|
| 454 |
+
"panama",
|
| 455 |
+
"pois"
|
| 456 |
+
],
|
| 457 |
+
"data_type": "vector",
|
| 458 |
+
"geometry_type": "auto",
|
| 459 |
+
"path": "osm/pois.geojson"
|
| 460 |
+
},
|
| 461 |
+
"osm_pois_areas": {
|
| 462 |
+
"description": "OpenStreetMap POI areas (larger venues) for Panama",
|
| 463 |
+
"tags": [
|
| 464 |
+
"osm",
|
| 465 |
+
"panama",
|
| 466 |
+
"pois areas"
|
| 467 |
+
],
|
| 468 |
+
"data_type": "vector",
|
| 469 |
+
"geometry_type": "auto",
|
| 470 |
+
"path": "osm/pois_areas.geojson"
|
| 471 |
+
},
|
| 472 |
+
"osm_buildings": {
|
| 473 |
+
"description": "OpenStreetMap Building footprints for Panama",
|
| 474 |
+
"tags": [
|
| 475 |
+
"osm",
|
| 476 |
+
"panama",
|
| 477 |
+
"buildings"
|
| 478 |
+
],
|
| 479 |
+
"data_type": "vector",
|
| 480 |
+
"geometry_type": "auto",
|
| 481 |
+
"path": "osm/buildings.geojson"
|
| 482 |
+
},
|
| 483 |
+
"osm_landuse": {
|
| 484 |
+
"description": "OpenStreetMap Land use zones (residential, commercial, etc.) for Panama",
|
| 485 |
+
"tags": [
|
| 486 |
+
"osm",
|
| 487 |
+
"panama",
|
| 488 |
+
"landuse"
|
| 489 |
+
],
|
| 490 |
+
"data_type": "vector",
|
| 491 |
+
"geometry_type": "auto",
|
| 492 |
+
"path": "osm/landuse.geojson"
|
| 493 |
+
},
|
| 494 |
+
"osm_natural_points": {
|
| 495 |
+
"description": "OpenStreetMap Natural features (trees, peaks) for Panama",
|
| 496 |
+
"tags": [
|
| 497 |
+
"osm",
|
| 498 |
+
"panama",
|
| 499 |
+
"natural points"
|
| 500 |
+
],
|
| 501 |
+
"data_type": "vector",
|
| 502 |
+
"geometry_type": "auto",
|
| 503 |
+
"path": "osm/natural_points.geojson"
|
| 504 |
+
},
|
| 505 |
+
"osm_natural_areas": {
|
| 506 |
+
"description": "OpenStreetMap Natural areas (forests, parks) for Panama",
|
| 507 |
+
"tags": [
|
| 508 |
+
"osm",
|
| 509 |
+
"panama",
|
| 510 |
+
"natural areas"
|
| 511 |
+
],
|
| 512 |
+
"data_type": "vector",
|
| 513 |
+
"geometry_type": "auto",
|
| 514 |
+
"path": "osm/natural_areas.geojson"
|
| 515 |
+
},
|
| 516 |
+
"osm_water_areas": {
|
| 517 |
+
"description": "OpenStreetMap Water bodies (lakes, reservoirs) for Panama",
|
| 518 |
+
"tags": [
|
| 519 |
+
"osm",
|
| 520 |
+
"panama",
|
| 521 |
+
"water areas"
|
| 522 |
+
],
|
| 523 |
+
"data_type": "vector",
|
| 524 |
+
"geometry_type": "auto",
|
| 525 |
+
"path": "osm/water_areas.geojson"
|
| 526 |
+
},
|
| 527 |
+
"osm_waterways": {
|
| 528 |
+
"description": "OpenStreetMap Rivers and streams for Panama",
|
| 529 |
+
"tags": [
|
| 530 |
+
"osm",
|
| 531 |
+
"panama",
|
| 532 |
+
"waterways"
|
| 533 |
+
],
|
| 534 |
+
"data_type": "vector",
|
| 535 |
+
"geometry_type": "auto",
|
| 536 |
+
"path": "osm/waterways.geojson"
|
| 537 |
+
},
|
| 538 |
+
"osm_railways": {
|
| 539 |
+
"description": "OpenStreetMap Railway lines for Panama",
|
| 540 |
+
"tags": [
|
| 541 |
+
"osm",
|
| 542 |
+
"panama",
|
| 543 |
+
"railways"
|
| 544 |
+
],
|
| 545 |
+
"data_type": "vector",
|
| 546 |
+
"geometry_type": "auto",
|
| 547 |
+
"path": "osm/railways.geojson"
|
| 548 |
+
},
|
| 549 |
+
"osm_traffic": {
|
| 550 |
+
"description": "OpenStreetMap Traffic infrastructure (signals, crossings) for Panama",
|
| 551 |
+
"tags": [
|
| 552 |
+
"osm",
|
| 553 |
+
"panama",
|
| 554 |
+
"traffic"
|
| 555 |
+
],
|
| 556 |
+
"data_type": "vector",
|
| 557 |
+
"geometry_type": "auto",
|
| 558 |
+
"path": "osm/traffic.geojson"
|
| 559 |
+
},
|
| 560 |
+
"osm_traffic_areas": {
|
| 561 |
+
"description": "OpenStreetMap Traffic areas (parking lots) for Panama",
|
| 562 |
+
"tags": [
|
| 563 |
+
"osm",
|
| 564 |
+
"panama",
|
| 565 |
+
"traffic areas"
|
| 566 |
+
],
|
| 567 |
+
"data_type": "vector",
|
| 568 |
+
"geometry_type": "auto",
|
| 569 |
+
"path": "osm/traffic_areas.geojson"
|
| 570 |
+
},
|
| 571 |
+
"osm_transport": {
|
| 572 |
+
"description": "OpenStreetMap Transport points (bus stops, stations) for Panama",
|
| 573 |
+
"tags": [
|
| 574 |
+
"osm",
|
| 575 |
+
"panama",
|
| 576 |
+
"transport"
|
| 577 |
+
],
|
| 578 |
+
"data_type": "vector",
|
| 579 |
+
"geometry_type": "auto",
|
| 580 |
+
"path": "osm/transport.geojson"
|
| 581 |
+
},
|
| 582 |
+
"osm_transport_areas": {
|
| 583 |
+
"description": "OpenStreetMap Transport areas (airports, ports) for Panama",
|
| 584 |
+
"tags": [
|
| 585 |
+
"osm",
|
| 586 |
+
"panama",
|
| 587 |
+
"transport areas"
|
| 588 |
+
],
|
| 589 |
+
"data_type": "vector",
|
| 590 |
+
"geometry_type": "auto",
|
| 591 |
+
"path": "osm/transport_areas.geojson"
|
| 592 |
+
},
|
| 593 |
+
"osm_places": {
|
| 594 |
+
"description": "OpenStreetMap Place names (cities, towns, villages) for Panama",
|
| 595 |
+
"tags": [
|
| 596 |
+
"osm",
|
| 597 |
+
"panama",
|
| 598 |
+
"places"
|
| 599 |
+
],
|
| 600 |
+
"data_type": "vector",
|
| 601 |
+
"geometry_type": "auto",
|
| 602 |
+
"path": "osm/places.geojson"
|
| 603 |
+
},
|
| 604 |
+
"osm_places_areas": {
|
| 605 |
+
"description": "OpenStreetMap Place areas for Panama",
|
| 606 |
+
"tags": [
|
| 607 |
+
"osm",
|
| 608 |
+
"panama",
|
| 609 |
+
"places areas"
|
| 610 |
+
],
|
| 611 |
+
"data_type": "vector",
|
| 612 |
+
"geometry_type": "auto",
|
| 613 |
+
"path": "osm/places_areas.geojson"
|
| 614 |
+
},
|
| 615 |
+
"osm_places_of_worship": {
|
| 616 |
+
"description": "OpenStreetMap Places of worship for Panama",
|
| 617 |
+
"tags": [
|
| 618 |
+
"osm",
|
| 619 |
+
"panama",
|
| 620 |
+
"places of worship"
|
| 621 |
+
],
|
| 622 |
+
"data_type": "vector",
|
| 623 |
+
"geometry_type": "auto",
|
| 624 |
+
"path": "osm/places_of_worship.geojson"
|
| 625 |
+
},
|
| 626 |
+
"osm_places_of_worship_areas": {
|
| 627 |
+
"description": "OpenStreetMap Places of worship (buildings) for Panama",
|
| 628 |
+
"tags": [
|
| 629 |
+
"osm",
|
| 630 |
+
"panama",
|
| 631 |
+
"places of worship areas"
|
| 632 |
+
],
|
| 633 |
+
"data_type": "vector",
|
| 634 |
+
"geometry_type": "auto",
|
| 635 |
+
"path": "osm/places_of_worship_areas.geojson"
|
| 636 |
+
},
|
| 637 |
+
"roads": {
|
| 638 |
+
"path": "osm/roads.geojson",
|
| 639 |
+
"description": "Data from osm/roads.geojson",
|
| 640 |
+
"semantic_description": null,
|
| 641 |
+
"tags": [
|
| 642 |
+
"spatial",
|
| 643 |
+
"infrastructure",
|
| 644 |
+
"roads",
|
| 645 |
+
"transportation"
|
| 646 |
+
],
|
| 647 |
+
"data_type": "semi-static",
|
| 648 |
+
"update_frequency": null,
|
| 649 |
+
"columns": [
|
| 650 |
+
"osm_id",
|
| 651 |
+
"code",
|
| 652 |
+
"fclass",
|
| 653 |
+
"name",
|
| 654 |
+
"ref",
|
| 655 |
+
"oneway",
|
| 656 |
+
"maxspeed",
|
| 657 |
+
"layer",
|
| 658 |
+
"bridge",
|
| 659 |
+
"tunnel",
|
| 660 |
+
"geom"
|
| 661 |
+
],
|
| 662 |
+
"row_count": 118464,
|
| 663 |
+
"category": "osm",
|
| 664 |
+
"format": "geojson",
|
| 665 |
+
"last_indexed": "2026-01-09T18:18:59.409660"
|
| 666 |
+
},
|
| 667 |
+
"places_of_worship_areas": {
|
| 668 |
+
"path": "osm/places_of_worship_areas.geojson",
|
| 669 |
+
"description": "Data from osm/places_of_worship_areas.geojson",
|
| 670 |
+
"semantic_description": null,
|
| 671 |
+
"tags": [
|
| 672 |
+
"spatial"
|
| 673 |
+
],
|
| 674 |
+
"data_type": "semi-static",
|
| 675 |
+
"update_frequency": null,
|
| 676 |
+
"columns": [
|
| 677 |
+
"osm_id",
|
| 678 |
+
"code",
|
| 679 |
+
"fclass",
|
| 680 |
+
"name",
|
| 681 |
+
"geom"
|
| 682 |
+
],
|
| 683 |
+
"row_count": 694,
|
| 684 |
+
"category": "osm",
|
| 685 |
+
"format": "geojson",
|
| 686 |
+
"last_indexed": "2026-01-09T18:18:59.460933"
|
| 687 |
+
},
|
| 688 |
+
"transport": {
|
| 689 |
+
"path": "osm/transport.geojson",
|
| 690 |
+
"description": "Data from osm/transport.geojson",
|
| 691 |
+
"semantic_description": null,
|
| 692 |
+
"tags": [
|
| 693 |
+
"maritime",
|
| 694 |
+
"spatial",
|
| 695 |
+
"infrastructure",
|
| 696 |
+
"transportation"
|
| 697 |
+
],
|
| 698 |
+
"data_type": "semi-static",
|
| 699 |
+
"update_frequency": null,
|
| 700 |
+
"columns": [
|
| 701 |
+
"osm_id",
|
| 702 |
+
"code",
|
| 703 |
+
"fclass",
|
| 704 |
+
"name",
|
| 705 |
+
"geom"
|
| 706 |
+
],
|
| 707 |
+
"row_count": 1891,
|
| 708 |
+
"category": "osm",
|
| 709 |
+
"format": "geojson",
|
| 710 |
+
"last_indexed": "2026-01-09T18:18:59.506892"
|
| 711 |
+
},
|
| 712 |
+
"pois_areas": {
|
| 713 |
+
"path": "osm/pois_areas.geojson",
|
| 714 |
+
"description": "Data from osm/pois_areas.geojson",
|
| 715 |
+
"semantic_description": null,
|
| 716 |
+
"tags": [
|
| 717 |
+
"spatial",
|
| 718 |
+
"points-of-interest",
|
| 719 |
+
"amenities"
|
| 720 |
+
],
|
| 721 |
+
"data_type": "semi-static",
|
| 722 |
+
"update_frequency": null,
|
| 723 |
+
"columns": [
|
| 724 |
+
"osm_id",
|
| 725 |
+
"code",
|
| 726 |
+
"fclass",
|
| 727 |
+
"name",
|
| 728 |
+
"geom"
|
| 729 |
+
],
|
| 730 |
+
"row_count": 11583,
|
| 731 |
+
"category": "osm",
|
| 732 |
+
"format": "geojson",
|
| 733 |
+
"last_indexed": "2026-01-09T18:19:00.011175"
|
| 734 |
+
},
|
| 735 |
+
"railways": {
|
| 736 |
+
"path": "osm/railways.geojson",
|
| 737 |
+
"description": "Data from osm/railways.geojson",
|
| 738 |
+
"semantic_description": null,
|
| 739 |
+
"tags": [
|
| 740 |
+
"spatial"
|
| 741 |
+
],
|
| 742 |
+
"data_type": "semi-static",
|
| 743 |
+
"update_frequency": null,
|
| 744 |
+
"columns": [
|
| 745 |
+
"osm_id",
|
| 746 |
+
"code",
|
| 747 |
+
"fclass",
|
| 748 |
+
"name",
|
| 749 |
+
"layer",
|
| 750 |
+
"bridge",
|
| 751 |
+
"tunnel",
|
| 752 |
+
"geom"
|
| 753 |
+
],
|
| 754 |
+
"row_count": 296,
|
| 755 |
+
"category": "osm",
|
| 756 |
+
"format": "geojson",
|
| 757 |
+
"last_indexed": "2026-01-09T18:19:00.034635"
|
| 758 |
+
},
|
| 759 |
+
"pois": {
|
| 760 |
+
"path": "osm/pois.geojson",
|
| 761 |
+
"description": "Data from osm/pois.geojson",
|
| 762 |
+
"semantic_description": null,
|
| 763 |
+
"tags": [
|
| 764 |
+
"spatial",
|
| 765 |
+
"points-of-interest",
|
| 766 |
+
"amenities"
|
| 767 |
+
],
|
| 768 |
+
"data_type": "semi-static",
|
| 769 |
+
"update_frequency": null,
|
| 770 |
+
"columns": [
|
| 771 |
+
"osm_id",
|
| 772 |
+
"code",
|
| 773 |
+
"fclass",
|
| 774 |
+
"name",
|
| 775 |
+
"geom"
|
| 776 |
+
],
|
| 777 |
+
"row_count": 11129,
|
| 778 |
+
"category": "osm",
|
| 779 |
+
"format": "geojson",
|
| 780 |
+
"last_indexed": "2026-01-09T18:19:00.261571"
|
| 781 |
+
},
|
| 782 |
+
"natural_points": {
|
| 783 |
+
"path": "osm/natural_points.geojson",
|
| 784 |
+
"description": "Data from osm/natural_points.geojson",
|
| 785 |
+
"semantic_description": null,
|
| 786 |
+
"tags": [
|
| 787 |
+
"spatial",
|
| 788 |
+
"points-of-interest",
|
| 789 |
+
"amenities"
|
| 790 |
+
],
|
| 791 |
+
"data_type": "semi-static",
|
| 792 |
+
"update_frequency": null,
|
| 793 |
+
"columns": [
|
| 794 |
+
"osm_id",
|
| 795 |
+
"code",
|
| 796 |
+
"fclass",
|
| 797 |
+
"name",
|
| 798 |
+
"geom"
|
| 799 |
+
],
|
| 800 |
+
"row_count": 6500,
|
| 801 |
+
"category": "osm",
|
| 802 |
+
"format": "geojson",
|
| 803 |
+
"last_indexed": "2026-01-09T18:19:00.395667"
|
| 804 |
+
},
|
| 805 |
+
"traffic": {
|
| 806 |
+
"path": "osm/traffic.geojson",
|
| 807 |
+
"description": "Data from osm/traffic.geojson",
|
| 808 |
+
"semantic_description": null,
|
| 809 |
+
"tags": [
|
| 810 |
+
"spatial"
|
| 811 |
+
],
|
| 812 |
+
"data_type": "semi-static",
|
| 813 |
+
"update_frequency": null,
|
| 814 |
+
"columns": [
|
| 815 |
+
"osm_id",
|
| 816 |
+
"code",
|
| 817 |
+
"fclass",
|
| 818 |
+
"name",
|
| 819 |
+
"geom"
|
| 820 |
+
],
|
| 821 |
+
"row_count": 5902,
|
| 822 |
+
"category": "osm",
|
| 823 |
+
"format": "geojson",
|
| 824 |
+
"last_indexed": "2026-01-09T18:19:00.509922"
|
| 825 |
+
},
|
| 826 |
+
"traffic_areas": {
|
| 827 |
+
"path": "osm/traffic_areas.geojson",
|
| 828 |
+
"description": "Data from osm/traffic_areas.geojson",
|
| 829 |
+
"semantic_description": null,
|
| 830 |
+
"tags": [
|
| 831 |
+
"spatial"
|
| 832 |
+
],
|
| 833 |
+
"data_type": "semi-static",
|
| 834 |
+
"update_frequency": null,
|
| 835 |
+
"columns": [
|
| 836 |
+
"osm_id",
|
| 837 |
+
"code",
|
| 838 |
+
"fclass",
|
| 839 |
+
"name",
|
| 840 |
+
"geom"
|
| 841 |
+
],
|
| 842 |
+
"row_count": 3403,
|
| 843 |
+
"category": "osm",
|
| 844 |
+
"format": "geojson",
|
| 845 |
+
"last_indexed": "2026-01-09T18:19:00.682898"
|
| 846 |
+
},
|
| 847 |
+
"buildings": {
|
| 848 |
+
"path": "osm/buildings.geojson",
|
| 849 |
+
"description": "Data from osm/buildings.geojson",
|
| 850 |
+
"semantic_description": null,
|
| 851 |
+
"tags": [
|
| 852 |
+
"spatial",
|
| 853 |
+
"built-environment",
|
| 854 |
+
"infrastructure"
|
| 855 |
+
],
|
| 856 |
+
"data_type": "semi-static",
|
| 857 |
+
"update_frequency": null,
|
| 858 |
+
"columns": [
|
| 859 |
+
"osm_id",
|
| 860 |
+
"code",
|
| 861 |
+
"fclass",
|
| 862 |
+
"name",
|
| 863 |
+
"type",
|
| 864 |
+
"geom"
|
| 865 |
+
],
|
| 866 |
+
"row_count": 233936,
|
| 867 |
+
"category": "osm",
|
| 868 |
+
"format": "geojson",
|
| 869 |
+
"last_indexed": "2026-01-09T18:19:08.488004"
|
| 870 |
+
},
|
| 871 |
+
"places": {
|
| 872 |
+
"path": "osm/places.geojson",
|
| 873 |
+
"description": "Data from osm/places.geojson",
|
| 874 |
+
"semantic_description": null,
|
| 875 |
+
"tags": [
|
| 876 |
+
"spatial",
|
| 877 |
+
"population"
|
| 878 |
+
],
|
| 879 |
+
"data_type": "semi-static",
|
| 880 |
+
"update_frequency": null,
|
| 881 |
+
"columns": [
|
| 882 |
+
"osm_id",
|
| 883 |
+
"code",
|
| 884 |
+
"fclass",
|
| 885 |
+
"population",
|
| 886 |
+
"name",
|
| 887 |
+
"geom"
|
| 888 |
+
],
|
| 889 |
+
"row_count": 3683,
|
| 890 |
+
"category": "osm",
|
| 891 |
+
"format": "geojson",
|
| 892 |
+
"last_indexed": "2026-01-09T18:19:08.594144"
|
| 893 |
+
},
|
| 894 |
+
"places_of_worship": {
|
| 895 |
+
"path": "osm/places_of_worship.geojson",
|
| 896 |
+
"description": "Data from osm/places_of_worship.geojson",
|
| 897 |
+
"semantic_description": null,
|
| 898 |
+
"tags": [
|
| 899 |
+
"spatial"
|
| 900 |
+
],
|
| 901 |
+
"data_type": "semi-static",
|
| 902 |
+
"update_frequency": null,
|
| 903 |
+
"columns": [
|
| 904 |
+
"osm_id",
|
| 905 |
+
"code",
|
| 906 |
+
"fclass",
|
| 907 |
+
"name",
|
| 908 |
+
"geom"
|
| 909 |
+
],
|
| 910 |
+
"row_count": 228,
|
| 911 |
+
"category": "osm",
|
| 912 |
+
"format": "geojson",
|
| 913 |
+
"last_indexed": "2026-01-09T18:19:08.609384"
|
| 914 |
+
},
|
| 915 |
+
"natural_areas": {
|
| 916 |
+
"path": "osm/natural_areas.geojson",
|
| 917 |
+
"description": "Data from osm/natural_areas.geojson",
|
| 918 |
+
"semantic_description": null,
|
| 919 |
+
"tags": [
|
| 920 |
+
"spatial"
|
| 921 |
+
],
|
| 922 |
+
"data_type": "semi-static",
|
| 923 |
+
"update_frequency": null,
|
| 924 |
+
"columns": [
|
| 925 |
+
"osm_id",
|
| 926 |
+
"code",
|
| 927 |
+
"fclass",
|
| 928 |
+
"name",
|
| 929 |
+
"geom"
|
| 930 |
+
],
|
| 931 |
+
"row_count": 434,
|
| 932 |
+
"category": "osm",
|
| 933 |
+
"format": "geojson",
|
| 934 |
+
"last_indexed": "2026-01-09T18:19:08.673965"
|
| 935 |
+
},
|
| 936 |
+
"waterways": {
|
| 937 |
+
"path": "osm/waterways.geojson",
|
| 938 |
+
"description": "Data from osm/waterways.geojson",
|
| 939 |
+
"semantic_description": null,
|
| 940 |
+
"tags": [
|
| 941 |
+
"natural-resources",
|
| 942 |
+
"spatial",
|
| 943 |
+
"hydrology"
|
| 944 |
+
],
|
| 945 |
+
"data_type": "semi-static",
|
| 946 |
+
"update_frequency": null,
|
| 947 |
+
"columns": [
|
| 948 |
+
"osm_id",
|
| 949 |
+
"code",
|
| 950 |
+
"fclass",
|
| 951 |
+
"width",
|
| 952 |
+
"name",
|
| 953 |
+
"geom"
|
| 954 |
+
],
|
| 955 |
+
"row_count": 15532,
|
| 956 |
+
"category": "osm",
|
| 957 |
+
"format": "geojson",
|
| 958 |
+
"last_indexed": "2026-01-09T18:19:10.791546"
|
| 959 |
+
},
|
| 960 |
+
"water_areas": {
|
| 961 |
+
"path": "osm/water_areas.geojson",
|
| 962 |
+
"description": "Data from osm/water_areas.geojson",
|
| 963 |
+
"semantic_description": null,
|
| 964 |
+
"tags": [
|
| 965 |
+
"natural-resources",
|
| 966 |
+
"spatial",
|
| 967 |
+
"hydrology"
|
| 968 |
+
],
|
| 969 |
+
"data_type": "semi-static",
|
| 970 |
+
"update_frequency": null,
|
| 971 |
+
"columns": [
|
| 972 |
+
"osm_id",
|
| 973 |
+
"code",
|
| 974 |
+
"fclass",
|
| 975 |
+
"name",
|
| 976 |
+
"geom"
|
| 977 |
+
],
|
| 978 |
+
"row_count": 3733,
|
| 979 |
+
"category": "osm",
|
| 980 |
+
"format": "geojson",
|
| 981 |
+
"last_indexed": "2026-01-09T18:19:12.941528"
|
| 982 |
+
},
|
| 983 |
+
"landuse": {
|
| 984 |
+
"path": "osm/landuse.geojson",
|
| 985 |
+
"description": "Data from osm/landuse.geojson",
|
| 986 |
+
"semantic_description": null,
|
| 987 |
+
"tags": [
|
| 988 |
+
"spatial"
|
| 989 |
+
],
|
| 990 |
+
"data_type": "semi-static",
|
| 991 |
+
"update_frequency": null,
|
| 992 |
+
"columns": [
|
| 993 |
+
"osm_id",
|
| 994 |
+
"code",
|
| 995 |
+
"fclass",
|
| 996 |
+
"name",
|
| 997 |
+
"geom"
|
| 998 |
+
],
|
| 999 |
+
"row_count": 16075,
|
| 1000 |
+
"category": "osm",
|
| 1001 |
+
"format": "geojson",
|
| 1002 |
+
"last_indexed": "2026-01-09T18:19:15.893984"
|
| 1003 |
+
},
|
| 1004 |
+
"transport_areas": {
|
| 1005 |
+
"path": "osm/transport_areas.geojson",
|
| 1006 |
+
"description": "Data from osm/transport_areas.geojson",
|
| 1007 |
+
"semantic_description": null,
|
| 1008 |
+
"tags": [
|
| 1009 |
+
"maritime",
|
| 1010 |
+
"spatial",
|
| 1011 |
+
"infrastructure",
|
| 1012 |
+
"transportation"
|
| 1013 |
+
],
|
| 1014 |
+
"data_type": "semi-static",
|
| 1015 |
+
"update_frequency": null,
|
| 1016 |
+
"columns": [
|
| 1017 |
+
"osm_id",
|
| 1018 |
+
"code",
|
| 1019 |
+
"fclass",
|
| 1020 |
+
"name",
|
| 1021 |
+
"geom"
|
| 1022 |
+
],
|
| 1023 |
+
"row_count": 196,
|
| 1024 |
+
"category": "osm",
|
| 1025 |
+
"format": "geojson",
|
| 1026 |
+
"last_indexed": "2026-01-09T18:19:15.917475"
|
| 1027 |
+
},
|
| 1028 |
+
"places_areas": {
|
| 1029 |
+
"path": "osm/places_areas.geojson",
|
| 1030 |
+
"description": "Data from osm/places_areas.geojson",
|
| 1031 |
+
"semantic_description": null,
|
| 1032 |
+
"tags": [
|
| 1033 |
+
"spatial",
|
| 1034 |
+
"population"
|
| 1035 |
+
],
|
| 1036 |
+
"data_type": "semi-static",
|
| 1037 |
+
"update_frequency": null,
|
| 1038 |
+
"columns": [
|
| 1039 |
+
"osm_id",
|
| 1040 |
+
"code",
|
| 1041 |
+
"fclass",
|
| 1042 |
+
"population",
|
| 1043 |
+
"name",
|
| 1044 |
+
"geom"
|
| 1045 |
+
],
|
| 1046 |
+
"row_count": 239,
|
| 1047 |
+
"category": "osm",
|
| 1048 |
+
"format": "geojson",
|
| 1049 |
+
"last_indexed": "2026-01-09T18:19:16.220819"
|
| 1050 |
+
},
|
| 1051 |
+
"overture_places": {
|
| 1052 |
+
"path": "overture/overture_places.geojson",
|
| 1053 |
+
"description": "Points of Interest from Overture Maps (Places theme)",
|
| 1054 |
+
"semantic_description": "Comprehensive list of businesses and landmarks with names and categories. Use this for finding specific amenities, shops, or named locations.",
|
| 1055 |
+
"tags": [
|
| 1056 |
+
"overture",
|
| 1057 |
+
"places",
|
| 1058 |
+
"poi",
|
| 1059 |
+
"businesses",
|
| 1060 |
+
"landmarks",
|
| 1061 |
+
"spatial",
|
| 1062 |
+
"panama"
|
| 1063 |
+
],
|
| 1064 |
+
"data_type": "static",
|
| 1065 |
+
"update_frequency": null,
|
| 1066 |
+
"columns": [
|
| 1067 |
+
"id",
|
| 1068 |
+
"version",
|
| 1069 |
+
"sources",
|
| 1070 |
+
"names",
|
| 1071 |
+
"categories",
|
| 1072 |
+
"basic_category",
|
| 1073 |
+
"taxonomy",
|
| 1074 |
+
"confidence",
|
| 1075 |
+
"websites",
|
| 1076 |
+
"socials",
|
| 1077 |
+
"emails",
|
| 1078 |
+
"phones",
|
| 1079 |
+
"brand",
|
| 1080 |
+
"addresses",
|
| 1081 |
+
"operating_status",
|
| 1082 |
+
"geom"
|
| 1083 |
+
],
|
| 1084 |
+
"row_count": 33362,
|
| 1085 |
+
"category": "overture",
|
| 1086 |
+
"format": "geojson",
|
| 1087 |
+
"last_indexed": "2026-01-09T18:37:03.188928"
|
| 1088 |
+
},
|
| 1089 |
+
"overture_roads": {
|
| 1090 |
+
"path": "overture/overture_roads.geojson",
|
| 1091 |
+
"description": "Road network segments from Overture Maps",
|
| 1092 |
+
"semantic_description": "Road network segments including highways, streets, and paths. Contains road names and classification.",
|
| 1093 |
+
"tags": [
|
| 1094 |
+
"overture",
|
| 1095 |
+
"roads",
|
| 1096 |
+
"transportation",
|
| 1097 |
+
"infrastructure",
|
| 1098 |
+
"spatial",
|
| 1099 |
+
"panama"
|
| 1100 |
+
],
|
| 1101 |
+
"data_type": "static",
|
| 1102 |
+
"update_frequency": null,
|
| 1103 |
+
"columns": [
|
| 1104 |
+
"id",
|
| 1105 |
+
"version",
|
| 1106 |
+
"sources",
|
| 1107 |
+
"subtype",
|
| 1108 |
+
"class",
|
| 1109 |
+
"names",
|
| 1110 |
+
"connectors",
|
| 1111 |
+
"routes",
|
| 1112 |
+
"subclass_rules",
|
| 1113 |
+
"access_restrictions",
|
| 1114 |
+
"level_rules",
|
| 1115 |
+
"destinations",
|
| 1116 |
+
"prohibited_transitions",
|
| 1117 |
+
"rail_flags",
|
| 1118 |
+
"road_surface",
|
| 1119 |
+
"road_flags",
|
| 1120 |
+
"speed_limits",
|
| 1121 |
+
"width_rules",
|
| 1122 |
+
"subclass",
|
| 1123 |
+
"geom"
|
| 1124 |
+
],
|
| 1125 |
+
"row_count": 179610,
|
| 1126 |
+
"category": "overture",
|
| 1127 |
+
"format": "geojson",
|
| 1128 |
+
"last_indexed": "2026-01-09T18:37:18.729125"
|
| 1129 |
+
},
|
| 1130 |
+
"overture_buildings": {
|
| 1131 |
+
"path": "overture/overture_buildings.geojson",
|
| 1132 |
+
"description": "Building footprints from Overture Maps",
|
| 1133 |
+
"semantic_description": "Building footprints including Microsoft and OSM data. Useful for urban density, infrastructure planning, and built-environment analysis.",
|
| 1134 |
+
"tags": [
|
| 1135 |
+
"overture",
|
| 1136 |
+
"buildings",
|
| 1137 |
+
"footprints",
|
| 1138 |
+
"infrastructure",
|
| 1139 |
+
"spatial",
|
| 1140 |
+
"panama"
|
| 1141 |
+
],
|
| 1142 |
+
"data_type": "static",
|
| 1143 |
+
"update_frequency": null,
|
| 1144 |
+
"columns": [
|
| 1145 |
+
"id",
|
| 1146 |
+
"version",
|
| 1147 |
+
"sources",
|
| 1148 |
+
"level",
|
| 1149 |
+
"subtype",
|
| 1150 |
+
"class",
|
| 1151 |
+
"height",
|
| 1152 |
+
"names",
|
| 1153 |
+
"has_parts",
|
| 1154 |
+
"is_underground",
|
| 1155 |
+
"num_floors",
|
| 1156 |
+
"min_height",
|
| 1157 |
+
"min_floor",
|
| 1158 |
+
"num_floors_underground",
|
| 1159 |
+
"facade_color",
|
| 1160 |
+
"facade_material",
|
| 1161 |
+
"roof_material",
|
| 1162 |
+
"roof_shape",
|
| 1163 |
+
"roof_direction",
|
| 1164 |
+
"roof_orientation",
|
| 1165 |
+
"roof_color",
|
| 1166 |
+
"roof_height",
|
| 1167 |
+
"geom"
|
| 1168 |
+
],
|
| 1169 |
+
"row_count": 1888314,
|
| 1170 |
+
"category": "overture",
|
| 1171 |
+
"format": "geojson",
|
| 1172 |
+
"last_indexed": "2026-01-09T18:38:50.416300"
|
| 1173 |
+
},
|
| 1174 |
+
"panama_weather_stations": {
|
| 1175 |
+
"path": "climate/weather_stations.geojson",
|
| 1176 |
+
"description": "Major weather stations in Panama with average temperature and rainfall data.",
|
| 1177 |
+
"semantic_description": "This dataset contains the locations of major weather stations in Panama (Tocumen, David, Bocas del Toro, etc.). It includes attributes for average annual temperature (Celsius), annual rainfall (mm), and elevation. Use this for analyzing climatic differences across the country.",
|
| 1178 |
+
"tags": [
|
| 1179 |
+
"climate",
|
| 1180 |
+
"weather",
|
| 1181 |
+
"temperature",
|
| 1182 |
+
"rainfall",
|
| 1183 |
+
"stations"
|
| 1184 |
+
],
|
| 1185 |
+
"data_type": "static",
|
| 1186 |
+
"category": "climate",
|
| 1187 |
+
"format": "geojson"
|
| 1188 |
+
},
|
| 1189 |
+
"panama_terrain_features": {
|
| 1190 |
+
"path": "terrain/simplified_terrain.geojson",
|
| 1191 |
+
"description": "Simplified terrain features including major peaks and mountain ranges.",
|
| 1192 |
+
"semantic_description": "A simplified dataset showing major terrain features of Panama, including Volc\u00e1n Bar\u00fa (highest peak) and the Central Cordillera. Contains points for peaks and lines for ranges, with elevation attributes.",
|
| 1193 |
+
"tags": [
|
| 1194 |
+
"terrain",
|
| 1195 |
+
"elevation",
|
| 1196 |
+
"mountains",
|
| 1197 |
+
"volcano",
|
| 1198 |
+
"geography"
|
| 1199 |
+
],
|
| 1200 |
+
"data_type": "static",
|
| 1201 |
+
"category": "climate",
|
| 1202 |
+
"format": "geojson"
|
| 1203 |
+
},
|
| 1204 |
+
"panama_national_indicators": {
|
| 1205 |
+
"path": "socioeconomic/panama_national_indicators.geojson",
|
| 1206 |
+
"description": "National socio-economic indicators from World Bank (2000-2024)",
|
| 1207 |
+
"semantic_description": "Comprehensive national-level statistics for Panama including poverty rates, GDP, unemployment, health expenditure, maternal/child mortality, literacy rates, and school enrollment. Data sourced from World Bank Open Data API. Use this dataset for analyzing Panama's socio-economic development trends over time.",
|
| 1208 |
+
"tags": [
|
| 1209 |
+
"socioeconomic",
|
| 1210 |
+
"worldbank",
|
| 1211 |
+
"poverty",
|
| 1212 |
+
"gdp",
|
| 1213 |
+
"employment",
|
| 1214 |
+
"health",
|
| 1215 |
+
"education",
|
| 1216 |
+
"national",
|
| 1217 |
+
"panama"
|
| 1218 |
+
],
|
| 1219 |
+
"data_type": "static",
|
| 1220 |
+
"category": "socioeconomic",
|
| 1221 |
+
"format": "geojson"
|
| 1222 |
+
},
|
| 1223 |
+
"province_socioeconomic": {
|
| 1224 |
+
"path": "socioeconomic/province_socioeconomic.geojson",
|
| 1225 |
+
"description": "Province-level socioeconomic indicators for Panama (2023)",
|
| 1226 |
+
"semantic_description": "Socioeconomic data at the province level including Multidimensional Poverty Index (MPI), population from Censo 2023, average income, and disability rates. Shows dramatic geographic inequality: Ng\u00e4be-Bugl\u00e9 comarca has 93.4% poverty vs 15% in Panam\u00e1 province. Use for analyzing regional disparities in poverty, development, and demographics.",
|
| 1227 |
+
"tags": [
|
| 1228 |
+
"socioeconomic",
|
| 1229 |
+
"poverty",
|
| 1230 |
+
"mpi",
|
| 1231 |
+
"census",
|
| 1232 |
+
"province",
|
| 1233 |
+
"admin1",
|
| 1234 |
+
"demographics",
|
| 1235 |
+
"inequality",
|
| 1236 |
+
"panama"
|
| 1237 |
+
],
|
| 1238 |
+
"data_type": "static",
|
| 1239 |
+
"category": "socioeconomic",
|
| 1240 |
+
"format": "geojson"
|
| 1241 |
+
},
|
| 1242 |
+
"panama_airports": {
|
| 1243 |
+
"path": "global/airports/panama_airports.geojson",
|
| 1244 |
+
"description": "Panama airports from OurAirports global database (91 airports)",
|
| 1245 |
+
"semantic_description": "Comprehensive dataset of all airports in Panama including international, domestic, regional, and small airfields. Contains location, elevation, type (large/medium/small/heliport), runway information, and identifiers (ICAO, IATA codes). Updated daily from OurAirports open database. Use for aviation infrastructure analysis, accessibility studies, and transportation planning.",
|
| 1246 |
+
"tags": [
|
| 1247 |
+
"infrastructure",
|
| 1248 |
+
"transportation",
|
| 1249 |
+
"airports",
|
| 1250 |
+
"aviation",
|
| 1251 |
+
"panama",
|
| 1252 |
+
"ourairports"
|
| 1253 |
+
],
|
| 1254 |
+
"data_type": "static",
|
| 1255 |
+
"category": "infrastructure",
|
| 1256 |
+
"format": "geojson",
|
| 1257 |
+
"source": "OurAirports (davidmegginson/ourairports-data)",
|
| 1258 |
+
"license": "Public Domain"
|
| 1259 |
+
},
|
| 1260 |
+
"censo_2023": {
|
| 1261 |
+
"path": "censo/censo_2023_enriched.csv",
|
| 1262 |
+
"description": "Panama Census 2023: Demographics & Housing (Corregimiento Level)",
|
| 1263 |
+
"semantic_description": "Detailed dataset from the 2023 National Census of Population and Housing (Part I & II). Contains granular data at the Corregimiento level (Admin 3) covering: housing types, water access, sanitation, electricity sources, internet/computer access, education levels, and population demographics. Enriched with 'adm3_pcode' to enable spatial joining with 'pan_admin3'. Use for demographic analysis, infrastructure planning, and social program targeting.",
|
| 1264 |
+
"tags": [
|
| 1265 |
+
"census",
|
| 1266 |
+
"demographics",
|
| 1267 |
+
"housing",
|
| 1268 |
+
"population",
|
| 1269 |
+
"water",
|
| 1270 |
+
"electricity",
|
| 1271 |
+
"education",
|
| 1272 |
+
"panama",
|
| 1273 |
+
"2023"
|
| 1274 |
+
],
|
| 1275 |
+
"data_type": "static",
|
| 1276 |
+
"category": "socioeconomic",
|
| 1277 |
+
"format": "csv",
|
| 1278 |
+
"columns": [
|
| 1279 |
+
"adm3_pcode",
|
| 1280 |
+
"cod_corr",
|
| 1281 |
+
"nomb_prov",
|
| 1282 |
+
"nomb_dist",
|
| 1283 |
+
"nomb_corr",
|
| 1284 |
+
"v1_tipo_vivienda__individual_permanente",
|
| 1285 |
+
"v8_abastecimiento_de_agua__acueducto_publico_del_idaan",
|
| 1286 |
+
"p13_acceso_internet__si",
|
| 1287 |
+
"p3_edad_digitos__total"
|
| 1288 |
+
]
|
| 1289 |
+
}
|
| 1290 |
+
}
|
backend/data/catalog_schema.json
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
| 3 |
+
"title": "GeoQuery Data Catalog Entry",
|
| 4 |
+
"description": "Schema for dataset metadata in the GeoQuery platform catalog",
|
| 5 |
+
"type": "object",
|
| 6 |
+
"required": [
|
| 7 |
+
"path",
|
| 8 |
+
"columns",
|
| 9 |
+
"category",
|
| 10 |
+
"format"
|
| 11 |
+
],
|
| 12 |
+
"properties": {
|
| 13 |
+
"path": {
|
| 14 |
+
"type": "string",
|
| 15 |
+
"description": "Relative path to the data file from the data directory"
|
| 16 |
+
},
|
| 17 |
+
"description": {
|
| 18 |
+
"type": "string",
|
| 19 |
+
"description": "Auto-generated basic description (e.g., 'Data from hdx/health.geojson')"
|
| 20 |
+
},
|
| 21 |
+
"semantic_description": {
|
| 22 |
+
"type": [
|
| 23 |
+
"string",
|
| 24 |
+
"null"
|
| 25 |
+
],
|
| 26 |
+
"description": "LLM-generated rich description explaining the dataset's contents and use cases"
|
| 27 |
+
},
|
| 28 |
+
"tags": {
|
| 29 |
+
"type": "array",
|
| 30 |
+
"items": {
|
| 31 |
+
"type": "string"
|
| 32 |
+
},
|
| 33 |
+
"description": "Searchable tags for categorization (e.g., ['health', 'facilities', 'infrastructure'])"
|
| 34 |
+
},
|
| 35 |
+
"data_type": {
|
| 36 |
+
"type": "string",
|
| 37 |
+
"enum": [
|
| 38 |
+
"static",
|
| 39 |
+
"semi-static",
|
| 40 |
+
"realtime"
|
| 41 |
+
],
|
| 42 |
+
"description": "How frequently the data changes",
|
| 43 |
+
"default": "static"
|
| 44 |
+
},
|
| 45 |
+
"update_frequency": {
|
| 46 |
+
"type": [
|
| 47 |
+
"string",
|
| 48 |
+
"null"
|
| 49 |
+
],
|
| 50 |
+
"enum": [
|
| 51 |
+
null,
|
| 52 |
+
"yearly",
|
| 53 |
+
"monthly",
|
| 54 |
+
"weekly",
|
| 55 |
+
"daily",
|
| 56 |
+
"hourly",
|
| 57 |
+
"realtime"
|
| 58 |
+
],
|
| 59 |
+
"description": "Expected update frequency for the dataset"
|
| 60 |
+
},
|
| 61 |
+
"columns": {
|
| 62 |
+
"type": "array",
|
| 63 |
+
"items": {
|
| 64 |
+
"type": "string"
|
| 65 |
+
},
|
| 66 |
+
"description": "List of column names in the dataset"
|
| 67 |
+
},
|
| 68 |
+
"row_count": {
|
| 69 |
+
"type": [
|
| 70 |
+
"integer",
|
| 71 |
+
"null"
|
| 72 |
+
],
|
| 73 |
+
"description": "Number of features/rows in the dataset"
|
| 74 |
+
},
|
| 75 |
+
"category": {
|
| 76 |
+
"type": "string",
|
| 77 |
+
"description": "Source category (base, osm, hdx, inec, custom)"
|
| 78 |
+
},
|
| 79 |
+
"format": {
|
| 80 |
+
"type": "string",
|
| 81 |
+
"enum": [
|
| 82 |
+
"geojson",
|
| 83 |
+
"shapefile",
|
| 84 |
+
"geoparquet",
|
| 85 |
+
"csv"
|
| 86 |
+
],
|
| 87 |
+
"description": "File format of the source data"
|
| 88 |
+
},
|
| 89 |
+
"geometry_type": {
|
| 90 |
+
"type": [
|
| 91 |
+
"string",
|
| 92 |
+
"null"
|
| 93 |
+
],
|
| 94 |
+
"enum": [
|
| 95 |
+
null,
|
| 96 |
+
"Point",
|
| 97 |
+
"MultiPoint",
|
| 98 |
+
"LineString",
|
| 99 |
+
"MultiLineString",
|
| 100 |
+
"Polygon",
|
| 101 |
+
"MultiPolygon"
|
| 102 |
+
],
|
| 103 |
+
"description": "Type of geometries in the dataset"
|
| 104 |
+
},
|
| 105 |
+
"bbox": {
|
| 106 |
+
"type": [
|
| 107 |
+
"array",
|
| 108 |
+
"null"
|
| 109 |
+
],
|
| 110 |
+
"items": {
|
| 111 |
+
"type": "number"
|
| 112 |
+
},
|
| 113 |
+
"minItems": 4,
|
| 114 |
+
"maxItems": 4,
|
| 115 |
+
"description": "Bounding box [minLon, minLat, maxLon, maxLat]"
|
| 116 |
+
},
|
| 117 |
+
"source": {
|
| 118 |
+
"type": [
|
| 119 |
+
"string",
|
| 120 |
+
"null"
|
| 121 |
+
],
|
| 122 |
+
"description": "Original source of the data (e.g., 'OpenStreetMap', 'INEC Census 2023')"
|
| 123 |
+
},
|
| 124 |
+
"license": {
|
| 125 |
+
"type": [
|
| 126 |
+
"string",
|
| 127 |
+
"null"
|
| 128 |
+
],
|
| 129 |
+
"description": "Data license (e.g., 'ODbL', 'CC-BY-4.0', 'Public Domain')"
|
| 130 |
+
},
|
| 131 |
+
"last_indexed": {
|
| 132 |
+
"type": "string",
|
| 133 |
+
"format": "date-time",
|
| 134 |
+
"description": "ISO timestamp when the dataset was last indexed"
|
| 135 |
+
},
|
| 136 |
+
"last_enriched": {
|
| 137 |
+
"type": [
|
| 138 |
+
"string",
|
| 139 |
+
"null"
|
| 140 |
+
],
|
| 141 |
+
"format": "date-time",
|
| 142 |
+
"description": "ISO timestamp when LLM enrichment was last run"
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
}
|
backend/data/censo/censo_2023_enriched.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
backend/data/censo/censo_panama_2023_unificado.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
backend/data/global/airports/panama_airports.geojson
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"type": "FeatureCollection",
|
| 3 |
+
"name": "panama_airports",
|
| 4 |
+
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
|
| 5 |
+
"features": [
|
| 6 |
+
{ "type": "Feature", "properties": { "id": 308731, "ident": "CZJ", "type": "small_airport", "name": "Corazón de Jesús Airport", "latitude_deg": 9.44686, "longitude_deg": -78.575678, "elevation_ft": 8.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": "CZJ", "gps_code": "MPCJ", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Coraz%C3%B3n_de_Jes%C3%BAs_Airport", "keywords": "Narganá, Usdup" }, "geometry": { "type": "Point", "coordinates": [ -78.575678, 9.44686 ] } },
|
| 7 |
+
{ "type": "Feature", "properties": { "id": 309162, "ident": "GHE", "type": "small_airport", "name": "Garachiné Airport", "latitude_deg": 8.0644, "longitude_deg": -78.3673, "elevation_ft": 42.0, "continent": "SA", "iso_country": "PA", "iso_region": "PA-5", "municipality": "Garachiné", "scheduled_service": "no", "icao_code": null, "iata_code": "GHE", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Garachin%C3%A9_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.3673, 8.0644 ] } },
|
| 8 |
+
{ "type": "Feature", "properties": { "id": 316549, "ident": "IVI", "type": "small_airport", "name": "Viveros Island Airport", "latitude_deg": 8.4693, "longitude_deg": -79.0016, "elevation_ft": 100.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Isla Viveros", "scheduled_service": "no", "icao_code": null, "iata_code": "IVI", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.0016, 8.4693 ] } },
|
| 9 |
+
{ "type": "Feature", "properties": { "id": 5323, "ident": "MP01", "type": "small_airport", "name": "Finca Ceiba Airport", "latitude_deg": 8.3549995422363281, "longitude_deg": -82.836402893066406, "elevation_ft": 52.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Finca Jaguá", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP01", "local_code": "MP01", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.836402893066406, 8.354999542236328 ] } },
|
| 10 |
+
{ "type": "Feature", "properties": { "id": 5324, "ident": "MP02", "type": "small_airport", "name": "Finca 45 Airport", "latitude_deg": 9.543330192565918, "longitude_deg": -82.733802795410156, "elevation_ft": 56.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Dos Caños", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP02", "local_code": "MP02", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.733802795410156, 9.543330192565918 ] } },
|
| 11 |
+
{ "type": "Feature", "properties": { "id": 5325, "ident": "MP03", "type": "small_airport", "name": "La Cabezona Airport", "latitude_deg": 8.3457, "longitude_deg": -82.5042, "elevation_ft": 31.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Guarumal", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCB", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "MP03" }, "geometry": { "type": "Point", "coordinates": [ -82.5042, 8.3457 ] } },
|
| 12 |
+
{ "type": "Feature", "properties": { "id": 5326, "ident": "MP17", "type": "small_airport", "name": "Finca 67 Airport", "latitude_deg": 9.4344100952148438, "longitude_deg": -82.499099731445312, "elevation_ft": 30.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Changuinola", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP17", "local_code": "MP17", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.499099731445312, 9.434410095214844 ] } },
|
| 13 |
+
{ "type": "Feature", "properties": { "id": 5327, "ident": "MP18", "type": "small_airport", "name": "Guillermo Palm Jaén Airport", "latitude_deg": 8.50383, "longitude_deg": -80.360298, "elevation_ft": 282.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Penonomé", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPPN", "local_code": null, "home_link": "https://aerodromo-guillermo-palm-jaen.negocio.site/", "wikipedia_link": null, "keywords": "MP18" }, "geometry": { "type": "Point", "coordinates": [ -80.360298, 8.50383 ] } },
|
| 14 |
+
{ "type": "Feature", "properties": { "id": 5330, "ident": "MP21", "type": "small_airport", "name": "Alvaro Berroa Airport", "latitude_deg": 8.7703895568847656, "longitude_deg": -82.664398193359375, "elevation_ft": 5000.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Nueva California", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP21", "local_code": "MP21", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.664398193359375, 8.770389556884766 ] } },
|
| 15 |
+
{ "type": "Feature", "properties": { "id": 5331, "ident": "MP22", "type": "small_airport", "name": "Ingenio Santa Rosa Airport", "latitude_deg": 8.1952199935913086, "longitude_deg": -80.658699035644531, "elevation_ft": 109.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Ingenio Santa Rosa", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP22", "local_code": "MP22", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.658699035644531, 8.195219993591309 ] } },
|
| 16 |
+
{ "type": "Feature", "properties": { "id": 5332, "ident": "MP23", "type": "small_airport", "name": "Capt. Alex H. Bosquez Airport", "latitude_deg": 9.16628, "longitude_deg": -79.545205, "elevation_ft": 394.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Calzada Larga", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCL", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Calzada_Larga_Airport", "keywords": "MP23" }, "geometry": { "type": "Point", "coordinates": [ -79.545205, 9.16628 ] } },
|
| 17 |
+
{ "type": "Feature", "properties": { "id": 5333, "ident": "MP24", "type": "small_airport", "name": "Captain Krish E. Persaud Airport", "latitude_deg": 8.58846, "longitude_deg": -79.889702, "elevation_ft": 141.0, "continent": null, "iso_country": "PA", "iso_region": "PA-10", "municipality": "Chame", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCM", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Chame_Airport", "keywords": "Chame Airfield #1, MP24" }, "geometry": { "type": "Point", "coordinates": [ -79.889702, 8.58846 ] } },
|
| 18 |
+
{ "type": "Feature", "properties": { "id": 5334, "ident": "MP26", "type": "small_airport", "name": "Punta Cocos Airport", "latitude_deg": 8.22485, "longitude_deg": -78.904404, "elevation_ft": 66.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Punta Cocos", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPPU", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "MP26" }, "geometry": { "type": "Point", "coordinates": [ -78.904404, 8.22485 ] } },
|
| 19 |
+
{ "type": "Feature", "properties": { "id": 5335, "ident": "MP27", "type": "small_airport", "name": "Deborah Airport", "latitude_deg": 9.51614, "longitude_deg": -82.595497, "elevation_ft": 20.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Guabito", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP27", "local_code": "MP27", "home_link": null, "wikipedia_link": null, "keywords": "Guabito California" }, "geometry": { "type": "Point", "coordinates": [ -82.595497, 9.51614 ] } },
|
| 20 |
+
{ "type": "Feature", "properties": { "id": 515607, "ident": "MPAG", "type": "small_airport", "name": "El Aguila Airstrip", "latitude_deg": 8.37168, "longitude_deg": -80.351676, "elevation_ft": 75.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "El Aguila", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPAG", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.351676, 8.37168 ] } },
|
| 21 |
+
{ "type": "Feature", "properties": { "id": 4786, "ident": "MPBO", "type": "medium_airport", "name": "Bocas del Toro International Airport", "latitude_deg": 9.34085, "longitude_deg": -82.250801, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Isla Colón", "scheduled_service": "yes", "icao_code": "MPBO", "iata_code": "BOC", "gps_code": "MPBO", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Bocas_del_Toro_%22Isla_Colón%22_International_Airport", "keywords": "Jose Ezequiel Hall" }, "geometry": { "type": "Point", "coordinates": [ -82.250801, 9.34085 ] } },
|
| 22 |
+
{ "type": "Feature", "properties": { "id": 4787, "ident": "MPCE", "type": "medium_airport", "name": "Alonso Valderrama Airport", "latitude_deg": 7.98784, "longitude_deg": -80.409837, "elevation_ft": 33.0, "continent": null, "iso_country": "PA", "iso_region": "PA-6", "municipality": "Chitré", "scheduled_service": "yes", "icao_code": "MPCE", "iata_code": "CTD", "gps_code": "MPCE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Chitré_Alonso_Valderrama_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.409837, 7.98784 ] } },
|
| 23 |
+
{ "type": "Feature", "properties": { "id": 4788, "ident": "MPCH", "type": "medium_airport", "name": "Changuinola Captain Manuel Niño International Airport", "latitude_deg": 9.458962, "longitude_deg": -82.515062, "elevation_ft": 19.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Changuinola", "scheduled_service": "yes", "icao_code": "MPCH", "iata_code": "CHX", "gps_code": "MPCH", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Changuinola_%22Capit%C3%A1n_Manuel_Ni%C3%B1o%22_International_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.515062, 9.458962 ] } },
|
| 24 |
+
{ "type": "Feature", "properties": { "id": 4789, "ident": "MPDA", "type": "medium_airport", "name": "Enrique Malek International Airport", "latitude_deg": 8.391, "longitude_deg": -82.434998, "elevation_ft": 89.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "David", "scheduled_service": "yes", "icao_code": "MPDA", "iata_code": "DAV", "gps_code": "MPDA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Enrique_Malek_International_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.434998, 8.391 ] } },
|
| 25 |
+
{ "type": "Feature", "properties": { "id": 4790, "ident": "MPEJ", "type": "medium_airport", "name": "Enrique Adolfo Jimenez Airport", "latitude_deg": 9.35664, "longitude_deg": -79.867401, "elevation_ft": 25.0, "continent": null, "iso_country": "PA", "iso_region": "PA-3", "municipality": "Colón", "scheduled_service": "yes", "icao_code": "MPEJ", "iata_code": "ONX", "gps_code": "MPEJ", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Enrique_Adolfo_Jim%C3%A9nez_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.867401, 9.35664 ] } },
|
| 26 |
+
{ "type": "Feature", "properties": { "id": 525236, "ident": "MPFE", "type": "small_airport", "name": "Fernando Eleta Airport", "latitude_deg": 8.411389, "longitude_deg": -79.111115, "elevation_ft": 311.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Pedro de Cocal", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPFE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/en:Fernando%20Eleta%20Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.111115, 8.411389 ] } },
|
| 27 |
+
{ "type": "Feature", "properties": { "id": 42190, "ident": "MPFS", "type": "small_airport", "name": "Fort Sherman Airport", "latitude_deg": 9.3650903701782244, "longitude_deg": -79.949798583984375, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-3", "municipality": "Fort Sherman", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPFS", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.949798583984375, 9.365090370178224 ] } },
|
| 28 |
+
{ "type": "Feature", "properties": { "id": 30768, "ident": "MPHO", "type": "small_airport", "name": "Panamá Pacífico International Airport", "latitude_deg": 8.91479, "longitude_deg": -79.599602, "elevation_ft": 52.0, "continent": null, "iso_country": "PA", "iso_region": "PA-10", "municipality": "Panamá City", "scheduled_service": "yes", "icao_code": null, "iata_code": "BLB", "gps_code": "MPPA", "local_code": null, "home_link": "http://www.panamapacifico.com/", "wikipedia_link": "https://en.wikipedia.org/wiki/Panam%C3%A1_Pac%C3%ADfico_International_Airport", "keywords": "HOW, Howard Air Force Base, Panama Pacifico" }, "geometry": { "type": "Point", "coordinates": [ -79.599602, 8.91479 ] } },
|
| 29 |
+
{ "type": "Feature", "properties": { "id": 316555, "ident": "MPI", "type": "small_airport", "name": "Mamitupu Airport", "latitude_deg": 9.1851, "longitude_deg": -77.9841, "elevation_ft": 25.0, "continent": "SA", "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mamitupu", "scheduled_service": "no", "icao_code": null, "iata_code": "MPI", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Mamitupu_Airport", "keywords": "Mamitupo" }, "geometry": { "type": "Point", "coordinates": [ -77.9841, 9.1851 ] } },
|
| 30 |
+
{ "type": "Feature", "properties": { "id": 31937, "ident": "MPJE", "type": "small_airport", "name": "Jaqué Airport", "latitude_deg": 7.51778, "longitude_deg": -78.157204, "elevation_ft": 29.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Jaqué", "scheduled_service": "no", "icao_code": "MPJE", "iata_code": "JQE", "gps_code": "MPJE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Jaqué_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.157204, 7.51778 ] } },
|
| 31 |
+
{ "type": "Feature", "properties": { "id": 346902, "ident": "MPMC", "type": "small_airport", "name": "Chame Mayor Airport", "latitude_deg": 8.591418, "longitude_deg": -79.869189, "elevation_ft": 79.0, "continent": null, "iso_country": "PA", "iso_region": "PA-10", "municipality": "Chame", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPMC", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.869189, 8.591418 ] } },
|
| 32 |
+
{ "type": "Feature", "properties": { "id": 4791, "ident": "MPMG", "type": "medium_airport", "name": "Marcos A. Gelabert International Airport", "latitude_deg": 8.97334, "longitude_deg": -79.555603, "elevation_ft": 31.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Albrook", "scheduled_service": "yes", "icao_code": "MPMG", "iata_code": "PAC", "gps_code": "MPMG", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Albrook_%22Marcos_A._Gelabert%22_International_Airport", "keywords": "Balboa. Albrook AFS. MPLB" }, "geometry": { "type": "Point", "coordinates": [ -79.555603, 8.97334 ] } },
|
| 33 |
+
{ "type": "Feature", "properties": { "id": 31939, "ident": "MPNU", "type": "small_airport", "name": "Augusto Vergara Airport", "latitude_deg": 7.8575, "longitude_deg": -80.276167, "elevation_ft": 49.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Los Santos", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPGU", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Augusto_Vergara_Airport", "keywords": "Guararé" }, "geometry": { "type": "Point", "coordinates": [ -80.276167, 7.8575 ] } },
|
| 34 |
+
{ "type": "Feature", "properties": { "id": 42197, "ident": "MPOA", "type": "small_airport", "name": "Puerto Obaldía Airport", "latitude_deg": 8.668813, "longitude_deg": -77.417399, "elevation_ft": 223.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Puerto Obaldía", "scheduled_service": "no", "icao_code": "MPOA", "iata_code": "PUE", "gps_code": "MPOA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Puerto_Obaldia_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.417399, 8.668813 ] } },
|
| 35 |
+
{ "type": "Feature", "properties": { "id": 346878, "ident": "MPPD", "type": "small_airport", "name": "Capt. J. Montenegro Airport", "latitude_deg": 7.534801, "longitude_deg": -80.043347, "elevation_ft": 148.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Pedasí", "scheduled_service": "yes", "icao_code": null, "iata_code": "PDM", "gps_code": "MPPD", "local_code": null, "home_link": null, "wikipedia_link": "https://es.wikipedia.org/wiki/Aeropuerto_Capit%C3%A1n_Justiniano_Montenegro", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.043347, 7.534801 ] } },
|
| 36 |
+
{ "type": "Feature", "properties": { "id": 515602, "ident": "MPPT", "type": "small_airport", "name": "Punta Patiño Airstrip", "latitude_deg": 8.252816, "longitude_deg": -78.278618, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Punta Patiño", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPPT", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Punta_Pati%C3%B1o_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.278618, 8.252816 ] } },
|
| 37 |
+
{ "type": "Feature", "properties": { "id": 4792, "ident": "MPSA", "type": "medium_airport", "name": "Ruben Cantu Airport", "latitude_deg": 8.0855998992919922, "longitude_deg": -80.945297241210938, "elevation_ft": 272.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Santiago", "scheduled_service": "no", "icao_code": "MPSA", "iata_code": "SYP", "gps_code": "MPSA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ruben_Cantu_Airport", "keywords": "Santiago" }, "geometry": { "type": "Point", "coordinates": [ -80.945297241210938, 8.085599899291992 ] } },
|
| 38 |
+
{ "type": "Feature", "properties": { "id": 31940, "ident": "MPSM", "type": "small_airport", "name": "Scarlett Martinez International Airport", "latitude_deg": 8.3758802413940003, "longitude_deg": -80.127899169922003, "elevation_ft": 105.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Río Hato", "scheduled_service": "yes", "icao_code": "MPSM", "iata_code": "RIH", "gps_code": "MPSM", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/R%C3%ADo_Hato_Airport", "keywords": "MPRH, Río Hato Army Air Base, Captain Scarlett Martinez" }, "geometry": { "type": "Point", "coordinates": [ -80.127899169922003, 8.375880241394 ] } },
|
| 39 |
+
{ "type": "Feature", "properties": { "id": 4793, "ident": "MPTO", "type": "large_airport", "name": "Tocumen International Airport", "latitude_deg": 9.07136, "longitude_deg": -79.383499, "elevation_ft": 135.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Tocumen", "scheduled_service": "yes", "icao_code": "MPTO", "iata_code": "PTY", "gps_code": "MPTO", "local_code": null, "home_link": "https://www.tocumenpanama.aero/", "wikipedia_link": "https://en.wikipedia.org/wiki/Tocumen_International_Airport", "keywords": "La Joya No 1" }, "geometry": { "type": "Point", "coordinates": [ -79.383499, 9.07136 ] } },
|
| 40 |
+
{ "type": "Feature", "properties": { "id": 42187, "ident": "MPVR", "type": "small_airport", "name": "El Porvenir Airport", "latitude_deg": 9.559212, "longitude_deg": -78.946631, "elevation_ft": 17.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "El Porvenir", "scheduled_service": "no", "icao_code": "MPVR", "iata_code": "PVE", "gps_code": "MPVR", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/El_Porvenir_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.946631, 9.559212 ] } },
|
| 41 |
+
{ "type": "Feature", "properties": { "id": 32008, "ident": "MPWN", "type": "small_airport", "name": "Wannukandi Airport", "latitude_deg": 9.273476, "longitude_deg": -78.139848, "elevation_ft": 6.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "San Blas", "scheduled_service": "no", "icao_code": "MPWN", "iata_code": "NBL", "gps_code": "MPWN", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Wannukandi_Airport", "keywords": "San Blas Airport" }, "geometry": { "type": "Point", "coordinates": [ -78.139848, 9.273476 ] } },
|
| 42 |
+
{ "type": "Feature", "properties": { "id": 4794, "ident": "MPZL", "type": "small_airport", "name": "Finca 32 Airport", "latitude_deg": 9.4270896911621094, "longitude_deg": -82.562698364257812, "elevation_ft": 23.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "La Dalia", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPZL", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.562698364257812, 9.427089691162109 ] } },
|
| 43 |
+
{ "type": "Feature", "properties": { "id": 315194, "ident": "OGM", "type": "small_airport", "name": "Ogobsucum Airport", "latitude_deg": 9.1383, "longitude_deg": -77.93385, "elevation_ft": 13.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ustupu", "scheduled_service": "no", "icao_code": null, "iata_code": "OGM", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ustupu-Ogobsucum_Airport", "keywords": "Ogobsucun, Ogubsucum, Ogubsucun, Ustupo" }, "geometry": { "type": "Point", "coordinates": [ -77.93385, 9.1383 ] } },
|
| 44 |
+
{ "type": "Feature", "properties": { "id": 42182, "ident": "PA-0001", "type": "small_airport", "name": "Achutupu Airport", "latitude_deg": 9.188481, "longitude_deg": -77.994153, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mamitupu", "scheduled_service": "no", "icao_code": null, "iata_code": "ACU", "gps_code": "MPAC", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Achutupo_Airport", "keywords": "Achutupo" }, "geometry": { "type": "Point", "coordinates": [ -77.994153, 9.188481 ] } },
|
| 45 |
+
{ "type": "Feature", "properties": { "id": 42183, "ident": "PA-0002", "type": "small_airport", "name": "Aguadulce Airport", "latitude_deg": 8.2516498565673828, "longitude_deg": -80.565399169921875, "elevation_ft": 104.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Aguadulce", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.565399169921875, 8.251649856567383 ] } },
|
| 46 |
+
{ "type": "Feature", "properties": { "id": 42184, "ident": "PA-0003", "type": "small_airport", "name": "Ailigandí Airport", "latitude_deg": 9.2226, "longitude_deg": -78.0236, "elevation_ft": 55.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Isla Lorenzo Bello", "scheduled_service": "no", "icao_code": null, "iata_code": "AIL", "gps_code": "MPAI", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ailigandí_Airport", "keywords": "Ailigandi, Alligandi" }, "geometry": { "type": "Point", "coordinates": [ -78.0236, 9.2226 ] } },
|
| 47 |
+
{ "type": "Feature", "properties": { "id": 42185, "ident": "PA-0004", "type": "small_airport", "name": "Cartí Airport", "latitude_deg": 9.452863, "longitude_deg": -78.978917, "elevation_ft": 6.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Cartí Islands", "scheduled_service": "no", "icao_code": null, "iata_code": "CTE", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Cart%C3%AD_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.978917, 9.452863 ] } },
|
| 48 |
+
{ "type": "Feature", "properties": { "id": 42186, "ident": "PA-0005", "type": "small_airport", "name": "Corazón de Jesús Airport", "latitude_deg": 9.0172195434570312, "longitude_deg": -77.980697631835938, "elevation_ft": 1008.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Nurna", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.980697631835938, 9.017219543457031 ] } },
|
| 49 |
+
{ "type": "Feature", "properties": { "id": 42188, "ident": "PA-0006", "type": "small_airport", "name": "Finca Blanco Airport", "latitude_deg": 8.389832, "longitude_deg": -82.870847, "elevation_ft": 72.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Finca Blanco", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.870847, 8.389832 ] } },
|
| 50 |
+
{ "type": "Feature", "properties": { "id": 42189, "ident": "PA-0007", "type": "small_airport", "name": "Finca Fátima Airport", "latitude_deg": 8.388027, "longitude_deg": -82.748509, "elevation_ft": 26.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Finca Fátima", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.748509, 8.388027 ] } },
|
| 51 |
+
{ "type": "Feature", "properties": { "id": 42191, "ident": "PA-0008", "type": "small_airport", "name": "La Joya Airport", "latitude_deg": 9.1385602951049805, "longitude_deg": -79.240196228027344, "elevation_ft": 96.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "La Joya", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.240196228027344, 9.13856029510498 ] } },
|
| 52 |
+
{ "type": "Feature", "properties": { "id": 42192, "ident": "PA-0009", "type": "small_airport", "name": "La Plantación Airport", "latitude_deg": 7.6628899574279794, "longitude_deg": -81.006103515625, "elevation_ft": 21.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "La Plantación", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.006103515625, 7.662889957427979 ] } },
|
| 53 |
+
{ "type": "Feature", "properties": { "id": 42193, "ident": "PA-0010", "type": "small_airport", "name": "Mandinga Airport", "latitude_deg": 9.454635, "longitude_deg": -79.086507, "elevation_ft": 38.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mandinga", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.086507, 9.454635 ] } },
|
| 54 |
+
{ "type": "Feature", "properties": { "id": 42194, "ident": "PA-0011", "type": "small_airport", "name": "Mulatupo Airport", "latitude_deg": 8.945487, "longitude_deg": -77.733486, "elevation_ft": 15.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mulatupo", "scheduled_service": "no", "icao_code": null, "iata_code": "MPP", "gps_code": "MPMU", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Mulatupo_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.733486, 8.945487 ] } },
|
| 55 |
+
{ "type": "Feature", "properties": { "id": 42195, "ident": "PA-0012", "type": "closed", "name": "Narganá Airport", "latitude_deg": 9.444659, "longitude_deg": -78.588896, "elevation_ft": 7.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "NGN, Corazón de Jesús" }, "geometry": { "type": "Point", "coordinates": [ -78.588896, 9.444659 ] } },
|
| 56 |
+
{ "type": "Feature", "properties": { "id": 42196, "ident": "PA-0013", "type": "small_airport", "name": "Playón Chico Airport", "latitude_deg": 9.30692, "longitude_deg": -78.235273, "elevation_ft": 18.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ukupseni", "scheduled_service": "no", "icao_code": null, "iata_code": "PYC", "gps_code": "MPPH", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Play%C3%B3n_Chico_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.235273, 9.30692 ] } },
|
| 57 |
+
{ "type": "Feature", "properties": { "id": 42198, "ident": "PA-0014", "type": "small_airport", "name": "Río Azúcar Airport", "latitude_deg": 9.4247, "longitude_deg": -78.6269, "elevation_ft": 12.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Río Azúcar", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.6269, 9.4247 ] } },
|
| 58 |
+
{ "type": "Feature", "properties": { "id": 42199, "ident": "PA-0015", "type": "small_airport", "name": "Rio Sidra Airport", "latitude_deg": 9.3167896270751953, "longitude_deg": -79.282997131347656, "elevation_ft": 2719.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Rio Sidra", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.282997131347656, 9.316789627075195 ] } },
|
| 59 |
+
{ "type": "Feature", "properties": { "id": 42200, "ident": "PA-0016", "type": "small_airport", "name": "Río Tigre Airport", "latitude_deg": 9.2508802413940447, "longitude_deg": -78.498703002929688, "elevation_ft": 1095.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Río Tigre", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.498703002929688, 9.250880241394045 ] } },
|
| 60 |
+
{ "type": "Feature", "properties": { "id": 42201, "ident": "PA-0017", "type": "small_airport", "name": "San Miguel Airport", "latitude_deg": 8.456507, "longitude_deg": -78.934214, "elevation_ft": 70.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Isla del Rey", "scheduled_service": "no", "icao_code": null, "iata_code": "NMG", "gps_code": "MPMI", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/San_Miguel_Airport,_Panama", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.934214, 8.456507 ] } },
|
| 61 |
+
{ "type": "Feature", "properties": { "id": 42202, "ident": "PA-0018", "type": "small_airport", "name": "Tubualá Airport", "latitude_deg": 8.918601, "longitude_deg": -77.709182, "elevation_ft": 20.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Coetupo", "scheduled_service": "no", "icao_code": null, "iata_code": "TUW", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Tubual%C3%A1_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.709182, 8.918601 ] } },
|
| 62 |
+
{ "type": "Feature", "properties": { "id": 42203, "ident": "PA-0019", "type": "small_airport", "name": "Tupile Airport", "latitude_deg": 9.2465801239013672, "longitude_deg": -78.362503051757812, "elevation_ft": 1374.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.362503051757812, 9.246580123901367 ] } },
|
| 63 |
+
{ "type": "Feature", "properties": { "id": 342550, "ident": "PA-0020", "type": "small_airport", "name": "Coral Lodge Airport", "latitude_deg": 9.55488, "longitude_deg": -79.13786, "elevation_ft": 20.0, "continent": null, "iso_country": "PA", "iso_region": "PA-3", "municipality": "Santa Isabel", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.13786, 9.55488 ] } },
|
| 64 |
+
{ "type": "Feature", "properties": { "id": 42205, "ident": "PA-0021", "type": "closed", "name": "Ailigandí North Airport", "latitude_deg": 9.23903, "longitude_deg": -78.03922, "elevation_ft": 19.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ailigandí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.03922, 9.23903 ] } },
|
| 65 |
+
{ "type": "Feature", "properties": { "id": 42206, "ident": "PA-0022", "type": "small_airport", "name": "Yaviza Airport", "latitude_deg": 8.1528, "longitude_deg": -77.687, "elevation_ft": 75.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Yaviza", "scheduled_service": "no", "icao_code": null, "iata_code": "PYV", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Yaviza_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.687, 8.1528 ] } },
|
| 66 |
+
{ "type": "Feature", "properties": { "id": 315017, "ident": "PA-0023", "type": "closed", "name": "Isla Tigre Airstrip", "latitude_deg": 9.4339, "longitude_deg": -78.5235, "elevation_ft": 7.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mamartupu", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.5235, 9.4339 ] } },
|
| 67 |
+
{ "type": "Feature", "properties": { "id": 316550, "ident": "PA-0024", "type": "small_airport", "name": "Coiba Airport", "latitude_deg": 7.5068, "longitude_deg": -81.6981, "elevation_ft": 255.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Isla de Coiba", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.6981, 7.5068 ] } },
|
| 68 |
+
{ "type": "Feature", "properties": { "id": 316551, "ident": "PA-0025", "type": "small_airport", "name": "Arenas Airport", "latitude_deg": 7.3713, "longitude_deg": -80.846, "elevation_ft": 85.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Arenas", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPAR", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.846, 7.3713 ] } },
|
| 69 |
+
{ "type": "Feature", "properties": { "id": 316553, "ident": "PA-0026", "type": "small_airport", "name": "Tonosí Airport", "latitude_deg": 7.4148, "longitude_deg": -80.4466, "elevation_ft": 55.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Tonosí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.4466, 7.4148 ] } },
|
| 70 |
+
{ "type": "Feature", "properties": { "id": 316554, "ident": "PA-0027", "type": "small_airport", "name": "Candelaria Airport", "latitude_deg": 7.7326, "longitude_deg": -80.1403, "elevation_ft": 65.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "La Candelaria", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.1403, 7.7326 ] } },
|
| 71 |
+
{ "type": "Feature", "properties": { "id": 342551, "ident": "PA-0028", "type": "small_airport", "name": "Nusatupo Airport", "latitude_deg": 9.43392, "longitude_deg": -78.83173, "elevation_ft": 18.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Nusatupo", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.83173, 9.43392 ] } },
|
| 72 |
+
{ "type": "Feature", "properties": { "id": 342552, "ident": "PA-0029", "type": "small_airport", "name": "Wannukandi Airport", "latitude_deg": 9.273166, "longitude_deg": -78.139873, "elevation_ft": 13.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Wannukandi", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.139873, 9.273166 ] } },
|
| 73 |
+
{ "type": "Feature", "properties": { "id": 342553, "ident": "PA-0030", "type": "small_airport", "name": "Mansukun Airport", "latitude_deg": 9.05011, "longitude_deg": -77.80985, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mansukum", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.80985, 9.05011 ] } },
|
| 74 |
+
{ "type": "Feature", "properties": { "id": 342554, "ident": "PA-0031", "type": "closed", "name": "Napakanti Airport", "latitude_deg": 9.012796, "longitude_deg": -77.802531, "elevation_ft": 66.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Napakanti", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.802531, 9.012796 ] } },
|
| 75 |
+
{ "type": "Feature", "properties": { "id": 342555, "ident": "PA-0032", "type": "small_airport", "name": "Caledonia Airport", "latitude_deg": 8.90201, "longitude_deg": -77.69286, "elevation_ft": 3.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Suletupu", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCA", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.69286, 8.90201 ] } },
|
| 76 |
+
{ "type": "Feature", "properties": { "id": 430649, "ident": "PA-0033", "type": "heliport", "name": "Soloy Heliport", "latitude_deg": 8.4831, "longitude_deg": -82.0816, "elevation_ft": 424.0, "continent": null, "iso_country": "PA", "iso_region": "PA-NB", "municipality": "Soloy", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.0816, 8.4831 ] } },
|
| 77 |
+
{ "type": "Feature", "properties": { "id": 505196, "ident": "PA-0034", "type": "closed", "name": "Aidirgandí Airport", "latitude_deg": 9.35515, "longitude_deg": -78.34587, "elevation_ft": 23.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Aidirgandí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.34587, 9.35515 ] } },
|
| 78 |
+
{ "type": "Feature", "properties": { "id": 505212, "ident": "PA-0035", "type": "closed", "name": "Ingenio Las Cabras Airstrip", "latitude_deg": 7.90044, "longitude_deg": -80.540391, "elevation_ft": 112.0, "continent": null, "iso_country": "PA", "iso_region": "PA-6", "municipality": "Las Cabras", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.540391, 7.90044 ] } },
|
| 79 |
+
{ "type": "Feature", "properties": { "id": 506050, "ident": "PA-0036", "type": "closed", "name": "Punta Hermosa Airstrip", "latitude_deg": 7.527853, "longitude_deg": -81.849575, "elevation_ft": 250.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Isla de Coiba", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.849575, 7.527853 ] } },
|
| 80 |
+
{ "type": "Feature", "properties": { "id": 506051, "ident": "PA-0037", "type": "small_airport", "name": "Coibito Landing Airstrip", "latitude_deg": 7.639068, "longitude_deg": -81.702433, "elevation_ft": null, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Isla Rancheria", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.702433, 7.639068 ] } },
|
| 81 |
+
{ "type": "Feature", "properties": { "id": 506052, "ident": "PA-0038", "type": "small_airport", "name": "Pixvae Airstrip", "latitude_deg": 7.841248, "longitude_deg": -81.567301, "elevation_ft": 56.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Pixvae", "scheduled_service": "yes", "icao_code": null, "iata_code": null, "gps_code": "MPPX", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "Pixbae, Pifa, Piba, Pejibaye" }, "geometry": { "type": "Point", "coordinates": [ -81.567301, 7.841248 ] } },
|
| 82 |
+
{ "type": "Feature", "properties": { "id": 506053, "ident": "PA-0039", "type": "closed", "name": "Filipinas Airstrip", "latitude_deg": 7.728211, "longitude_deg": -81.262396, "elevation_ft": 59.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Carrizal", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.262396, 7.728211 ] } },
|
| 83 |
+
{ "type": "Feature", "properties": { "id": 506054, "ident": "PA-0040", "type": "closed", "name": "La Providencia Airstrip", "latitude_deg": 7.8878, "longitude_deg": -80.978748, "elevation_ft": 121.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Ponuga", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.978748, 7.8878 ] } },
|
| 84 |
+
{ "type": "Feature", "properties": { "id": 506055, "ident": "PA-0041", "type": "closed", "name": "Limones Airstrip", "latitude_deg": 7.619267, "longitude_deg": -80.946937, "elevation_ft": 141.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Limones", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.946937, 7.619267 ] } },
|
| 85 |
+
{ "type": "Feature", "properties": { "id": 5322, "ident": "PA-0042", "type": "closed", "name": "Pedasí Airport", "latitude_deg": 7.55688, "longitude_deg": -80.0233, "elevation_ft": 16.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Pedasí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "PDM, Capt Justiniano Montenegro, MP00" }, "geometry": { "type": "Point", "coordinates": [ -80.0233, 7.55688 ] } },
|
| 86 |
+
{ "type": "Feature", "properties": { "id": 32164, "ident": "PA-0043", "type": "closed", "name": "Captain Ramon Xatruch Airport", "latitude_deg": 8.40667, "longitude_deg": -78.141701, "elevation_ft": 30.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "La Palma", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "MPLP, MPLP, PLP" }, "geometry": { "type": "Point", "coordinates": [ -78.141701, 8.40667 ] } },
|
| 87 |
+
{ "type": "Feature", "properties": { "id": 315016, "ident": "PA-0044", "type": "closed", "name": "Tupile Airport", "latitude_deg": 9.45, "longitude_deg": -78.566667, "elevation_ft": 5.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Isla Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "TUE" }, "geometry": { "type": "Point", "coordinates": [ -78.566667, 9.45 ] } },
|
| 88 |
+
{ "type": "Feature", "properties": { "id": 30640, "ident": "PA-AML", "type": "small_airport", "name": "Puerto Armuelles Airport", "latitude_deg": 8.267667, "longitude_deg": -82.864537, "elevation_ft": 42.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Puerto Armuelles", "scheduled_service": "no", "icao_code": null, "iata_code": "AML", "gps_code": null, "local_code": null, "home_link": "https://visitpuertoarmuelles.com/airport-update-for-puerto-armuelles", "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.864537, 8.267667 ] } },
|
| 89 |
+
{ "type": "Feature", "properties": { "id": 35194, "ident": "PA-BFQ", "type": "small_airport", "name": "Bahia Piña Airport", "latitude_deg": 7.58737, "longitude_deg": -78.179939, "elevation_ft": 14.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Puerto Piña", "scheduled_service": "yes", "icao_code": null, "iata_code": "BFQ", "gps_code": "MPPI", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Bah%C3%ADa_Pi%C3%B1a_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.179939, 7.58737 ] } },
|
| 90 |
+
{ "type": "Feature", "properties": { "id": 35196, "ident": "PA-ELE", "type": "small_airport", "name": "EL Real Airport", "latitude_deg": 8.107235, "longitude_deg": -77.725545, "elevation_ft": 65.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "El Real de Santa María", "scheduled_service": "no", "icao_code": null, "iata_code": "ELE", "gps_code": "MPRE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/El_Real_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.725545, 8.107235 ] } },
|
| 91 |
+
{ "type": "Feature", "properties": { "id": 42181, "ident": "PA-MRF", "type": "small_airport", "name": "Miraflores Airport", "latitude_deg": 8.338889, "longitude_deg": -78.131944, "elevation_ft": 32.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Miraflores", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPMF", "local_code": "MRF", "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Miraflores_Airport,_Dari%C3%A9n", "keywords": "MPSE" }, "geometry": { "type": "Point", "coordinates": [ -78.131944, 8.338889 ] } },
|
| 92 |
+
{ "type": "Feature", "properties": { "id": 35195, "ident": "PA-OTD", "type": "small_airport", "name": "Raul Arias Espinoza Airport", "latitude_deg": 8.62876, "longitude_deg": -79.034698, "elevation_ft": 43.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Contadora Island", "scheduled_service": "yes", "icao_code": null, "iata_code": "OTD", "gps_code": "MPRA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Contadora_Airport", "keywords": "Contadora Airport" }, "geometry": { "type": "Point", "coordinates": [ -79.034698, 8.62876 ] } },
|
| 93 |
+
{ "type": "Feature", "properties": { "id": 35197, "ident": "PA-SAX", "type": "small_airport", "name": "Sambú Airport", "latitude_deg": 8.026279, "longitude_deg": -78.209555, "elevation_ft": 32.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Boca de Sábalo", "scheduled_service": "no", "icao_code": null, "iata_code": "SAX", "gps_code": "MPSB", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Sambú_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.209555, 8.026279 ] } },
|
| 94 |
+
{ "type": "Feature", "properties": { "id": 316552, "ident": "SIC", "type": "small_airport", "name": "San José Island Airport", "latitude_deg": 8.2622, "longitude_deg": -79.078, "elevation_ft": 150.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Las Perlas", "scheduled_service": "no", "icao_code": null, "iata_code": "SIC", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/San_José_Airport_(Las_Perlas)", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.078, 8.2622 ] } },
|
| 95 |
+
{ "type": "Feature", "properties": { "id": 315014, "ident": "TJC", "type": "small_airport", "name": "Ticantiquí Airport", "latitude_deg": 9.4185, "longitude_deg": -78.4896, "elevation_ft": 17.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ticantiquí", "scheduled_service": "no", "icao_code": null, "iata_code": "TJC", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "Tikantiki" }, "geometry": { "type": "Point", "coordinates": [ -78.4896, 9.4185 ] } },
|
| 96 |
+
{ "type": "Feature", "properties": { "id": 315193, "ident": "UTU", "type": "small_airport", "name": "Ustupu Airport", "latitude_deg": 9.1283, "longitude_deg": -77.9337, "elevation_ft": 9.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ustupu", "scheduled_service": "no", "icao_code": null, "iata_code": "UTU", "gps_code": "MPUP", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ustupo_Airport", "keywords": "Ustupo" }, "geometry": { "type": "Point", "coordinates": [ -77.9337, 9.1283 ] } }
|
| 97 |
+
]
|
| 98 |
+
}
|
backend/main.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GeoQuery API entry point.

Builds the FastAPI application, configures CORS, mounts the versioned API
router under /api/v1, and — when a frontend build is bundled in
``backend/static`` — serves it as a single-page application.
"""

from contextlib import asynccontextmanager
from pathlib import Path

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles

from backend.core.database import init_db
from backend.api.api import api_router


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application startup/shutdown hook."""
    # Startup: a failed DB init is deliberately non-fatal so the API can
    # still boot in demo ("MOCK") mode without a database available.
    try:
        await init_db()
    except Exception as e:
        print(f"WARNING: Database initialization failed. Running in MOCK mode. Error: {e}")
    yield
    # Shutdown: nothing to clean up yet.


app = FastAPI(
    title="GeoQuery API",
    description="Geospatial Analysis Agent API",
    version="0.1.0",
    lifespan=lifespan
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all for dev
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Registered before the static catch-all below: FastAPI matches routes in
# declaration order, so /api/v1/* requests always win over the SPA fallback.
app.include_router(api_router, prefix="/api/v1")

# Serve static files (Frontend)
static_dir = Path(__file__).parent / "static"

if static_dir.exists():
    app.mount("/_next", StaticFiles(directory=static_dir / "_next"), name="next")

    @app.get("/{full_path:path}")
    async def serve_frontend(full_path: str):
        """Serve a frontend asset, falling back to index.html for SPA routes."""
        # Resolve the request against the static root and reject anything
        # that escapes it (e.g. "../../etc/passwd") — the catch-all path
        # parameter is untrusted input.
        base = static_dir.resolve()
        file_path = (base / full_path).resolve()
        if file_path.is_file() and file_path.is_relative_to(base):
            return FileResponse(file_path)

        # Fallback to index.html for SPA routing
        index_path = base / "index.html"
        if index_path.exists():
            return FileResponse(index_path)
        return {"error": "Frontend not found"}
else:
    @app.get("/")
    def read_root():
        """Minimal health message when no frontend build is bundled."""
        return {"message": "GeoQuery API is running (Frontend not built)"}
|
backend/pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "geoquery-backend"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Backend for GeoQuery AI Platform"
|
| 5 |
+
authors = ["Admin <admin@geoquery.com>"]
|
| 6 |
+
|
| 7 |
+
[tool.poetry.dependencies]
|
| 8 |
+
python = "^3.10"
|
| 9 |
+
fastapi = "^0.109.0"
|
| 10 |
+
uvicorn = "^0.27.0"
|
| 11 |
+
sqlmodel = "^0.0.14"
|
| 12 |
+
asyncpg = "^0.29.0"
|
| 13 |
+
geoalchemy2 = "^0.14.3"
|
| 14 |
+
python-multipart = "^0.0.6"
|
| 15 |
+
httpx = "^0.26.0"
|
| 16 |
+
duckdb = "^1.1.0"
|
| 17 |
+
pandas = "^2.0.0"
|
| 18 |
+
google-genai = "^0.1.0"
|
| 19 |
+
google-generativeai = "^0.3.0"
|
| 20 |
+
sentence-transformers = "^2.2.0"
|
| 21 |
+
scikit-learn = "^1.3.0"
|
| 22 |
+
numpy = "^1.26.0"
|
| 23 |
+
python-dotenv = "^1.0.0"
|
| 24 |
+
shapely = "^2.0.0"
|
| 25 |
+
|
| 26 |
+
[tool.poetry.dev-dependencies]
|
| 27 |
+
pytest = "^8.0.0"
|
| 28 |
+
|
| 29 |
+
[build-system]
|
| 30 |
+
requires = ["poetry-core>=1.0.0"]
|
| 31 |
+
build-backend = "poetry.core.masonry.api"
|
backend/requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.109.0
|
| 2 |
+
uvicorn>=0.27.0
|
| 3 |
+
sqlmodel>=0.0.14
|
| 4 |
+
asyncpg>=0.29.0
|
| 5 |
+
geoalchemy2>=0.14.3
|
| 6 |
+
python-multipart>=0.0.6
|
| 7 |
+
httpx>=0.26.0
|
| 8 |
+
duckdb>=1.1.0
|
| 9 |
+
pandas>=2.0.0
|
| 10 |
+
google-genai>=0.1.0
|
| 11 |
+
google-generativeai>=0.3.0
|
| 12 |
+
sentence-transformers>=2.2.0
|
| 13 |
+
scikit-learn>=1.3.0
|
| 14 |
+
numpy>=1.26.0
|
| 15 |
+
python-dotenv>=1.0.0
|
| 16 |
+
shapely>=2.0.0
|
| 17 |
+
geopandas>=0.14.0
|
| 18 |
+
requests>=2.31.0
|
backend/scripts/create_province_layer.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Create province-level socio-economic layer for Panama
|
| 4 |
+
Uses known data from research (MPI, Census highlights) joined to admin boundaries
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import geopandas as gpd
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import logging
|
| 11 |
+
import json
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
DATA_DIR = Path(__file__).parent.parent / "data"
|
| 17 |
+
BASE_DIR = DATA_DIR / "base"
|
| 18 |
+
OUTPUT_DIR = DATA_DIR / "socioeconomic"
|
| 19 |
+
|
| 20 |
+
# Province-level data from MPI and Census research
# Sources: INEC MPI 2017, Censo 2023 highlights, World Bank Poverty Assessment
# Keyed by canonical province/comarca name. Every entry carries
# mpi_poverty_pct and population_2023; avg_income_pab, disability_rate and
# note are only present where a figure was found in the source material.
PROVINCE_DATA: dict[str, dict] = {
    "Bocas del Toro": {
        "mpi_poverty_pct": 75.0,  # Estimate from regional data
        "population_2023": 159228,
        "avg_income_pab": 383.14,
        "disability_rate": 3.21
    },
    "Coclé": {
        "mpi_poverty_pct": 35.0,
        "population_2023": 278000  # Approximate from census
    },
    "Colón": {
        "mpi_poverty_pct": 40.0,
        "population_2023": 283000
    },
    "Chiriquí": {
        "mpi_poverty_pct": 30.0,
        "population_2023": 498000
    },
    "Darién": {
        "mpi_poverty_pct": 65.0,
        "population_2023": 57000
    },
    "Herrera": {
        "mpi_poverty_pct": 25.0,
        "population_2023": 123000
    },
    "Los Santos": {
        "mpi_poverty_pct": 22.0,
        "population_2023": 97000
    },
    "Panamá": {
        "mpi_poverty_pct": 15.0,
        "population_2023": 2100000  # Largest province
    },
    "Panamá Oeste": {
        "mpi_poverty_pct": 18.0,
        "population_2023": 550000
    },
    "Veraguas": {
        "mpi_poverty_pct": 45.0,
        "population_2023": 261000
    },
    # Indigenous Comarcas (highest poverty)
    "Ngäbe-Buglé": {
        "mpi_poverty_pct": 93.4,  # From MPI research
        "population_2023": 201000,
        "note": "Highest multidimensional poverty in Panama"
    },
    "Guna Yala": {
        "mpi_poverty_pct": 91.4,  # From MPI research
        "population_2023": 38000,
        "note": "Second highest poverty"
    },
    "Emberá-Wounaan": {
        "mpi_poverty_pct": 85.0,  # Estimate
        "population_2023": 10000
    }
}
|
| 81 |
+
|
| 82 |
+
def load_admin1():
    """Read Panama's admin-1 (province) boundary polygons from the base data dir."""
    source = BASE_DIR / "pan_admin1.geojson"
    boundaries = gpd.read_file(source)
    logger.info(f"Loaded {len(boundaries)} province boundaries")
    return boundaries
|
| 88 |
+
|
| 89 |
+
def create_province_layer():
    """Join province-level socioeconomic indicators onto admin-1 boundaries.

    Writes a GeoJSON file to OUTPUT_DIR and returns its path.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load boundaries
    boundaries = load_admin1()

    # One tabular row per province, flattening the PROVINCE_DATA dicts
    indicators = pd.DataFrame(
        [{"province_name": name, **values} for name, values in PROVINCE_DATA.items()]
    )
    logger.info(f"Created data for {len(indicators)} provinces")

    # Normalise boundary names before joining (the admin file uses 'adm1_name')
    boundaries['province_clean'] = boundaries['adm1_name'].str.strip()

    # Map alternative spellings found in boundary files onto our canonical keys
    name_mapping = {
        "Ngöbe-Buglé": "Ngäbe-Buglé",
        "Ngöbe Buglé": "Ngäbe-Buglé",
        "Comarca Ngöbe-Buglé": "Ngäbe-Buglé",
        "Kuna Yala": "Guna Yala",
        "Comarca Guna Yala": "Guna Yala",
        "Comarca Kuna Yala": "Guna Yala",
        "Emberá": "Emberá-Wounaan",
        "Comarca Emberá-Wounaan": "Emberá-Wounaan",
        "Comarca Emberá": "Emberá-Wounaan",
    }
    boundaries['province_match'] = boundaries['province_clean'].replace(name_mapping)

    merged = boundaries.merge(
        indicators,
        left_on='province_match',
        right_on='province_name',
        how='left',
    )

    # Report join coverage so silent name mismatches are visible in the logs
    matched = merged['mpi_poverty_pct'].notna().sum()
    logger.info(f"Successfully joined {matched}/{len(merged)} provinces")
    if matched < len(merged):
        unmatched = merged[merged['mpi_poverty_pct'].isna()]['adm1_name'].tolist()
        logger.warning(f"Unmatched provinces: {unmatched}")

    # Keep only the columns the catalog layer exposes
    keep = [
        'adm1_name', 'adm1_pcode', 'area_sqkm',
        'mpi_poverty_pct', 'population_2023', 'avg_income_pab', 'disability_rate', 'note',
        'geometry',
    ]
    result = merged[keep].copy()

    output_file = OUTPUT_DIR / "province_socioeconomic.geojson"
    result.to_file(output_file, driver='GeoJSON')

    logger.info(f"Created province layer: {output_file}")
    logger.info(f"  - {matched} provinces with MPI data")
    logger.info(f"  - {result['population_2023'].notna().sum()} with population")

    return output_file
|
| 156 |
+
|
| 157 |
+
def update_catalog(geojson_path):
    """Register the generated province layer in the dataset catalog (catalog.json)."""
    catalog_path = DATA_DIR / "catalog.json"

    with open(catalog_path, 'r') as f:
        catalog = json.load(f)

    # Overwrite (or create) the entry for this layer
    entry = {
        "path": str(geojson_path.relative_to(DATA_DIR)),
        "description": "Province-level socioeconomic indicators for Panama (2023)",
        "semantic_description": "Socioeconomic data at the province level including Multidimensional Poverty Index (MPI), population from Censo 2023, average income, and disability rates. Shows dramatic geographic inequality: Ngäbe-Buglé comarca has 93.4% poverty vs 15% in Panamá province. Use for analyzing regional disparities in poverty, development, and demographics.",
        "tags": [
            "socioeconomic",
            "poverty",
            "mpi",
            "census",
            "province",
            "admin1",
            "demographics",
            "inequality",
            "panama",
        ],
        "data_type": "static",
        "category": "socioeconomic",
        "format": "geojson",
    }
    catalog["province_socioeconomic"] = entry

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)

    logger.info("Updated catalog.json")
|
| 188 |
+
|
| 189 |
+
def main():
    """Generate the province socioeconomic layer and register it in the catalog."""
    logger.info("Creating province socioeconomic layer...")
    layer_path = create_province_layer()
    update_catalog(layer_path)
    logger.info("Complete!")


if __name__ == "__main__":
    main()
|
backend/scripts/download_geofabrik.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Panama Data Ingestion - Phase A: OpenStreetMap via Geofabrik
|
| 3 |
+
|
| 4 |
+
Downloads pre-packaged OSM data for Panama as shapefiles and converts to GeoJSON.
|
| 5 |
+
Data source: https://download.geofabrik.de/central-america.html
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
import zipfile
|
| 11 |
+
import requests
|
| 12 |
+
import subprocess
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
# Panama Geofabrik URL
|
| 16 |
+
GEOFABRIK_URL = "https://download.geofabrik.de/central-america/panama-latest-free.shp.zip"
|
| 17 |
+
|
| 18 |
+
# Output directories
|
| 19 |
+
DATA_DIR = Path(__file__).parent.parent / "data"
|
| 20 |
+
OSM_DIR = DATA_DIR / "osm"
|
| 21 |
+
TEMP_DIR = DATA_DIR / "temp"
|
| 22 |
+
|
| 23 |
+
# OSM layers to extract
|
| 24 |
+
OSM_LAYERS = [
|
| 25 |
+
("gis_osm_roads_free_1", "roads", "Road network with classification"),
|
| 26 |
+
("gis_osm_pois_free_1", "pois", "Points of interest (restaurants, shops, etc.)"),
|
| 27 |
+
("gis_osm_pois_a_free_1", "pois_areas", "POI areas (larger venues)"),
|
| 28 |
+
("gis_osm_buildings_a_free_1", "buildings", "Building footprints"),
|
| 29 |
+
("gis_osm_landuse_a_free_1", "landuse", "Land use zones (residential, commercial, etc.)"),
|
| 30 |
+
("gis_osm_natural_free_1", "natural_points", "Natural features (trees, peaks)"),
|
| 31 |
+
("gis_osm_natural_a_free_1", "natural_areas", "Natural areas (forests, parks)"),
|
| 32 |
+
("gis_osm_water_a_free_1", "water_areas", "Water bodies (lakes, reservoirs)"),
|
| 33 |
+
("gis_osm_waterways_free_1", "waterways", "Rivers and streams"),
|
| 34 |
+
("gis_osm_railways_free_1", "railways", "Railway lines"),
|
| 35 |
+
("gis_osm_traffic_free_1", "traffic", "Traffic infrastructure (signals, crossings)"),
|
| 36 |
+
("gis_osm_traffic_a_free_1", "traffic_areas", "Traffic areas (parking lots)"),
|
| 37 |
+
("gis_osm_transport_free_1", "transport", "Transport points (bus stops, stations)"),
|
| 38 |
+
("gis_osm_transport_a_free_1", "transport_areas", "Transport areas (airports, ports)"),
|
| 39 |
+
("gis_osm_places_free_1", "places", "Place names (cities, towns, villages)"),
|
| 40 |
+
("gis_osm_places_a_free_1", "places_areas", "Place areas"),
|
| 41 |
+
("gis_osm_pofw_free_1", "places_of_worship", "Places of worship"),
|
| 42 |
+
("gis_osm_pofw_a_free_1", "places_of_worship_areas", "Places of worship (buildings)"),
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def download_file(url: str, dest: Path) -> bool:
    """Download a file with progress indication.

    Args:
        url: HTTP(S) URL to fetch.
        dest: Destination file path (parent directory must already exist).

    Returns:
        True on success, False on any network or filesystem failure.
    """
    print(f"📥 Downloading {url}...")

    try:
        # Fix: give the request a timeout so a stalled server cannot hang
        # the script forever (10s to connect, 300s between read chunks).
        response = requests.get(url, stream=True, timeout=(10, 300))
        response.raise_for_status()

        # content-length may be absent (chunked responses); 0 disables the % display
        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0

        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    pct = (downloaded / total_size) * 100
                    print(f"\r Progress: {pct:.1f}% ({downloaded // 1024 // 1024}MB)", end="")

        print(f"\n✅ Downloaded to {dest}")
        return True

    except Exception as e:
        # Broad catch is deliberate: this is a best-effort CLI step and the
        # caller only needs a boolean success signal.
        print(f"❌ Download failed: {e}")
        return False
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def convert_shp_to_geojson(shp_path: Path, geojson_path: Path) -> bool:
    """Convert a shapefile to WGS84 GeoJSON using the GDAL ogr2ogr CLI.

    Returns True when ogr2ogr exits cleanly, False when it fails or when
    the GDAL tools are not installed at all.
    """
    command = [
        "ogr2ogr",
        "-f", "GeoJSON",
        "-t_srs", "EPSG:4326",  # Ensure WGS84
        str(geojson_path),
        str(shp_path),
    ]

    try:
        proc = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        # GDAL is a system package, not a pip dependency — tell the user how to get it.
        print("⚠️ ogr2ogr not found. Please install GDAL:")
        print(" brew install gdal # macOS")
        print(" apt install gdal-bin # Ubuntu")
        return False

    if proc.returncode != 0:
        print(f" ogr2ogr error: {proc.stderr}")
        return False
    return True
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def extract_and_convert():
    """Extract shapefiles from zip and convert to GeoJSON.

    Downloads the Geofabrik archive (unless a cached copy exists in
    TEMP_DIR), unzips it, then converts every layer listed in OSM_LAYERS
    to a WGS84 GeoJSON under OSM_DIR.

    Returns:
        True if at least one layer converted successfully, else False.
    """

    # Ensure directories exist
    OSM_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    zip_path = TEMP_DIR / "panama-osm.zip"

    # Download if not exists (re-runs reuse the cached archive)
    if not zip_path.exists():
        if not download_file(GEOFABRIK_URL, zip_path):
            return False
    else:
        print(f"📦 Using cached {zip_path}")

    # Extract
    print(f"📂 Extracting to {TEMP_DIR}...")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(TEMP_DIR)

    # Convert each layer
    converted = 0
    for shp_name, output_name, description in OSM_LAYERS:
        shp_path = TEMP_DIR / f"{shp_name}.shp"
        geojson_path = OSM_DIR / f"{output_name}.geojson"

        # Geofabrik bundles occasionally omit a layer; skip rather than fail
        if not shp_path.exists():
            print(f"⏭️ Skipping {shp_name} (not in download)")
            continue

        print(f"🔄 Converting {shp_name} → {output_name}.geojson...")

        if convert_shp_to_geojson(shp_path, geojson_path):
            # Get file size (for the progress report only)
            size_mb = geojson_path.stat().st_size / 1024 / 1024
            print(f" ✅ Created {geojson_path.name} ({size_mb:.1f}MB)")
            converted += 1
        else:
            print(f" ❌ Failed to convert {shp_name}")

    print(f"\n🎉 Converted {converted}/{len(OSM_LAYERS)} OSM layers")
    return converted > 0
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def register_in_catalog():
    """Add one catalog entry per converted OSM GeoJSON layer.

    Loads the existing catalog.json when present, merges in entries for
    every layer that exists on disk, and writes the catalog back.
    """
    import json

    catalog_path = DATA_DIR / "catalog.json"

    # Merge into the existing catalog rather than clobbering it.
    catalog = {}
    if catalog_path.exists():
        with open(catalog_path) as f:
            catalog = json.load(f)

    for _shp_name, layer_key, layer_desc in OSM_LAYERS:
        layer_file = OSM_DIR / f"{layer_key}.geojson"

        # Only register layers that were actually produced.
        if not layer_file.exists():
            continue

        table_name = f"osm_{layer_key}"
        catalog[table_name] = {
            "source_file": f"osm/{layer_key}.geojson",
            "source_type": "geojson",
            "description": f"OpenStreetMap {layer_desc} for Panama",
            "tags": ["osm", "panama", layer_key.replace("_", " ")],
            "data_type": "vector",
            "geometry_type": "auto"  # Will be detected on load
        }

        print(f"📝 Registered {table_name}")

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)

    print(f"✅ Updated catalog with OSM datasets")
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
if __name__ == "__main__":
    print("=" * 60)
    print("🗺️ Panama OSM Data Ingestion (Geofabrik)")
    print("=" * 60)

    # Only touch the catalog if at least one layer converted successfully.
    if extract_and_convert():
        register_in_catalog()
        print("\n🚀 OSM data ready! Restart the backend to load new datasets.")
    else:
        print("\n❌ Ingestion failed")
        # Non-zero exit so shell/CI callers can detect the failure.
        sys.exit(1)
|
backend/scripts/download_global_datasets.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Download global geo-referenced datasets for Panama
|
| 4 |
+
- OurAirports: Global airport database
|
| 5 |
+
- WRI Global Power Plant Database
|
| 6 |
+
- Other infrastructure datasets
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import geopandas as gpd
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# All downloads land under backend/data/global/<dataset>/
DATA_DIR = Path(__file__).parent.parent / "data" / "global"

# Dataset URLs
# Each entry maps a dataset key to its direct CSV download URL plus a
# short description of the upstream source.
DATASETS = {
    "airports": {
        "url": "https://davidmegginson.github.io/ourairports-data/airports.csv",
        "description": "OurAirports - Global airport database"
    },
    "power_plants": {
        "url": "https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3/global_power_plant_database.csv",
        "description": "WRI Global Power Plant Database v1.3"
    }
}
|
| 31 |
+
|
| 32 |
+
def download_airports():
    """Download and process OurAirports data for Panama.

    Fetches the global OurAirports CSV, filters rows to Panama
    (iso_country == 'PA'), and writes a WGS84 point GeoJSON.

    Returns:
        Tuple of (path to the GeoJSON file, number of airports found).

    Raises:
        requests.HTTPError: if the download fails.
    """
    logger.info("Downloading OurAirports global database...")

    url = DATASETS["airports"]["url"]
    # Fix: bound the request so a dead server cannot hang the run forever.
    response = requests.get(url, timeout=120)
    response.raise_for_status()

    # Save raw CSV (kept so the filter step can be re-run offline)
    output_dir = DATA_DIR / "airports"
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "airports_global.csv"
    with open(csv_path, 'wb') as f:
        f.write(response.content)

    logger.info(f"Saved raw airports data: {csv_path}")

    # Filter for Panama (iso_country = PA)
    df = pd.read_csv(csv_path)
    panama_df = df[df['iso_country'] == 'PA'].copy()

    logger.info(f"Found {len(panama_df)} airports in Panama")

    # Convert to GeoDataFrame (OurAirports coordinates are already WGS84)
    gdf = gpd.GeoDataFrame(
        panama_df,
        geometry=gpd.points_from_xy(panama_df.longitude_deg, panama_df.latitude_deg),
        crs="EPSG:4326"
    )

    # Save as GeoJSON
    geojson_path = output_dir / "panama_airports.geojson"
    gdf.to_file(geojson_path, driver='GeoJSON')

    logger.info(f"Created GeoJSON: {geojson_path}")
    return geojson_path, len(gdf)
|
| 69 |
+
|
| 70 |
+
def download_power_plants():
    """Download and process WRI Global Power Plant Database for Panama.

    Fetches the global v1.3 CSV, filters rows to Panama (country == 'PAN'),
    and writes a WGS84 point GeoJSON.

    Returns:
        Tuple of (path to the GeoJSON file, number of power plants found).

    Raises:
        requests.HTTPError: if the download fails.
    """
    logger.info("Downloading WRI Global Power Plant Database...")

    url = DATASETS["power_plants"]["url"]
    # Fix: bound the request so a dead server cannot hang the run forever.
    response = requests.get(url, timeout=120)
    response.raise_for_status()

    # Save raw CSV (kept so the filter step can be re-run offline)
    output_dir = DATA_DIR / "power_plants"
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "power_plants_global.csv"
    with open(csv_path, 'wb') as f:
        f.write(response.content)

    logger.info(f"Saved raw power plants data: {csv_path}")

    # Filter for Panama (country = PAN — WRI uses ISO3 codes)
    df = pd.read_csv(csv_path)
    panama_df = df[df['country'] == 'PAN'].copy()

    logger.info(f"Found {len(panama_df)} power plants in Panama")

    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(
        panama_df,
        geometry=gpd.points_from_xy(panama_df.longitude, panama_df.latitude),
        crs="EPSG:4326"
    )

    # Save as GeoJSON
    geojson_path = output_dir / "panama_power_plants.geojson"
    gdf.to_file(geojson_path, driver='GeoJSON')

    logger.info(f"Created GeoJSON: {geojson_path}")
    return geojson_path, len(gdf)
|
| 107 |
+
|
| 108 |
+
def main():
    """Run every global-dataset download and log a summary of results.

    Each downloader is attempted independently so one failure does not
    abort the others.

    Returns:
        List of {"dataset", "count", "path"} dicts for successful downloads.
    """
    logger.info("=== Global Dataset Download Starting ===")

    results = []

    # (result key, label used in the error message, download function)
    jobs = [
        ("airports", "airports", download_airports),
        ("power_plants", "power plants", download_power_plants),
    ]
    for key, label, downloader in jobs:
        try:
            out_path, feature_count = downloader()
            results.append({"dataset": key, "count": feature_count, "path": out_path})
        except Exception as exc:
            logger.error(f"Failed to download {label}: {exc}")

    logger.info("\n=== Download Summary ===")
    for entry in results:
        logger.info(f"  {entry['dataset']}: {entry['count']} features")

    logger.info("\n=== Complete ===")
    return results
|
| 131 |
+
|
| 132 |
+
# Standard script entry point — allows importing this module without side effects.
if __name__ == "__main__":
    main()
|
backend/scripts/download_hdx.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
HDX Data Downloader for Panama
|
| 4 |
+
Downloads official datasets from Humanitarian Data Exchange
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# HDX Dataset URLs (from research)
# NOTE(review): these are dataset *landing pages*, not direct file links —
# download_hdx_dataset below still needs to resolve the actual resources.
HDX_DATASETS = {
    "health": {
        "name": "Panama - Health Indicators",
        "url": "https://data.humdata.org/dataset/4d3f9ab7-8e5c-4a24-ae5d-cfc3e81b4db6",
        "description": "WHO health indicators for Panama"
    },
    "education": {
        "name": "Panama - Education",
        "url": "https://data.humdata.org/dataset/panama-education-statistics",
        "description": "UNESCO/World Bank education statistics"
    },
    "economy": {
        "name": "Panama - Economy and Growth",
        "url": "https://data.humdata.org/dataset/panama-economy-indicators",
        "description": "World Bank economic indicators"
    }
}

# Downloads land under backend/data/hdx/<dataset>/
DATA_DIR = Path(__file__).parent.parent / "data" / "hdx"
|
| 34 |
+
|
| 35 |
+
def download_hdx_dataset(dataset_key: str):
    """Download a dataset from HDX.

    Args:
        dataset_key: Key into HDX_DATASETS identifying the dataset.

    Returns:
        The output directory Path on success, None on failure.

    NOTE(review): this is still a placeholder — it fetches the HDX dataset
    *page* and creates the output directory, but does not yet parse the
    page for resource download links, so no data files are written.
    """
    dataset = HDX_DATASETS[dataset_key]
    logger.info(f"Downloading {dataset['name']}...")

    # Create output directory
    output_dir = DATA_DIR / dataset_key if False else DATA_DIR / dataset_key
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        # HDX datasets typically have resource download URLs
        # We'll need to parse the dataset page to get the actual download link
        # Fix: timeout keeps an unresponsive HDX endpoint from hanging the run.
        response = requests.get(dataset['url'], timeout=60)
        response.raise_for_status()

        # Note: This is a placeholder - actual implementation would need to:
        # 1. Parse the HDX page HTML to find CSV/Excel download links
        # 2. Download each resource file
        # 3. Save to output_dir

        logger.info(f"Downloaded to {output_dir}")
        return output_dir

    except Exception as e:
        logger.error(f"Failed to download {dataset['name']}: {e}")
        return None
|
| 61 |
+
|
| 62 |
+
def main():
    """Download all HDX datasets"""
    logger.info("Starting HDX data download...")

    # Iterate the dict directly; iterating a dict yields its keys.
    for dataset_key in HDX_DATASETS:
        download_hdx_dataset(dataset_key)

    logger.info("Download complete!")
|
| 70 |
+
|
| 71 |
+
# Standard script entry point — allows importing this module without side effects.
if __name__ == "__main__":
    main()
|
backend/scripts/download_hdx_panama.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Download Panama-specific datasets from HDX
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
import geopandas as gpd
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import logging
|
| 10 |
+
import zipfile
|
| 11 |
+
import io
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
# Downloads land under backend/data/hdx/<dataset>/ by default; the
# admin_3 / admin_lines layers get redirected to backend/data/base/.
DATA_DIR = Path(__file__).parent.parent / "data" / "hdx"

# HDX Dataset URLs (Panama-specific)
# Each URL is a direct resource download (either a ZIP of GeoJSON or a
# raw GeoJSON file), unlike the landing-page URLs in download_hdx.py.
HDX_DATASETS = {
    "waterways": {
        "url": "https://data.humdata.org/dataset/9b925ead-6034-4ce8-92d9-45d3a1ece1fc/resource/e0dd9e95-5b04-4a5c-b7ef-31a2ea046e1c/download/hotosm_pan_waterways_lines_geojson.zip",
        "description": "Panama Waterways from OpenStreetMap"
    },
    "road_surface": {
        "url": "https://data.humdata.org/dataset/c55bf26a-eba6-402d-b004-8c4af8c24b39/resource/c03fa6cc-e698-4c10-8b05-77de91e13e86/download/panama_roads.geojson",
        "description": "Panama Road Surface Data (AI-predicted paved/unpaved)"
    },
    "admin_3": {
        "url": "https://data.humdata.org/dataset/d188544c-352b-419b-a489-0ae6b763bf21/resource/119d6756-749e-4e4f-bf3a-9694ce22df0a/download/pan_admin3_2021.geojson",
        "description": "Panama Admin 3 (Corregimientos) Boundaries"
    },
    "admin_lines": {
        "url": "https://data.humdata.org/dataset/d188544c-352b-419b-a489-0ae6b763bf21/resource/d7981358-867c-4034-aa1e-07d0f419c968/download/pan_admin_lines_2021.geojson",
        "description": "Panama Admin Lines"
    }
}
|
| 37 |
+
|
| 38 |
+
def download_and_extract_hdx(dataset_name, url, description):
    """Download and extract an HDX dataset.

    ZIP resources are extracted into DATA_DIR/<dataset_name>/; direct
    GeoJSON resources are written there, except admin_3/admin_lines which
    are redirected to the shared data/base/ directory.

    Args:
        dataset_name: Key from HDX_DATASETS (controls output location).
        url: Direct resource URL (ZIP or GeoJSON).
        description: Human-readable label for log messages.

    Returns:
        Tuple of (GeoJSON path, feature count), or (None, 0) on failure.
    """
    logger.info(f"Downloading {description}...")

    output_dir = DATA_DIR / dataset_name
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Check if ZIP or direct GeoJSON
        if url.endswith('.zip'):
            # Extract ZIP
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                z.extractall(output_dir)
            logger.info(f"Extracted ZIP to {output_dir}")

            # Find GeoJSON file
            geojson_files = list(output_dir.glob("*.geojson"))
            if geojson_files:
                geojson_path = geojson_files[0]
                gdf = gpd.read_file(geojson_path)
                logger.info(f"Loaded {len(gdf)} features from {geojson_path.name}")
                return geojson_path, len(gdf)
            # Fix: previously fell off the end and returned bare None,
            # breaking the caller's 2-tuple unpack. Return the failure pair.
            logger.error(f"No GeoJSON found in ZIP for {dataset_name}")
            return None, 0
        else:
            # Direct GeoJSON — admin layers go into the shared base directory
            if dataset_name == "admin_3":
                output_dir = DATA_DIR.parent / "base"
                geojson_path = output_dir / "pan_admin3.geojson"
            elif dataset_name == "admin_lines":
                output_dir = DATA_DIR.parent / "base"
                geojson_path = output_dir / "pan_adminlines.geojson"
            else:
                # Default behavior
                geojson_path = output_dir / f"{dataset_name}.geojson"

            # Fix: the redirected base/ directory was never created, so the
            # open() below could fail on a fresh checkout.
            output_dir.mkdir(parents=True, exist_ok=True)

            with open(geojson_path, 'wb') as f:
                f.write(response.content)

            gdf = gpd.read_file(geojson_path)
            logger.info(f"Loaded {len(gdf)} features")
            return geojson_path, len(gdf)

    except Exception as e:
        logger.error(f"Failed to download {dataset_name}: {e}")
        return None, 0
|
| 85 |
+
|
| 86 |
+
def main():
    """Download every Panama HDX dataset and log a per-dataset summary.

    Returns:
        List of {"dataset", "count", "path"} dicts for successful downloads.
    """
    logger.info("=== Downloading HDX Panama Datasets ===")

    results = []
    for dataset_name, meta in HDX_DATASETS.items():
        geojson_path, feature_count = download_and_extract_hdx(
            dataset_name, meta["url"], meta["description"]
        )
        # Failed downloads return (None, 0) and are omitted from the summary.
        if geojson_path:
            results.append({
                "dataset": dataset_name,
                "count": feature_count,
                "path": geojson_path,
            })

    logger.info("\n=== Download Summary ===")
    for entry in results:
        logger.info(f"  {entry['dataset']}: {entry['count']} features")

    return results
|
| 100 |
+
|
| 101 |
+
# Standard script entry point — allows importing this module without side effects.
if __name__ == "__main__":
    main()
|
backend/scripts/download_kontur.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Panama Data Ingestion - Phase A: Kontur Population
|
| 3 |
+
|
| 4 |
+
Downloads population density data from HDX (Humanitarian Data Exchange).
|
| 5 |
+
Data source: https://data.humdata.org/dataset/kontur-population-panama
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
import json
|
| 11 |
+
import requests
|
| 12 |
+
import gzip
|
| 13 |
+
import shutil
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
# HDX API for Kontur Population Panama
# CKAN package_show endpoint — returns dataset metadata including the
# list of downloadable resources.
HDX_DATASET_URL = "https://data.humdata.org/api/3/action/package_show?id=kontur-population-panama"

# Output directories
DATA_DIR = Path(__file__).parent.parent / "data"   # backend/data
KONTUR_DIR = DATA_DIR / "kontur"                   # final population dataset
TEMP_DIR = DATA_DIR / "temp"                       # download scratch space
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_download_url() -> str:
    """Fetch the actual download URL from HDX.

    Queries the HDX CKAN API for the Kontur Population Panama dataset and
    picks the first GeoJSON/GPKG resource, falling back to the first
    resource of any type.

    Returns:
        The resource download URL, or None if the lookup fails.
    """
    print("🔍 Fetching download URL from HDX...")

    try:
        # Fix: bound the metadata request so a dead API cannot hang the run.
        response = requests.get(HDX_DATASET_URL, timeout=60)
        response.raise_for_status()
        data = response.json()

        # CKAN wraps results in {"success": bool, "result": {...}}
        if not data.get("success"):
            print("❌ HDX API returned error")
            return None

        resources = data.get("result", {}).get("resources", [])

        # Look for GeoJSON or GPKG file
        for resource in resources:
            name = resource.get("name", "").lower()
            url = resource.get("url", "")

            if "geojson" in name or "gpkg" in name:
                print(f" Found: {resource.get('name')}")
                return url

        # Fallback to first resource
        if resources:
            return resources[0].get("url")

        return None

    except Exception as e:
        print(f"❌ Failed to fetch HDX metadata: {e}")
        return None
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def download_file(url: str, dest: Path) -> bool:
    """Download a file with progress indication.

    Args:
        url: HTTP(S) URL to fetch.
        dest: Destination file path (parent directory must already exist).

    Returns:
        True on success, False on any network or filesystem failure.
    """
    print(f"📥 Downloading from {url[:80]}...")

    try:
        # Fix: give the request a timeout so a stalled server cannot hang
        # the script forever (10s to connect, 300s between read chunks).
        response = requests.get(url, stream=True, timeout=(10, 300))
        response.raise_for_status()

        # content-length may be absent; 0 disables the % display
        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0

        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    pct = (downloaded / total_size) * 100
                    print(f"\r Progress: {pct:.1f}% ({downloaded // 1024}KB)", end="")

        print(f"\n✅ Downloaded to {dest}")
        return True

    except Exception as e:
        print(f"❌ Download failed: {e}")
        return False
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def decompress_if_needed(file_path: Path) -> Path:
    """Gunzip *file_path* if it carries a ``.gz`` suffix.

    Args:
        file_path: Candidate file; anything without a ``.gz`` suffix is
            returned untouched.

    Returns:
        Path to the decompressed file (same name minus ``.gz``), or the
        original path when no decompression was needed.
    """
    if file_path.suffix != '.gz':
        return file_path

    target = file_path.with_suffix('')
    print(f"📦 Decompressing {file_path.name}...")

    # Stream-copy so arbitrarily large archives never load fully into memory.
    with gzip.open(file_path, 'rb') as src, open(target, 'wb') as dst:
        shutil.copyfileobj(src, dst)

    return target
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def download_population_data():
    """Download Kontur Population data for Panama.

    Resolves the resource URL via the HDX API (with a hard-coded S3
    fallback), downloads into TEMP_DIR (reusing a cached copy when
    present), decompresses .gz archives, and moves the result into
    KONTUR_DIR.

    Returns:
        Path to the ready data file, or None if the download failed.
    """

    # Ensure directories exist
    KONTUR_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    # Get download URL
    download_url = get_download_url()

    if not download_url:
        # Fallback to known URL pattern
        # NOTE(review): pinned to the 2023-11-01 snapshot — update if stale.
        download_url = "https://geodata-eu-central-1-kontur-public.s3.amazonaws.com/kontur_datasets/kontur_population_PA_20231101.gpkg.gz"
        print(f"⚠️ Using fallback URL: {download_url}")

    # Determine filename from the last URL path segment
    filename = download_url.split("/")[-1]
    temp_path = TEMP_DIR / filename

    # Download (re-runs reuse the cached archive)
    if not temp_path.exists():
        if not download_file(download_url, temp_path):
            return None
    else:
        print(f"📦 Using cached {temp_path}")

    # Decompress if needed
    data_path = decompress_if_needed(temp_path)

    # Move to final location
    final_path = KONTUR_DIR / data_path.name
    if data_path != final_path:
        shutil.move(str(data_path), str(final_path))

    print(f"✅ Population data ready at {final_path}")
    return final_path
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def convert_gpkg_to_geojson(gpkg_path: Path) -> Path:
    """Convert GeoPackage to GeoJSON using ogr2ogr.

    Uses ``ogrinfo -so`` to discover the first layer name in the package,
    then converts that layer to WGS84 GeoJSON next to the input file.

    Args:
        gpkg_path: Path to the source .gpkg file.

    Returns:
        Path to the new .geojson on success; None if ogr2ogr failed; the
        original gpkg_path if GDAL is not installed at all.
    """
    import subprocess

    geojson_path = gpkg_path.with_suffix('.geojson')

    print(f"🔄 Converting to GeoJSON...")

    try:
        # First, list layers in the GPKG ("-so" = summary only)
        result = subprocess.run(
            ["ogrinfo", "-so", str(gpkg_path)],
            capture_output=True, text=True
        )

        # Get the first layer name.
        # ogrinfo layer lines look like "1: layer_name (Geometry Type)" —
        # take the token before the colon, skipping the driver banner line.
        layer_name = None
        for line in result.stdout.split('\n'):
            if ': ' in line and 'using driver' not in line.lower():
                parts = line.split(':')
                if len(parts) >= 2:
                    # assumes layer names contain no spaces — TODO confirm
                    layer_name = parts[0].strip().split()[-1]
                    break

        if not layer_name:
            layer_name = "population"  # Default guess

        cmd = [
            "ogr2ogr",
            "-f", "GeoJSON",
            "-t_srs", "EPSG:4326",
            str(geojson_path),
            str(gpkg_path),
            layer_name
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            size_mb = geojson_path.stat().st_size / 1024 / 1024
            print(f"✅ Created {geojson_path.name} ({size_mb:.1f}MB)")
            return geojson_path
        else:
            print(f"❌ Conversion failed: {result.stderr}")
            return None

    except FileNotFoundError:
        # GDAL missing: the caller can still register the GPKG as-is.
        print("⚠️ ogr2ogr not found. Keeping GPKG format.")
        return gpkg_path
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def register_in_catalog(data_path: Path):
    """Register the Kontur population dataset in the data catalog.

    Merges a ``kontur_population`` entry (path, format, description, tags)
    into catalog.json, creating the catalog if it does not exist yet.

    Args:
        data_path: Final location of the population file under DATA_DIR.
    """
    catalog_path = DATA_DIR / "catalog.json"

    # Start from the existing catalog so other entries are preserved.
    catalog = {}
    if catalog_path.exists():
        with open(catalog_path) as f:
            catalog = json.load(f)

    entry = {
        "source_file": str(data_path.relative_to(DATA_DIR)),
        "source_type": data_path.suffix[1:],  # geojson or gpkg
        "description": "Population density grid for Panama at 400m H3 hexagon resolution. Based on GHSL, Facebook HRSL, and Microsoft Buildings data.",
        "tags": ["population", "density", "panama", "h3", "hexagon", "kontur", "demographics"],
        "data_type": "vector",
        "geometry_type": "polygon",
        "semantic_description": "Population count per 400m H3 hexagonal grid cell. Use for population density analysis, demographic studies, and urban/rural classification.",
    }
    catalog["kontur_population"] = entry

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)

    print(f"📝 Registered kontur_population in catalog")
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
if __name__ == "__main__":
    print("=" * 60)
    print("👥 Panama Population Data Ingestion (Kontur/HDX)")
    print("=" * 60)

    data_path = download_population_data()

    if data_path:
        # Convert to GeoJSON if GPKG; fall back to registering the GPKG
        # when conversion fails or GDAL is unavailable.
        if data_path.suffix == '.gpkg':
            geojson_path = convert_gpkg_to_geojson(data_path)
            if geojson_path and geojson_path.suffix == '.geojson':
                data_path = geojson_path

        register_in_catalog(data_path)
        print("\n🚀 Population data ready! Restart the backend to load.")
    else:
        print("\n❌ Ingestion failed")
        # Non-zero exit so shell/CI callers can detect the failure.
        sys.exit(1)
|
backend/scripts/download_overture.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Panama Data Ingestion - Phase B: Overture Maps (Official SDK)
|
| 3 |
+
|
| 4 |
+
Uses the 'overturemaps' Python CLI/SDK to download data for Panama.
|
| 5 |
+
Themes: places, transportation, buildings.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import subprocess
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import json
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Panama Bounding Box
# Passed verbatim to `overturemaps download --bbox`.
BBOX = "-83.05,7.20,-77.17,9.65"  # xmin, ymin, xmax, ymax

DATA_DIR = Path(__file__).parent.parent / "data"   # backend/data
OVERTURE_DIR = DATA_DIR / "overture"               # downloaded Overture layers
|
| 19 |
+
|
| 20 |
+
def run_overture_download(theme_type: str, output_name: str):
    """
    Download a specific Overture theme type using the CLI.
    command: overturemaps download --bbox <bbox> -f geojson --type <type> -o <outfile>

    Args:
        theme_type: Overture type name (e.g. "place", "segment", "building").
        output_name: Output filename written under OVERTURE_DIR.

    Returns:
        True if the CLI ran and produced the output file, else False.
    """
    print(f"\n🌍 Downloading Overture {theme_type}...")

    # Ensure output dir
    OVERTURE_DIR.mkdir(parents=True, exist_ok=True)

    output_file = OVERTURE_DIR / output_name

    # Fix: the hard-coded "backend/venv/bin/overturemaps" only worked when
    # run from the repo root with that venv present. Prefer it when it
    # exists, otherwise fall back to whatever `overturemaps` is on PATH
    # (activated venv, Docker image, etc.).
    venv_cli = Path("backend/venv/bin/overturemaps")
    cli = str(venv_cli) if venv_cli.exists() else "overturemaps"

    # Note: overturemaps downloads to a file buffer then writes.
    cmd = [
        cli, "download",
        "--bbox", BBOX,
        "-f", "geojson",
        "--type", theme_type,
        "-o", str(output_file)
    ]

    try:
        print(f" Running: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)

        if output_file.exists():
            size_mb = output_file.stat().st_size / 1024 / 1024
            print(f" ✅ Downloaded {output_name} ({size_mb:.1f}MB)")
            return True
        else:
            print(" ❌ Download produced no file")
            return False

    except subprocess.CalledProcessError as e:
        print(f" ❌ Command failed: {e}")
        return False
    except Exception as e:
        # Covers FileNotFoundError when the CLI is not installed anywhere.
        print(f" ❌ Error: {e}")
        return False
|
| 60 |
+
|
| 61 |
+
def register_in_catalog():
    """Register any downloaded Overture layers in data/catalog.json."""
    catalog_path = DATA_DIR / "catalog.json"
    if catalog_path.exists():
        with open(catalog_path) as f:
            catalog = json.load(f)
    else:
        catalog = {}

    # Candidate catalog entries, keyed by dataset id; each is only written
    # if its GeoJSON file was actually downloaded.
    candidates = {
        "overture_places": {
            "source_file": "overture/overture_places.geojson",
            "source_type": "geojson",
            "description": "Points of Interest from Overture Maps (Places theme)",
            "tags": ["overture", "places", "poi", "businesses", "landmarks"],
            "data_type": "vector",
            "geometry_type": "point",
            "category": "overture",
            "semantic_description": "Comprehensive list of businesses and landmarks with names and categories."
        },
        "overture_roads": {
            "source_file": "overture/overture_roads.geojson",
            "source_type": "geojson",
            "description": "Road network segments from Overture Maps",
            "tags": ["overture", "roads", "transportation", "infrastructure"],
            "data_type": "vector",
            "geometry_type": "linestring",
            "category": "overture"
        },
        "overture_buildings": {
            "source_file": "overture/overture_buildings.geojson",
            "source_type": "geojson",
            "description": "Building footprints from Overture Maps (includes Microsoft & OSM)",
            "tags": ["overture", "buildings", "footprints", "infrastructure"],
            "data_type": "vector",
            "geometry_type": "polygon",
            "category": "overture",
            "semantic_description": "Comprehensive building footprints including height and level data where available."
        },
    }

    for dataset_id, entry in candidates.items():
        filename = entry["source_file"].split("/")[-1]
        if (OVERTURE_DIR / filename).exists():
            catalog[dataset_id] = entry

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)
    print("📝 Registered Overture datasets in catalog")
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
    banner = "=" * 60
    print(banner)
    print("🌐 Overture Maps Ingestion (via Official SDK)")
    print(banner)

    # Overture type names: 'place' (places theme), 'segment'
    # (transportation theme), 'building' (buildings theme).
    # Buildings can be large, but Panama is small enough to attempt.
    jobs = [
        ("place", "overture_places.geojson"),
        ("segment", "overture_roads.geojson"),
        ("building", "overture_buildings.geojson"),
    ]
    results = [run_overture_download(theme, filename) for theme, filename in jobs]

    if any(results):
        register_in_catalog()
        print("\n🚀 Phase B Ingestion Complete!")
    else:
        print("\n❌ All downloads failed.")
|
backend/scripts/download_stri_data.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Download Panama Protected Areas from STRI GIS Portal
|
| 4 |
+
Download Protected Areas shapefile and convert to GeoJSON
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
import geopandas as gpd
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import logging
|
| 11 |
+
import zipfile
|
| 12 |
+
import io
|
| 13 |
+
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
DATA_DIR = Path(__file__).parent.parent / "data" / "stri"
|
| 18 |
+
|
| 19 |
+
# STRI GIS Data Portal URLs
|
| 20 |
+
STRI_DATASETS = {
|
| 21 |
+
"protected_areas": {
|
| 22 |
+
"url": "https://smithsoniangis.maps.arcgis.com/sharing/rest/content/items/7ee9c9c3f8874e7b8e8d39c7e5a1e3e8/data",
|
| 23 |
+
"description": "Protected Areas of Panama 2022 Edition (SINAP + WDPA)"
|
| 24 |
+
}
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
def download_stri_protected_areas():
    """Download STRI Protected Areas of Panama as GeoJSON.

    Queries the standard ESRI Feature Service export endpoint directly,
    avoiding the need to unpack a shapefile zip.

    Returns:
        (geojson_path, feature_count) on success, or (None, 0) on failure.
    """
    logger.info("Attempting to download STRI Protected Areas...")

    output_dir = DATA_DIR / "protected_areas"
    output_dir.mkdir(parents=True, exist_ok=True)

    service_url = "https://services.arcgis.com/nzS0F0zdNLvs7nc8/arcgis/rest/services/ProtectedAreas_Panama_2022/FeatureServer/0/query"

    params = {
        "where": "1=1",  # Get all features
        "outFields": "*",  # All fields
        "f": "geojson",  # GeoJSON format
        "returnGeometry": "true"
    }

    try:
        logger.info("Querying STRI ArcGIS Feature Service...")
        response = requests.get(service_url, params=params, timeout=120)
        response.raise_for_status()

        # ArcGIS services commonly report errors inside a 200 response body;
        # verify the payload is actually a FeatureCollection before writing
        # it to disk, otherwise we'd persist a JSON error document.
        payload = response.json()
        if "error" in payload or "features" not in payload:
            raise ValueError(f"Service returned an error payload: {payload.get('error')}")

        # Save GeoJSON
        geojson_path = output_dir / "panama_protected_areas.geojson"
        with open(geojson_path, 'wb') as f:
            f.write(response.content)

        # Re-read with GeoPandas to validate the file and count features.
        gdf = gpd.read_file(geojson_path)
        logger.info(f"Downloaded {len(gdf)} protected areas")

        return geojson_path, len(gdf)

    except Exception as e:
        logger.error(f"Failed to download from ArcGIS service: {e}")
        return None, 0
|
| 64 |
+
|
| 65 |
+
def main():
    """Entry point: fetch the protected-areas layer and report the outcome."""
    logger.info("=== Downloading STRI Panama Protected Areas ===")

    path, count = download_stri_protected_areas()

    if path is None:
        logger.error("\n❌ Failed to download protected areas")
    else:
        logger.info(f"\n✅ Success: {count} protected areas downloaded")
        logger.info(f" Path: {path}")

    return path, count
|
| 77 |
+
|
| 78 |
+
if __name__ == "__main__":
|
| 79 |
+
main()
|
backend/scripts/download_worldbank.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
World Bank Data Downloader for Panama
|
| 4 |
+
Downloads socio-economic indicators from World Bank API v2
|
| 5 |
+
API Documentation: https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import requests
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import logging
|
| 12 |
+
import time
|
| 13 |
+
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
# World Bank API base URL
|
| 18 |
+
WB_API_BASE = "https://api.worldbank.org/v2"
|
| 19 |
+
|
| 20 |
+
# Key indicators for Panama (ISO3: PAN).
# Maps World Bank indicator codes to their human-readable descriptions,
# grouped by topic. (A stray AI-assistant conversation comment that had
# leaked into this literal was removed.)
INDICATORS = {
    # Poverty & Inequality
    "SI.POV.NAHC": "Poverty headcount ratio at national poverty lines (% of population)",
    "SI.POV.DDAY": "Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)",
    "SI.POV.UMIC": "Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population)",
    "SI.POV.GINI": "Gini index (World Bank estimate)",

    # Employment & Labor
    "SL.UEM.TOTL.ZS": "Unemployment, total (% of total labor force)",
    "SL.TLF.CACT.FE.ZS": "Labor force participation rate, female (% of female population ages 15+)",
    "SL.TLF.CACT.MA.ZS": "Labor force participation rate, male (% of male population ages 15+)",

    # GDP & Economy
    "NY.GDP.MKTP.CD": "GDP (current US$)",
    "NY.GDP.PCAP.CD": "GDP per capita (current US$)",
    "NY.GDP.MKTP.KD.ZG": "GDP growth (annual %)",

    # Health
    "SH.STA.MMRT": "Maternal mortality ratio (per 100,000 live births)",
    "SH.DYN.MORT": "Mortality rate, under-5 (per 1,000 live births)",
    "SH.XPD.CHEX.GD.ZS": "Current health expenditure (% of GDP)",

    # Education
    "SE.ADT.LITR.ZS": "Literacy rate, adult total (% of people ages 15 and above)",
    "SE.PRM.NENR": "School enrollment, primary (% net)",
    "SE.SEC.NENR": "School enrollment, secondary (% net)",
    "SE.XPD.TOTL.GD.ZS": "Government expenditure on education, total (% of GDP)"
}
|
| 51 |
+
|
| 52 |
+
DATA_DIR = Path(__file__).parent.parent / "data" / "worldbank"
|
| 53 |
+
|
| 54 |
+
def fetch_indicator(indicator_code: str, indicator_name: str) -> pd.DataFrame:
    """Fetch one indicator time series for Panama from the World Bank API.

    Args:
        indicator_code: World Bank indicator code (e.g. "SI.POV.GINI").
        indicator_name: Human-readable label stored alongside each row.

    Returns:
        DataFrame with columns [year, value, indicator_code, indicator_name,
        country], or None when the API returns no usable data.
    """
    logger.info(f"Fetching: {indicator_name}")

    url = f"{WB_API_BASE}/country/PAN/indicator/{indicator_code}"
    params = {
        "format": "json",
        "per_page": 100,
        "date": "2000:2024"  # Last 24 years
    }

    try:
        # A timeout keeps one stalled request from hanging the whole
        # download run indefinitely.
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        data = response.json()

        # The API returns [metadata, rows]; rows is None/empty when the
        # indicator has no data for the requested range.
        if len(data) < 2 or not data[1]:
            logger.warning(f"No data returned for {indicator_code}")
            return None

        # Convert to DataFrame, skipping null observations.
        records = []
        for entry in data[1]:
            if entry.get('value') is not None:
                records.append({
                    'year': int(entry['date']),
                    'value': float(entry['value']),
                    'indicator_code': indicator_code,
                    'indicator_name': indicator_name,
                    'country': entry['country']['value']
                })

        if not records:
            logger.warning(f"No valid values for {indicator_code}")
            return None

        df = pd.DataFrame(records)
        logger.info(f" → Downloaded {len(df)} years of data")
        return df

    except Exception as e:
        logger.error(f"Failed to fetch {indicator_code}: {e}")
        return None
|
| 97 |
+
|
| 98 |
+
def download_all_indicators():
    """Fetch every configured indicator and persist long- and wide-format CSVs."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    frames = []
    for code, name in INDICATORS.items():
        frame = fetch_indicator(code, name)
        if frame is not None:
            frames.append(frame)
        time.sleep(0.5)  # Rate limiting

    if not frames:
        logger.error("No data downloaded!")
        return

    # One long-format table: a row per (indicator, year) observation.
    combined_df = pd.concat(frames, ignore_index=True)

    output_file = DATA_DIR / "panama_indicators.csv"
    combined_df.to_csv(output_file, index=False)
    logger.info(f"Saved {len(combined_df)} records to {output_file}")

    # Wide-format pivot (years x indicators) for quick human inspection.
    pivot_df = combined_df.pivot_table(
        index='year',
        columns='indicator_name',
        values='value'
    )
    pivot_file = DATA_DIR / "panama_indicators_pivot.csv"
    pivot_df.to_csv(pivot_file)
    logger.info(f"Saved pivot table to {pivot_file}")

    return combined_df
|
| 134 |
+
|
| 135 |
+
def main():
    """Entry point: download all configured World Bank indicators for Panama."""
    logger.info("Starting World Bank data download for Panama...")
    download_all_indicators()
    logger.info("Download complete!")
|
| 139 |
+
|
| 140 |
+
if __name__ == "__main__":
|
| 141 |
+
main()
|
backend/scripts/enrich_censo.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import unicodedata
|
| 5 |
+
|
| 6 |
+
# Define paths
|
| 7 |
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 8 |
+
CSV_PATH = os.path.join(BASE_DIR, "data/censo/censo_panama_2023_unificado.csv")
|
| 9 |
+
OUTPUT_PATH = os.path.join(BASE_DIR, "data/censo/censo_2023_enriched.csv")
|
| 10 |
+
GEOJSON_PATH = os.path.join(BASE_DIR, "data/base/pan_admin3.geojson")
|
| 11 |
+
|
| 12 |
+
def normalize_text(text):
    """Return *text* lower-cased, trimmed, and stripped of accents.

    None or empty input yields "". Diacritics are removed by NFKD
    decomposition followed by dropping non-ASCII code points.
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('ASCII')
    return ascii_only.strip().lower()
|
| 17 |
+
|
| 18 |
+
def process_censo_data():
    """Join census rows to admin-3 boundary pcodes and write an enriched CSV.

    Matching strategy, per census row (first hit wins):
      1. Exact normalized (province, district, corregimiento) key match.
      2. Unique (province, corregimiento) match ignoring the district.
      3. Fuzzy containment match on the corregimiento name (length > 4
         to avoid false positives on very short names).
    """
    print(f"Loading CSV from {CSV_PATH}...")
    csv_data = []
    headers = []
    try:
        with open(CSV_PATH, mode='r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            headers = reader.fieldnames
            for row in reader:
                csv_data.append(row)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # Guard: an empty or header-only CSV would previously crash the
    # match-rate report below with a ZeroDivisionError.
    if not csv_data or not headers:
        print("CSV contained no data rows; nothing to enrich.")
        return

    print(f"Loading GeoJSON from {GEOJSON_PATH}...")
    try:
        with open(GEOJSON_PATH, 'r') as f:
            geojson = json.load(f)
    except Exception as e:
        print(f"Error loading GeoJSON: {e}")
        return

    # Build GeoJSON Lookup Map keyed by normalized (prov, dist, corr) names.
    geojson_lookup = {}

    def clean_name(name):
        return normalize_text(name)

    print("Building GeoJSON lookup table...")
    for feature in geojson['features']:
        props = feature.get('properties', {})
        p_name = clean_name(props.get('adm1_name'))
        d_name = clean_name(props.get('adm2_name'))
        c_name = clean_name(props.get('adm3_name'))

        # Store properties keyed by (Prov, Dist, Corr)
        geojson_lookup[(p_name, d_name, c_name)] = props

    # Province Mapping Heuristics: census province names that differ from
    # the boundary file's admin-1 naming.
    PROV_MAPPING = {
        "panama oeste": "panama",
        "comarca naso tjer di": "bocas del toro"
    }

    print("Enriching CSV data...")
    matches = 0

    for row in csv_data:
        p_name = clean_name(row.get('nomb_prov'))
        d_name = clean_name(row.get('nomb_dist'))
        c_name = clean_name(row.get('nomb_corr'))

        search_p_name = PROV_MAPPING.get(p_name, p_name)

        # Strategy 1: Exact Match
        key = (search_p_name, d_name, c_name)
        found_code = None

        if key in geojson_lookup:
            found_code = geojson_lookup[key].get('adm3_pcode')
        else:
            # Strategy 2: Relaxed District Search
            candidates = [k for k in geojson_lookup.keys() if k[0] == search_p_name and k[2] == c_name]
            if len(candidates) == 1:
                found_code = geojson_lookup[candidates[0]].get('adm3_pcode')
            else:
                # Strategy 3: Fuzzy containment check
                prov_keys = [k for k in geojson_lookup.keys() if k[0] == search_p_name]
                for k in prov_keys:
                    geo_c = k[2]
                    # Names are "close enough" if one contains the other.
                    if (c_name in geo_c or geo_c in c_name) and len(c_name) > 4:
                        found_code = geojson_lookup[k].get('adm3_pcode')
                        break

        # Assign found code or empty string
        if found_code:
            row['adm3_pcode'] = found_code
            matches += 1
        else:
            row['adm3_pcode'] = ""

    print(f"Enrichment Complete. Matches: {matches}/{len(csv_data)} ({matches/len(csv_data)*100:.1f}%)")

    # Save Enriched CSV with the new pcode column first.
    new_headers = ['adm3_pcode'] + list(headers)
    print(f"Saving to {OUTPUT_PATH}...")
    try:
        with open(OUTPUT_PATH, mode='w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=new_headers)
            writer.writeheader()
            writer.writerows(csv_data)
        print("File saved successfully.")
    except Exception as e:
        print(f"Error saving CSV: {e}")
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
|
| 115 |
+
process_censo_data()
|
backend/scripts/extract_overture_features.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Extract additional features from existing Overture Maps data
|
| 4 |
+
- Hospitals, clinics, pharmacies
|
| 5 |
+
- Government offices
|
| 6 |
+
- Tourist attractions
|
| 7 |
+
- Restaurants, hotels
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import geopandas as gpd
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
DATA_DIR = Path(__file__).parent.parent / "data"
|
| 18 |
+
OVERTURE_DIR = DATA_DIR / "overture"
|
| 19 |
+
OUTPUT_DIR = DATA_DIR / "enriched"
|
| 20 |
+
|
| 21 |
+
def _extract_places_by_category(extract_label, found_label, categories, output_name):
    """Filter Overture places by category keywords and save as GeoJSON.

    Shared implementation for the four extract_* entry points below, which
    were previously four near-identical copies. Also creates OUTPUT_DIR
    unconditionally — previously only extract_healthcare() did, so calling
    any other extractor first would fail.

    Args:
        extract_label: Label for the "Extracting ..." log message.
        found_label: Label for the "Found N ..." log message.
        categories: Substrings matched case-insensitively against 'category'.
        output_name: File name written under OUTPUT_DIR.

    Returns:
        (output_path, feature_count)
    """
    logger.info(f"Extracting {extract_label}...")

    places_path = OVERTURE_DIR / "places.geojson"
    gdf = gpd.read_file(places_path)

    # Case-insensitive substring match against any of the keywords.
    subset = gdf[gdf['category'].str.contains('|'.join(categories), case=False, na=False)]
    logger.info(f"Found {len(subset)} {found_label}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / output_name
    subset.to_file(output_path, driver='GeoJSON')

    return output_path, len(subset)

def extract_healthcare():
    """Extract healthcare facilities from Overture places"""
    return _extract_places_by_category(
        "healthcare facilities", "healthcare facilities",
        ['hospital', 'clinic', 'pharmacy', 'doctor', 'dentist', 'health'],
        "healthcare_facilities.geojson")

def extract_tourism():
    """Extract tourist attractions"""
    return _extract_places_by_category(
        "tourist attractions", "tourist attractions",
        ['museum', 'monument', 'attraction', 'park', 'beach', 'viewpoint', 'zoo', 'aquarium'],
        "tourist_attractions.geojson")

def extract_accommodation():
    """Extract hotels and accommodation"""
    return _extract_places_by_category(
        "accommodation", "accommodation facilities",
        ['hotel', 'hostel', 'motel', 'resort', 'lodge', 'guest_house'],
        "accommodation.geojson")

def extract_restaurants():
    """Extract restaurants and food services"""
    return _extract_places_by_category(
        "restaurants", "restaurants/cafes",
        ['restaurant', 'cafe', 'bar', 'fast_food', 'food_court'],
        "restaurants.geojson")
|
| 97 |
+
|
| 98 |
+
def main():
    """Run all Overture feature extractions, continuing past failures.

    Each extractor is attempted independently; a failure is logged (with
    the same message text as before) without aborting the remaining
    extractions. Returns the list of per-dataset result summaries.
    """
    logger.info("=== Extracting features from Overture data ===")

    # (dataset key, label used in the failure message, extractor callable)
    extractors = [
        ("healthcare_facilities", "healthcare", extract_healthcare),
        ("tourist_attractions", "tourism", extract_tourism),
        ("accommodation", "accommodation", extract_accommodation),
        ("restaurants", "restaurant", extract_restaurants),
    ]

    results = []
    for dataset, err_label, extractor in extractors:
        try:
            path, count = extractor()
            results.append({"dataset": dataset, "count": count})
        except Exception as e:
            logger.error(f"Failed {err_label} extraction: {e}")

    logger.info("\n=== Extraction Summary ===")
    for result in results:
        logger.info(f" {result['dataset']}: {result['count']} features")

    return results
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
main()
|
backend/scripts/ingest_hdx.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HDX Data Ingestion Script
|
| 3 |
+
|
| 4 |
+
Downloads and processes humanitarian datasets from the Humanitarian Data Exchange (HDX)
|
| 5 |
+
for Panama, including population, health facilities, and other indicators.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import httpx
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import asyncio
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# HDX API Base URL
|
| 15 |
+
HDX_API = "https://data.humdata.org/api/3"
|
| 16 |
+
|
| 17 |
+
# Datasets to download (name -> HDX dataset ID)
|
| 18 |
+
DATASETS = {
|
| 19 |
+
"population_worldpop": "worldpop-population-counts-for-panama",
|
| 20 |
+
"admin_boundaries": "cod-ab-pan",
|
| 21 |
+
"health_facilities": "panama-healthsites",
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
| 25 |
+
RAW_DIR = DATA_DIR / "raw" / "hdx"
|
| 26 |
+
PROCESSED_DIR = DATA_DIR / "processed"
|
| 27 |
+
|
| 28 |
+
def ensure_dirs():
    """Create the raw/processed data directory tree if it doesn't exist."""
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    for subdir in ("demographics", "health", "infrastructure"):
        (PROCESSED_DIR / subdir).mkdir(exist_ok=True)
|
| 35 |
+
|
| 36 |
+
async def get_dataset_resources(client: httpx.AsyncClient, dataset_id: str) -> list:
    """Return the downloadable resource descriptors for an HDX dataset.

    Returns an empty list on any API or network failure.
    """
    try:
        response = await client.get(f"{HDX_API}/action/package_show", params={"id": dataset_id})
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        print(f"Error fetching dataset {dataset_id}: {e}")
        return []

    if not payload.get("success"):
        return []
    return payload["result"].get("resources", [])
|
| 49 |
+
|
| 50 |
+
async def download_resource(client: httpx.AsyncClient, resource: dict, output_dir: Path) -> str:
    """Download a single HDX resource file.

    Args:
        client: Shared async HTTP client.
        resource: HDX resource descriptor (needs "url", "name", "format").
        output_dir: Directory the file is written into.

    Returns:
        The local file path as a string, or None when the format is skipped
        or the download fails.
    """
    url = resource.get("url")
    name = resource.get("name", "unknown")
    fmt = resource.get("format", "").lower()  # renamed: was shadowing builtin `format`

    # Skip non-data formats
    if fmt not in ["csv", "json", "geojson", "xlsx", "xls", "zip"]:
        return None

    filename = f"{name}.{fmt}"
    filepath = output_dir / filename

    # Skip if already downloaded
    if filepath.exists():
        # Fixed: these messages previously printed a literal "(unknown)"
        # instead of the actual filename.
        print(f"  Skipping (exists): {filename}")
        return str(filepath)

    print(f"  Downloading: {filename}")
    try:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()

        with open(filepath, "wb") as f:
            f.write(response.content)

        return str(filepath)
    except Exception as e:
        print(f"  Error downloading {name}: {e}")
        return None
|
| 80 |
+
|
| 81 |
+
async def ingest_hdx_datasets():
    """Download every configured HDX dataset's resources into RAW_DIR."""
    ensure_dirs()

    separator = "=" * 60
    print(separator)
    print("HDX Data Ingestion for Panama")
    print(separator)

    async with httpx.AsyncClient(timeout=60.0) as client:
        for name, dataset_id in DATASETS.items():
            print(f"\n📦 Dataset: {name} ({dataset_id})")

            # Each dataset gets its own subdirectory under RAW_DIR.
            dataset_dir = RAW_DIR / name
            dataset_dir.mkdir(exist_ok=True)

            resources = await get_dataset_resources(client, dataset_id)
            print(f" Found {len(resources)} resources")

            for resource in resources:
                await download_resource(client, resource, dataset_dir)

    print("\n" + separator)
    print("Ingestion complete!")
    print(separator)
|
| 108 |
+
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
asyncio.run(ingest_hdx_datasets())
|
backend/scripts/process_worldbank.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Process World Bank indicators and create GeoJSON layers
|
| 4 |
+
Joins most recent indicator data to Panama administrative boundaries
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import geopandas as gpd
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import logging
|
| 11 |
+
import json
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
DATA_DIR = Path(__file__).parent.parent / "data"
|
| 17 |
+
WB_DIR = DATA_DIR / "worldbank"
|
| 18 |
+
BASE_DIR = DATA_DIR / "base"
|
| 19 |
+
OUTPUT_DIR = DATA_DIR / "socioeconomic"
|
| 20 |
+
|
| 21 |
+
def load_admin_boundaries():
    """Read Panama province (admin-1) boundaries as a GeoDataFrame.

    Returns None when the boundary file is missing.
    """
    admin1_path = BASE_DIR / "pan_admin1.geojson"

    if not admin1_path.exists():
        logger.error(f"Admin boundaries not found: {admin1_path}")
        return None

    provinces = gpd.read_file(admin1_path)
    logger.info(f"Loaded {len(provinces)} provinces")
    return provinces
|
| 32 |
+
|
| 33 |
+
def process_indicators():
    """Load World Bank indicator records and keep the latest year per indicator.

    Returns:
        DataFrame with one row per indicator code (its most recent year),
        or None when the source CSV is missing.
    """
    csv_path = WB_DIR / "panama_indicators.csv"

    if not csv_path.exists():
        logger.error(f"Indicators file not found: {csv_path}")
        return None

    records = pd.read_csv(csv_path)
    logger.info(f"Loaded {len(records)} indicator records")

    # For each indicator code, retain only the row with the maximum year.
    newest_rows = records.groupby('indicator_code')['year'].idxmax()
    latest = records.loc[newest_rows]
    logger.info(f"Selected most recent data for {len(latest)} indicators")

    return latest
|
| 49 |
+
|
| 50 |
+
def create_national_geojson(indicators_df, admin_gdf):
    """Write a single-point GeoJSON carrying all latest national indicators.

    World Bank data here is national-level, so every indicator is attached
    as a property of one point feature at Panama's approximate center.
    Note: ``admin_gdf`` is accepted for interface symmetry but is not used
    by this function.

    Returns:
        Path of the written GeoJSON file.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Flatten every indicator into one properties dict.
    properties = {
        'country': 'Panama',
        'data_year': int(indicators_df['year'].max()),
    }
    for _, record in indicators_df.iterrows():
        # Sanitize indicator codes (e.g. "SI.POV.DDAY" -> "si_pov_dday").
        key = record['indicator_code'].lower().replace('.', '_')
        properties[key] = record['value']
        properties[f"{key}_name"] = record['indicator_name']

    geojson = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "geometry": {
                    "type": "Point",
                    "coordinates": [-80.0, 8.5],  # Approximate center of Panama
                },
                "properties": properties,
            }
        ],
    }

    output_file = OUTPUT_DIR / "panama_national_indicators.geojson"
    with open(output_file, 'w') as f:
        json.dump(geojson, f, indent=2)

    logger.info(f"Created national indicators GeoJSON: {output_file}")
    logger.info(f"  Indicators included: {len(indicators_df)}")

    return output_file
|
| 96 |
+
|
| 97 |
+
def update_catalog(geojson_path):
    """Register the national-indicators layer in data/catalog.json.

    Args:
        geojson_path: Path to the generated GeoJSON; stored relative to DATA_DIR.
    """
    catalog_path = DATA_DIR / "catalog.json"

    with open(catalog_path, 'r') as f:
        catalog = json.load(f)

    # Build the entry, then attach it under its catalog key.
    entry = {
        "path": str(geojson_path.relative_to(DATA_DIR)),
        "description": "National socio-economic indicators from World Bank (2000-2024)",
        "semantic_description": "Comprehensive national-level statistics for Panama including poverty rates, GDP, unemployment, health expenditure, maternal/child mortality, literacy rates, and school enrollment. Data sourced from World Bank Open Data API. Use this dataset for analyzing Panama's socio-economic development trends over time.",
        "tags": [
            "socioeconomic",
            "worldbank",
            "poverty",
            "gdp",
            "employment",
            "health",
            "education",
            "national",
            "panama"
        ],
        "data_type": "static",
        "category": "socioeconomic",
        "format": "geojson"
    }
    catalog["panama_national_indicators"] = entry

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)

    logger.info("Updated catalog.json")
|
| 129 |
+
|
| 130 |
+
def main():
    """Pipeline driver: load inputs, build the GeoJSON layer, register it."""
    logger.info("Processing World Bank indicators...")

    admin_gdf = load_admin_boundaries()
    indicators_df = process_indicators()
    if admin_gdf is None or indicators_df is None:
        logger.error("Failed to load required data")
        return

    geojson_path = create_national_geojson(indicators_df, admin_gdf)
    update_catalog(geojson_path)

    logger.info("Processing complete!")

if __name__ == "__main__":
    main()
|
backend/scripts/register_global_datasets.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Register global datasets in catalog
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(level=logging.INFO)
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
DATA_DIR = Path(__file__).parent.parent / "data"
|
| 14 |
+
CATALOG_PATH = DATA_DIR / "catalog.json"
|
| 15 |
+
|
| 16 |
+
def register_airports():
    """Add (or overwrite) the panama_airports entry in catalog.json."""
    with open(CATALOG_PATH, 'r') as f:
        catalog = json.load(f)

    # Construct the entry first, then attach it under its catalog key.
    entry = {
        "path": "global/airports/panama_airports.geojson",
        "description": "Panama airports from OurAirports global database (91 airports)",
        "semantic_description": "Comprehensive dataset of all airports in Panama including international, domestic, regional, and small airfields. Contains location, elevation, type (large/medium/small/heliport), runway information, and identifiers (ICAO, IATA codes). Updated daily from OurAirports open database. Use for aviation infrastructure analysis, accessibility studies, and transportation planning.",
        "tags": [
            "infrastructure",
            "transportation",
            "airports",
            "aviation",
            "panama",
            "ourairports"
        ],
        "data_type": "static",
        "category": "infrastructure",
        "format": "geojson",
        "source": "OurAirports (davidmegginson/ourairports-data)",
        "license": "Public Domain"
    }
    catalog["panama_airports"] = entry

    with open(CATALOG_PATH, 'w') as f:
        json.dump(catalog, f, indent=2)

    logger.info("Registered panama_airports in catalog")
|
| 44 |
+
|
| 45 |
+
def main():
    """Entry point: register every known global dataset in the catalog."""
    logger.info("Registering datasets in catalog...")
    register_airports()
    logger.info("Complete!")

if __name__ == "__main__":
    main()
|
backend/scripts/stri_catalog_scraper.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
STRI GIS Portal Catalog Scraper
|
| 4 |
+
|
| 5 |
+
Discovers and catalogs datasets from the Smithsonian Tropical Research Institute
|
| 6 |
+
GIS Portal using the ArcGIS Online API.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
import json
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import logging
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from typing import Dict, List, Optional
|
| 15 |
+
import re
|
| 16 |
+
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
DATA_DIR = Path(__file__).parent.parent / "data" / "stri"
|
| 21 |
+
METADATA_DIR = DATA_DIR / "metadata"
|
| 22 |
+
|
| 23 |
+
# STRI GIS Portal ArcGIS Online Organization ID
|
| 24 |
+
STRI_ORG_ID = "nzS0F0zdNLvs7nc8"
|
| 25 |
+
ARCGIS_BASE_URL = "https://www.arcgis.com/sharing/rest"
|
| 26 |
+
|
| 27 |
+
# Priority keywords for dataset selection
|
| 28 |
+
HIGH_PRIORITY_KEYWORDS = [
|
| 29 |
+
"panama", "national", "country", "forest", "cover", "protected", "areas",
|
| 30 |
+
"land use", "biodiversity", "climate", "water", "infrastructure",
|
| 31 |
+
"administrative", "boundaries", "poverty", "population"
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
# Keywords to deprioritize (site-specific, not national)
|
| 35 |
+
LOW_PRIORITY_KEYWORDS = [
|
| 36 |
+
"bci", "barro colorado", "island", "pena blanca", "site-specific",
|
| 37 |
+
"trail", "sensor", "camera", "plot"
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
# Temporal dataset patterns (to identify multi-year series)
|
| 41 |
+
TEMPORAL_PATTERNS = [
|
| 42 |
+
r"\b(19\d{2}|20\d{2})\b", # Years like 1992, 2021
|
| 43 |
+
r"edition\s+(19\d{2}|20\d{2})",
|
| 44 |
+
r"version\s+(19\d{2}|20\d{2})"
|
| 45 |
+
]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def search_stri_portal(query: str = "panama", num: int = 100, start: int = 1) -> Dict:
    """
    Search the STRI GIS Portal using the ArcGIS REST API.

    Bug fix: the ``query`` argument was previously ignored — a hardcoded
    search string was always sent. It is now interpolated into the
    org-scoped query; the default value produces exactly the original
    search string, so default behavior is unchanged.

    Args:
        query: Search terms (default "panama"; "panamá" is always OR-ed in
            so accented titles also match)
        num: Number of results per page (ArcGIS caps this at 100)
        start: 1-based starting position for pagination

    Returns:
        Parsed JSON response with search results, or {} on any failure.
    """
    search_url = f"{ARCGIS_BASE_URL}/search"

    # Scope the search to the STRI organization.
    params = {
        "q": f'orgid:{STRI_ORG_ID} AND ({query} OR panamá)',
        "f": "json",
        "num": num,
        "start": start,
        "sortField": "modified",
        "sortOrder": "desc"
    }

    try:
        response = requests.get(search_url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Best-effort: callers treat {} as "no results / stop paging".
        logger.error(f"Failed to search portal: {e}")
        return {}
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def get_item_details(item_id: str) -> Optional[Dict]:
    """Fetch full metadata for one portal item; returns None on any failure."""
    details_url = f"{ARCGIS_BASE_URL}/content/items/{item_id}"

    try:
        response = requests.get(details_url, params={"f": "json"}, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logger.error(f"Failed to get item {item_id}: {e}")
        return None
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def extract_year_from_title(title: str) -> Optional[int]:
    """Return the first 4-digit year (1900-2099) found in *title*, or None."""
    for pattern in TEMPORAL_PATTERNS:
        hit = re.search(pattern, title, re.IGNORECASE)
        if not hit:
            continue
        # Patterns with a capture group carry the year in group 1.
        candidate = hit.group(1) if hit.lastindex else hit.group(0)
        try:
            return int(candidate)
        except ValueError:
            continue
    return None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def calculate_priority_score(item: Dict) -> float:
    """
    Score a dataset for ranking.

    Higher scores favor national-scope, keyword-relevant, queryable
    (Feature Service), temporal, and recently-updated datasets.
    """
    score = 50.0  # Baseline

    title = (item.get("title") or "").lower()
    description = (item.get("description") or "").lower()
    tags = " ".join(item.get("tags") or []).lower()
    item_type = item.get("type", "")

    combined_text = f"{title} {description} {tags}"

    # +5 per relevant keyword; -15 per site-specific (non-national) keyword.
    score += 5 * sum(1 for kw in HIGH_PRIORITY_KEYWORDS if kw in combined_text)
    score -= 15 * sum(1 for kw in LOW_PRIORITY_KEYWORDS if kw in combined_text)

    # Queryable GIS services are the most useful.
    if "Feature Service" in item_type:
        score += 20
    elif "Map Service" in item_type:
        score += 10

    # A year in the title hints at a temporal series.
    if extract_year_from_title(title):
        score += 10

    # Reward recency: +2 per year elapsed since 2020-01-01 (epoch ms), max +10.
    modified = item.get("modified", 0)
    if modified:
        years_since_2020 = (modified - 1577836800000) / (365.25 * 24 * 60 * 60 * 1000)
        score += min(years_since_2020 * 2, 10)

    return score
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def build_rest_endpoint(item: Dict) -> Optional[str]:
    """Return a queryable REST layer URL for a Feature Service item, or None."""
    if "Feature Service" not in item.get("type", ""):
        return None

    url = item.get("url")
    if url and "/FeatureServer" in url:
        if url.endswith(("FeatureServer", "FeatureServer/")):
            # Bare service URL: default to layer 0.
            return f"{url.rstrip('/')}/0"
        # URL already points at a specific layer.
        return url

    # Fallback: construct a conventional endpoint from the item ID.
    item_id = item.get("id")
    if item_id:
        return f"https://services.arcgis.com/{STRI_ORG_ID}/arcgis/rest/services/{item_id}/FeatureServer/0"

    return None
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def catalog_datasets(max_datasets: int = 100) -> List[Dict]:
    """
    Scrape the STRI portal and build a prioritized catalog.

    Pages through the org-scoped search, keeps only Feature Service items
    (queryable geospatial data), scores each one, and returns them sorted
    by priority score (highest first), truncated to ``max_datasets``.

    Note: the loop condition counts *kept* datasets while pagination counts
    *all* returned items, so more than max_datasets/batch_size pages may be
    fetched when many items are filtered out.

    Args:
        max_datasets: Maximum number of datasets to retrieve

    Returns:
        List of dataset metadata dictionaries
    """
    datasets = []
    start = 1
    batch_size = 100

    logger.info("Scraping STRI GIS Portal...")

    while len(datasets) < max_datasets:
        logger.info(f"Fetching items {start} to {start + batch_size - 1}...")

        results = search_stri_portal(num=batch_size, start=start)

        # search_stri_portal returns {} on failure — stop paging either way.
        if not results or "results" not in results:
            break

        items = results["results"]

        if not items:
            break

        for item in items:
            # Focus on Feature Services (queryable geospatial data)
            if "Feature Service" not in item.get("type", ""):
                continue

            # Calculate priority
            priority = calculate_priority_score(item)

            # Extract year if temporal
            year = extract_year_from_title(item.get("title", ""))

            # Build REST endpoint
            rest_endpoint = build_rest_endpoint(item)

            dataset = {
                "id": item.get("id"),
                "title": item.get("title"),
                "description": item.get("description", ""),
                "type": item.get("type"),
                "tags": item.get("tags", []),
                "modified": item.get("modified"),
                # "modified" is epoch milliseconds; convert to ISO-8601.
                "modified_date": datetime.fromtimestamp(
                    item.get("modified", 0) / 1000
                ).isoformat() if item.get("modified") else None,
                "url": item.get("url"),
                "rest_endpoint": rest_endpoint,
                "year": year,
                "priority_score": round(priority, 2)
            }

            datasets.append(dataset)

        # Check if there are more results
        if start + batch_size > results.get("total", 0):
            break

        start += batch_size

    # Sort by priority score
    datasets.sort(key=lambda x: x["priority_score"], reverse=True)

    logger.info(f"Found {len(datasets)} Feature Service datasets")

    return datasets[:max_datasets]
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def identify_temporal_groups(datasets: List[Dict]) -> Dict[str, List[Dict]]:
    """
    Group year-tagged datasets into multi-year series.

    The year (plus any "edition"/"version" wording) is stripped from each
    title to derive a base name; only base names that occur with more than
    one dataset are kept, each group sorted chronologically.

    Returns:
        Dictionary mapping base name to its year-sorted list of datasets.
    """
    candidates: Dict[str, List[Dict]] = {}

    for ds in datasets:
        if ds["year"] is None:
            continue

        # Derive the base series name by removing year markers from the title.
        base = re.sub(r'\b(19\d{2}|20\d{2})\b', '', ds["title"])
        base = re.sub(r'\s+', ' ', base).strip()
        base = re.sub(r'edition|version', '', base, flags=re.IGNORECASE).strip()

        candidates.setdefault(base, []).append(ds)

    # A "series" needs at least two entries.
    return {
        name: sorted(group, key=lambda d: d["year"])
        for name, group in candidates.items()
        if len(group) > 1
    }
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def save_catalog(datasets: List[Dict], temporal_groups: Dict[str, List[Dict]]):
    """Persist the scraped catalog (and any temporal groups) as JSON files."""
    METADATA_DIR.mkdir(parents=True, exist_ok=True)

    catalog_path = METADATA_DIR / "stri_catalog.json"
    catalog_doc = {
        "generated_at": datetime.now().isoformat(),
        "total_datasets": len(datasets),
        "datasets": datasets
    }
    with open(catalog_path, 'w') as f:
        json.dump(catalog_doc, f, indent=2)

    logger.info(f"Saved catalog to {catalog_path}")

    # The temporal-groups file is only written when groups were detected.
    if temporal_groups:
        temporal_path = METADATA_DIR / "stri_temporal_groups.json"
        groups_doc = {
            "generated_at": datetime.now().isoformat(),
            "num_groups": len(temporal_groups),
            "groups": temporal_groups
        }
        with open(temporal_path, 'w') as f:
            json.dump(groups_doc, f, indent=2)

        logger.info(f"Saved {len(temporal_groups)} temporal groups to {temporal_path}")
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def main():
    """Scrape the portal, detect temporal series, persist and summarize."""
    logger.info("=== STRI GIS Portal Catalog Scraper ===")

    datasets = catalog_datasets(max_datasets=100)
    temporal_groups = identify_temporal_groups(datasets)
    save_catalog(datasets, temporal_groups)

    # Human-readable summary.
    logger.info("\n" + "="*60)
    logger.info(f"✅ Cataloged {len(datasets)} datasets")
    logger.info(f"📊 Found {len(temporal_groups)} temporal dataset groups")

    if temporal_groups:
        logger.info("\nTemporal Groups:")
        for base_name, group in list(temporal_groups.items())[:5]:
            years = [d["year"] for d in group]
            logger.info(f" - {base_name}: {years}")

    logger.info("\nTop 10 Priority Datasets:")
    for i, dataset in enumerate(datasets[:10], 1):
        logger.info(f" {i}. [{dataset['priority_score']:.1f}] {dataset['title']}")

    logger.info("="*60)


if __name__ == "__main__":
    main()
|
backend/scripts/update_embeddings.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Update Embeddings for Semantic Search
|
| 3 |
+
|
| 4 |
+
Refreshes the embeddings.json index with any new tables in the catalog.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
import asyncio
|
| 9 |
+
import logging
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Add project root to path
|
| 13 |
+
sys.path.append(str(Path(__file__).parent.parent.parent))
|
| 14 |
+
|
| 15 |
+
from backend.core.data_catalog import get_data_catalog
|
| 16 |
+
from backend.core.semantic_search import get_semantic_search
|
| 17 |
+
|
| 18 |
+
def update_embeddings():
    """Refresh the embeddings index so it covers every table in the catalog."""
    print("=" * 60)
    # Reload the catalog first so any newly registered tables are included.
    catalog = get_data_catalog()
    catalog.load_catalog()

    search_service = get_semantic_search()

    print(f"Catalog size: {len(catalog.catalog)} tables")
    print(f"Existing embeddings: {len(search_service.embeddings)}")

    print("\nGenerating embeddings for new tables...")
    added = search_service.embed_all_tables(catalog.catalog)

    print(f"\n✅ Embedded {added} new tables.")
    print(f"Total embedded: {len(search_service.embeddings)}")

if __name__ == "__main__":
    update_embeddings()
|
| 37 |
+
|
backend/scripts/validate_censo.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import unicodedata
|
| 5 |
+
|
| 6 |
+
# Define paths
|
| 7 |
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 8 |
+
CSV_PATH = os.path.join(BASE_DIR, "data/censo/censo_panama_2023_unificado.csv")
|
| 9 |
+
GEOJSON_PATH = os.path.join(BASE_DIR, "data/base/pan_admin3.geojson")
|
| 10 |
+
|
| 11 |
+
def normalize_text(text):
    """Lowercase *text*, trim whitespace, and strip accents via NFKD folding.

    Falsy input (None, "") yields "".
    """
    if not text:
        return ""
    # Decompose accented characters, then drop non-ASCII combining marks.
    folded = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    return folded.lower().strip()
|
| 17 |
+
|
| 18 |
+
def validate_censo_integration():
    """Validate that census CSV rows can be joined to admin-3 GeoJSON features.

    Matches each CSV (province, district, corregimiento) name triple against
    a lookup built from the GeoJSON, using three passes per row:
      1. exact (accent-folded) name match,
      2. relaxed match ignoring the district when the corregimiento name is
         unique within the province,
      3. primitive prefix/contains fuzzy match within the province.
    Matched rows get a 'geo_match_id' (adm3_pcode); a match-rate report and
    per-province mismatch summary are printed at the end.
    """
    print(f"Loading CSV from {CSV_PATH}...")
    csv_data = []
    try:
        with open(CSV_PATH, mode='r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                csv_data.append(row)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    print(f"Loading GeoJSON from {GEOJSON_PATH}...")
    try:
        with open(GEOJSON_PATH, 'r') as f:
            geojson = json.load(f)
    except Exception as e:
        print(f"Error loading GeoJSON: {e}")
        return

    # Build GeoJSON Lookup Map: (norm_prov, norm_dist, norm_corr) -> properties
    geojson_lookup = {}

    # Helper to handle common name variations found in Panama data
    # (can add more rules as we discover mismatches)
    def clean_name(name):
        n = normalize_text(name)
        # remove "distrito de", "comarca", etc if needed
        return n

    print("Building GeoJSON lookup table...")
    for feature in geojson['features']:
        props = feature.get('properties', {})
        p_name = clean_name(props.get('adm1_name'))
        d_name = clean_name(props.get('adm2_name'))
        c_name = clean_name(props.get('adm3_name'))

        key = (p_name, d_name, c_name)
        # Duplicate keys mean two features share the same normalized triple;
        # the later feature silently wins below.
        if key in geojson_lookup:
            print(f"Duplicate key in GeoJSON: {key}")
        geojson_lookup[key] = props

    print(f"GeoJSON lookup size: {len(geojson_lookup)}")

    # Heuristics for Province Mapping (New -> Old)
    # The census uses newer province names than the boundary file.
    PROV_MAPPING = {
        "panama oeste": "panama",
        "comarca naso tjer di": "bocas del toro"  # Naso was part of Bocas
    }

    print("\nValidating CSV via Name Matching with Heuristics...")

    matches = []
    mismatches = []

    for row in csv_data:
        # CSV headers: nomb_prov, nomb_dist, nomb_corr
        p_name = clean_name(row.get('nomb_prov'))
        d_name = clean_name(row.get('nomb_dist'))
        c_name = clean_name(row.get('nomb_corr'))

        # Apply Province Mapping
        search_p_name = PROV_MAPPING.get(p_name, p_name)

        # 1. Try Exact Match (with mapped province)
        key = (search_p_name, d_name, c_name)
        if key in geojson_lookup:
            matches.append(row)
            row['geo_match_id'] = geojson_lookup[key].get('adm3_pcode')
            continue

        # 2. Relaxed District Match: Search in Province
        # Find any entry in this province with the same corregimiento name
        candidates = [k for k in geojson_lookup.keys() if k[0] == search_p_name and k[2] == c_name]

        if len(candidates) == 1:
            # Single match found in another district!
            match_key = candidates[0]
            matches.append(row)
            row['geo_match_id'] = geojson_lookup[match_key].get('adm3_pcode')
            # print(f"Relaxed Match: {c_name} (CSV Dist: {d_name}) -> (Geo Dist: {match_key[1]})")
            continue
        elif len(candidates) > 1:
            # Ambiguous (same corregimiento name in multiple districts of same province - rare but possible)
            # print(f"Ambiguous: {c_name} found in districts {[k[1] for k in candidates]}")
            pass

        # 3. Fuzzy/Typo Fixes (Specific hardcodes for common mismatch types if needed)
        # E.g. "El Hato de San Juan de Dios" vs "El Hato de San Juan"
        # We can perform a primitive "contains" check

        best_candidate = None
        # Get all corregimientos in this province
        prov_corrs = [k for k in geojson_lookup.keys() if k[0] == search_p_name]

        for k in prov_corrs:
            geo_c = k[2]
            # Check if one contains the other
            # (length guard > 4 avoids trivially-short names matching everything)
            if (c_name in geo_c or geo_c in c_name) and len(c_name) > 4 and len(geo_c) > 4:
                # Check if starts matching
                if c_name.startswith(geo_c) or geo_c.startswith(c_name):
                    best_candidate = k
                    break

        if best_candidate:
            matches.append(row)
            row['geo_match_id'] = geojson_lookup[best_candidate].get('adm3_pcode')
            # print(f"Fuzzy Match: '{c_name}' ~= '{best_candidate[2]}'")
            continue

        # No match
        mismatches.append(row)
        row['lookup_key'] = (search_p_name, d_name, c_name)

    print(f"Total rows in CSV: {len(csv_data)}")
    print(f"Matches found: {len(matches)}")
    print(f"Mismatches found: {len(mismatches)}")
    print(f"Match Rate: {len(matches)/len(csv_data)*100:.1f}%")

    if mismatches:
        print("\nMismatch Details (First 20):")
        print(f"{'CSV Key (Prov, Dist, Corr)':<60} {'Closest Match?':<20}")
        print("-" * 85)
        for row in mismatches[:20]:
            key = row['lookup_key']
            print(f"{str(key):<60}")

    # Analyze mismatches by Province
    print("\nAnalyzing remaining mismatches by Province:")
    prov_mismatches = {}
    for row in mismatches:
        p = row['nomb_prov']
        prov_mismatches[p] = prov_mismatches.get(p, 0) + 1
    for p, count in prov_mismatches.items():
        print(f"{p}: {count}")

if __name__ == "__main__":
    validate_censo_integration()
|
backend/services/data_loader.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Loader Service for Panama Geographic Data
|
| 3 |
+
|
| 4 |
+
Loads GeoJSON files from the data/raw directory and provides
|
| 5 |
+
query capabilities for the LLM to search and filter features.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
from typing import List, Dict, Any, Optional
|
| 11 |
+
from functools import lru_cache
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class PanamaDataLoader:
    """
    Singleton service to load and query Panama geographic data.

    Parses the HDX administrative-boundary GeoJSON files once, on first
    construction, and caches the features in memory; every later
    ``PanamaDataLoader()`` call returns the same instance and data.
    """

    # Singleton instance and one-shot load guard (class-level state).
    _instance = None
    _data_loaded = False

    # Parsed GeoJSON features per administrative level.
    admin0: List[Dict[str, Any]] = []  # Country
    admin1: List[Dict[str, Any]] = []  # Provinces (13)
    admin2: List[Dict[str, Any]] = []  # Districts (76)
    admin3: List[Dict[str, Any]] = []  # Corregimientos (594)

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every PanamaDataLoader() call even though
        # __new__ returns the cached instance, so guard the disk load.
        if not PanamaDataLoader._data_loaded:
            self._load_data()
            PanamaDataLoader._data_loaded = True

    def _get_data_path(self) -> str:
        """Get the path to the data/raw directory."""
        # Navigate from backend/services up to the project root.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(os.path.dirname(current_dir))
        return os.path.join(project_root, "data", "raw")

    def _load_geojson(self, filename: str) -> List[Dict[str, Any]]:
        """Load a GeoJSON file and return its features.

        Returns an empty list (after logging) when the file is missing or
        unparsable, so a partial dataset never aborts startup.
        """
        filepath = os.path.join(self._get_data_path(), filename)

        if not os.path.exists(filepath):
            print(f"Warning: {filepath} not found")
            return []

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            features = data.get('features', [])
            # BUGFIX: log messages previously printed a literal placeholder
            # instead of the file name, making failures untraceable.
            print(f" Loaded {len(features)} features from {filename}")
            return features
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            return []

    def _load_data(self):
        """Load all GeoJSON data files."""
        print("=" * 50)
        print("Loading Panama Geographic Data...")
        print("=" * 50)

        self.admin0 = self._load_geojson("pan_admin0.geojson")
        self.admin1 = self._load_geojson("pan_admin1.geojson")
        self.admin2 = self._load_geojson("pan_admin2.geojson")
        self.admin3 = self._load_geojson("pan_admin3.geojson")

        total = len(self.admin0) + len(self.admin1) + len(self.admin2) + len(self.admin3)
        print(f"Total features loaded: {total}")
        print("=" * 50)

    def get_schema_context(self) -> str:
        """Return schema description for LLM context."""
        return """
Panama Geographic Data (HDX Administrative Boundaries):

1. admin0 (Country Level)
   - adm0_name: "Panamá"
   - adm0_pcode: "PA"
   - area_sqkm: country area in square kilometers
   - geometry: MultiPolygon

2. admin1 (Provinces - 13 total)
   - adm1_name: Province name (e.g., "Bocas del Toro", "Panamá", "Colón")
   - adm1_pcode: Province code (e.g., "PA01", "PA08")
   - adm0_name: "Panamá"
   - area_sqkm: province area
   - center_lat, center_lon: centroid coordinates
   - geometry: MultiPolygon

3. admin2 (Districts - 76 total)
   - adm2_name: District name
   - adm2_pcode: District code (e.g., "PA0101")
   - adm1_name: Parent province name
   - adm1_pcode: Parent province code
   - area_sqkm: district area
   - center_lat, center_lon: centroid coordinates
   - geometry: MultiPolygon

4. admin3 (Corregimientos - 594 total)
   - adm3_name: Corregimiento name
   - adm3_pcode: Corregimiento code (e.g., "PA010101")
   - adm2_name: Parent district name
   - adm2_pcode: Parent district code
   - adm1_name: Parent province name
   - area_sqkm: corregimiento area
   - center_lat, center_lon: centroid coordinates
   - geometry: MultiPolygon

Notes:
- All geometries use WGS84 (EPSG:4326) coordinate system
- P-codes follow ISO 3166-2 format
- Valid as of 2021-10-20
"""

    def get_data_citations(self, admin_levels: List[str]) -> List[str]:
        """Return citations for the queried data.

        Unknown level names are ignored; an empty selection falls back to
        one generic citation for the whole boundary dataset.
        """
        citations = []
        level_names = {
            "admin0": "Panama Country Boundary",
            "admin1": "Panama Provinces",
            "admin2": "Panama Districts",
            "admin3": "Panama Corregimientos"
        }

        for level in admin_levels:
            if level in level_names:
                citations.append(f"{level_names[level]} (HDX COD-AB, 2021)")

        return citations if citations else ["Panama Administrative Boundaries (HDX COD-AB, 2021)"]

    def search_by_name(
        self,
        name: str,
        admin_level: Optional[str] = None,
        limit: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Search for features by name (case-insensitive partial match).

        Args:
            name: Search term
            admin_level: Optional filter ("admin1", "admin2", "admin3")
            limit: Maximum results to return

        Returns:
            A list of ``{"level": <admin level>, "feature": <GeoJSON feature>}``
            dicts, at most ``limit`` long.
        """
        name_lower = name.lower()
        results: List[Dict[str, Any]] = []

        if admin_level:
            # Unknown level names yield an empty feature list via getattr.
            levels_to_search = [(admin_level, getattr(self, admin_level, []))]
        else:
            levels_to_search = [
                ("admin1", self.admin1),
                ("admin2", self.admin2),
                ("admin3", self.admin3)
            ]

        for level_name, features in levels_to_search:
            for feature in features:
                props = feature.get("properties", {})

                # A feature matches if any of its name fields contains the term.
                for key in ["adm1_name", "adm2_name", "adm3_name", "adm0_name"]:
                    value = props.get(key, "")
                    if value and name_lower in value.lower():
                        results.append({
                            "level": level_name,
                            "feature": feature
                        })
                        break

                if len(results) >= limit:
                    # Early return replaces the original nested double-break.
                    return results

        return results

    def get_all_provinces(self) -> List[Dict[str, Any]]:
        """Get all provinces (admin1)."""
        return self.admin1

    def get_all_districts(self, province_pcode: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get all districts, optionally filtered by parent province P-code."""
        if province_pcode:
            return [
                f for f in self.admin2
                if f.get("properties", {}).get("adm1_pcode") == province_pcode
            ]
        return self.admin2

    def get_all_corregimientos(
        self,
        district_pcode: Optional[str] = None,
        province_pcode: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Get all corregimientos, optionally filtered.

        ``district_pcode`` takes precedence over ``province_pcode`` when both
        are supplied (matches original behavior).
        """
        results = self.admin3

        if district_pcode:
            results = [
                f for f in results
                if f.get("properties", {}).get("adm2_pcode") == district_pcode
            ]
        elif province_pcode:
            results = [
                f for f in results
                if f.get("properties", {}).get("adm1_pcode") == province_pcode
            ]

        return results

    def get_by_pcode(self, pcode: str) -> Optional[Dict[str, Any]]:
        """Get a feature by its P-code (case-insensitive).

        The P-code length encodes the admin level:
        2 = country, 4 = province, 6 = district, 8 = corregimiento.
        Returns None for unknown lengths or unmatched codes.
        """
        pcode_upper = pcode.upper()

        # Table replaces four near-identical scan branches.
        levels = {
            2: (self.admin0, "adm0_pcode"),
            4: (self.admin1, "adm1_pcode"),
            6: (self.admin2, "adm2_pcode"),
            8: (self.admin3, "adm3_pcode"),
        }
        entry = levels.get(len(pcode_upper))
        if entry is None:
            return None

        features, key = entry
        for f in features:
            if f.get("properties", {}).get(key) == pcode_upper:
                return f

        return None

    def to_geojson(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Convert a list of features to a GeoJSON FeatureCollection."""
        # Handle both raw features and {"level", "feature"} wrappers
        # produced by search_by_name.
        clean_features = []
        for f in features:
            if "feature" in f:
                clean_features.append(f["feature"])
            else:
                clean_features.append(f)

        return {
            "type": "FeatureCollection",
            "features": clean_features
        }
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
# Singleton instance
_data_loader: Optional[PanamaDataLoader] = None


def get_data_loader() -> PanamaDataLoader:
    """Return the process-wide PanamaDataLoader, constructing it lazily."""
    global _data_loader
    if _data_loader is not None:
        return _data_loader
    _data_loader = PanamaDataLoader()
    return _data_loader
|
backend/services/executor.py
ADDED
|
@@ -0,0 +1,860 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Query Executor Service
|
| 3 |
+
|
| 4 |
+
Handles query processing with intent detection, data querying, and response generation.
|
| 5 |
+
Uses semantic search for scalable dataset discovery and session-scoped layer storage.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from backend.core.llm_gateway import LLMGateway
|
| 9 |
+
from backend.services.data_loader import get_data_loader
|
| 10 |
+
from backend.core.geo_engine import get_geo_engine
|
| 11 |
+
from backend.services.response_formatter import ResponseFormatter
|
| 12 |
+
from backend.core.session_store import get_session_store
|
| 13 |
+
from backend.core.semantic_search import get_semantic_search
|
| 14 |
+
from backend.core.data_catalog import get_data_catalog
|
| 15 |
+
from backend.core.query_planner import get_query_planner
|
| 16 |
+
from typing import List, Dict, Any, Optional
|
| 17 |
+
import json
|
| 18 |
+
import datetime
|
| 19 |
+
import uuid
|
| 20 |
+
import logging
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Default session ID for backward compatibility
|
| 25 |
+
DEFAULT_SESSION_ID = "default-session"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class QueryExecutor:
|
| 29 |
+
def __init__(self):
    """Wire up the LLM gateway and the shared backend services.

    Every collaborator except the LLM gateway is obtained through its
    module-level ``get_*`` accessor (singleton style); the gateway is
    constructed directly here.
    """
    self.llm = LLMGateway()
    self.data_loader = get_data_loader()
    self.geo_engine = get_geo_engine()
    self.session_store = get_session_store()
    self.semantic_search = get_semantic_search()
    self.catalog = get_data_catalog()
    self.query_planner = get_query_planner()
|
| 37 |
+
|
| 38 |
+
def _get_schema_context(self) -> str:
    """Expose the data loader's schema description for LLM prompt context."""
    loader = self.data_loader
    return loader.get_schema_context()
|
| 41 |
+
|
| 42 |
+
async def process_query_with_context(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    Orchestrates the full query processing flow with conversation context.

    Detects the user's intent first, then dispatches to the matching
    handler; anything unrecognized falls back to general chat.
    """
    intent = await self.llm.detect_intent(query, history)
    print(f"[GeoQuery] Detected intent: {intent}")

    if intent in ("DATA_QUERY", "MAP_REQUEST"):
        # Always include map for data queries - the visual is helpful
        return await self._handle_data_query(query, history, include_map=True)
    if intent == "SPATIAL_OP":
        return await self._handle_spatial_op(query, history)
    if intent == "STAT_QUERY":
        return await self._handle_stat_query(query, history)
    # GENERAL_CHAT and any unknown intent string take the same path,
    # exactly as the original if/elif/else chain did.
    return await self._handle_general_chat(query, history)
|
| 62 |
+
|
| 63 |
+
async def process_query_stream(self, query: str, history: List[Dict[str, str]]):
|
| 64 |
+
"""
|
| 65 |
+
Streamable version of process_query_with_context.
|
| 66 |
+
Yields: {"event": "status"|"thought"|"chunk"|"result", "data": ...}
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
# 1. Intent Detection with Thoughts
|
| 70 |
+
yield {"event": "status", "data": json.dumps({"status": "🧠 Understanding intent..."})}
|
| 71 |
+
|
| 72 |
+
intent = "GENERAL_CHAT"
|
| 73 |
+
intent_buffer = ""
|
| 74 |
+
|
| 75 |
+
try:
|
| 76 |
+
async for chunk in self.llm.stream_intent(query, history):
|
| 77 |
+
if chunk["type"] == "thought":
|
| 78 |
+
yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
|
| 79 |
+
elif chunk["type"] == "content":
|
| 80 |
+
intent_buffer += chunk["text"]
|
| 81 |
+
except Exception as e:
|
| 82 |
+
print(f"Intent stream error: {e}")
|
| 83 |
+
|
| 84 |
+
intent = intent_buffer.strip().upper()
|
| 85 |
+
if not intent:
|
| 86 |
+
intent = "GENERAL_CHAT"
|
| 87 |
+
|
| 88 |
+
# Clean up intent string
|
| 89 |
+
for valid in ["GENERAL_CHAT", "DATA_QUERY", "MAP_REQUEST", "SPATIAL_OP", "STAT_QUERY"]:
|
| 90 |
+
if valid in intent:
|
| 91 |
+
intent = valid
|
| 92 |
+
break
|
| 93 |
+
|
| 94 |
+
yield {"event": "intent", "data": json.dumps({"intent": intent})}
|
| 95 |
+
print(f"[GeoQuery] Detected intent: {intent}")
|
| 96 |
+
|
| 97 |
+
if intent == "GENERAL_CHAT":
|
| 98 |
+
async for chunk in self.llm.generate_response_stream(query, history):
|
| 99 |
+
# Transform to frontend protocol
|
| 100 |
+
if chunk.get("type") == "content":
|
| 101 |
+
yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk.get("text")})}
|
| 102 |
+
elif chunk.get("type") == "thought":
|
| 103 |
+
yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk.get("content")})}
|
| 104 |
+
|
| 105 |
+
# Send final result to clear loading status
|
| 106 |
+
yield {"event": "result", "data": json.dumps({"response": ""})}
|
| 107 |
+
return
|
| 108 |
+
|
| 109 |
+
# Handle Data/Map/Stat Queries together via a unified stream handler
|
| 110 |
+
|
| 111 |
+
if intent in ["DATA_QUERY", "MAP_REQUEST", "STAT_QUERY"]:
|
| 112 |
+
include_map = intent != "STAT_QUERY"
|
| 113 |
+
session_id = DEFAULT_SESSION_ID # TODO: Get from request context
|
| 114 |
+
|
| 115 |
+
# 0. Check query complexity
|
| 116 |
+
complexity = self.query_planner.detect_complexity(query)
|
| 117 |
+
|
| 118 |
+
if complexity["is_complex"]:
|
| 119 |
+
yield {"event": "status", "data": json.dumps({"status": "���� Complex query detected, planning steps..."})}
|
| 120 |
+
logger.info(f"Complex query detected: {complexity['reason']}")
|
| 121 |
+
|
| 122 |
+
# Use multi-step executor
|
| 123 |
+
async for event in self._execute_multi_step_query(query, history, include_map, session_id):
|
| 124 |
+
yield event
|
| 125 |
+
return
|
| 126 |
+
|
| 127 |
+
# Simple query - continue with existing flow
|
| 128 |
+
# 0. Semantic Discovery (scalable pre-filter)
|
| 129 |
+
yield {"event": "status", "data": json.dumps({"status": "📚 Searching data catalog..."})}
|
| 130 |
+
|
| 131 |
+
# Use semantic search to find top candidates
|
| 132 |
+
candidate_tables = self.semantic_search.search_table_names(query, top_k=15)
|
| 133 |
+
|
| 134 |
+
if candidate_tables:
|
| 135 |
+
# Get focused summaries for LLM refinement
|
| 136 |
+
candidate_summaries = self.catalog.get_summaries_for_tables(candidate_tables)
|
| 137 |
+
else:
|
| 138 |
+
# Fallback to all summaries (legacy behavior for small catalogs)
|
| 139 |
+
candidate_summaries = self.catalog.get_all_table_summaries()
|
| 140 |
+
|
| 141 |
+
# 1. LLM refines from candidates
|
| 142 |
+
yield {"event": "status", "data": json.dumps({"status": "🔍 Identifying relevant tables..."})}
|
| 143 |
+
relevant_tables = await self.llm.identify_relevant_tables(query, candidate_summaries)
|
| 144 |
+
|
| 145 |
+
# 2. Lazy Load
|
| 146 |
+
if relevant_tables:
|
| 147 |
+
yield {"event": "status", "data": json.dumps({"status": f"💾 Loading tables: {', '.join(relevant_tables)}..."})}
|
| 148 |
+
|
| 149 |
+
feature_tables = []
|
| 150 |
+
for table in relevant_tables:
|
| 151 |
+
if self.geo_engine.ensure_table_loaded(table):
|
| 152 |
+
feature_tables.append(table)
|
| 153 |
+
|
| 154 |
+
# 3. Schema
|
| 155 |
+
table_schema = self.geo_engine.get_table_schemas()
|
| 156 |
+
|
| 157 |
+
# 4. Generate SQL (Streaming Thoughts!)
|
| 158 |
+
yield {"event": "status", "data": json.dumps({"status": "✍️ Writing SQL query..."})}
|
| 159 |
+
|
| 160 |
+
sql_buffer = ""
|
| 161 |
+
async for chunk in self.llm.stream_analytical_sql(query, table_schema, history):
|
| 162 |
+
if chunk["type"] == "thought":
|
| 163 |
+
yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
|
| 164 |
+
elif chunk["type"] == "content":
|
| 165 |
+
sql_buffer += chunk["text"]
|
| 166 |
+
|
| 167 |
+
sql = sql_buffer.replace("```sql", "").replace("```", "").strip()
|
| 168 |
+
|
| 169 |
+
# 5. Check for DATA_UNAVAILABLE error from LLM
|
| 170 |
+
if "DATA_UNAVAILABLE" in sql or sql.startswith("-- ERROR"):
|
| 171 |
+
yield {"event": "status", "data": json.dumps({"status": "ℹ️ Data not available"})}
|
| 172 |
+
|
| 173 |
+
requested = "the requested data"
|
| 174 |
+
available = "administrative boundaries (provinces, districts, corregimientos)"
|
| 175 |
+
|
| 176 |
+
for line in sql.split("\n"):
|
| 177 |
+
if "Requested:" in line:
|
| 178 |
+
requested = line.split("Requested:")[-1].strip()
|
| 179 |
+
elif "Available:" in line:
|
| 180 |
+
available = line.split("Available:")[-1].strip()
|
| 181 |
+
|
| 182 |
+
error_response = f"""I couldn't find data for **{requested}** in the current database.
|
| 183 |
+
|
| 184 |
+
**Available datasets include:**
|
| 185 |
+
- {available}
|
| 186 |
+
|
| 187 |
+
If you need additional data, please let me know and I can help you understand what's currently available or suggest alternative queries."""
|
| 188 |
+
|
| 189 |
+
yield {
|
| 190 |
+
"event": "result",
|
| 191 |
+
"data": json.dumps({
|
| 192 |
+
"response": error_response,
|
| 193 |
+
"sql_query": sql,
|
| 194 |
+
"geojson": None,
|
| 195 |
+
"data_citations": [],
|
| 196 |
+
"chart_data": None,
|
| 197 |
+
"raw_data": []
|
| 198 |
+
})
|
| 199 |
+
}
|
| 200 |
+
return
|
| 201 |
+
|
| 202 |
+
# 6. Execute query
|
| 203 |
+
yield {"event": "status", "data": json.dumps({"status": "⚡ Executing query..."})}
|
| 204 |
+
|
| 205 |
+
geojson = None
|
| 206 |
+
features = []
|
| 207 |
+
error_message = None
|
| 208 |
+
|
| 209 |
+
try:
|
| 210 |
+
geojson = self.geo_engine.execute_spatial_query(sql)
|
| 211 |
+
features = geojson.get("features", [])
|
| 212 |
+
yield {"event": "status", "data": json.dumps({"status": f"✅ Found {len(features)} results"})}
|
| 213 |
+
except Exception as e:
|
| 214 |
+
error_message = str(e)
|
| 215 |
+
yield {"event": "status", "data": json.dumps({"status": "⚠️ Query error, attempting repair..."})}
|
| 216 |
+
try:
|
| 217 |
+
sql = await self.llm.correct_sql(query, sql, error_message, str(table_schema))
|
| 218 |
+
geojson = self.geo_engine.execute_spatial_query(sql)
|
| 219 |
+
features = geojson.get("features", [])
|
| 220 |
+
error_message = None
|
| 221 |
+
except Exception as e2:
|
| 222 |
+
print(f"Repair failed: {e2}")
|
| 223 |
+
|
| 224 |
+
if error_message:
|
| 225 |
+
yield {
|
| 226 |
+
"event": "result",
|
| 227 |
+
"data": json.dumps({
|
| 228 |
+
"response": f"I was unable to process your request because the data query failed. \n\nError details: {error_message}",
|
| 229 |
+
"sql_query": sql,
|
| 230 |
+
"geojson": None,
|
| 231 |
+
"data_citations": [],
|
| 232 |
+
"chart_data": None,
|
| 233 |
+
"raw_data": []
|
| 234 |
+
})
|
| 235 |
+
}
|
| 236 |
+
return
|
| 237 |
+
|
| 238 |
+
# 7. Post-process using ResponseFormatter
|
| 239 |
+
citations = ResponseFormatter.generate_citations(relevant_tables, features)
|
| 240 |
+
|
| 241 |
+
# Chart
|
| 242 |
+
chart_data = ResponseFormatter.generate_chart_data(sql, features)
|
| 243 |
+
if intent == "STAT_QUERY" and not chart_data and features:
|
| 244 |
+
chart_data = ResponseFormatter.generate_chart_data("GROUP BY forced", features)
|
| 245 |
+
|
| 246 |
+
# Raw Data
|
| 247 |
+
raw_data = ResponseFormatter.prepare_raw_data(features)
|
| 248 |
+
|
| 249 |
+
# Map Config
|
| 250 |
+
if include_map and features and geojson:
|
| 251 |
+
# Generate AI layer name
|
| 252 |
+
layer_info = await self.llm.generate_layer_name(query, sql)
|
| 253 |
+
layer_name_ai = layer_info.get("name", "Map Layer")
|
| 254 |
+
layer_emoji = layer_info.get("emoji", "📍")
|
| 255 |
+
point_style = layer_info.get("pointStyle", None)
|
| 256 |
+
geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)
|
| 257 |
+
|
| 258 |
+
try:
|
| 259 |
+
table_name = self.geo_engine.register_layer(layer_id, geojson)
|
| 260 |
+
self.session_store.add_layer(session_id, {
|
| 261 |
+
"id": layer_id,
|
| 262 |
+
"name": layer_name,
|
| 263 |
+
"table_name": table_name,
|
| 264 |
+
"timestamp": datetime.datetime.now().isoformat()
|
| 265 |
+
})
|
| 266 |
+
except Exception as e:
|
| 267 |
+
logger.warning(f"Failed to register layer: {e}")
|
| 268 |
+
|
| 269 |
+
# 8. Explanation (Streaming!)
|
| 270 |
+
yield {"event": "status", "data": json.dumps({"status": "💬 Generating explanation..."})}
|
| 271 |
+
|
| 272 |
+
data_summary = ResponseFormatter.generate_data_summary(features)
|
| 273 |
+
|
| 274 |
+
explanation_buffer = ""
|
| 275 |
+
|
| 276 |
+
async for chunk in self.llm.stream_explanation(query, sql, data_summary, history):
|
| 277 |
+
if chunk["type"] == "thought":
|
| 278 |
+
yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
|
| 279 |
+
elif chunk["type"] == "content":
|
| 280 |
+
explanation_buffer += chunk["text"]
|
| 281 |
+
yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk["text"]})}
|
| 282 |
+
|
| 283 |
+
# 9. Final Result Event
|
| 284 |
+
yield {"event": "result", "data": json.dumps({
|
| 285 |
+
"response": explanation_buffer,
|
| 286 |
+
"sql_query": sql,
|
| 287 |
+
"geojson": geojson if include_map and features else None,
|
| 288 |
+
"chart_data": chart_data,
|
| 289 |
+
"raw_data": raw_data,
|
| 290 |
+
"data_citations": citations
|
| 291 |
+
})}
|
| 292 |
+
|
| 293 |
+
elif intent == "SPATIAL_OP":
|
| 294 |
+
yield {"event": "status", "data": json.dumps({"status": "📐 Preparing spatial operation..."})}
|
| 295 |
+
session_id = DEFAULT_SESSION_ID # TODO: Get from request context
|
| 296 |
+
|
| 297 |
+
# 0. Semantic Discovery for base tables
|
| 298 |
+
candidate_tables = self.semantic_search.search_table_names(query, top_k=15)
|
| 299 |
+
if candidate_tables:
|
| 300 |
+
candidate_summaries = self.catalog.get_summaries_for_tables(candidate_tables)
|
| 301 |
+
else:
|
| 302 |
+
candidate_summaries = self.catalog.get_all_table_summaries()
|
| 303 |
+
|
| 304 |
+
# 1. Identify relevant base tables from query
|
| 305 |
+
relevant_tables = await self.llm.identify_relevant_tables(query, candidate_summaries)
|
| 306 |
+
|
| 307 |
+
# 2. Lazy load those tables
|
| 308 |
+
for table in relevant_tables:
|
| 309 |
+
self.geo_engine.ensure_table_loaded(table)
|
| 310 |
+
|
| 311 |
+
# 3. Get schema of loaded base tables
|
| 312 |
+
base_table_schema = self.geo_engine.get_table_schemas()
|
| 313 |
+
|
| 314 |
+
# 4. Prepare Layer Context (user-created layers from session)
|
| 315 |
+
session_layers = self.session_store.get_layers(session_id)
|
| 316 |
+
layer_context = "User-Created Layers:\n"
|
| 317 |
+
if not session_layers:
|
| 318 |
+
layer_context += "(No user layers created yet.)\n"
|
| 319 |
+
else:
|
| 320 |
+
for i, layer in enumerate(session_layers):
|
| 321 |
+
layer_context += f"Layer {i+1}: {layer['name']} (Table: {layer['table_name']})\n"
|
| 322 |
+
|
| 323 |
+
# 5. Combine both contexts for LLM
|
| 324 |
+
full_context = f"{base_table_schema}\n\n{layer_context}"
|
| 325 |
+
|
| 326 |
+
# 6. Generate Spatial SQL
|
| 327 |
+
yield {"event": "status", "data": json.dumps({"status": "✍️ Writing spatial SQL..."})}
|
| 328 |
+
sql = await self.llm.generate_spatial_sql(query, full_context, history)
|
| 329 |
+
|
| 330 |
+
# 7. Execute
|
| 331 |
+
yield {"event": "status", "data": json.dumps({"status": "⚙️ Processing geometry..."})}
|
| 332 |
+
error_message = None
|
| 333 |
+
geojson = None
|
| 334 |
+
features = []
|
| 335 |
+
|
| 336 |
+
try:
|
| 337 |
+
geojson = self.geo_engine.execute_spatial_query(sql)
|
| 338 |
+
features = geojson.get("features", [])
|
| 339 |
+
yield {"event": "status", "data": json.dumps({"status": f"✅ Result contains {len(features)} features"})}
|
| 340 |
+
except Exception as e:
|
| 341 |
+
error_message = str(e)
|
| 342 |
+
yield {"event": "status", "data": json.dumps({"status": "⚠️ Spatial error, attempting repair..."})}
|
| 343 |
+
try:
|
| 344 |
+
sql = await self.llm.correct_sql(query, sql, error_message, full_context)
|
| 345 |
+
geojson = self.geo_engine.execute_spatial_query(sql)
|
| 346 |
+
features = geojson.get("features", [])
|
| 347 |
+
error_message = None
|
| 348 |
+
except Exception as e2:
|
| 349 |
+
yield {
|
| 350 |
+
"event": "result",
|
| 351 |
+
"data": json.dumps({
|
| 352 |
+
"response": f"I tried to perform the spatial operation but encountered an error: {str(e)}\n\nQuery: {sql}",
|
| 353 |
+
"sql_query": sql,
|
| 354 |
+
"geojson": None,
|
| 355 |
+
"data_citations": [],
|
| 356 |
+
"chart_data": None,
|
| 357 |
+
"raw_data": []
|
| 358 |
+
})
|
| 359 |
+
}
|
| 360 |
+
return
|
| 361 |
+
|
| 362 |
+
# 4. Result Processing
|
| 363 |
+
if features:
|
| 364 |
+
# Generate AI layer name
|
| 365 |
+
layer_info = await self.llm.generate_layer_name(query, sql)
|
| 366 |
+
layer_name_ai = layer_info.get("name", "Map Layer")
|
| 367 |
+
layer_emoji = layer_info.get("emoji", "📍")
|
| 368 |
+
point_style = layer_info.get("pointStyle", None)
|
| 369 |
+
geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)
|
| 370 |
+
|
| 371 |
+
try:
|
| 372 |
+
table_name = self.geo_engine.register_layer(layer_id, geojson)
|
| 373 |
+
self.session_store.add_layer(session_id, {
|
| 374 |
+
"id": layer_id,
|
| 375 |
+
"name": layer_name,
|
| 376 |
+
"table_name": table_name,
|
| 377 |
+
"timestamp": datetime.datetime.now().isoformat()
|
| 378 |
+
})
|
| 379 |
+
except Exception as e:
|
| 380 |
+
logger.warning(f"Failed to register layer: {e}")
|
| 381 |
+
|
| 382 |
+
# 5. Explanation
|
| 383 |
+
yield {"event": "status", "data": json.dumps({"status": "💬 Explaining results..."})}
|
| 384 |
+
data_summary = f"Spatial operation resulted in {len(features)} features."
|
| 385 |
+
|
| 386 |
+
explanation_buffer = ""
|
| 387 |
+
async for chunk in self.llm.stream_explanation(query, sql, data_summary, history):
|
| 388 |
+
if chunk["type"] == "thought":
|
| 389 |
+
yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
|
| 390 |
+
elif chunk["type"] == "content":
|
| 391 |
+
explanation_buffer += chunk["text"]
|
| 392 |
+
yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk["text"]})}
|
| 393 |
+
|
| 394 |
+
# 6. Final Result
|
| 395 |
+
yield {"event": "result", "data": json.dumps({
|
| 396 |
+
"response": explanation_buffer,
|
| 397 |
+
"sql_query": sql,
|
| 398 |
+
"geojson": geojson,
|
| 399 |
+
"chart_data": None,
|
| 400 |
+
"raw_data": [], # Spatial ops usually visual
|
| 401 |
+
"data_citations": []
|
| 402 |
+
})}
|
| 403 |
+
return
|
| 404 |
+
|
| 405 |
+
else:
|
| 406 |
+
# Fallback
|
| 407 |
+
yield {"event": "chunk", "data": json.dumps({"type": "text", "content": "I'm not sure how to handle this query yet."})}
|
| 408 |
+
|
| 409 |
+
async def _handle_general_chat(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
    """Handles general conversational queries."""
    # Wrap the user's question in a short data inventory so the LLM can
    # answer meta-questions ("what data do you have?") without a SQL round-trip.
    contextual_prompt = f"""The user is asking about Panama geographic data.

Available data: {len(self.data_loader.admin1)} provinces, {len(self.data_loader.admin2)} districts, {len(self.data_loader.admin3)} corregimientos.

User question: {query}

Respond helpfully as GeoQuery, the territorial intelligence assistant."""

    llm_reply = await self.llm.generate_response(contextual_prompt, history)

    # Plain conversation produces only text — no SQL, map or citation artifacts.
    return dict(
        response=llm_reply,
        sql_query=None,
        geojson=None,
        data_citations=[],
        intent="GENERAL_CHAT",
    )
|
| 429 |
+
|
| 430 |
+
async def _handle_data_query(self, query: str, history: List[Dict[str, str]], include_map: bool = True) -> Dict[str, Any]:
    """
    Handles data queries using text-to-SQL with SOTA Smart Discovery.

    Pipeline: catalog discovery -> LLM table selection -> lazy loading ->
    LLM SQL generation -> DuckDB execution (with one self-correction retry) ->
    citations / explanation / optional map layer / chart / raw rows.

    Args:
        query: Natural-language data request from the user.
        history: Prior chat turns, forwarded to every LLM call for context.
        include_map: When False (stat queries), the GeoJSON layer is neither
            registered nor returned, only tabular/chart outputs.

    Returns:
        Response dict. On success it contains response, sql_query, geojson,
        data_citations, chart_data, raw_data and intent; on SQL-generation or
        execution failure it contains only response, sql_query and intent.
    """
    print(f"[GeoQuery] Starting Data Query: {query}")

    # 0. Get Catalog (imported lazily to avoid a module-level import cycle —
    # NOTE(review): presumed reason, confirm against backend.core.data_catalog)
    from backend.core.data_catalog import get_data_catalog
    catalog = get_data_catalog()

    # 1. Smart Discovery: Identify relevant tables
    summaries = catalog.get_all_table_summaries()

    # Ask LLM which tables are relevant
    relevant_tables = await self.llm.identify_relevant_tables(query, summaries)

    # 2. Lazy Loading — keep only the tables that actually loaded so later
    # fallbacks and error messages reflect what is really queryable.
    feature_tables = []
    for table in relevant_tables:
        if self.geo_engine.ensure_table_loaded(table):
            feature_tables.append(table)
        else:
            print(f"[GeoQuery] Warning: Could not load relevant table '{table}'")

    # 3. Get schema context (now includes the newly loaded tables)
    table_schema = self.geo_engine.get_table_schemas()

    # Fallback for empty schema: a context under 50 chars is treated as
    # "no tables loaded"; use catalog metadata for admin boundaries instead.
    if len(table_schema) < 50:
        print("[GeoQuery] GeoEngine schema empty. Fetching from Catalog Metadata.")
        fallback_tables = list(set(feature_tables + ["pan_admin1", "pan_admin2", "pan_admin3"]))
        table_schema = catalog.get_specific_table_schemas(fallback_tables)

    # 4. Generate real SQL using LLM
    print(f"[GeoQuery] Generating SQL with context size: {len(table_schema)} chars")
    sql = await self.llm.generate_analytical_sql(query, table_schema, history)

    # Check for SQL generation errors (gateway signals failure via a
    # "-- Error" comment prefix instead of raising)
    if sql.startswith("-- Error"):
        available_data = ", ".join(feature_tables) if feature_tables else "Administrative Boundaries"
        return {
            "response": f"I couldn't find the specific data you asked for. I have access to: {available_data}. \n\nOriginal request: {query}",
            "sql_query": sql,
            "intent": "DATA_QUERY"
        }

    # 5. Execute SQL in DuckDB
    error_message = None
    try:
        geojson = self.geo_engine.execute_spatial_query(sql)
        features = geojson.get("features", [])
        print(f"[GeoQuery] Query returned {len(features)} features")
    except Exception as e:
        error_message = str(e)
        print(f"[GeoQuery] SQL execution error: {error_message}")

        # Self-Correction Loop: one repair attempt; `sql` is rebound so the
        # repaired statement is what gets reported/executed downstream.
        try:
            sql = await self.llm.correct_sql(query, sql, error_message, str(table_schema))
            geojson = self.geo_engine.execute_spatial_query(sql)
            features = geojson.get("features", [])
            error_message = None
        except Exception as e2:
            return {
                "response": f"The SQL query failed to execute even after an automatic repair attempt.\nOriginal Error: {error_message}\nRepair Error: {str(e2)}",
                "sql_query": sql,
                "intent": "DATA_QUERY"
            }

    # 6. Post-Process via ResponseFormatter
    citations = ResponseFormatter.generate_citations(relevant_tables, features)
    data_summary = ResponseFormatter.generate_data_summary(features)

    # 7. Generate explanation
    explanation = await self.llm.generate_explanation(query, sql, data_summary, history)

    # 8. Add Layer Metadata to GeoJSON and REGISTER in GeoEngine
    if include_map and features:
        # Generate AI layer name
        layer_info = await self.llm.generate_layer_name(query, sql)
        layer_name_ai = layer_info.get("name", "Map Layer")
        layer_emoji = layer_info.get("emoji", "📍")
        point_style = layer_info.get("pointStyle", None)
        geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)

        # Best-effort registration: failing to persist the layer must not
        # discard an otherwise-successful query result.
        try:
            table_name = self.geo_engine.register_layer(layer_id, geojson)
            self.session_store.add_layer(DEFAULT_SESSION_ID, {
                "id": layer_id,
                "name": layer_name,
                "table_name": table_name,
                "timestamp": datetime.datetime.now().isoformat()
            })
        except Exception as e:
            logger.warning(f"Failed to register layer in GeoEngine: {e}")

    # 9. Auto-generate Chart
    chart_data = ResponseFormatter.generate_chart_data(sql, features)

    # 10. Prepare Raw Data
    raw_data = ResponseFormatter.prepare_raw_data(features)

    return {
        "response": explanation,
        "sql_query": sql,
        "geojson": geojson if include_map and features else None,
        "data_citations": citations,
        "chart_data": chart_data,
        "raw_data": raw_data,
        "intent": "DATA_QUERY" if not include_map else "MAP_REQUEST"
    }
|
| 541 |
+
|
| 542 |
+
async def _handle_spatial_op(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
    """Handles spatial operations (Difference, Intersection, etc) using GeoEngine.

    Pipeline: LLM table discovery -> lazy loading -> combined schema +
    session-layer context -> LLM spatial SQL -> execution with one
    self-correction retry -> session-layer registration -> explanation.

    Args:
        query: Natural-language spatial request.
        history: Prior chat turns forwarded to the LLM calls.

    Returns:
        Response dict with response, sql_query, geojson and intent
        ("SPATIAL_OP"); on unrecoverable SQL failure the geojson is omitted.
    """
    # 0. Get data catalog for relevant tables
    from backend.core.data_catalog import get_data_catalog
    catalog = get_data_catalog()
    summaries = catalog.get_all_table_summaries()

    # 1. Identify relevant base tables from query
    relevant_tables = await self.llm.identify_relevant_tables(query, summaries)

    # 2. Lazy load those tables
    for table in relevant_tables:
        self.geo_engine.ensure_table_loaded(table)

    # 3. Get schema of loaded base tables
    base_table_schema = self.geo_engine.get_table_schemas()

    # 4. Prepare Layer Context (user-created layers from session) so the LLM
    # can reference results of earlier operations by table name.
    session_layers = self.session_store.get_layers(DEFAULT_SESSION_ID)
    layer_context = "User-Created Layers:\n"
    if not session_layers:
        layer_context += "(No user layers created yet.)\n"
    else:
        for i, layer in enumerate(session_layers):
            layer_context += f"Layer {i+1}: {layer['name']} (Table: {layer['table_name']})\n"

    # 5. Combine both contexts for LLM
    full_context = f"{base_table_schema}\n\n{layer_context}"

    # 6. Generate Spatial SQL
    sql = await self.llm.generate_spatial_sql(query, full_context, history)

    # 7. Execute
    error_message = None
    geojson = None
    features = []

    try:
        geojson = self.geo_engine.execute_spatial_query(sql)
        features = geojson.get("features", [])
    except Exception as e:
        error_message = str(e)
        # One repair attempt; `sql` is rebound so the repaired statement is
        # what gets reported downstream.
        try:
            sql = await self.llm.correct_sql(query, sql, error_message, full_context)
            geojson = self.geo_engine.execute_spatial_query(sql)
            features = geojson.get("features", [])
            error_message = None
        except Exception as e2:
            return {
                "response": f"I tried to perform the spatial operation but encountered an error: {str(e)}\n\nQuery: {sql}",
                "sql_query": sql,
                "intent": "SPATIAL_OP"
            }

    # 8. Result Processing: name the layer and register it for this session.
    if features:
        # Generate AI layer name
        layer_info = await self.llm.generate_layer_name(query, sql)
        layer_name_ai = layer_info.get("name", "Map Layer")
        layer_emoji = layer_info.get("emoji", "📍")
        point_style = layer_info.get("pointStyle", None)
        geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)

        # FIX: registration is best-effort, consistent with the other handlers.
        # Previously an exception here would propagate and discard an
        # otherwise-successful spatial result.
        try:
            table_name = self.geo_engine.register_layer(layer_id, geojson)
            self.session_store.add_layer(DEFAULT_SESSION_ID, {
                "id": layer_id,
                "name": layer_name,
                "table_name": table_name,
                "timestamp": datetime.datetime.now().isoformat()
            })
        except Exception as reg_err:
            logger.warning(f"Failed to register layer: {reg_err}")

    # 9. Explanation — always produced, even for an empty result set.
    data_summary = f"Spatial operation resulted in {len(features)} features."
    explanation = await self.llm.generate_explanation(query, sql, data_summary, history)

    return {
        "response": explanation,
        "sql_query": sql,
        "geojson": geojson,
        "data_citations": [],
        "intent": "SPATIAL_OP"
    }
|
| 622 |
+
|
| 623 |
+
async def _handle_stat_query(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    Handles statistical queries where charts/tables are more important than maps.
    """
    # Delegate to the data-query pipeline with the map disabled, then relabel
    # the intent for the frontend.
    stats = await self._handle_data_query(query, history, include_map=False)
    stats["intent"] = "STAT_QUERY"

    # If no chart came back but tabular rows exist, retry chart generation by
    # wrapping each row as a pseudo GeoJSON feature.
    if not stats.get("chart_data") and stats.get("raw_data"):
        pseudo_features = [{"properties": row} for row in stats["raw_data"]]
        stats["chart_data"] = ResponseFormatter.generate_chart_data(stats.get("sql_query", ""), pseudo_features)

    return stats
|
| 638 |
+
|
| 639 |
+
async def _execute_multi_step_query(
    self,
    query: str,
    history: List[Dict[str, str]],
    include_map: bool,
    session_id: str
):
    """
    Execute a complex query by breaking it into multiple steps.

    Plans the query via QueryPlanner; if the plan is not complex, falls back
    to a single SQL round-trip. Otherwise executes each planned step
    (sequentially, by parallel group), collecting intermediate results, then
    streams a combined explanation and emits one final "result" event.

    Yields streaming events throughout the multi-step process:
    "status" (progress), "chunk" (thoughts/text) and a terminal "result".

    Args:
        query: Original natural-language user request.
        history: Prior chat turns forwarded to the LLM calls.
        include_map: Whether to build/register a map layer from the results.
        session_id: Session key for layer registration.
    """
    # FIX: removed a stale `import asyncio` — steps are executed sequentially
    # and nothing in this generator used the module.

    # 1. Get candidate tables for planning
    yield {"event": "status", "data": json.dumps({"status": "📚 Discovering relevant datasets..."})}

    candidate_tables = self.semantic_search.search_table_names(query, top_k=20)
    if not candidate_tables:
        candidate_tables = list(self.catalog.catalog.keys())

    # 2. Plan the query
    yield {"event": "status", "data": json.dumps({"status": "📋 Creating execution plan..."})}

    plan = await self.query_planner.plan_query(query, candidate_tables, self.llm)

    if not plan.is_complex or not plan.steps:
        # Fallback to simple execution
        yield {"event": "status", "data": json.dumps({"status": "📚 Executing as simple query..."})}
        # Re-route to simple path by manually calling the logic
        candidate_summaries = self.catalog.get_summaries_for_tables(candidate_tables)
        relevant_tables = await self.llm.identify_relevant_tables(query, candidate_summaries)

        for table in relevant_tables:
            self.geo_engine.ensure_table_loaded(table)

        table_schema = self.geo_engine.get_table_schemas()

        yield {"event": "status", "data": json.dumps({"status": "✍️ Writing SQL query..."})}
        sql = await self.llm.generate_analytical_sql(query, table_schema, history)
        # Strip markdown fencing the LLM may wrap around the statement.
        sql = sql.replace("```sql", "").replace("```", "").strip()

        try:
            geojson = self.geo_engine.execute_spatial_query(sql)
            features = geojson.get("features", [])
        except Exception as e:
            yield {"event": "result", "data": json.dumps({
                "response": f"Query execution failed: {str(e)}",
                "sql_query": sql
            })}
            return

        data_summary = ResponseFormatter.generate_data_summary(features)
        explanation = await self.llm.generate_explanation(query, sql, data_summary, history)

        yield {"event": "result", "data": json.dumps({
            "response": explanation,
            "sql_query": sql,
            "geojson": geojson if include_map and features else None,
            "chart_data": ResponseFormatter.generate_chart_data(sql, features),
            "raw_data": ResponseFormatter.prepare_raw_data(features),
            "data_citations": []
        })}
        return

    # 3. Show plan to user as a "thought" chunk
    step_descriptions = [f"Step {i+1}: {s.description}" for i, s in enumerate(plan.steps)]
    yield {"event": "chunk", "data": json.dumps({
        "type": "thought",
        "content": f"Planning multi-step execution:\n" + "\n".join(step_descriptions)
    })}

    # 4. Load all needed tables up front
    all_tables = set()
    for step in plan.steps:
        all_tables.update(step.tables_needed)

    if all_tables:
        yield {"event": "status", "data": json.dumps({"status": f"💾 Loading {len(all_tables)} datasets..."})}
        for table in all_tables:
            self.geo_engine.ensure_table_loaded(table)

    # 5. Execute steps by parallel groups
    intermediate_results = {}
    all_features = []
    all_sql = []

    for group_idx, group in enumerate(plan.parallel_groups):
        group_steps = [s for s in plan.steps if s.step_id in group]

        yield {"event": "status", "data": json.dumps({
            "status": f"⚡ Executing step group {group_idx + 1}/{len(plan.parallel_groups)}..."
        })}

        # Execute steps in this group (could be parallel, but sequential for simplicity)
        for step in group_steps:
            yield {"event": "status", "data": json.dumps({
                "status": f"🔄 {step.description}..."
            })}

            # Generate SQL for this step (re-read schema each time: earlier
            # steps may have registered new tables)
            table_schema = self.geo_engine.get_table_schemas()

            # Build step-specific prompt
            step_query = f"""Execute this step: {step.description}

Original user request: {query}

SQL Hint: {step.sql_template or 'None'}

Previous step results available: {list(intermediate_results.keys())}"""

            sql = await self.llm.generate_analytical_sql(step_query, table_schema, history)
            sql = sql.replace("```sql", "").replace("```", "").strip()

            # Skip if LLM returned an error sentinel
            if "DATA_UNAVAILABLE" in sql or sql.startswith("-- ERROR"):
                logger.warning(f"Step {step.step_id} indicated data unavailable")
                intermediate_results[step.result_name] = {"features": [], "sql": sql}
                continue

            try:
                geojson = self.geo_engine.execute_spatial_query(sql)
                features = geojson.get("features", [])

                intermediate_results[step.result_name] = {
                    "features": features,
                    "sql": sql,
                    "geojson": geojson
                }
                all_features.extend(features)
                all_sql.append(f"-- {step.description}\n{sql}")

                yield {"event": "status", "data": json.dumps({
                    "status": f"✅ Step got {len(features)} results"
                })}

            except Exception as e:
                logger.error(f"Step {step.step_id} failed: {e}")
                # Try to repair once; a second failure records the error but
                # keeps the remaining steps running.
                try:
                    sql = await self.llm.correct_sql(step_query, sql, str(e), table_schema)
                    geojson = self.geo_engine.execute_spatial_query(sql)
                    features = geojson.get("features", [])
                    intermediate_results[step.result_name] = {
                        "features": features,
                        "sql": sql,
                        "geojson": geojson
                    }
                    all_features.extend(features)
                    all_sql.append(f"-- {step.description} (repaired)\n{sql}")
                except Exception as e2:
                    logger.error(f"Step repair also failed: {e2}")
                    intermediate_results[step.result_name] = {"features": [], "sql": sql, "error": str(e2)}

    # 6. Generate final combined result
    yield {"event": "status", "data": json.dumps({"status": "💬 Generating combined analysis..."})}

    # Summarize intermediate results for explanation
    result_summary = []
    for name, result in intermediate_results.items():
        features = result.get("features", [])
        result_summary.append(f"{name}: {len(features)} records")

    combined_summary = f"""Multi-step query completed with {len(plan.steps)} steps.

Results:
{chr(10).join(result_summary)}

Combination logic: {plan.final_combination_logic}"""

    # Get combined explanation (streamed; thoughts are dropped here)
    explanation_buffer = ""
    async for chunk in self.llm.stream_explanation(query, "\n\n".join(all_sql), combined_summary, history):
        if chunk["type"] == "content":
            explanation_buffer += chunk["text"]
            yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk["text"]})}

    # Find the best geojson to display (use the one with most features)
    best_geojson = None
    best_features = []
    for name, result in intermediate_results.items():
        features = result.get("features", [])
        if len(features) > len(best_features):
            best_features = features
            best_geojson = result.get("geojson")

    # Generate layer if we have features
    if include_map and best_features and best_geojson:
        layer_info = await self.llm.generate_layer_name(query, all_sql[0] if all_sql else "")
        layer_name_ai = layer_info.get("name", "Multi-Step Result")
        layer_emoji = layer_info.get("emoji", "📊")
        best_geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(
            query, best_geojson, best_features, layer_name_ai, layer_emoji
        )

        # Best-effort registration, consistent with the single-step handlers.
        try:
            table_name = self.geo_engine.register_layer(layer_id, best_geojson)
            self.session_store.add_layer(session_id, {
                "id": layer_id,
                "name": layer_name,
                "table_name": table_name,
                "timestamp": datetime.datetime.now().isoformat()
            })
        except Exception as e:
            logger.warning(f"Failed to register multi-step layer: {e}")

    # Generate chart from combined results
    chart_data = ResponseFormatter.generate_chart_data("\n".join(all_sql), best_features)
    raw_data = ResponseFormatter.prepare_raw_data(best_features)

    # Final result
    yield {"event": "result", "data": json.dumps({
        "response": explanation_buffer,
        "sql_query": "\n\n".join(all_sql),
        "geojson": best_geojson if include_map and best_features else None,
        "chart_data": chart_data,
        "raw_data": raw_data,
        "data_citations": [],
        "multi_step": True,
        "steps_executed": len(plan.steps)
    })}
|
backend/services/orchestrator.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Any
|
| 2 |
+
from backend.services.executor import QueryExecutor
|
| 3 |
+
|
| 4 |
+
class OrchestratorAgent:
    """Thin orchestration façade that routes incoming chat queries to QueryExecutor."""

    def __init__(self):
        # One executor instance (LLM gateway, geo engine, session state) is
        # shared across all queries handled by this agent.
        self.executor = QueryExecutor()

    async def process_query(self, query: str, history: list[Dict[str, str]] = None, model: str = None) -> Dict[str, Any]:
        """
        Delegates to QueryExecutor. Model param can be used to configure LLM if needed.
        """
        # NOTE(review): `model` is currently ignored — the LLM comes from
        # LLMGateway's default configuration; confirm before exposing
        # per-request model selection to clients.
        # For now, we rely on the default configured in LLMGateway
        return await self.executor.process_query_with_context(query, history or [])
|
backend/services/response_formatter.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Response Formatter Service
|
| 3 |
+
|
| 4 |
+
Handles formatting of query results into citations, charts, GeoJSON layers, and raw data for the frontend.
|
| 5 |
+
Separates presentation logic from execution logic.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import List, Dict, Any, Optional
|
| 9 |
+
import uuid
|
| 10 |
+
|
| 11 |
+
class ResponseFormatter:
    """Formats raw query results into API response payloads.

    All methods are stateless ``@staticmethod``s that turn a list of GeoJSON
    features (dicts carrying a ``properties`` mapping) into user-facing
    artifacts: source citations, Chart.js-compatible chart data, cleaned
    raw-data rows, a styled GeoJSON layer, and a short text summary used as
    LLM explanation context.
    """

    # Internal/visual property keys that must never be shown to users nor
    # considered when auto-selecting chart columns. Shared by
    # generate_chart_data() and prepare_raw_data().
    _SYSTEM_KEYS = ("geom", "geometry", "style", "layer_name", "layer_id",
                    "choropleth", "fillColor", "color")

    @staticmethod
    def generate_citations(tables: List[str], features: Optional[List[Dict]] = None) -> List[str]:
        """Generates readable citations based on table names and returned features.

        Args:
            tables: Table names referenced by the executed query.
            features: Optional GeoJSON features; used as a fallback to detect
                administrative-boundary data when no table name matched.

        Returns:
            De-duplicated list of human-readable source citations,
            insertion order preserved.
        """
        citations: List[str] = []
        processed = set()

        # Check explicit table list
        for table in tables:
            table = table.lower()
            if table in processed:
                continue

            if "universit" in table:
                citations.append("Universities Data (OpenStreetMap, 2024)")
            elif "school" in table or "education" in table:
                citations.append("Education Facilities (OpenStreetMap, 2024)")
            elif "hospital" in table or "health" in table:
                citations.append("Health Facilities (OpenStreetMap, 2024)")
            elif "airport" in table:
                citations.append("Airports Data (OpenStreetMap, 2024)")
            elif "road" in table:
                citations.append("Road Network (OpenStreetMap, 2024)")
            elif "population" in table or "census" in table:
                citations.append("Panama Census Data (INEC, 2023)")
            elif "admin" in table or "boundar" in table:
                # All admin-boundary levels share a single citation.
                if "Admin Boundaries" not in processed:
                    citations.append("Panama Administrative Boundaries (HDX COD-AB, 2021)")
                    processed.add("Admin Boundaries")
                continue

            processed.add(table)

        # Fallback check on features if no specific tables cited but admin data returned
        if not citations and features:
            if any(k.startswith("adm") for k in features[0].get("properties", {}).keys()):
                citations.append("Panama Administrative Boundaries (HDX COD-AB, 2021)")

        # FIX: dict.fromkeys de-duplicates while keeping a deterministic,
        # insertion-ordered result; list(set(...)) returned arbitrary order.
        return list(dict.fromkeys(citations))

    @staticmethod
    def generate_chart_data(sql: str, features: List[Dict]) -> Optional[Dict[str, Any]]:
        """Generates Chart.js compatible data structure if the query looks aggregative.

        Args:
            sql: The executed SQL. Currently unused; kept for interface
                stability (heuristic on the SQL signature was planned here).
            features: GeoJSON features whose properties are inspected for a
                string label column (X) and a numeric value column (Y).

        Returns:
            A bar-chart spec dict, or None when no chartable data is found.
        """
        if not features:
            return None

        # Try to find a string (label) and a number (value) in the properties.
        try:
            chart_items: List[Dict[str, Any]] = []
            x_key, y_key = "name", "value"
            x_label, y_label = "Feature", "Value"

            # 1. Analyze a sample feature to pick X (label) and Y (value).
            sample_props = features[0].get("properties", {})
            valid_keys = [k for k in sample_props if k not in ResponseFormatter._SYSTEM_KEYS]

            # Y: first numeric column that is not an id/code column.
            for k in valid_keys:
                if isinstance(sample_props[k], (int, float)) and not k.endswith(("_id", "_code")):
                    y_key = k
                    y_label = k.replace("_", " ").title()
                    if "sqkm" in k:
                        y_label = "Area (km²)"
                    elif "pop" in k:
                        y_label = "Population"
                    elif "count" in k:
                        y_label = "Count"
                    break

            # X: first string column whose name suggests a label.
            for k in valid_keys:
                if isinstance(sample_props[k], str) and "name" in k:
                    x_key = k
                    x_label = k.replace("_", " ").title().replace("Name", "").strip() or "Region"
                    break

            # 2. Build data points, skipping features missing either column.
            for f in features:
                props = f.get("properties", {})
                label = props.get(x_key)
                value = props.get(y_key)
                if label is not None and value is not None:
                    chart_items.append({"name": str(label), "value": value})

            if chart_items:
                # Auto-sort descending so the biggest values lead the chart.
                chart_items.sort(key=lambda item: item["value"], reverse=True)
                return {
                    "type": "bar",
                    "title": f"{y_label} by {x_label}",
                    "data": chart_items[:15],  # Limit to top 15 for readability
                    "xKey": "name",
                    "yKey": "value",
                    "xAxisLabel": x_label,
                    "yAxisLabel": y_label,
                }
        except Exception as e:
            # Charting is best-effort; never fail the whole response over it.
            print(f"Error generating chart data: {e}")
            return None

        return None

    @staticmethod
    def prepare_raw_data(features: List[Dict]) -> List[Dict]:
        """Cleans feature properties for display in the raw data table.

        Serializes datetime values and strips internal/visual keys; returns
        one plain dict per feature (empty list for empty input).
        """
        raw_data: List[Dict] = []
        for f in features:
            props = ResponseFormatter._serialize_properties(f.get("properties", {}).copy())
            # Remove system/visual properties
            for key in ResponseFormatter._SYSTEM_KEYS:
                props.pop(key, None)
            raw_data.append(props)
        return raw_data

    @staticmethod
    def format_geojson_layer(query: str, geojson: Dict[str, Any], features: List[Dict], layer_name: str, layer_emoji: str = "📍", point_style: Optional[str] = None, admin_levels: Optional[List[str]] = None) -> tuple[Dict[str, Any], str, str]:
        """Styles the GeoJSON layer in place and generates metadata (ID, name, choropleth).

        Args:
            query: User query text; only used to derive a stable layer color.
            geojson: FeatureCollection to decorate (mutated in place).
            features: The features contained in ``geojson``.
            layer_name: Display name for the layer.
            layer_emoji: Emoji used for "icon"-style point markers.
            point_style: "icon" for emoji markers, "circle" for simple colored
                circles, None for auto-detect (currently defaults to icon).
            admin_levels: Unused; kept for interface stability.

        Returns:
            Tuple of (styled geojson, generated 8-char layer id, layer name).
        """
        import zlib  # local import: only needed for the stable color hash

        # 0. Serialize properties to avoid datetime errors downstream.
        if features:
            for f in features:
                if "properties" in f:
                    f["properties"] = ResponseFormatter._serialize_properties(f["properties"])

        # Palette of distinct colors (avoiding pure blue which is default).
        palette = [
            "#E63946",  # Red
            "#F4A261",  # Orange
            "#2A9D8F",  # Teal
            "#E9C46A",  # Yellow
            "#9C6644",  # Brown
            "#D62828",  # Dark Red
            "#8338EC",  # Purple
            "#3A86FF",  # Blue-ish (but distinct)
            "#FB5607",  # Orange-Red
            "#FF006E",  # Pink
        ]

        # FIX: crc32 is stable across processes; the previous built-in
        # hash() is salted per run (PYTHONHASHSEED), so the "stable" color
        # actually changed on every server restart.
        layer_color = palette[zlib.crc32(query.encode("utf-8")) % len(palette)]

        # FIX: tolerate input collections that lack a "properties" key.
        layer_props = geojson.setdefault("properties", {})

        # Choropleth logic: identify a meaningful numeric column.
        choropleth_col = None
        if features:
            sample = features[0].get("properties", {})
            valid_numerics = [
                k for k, v in sample.items()
                if isinstance(v, (int, float))
                and k not in ["layer_id", "style"]
                and not k.endswith(("_code", "_id"))
            ]

            # Prioritize 'population', 'area', 'count'
            priority_cols = ["population", "pop", "count", "num", "density", "area_sqkm", "area"]
            for p in priority_cols:
                matches = [c for c in valid_numerics if p in c]
                if matches:
                    choropleth_col = matches[0]
                    break

            # Fallback to first numeric
            if not choropleth_col and valid_numerics:
                choropleth_col = valid_numerics[0]

        # Enable the choropleth only when values actually vary.
        choropleth_enabled = False
        if choropleth_col:
            values = [f.get("properties", {}).get(choropleth_col, 0) for f in features]
            if len(set(values)) > 1:
                layer_props["choropleth"] = {
                    "enabled": True,
                    "palette": "viridis",
                    "column": choropleth_col,
                    # log scale tames heavy-tailed population/density values
                    "scale": "log" if "pop" in choropleth_col or "density" in choropleth_col else "linear"
                }
                choropleth_enabled = True

        # FIX: always apply the flat single-color style when no choropleth
        # was enabled, so the layer is never left completely unstyled.
        if not choropleth_enabled:
            layer_props["style"] = {
                "color": layer_color,
                "fillColor": layer_color,
                "opacity": 0.8,
                "fillOpacity": 0.4
            }

        layer_id = str(uuid.uuid4())[:8]
        layer_props["layer_name"] = layer_name
        layer_props["layer_id"] = layer_id

        # Point marker: emoji icon for categorical POIs, circle for dense data.
        if point_style == "circle":
            marker_icon = None
            marker_style = "circle"
        else:
            # "icon" requested, or auto-detect default (backward compatibility)
            marker_icon = layer_emoji
            marker_style = "icon"

        layer_props["pointMarker"] = {
            "icon": marker_icon,
            "style": marker_style,
            "color": layer_color,
            "size": 32
        }

        return geojson, layer_id, layer_name

    @staticmethod
    def generate_data_summary(features: List[Dict]) -> str:
        """Generates a text summary of the features for the LLM explanation context.

        Includes up to five sample names (preferring the most specific admin
        name available) with area in km² when present.
        """
        if not features:
            return "No features found matching the query."

        sample_names = []
        for f in features[:5]:
            props = f.get("properties", {})
            name = props.get("adm3_name") or props.get("adm2_name") or props.get("adm1_name") or props.get("name") or "Feature"
            area = props.get("area_sqkm")
            if area:
                sample_names.append(f"{name} ({float(area):.1f} km²)")
            else:
                sample_names.append(name)
        # FIX: removed a duplicated, unreachable return statement.
        return f"Found {len(features)} features. Sample: {', '.join(sample_names)}"

    @staticmethod
    def _serialize_properties(properties: Dict[str, Any]) -> Dict[str, Any]:
        """Recursively converts datetime/date objects to strings for JSON serialization.

        Nested dicts are recursed; lists are converted element-wise (one
        level deep). All other values pass through unchanged.
        """
        from datetime import datetime, date

        serialized: Dict[str, Any] = {}
        for k, v in properties.items():
            if isinstance(v, (datetime, date)):
                serialized[k] = v.isoformat()
            elif isinstance(v, dict):
                serialized[k] = ResponseFormatter._serialize_properties(v)
            elif isinstance(v, list):
                serialized[k] = [
                    x.isoformat() if isinstance(x, (datetime, date)) else x
                    for x in v
                ]
            else:
                serialized[k] = v
        return serialized
docker-compose.yml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Docker Compose definition for the GeoQuery application container.
# NOTE(review): the top-level `version` key is obsolete in Compose V2 and is
# ignored there; presumably kept for older docker-compose binaries — confirm.
version: '3.8'

services:
  app:
    build: .
    image: geoquery:latest
    ports:
      # Expose the FastAPI backend on the host (container listens on 8000).
      - "8000:8000"
    environment:
      # Passed through from the host environment / .env file; required by
      # the LLM gateway.
      - GEMINI_API_KEY=${GEMINI_API_KEY}
    volumes:
      # Optional: Mount data directory if you want to persist changes or add data
      # - ./backend/data:/app/backend/data
      # Only the custom-data subdirectory is mounted by default.
      - ./backend/data/custom:/app/backend/data/custom
|