GerardCB commited on
Commit
4851501
·
0 Parent(s):

Deploy to Spaces (Final Clean)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +65 -0
  2. ARCHITECTURE.md +426 -0
  3. DEPLOYMENT.md +58 -0
  4. Dockerfile +58 -0
  5. README.md +277 -0
  6. SETUP.md +455 -0
  7. backend/__init__.py +0 -0
  8. backend/api/api.py +8 -0
  9. backend/api/endpoints/catalog.py +235 -0
  10. backend/api/endpoints/chat.py +90 -0
  11. backend/api/endpoints/schema.py +84 -0
  12. backend/core/catalog_enricher.py +221 -0
  13. backend/core/data_catalog.py +445 -0
  14. backend/core/database.py +34 -0
  15. backend/core/geo_engine.py +244 -0
  16. backend/core/llm_gateway.py +500 -0
  17. backend/core/prompts.py +279 -0
  18. backend/core/query_planner.py +291 -0
  19. backend/core/semantic_search.py +259 -0
  20. backend/core/session_store.py +179 -0
  21. backend/data/catalog.json +1290 -0
  22. backend/data/catalog_schema.json +145 -0
  23. backend/data/censo/censo_2023_enriched.csv +0 -0
  24. backend/data/censo/censo_panama_2023_unificado.csv +0 -0
  25. backend/data/global/airports/panama_airports.geojson +98 -0
  26. backend/main.py +68 -0
  27. backend/pyproject.toml +31 -0
  28. backend/requirements.txt +18 -0
  29. backend/scripts/create_province_layer.py +196 -0
  30. backend/scripts/download_geofabrik.py +192 -0
  31. backend/scripts/download_global_datasets.py +133 -0
  32. backend/scripts/download_hdx.py +72 -0
  33. backend/scripts/download_hdx_panama.py +102 -0
  34. backend/scripts/download_kontur.py +239 -0
  35. backend/scripts/download_overture.py +133 -0
  36. backend/scripts/download_stri_data.py +79 -0
  37. backend/scripts/download_worldbank.py +141 -0
  38. backend/scripts/enrich_censo.py +115 -0
  39. backend/scripts/extract_overture_features.py +134 -0
  40. backend/scripts/ingest_hdx.py +110 -0
  41. backend/scripts/process_worldbank.py +150 -0
  42. backend/scripts/register_global_datasets.py +51 -0
  43. backend/scripts/stri_catalog_scraper.py +348 -0
  44. backend/scripts/update_embeddings.py +37 -0
  45. backend/scripts/validate_censo.py +155 -0
  46. backend/services/data_loader.py +271 -0
  47. backend/services/executor.py +860 -0
  48. backend/services/orchestrator.py +13 -0
  49. backend/services/response_formatter.py +287 -0
  50. docker-compose.yml +14 -0
.gitignore ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ venv/
8
+ ENV/
9
+ env/
10
+ .venv/
11
+
12
+ # Node
13
+ node_modules/
14
+ .next/
15
+ out/
16
+
17
+ # IDE
18
+ .vscode/
19
+ .idea/
20
+ *.swp
21
+ *.swo
22
+
23
+ # OS
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Environment
28
+ .env
29
+ .env.local
30
+ .env.*.local
31
+
32
+ # Data files (keep structure, not data)
33
+ *.parquet
34
+ *.duckdb
35
+ *.duckdb.wal
36
+
37
+ # Large data files and binaries (downloaded at build time)
38
+ backend/data/embeddings.json
39
+ backend/data/*.geojson
40
+ backend/data/*.gz
41
+ backend/data/*.xlsx
42
+ backend/data/global/airports/airports_global.csv
43
+ backend/data/*.gpkg
44
+ backend/data/osm/
45
+ backend/data/overture/
46
+ backend/data/kontur/
47
+ backend/data/hdx/
48
+ backend/data/base/
49
+ backend/data/inec/
50
+ backend/data/temp/
51
+ backend/data/climate/
52
+ backend/data/ms_buildings/
53
+ backend/data/stri/
54
+ backend/data/socioeconomic/
55
+ backend/data/terrain/
56
+ backend/data/worldbank/
57
+
58
+ # Logs
59
+ *.log
60
+ npm-debug.log*
61
+
62
+ # Build
63
+ dist/
64
+ build/
65
+ *.egg-info/
ARCHITECTURE.md ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GeoQuery Architecture
2
+
3
+ ## System Overview
4
+
5
+ GeoQuery is a **Territorial Intelligence Platform** that combines Large Language Models (LLMs) with geospatial analysis to enable natural language querying of geographic datasets. The system translates conversational queries into SQL, executes spatial operations, and presents results through interactive maps and data visualizations.
6
+
7
+ ### Design Philosophy
8
+
9
+ 1. **Natural Language First**: Users interact through conversational queries, not SQL or GIS interfaces
10
+ 2. **Dynamic Data Discovery**: No fixed schema—the system adapts to any GeoJSON dataset added to the catalog
11
+ 3. **Streaming Intelligence**: Real-time thought processes and incremental results via Server-Sent Events
12
+ 4. **Spatial Native**: PostGIS-compatible spatial operations in DuckDB for performant geospatial analysis
13
+ 5. **Visual by Default**: Automatic map visualization, choropleth generation, and data presentation
14
+
15
+ ---
16
+
17
+ ## High-Level Architecture
18
+
19
+ ```
20
+ ┌─────────────────────────────────────────────────────────────┐
21
+ │ Frontend │
22
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
23
+ │ │ ChatPanel │ │ MapViewer │ │ DataExplorer │ │
24
+ │ │ (React) │ │ (Leaflet) │ │ (Table) │ │
25
+ │ └──────────────┘ └──────────────┘ └──────────────┘ │
26
+ │ │ │ │ │
27
+ │ └──────────────────┴──────────────────┘ │
28
+ │ │ (SSE/HTTP) │
29
+ └───────────────────────────┼─────────────────────────────────┘
30
+
31
+ ┌───────────────────────────┼─────────────────────────────────┐
32
+ │ API Layer │
33
+ │ ┌──────────────────────────────────────────────────┐ │
34
+ │ │ FastAPI Endpoints │ │
35
+ │ │ /api/chat (SSE) │ /api/catalog │ /api/schema │ │
36
+ │ └──────────────────────────────────────────────────┘ │
37
+ │ │ │
38
+ └───────────────────────────┼─────────────────────────────────┘
39
+
40
+ ┌───────────────────────────┼─────────────────────────────────┐
41
+ │ Service Layer │
42
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
43
+ │ │ QueryExecutor│ │ LLMGateway │ │ GeoEngine │ │
44
+ │ │ │ │ (Gemini) │ │ (DuckDB) │ │
45
+ │ └──────────────┘ └──────────────┘ └──────────────┘ │
46
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
47
+ │ │ DataCatalog │ │SemanticSearch│ │ SessionStore │ │
48
+ │ │ (Embeddings) │ │ (Vectors) │ │ (Layers) │ │
49
+ │ └──────────────┘ └──────────────┘ └──────────────┘ │
50
+ └─────────────────────────────────────────────────────────────┘
51
+
52
+ ┌───────────────────────────┼─────────────────────────────────┐
53
+ │ Data Layer │
54
+ │  │ catalog.json │  │   GeoJSON    │  │  embeddings  │    │
55
+ │ │ catalog.json │ │ GeoJSON │ │ embeddings │ │
56
+ │ │ (Metadata) │ │ (Datasets) │ │ (.npy) │ │
57
+ │ └──────────────┘ └──────────────┘ └──────────────┘ │
58
+ │ ┌──────────────────────────────────────────────────┐ │
59
+ │ │ DuckDB In-Memory Database │ │
60
+ │ │ (Spatial Tables, Temporary Layers, Indexes) │ │
61
+ │ └──────────────────────────────────────────────────┘ │
62
+ └─────────────────────────────────────────────────────────────┘
63
+ ```
64
+
65
+ ---
66
+
67
+ ## Core Components
68
+
69
+ ### 1. Frontend (Next.js + React)
70
+
71
+ **Location**: `frontend/src/`
72
+
73
+ The frontend is a single-page application built with Next.js that provides:
74
+ - **ChatPanel**: Conversational interface with streaming responses
75
+ - **MapViewer**: Interactive Leaflet map with layer management
76
+ - **DataExplorer**: Tabular data view with export capabilities
77
+
78
+ **Key Technologies**:
79
+ - Next.js 14 (App Router)
80
+ - React 18 with hooks
81
+ - Leaflet for map rendering
82
+ - Server-Sent Events (SSE) for streaming
83
+ - dnd-kit for drag-and-drop layer reordering
84
+
85
+ ### 2. API Layer (FastAPI)
86
+
87
+ **Location**: `backend/api/`
88
+
89
+ RESTful API with streaming support:
90
+ - **`/api/chat`** (POST): Main query endpoint with SSE streaming
91
+ - **`/api/catalog`** (GET): Returns available datasets
92
+ - **`/api/schema`** (GET): Returns database schema
93
+
94
+ **Key Technologies**:
95
+ - FastAPI for async HTTP
96
+ - Starlette for SSE streaming
97
+ - CORS middleware for cross-origin requests
98
+
99
+ ### 3. Service Layer
100
+
101
+ #### QueryExecutor (`backend/services/executor.py`)
102
+ Orchestrates the entire query pipeline:
103
+ 1. Intent detection
104
+ 2. Data discovery
105
+ 3. SQL generation
106
+ 4. Query execution
107
+ 5. Response formatting
108
+ 6. Explanation generation
109
+
110
+ #### LLMGateway (`backend/core/llm_gateway.py`)
111
+ Interfaces with Gemini API:
112
+ - Intent detection with thinking
113
+ - Text-to-SQL generation
114
+ - Natural language explanations
115
+ - Layer naming and styling
116
+ - Error correction
117
+ - Streaming support
118
+
119
+ #### GeoEngine (`backend/core/geo_engine.py`)
120
+ Manages spatial database:
121
+ - DuckDB connection with Spatial extension
122
+ - Lazy table loading from GeoJSON
123
+ - SQL query execution
124
+ - Result formatting to GeoJSON
125
+ - Temporary layer registration
126
+
127
+ #### DataCatalog (`backend/core/data_catalog.py`)
128
+ Dataset discovery system:
129
+ - Loads `catalog.json` metadata
130
+ - Generates table summaries for LLM context
131
+ - Provides schema information
132
+ - Manages dataset metadata
133
+
134
+ #### SemanticSearch (`backend/core/semantic_search.py`)
135
+ Vector-based dataset discovery:
136
+ - Generates embeddings for dataset descriptions
137
+ - Performs cosine similarity search
138
+ - Returns top-k relevant datasets
139
+ - Scales to large catalogs (100+ datasets)
140
+
141
+ #### SessionStore (`backend/core/session_store.py`)
142
+ User session management:
143
+ - Tracks created map layers per session
144
+ - Enables spatial operations on user layers
145
+ - Maintains layer metadata
146
+
147
+ ### 4. Data Layer
148
+
149
+ #### Catalog System (`backend/data/catalog.json`)
150
+ Central metadata registry:
151
+ - Dataset paths and descriptions
152
+ - Semantic descriptions for AI discovery
153
+ - Categories and tags
154
+ - Schema information
155
+ - Data provenance
156
+
157
+ #### GeoJSON Datasets (`backend/data/`)
158
+ Organized by source:
159
+ - `osm/` - OpenStreetMap data (roads, buildings, POI)
160
+ - `admin/` - Administrative boundaries (HDX)
161
+ - `global/` - Global datasets (Kontur, Natural Earth)
162
+ - `socioeconomic/` - World Bank, MPI data
163
+ - `stri/` - STRI GIS Portal datasets
164
+
165
+ #### Vector Embeddings (`backend/data/embeddings.npy`)
166
+ Sentence transformer embeddings for semantic search
167
+
168
+ ---
169
+
170
+ ## Data Flow: User Query to Response
171
+
172
+ ### Step 1: User Input
173
+ ```
174
+ User: "Show me hospitals in Panama City"
175
+ ```
176
+
177
+ ### Step 2: Frontend → Backend
178
+ ```
179
+ POST /api/chat
180
+ {
181
+ "message": "Show me hospitals in Panama City",
182
+ "history": []
183
+ }
184
+ ```
185
+
186
+ ### Step 3: Intent Detection (LLM)
187
+ ```python
188
+ # QueryExecutor calls LLMGateway.detect_intent()
189
+ intent = await llm.detect_intent(query, history)
190
+ # Returns: "MAP_REQUEST"
191
+ ```
192
+
193
+ ### Step 4: Semantic Discovery
194
+ ```python
195
+ # SemanticSearch finds relevant tables
196
+ candidates = semantic_search.search_table_names(query, top_k=15)
197
+ # Returns: ["panama_healthsites_geojson", "osm_amenities", ...]
198
+ ```
199
+
200
+ ### Step 5: Table Schema Retrieval
201
+ ```python
202
+ # GeoEngine loads relevant tables
203
+ geo_engine.ensure_table_loaded("panama_healthsites_geojson")
204
+ schema = geo_engine.get_table_schemas()
205
+ # Returns: "Table: panama_healthsites_geojson\nColumns: name, amenity, geom..."
206
+ ```
207
+
208
+ ### Step 6: SQL Generation (LLM)
209
+ ```python
210
+ # LLMGateway generates SQL
211
+ sql = await llm.generate_analytical_sql(query, schema, history)
212
+ # Returns: "SELECT name, amenity, geom FROM panama_healthsites_geojson
213
+ # WHERE amenity = 'hospital' AND ST_Intersects(geom, ...)"
214
+ ```
215
+
216
+ ### Step 7: Query Execution
217
+ ```python
218
+ # GeoEngine executes spatial query
219
+ geojson = geo_engine.execute_spatial_query(sql)
220
+ # Returns: GeoJSON with 45 hospital features
221
+ ```
222
+
223
+ ### Step 8: Response Formatting
224
+ ```python
225
+ # Add layer metadata, generate name, configure visualization
226
+ layer_info = await llm.generate_layer_name(query, sql)
227
+ # Returns: {"name": "Hospitals in Panama City", "emoji": "🏥", "pointStyle": "icon"}
228
+
229
+ geojson = format_geojson_layer(query, geojson, features,
230
+ layer_info["name"],
231
+ layer_info["emoji"],
232
+ layer_info["pointStyle"])
233
+ ```
234
+
235
+ ### Step 9: Explanation Generation (Streaming)
236
+ ```python
237
+ # LLMGateway generates explanation with streaming
238
+ async for chunk in llm.stream_explanation(query, sql, data_summary, history):
239
+ if chunk["type"] == "thought":
240
+ # Stream thinking process to frontend
241
+ elif chunk["type"] == "content":
242
+ # Stream actual response text
243
+ ```
244
+
245
+ ### Step 10: Frontend Rendering
246
+ - ChatPanel displays streamed explanation
247
+ - MapViewer renders GeoJSON layer with hospital icons
248
+ - DataExplorer shows tabular data
249
+
250
+ ---
251
+
252
+ ## Key Design Decisions
253
+
254
+ ### 1. Why DuckDB Instead of PostgreSQL?
255
+
256
+ **Chosen**: DuckDB with Spatial extension
257
+
258
+ **Rationale**:
259
+ - **Zero Configuration**: Embedded database, no separate server
260
+ - **Fast Analytics**: Columnar storage optimized for analytical queries
261
+ - **Spatial Support**: Full PostGIS compatibility via spatial extension
262
+ - **GeoJSON Native**: Direct GeoJSON import/export
263
+ - **Lightweight**: Perfect for development and small deployments
264
+
265
+ **Trade-off**: Limited concurrency compared to PostgreSQL (acceptable for our use case)
266
+
267
+ ### 2. Why Semantic Search for Dataset Discovery?
268
+
269
+ **Chosen**: Sentence transformer embeddings + cosine similarity
270
+
271
+ **Rationale**:
272
+ - **Scalability**: Works with 100+ datasets without overwhelming LLM context
273
+ - **Accuracy**: Better matches than keyword search
274
+ - **Token Efficiency**: Only sends relevant table schemas to LLM
275
+
276
+ **Example**:
277
+ - Query: "Where can I find doctors?"
278
+ - Semantic search finds: `panama_healthsites_geojson` (closest match)
279
+ - LLM then generates SQL using only relevant schema
280
+
281
+ ### 3. Why Server-Sent Events for Streaming?
282
+
283
+ **Chosen**: SSE instead of WebSockets
284
+
285
+ **Rationale**:
286
+ - **Simpler Protocol**: One-way communication (server → client)
287
+ - **HTTP Compatible**: Works through firewalls and proxies
288
+ - **Auto Reconnect**: Built-in browser support
289
+ - **Event Types**: Named events for different message types
290
+
291
+ **Trade-off**: No client → server streaming (not needed for our use case)
292
+
293
+ ### 4. Why Lazy Table Loading?
294
+
295
+ **Chosen**: Load GeoJSON only when needed
296
+
297
+ **Rationale**:
298
+ - **Fast Startup**: Don't load all datasets on initialization
299
+ - **Memory Efficient**: Only keep active tables in memory
300
+ - **Flexible**: Easy to add new datasets without restart
301
+
302
+ **Implementation**:
303
+ ```python
304
+ def ensure_table_loaded(self, table_name: str) -> bool:
305
+ if table_name not in self.loaded_tables:
306
+ self.load_geojson_to_table(table_name)
307
+ return table_name in self.loaded_tables
308
+ ```
309
+
310
+ ### 5. Why Choropleth Auto-Detection?
311
+
312
+ **Chosen**: Automatic choropleth configuration based on data
313
+
314
+ **Rationale**:
315
+ - **User Friendly**: No manual configuration needed
316
+ - **Intelligent**: Prioritizes meaningful columns (population, area, density)
317
+ - **Adaptive**: Works with any numeric column
318
+
319
+ **Logic**:
320
+ 1. Find numeric columns
321
+ 2. Prioritize keywords (population, area, count)
322
+ 3. Check value variance (skip if all same)
323
+ 4. Enable choropleth with appropriate scale (linear/log)
324
+
325
+ ---
326
+
327
+ ## Error Handling & Resilience
328
+
329
+ ### SQL Error Correction
330
+ When a generated SQL query fails:
331
+ 1. Extract error message
332
+ 2. Send to LLM with original query and schema
333
+ 3. LLM generates corrected SQL
334
+ 4. Execute repaired query
335
+ 5. If still fails, return error to user
336
+
337
+ ### Data Unavailable Handling
338
+ When requested data doesn't exist:
339
+ 1. LLM returns special error marker: `-- ERROR: DATA_UNAVAILABLE`
340
+ 2. System extracts "Requested" and "Available" from response
341
+ 3. Returns helpful message to user with alternatives
342
+
343
+ ### Missing Tables
344
+ - Catalog lists all datasets but not all loaded
345
+ - Lazy loading attempts to load on demand
346
+ - If file missing, logs warning and continues
347
+
348
+ ---
349
+
350
+ ## Performance Considerations
351
+
352
+ ### Query Optimization
353
+ - **Spatial Indexes**: DuckDB automatically indexes geometry columns
354
+ - **Top-K Limits**: Large result sets limited to prevent memory issues
355
+ - **Lazy Evaluation**: Stream results when possible
356
+
357
+ ### Embedding Cache
358
+ - Embeddings pre-computed and stored in `.npy` file
359
+ - Only regenerated when catalog changes
360
+ - Fast cosine similarity via NumPy vectorization
361
+
362
+ ### Frontend Rendering
363
+ - **Layer Virtualization**: Large point datasets use circle markers for performance
364
+ - **Choropleth Colors**: Pre-computed color palettes
365
+ - **Lazy Map Loading**: Only render visible layers
366
+
367
+ ---
368
+
369
+ ## Security Considerations
370
+
371
+ ### LLM Prompt Injection
372
+ - **Mitigation**: Clear separation of user query and system instructions
373
+ - **Validation**: SQL parsing and column name verification
374
+ - **Sandboxing**: Read-only queries (no INSERT/UPDATE/DELETE)
375
+
376
+ ### API Access
377
+ - **CORS**: Configured allowed origins
378
+ - **Rate Limiting**: Can be added via middleware (not currently implemented)
379
+ - **Authentication**: Not implemented (suitable for internal/demo deployments)
380
+
381
+ ### Data Privacy
382
+ - No user data stored (stateless queries)
383
+ - Session layers stored in-memory only
384
+ - No query logging by default
385
+
386
+ ---
387
+
388
+ ## Scalability Path
389
+
390
+ ### Current Limitations
391
+ - **Single Process**: No horizontal scaling
392
+ - **In-Memory Database**: Limited by RAM
393
+ - **No Caching**: Repeated queries re-execute
394
+
395
+ ### Future Enhancements
396
+ 1. **Add PostgreSQL/PostGIS**: For production deployments with persistence
397
+ 2. **Redis Cache**: Cache query results and embeddings
398
+ 3. **Load Balancer**: Multiple FastAPI instances
399
+ 4. **Background Workers**: Async data ingestion with Celery
400
+ 5. **CDN**: Serve GeoJSON datasets from cloud storage
401
+
402
+ ---
403
+
404
+ ## Technology Choices Summary
405
+
406
+ | Component | Technology | Why? |
407
+ |-----------|-----------|------|
408
+ | **Backend Language** | Python 3.11+ | Rich geospatial ecosystem, LLM SDKs |
409
+ | **Web Framework** | FastAPI | Async support, OpenAPI docs, SSE |
410
+ | **Database** | DuckDB | Embedded, fast analytics, spatial support |
411
+ | **LLM** | Google Gemini | Thinking mode, streaming, JSON output |
412
+ | **Frontend Framework** | Next.js 14 | React, SSR, App Router, TypeScript |
413
+ | **Map Library** | Leaflet | Lightweight, flexible, plugin ecosystem |
414
+ | **Embeddings** | sentence-transformers | Multilingual, semantic similarity |
415
+ | **Data Format** | GeoJSON | Standard, human-readable, LLM-friendly |
416
+
417
+ ---
418
+
419
+ ## Next Steps
420
+
421
+ For detailed information on specific components:
422
+ - [Backend Services](docs/backend/CORE_SERVICES.md)
423
+ - [API Reference](docs/backend/API_ENDPOINTS.md)
424
+ - [Frontend Components](docs/frontend/COMPONENTS.md)
425
+ - [Data Flow](docs/DATA_FLOW.md)
426
+ - [Setup Guide](SETUP.md)
DEPLOYMENT.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deployment Guide
2
+
3
+ This guide describes how to deploy the GeoQuery platform for public access.
4
+
5
+ ## Strategy
6
+ We use a **single-container** approach where the backend (FastAPI) serves the frontend (Next.js) as static files. This simplifies deployment to PaaS providers like Railway, Render, or Hugging Face Spaces.
7
+
8
+ ### Architecture
9
+ - **Build Stage**: Node.js builder compiles the Next.js frontend into static HTML/CSS/JS (`frontend/out`).
10
+ - **Runtime Stage**: Python 3.11 image installs backend dependencies.
11
+ - **Serving**: FastAPI mounts the static build at `/` and serves the API at `/api`.
12
+ - **Data**: Geospatial data (`backend/data`) is included in the image (~2GB).
13
+
14
+ ## Prerequisites
15
+ - Docker
16
+ - ~5GB Free disk space (for image build)
17
+ - 4GB+ RAM on host machine (for DuckDB in-memory analytics)
18
+
19
+ ## Local Build & Run
20
+ ```bash
21
+ # Build the image
22
+ docker build -t geoquery .
23
+
24
+ # Run the container (Mapping 7860 to 7860 to match standard Space config)
25
+ docker run -p 7860:7860 -e GEMINI_API_KEY=your_key_here geoquery
26
+ ```
27
+
28
+ ## Hosting Options (Getting a Public URL)
29
+
30
+ To share this demo with others, you need to host the Docker container on a cloud provider.
31
+
32
+ ### Option A: Hugging Face Spaces (Easiest & Free)
33
+ This will give you a public URL like `https://huggingface.co/spaces/username/geoquery`.
34
+
35
+ 1. **Create Space**: Go to [huggingface.co/spaces](https://huggingface.co/spaces) -> "Create new Space".
36
+ - SDK: **Docker**
37
+ - Template: **Blank**
38
+ 2. **Push Code**:
39
+ ```bash
40
+ git remote add space https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME
41
+ git push space main
42
+ ```
43
+ 3. **Configure Secrets**: In the Space "Settings" tab, add a "Repository Secret" named `GEMINI_API_KEY` with your key.
44
+
45
+ ### Option B: Railway / Render
46
+ 1. Connect your GitHub repository.
47
+ 2. Railway/Render will detect the `Dockerfile`.
48
+ 3. Set the environment variable `GEMINI_API_KEY`.
49
+ 4. The deployed app will be available at a URL like `https://geoquery-production.up.railway.app`.
50
+
51
+ ### Option C: Google Cloud Run
52
+ 1. Build: `gcloud builds submit --tag gcr.io/PROJECT_ID/geoquery`
53
+ 2. Deploy: `gcloud run deploy geoquery --image gcr.io/PROJECT_ID/geoquery --platform managed`
54
+
55
+
56
+ ## Notes
57
+ - **Data Persistence**: The current setup uses read-only data baked into the image. User uploads will be lost on restart unless a volume is mounted to `/app/backend/data/custom`.
58
+ - **Memory Usage**: DuckDB processes data in-memory. For large queries, ensure the host has sufficient RAM.
Dockerfile ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==========================================
2
+ # Stage 1: Build Frontend (Next.js)
3
+ # ==========================================
4
+ FROM node:20-alpine AS frontend
5
+ WORKDIR /app
6
+
7
+ # Install dependencies
8
+ COPY frontend/package*.json ./
9
+ RUN npm ci
10
+
11
+ # Copy source code
12
+ COPY frontend/ ./
13
+
14
+ # Configure for static export
15
+ ENV NEXT_PUBLIC_API_URL=/api/v1
16
+ # Run build (creates /app/out)
17
+ RUN npm run build
18
+
19
+ # ==========================================
20
+ # Stage 2: Runtime (Python + FastAPI)
21
+ # ==========================================
22
+ FROM python:3.11-slim
23
+
24
+ # Create a non-root user (Recommended for HF Spaces)
25
+ RUN useradd -m -u 1000 user
26
+ USER user
27
+ ENV PATH="/home/user/.local/bin:$PATH"
28
+
29
+ WORKDIR /app
30
+
31
+ # Install system dependencies (as root before switching user)
32
+ USER root
33
+ RUN apt-get update && apt-get install -y \
34
+ build-essential \
35
+ libgeos-dev \
36
+ && rm -rf /var/lib/apt/lists/*
37
+ USER user
38
+
39
+ # Install Python dependencies
40
+ COPY --chown=user backend/requirements.txt .
41
+ RUN pip install --no-cache-dir -r requirements.txt
42
+
43
+ # Copy download script and execute data fetch
44
+ COPY backend/scripts/download_hdx_panama.py backend/scripts/
45
+ RUN python backend/scripts/download_hdx_panama.py
46
+
47
+ # Copy Backend Code
48
+ COPY --chown=user backend/ backend/
49
+
50
+ # Copy Built Frontend to Backend Static Directory
51
+ # ensure strict permissions
52
+ COPY --from=frontend --chown=user /app/out /app/backend/static
53
+
54
+ # Expose port 7860 (Standard for HF Spaces)
55
+ EXPOSE 7860
56
+
57
+ # Run Application
58
+ CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GeoQuery
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ app_port: 7860
10
+ ---
11
+
12
+ # GeoQuery
13
+ 🌍🤖
14
+
15
+ **Territorial Intelligence Platform** - Natural language interface for geospatial data analysis powered by LLMs and DuckDB Spatial.
16
+
17
+ ![Status](https://img.shields.io/badge/Status-Active-success) ![Python](https://img.shields.io/badge/Python-3.11+-blue) ![Next.js](https://img.shields.io/badge/Next.js-15-black) ![License](https://img.shields.io/badge/License-MIT-green)
18
+
19
+ ---
20
+
21
+ ## ✨ What is GeoQuery?
22
+
23
+ GeoQuery transforms geographic data analysis by combining **Large Language Models** with **spatial databases**. Simply ask questions in natural language and get instant maps, charts, and insights.
24
+
25
+ **Example**: *"Show me hospitals in Panama City"* → Interactive map with 45 hospital locations, automatically styled with 🏥 icons.
26
+
27
+ ### Key Capabilities
28
+
29
+ - 🗣️ **Conversational Queries** - Natural language instead of SQL or GIS interfaces
30
+ - 🗺️ **Auto-Visualization** - Smart choropleth maps, point markers, and heatmaps
31
+ - 📊 **Dynamic Charts** - Automatic bar, pie, and line chart generation
32
+ - 🔍 **Semantic Discovery** - Finds relevant datasets from 100+ options using AI embeddings
33
+ - 🧩 **Multi-Step Analysis** - Complex queries automatically decomposed and executed
34
+ - 💡 **Thinking Transparency** - See the LLM's reasoning process in real-time
35
+ - 🎨 **Custom Point Styles** - Icon markers for POI, circle points for large datasets
36
+
37
+ ---
38
+
39
+ ## 🎬 Quick Demo
40
+
41
+ ### Try These Queries
42
+
43
+ | Query | What You Get |
44
+ |-------|--------------|
45
+ | "Show me all provinces colored by area" | Choropleth map with size-based gradient |
46
+ | "Where are the universities?" | Point map with 🎓 icons |
47
+ | "Compare hospital count vs school count by province" | Multi-step analysis with side-by-side bar charts |
48
+ | "Show intersections in David as circle points" | 1,288 traffic intersections as simple colored circles |
49
+ | "Population density in Veraguas" | H3 hexagon heatmap (33K cells) |
50
+
51
+ ---
52
+
53
+ ## 🏗️ Architecture
54
+
55
+ ```
56
+ ┌──────────────────────────────────────────────────────────┐
57
+ │ Frontend (Next.js) │
58
+ │ Chat Interface │ Leaflet Maps │ Data Explorer │
59
+ └────────────────────────┬─────────────────────────────────┘
60
+ │ (SSE Streaming)
61
+ ┌────────────────────────┴─────────────────────────────────┐
62
+ │ Backend (FastAPI) │
63
+ │ Intent Detection → Semantic Search → SQL Generation │
64
+ │ ↓ ↓ ↓ │
65
+ │ Gemini LLM DataCatalog (Embeddings) DuckDB Spatial │
66
+ └──────────────────────────────────────────────────────────┘
67
+ ```
68
+
69
+ It supports dynamic dataset discovery via semantic embeddings + LLM-generated spatial SQL.
70
+
71
+ 📖 **[Detailed Architecture](ARCHITECTURE.md)**
72
+
73
+ ---
74
+
75
+ ## 🚀 Quick Start
76
+
77
+ ### Prerequisites
78
+
79
+ - **Python 3.11+**
80
+ - **Node.js 18+**
81
+ - **Google AI API Key** ([Get one free](https://aistudio.google.com/app/apikey))
82
+
83
+ ### Installation
84
+
85
+ ```bash
86
+ # 1. Clone repository
87
+ git clone https://github.com/GerardCB/GeoQuery.git
88
+ cd GeoQuery
89
+
90
+ # 2. Backend setup
91
+ cd backend
92
+ python -m venv venv
93
+ source venv/bin/activate # On Windows: venv\Scripts\activate
94
+ pip install -e .
95
+
96
+ # 3. Configure API key
97
+ export GEMINI_API_KEY="your-api-key-here"
98
+
99
+ # 4. Start backend
100
+ uvicorn backend.main:app --reload --host 0.0.0.0 --port 8000
101
+
102
+ # 5. Frontend setup (new terminal)
103
+ cd frontend
104
+ npm install
105
+ npm run dev
106
+ ```
107
+
108
+ ### 🎉 Done!
109
+
110
+ Open **http://localhost:3000** and start querying!
111
+
112
+ 📘 **[Detailed Setup Guide](SETUP.md)**
113
+
114
+ ---
115
+
116
+ ## 📂 Project Structure
117
+
118
+ ```
119
+ GeoQuery/
120
+ ├── backend/
121
+ │ ├── api/ # FastAPI endpoints
122
+ │ │ └── endpoints/ # /chat, /catalog, /schema
123
+ │ ├── core/ # Core services
124
+ │ │ ├── llm_gateway.py # Gemini API integration
125
+ │ │ ├── geo_engine.py # DuckDB Spatial wrapper
126
+ │ │ ├── semantic_search.py # Embedding-based discovery
127
+ │ │ ├── data_catalog.py # Dataset metadata management
128
+ │ │ ├── query_planner.py # Multi-step query orchestration
129
+ │ │ └── prompts.py # LLM system instructions
130
+ │ ├── services/ # Business logic
131
+ │ │ ├── executor.py # Query pipeline orchestrator
132
+ │ │ └── response_formatter.py # GeoJSON/chart formatting
133
+ │ ├── data/ # Datasets and metadata
134
+ │ │ ├── catalog.json # Dataset registry
135
+ │ │ ├── embeddings.npy # Vector embeddings
136
+ │ │ ├── osm/ # OpenStreetMap data
137
+ │ │ ├── admin/ # Administrative boundaries
138
+ │ │ ├── global/ # Global datasets (Kontur, etc.)
139
+ │ │ └── socioeconomic/ # World Bank, poverty data
140
+ │ └── scripts/ # Data ingestion scripts
141
+ │ ├── download_geofabrik.py
142
+ │ ├── download_hdx_panama.py
143
+ │ └── stri_catalog_scraper.py
144
+ ├── frontend/
145
+ │ └── src/
146
+ │ ├── app/ # Next.js App Router pages
147
+ │ └── components/
148
+ │ ├── ChatPanel.tsx # Chat interface with SSE
149
+ │ ├── MapViewer.tsx # Leaflet map with layers
150
+ │ └── DataExplorer.tsx # Tabular data view
151
+ └── docs/ # Technical documentation
152
+ ├── backend/ # Backend deep-dives
153
+ ├── frontend/ # Frontend architecture
154
+ └── data/ # Data system docs
155
+ ```
156
+
157
+ ---
158
+
159
+ ## 🔧 Technology Stack
160
+
161
+ | Layer | Technology | Purpose |
162
+ |-------|-----------|---------|
163
+ | **LLM** | Google Gemini 2.0 | Intent detection, SQL generation, explanations |
164
+ | **Backend** | Python 3.11 + FastAPI | Async HTTP server with SSE streaming |
165
+ | **Database** | DuckDB with Spatial | In-memory spatial analytics |
166
+ | **Frontend** | Next.js 15 + React 18 | Server-side rendering + interactive UI |
167
+ | **Maps** | Leaflet 1.9 | Interactive web maps |
168
+ | **Embeddings** | sentence-transformers | Semantic dataset search |
169
+ | **Data** | GeoJSON + Parquet | Standardized geospatial formats |
170
+
171
+ ---
172
+
173
+ ## 📊 Available Datasets
174
+
175
+ GeoQuery currently includes 100+ datasets across multiple categories:
176
+
177
+ ### Administrative
178
+ - Panama provinces, districts, corregimientos (HDX 2021)
179
+ - Comarca boundaries
180
+ - Electoral districts
181
+
182
+ ### Infrastructure
183
+ - Roads and highways (OpenStreetMap)
184
+ - Hospitals and health facilities (986 locations)
185
+ - Universities and schools (200+ institutions)
186
+ - Airports, ports, power plants
187
+
188
+ ### Socioeconomic
189
+ - World Bank development indicators
190
+ - Multidimensional poverty index (MPI)
191
+ - Population density (Kontur H3 hexagons - 33K cells)
192
+
193
+ ### Natural Environment
194
+ - Protected areas (STRI GIS Portal)
195
+ - Forest cover and land use
196
+ - Rivers and water bodies
197
+
198
+ 📖 **[Full Dataset List](docs/data/DATASET_SOURCES.md)** | **[Adding New Data](docs/backend/SCRIPTS.md)**
199
+
200
+ ---
201
+
202
+ ## 💡 How It Works
203
+
204
+ 1. **User Query**: "Show me hospitals in Panama City"
205
+ 2. **Intent Detection**: LLM classifies as MAP_REQUEST
206
+ 3. **Semantic Search**: Finds `panama_healthsites_geojson` via embeddings
207
+ 4. **SQL Generation**: LLM creates: `SELECT name, geom FROM panama_healthsites_geojson WHERE ST_Intersects(geom, (SELECT geom FROM pan_admin2 WHERE adm2_name = 'Panamá'))`
208
+ 5. **Execution**: DuckDB Spatial runs query → 45 features
209
+ 6. **Visualization**: Auto-styled map with 🏥 icons
210
+ 7. **Explanation**: LLM streams natural language summary
211
+
212
+ **Streaming**: See the LLM's thinking process in real-time via Server-Sent Events.
213
+
214
+ 📖 **[Detailed Data Flow](docs/DATA_FLOW.md)** | **[LLM Integration](docs/backend/LLM_INTEGRATION.md)**
215
+
216
+ ---
217
+
218
+ ## 🗺️ Advanced Features
219
+
220
+ ### Choropleth Maps
221
+ Automatically detects numeric columns and creates color gradients:
222
+ - **Linear scale**: For area, count
223
+ - **Logarithmic scale**: For population, density
224
+
225
+ ### Point Visualization Modes
226
+ - **Icon markers** 🏥🎓⛪: For categorical POI (<500 points)
227
+ - **Circle points** ⭕: For large datasets like intersections (>500 points)
228
+
229
+ ### Spatial Operations
230
+ - Intersection: "Find hospitals within protected areas"
231
+ - Difference: "Show me areas outside national parks"
232
+ - Buffer: "Show 5km radius around hospitals"
233
+
234
+ ### Multi-Step Queries
235
+ Complex questions automatically decomposed:
236
+ - "Compare population density with hospital coverage by province"
237
+ 1. Calculate population per province
238
+ 2. Count hospitals per province
239
+ 3. Compute ratios
240
+ 4. Generate comparison chart
241
+
242
+ ---
243
+
244
+ ## 📚 Documentation
245
+
246
+ | Document | Description |
247
+ |----------|-------------|
248
+ | **[ARCHITECTURE.md](ARCHITECTURE.md)** | System design, components, decisions |
249
+ | **[SETUP.md](SETUP.md)** | Development environment setup |
250
+ | **[docs/backend/CORE_SERVICES.md](docs/backend/CORE_SERVICES.md)** | Backend services reference |
251
+ | **[docs/backend/API_ENDPOINTS.md](docs/backend/API_ENDPOINTS.md)** | API endpoint documentation |
252
+ | **[docs/frontend/COMPONENTS.md](docs/frontend/COMPONENTS.md)** | React component architecture |
253
+ | **[docs/DATA_FLOW.md](docs/DATA_FLOW.md)** | End-to-end request walkthrough |
254
+
255
+ ---
256
+
257
+ ## 📄 License
258
+
259
+ MIT License - see **[LICENSE](LICENSE)** for details.
260
+
261
+ ---
262
+
263
+ ## 🙏 Acknowledgments
264
+
265
+ **Data Sources**:
266
+ - [OpenStreetMap](https://www.openstreetmap.org/) - Infrastructure and POI data
267
+ - [Humanitarian Data Exchange (HDX)](https://data.humdata.org/) - Administrative boundaries
268
+ - [World Bank Open Data](https://data.worldbank.org/) - Socioeconomic indicators
269
+ - [Kontur Population Dataset](https://data.humdata.org/organization/kontur) - H3 population grid
270
+ - [STRI GIS Portal](https://stridata-si.opendata.arcgis.com/) - Environmental datasets
271
+
272
+ **Technologies**:
273
+ - [Google Gemini](https://ai.google.dev/) - LLM API
274
+ - [DuckDB](https://duckdb.org/) - Fast in-process analytics
275
+ - [Leaflet](https://leafletjs.com/) - Interactive maps
276
+ - [Next.js](https://nextjs.org/) - React framework
277
+
SETUP.md ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GeoQuery Setup Guide
2
+
3
+ Complete guide for setting up the GeoQuery development environment.
4
+
5
+ ---
6
+
7
+ ## Prerequisites
8
+
9
+ ### Required Software
10
+
11
+ | Requirement | Minimum Version | Purpose |
12
+ |------------|----------------|---------|
13
+ | **Python** | 3.11+ | Backend runtime |
14
+ | **Node.js** | 18+ | Frontend runtime |
15
+ | **npm** | 9+ | Package management |
16
+ | **Git** | 2.0+ | Version control |
17
+
18
+ ### API Keys
19
+
20
+ - **Google AI API Key (Gemini)**: Required for LLM functionality
21
+ - Get one free at: https://aistudio.google.com/app/apikey
22
+ - Free tier: 15 requests/minute, 1500/day
23
+
24
+ ### System Requirements
25
+
26
+ - **RAM**: 4GB minimum, 8GB recommended (for DuckDB in-memory database)
27
+ - **Disk**: 2GB for datasets
28
+ - **OS**: macOS, Linux, or Windows (WSL recommended)
29
+
30
+ ---
31
+
32
+ ## Installation
33
+
34
+ ### 1. Clone Repository
35
+
36
+ ```bash
37
+ git clone https://github.com/GerardCB/GeoQuery.git
38
+ cd GeoQuery
39
+ ```
40
+
41
+ ### 2. Backend Setup
42
+
43
+ #### Create Virtual Environment
44
+
45
+ ```bash
46
+ cd backend
47
+ python3 -m venv venv
48
+ ```
49
+
50
+ #### Activate Virtual Environment
51
+
52
+ **macOS/Linux**:
53
+ ```bash
54
+ source venv/bin/activate
55
+ ```
56
+
57
+ **Windows** (PowerShell):
58
+ ```powershell
59
+ venv\Scripts\Activate.ps1
60
+ ```
61
+
62
+ **Windows** (CMD):
63
+ ```cmd
64
+ venv\Scripts\activate.bat
65
+ ```
66
+
67
+ #### Install Dependencies
68
+
69
+ ```bash
70
+ pip install --upgrade pip
71
+ pip install -e .
72
+ ```
73
+
74
+ This installs the package in editable mode, including all dependencies declared in `pyproject.toml`.
75
+
76
+ **Key Dependencies**:
77
+ - `fastapi` - Web framework
78
+ - `uvicorn` - ASGI server
79
+ - `duckdb` - Embedded database
80
+ - `geopandas` - Geospatial data processing
81
+ - `sentence-transformers` - Embeddings
82
+ - `google-generativeai` - Gemini SDK
83
+
84
+ #### Configure Environment Variables
85
+
86
+ Create `.env` file in `backend/` directory:
87
+
88
+ ```bash
89
+ # Required
90
+ GEMINI_API_KEY=your-api-key-here
91
+
92
+ # Optional (defaults shown)
93
+ PORT=8000
94
+ HOST=0.0.0.0
95
+ LOG_LEVEL=INFO
96
+ ```
97
+
98
+ **Alternative**: Export directly in terminal:
99
+
100
+ ```bash
101
+ export GEMINI_API_KEY="your-api-key-here"
102
+ ```
103
+
104
+ **Windows**:
105
+ ```powershell
106
+ $env:GEMINI_API_KEY="your-api-key-here"
107
+ ```
108
+
109
+ #### Verify Backend Installation
110
+
111
+ ```bash
112
+ python -c "import backend; print('Backend installed successfully')"
113
+ ```
114
+
115
+ ### 3. Frontend Setup
116
+
117
+ ```bash
118
+ cd ../frontend # From backend directory
119
+ npm install
120
+ ```
121
+
122
+ **Key Dependencies**:
123
+ - `next` - React framework
124
+ - `react` - UI library
125
+ - `leaflet` - Map library
126
+ - `react-leaflet` - React bindings for Leaflet
127
+ - `@dnd-kit/core` - Drag and drop
128
+
129
+ #### Configure Frontend (Optional)
130
+
131
+ Edit `frontend/.env.local` if backend is not on default port:
132
+
133
+ ```bash
134
+ NEXT_PUBLIC_API_URL=http://localhost:8000
135
+ ```
136
+
137
+ ---
138
+
139
+ ## Running Locally
140
+
141
+ ### Start Backend
142
+
143
+ From `backend/` directory with venv activated:
144
+
145
+ ```bash
146
+ uvicorn backend.main:app --reload --host 0.0.0.0 --port 8000
147
+ ```
148
+
149
+ **Flags**:
150
+ - `--reload`: Auto-restart on code changes
151
+ - `--host 0.0.0.0`: Allow external connections
152
+ - `--port 8000`: Port number
153
+
154
+ **Expected Output**:
155
+ ```
156
+ INFO: Uvicorn running on http://0.0.0.0:8000
157
+ INFO: Application startup complete.
158
+ ```
159
+
160
+ **Verify**:
161
+ - Open http://localhost:8000/docs → Should show FastAPI Swagger UI
162
+ - Check http://localhost:8000/api/catalog → Should return GeoJSON catalog
163
+
164
+ ### Start Frontend
165
+
166
+ From `frontend/` directory:
167
+
168
+ ```bash
169
+ npm run dev
170
+ ```
171
+
172
+ **Expected Output**:
173
+ ```
174
+ ▲ Next.js 15.1.3
175
+ - Local: http://localhost:3000
176
+ - Ready in 2.1s
177
+ ```
178
+
179
+ **Verify**:
180
+ - Open http://localhost:3000 → Should show GeoQuery chat interface
181
+
182
+ ---
183
+
184
+ ## Database Setup
185
+
186
+ ### DuckDB Initialization
187
+
188
+ **Automatic**: Database is created in-memory on first query.
189
+
190
+ **Manual Test**:
191
+
192
+ ```python
193
+ from backend.core.geo_engine import get_geo_engine
194
+
195
+ engine = get_geo_engine()
196
+ print(f"Loaded tables: {list(engine.loaded_tables.keys())}")
197
+ ```
198
+
199
+ ### Load Initial Datasets
200
+
201
+ Datasets are loaded lazily (on-demand). To pre-load common datasets:
202
+
203
+ ```python
204
+ from backend.core.geo_engine import get_geo_engine
205
+
206
+ engine = get_geo_engine()
207
+ engine.ensure_table_loaded("pan_admin1") # Provinces
208
+ engine.ensure_table_loaded("panama_healthsites_geojson") # Hospitals
209
+ ```
210
+
211
+ ### Generate Embeddings
212
+
213
+ Required for semantic search:
214
+
215
+ ```bash
216
+ cd backend
217
+ python -c "from backend.core.semantic_search import get_semantic_search; get_semantic_search()"
218
+ ```
219
+
220
+ This generates `backend/data/embeddings.npy` (cached for future use).
221
+
222
+ ---
223
+
224
+ ## Directory Structure After Setup
225
+
226
+ ```
227
+ GeoQuery/
228
+ ├── backend/
229
+ │ ├── venv/ # Virtual environment (created)
230
+ │ ├── .env # Environment variables (created)
231
+ │ ├── data/
232
+ │ │ ├── embeddings.npy # Generated embeddings (created)
233
+ │ │ ├── catalog.json # Dataset registry (existing)
234
+ │ │ └── osm/ # GeoJSON datasets (existing)
235
+ │ └── <source files>
236
+ ├── frontend/
237
+ │ ├── node_modules/ # npm packages (created)
238
+ │ ├── .next/ # Build output (created)
239
+ │ └── <source files>
240
+ └── <other files>
241
+ ```
242
+
243
+ ---
244
+
245
+ ## Common Issues & Troubleshooting
246
+
247
+ ### Backend Issues
248
+
249
+ #### Issue: "ModuleNotFoundError: No module named 'backend'"
250
+
251
+ **Cause**: Virtual environment not activated or package not installed.
252
+
253
+ **Solution**:
254
+ ```bash
255
+ source venv/bin/activate # Activate venv
256
+ pip install -e . # Install package
257
+ ```
258
+
259
+ #### Issue: "duckdb.IOException: No files found that match the pattern"
260
+
261
+ **Cause**: GeoJSON file missing or incorrect path in catalog.json.
262
+
263
+ **Solution**:
264
+ 1. Check file exists: `ls backend/data/osm/hospitals.geojson`
265
+ 2. Verify path in `catalog.json`
266
+ 3. Download missing data: `python backend/scripts/download_geofabrik.py`
267
+
268
+ #### Issue: "google.api_core.exceptions.PermissionDenied: API key not valid"
269
+
270
+ **Cause**: Invalid or missing GEMINI_API_KEY.
271
+
272
+ **Solution**:
273
+ ```bash
274
+ export GEMINI_API_KEY="your-actual-api-key"
275
+ # Restart backend
276
+ ```
277
+
278
+ #### Issue: "Module 'sentence_transformers' has no attribute 'SentenceTransformer'"
279
+
280
+ **Cause**: Corrupted installation.
281
+
282
+ **Solution**:
283
+ ```bash
284
+ pip uninstall sentence-transformers
285
+ pip install sentence-transformers --no-cache-dir
286
+ ```
287
+
288
+ ### Frontend Issues
289
+
290
+ #### Issue: "Error: Cannot find module 'next'"
291
+
292
+ **Cause**: npm packages not installed.
293
+
294
+ **Solution**:
295
+ ```bash
296
+ cd frontend
297
+ rm -rf node_modules package-lock.json
298
+ npm install
299
+ ```
300
+
301
+ #### Issue: "Failed to fetch from localhost:8000"
302
+
303
+ **Cause**: Backend not running or CORS issue.
304
+
305
+ **Solution**:
306
+ 1. Verify backend is running: `curl http://localhost:8000/api/catalog`
307
+ 2. Check CORS settings in `backend/main.py`
308
+ 3. Verify `NEXT_PUBLIC_API_URL` in frontend `.env.local`
309
+
310
+ #### Issue: "Map tiles not loading"
311
+
312
+ **Cause**: Network issue or ad blocker.
313
+
314
+ **Solution**:
315
+ 1. Check internet connection
316
+ 2. Disable ad blocker for localhost
317
+ 3. Alternative tile server in `MapViewer.tsx`:
318
+ ```typescript
319
+ url="https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png"
320
+ ```
321
+
322
+ ### General Issues
323
+
324
+ #### Issue: Port 8000 already in use
325
+
326
+ **Solution**:
327
+ ```bash
328
+ # Find process using port
329
+ lsof -ti:8000
330
+
331
+ # Kill process
332
+ kill -9 $(lsof -ti:8000)
333
+
334
+ # Or use different port
335
+ uvicorn backend.main:app --port 8001
336
+ ```
337
+
338
+ #### Issue: Out of memory errors
339
+
340
+ **Cause**: Loading too many large datasets.
341
+
342
+ **Solution**:
343
+ 1. Reduce dataset size (filter before loading)
344
+ 2. Increase system RAM
345
+ 3. Use query limits: `LIMIT 10000`
346
+
347
+ ---
348
+
349
+ ## Development Workflow
350
+
351
+ ### Code Changes
352
+
353
+ **Backend**:
354
+ - Python files auto-reload with `--reload` flag
355
+ - Changes in `core/`, `services/`, `api/` take effect immediately
356
+
357
+ **Frontend**:
358
+ - Hot Module Replacement (HMR) enabled
359
+ - Changes in `components/`, `app/` reload automatically
360
+
361
+ ### Adding New Datasets
362
+
363
+ 1. **Add GeoJSON file** to appropriate directory (e.g., `backend/data/osm/`)
364
+
365
+ 2. **Update catalog.json**:
366
+ ```json
367
+ "my_new_dataset": {
368
+ "path": "osm/my_new_dataset.geojson",
369
+ "description": "Description for display",
370
+ "semantic_description": "Detailed description for AI",
371
+ "categories": ["infrastructure"],
372
+ "tags": ["roads", "transport"]
373
+ }
374
+ ```
375
+
376
+ 3. **Regenerate embeddings**:
377
+ ```bash
378
+ rm backend/data/embeddings.npy
379
+ python -c "from backend.core.semantic_search import get_semantic_search; get_semantic_search()"
380
+ ```
381
+
382
+ 4. **Test**: Query for the new dataset
383
+
384
+ See [docs/backend/SCRIPTS.md](docs/backend/SCRIPTS.md) for data ingestion scripts.
385
+
386
+ ### Testing API Endpoints
387
+
388
+ **Using curl**:
389
+ ```bash
390
+ # Get catalog
391
+ curl http://localhost:8000/api/catalog
392
+
393
+ # Query chat endpoint
394
+ curl -X POST http://localhost:8000/api/chat \
395
+ -H "Content-Type: application/json" \
396
+ -d '{"message": "Show me provinces", "history": []}'
397
+ ```
398
+
399
+ **Using Swagger UI**:
400
+ - Open http://localhost:8000/docs
401
+ - Try endpoints interactively
402
+
403
+ ---
404
+
405
+ ## Environment Variables Reference
406
+
407
+ | Variable | Required | Default | Description |
408
+ |----------|----------|---------|-------------|
409
+ | `GEMINI_API_KEY` | ✅ Yes | - | Google AI API key |
410
+ | `PORT` | ❌ No | 8000 | Backend server port |
411
+ | `HOST` | ❌ No | 0.0.0.0 | Backend host |
412
+ | `LOG_LEVEL` | ❌ No | INFO | Logging level (DEBUG, INFO, WARNING, ERROR) |
413
+ | `DATABASE_PATH` | ❌ No | :memory: | DuckDB database path (use for persistence) |
414
+
415
+ ---
416
+
417
+ ## IDE Setup
418
+
419
+ ### VS Code
420
+
421
+ **Recommended Extensions**:
422
+ - Python (`ms-python.python`)
423
+ - Pylance (`ms-python.vscode-pylance`)
424
+ - ESLint (`dbaeumer.vscode-eslint`)
425
+ - Prettier (`esbenp.prettier-vscode`)
426
+
427
+ **Settings** (`.vscode/settings.json`):
428
+ ```json
429
+ {
430
+ "python.defaultInterpreterPath": "./backend/venv/bin/python",
431
+ "python.linting.enabled": true,
432
+ "python.formatting.provider": "black",
433
+ "editor.formatOnSave": true,
434
+ "[typescript]": {
435
+ "editor.defaultFormatter": "esbenp.prettier-vscode"
436
+ }
437
+ }
438
+ ```
439
+
440
+ ### PyCharm
441
+
442
+ 1. **Set Python Interpreter**: Settings → Project → Python Interpreter → Add → Existing Environment → `backend/venv/bin/python`
443
+ 2. **Enable FastAPI**: Settings → Languages & Frameworks → FastAPI
444
+ 3. **Configure Run**: Run → Edit Configurations → Add → Python → Script path: `backend/main.py`
445
+
446
+ ---
447
+
448
+ ## Next Steps
449
+
450
+ - ✅ **Verify installation** by running a test query
451
+ - 📖 **Read [ARCHITECTURE.md](ARCHITECTURE.md)** to understand the system
452
+ - 🔧 **Explore [docs/backend/CORE_SERVICES.md](docs/backend/CORE_SERVICES.md)** for component details
453
+ - 📊 **Review [docs/data/DATASET_SOURCES.md](docs/data/DATASET_SOURCES.md)** for available data
454
+
455
+
backend/__init__.py ADDED
File without changes
backend/api/api.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter
from backend.api.endpoints import chat, schema, catalog

# Top-level API router aggregating every endpoint module under its own
# URL prefix: /chat, /schema and /catalog.
api_router = APIRouter()
api_router.include_router(chat.router, prefix="/chat", tags=["chat"])
api_router.include_router(schema.router, prefix="/schema", tags=["schema"])
api_router.include_router(catalog.router, prefix="/catalog", tags=["catalog"])
8
+
backend/api/endpoints/catalog.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Catalog Management Endpoints
3
+
4
+ Provides API for viewing and enriching the data catalog.
5
+ """
6
+
7
+ from fastapi import APIRouter, HTTPException, BackgroundTasks
8
+ from pydantic import BaseModel
9
+ from typing import List, Optional, Dict, Any
10
+
11
+ router = APIRouter()
12
+
13
+
14
class CatalogStatsResponse(BaseModel):
    """Aggregate statistics about the data catalog (payload of GET /catalog/stats)."""

    total_datasets: int  # number of tables registered in the catalog
    enriched_datasets: int  # tables that already carry LLM-generated metadata
    by_category: Dict[str, int]  # dataset count per category
    by_tag: Dict[str, int]  # dataset count per tag
19
+
20
+
21
class TableMetadataResponse(BaseModel):
    """Full metadata record for a single catalog table."""

    name: str  # table name (key in the catalog registry)
    path: str  # file path of the backing dataset
    description: str  # short human-written description
    semantic_description: Optional[str]  # LLM-generated description (None until enriched)
    tags: List[str]  # search/discovery tags
    data_type: str  # defaults to "static" when the catalog entry has none
    columns: List[str]  # column names of the dataset
    row_count: Optional[int]  # None when the catalog has no count
    category: str  # defaults to "unknown" when missing
    last_indexed: Optional[str]  # timestamp of last embedding index — presumably ISO format, TODO confirm
    last_enriched: Optional[str]  # timestamp of last LLM enrichment — presumably ISO format, TODO confirm
33
+
34
+
35
class EnrichmentRequest(BaseModel):
    """Request body for POST /catalog/enrich."""

    table_names: Optional[List[str]] = None  # None = all tables
    force_refresh: bool = False  # re-enrich even if metadata already exists
38
+
39
+
40
class EnrichmentResponse(BaseModel):
    """Acknowledgement returned when enrichment work has been queued."""

    status: str  # "queued" on success
    message: str  # human-readable summary
    tables_queued: int  # how many tables were scheduled for enrichment
44
+
45
+
46
@router.get("/stats", response_model=CatalogStatsResponse)
async def get_catalog_stats():
    """Return aggregate statistics about the data catalog."""
    from backend.core.data_catalog import get_data_catalog

    raw = get_data_catalog().get_stats()

    # "enriched_datasets" may be missing from older stats dicts; default to 0.
    return CatalogStatsResponse(
        total_datasets=raw["total_datasets"],
        by_category=raw["by_category"],
        by_tag=raw["by_tag"],
        enriched_datasets=raw.get("enriched_datasets", 0),
    )
60
+
61
+
62
@router.get("/tables", response_model=List[TableMetadataResponse])
async def list_catalog_tables():
    """Return the metadata of every table registered in the catalog."""
    from backend.core.data_catalog import get_data_catalog

    def to_response(name: str, meta: Dict[str, Any]) -> TableMetadataResponse:
        # Build one response row per catalog entry; missing fields fall back
        # to the same defaults this endpoint has always reported.
        return TableMetadataResponse(
            name=name,
            path=meta.get("path", ""),
            description=meta.get("description", ""),
            semantic_description=meta.get("semantic_description"),
            tags=meta.get("tags", []),
            data_type=meta.get("data_type", "static"),
            columns=meta.get("columns", []),
            row_count=meta.get("row_count"),
            category=meta.get("category", "unknown"),
            last_indexed=meta.get("last_indexed"),
            last_enriched=meta.get("last_enriched"),
        )

    catalog = get_data_catalog()
    return [to_response(name, meta) for name, meta in catalog.catalog.items()]
86
+
87
+
88
@router.get("/tables/{table_name}", response_model=TableMetadataResponse)
async def get_table_metadata(table_name: str):
    """Return the metadata record for one catalog table, or 404 if unknown."""
    from backend.core.data_catalog import get_data_catalog

    record = get_data_catalog().get_table_metadata(table_name)
    if not record:
        raise HTTPException(status_code=404, detail=f"Table '{table_name}' not found")

    # Defaults mirror those reported by the /tables listing endpoint.
    return TableMetadataResponse(
        name=table_name,
        path=record.get("path", ""),
        description=record.get("description", ""),
        semantic_description=record.get("semantic_description"),
        tags=record.get("tags", []),
        data_type=record.get("data_type", "static"),
        columns=record.get("columns", []),
        row_count=record.get("row_count"),
        category=record.get("category", "unknown"),
        last_indexed=record.get("last_indexed"),
        last_enriched=record.get("last_enriched"),
    )
112
+
113
+
114
@router.post("/enrich", response_model=EnrichmentResponse)
async def enrich_catalog(request: EnrichmentRequest, background_tasks: BackgroundTasks):
    """
    Trigger LLM enrichment for catalog tables.

    Enrichment generates semantic descriptions and refined tags.
    Runs in the background to avoid blocking the request.
    """
    from backend.core.data_catalog import get_data_catalog

    catalog = get_data_catalog()

    if not request.table_names:
        # No explicit selection: enrich every table in the catalog.
        targets = list(catalog.catalog.keys())
    else:
        # Reject the whole request if any requested table is unknown.
        invalid = [t for t in request.table_names if t not in catalog.catalog]
        if invalid:
            raise HTTPException(
                status_code=400,
                detail=f"Unknown tables: {invalid}"
            )
        targets = request.table_names

    async def run_enrichment():
        # Enrich one table at a time in the background.
        for table_name in targets:
            await catalog.enrich_table(table_name, request.force_refresh)

    background_tasks.add_task(run_enrichment)

    return EnrichmentResponse(
        status="queued",
        message=f"Enrichment started for {len(targets)} tables",
        tables_queued=len(targets),
    )
150
+
151
+
152
@router.post("/enrich/{table_name}")
async def enrich_single_table(table_name: str, force: bool = False):
    """
    Immediately enrich a single table (synchronous).

    Use for testing or when you need the result right away.
    """
    from backend.core.data_catalog import get_data_catalog

    catalog = get_data_catalog()

    # Guard clauses: unknown table -> 404, failed enrichment -> 500.
    if table_name not in catalog.catalog:
        raise HTTPException(status_code=404, detail=f"Table '{table_name}' not found")

    if not await catalog.enrich_table(table_name, force):
        raise HTTPException(status_code=500, detail=f"Failed to enrich table '{table_name}'")

    # Re-read the record so the response reflects the freshly written metadata.
    enriched = catalog.get_table_metadata(table_name)
    return {
        "status": "success",
        "table": table_name,
        "semantic_description": enriched.get("semantic_description"),
        "tags": enriched.get("tags", []),
    }
178
+
179
+
180
@router.get("/search")
async def search_tables(query: str, top_k: int = 10):
    """
    Search for tables using semantic search.

    Returns the most relevant tables for a natural language query.
    """
    from backend.core.semantic_search import get_semantic_search
    from backend.core.data_catalog import get_data_catalog

    catalog = get_data_catalog()
    hits = get_semantic_search().search(query, top_k=top_k)

    matches = []
    for table_name, score in hits:
        meta = catalog.get_table_metadata(table_name)
        if not meta:
            # Skip hits whose catalog record is missing.
            continue
        matches.append({
            "table": table_name,
            "score": round(score, 4),
            "description": meta.get("semantic_description") or meta.get("description"),
            "tags": meta.get("tags", []),
        })

    return {"query": query, "results": matches}
207
+
208
+
209
@router.post("/rebuild-embeddings")
async def rebuild_embeddings():
    """
    Rebuild all semantic search embeddings from current catalog.

    Use after bulk enrichment or catalog updates.

    Returns:
        Summary dict with the number of tables re-embedded and the total
        number of embeddings held by the semantic search service.
    """
    from backend.core.semantic_search import get_semantic_search
    from backend.core.data_catalog import get_data_catalog

    semantic = get_semantic_search()
    catalog = get_data_catalog()

    # Force re-embed all tables; count only the ones embed_table reports
    # as successfully embedded.
    count = 0
    for table_name, metadata in catalog.catalog.items():
        if semantic.embed_table(table_name, metadata):
            count += 1

    # NOTE(review): persists via a private method of the semantic search
    # service; consider exposing a public save() instead.
    semantic._save_embeddings()

    return {
        "status": "success",
        "message": f"Rebuilt embeddings for {count} tables",
        "total_embeddings": len(semantic.embeddings)
    }
235
+
backend/api/endpoints/chat.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Any, Optional

from fastapi import APIRouter
from pydantic import BaseModel

from backend.services.executor import QueryExecutor
5
+
6
+ router = APIRouter()
7
+
8
+
9
class MessageHistory(BaseModel):
    """One prior turn of the conversation."""

    role: str  # "user" or "assistant"
    content: str  # raw message text of that turn
12
+
13
+
14
class ChatRequest(BaseModel):
    """Incoming chat payload: the new user message plus prior turns."""

    message: str  # the user's new question/command
    history: list[MessageHistory] = []  # earlier conversation turns — presumably oldest first, TODO confirm
17
+
18
+
19
class ChartData(BaseModel):
    """Chart specification attached to statistical answers."""

    type: str  # 'bar', 'line', 'pie', 'donut'
    title: Optional[str] = None  # optional chart heading
    data: list[dict] = []  # one dict per data point
    xKey: Optional[str] = None  # key within `data` entries used for the x axis
    yKey: Optional[str] = None  # key within `data` entries used for the y axis
    lines: Optional[list[dict]] = None  # per-series config — assumed for multi-line charts, TODO confirm shape
26
+
27
+
28
class ChatResponse(BaseModel):
    """Structured answer returned by POST /chat/."""

    response: str  # natural-language answer text
    sql_query: Optional[str] = None  # SQL that was executed, if any
    geojson: Optional[dict] = None  # map layer payload, if any
    data_citations: list[str] = []  # datasets referenced by the answer
    intent: Optional[str] = None  # detected intent classification
    chart_data: Optional[ChartData] = None  # NEW: For STAT_QUERY responses
    # Declared so the executor's raw data actually reaches the client: the
    # chat endpoint already passes raw_data=..., but without this field
    # pydantic silently discarded the value before serialization.
    raw_data: Optional[Any] = None  # tabular rows backing the answer — shape set by executor, TODO confirm
35
+
36
+
37
@router.post("/", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """
    Main chat endpoint: runs the full query pipeline with conversation
    context and returns the structured result in one response.
    """
    # The executor expects plain dicts, not pydantic models.
    turns = [{"role": turn.role, "content": turn.content} for turn in request.history]

    result = await QueryExecutor().process_query_with_context(
        query=request.message,
        history=turns,
    )

    fallback = "I processed your request."
    return ChatResponse(
        response=result.get("response", fallback),
        sql_query=result.get("sql_query"),
        geojson=result.get("geojson"),
        data_citations=result.get("data_citations", []),
        intent=result.get("intent"),
        chart_data=result.get("chart_data"),
        raw_data=result.get("raw_data")
    )
63
+
64
+
65
from sse_starlette.sse import EventSourceResponse
import json
# NOTE: `import asyncio` was removed here — it was never used anywhere in
# this module. These imports would conventionally live at the top of the file.


@router.post("/stream")
async def chat_stream(request: ChatRequest):
    """
    Streaming chat endpoint that returns Server-Sent Events (SSE).

    Events are produced entirely by the executor's streaming pipeline; any
    failure is surfaced to the client as a final text chunk rather than an
    abruptly broken stream.
    """
    executor = QueryExecutor()
    history = [{"role": h.role, "content": h.content} for h in request.history]

    async def event_generator():
        try:
            # Delegate entirely to the executor's streaming process
            async for event in executor.process_query_stream(request.message, history):
                yield event

        except Exception as e:
            # Keep the SSE stream well-formed: report the error as a chunk.
            print(f"Stream error: {e}")
            yield {
                "event": "chunk",
                "data": json.dumps({"type": "text", "content": f"\n\nError: {str(e)}"})
            }

    return EventSourceResponse(event_generator())
backend/api/endpoints/schema.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Schema endpoint - Provides data catalog information to users.
3
+ Shows available tables, columns, and data descriptions.
4
+ """
5
+
6
+ from fastapi import APIRouter
7
+ from pydantic import BaseModel
8
+ from typing import Optional, List, Any
9
+ from backend.core.data_catalog import get_data_catalog
10
+
11
+ router = APIRouter()
12
+
13
+
14
class ColumnInfo(BaseModel):
    """One column of a catalog table as exposed in the public schema."""

    name: str  # column name
    type: str  # coarse name-based guess: "geometry", "integer" or "text"
    description: Optional[str] = None  # not currently populated by the schema endpoint
18
+
19
+
20
class TableInfo(BaseModel):
    """Schema entry for one catalog table."""

    name: str  # table name (catalog key)
    description: str  # semantic description when available, else plain description
    row_count: int  # 0 when the catalog has no count for the table
    columns: List[ColumnInfo]  # per-column schema information
25
+
26
+
27
class SchemaResponse(BaseModel):
    """Payload of GET /schema/: the full public data catalog."""

    tables: List[TableInfo]  # all registered tables with their columns
    last_updated: str  # "Dynamic" — the catalog is rebuilt on every request
    data_source: str  # human-readable provenance string
31
+
32
+
33
@router.get("/", response_model=SchemaResponse)
async def get_schema():
    """
    Returns the dynamic data catalog with all available tables and their schemas.

    Column types are not stored in the catalog, so they are inferred
    heuristically from column names alone.
    """
    catalog = get_data_catalog()

    def guess_type(col_name: str) -> str:
        """Best-effort column type from the column's name."""
        if col_name == "geom":
            return "geometry"
        # Match "id" / "*_id" exactly: the previous substring test
        # ('"id" in col_name') mislabeled columns like "width" as integers.
        if col_name == "id" or col_name.endswith("_id"):
            return "integer"
        if "name" in col_name:
            return "text"
        return "text"  # Default

    tables = [
        TableInfo(
            name=table_name,
            description=meta.get("semantic_description") or meta.get("description", ""),
            row_count=meta.get("row_count") or 0,
            columns=[
                ColumnInfo(name=col, type=guess_type(col), description=None)
                for col in meta.get("columns", [])
            ],
        )
        for table_name, meta in catalog.catalog.items()
    ]

    return SchemaResponse(
        tables=tables,
        last_updated="Dynamic",
        data_source="GeoQuery Data Catalog (OSM, Overture, HDX, INEC)"
    )
73
+
74
+
75
@router.get("/tables")
async def list_tables():
    """Return just the available table names and how many there are."""
    registry = get_data_catalog().catalog
    names = list(registry.keys())
    return {"tables": names, "count": len(names)}
backend/core/catalog_enricher.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Catalog Enricher Service
3
+
4
+ Automatically generates rich metadata for datasets using LLM.
5
+ Enhances table descriptions and tags for better semantic search.
6
+ """
7
+
8
+ import logging
9
+ from typing import Dict, List, Any, Optional
10
+ from backend.core.llm_gateway import LLMGateway
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
# Prompt template for generating semantic descriptions.
# Filled in via str.format() with table metadata before being sent to the LLM.
DESCRIPTION_PROMPT = """Generate a concise 2-3 sentence description for this geographic dataset.

Table Name: {table_name}
Category: {category}
Columns: {columns}
Sample Column Values: {sample_values}
Row Count: {row_count}

Focus on:
1. What geographic entities it contains (districts, health facilities, roads, etc.)
2. The geographic scope (Panama, specific province, etc.)
3. Common use cases (administrative analysis, health coverage, etc.)

Return ONLY the description, no formatting or labels."""

# Prompt template for generating/refining tags.
# The model is asked for a bare JSON array of strings; the caller strips any
# Markdown fences before parsing.
TAG_PROMPT = """Suggest 5-8 relevant tags for this geographic dataset.

Table Name: {table_name}
Description: {description}
Columns: {columns}
Current Tags: {current_tags}

Rules:
1. Tags should be lowercase, single words or hyphenated
2. Include domain tags (health, education, infrastructure)
3. Include geographic tags (administrative, boundaries, points)
4. Include data type tags (census, osm, government)

Return ONLY a JSON array of strings, e.g. ["health", "facilities", "infrastructure"]"""
46
+
47
+
48
class CatalogEnricher:
    """
    Enriches catalog metadata with LLM-generated descriptions and tags.

    Can be run on-demand for new datasets or batch-run for existing ones.
    """

    # Process-wide singleton instance (managed by __new__).
    _instance = None

    def __new__(cls):
        # Singleton pattern: always return the same instance and flag it so
        # __init__ only performs real initialization once.
        if cls._instance is None:
            cls._instance = super(CatalogEnricher, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every CatalogEnricher() call; skip after first init.
        if self.initialized:
            return

        self.llm = LLMGateway()
        self.initialized = True

    async def generate_description(
        self,
        table_name: str,
        metadata: Dict[str, Any],
        sample_values: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Generate a semantic description for a dataset using LLM.

        Args:
            table_name: Name of the table
            metadata: Catalog metadata dict
            sample_values: Optional dict of column -> sample value

        Returns:
            Generated description string (falls back to the static catalog
            description if the LLM call fails)
        """
        columns = metadata.get("columns", [])
        category = metadata.get("category", "unknown")
        row_count = metadata.get("row_count", "unknown")

        # Format sample values (at most 5, to keep the prompt compact)
        sample_str = "Not available"
        if sample_values:
            sample_str = ", ".join(f"{k}: {v}" for k, v in list(sample_values.items())[:5])

        prompt = DESCRIPTION_PROMPT.format(
            table_name=table_name,
            category=category,
            columns=", ".join(columns[:15]),  # Limit columns
            sample_values=sample_str,
            row_count=row_count
        )

        try:
            # NOTE(review): assumes LLMGateway.generate_response returns a
            # plain string -- confirm against llm_gateway.py.
            response = await self.llm.generate_response(prompt)
            description = response.strip()

            # Basic validation: only warn, never reject, on unusual lengths
            if len(description) < 20 or len(description) > 500:
                logger.warning(f"Generated description for {table_name} seems unusual: {len(description)} chars")

            return description

        except Exception as e:
            # Best-effort: fall back to the existing static description
            logger.error(f"Failed to generate description for {table_name}: {e}")
            return metadata.get("description", f"Geographic data from {category}")

    async def generate_tags(
        self,
        table_name: str,
        metadata: Dict[str, Any]
    ) -> List[str]:
        """
        Generate or refine tags for a dataset using LLM.

        Args:
            table_name: Name of the table
            metadata: Catalog metadata dict

        Returns:
            List of cleaned tag strings; on any failure the current tags
            are returned unchanged.
        """
        columns = metadata.get("columns", [])
        description = metadata.get("semantic_description") or metadata.get("description", "")
        current_tags = metadata.get("tags", [])

        prompt = TAG_PROMPT.format(
            table_name=table_name,
            description=description,
            columns=", ".join(columns[:15]),
            current_tags=current_tags
        )

        try:
            import json  # Local import: json only needed for parsing here
            response = await self.llm.generate_response(prompt)

            # Parse JSON array; strip a Markdown code fence (``` / ```json)
            # if the model wrapped its answer in one.
            response = response.strip()
            if response.startswith("```"):
                response = response.split("```")[1]
            if response.startswith("json"):
                response = response[4:]

            tags = json.loads(response)

            if isinstance(tags, list):
                # Validate and clean tags: strings only, lowercased,
                # length-bounded to 2..30 characters
                clean_tags = []
                for tag in tags:
                    if isinstance(tag, str):
                        tag = tag.lower().strip()
                        if 2 <= len(tag) <= 30:
                            clean_tags.append(tag)

                return clean_tags

        except Exception as e:
            logger.error(f"Failed to generate tags for {table_name}: {e}")

        # Non-list payloads and failures both keep the existing tags
        return current_tags

    async def enrich_table(
        self,
        table_name: str,
        metadata: Dict[str, Any],
        sample_values: Optional[Dict[str, str]] = None,
        force_refresh: bool = False
    ) -> Dict[str, Any]:
        """
        Fully enrich a table's metadata with description and tags.

        Args:
            table_name: Name of the table
            metadata: Current catalog metadata
            sample_values: Optional sample data for context
            force_refresh: If True, regenerate even if already enriched

        Returns:
            Updated metadata dict (shallow copy; the input is not mutated)
        """
        updated = metadata.copy()

        # Generate description if missing or forced
        if force_refresh or not metadata.get("semantic_description"):
            logger.info(f"Generating semantic description for {table_name}...")
            description = await self.generate_description(table_name, metadata, sample_values)
            updated["semantic_description"] = description

        # Generate/refine tags when forced or when fewer than 3 exist
        if force_refresh or len(metadata.get("tags", [])) < 3:
            logger.info(f"Generating tags for {table_name}...")
            tags = await self.generate_tags(table_name, updated)
            # Merge with existing, deduplicate (set union loses ordering)
            existing_tags = set(metadata.get("tags", []))
            new_tags = set(tags)
            updated["tags"] = list(existing_tags | new_tags)

        return updated
210
+
211
+
212
# Module-level cache backing the singleton accessor.
_catalog_enricher: Optional[CatalogEnricher] = None


def get_catalog_enricher() -> CatalogEnricher:
    """Return the process-wide CatalogEnricher, creating it on first use."""
    global _catalog_enricher
    if _catalog_enricher is not None:
        return _catalog_enricher
    _catalog_enricher = CatalogEnricher()
    return _catalog_enricher
221
+ return _catalog_enricher
backend/core/data_catalog.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Catalog Service
3
+
4
+ Manages metadata for all datasets available in the platform.
5
+ Supports semantic search integration for scalable discovery.
6
+ """
7
+
8
+ import json
9
+ import duckdb
10
+ import logging
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+ from typing import List, Dict, Any, Optional
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
# Tag inference rules for auto-tagging datasets.
# Each keyword is matched as a substring of the lowercased table name
# (see DataCatalog._infer_tags); matches contribute their tag lists.
TAG_RULES = {
    # Keywords in table name -> tags
    "health": ["health", "facilities", "infrastructure"],
    "hospital": ["health", "facilities", "medical"],
    "clinic": ["health", "facilities", "medical"],
    "school": ["education", "facilities", "infrastructure"],
    "university": ["education", "facilities", "higher-education"],
    "education": ["education", "facilities"],
    "road": ["transportation", "infrastructure", "roads"],
    "street": ["transportation", "infrastructure", "roads"],
    "highway": ["transportation", "infrastructure", "roads"],
    "airport": ["transportation", "infrastructure", "aviation"],
    "port": ["transportation", "infrastructure", "maritime"],
    "population": ["demographics", "census", "population"],
    "census": ["demographics", "census", "statistics"],
    "admin": ["administrative", "boundaries", "government"],
    "district": ["administrative", "boundaries"],
    "province": ["administrative", "boundaries"],
    "corregimiento": ["administrative", "boundaries"],
    "park": ["recreation", "green-space", "amenities"],
    "water": ["hydrology", "natural-resources"],
    "river": ["hydrology", "water"],
    "forest": ["environment", "natural-resources", "land-cover"],
    "building": ["infrastructure", "built-environment"],
    "poi": ["points-of-interest", "amenities"],
}
45
+
46
+
47
class DataCatalog:
    """
    Singleton service managing dataset metadata.

    Features:
    - Auto-discovery of GeoJSON files in data directories
    - Schema inference from first record
    - Auto-tagging based on naming conventions
    - Integration with semantic search for scalable discovery
    """

    # Process-wide singleton instance (managed by __new__).
    _instance = None

    # Data root: backend/data relative to this module; the catalog itself
    # is persisted as JSON inside it.
    DATA_DIR = Path(__file__).parent.parent / "data"
    CATALOG_FILE = DATA_DIR / "catalog.json"

    def __new__(cls):
        # Singleton pattern: reuse the single instance; the `initialized`
        # flag lets __init__ short-circuit on subsequent constructions.
        if cls._instance is None:
            cls._instance = super(DataCatalog, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return

        # table_name -> metadata dict (see scan_and_update for entry shape)
        self.catalog: Dict[str, Any] = {}
        # Order matters: load persisted entries, refresh from disk, then
        # (re)build semantic embeddings over the final catalog.
        self.load_catalog()
        self.scan_and_update()
        self._init_semantic_search()
        self.initialized = True

    def load_catalog(self):
        """Load catalog from JSON file."""
        if self.CATALOG_FILE.exists():
            try:
                with open(self.CATALOG_FILE, 'r') as f:
                    self.catalog = json.load(f)
            except Exception as e:
                # A corrupt catalog file degrades to an empty catalog;
                # scan_and_update will rebuild entries from disk.
                logger.error(f"Failed to load catalog: {e}")
                self.catalog = {}
        else:
            self.catalog = {}

    def save_catalog(self):
        """Save catalog to JSON file."""
        try:
            with open(self.CATALOG_FILE, 'w') as f:
                json.dump(self.catalog, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to save catalog: {e}")

    def _infer_tags(self, table_name: str, columns: List[str]) -> List[str]:
        """Auto-generate tags based on table name and columns."""
        tags = set()
        name_lower = table_name.lower()

        # Check table name against rules (substring match on each keyword)
        for keyword, keyword_tags in TAG_RULES.items():
            if keyword in name_lower:
                tags.update(keyword_tags)

        # Check columns for additional hints
        columns_lower = [c.lower() for c in columns]
        if any('pop' in c for c in columns_lower):
            tags.add("population")
        if any('area' in c for c in columns_lower):
            tags.add("geographic")
        if 'geom' in columns_lower or 'geometry' in columns_lower:
            tags.add("spatial")

        return list(tags)

    def _infer_data_type(self, category: str, table_name: str) -> str:
        """Infer data type (static, semi-static, realtime)."""
        # Base admin data is static
        if category == "base":
            return "static"

        # OSM data is semi-static (updated periodically)
        if category == "osm":
            return "semi-static"

        # HDX humanitarian data - varies
        if category == "hdx":
            return "semi-static"

        # Census data is static
        if "census" in table_name.lower():
            return "static"

        return "static"

    def scan_and_update(self):
        """Scan data directories and update catalog with new files."""
        logger.info("Scanning data directories...")

        # Define directories to scan (each subdir doubles as the category)
        subdirs = ['base', 'osm', 'inec', 'hdx', 'custom', 'overture', 'ms_buildings']

        # Temporary connection for schema inference
        con = duckdb.connect(':memory:')
        con.install_extension('spatial')
        con.load_extension('spatial')

        updated = False

        for subdir in subdirs:
            dir_path = self.DATA_DIR / subdir
            if not dir_path.exists():
                continue

            # Scan for both .geojson and .geojson.gz
            for file_path in list(dir_path.glob('**/*.geojson')) + list(dir_path.glob('**/*.geojson.gz')):
                # Normalize the file stem into a SQL-friendly table name
                table_name = file_path.name.replace('.geojson.gz', '').replace('.geojson', '').lower().replace('-', '_').replace(' ', '_')

                # Check if file path changed (file moved/renamed)
                existing = self.catalog.get(table_name)
                rel_path = str(file_path.relative_to(self.DATA_DIR))

                if existing and existing.get('path') == rel_path:
                    # Already indexed with same path, skip unless missing new fields
                    if 'tags' in existing and 'data_type' in existing:
                        continue

                try:
                    logger.info(f"Indexing {table_name}...")

                    # Read first row to get columns
                    query = f"SELECT * FROM ST_Read('{file_path}') LIMIT 1"
                    df = con.execute(query).fetchdf()
                    columns = list(df.columns)

                    # Count rows (for metadata)
                    row_count_query = f"SELECT COUNT(*) FROM ST_Read('{file_path}')"
                    row_count = con.execute(row_count_query).fetchone()[0]

                    # Auto-generate tags
                    tags = self._infer_tags(table_name, columns)

                    # Infer data type
                    data_type = self._infer_data_type(subdir, table_name)

                    # Build catalog entry
                    self.catalog[table_name] = {
                        "path": rel_path,
                        "description": f"Data from {subdir}/{file_path.name}",
                        "semantic_description": None,  # LLM-generated on demand
                        "tags": tags,
                        "data_type": data_type,
                        "update_frequency": None,
                        "columns": columns,
                        "row_count": row_count,
                        "category": subdir,
                        "format": "geojson",
                        "last_indexed": datetime.now().isoformat()
                    }
                    updated = True

                except Exception as e:
                    # Best-effort: a single unreadable file must not abort the scan
                    logger.warning(f"Failed to index {file_path}: {e}")

        con.close()

        if updated:
            self.save_catalog()
            logger.info("Catalog updated.")

    def _init_semantic_search(self):
        """Initialize semantic search with current catalog."""
        try:
            # Imported lazily to avoid a hard dependency at module import time
            from backend.core.semantic_search import get_semantic_search
            semantic = get_semantic_search()

            # Embed all tables
            new_embeddings = semantic.embed_all_tables(self.catalog)
            if new_embeddings > 0:
                logger.info(f"Created {new_embeddings} new semantic embeddings.")
        except Exception as e:
            # Semantic search is optional; the catalog still works without it
            logger.warning(f"Semantic search initialization failed: {e}")

    def get_table_metadata(self, table_name: str) -> Optional[Dict]:
        """Get metadata for a specific table."""
        return self.catalog.get(table_name)

    def get_all_table_summaries(self) -> str:
        """
        Returns a concise summary of all tables.

        WARNING: This can be very large with many datasets.
        Prefer using semantic_search.search() for discovery.
        """
        summary = "Available Data Tables:\n"

        # Group by category
        by_category: Dict[str, List] = {}
        for name, meta in self.catalog.items():
            cat = meta.get('category', 'other')
            if cat not in by_category:
                by_category[cat] = []
            by_category[cat].append((name, meta))

        for cat, items in by_category.items():
            summary += f"\n## {cat.upper()}\n"
            for name, meta in items:
                # Prefer the enriched description when available
                desc = meta.get('semantic_description') or meta.get('description', 'No description')
                tags = meta.get('tags', [])
                tag_str = f" [{', '.join(tags[:3])}]" if tags else ""
                summary += f"- {name}: {desc}{tag_str}\n"

        return summary

    def get_summaries_for_tables(self, table_names: List[str]) -> str:
        """
        Get summaries only for specified tables.

        Used after semantic pre-filtering to build focused LLM context.
        """
        summary = "Relevant Data Tables:\n\n"

        for name in table_names:
            meta = self.catalog.get(name)
            if not meta:
                # Unknown names are silently skipped
                continue

            desc = meta.get('semantic_description') or meta.get('description', 'No description')
            tags = meta.get('tags', [])
            columns = meta.get('columns', [])[:10]  # Limit columns
            row_count = meta.get('row_count', 'unknown')

            summary += f"### {name}\n"
            summary += f"Description: {desc}\n"
            if tags:
                summary += f"Tags: {', '.join(tags)}\n"
            summary += f"Columns: {', '.join(columns)}\n"
            summary += f"Rows: {row_count}\n\n"

        return summary

    def get_specific_table_schemas(self, table_names: List[str]) -> str:
        """Returns detailed schema for specific tables."""
        output = ""
        for name in table_names:
            meta = self.catalog.get(name)
            if not meta:
                continue

            output += f"### {name}\n"
            output += f"Description: {meta.get('description')}\n"
            output += "Columns: " + ", ".join(meta.get('columns', [])) + "\n\n"
        return output

    def get_file_path(self, table_name: str) -> Optional[Path]:
        """Get absolute path for a table's data file."""
        meta = self.catalog.get(table_name)
        if meta and 'path' in meta:
            return self.DATA_DIR / meta['path']
        return None

    def get_tables_by_tag(self, tag: str) -> List[str]:
        """Get all table names that have a specific tag."""
        return [
            name for name, meta in self.catalog.items()
            if tag in meta.get('tags', [])
        ]

    def get_tables_by_category(self, category: str) -> List[str]:
        """Get all table names in a specific category."""
        return [
            name for name, meta in self.catalog.items()
            if meta.get('category') == category
        ]

    def get_stats(self) -> dict:
        """Return statistics about the catalog."""
        categories = {}
        tags = {}
        enriched_count = 0

        for meta in self.catalog.values():
            cat = meta.get('category', 'other')
            categories[cat] = categories.get(cat, 0) + 1

            # A table counts as enriched once it has an LLM description
            if meta.get('semantic_description'):
                enriched_count += 1

            for tag in meta.get('tags', []):
                tags[tag] = tags.get(tag, 0) + 1

        return {
            "total_datasets": len(self.catalog),
            "enriched_datasets": enriched_count,
            "by_category": categories,
            # Top 20 tags by frequency
            "by_tag": dict(sorted(tags.items(), key=lambda x: -x[1])[:20]),
            "catalog_file": str(self.CATALOG_FILE)
        }

    async def enrich_table(self, table_name: str, force_refresh: bool = False) -> bool:
        """
        Enrich a single table with LLM-generated metadata.

        Returns True if enrichment was successful.
        """
        if table_name not in self.catalog:
            logger.warning(f"Table {table_name} not found in catalog")
            return False

        metadata = self.catalog[table_name]

        # Skip if already enriched (unless forced)
        if not force_refresh and metadata.get('semantic_description'):
            logger.info(f"Table {table_name} already enriched, skipping")
            return True

        try:
            # Lazy import to avoid a circular dependency at module load
            from backend.core.catalog_enricher import get_catalog_enricher
            enricher = get_catalog_enricher()

            # Get sample values for context
            sample_values = await self._get_sample_values(table_name)

            # Enrich
            enriched = await enricher.enrich_table(table_name, metadata, sample_values, force_refresh)

            # Update catalog and persist immediately
            enriched['last_enriched'] = datetime.now().isoformat()
            self.catalog[table_name] = enriched
            self.save_catalog()

            # Re-embed with new description
            self._update_embedding(table_name, enriched)

            logger.info(f"Successfully enriched {table_name}")
            return True

        except Exception as e:
            logger.error(f"Failed to enrich {table_name}: {e}")
            return False

    async def enrich_all_tables(self, force_refresh: bool = False) -> Dict[str, bool]:
        """
        Enrich all tables in the catalog.

        Returns dict of table_name -> success status.
        """
        results = {}

        # Sequential on purpose: one LLM call at a time per table
        for table_name in self.catalog.keys():
            success = await self.enrich_table(table_name, force_refresh)
            results[table_name] = success

        return results

    async def _get_sample_values(self, table_name: str) -> Optional[Dict[str, str]]:
        """Get sample values from a table for enrichment context."""
        try:
            from backend.core.geo_engine import get_geo_engine
            geo_engine = get_geo_engine()

            # Ensure table is loaded
            geo_engine.ensure_table_loaded(table_name)

            # Get one row
            result = geo_engine.con.execute(f"SELECT * FROM {table_name} LIMIT 1").fetchdf()

            if len(result) > 0:
                sample = {}
                for col in result.columns:
                    if col != 'geom':
                        val = result[col].iloc[0]
                        if val is not None:
                            sample[col] = str(val)[:50]  # Limit value length
                return sample

        except Exception as e:
            # Samples are optional context; failure is not an error
            logger.debug(f"Could not get sample values for {table_name}: {e}")

        return None

    def _update_embedding(self, table_name: str, metadata: Dict[str, Any]) -> None:
        """Update semantic search embedding for a table."""
        try:
            from backend.core.semantic_search import get_semantic_search
            semantic = get_semantic_search()
            semantic.embed_table(table_name, metadata)
            semantic._save_embeddings()
        except Exception as e:
            logger.warning(f"Could not update embedding for {table_name}: {e}")
435
+
436
+
437
# Lazily-created module-level singleton.
_data_catalog = None


def get_data_catalog() -> DataCatalog:
    """Return the shared DataCatalog, constructing it on first access."""
    global _data_catalog
    if _data_catalog is not None:
        return _data_catalog
    _data_catalog = DataCatalog()
    return _data_catalog
445
+ return _data_catalog
backend/core/database.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlmodel import SQLModel, create_engine
2
+ from sqlmodel.ext.asyncio.session import AsyncSession
3
+ from sqlalchemy.orm import sessionmaker
4
+ from sqlalchemy.pool import NullPool
5
+ import os
6
+
7
# Connection string for the local Postgres instance.
# Format: postgresql+asyncpg://user:password@host/dbname
# Postgres.app usually defaults to the current OS user with no password.
# The full URL can be overridden via the DATABASE_URL environment variable.
user = os.getenv("USER", "postgres")
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    f"postgresql+asyncpg://{user}:@localhost/geoquery",
)

# Async Engine for AsyncPG.
# NOTE: the asyncpg dialect is async-only. Building it with the synchronous
# create_engine() raises InvalidRequestError at import time, so the engine
# must come from create_async_engine().
from sqlalchemy.ext.asyncio import create_async_engine

async_engine = create_async_engine(DATABASE_URL, echo=True, future=True)

# Backwards-compatible alias: callers importing `engine` from this module
# now get the (only valid) async engine instead of a broken sync one.
engine = async_engine
23
+
24
async def get_session() -> AsyncSession:
    """Async generator yielding a database session bound to the async engine.

    The session is closed automatically when the generator finalizes.
    """
    session_factory = sessionmaker(
        async_engine,
        class_=AsyncSession,
        expire_on_commit=False,
    )
    async with session_factory() as session:
        yield session
+ yield session
30
+
31
async def init_db():
    """Create all SQLModel-declared tables on the configured database."""
    async with async_engine.begin() as conn:
        # Uncomment to drop and recreate the schema from scratch:
        # await conn.run_sync(SQLModel.metadata.drop_all)
        await conn.run_sync(SQLModel.metadata.create_all)
+ await conn.run_sync(SQLModel.metadata.create_all)
backend/core/geo_engine.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ import json
3
+ import logging
4
+ import os
5
+ from typing import Dict, Any, Optional, List
6
+ from backend.core.data_catalog import get_data_catalog
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class GeoEngine:
    """
    In-memory DuckDB engine for spatial queries.

    Loads GeoJSON datasets from the data catalog into DuckDB tables (via the
    DuckDB 'spatial' extension) and executes SQL, returning results as GeoJSON
    FeatureCollections. Implemented as a process-wide singleton.
    """

    # Process-wide singleton instance (managed by __new__).
    _instance = None

    def __new__(cls):
        # Singleton: one DuckDB connection per process.
        if cls._instance is None:
            cls._instance = super(GeoEngine, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return

        logger.info("Initializing GeoEngine (DuckDB)...")
        try:
            self.con = duckdb.connect(database=':memory:')
            self.con.install_extension('spatial')
            self.con.load_extension('spatial')
            logger.info("GeoEngine initialized with Spatial extension.")
        except Exception as e:
            logger.error(f"Failed to initialize GeoEngine: {e}")
            raise e

        self.layers = {}  # layer_id -> table_name for ad-hoc registered layers
        self.catalog = get_data_catalog()
        self.base_tables_loaded = False
        self.initialized = True

        # Automatically load base tables
        self.initialize_base_tables()

    def initialize_base_tables(self):
        """
        Load essential administrative boundary files into DuckDB tables.

        Eagerly loads every catalog entry in the 'base' category so admin
        boundaries are always queryable.
        """
        if self.base_tables_loaded:
            return

        logger.info("Loading base tables into DuckDB...")

        base_tables = [
            name for name, meta in self.catalog.catalog.items()
            if meta.get('category') == 'base'
        ]

        for table_name in base_tables:
            self.ensure_table_loaded(table_name)

        self.base_tables_loaded = True
        logger.info("Base tables loaded.")

    def ensure_table_loaded(self, table_name: str) -> bool:
        """
        Ensure a table is loaded in DuckDB; lazily load it from the catalog if not.

        Returns True if the table is (now) available, False if it is unknown
        or loading failed.
        """
        # Check if already loaded
        try:
            self.con.execute(f"DESCRIBE {table_name}")
            return True
        except Exception:
            pass  # Not loaded yet -- fall through and load from disk

        # Look up in catalog
        file_path = self.catalog.get_file_path(table_name)
        if not file_path or not file_path.exists():
            logger.warning(f"Table {table_name} not found in catalog or file missing.")
            return False

        try:
            logger.info(f"Lazy loading table: {table_name}")
            # Escape single quotes so an unusual path cannot break the SQL literal
            safe_path = str(file_path).replace("'", "''")
            self.con.execute(
                f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM ST_Read('{safe_path}')"
            )
            return True
        except Exception as e:
            logger.error(f"Failed to load {table_name}: {e}")
            return False

    def get_table_schemas(self) -> str:
        """
        Get schema of currently loaded tables for LLM context.

        Returns a text listing of each loaded table with its row count and
        column types.
        """
        result = "Currently Loaded Tables:\n\n"

        try:
            # Get all tables
            tables = self.con.execute("SHOW TABLES").fetchall()
            for table in tables:
                table_name = table[0]
                try:
                    columns = self.con.execute(f"DESCRIBE {table_name}").fetchall()
                    row_count = self.con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

                    result += f"### {table_name} ({row_count} rows)\n"
                    result += "Columns:\n"

                    for col in columns:
                        col_name, col_type = col[0], col[1]
                        if col_name == 'geom':
                            result += " - geom: GEOMETRY (spatial data)\n"
                        else:
                            result += f" - {col_name}: {col_type}\n"
                    result += "\n"
                except Exception as e:
                    # Skip tables we cannot introspect, but leave a trace
                    logger.debug(f"Could not describe {table_name}: {e}")
        except Exception as e:
            logger.error(f"Error getting schemas: {e}")

        return result

    def get_table_list(self) -> List[str]:
        """Return all available table names: catalog datasets plus registered layers.

        Fix: previously referenced a non-existent ``self.BASE_TABLES``
        attribute, which raised AttributeError on every call.
        """
        tables = list(self.catalog.catalog.keys())
        tables.extend(self.layers.values())
        return tables

    def register_layer(self, layer_id: str, geojson: Dict[str, Any]) -> str:
        """
        Registers a GeoJSON object as a table in DuckDB.
        Returns the table name.

        DuckDB's ST_Read reads from files, so the payload is written to a
        temporary file, loaded, and the file removed afterwards (even on
        failure).
        """
        table_name = f"layer_{layer_id.replace('-', '_')}"

        # If table exists, drop it
        self.con.execute(f"DROP TABLE IF EXISTS {table_name}")

        try:
            import tempfile

            def json_serial(obj):
                """JSON serializer for objects not serializable by default json code"""
                if hasattr(obj, 'isoformat'):
                    return obj.isoformat()
                raise TypeError(f"Type {type(obj)} not serializable")

            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
                json.dump(geojson, tmp, default=json_serial)
                tmp_path = tmp.name

            try:
                self.con.execute(
                    f"CREATE TABLE {table_name} AS SELECT * FROM ST_Read('{tmp_path}')"
                )
            finally:
                # Always remove the temp file, even if the load fails
                os.unlink(tmp_path)

            self.layers[layer_id] = table_name
            logger.info(f"Registered layer {layer_id} as table {table_name}")
            return table_name

        except Exception as e:
            logger.error(f"Error registering layer {layer_id}: {e}")
            raise e

    def execute_spatial_query(self, sql: str) -> Dict[str, Any]:
        """
        Executes a SQL query and returns the result as a GeoJSON FeatureCollection.

        The query must return a geometry column named 'geom' or 'geometry';
        all other columns become feature properties.

        Raises:
            ValueError: if the result has no recognizable geometry column.
        """
        try:
            logger.info(f"Executing Spatial SQL: {sql}")

            # Materialize the result so its columns can be inspected
            self.con.execute(f"CREATE OR REPLACE TEMP TABLE query_result AS {sql}")

            # Check columns to find geometry
            columns = self.con.execute("DESCRIBE query_result").fetchall()
            geom_col = next((c[0] for c in columns if c[0] in ['geom', 'geometry']), None)

            if not geom_col:
                # Previously this fell through and produced invalid SQL
                # (ST_AsGeoJSON(None)); fail with a clear message instead.
                raise ValueError(
                    "Query result has no 'geom'/'geometry' column; "
                    "include the geometry in the SELECT clause."
                )

            # Convert the geometry to GeoJSON text, keep all other columns
            other_cols = [c[0] for c in columns if c[0] != geom_col]
            other_cols_select = ", ".join(other_cols) if other_cols else ""

            select_clause = f"ST_AsGeoJSON({geom_col})"
            if other_cols_select:
                select_clause += f", {other_cols_select}"

            rows = self.con.execute(f"SELECT {select_clause} FROM query_result").fetchall()

            features = []
            for row in rows:
                geometry = json.loads(row[0])
                properties = {}
                for i, col_name in enumerate(other_cols):
                    properties[col_name] = row[i + 1]

                features.append({
                    "type": "Feature",
                    "geometry": geometry,
                    "properties": properties,
                })

            return {
                "type": "FeatureCollection",
                "features": features,
                "properties": {},
            }

        except Exception as e:
            logger.error(f"Spatial query failed: {e}")
            raise e

    def get_table_name(self, layer_id: str) -> Optional[str]:
        """Return the DuckDB table name registered for a layer, if any."""
        return self.layers.get(layer_id)
237
+
238
# Lazily-created module-level singleton.
_geo_engine = None


def get_geo_engine() -> GeoEngine:
    """Return the shared GeoEngine, constructing it lazily on first use."""
    global _geo_engine
    if _geo_engine is not None:
        return _geo_engine
    _geo_engine = GeoEngine()
    return _geo_engine
backend/core/llm_gateway.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import json
import os

from dotenv import load_dotenv
from google import genai
from google.genai import types

from backend.core.prompts import (
    SYSTEM_INSTRUCTION,
    INTENT_DETECTION_PROMPT,
    DATA_DISCOVERY_PROMPT,
    SQL_GENERATION_PROMPT,
    EXPLANATION_PROMPT,
    SPATIAL_SQL_PROMPT,
    SQL_CORRECTION_PROMPT,
    LAYER_NAME_PROMPT
)
19
+
20
class LLMGateway:
    """Single entry point for every Gemini call the backend makes.

    Capabilities: intent detection, data discovery, analytical/spatial SQL
    generation and correction, result explanation, layer naming, and general
    chat.  Where the UI streams output, a streaming async-generator variant
    exists alongside the blocking one.

    If no API key is configured, ``self.client`` is ``None`` and every method
    degrades to a harmless fallback instead of raising.
    """

    # Seconds to wait on blocking generate_content calls wrapped in
    # asyncio.wait_for before giving up.
    API_TIMEOUT_SECONDS = 120.0

    # The only intent labels detect_intent() is allowed to return.
    _VALID_INTENTS = ("GENERAL_CHAT", "DATA_QUERY", "MAP_REQUEST", "SPATIAL_OP", "STAT_QUERY")

    def __init__(self, model_name: str = "gemini-3-flash-preview"):
        """Create the gateway.

        Args:
            model_name: Gemini model identifier used for all calls.
        """
        # Load environment variables if not already loaded.
        load_dotenv()

        self.api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            print("WARNING: GEMINI_API_KEY/GOOGLE_API_KEY not found. LLM features will not work.")
            self.client = None
        else:
            # The SDK reads GEMINI_API_KEY from the environment; mirror the
            # key there in case it was only supplied as GOOGLE_API_KEY.
            if "GEMINI_API_KEY" not in os.environ:
                os.environ["GEMINI_API_KEY"] = self.api_key
            self.client = genai.Client()

        self.model = model_name

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _iter_parts(chunk):
        """Safely extract the parts of one streamed chunk.

        Streamed chunks can arrive without candidates or with empty content
        (e.g. metadata-only chunks); returning an empty list lets the relay
        loops skip them instead of raising AttributeError/IndexError.
        """
        if not chunk.candidates:
            return []
        content = chunk.candidates[0].content
        if content is None or not content.parts:
            return []
        return content.parts

    def _relay_stream(self, stream):
        """Yield {"type": "thought"|"content", "text": ...} events from a raw
        Gemini stream, in arrival order.  Shared by all streaming methods."""
        for chunk in stream:
            for part in self._iter_parts(chunk):
                if part.thought:
                    yield {"type": "thought", "text": part.text}
                elif part.text:
                    yield {"type": "content", "text": part.text}

    @staticmethod
    def _strip_fences(text: str, language: str = "sql") -> str:
        """Remove markdown code fences the model sometimes wraps replies in."""
        return text.replace(f"```{language}", "").replace("```", "").strip()

    @staticmethod
    def _history_context(history) -> str:
        """Summarize the last few conversation turns for explanation prompts."""
        if not history:
            return ""
        context = "Previous conversation context:\n"
        for msg in history[-4:]:  # Last 4 messages for context
            context += f"- {msg['role']}: {msg['content'][:100]}...\n"
        return context

    def _build_contents_from_history(self, history: list[dict], current_message: str) -> list:
        """
        Converts conversation history to the format expected by the Gemini API.
        History format: [{"role": "user"|"assistant", "content": "..."}]
        """
        contents = []
        for msg in history:
            # Map 'assistant' to 'model' for Gemini API
            role = "model" if msg["role"] == "assistant" else "user"
            contents.append(
                types.Content(
                    role=role,
                    parts=[types.Part.from_text(text=msg["content"])]
                )
            )

        # Add the current message
        contents.append(
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=current_message)]
            )
        )
        return contents

    # ------------------------------------------------------------------
    # General chat
    # ------------------------------------------------------------------

    async def generate_response_stream(self, user_query: str, history: list[dict] = None):
        """
        Generates a streaming response using conversation history for context.
        Yields {"type": "thought"|"content", "text": ...} dicts.

        NOTE: the two error paths yield plain strings (legacy behavior kept so
        existing consumers that special-case strings keep working).
        """
        if not self.client:
            yield "I couldn't generate a response because the API key is missing."
            return

        if history is None:
            history = []

        try:
            contents = self._build_contents_from_history(history, user_query)

            # Enable thinking mode for general chat as well.
            config = types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                thinking_config=types.ThinkingConfig(
                    include_thoughts=True  # Enable thought summaries
                )
            )

            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=contents,
                config=config,
            )

            # Fix: thought events previously used a "content" key while text
            # events used "text"; all events now use "text" like every other
            # streaming method in this class.
            for event in self._relay_stream(stream):
                yield event

        except Exception as e:
            print(f"Error calling Gemini stream: {e}")
            yield f"Error: {str(e)}"

    async def generate_response(self, user_query: str, history: list[dict] = None) -> str:
        """
        Generates a (blocking) response using conversation history for context.
        """
        if not self.client:
            return "I couldn't generate a response because the API key is missing."

        if history is None:
            history = []

        try:
            contents = self._build_contents_from_history(history, user_query)

            config = types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
            )

            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=contents,
                config=config,
            )
            return response.text
        except Exception as e:
            print(f"Error calling Gemini: {e}")
            return f"I encountered an error: {e}"

    # ------------------------------------------------------------------
    # Intent detection
    # ------------------------------------------------------------------

    async def detect_intent(self, user_query: str, history: list[dict] = None) -> str:
        """
        Detects the intent of the user's query using Gemini thinking mode.
        Returns: GENERAL_CHAT, DATA_QUERY, MAP_REQUEST, SPATIAL_OP, or STAT_QUERY
        (falls back to GENERAL_CHAT on any error or unexpected reply).
        """
        if not self.client:
            return "GENERAL_CHAT"

        intent_prompt = INTENT_DETECTION_PROMPT.format(user_query=user_query)

        try:
            # Use thinking mode for better intent classification.
            config = types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(
                    thinking_level="medium"  # Balanced thinking for intent detection
                )
            )

            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=intent_prompt,
                config=config,
            )
            intent = response.text.strip().upper()

            # Validate the intent against the closed label set.
            if intent in self._VALID_INTENTS:
                return intent

            # Default fallback
            return "GENERAL_CHAT"
        except Exception as e:
            print(f"Error detecting intent: {e}")
            return "GENERAL_CHAT"

    async def stream_intent(self, user_query: str, history: list[dict] = None):
        """
        Streams intent detection, yielding thought and content events.
        """
        if not self.client:
            yield {"type": "error", "text": "API Key missing"}
            return

        intent_prompt = INTENT_DETECTION_PROMPT.format(user_query=user_query)

        try:
            config = types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(
                    thinking_level="medium",
                    include_thoughts=True
                )
            )

            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=intent_prompt,
                config=config,
            )

            for event in self._relay_stream(stream):
                yield event

        except Exception as e:
            print(f"Error detecting intent: {e}")
            yield {"type": "error", "text": str(e)}

    # Legacy generate_sql removed.

    # ------------------------------------------------------------------
    # Data discovery / SQL generation
    # ------------------------------------------------------------------

    async def identify_relevant_tables(self, user_query: str, table_summaries: str) -> list[str]:
        """
        Identifies which tables are relevant for the user's query from the
        catalog summary.  Returns a (possibly empty) list of table names.
        """
        if not self.client:
            return []

        prompt = DATA_DISCOVERY_PROMPT.format(user_query=user_query, table_summaries=table_summaries)

        try:
            config = types.GenerateContentConfig(
                response_mime_type="application/json"
            )

            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=prompt,
                config=config,
            )

            tables = json.loads(self._strip_fences(response.text, "json"))
            return tables if isinstance(tables, list) else []

        except Exception as e:
            print(f"Error identifying tables: {e}")
            return []

    async def generate_analytical_sql(self, user_query: str, table_schema: str, history: list[dict] = None) -> str:
        """
        Generates a DuckDB SQL query for analytical/statistical questions about
        geographic data.  This is the core of the text-to-SQL system.
        Returns SQL text, or a '-- Error ...' string on failure.
        """
        if not self.client:
            return "-- Error: API Key missing"

        prompt = SQL_GENERATION_PROMPT.format(table_schema=table_schema, user_query=user_query)

        try:
            # Use thinking mode for complex SQL generation.
            config = types.GenerateContentConfig(
                temperature=1,
                thinking_config=types.ThinkingConfig(
                    thinking_level="high"  # Maximum reasoning for SQL generation
                )
            )

            response = await asyncio.wait_for(
                asyncio.to_thread(
                    self.client.models.generate_content,
                    model=self.model,
                    contents=prompt,
                    config=config,
                ),
                timeout=self.API_TIMEOUT_SECONDS
            )

            sql = self._strip_fences(response.text, "sql")

            # Basic validation: must start with SELECT (unless it is the
            # intentional "-- ERROR" data-unavailable marker).
            if not sql.upper().strip().startswith("SELECT") and "-- ERROR" not in sql:
                print(f"Warning: Generated SQL doesn't start with SELECT: {sql[:100]}")
                # Salvage: keep everything from the first SELECT onwards.
                if "SELECT" in sql.upper():
                    start_idx = sql.upper().find("SELECT")
                    sql = sql[start_idx:]

            return sql

        except asyncio.TimeoutError:
            # Fix: message previously claimed 30 seconds while the timeout is 120s.
            print(f"Gemini API call timed out after {self.API_TIMEOUT_SECONDS:.0f} seconds")
            return "-- Error: API call timed out. Please try again."
        except Exception as e:
            print(f"Error calling Gemini for analytical SQL: {e}")
            return f"-- Error generating SQL: {str(e)}"

    async def stream_analytical_sql(self, user_query: str, table_schema: str, history: list[dict] = None):
        """
        Streams the generation of DuckDB SQL, yielding thought and content events.
        """
        if not self.client:
            yield {"type": "error", "text": "API Key missing"}
            return

        prompt = SQL_GENERATION_PROMPT.format(table_schema=table_schema, user_query=user_query)

        try:
            config = types.GenerateContentConfig(
                temperature=1,
                thinking_config=types.ThinkingConfig(
                    thinking_level="high",
                    include_thoughts=True
                )
            )

            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=prompt,
                config=config,
            )

            for event in self._relay_stream(stream):
                yield event

        except Exception as e:
            print(f"Error streaming SQL: {e}")
            yield {"type": "error", "text": str(e)}

    # ------------------------------------------------------------------
    # Explanation
    # ------------------------------------------------------------------

    async def stream_explanation(self, user_query: str, sql_query: str, data_summary: str, history: list[dict] = None):
        """
        Streams the natural-language explanation of a query result.
        """
        if not self.client:
            yield {"type": "error", "text": "API Key missing"}
            return

        # Build context from history if available.
        context_str = self._history_context(history)

        prompt = EXPLANATION_PROMPT.format(context_str=context_str, user_query=user_query, sql_query=sql_query, data_summary=data_summary)

        try:
            config = types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                thinking_config=types.ThinkingConfig(
                    thinking_level="low",
                    include_thoughts=True
                )
            )

            stream = await asyncio.to_thread(
                self.client.models.generate_content_stream,
                model=self.model,
                contents=prompt,
                config=config,
            )

            for event in self._relay_stream(stream):
                yield event

        except Exception as e:
            print(f"Error generating explanation: {e}")
            yield {"type": "error", "text": str(e)}

    async def generate_explanation(self, user_query: str, sql_query: str, data_summary: str, history: list[dict] = None) -> str:
        """
        Explains the results of the query to the user, maintaining conversation context.
        """
        if not self.client:
            return "I couldn't generate an explanation because the API key is missing."

        # Build context from history if available.
        context_str = self._history_context(history)

        prompt = EXPLANATION_PROMPT.format(context_str=context_str, user_query=user_query, sql_query=sql_query, data_summary=data_summary)

        try:
            config = types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                thinking_config=types.ThinkingConfig(
                    thinking_level="low"  # Fast response for explanations
                )
            )

            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=prompt,
                config=config,
            )
            return response.text
        except Exception as e:
            print(f"Error generating explanation: {e}")
            return "Here are the results from the query."

    # ------------------------------------------------------------------
    # Spatial SQL / correction / naming
    # ------------------------------------------------------------------

    async def generate_spatial_sql(self, user_query: str, layer_context: str, history: list[dict] = None) -> str:
        """
        Generates a DuckDB Spatial SQL query for geometric operations on layers.
        """
        if not self.client:
            return "-- Error: API Key missing"

        prompt = SPATIAL_SQL_PROMPT.format(layer_context=layer_context, user_query=user_query)

        try:
            config = types.GenerateContentConfig(
                temperature=1,
            )

            # Add timeout to prevent indefinite hangs.
            response = await asyncio.wait_for(
                asyncio.to_thread(
                    self.client.models.generate_content,
                    model=self.model,
                    contents=prompt,
                    config=config,
                ),
                timeout=self.API_TIMEOUT_SECONDS
            )

            return self._strip_fences(response.text, "sql")

        except asyncio.TimeoutError:
            # Fix: message previously claimed 30 seconds while the timeout is 120s.
            print(f"Gemini API call timed out after {self.API_TIMEOUT_SECONDS:.0f} seconds")
            return "-- Error: API call timed out. Please try again."
        except Exception as e:
            print(f"Error calling Gemini: {e}")
            return f"-- Error generating SQL: {str(e)}"

    async def correct_sql(self, user_query: str, incorrect_sql: str, error_message: str, schema_context: str) -> str:
        """
        Corrects a failed SQL query based on the error message.
        Returns the (hopefully) fixed SQL, or the original SQL on failure.
        """
        if not self.client:
            return "-- Error: API Key missing"

        prompt = SQL_CORRECTION_PROMPT.format(
            error_message=error_message,
            incorrect_sql=incorrect_sql,
            user_query=user_query,
            schema_context=schema_context
        )

        try:
            config = types.GenerateContentConfig(
                temperature=1,
            )

            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=prompt,
                config=config,
            )

            return self._strip_fences(response.text, "sql")

        except Exception as e:
            print(f"Error correcting SQL: {e}")
            return incorrect_sql

    async def generate_layer_name(self, user_query: str, sql_query: str) -> dict:
        """
        Generates a short, descriptive name, emoji, and point style for a map layer.
        Returns: {"name": str, "emoji": str, "pointStyle": str | None}
        """
        if not self.client:
            return {"name": "New Layer", "emoji": "📍", "pointStyle": None}

        prompt = LAYER_NAME_PROMPT.format(user_query=user_query, sql_query=sql_query)

        try:
            config = types.GenerateContentConfig(
                temperature=1,
                response_mime_type="application/json"
            )

            # Use simple generate content (not streaming).
            response = await asyncio.to_thread(
                self.client.models.generate_content,
                model=self.model,
                contents=prompt,
                config=config,
            )

            result = json.loads(response.text)
            return {
                "name": result.get("name", "Map Layer"),
                "emoji": result.get("emoji", "📍"),
                "pointStyle": result.get("pointStyle", None)
            }
        except Exception as e:
            print(f"Error generating layer name: {e}")
            return {"name": "Map Layer", "emoji": "📍", "pointStyle": None}
backend/core/prompts.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized storage for all LLM system instructions and prompt templates.
3
+ """
4
+
5
+ SYSTEM_INSTRUCTION = """You are GeoQuery, an advanced Territorial Intelligence Agent capable of analyzing diverse geographic datasets.
6
+
7
+ ## Your Capabilities
8
+ You are not limited to a fixed schema. You have a **Dynamic Metadata Catalog** that allows you to discover and query any dataset ingested into the system.
9
+ - **Administrative Data**: Provinces, districts, corregimientos (always available).
10
+ - **Dynamic Data**: You can query *any* table present in the database (e.g., population, health, infrastructure, biodiversity).
11
+ - **Spatial Analysis**: You can perform complex spatial joins, intersections, and aggregations using PostGIS/DuckDB Spatial functions.
12
+
13
+ ## Output Guidelines
14
+ 1. **Be Data-Driven**: Base your answers strictly on the results of the SQL queries.
15
+ 2. **Be Visual**:
16
+ - Use **Choropleth Maps** (color gradients) for comparisons/densities.
17
+ - Use **Point Maps** for locating specific facilities or events.
18
+ - Use **Charts** (Bar, Pie, Line) for statistical summaries.
19
+ 3. **Be Transparent**:
20
+ - Always **Explain** your reasoning.
21
+ - **Cite** the specific table names used (e.g., "Source: `osm/universities.geojson`").
22
+ - If data is missing *after* checking the catalog, explain clearly what is available vs. what is missing.
23
+
24
+ ## Interaction Style
25
+ - Professional, concise, and helpful.
26
+ - "Thinking" is enabled: Use your internal thought process to plan complex queries before generating SQL.
27
+ - If a query fails, self-correct by analyzing the error message.
28
+ """
29
+
30
+ INTENT_DETECTION_PROMPT = """Analyze this user query and determine the best output type.
31
+
32
+ User Query: "{user_query}"
33
+
34
+ THINK STEP BY STEP:
35
+ 1. What is the user asking for?
36
+ 2. Does this require geographic visualization (map)?
37
+ 3. Does this require a chart/graph?
38
+ 4. Is this a general question or conversation?
39
+
40
+ Then respond with ONLY ONE of these exact words:
41
+ - GENERAL_CHAT: General question, greeting, or conversational message
42
+ - DATA_QUERY: Wants textual information or data that should be shown on a map
43
+ - MAP_REQUEST: Wants to SEE or VISUALIZE geographic data on a map (show, display, plot, color, compare regions)
44
+ - SPATIAL_OP: Geometric operation between layers (Intersection, Buffer, Union, Difference)
45
+ - STAT_QUERY: EXPLICITLY asks for a CHART or GRAPH (bar chart, pie chart, line graph)
46
+
47
+ Key rules:
48
+ - "color by", "compare regions", "show largest/smallest" → MAP_REQUEST (for choropleth)
49
+ - "show me provinces", "display districts" → MAP_REQUEST
50
+ - "create a chart", "bar graph" → STAT_QUERY
51
+ - Questions about data availability → GENERAL_CHAT
52
+
53
+ Respond with only the intent category, nothing else."""
54
+
55
+ DATA_DISCOVERY_PROMPT = """You are a Data Discovery Agent. Convert the user's request into a list of relevant table names from the available data.
56
+
57
+ User Request: "{user_query}"
58
+
59
+ Available Data Tables:
60
+ {table_summaries}
61
+
62
+ Rules:
63
+ 1. Return ONLY a valid JSON list of strings, e.g. ["table1", "table2"].
64
+ 2. Select tables that might contain the answer.
65
+ 3. If asking for "colleges" or "education", include 'universities', 'schools', etc.
66
+ 4. If asking for "health", include 'hospitals'.
67
+ 5. Always include 'admin1', 'admin2', 'admin3' if the query involves regions.
68
+ 6. If no specific table matches, return empty list [].
69
+ """
70
+
71
+ SQL_GENERATION_PROMPT = """You are a DuckDB SQL expert for geographic data analysis. Generate a valid DuckDB SQL query for the following request.
72
+
73
+ {table_schema}
74
+
75
+ ### CRITICAL - Data Availability:
76
+ ✅ You may ONLY query the tables listed above.
77
+ ❌ Do NOT invent table names or column names.
78
+
79
+ **If the requested data is NOT in the schema above, IMMEDIATELY return this exact response and STOP:**
80
+ -- ERROR: DATA_UNAVAILABLE
81
+ -- Requested: [what the user asked for]
82
+ -- Available: [list the tables you DO have]
83
+
84
+ **Do NOT keep thinking or try alternative approaches. Just return the error and stop.**
85
+
86
+ ### User Request: "{user_query}"
87
+
88
+ ### Rules:
89
+ 1. Return ONLY the SQL query. No explanation, no markdown formatting.
90
+ 2. Use DuckDB syntax (ILIKE for case-insensitive matching).
91
+ 3. ALWAYS include 'geom' in SELECT for map visualization.
92
+ 4. For "top N" or "largest" queries, use ORDER BY ... DESC LIMIT N.
93
+ 5. For "per group" queries, use window functions.
94
+ 6. Do NOT add LIMIT unless the user explicitly asks for a specific count (e.g., "top 10", "first 5"). Return all matching rows by default.
95
+ 7. NEVER invent columns that don't exist.
96
+
97
+ ### Special Datasets:
98
+ - **Population/Demographics**: Use `kontur_population` (H3 hexagons).
99
+ - Columns: `population`, `geom`.
100
+ - Query: `SELECT population, geom FROM kontur_population ...`
101
+ - Visualization: The system detects the `population` column and automatically renders a heatmap (choropleth).
102
+ - Note: This dataset is large (33k hexagons). If querying the entire country, use `LIMIT 40000` to ensure full coverage, or filter by specific province/district.
103
+
104
+ ### Example Queries:
105
+
106
+ -- Largest provinces by area
107
+ SELECT adm1_name, area_sqkm, geom FROM admin1 ORDER BY area_sqkm DESC LIMIT 10
108
+
109
+ -- Population Density Heatmap for a Region (e.g., Veraguas)
110
+ SELECT population, geom FROM kontur_population
111
+ WHERE ST_Intersects(geom, (SELECT geom FROM pan_admin1 WHERE adm1_name = 'Veraguas'))
112
+ LIMIT 5000
113
+
114
+ -- Largest district in each province
115
+ SELECT adm1_name, adm2_name, area_sqkm, geom FROM (
116
+ SELECT *, ROW_NUMBER() OVER (PARTITION BY adm1_name ORDER BY area_sqkm DESC) as rn
117
+ FROM admin2
118
+ ) WHERE rn = 1
119
+
120
+ Now generate the SQL for the user's request:"""
121
+
122
+ EXPLANATION_PROMPT = """Explain the results of this data query to the user.
123
+
124
+ {context_str}
125
+
126
+ User Question: "{user_query}"
127
+ SQL Query Used: {sql_query}
128
+ Data Result Summary: {data_summary}
129
+
130
+ Instructions:
131
+ 1. Keep your response concise and helpful
132
+ 2. Only describe data that was ACTUALLY returned in the query results
133
+ 3. The available metrics include: area (area_sqkm), population (kontur_population), names, and geographic codes
134
+ 4. If the user asked for data that doesn't exist, explain that clearly
135
+ 5. Cite: "Administrative boundary data from HDX/INEC, 2021" or "Population data from Kontur, 2022"
136
+ 6. Speak as GeoQuery, the platform itself
137
+ """
138
+
139
+ SPATIAL_SQL_PROMPT = """You are a GIS expert using DuckDB Spatial. Generate a valid SQL query for the following request.
140
+
141
+ Available Data:
142
+ {layer_context}
143
+
144
+ User Request: "{user_query}"
145
+
146
+ Rules:
147
+ 1. Return ONLY the SQL query. No markdown formatting, no explanation.
148
+ 2. Use DuckDB Spatial functions (ST_Difference, ST_Intersection, ST_Union, ST_Buffer, ST_Within, ST_Contains).
149
+ 3. The geometry column is named 'geom'. Use 'geom' for all spatial functions.
150
+ 4. CRITICAL: Use ONLY the EXACT table names shown above in your FROM clause.
151
+ - Base tables are shown with their schema (e.g., panama_healthsites_geojson)
152
+ - User-created layers are shown as "Layer N: Name (Table: layer_xxxxx)"
153
+ 5. IMPORTANT: For operations that aggregate geometries (ST_Union), use CTE pattern, NOT scalar subqueries:
154
+ CORRECT (CTE pattern):
155
+ ```sql
156
+ WITH layer_b_union AS (SELECT ST_Union(geom) as geom FROM layer_b)
157
+ SELECT a.*, ST_Difference(a.geom, b.geom) as geom FROM layer_a a, layer_b_union b
158
+ ```
159
+ WRONG (scalar subquery - causes syntax errors):
160
+ ```sql
161
+ SELECT ST_Difference(geom, (SELECT ST_Union(geom) FROM layer_b)) FROM layer_a
162
+ ```
163
+ 6. For containment queries (points within polygons), use ST_Within(points.geom, polygons.geom).
164
+ 7. Handle joins properly (e.g., CROSS JOIN or comma-join for combining with CTEs).
165
+ 8. IMPORTANT: Preserve 'name' properties if possible.
166
+ 9. OUTPUT: SELECT with geom column included.
167
+ """
168
+
169
+ SQL_CORRECTION_PROMPT = """You are a DuckDB SQL expert. Your previous query failed to execute. Fix it.
170
+
171
+ ### Error Message:
172
+ {error_message}
173
+
174
+ ### Failed SQL:
175
+ {incorrect_sql}
176
+
177
+ ### User Request:
178
+ "{user_query}"
179
+
180
+ ### Database Schema:
181
+ {schema_context}
182
+
183
+ ### Rules:
184
+ 1. Fix the error described in the message (e.g., column ambiguity, missing column, syntax error).
185
+ 2. Return ONLY the valid SQL query. No explanation.
186
+ 3. Keep the query logic consistent with the User Request.
187
+ 4. Ensure 'geom' is selected for map visualization if needed.
188
+ """
189
+
190
+ LAYER_NAME_PROMPT = """You are a helpful assistant generating a short, descriptive name for a map layer.
191
+
192
+ User Request: "{user_query}"
193
+ SQL Query: "{sql_query}"
194
+
195
+ Rules:
196
+ 1. Return a VALID JSON object with three keys: "name", "emoji", and "pointStyle".
197
+ 2. "name": A short descriptive name (1-4 words).
198
+ 3. "emoji": A single emoji representing the data content (e.g., "🏥" for hospitals, "🎓" for schools, "👥" for population).
199
+ 4. "pointStyle": Determines how POINT geometries should be rendered on the map (ONLY applies to Point geometry types):
200
+ - "icon": Use for specific, categorical points of interest (hospitals, schools, parks, landmarks)
201
+ * Best for: Small to medium point datasets (<500 points)
202
+ * Best for: When each point represents a distinct, identifiable feature
203
+ * The emoji will be displayed on the map as the marker icon
204
+ - "circle": Use for large point datasets
205
+ * Best for: Large point datasets (>500 points) like street intersections, sensor locations
206
+ * Renders as simple colored circles for better performance
207
+ - NOTE: For polygon data (H3 hexagons, administrative boundaries), the system automatically uses choropleth rendering (colored polygons). Do NOT set pointStyle for polygon data.
208
+ 5. Examples:
209
+ {{"name": "Schools in Panama", "emoji": "🏫", "pointStyle": "icon"}}
210
+ {{"name": "Population Density", "emoji": "👥", "pointStyle": null}} # H3 hexagons are POLYGONS, not points
211
+ {{"name": "National Parks", "emoji": "🌲", "pointStyle": "icon"}}
212
+ {{"name": "Street Intersections", "emoji": "🚦", "pointStyle": "circle"}}
213
+ 6. Do NOT return markdown formatting (no ```json). Just the raw JSON string.
214
+
215
+ """
216
+
217
+ QUERY_PLANNING_PROMPT = """You are a Query Planning Agent. Decompose this complex query into atomic execution steps.
218
+
219
+ User Query: "{user_query}"
220
+
221
+ Available Tables:
222
+ {available_tables}
223
+
224
+ TASK: Break down this query into sequential steps that can be executed independently.
225
+
226
+ RULES:
227
+ 1. Each step should query a SINGLE dataset or combine results from previous steps.
228
+ 2. Steps that don't depend on each other can run in parallel.
229
+ 3. The final step should combine/compare results if needed.
230
+ 4. Use ONLY the table names listed above.
231
+
232
+ Return a JSON object with this structure:
233
+ {{
234
+ "steps": [
235
+ {{
236
+ "type": "data_query" | "aggregation" | "comparison" | "spatial_join" | "combine",
237
+ "description": "Human-readable description of this step",
238
+ "tables": ["table_name"],
239
+ "sql_hint": "Optional SQL pattern or hint",
240
+ "depends_on": [],
241
+ "result_name": "descriptive_name_for_result"
242
+ }}
243
+ ],
244
+ "combination_logic": "How to combine the step results for the final answer"
245
+ }}
246
+
247
+ EXAMPLE for "Compare hospital count vs school count by province":
248
+ {{
249
+ "steps": [
250
+ {{
251
+ "type": "aggregation",
252
+ "description": "Count hospitals per province",
253
+ "tables": ["panama_healthsites_geojson", "pan_admin1"],
254
+ "sql_hint": "SELECT province, COUNT(*) as hospital_count FROM ... GROUP BY province",
255
+ "depends_on": [],
256
+ "result_name": "hospitals_by_province"
257
+ }},
258
+ {{
259
+ "type": "aggregation",
260
+ "description": "Count schools per province",
261
+ "tables": ["schools", "pan_admin1"],
262
+ "sql_hint": "SELECT province, COUNT(*) as school_count FROM ... GROUP BY province",
263
+ "depends_on": [],
264
+ "result_name": "schools_by_province"
265
+ }},
266
+ {{
267
+ "type": "combine",
268
+ "description": "Join hospital and school counts by province for comparison",
269
+ "tables": [],
270
+ "sql_hint": "JOIN hospitals_by_province h ON schools_by_province s ON h.province = s.province",
271
+ "depends_on": ["hospitals_by_province", "schools_by_province"],
272
+ "result_name": "comparison_result"
273
+ }}
274
+ ],
275
+ "combination_logic": "Display side-by-side comparison with bar chart showing both counts per province"
276
+ }}
277
+
278
+ Now decompose the user's query. Return ONLY the JSON, no markdown formatting.
279
+ """
backend/core/query_planner.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-Step Query Planner
3
+
4
+ Detects complex queries that require multiple datasets or operations,
5
+ decomposes them into atomic steps, and orchestrates execution.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from dataclasses import dataclass, field
11
+ from typing import List, Dict, Any, Optional
12
+ from enum import Enum
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class StepType(Enum):
    """Enumerates the kinds of atomic steps a query plan may contain."""

    DATA_QUERY = "data_query"      # plain data retrieval
    AGGREGATION = "aggregation"    # COUNT / SUM / GROUP BY style rollups
    COMPARISON = "comparison"      # comparing results of previous steps
    SPATIAL_JOIN = "spatial_join"  # joining datasets by geometry
    COMBINE = "combine"            # merging/combining step results
24
+
25
+
26
@dataclass
class QueryStep:
    """One atomic unit of work within a query plan."""

    step_id: str
    step_type: StepType
    description: str
    tables_needed: List[str]
    sql_template: Optional[str] = None
    depends_on: List[str] = field(default_factory=list)
    result_name: str = ""  # name under which the intermediate result is exposed

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this step to a JSON-friendly dict."""
        return dict(
            step_id=self.step_id,
            step_type=self.step_type.value,
            description=self.description,
            tables_needed=self.tables_needed,
            sql_template=self.sql_template,
            depends_on=self.depends_on,
            result_name=self.result_name,
        )
47
+
48
+
49
@dataclass
class QueryPlan:
    """Complete execution plan derived from a (possibly complex) user query."""

    original_query: str
    is_complex: bool
    steps: List[QueryStep] = field(default_factory=list)
    parallel_groups: List[List[str]] = field(default_factory=list)  # step ids runnable concurrently
    final_combination_logic: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the plan, including every step, to a JSON-friendly dict."""
        return dict(
            original_query=self.original_query,
            is_complex=self.is_complex,
            steps=[step.to_dict() for step in self.steps],
            parallel_groups=self.parallel_groups,
            final_combination_logic=self.final_combination_logic,
        )
66
+
67
+
68
class QueryPlanner:
    """
    Multi-step query planning service (process-wide singleton).

    Analyzes user queries to decide whether they need multi-step planning
    (several datasets or operations) and, with LLM help, decomposes complex
    queries into atomic steps that can be partially parallelized.
    """

    _instance = None

    # Keywords that often indicate multi-step queries
    COMPLEXITY_INDICATORS = [
        "compare", "comparison", "versus", "vs",
        "more than", "less than", "higher than", "lower than",
        "both", "and also", "as well as",
        "ratio", "percentage", "proportion",
        "correlation", "relationship between",
        "combine", "merge", "together with",
        "relative to", "compared to",
        "difference between", "gap between"
    ]

    # Keywords indicating multiple distinct data types
    MULTI_DOMAIN_KEYWORDS = {
        "health": ["hospital", "clinic", "healthcare", "health", "medical"],
        "education": ["school", "university", "education", "college", "student"],
        "infrastructure": ["road", "bridge", "infrastructure", "building"],
        "environment": ["forest", "water", "environment", "park", "protected"],
        "population": ["population", "demographic", "census", "people", "resident"]
    }

    def __new__(cls):
        # Classic singleton: a single planner instance per process.
        if cls._instance is None:
            cls._instance = super(QueryPlanner, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return
        self.initialized = True

    def detect_complexity(self, query: str) -> Dict[str, Any]:
        """
        Analyze a query to determine if it requires multi-step planning.

        Returns:
            {
                "is_complex": bool,
                "reason": str,
                "detected_domains": List[str],
                "complexity_indicators": List[str]
            }
        """
        query_lower = query.lower()

        # Substring match against comparison/aggregation phrasing.
        found_indicators = [
            ind for ind in self.COMPLEXITY_INDICATORS
            if ind in query_lower
        ]

        # A query touching several domains likely needs several datasets.
        found_domains = []
        for domain, keywords in self.MULTI_DOMAIN_KEYWORDS.items():
            if any(kw in query_lower for kw in keywords):
                found_domains.append(domain)

        # Heuristic: complex when comparison language meets >=2 domains,
        # or when three or more domains appear at all.
        is_complex = (
            len(found_indicators) > 0 and len(found_domains) >= 2
        ) or (
            len(found_domains) >= 3
        ) or (
            any(x in query_lower for x in ["compare", "ratio", "correlation", "versus", " vs "])
            and len(found_domains) >= 2
        )

        reason = ""
        if is_complex:
            if len(found_domains) >= 2:
                reason = f"Query involves multiple data domains: {', '.join(found_domains)}"
            if found_indicators:
                reason += f". Contains comparison/aggregation keywords: {', '.join(found_indicators[:3])}"

        return {
            "is_complex": is_complex,
            "reason": reason,
            "detected_domains": found_domains,
            "complexity_indicators": found_indicators
        }

    async def plan_query(
        self,
        query: str,
        available_tables: List[str],
        llm_gateway
    ) -> QueryPlan:
        """
        Create an execution plan for a complex query.

        Uses the LLM to decompose the query into atomic steps. On any
        failure (LLM error, malformed JSON) a non-complex fallback plan is
        returned so the caller can execute the query in a single step.
        """
        from backend.core.prompts import QUERY_PLANNING_PROMPT

        # Build table context for the prompt.
        table_list = "\n".join(f"- {t}" for t in available_tables)

        prompt = QUERY_PLANNING_PROMPT.format(
            user_query=query,
            available_tables=table_list
        )

        try:
            response = await llm_gateway.generate_response(prompt, [])

            # Strip optional markdown code fences before parsing JSON.
            response_clean = response.strip()
            if response_clean.startswith("```json"):
                response_clean = response_clean[7:]
            if response_clean.startswith("```"):
                response_clean = response_clean[3:]
            if response_clean.endswith("```"):
                response_clean = response_clean[:-3]

            plan_data = json.loads(response_clean.strip())

            # Convert to QueryPlan
            steps = []
            for i, step_data in enumerate(plan_data.get("steps", [])):
                # Tolerate unknown step types from the LLM instead of
                # aborting the whole plan with a ValueError.
                raw_type = step_data.get("type", "data_query")
                try:
                    step_type = StepType(raw_type)
                except ValueError:
                    logger.warning(f"Unknown step type '{raw_type}', defaulting to data_query")
                    step_type = StepType.DATA_QUERY

                steps.append(QueryStep(
                    step_id=f"step_{i+1}",
                    step_type=step_type,
                    description=step_data.get("description", ""),
                    tables_needed=step_data.get("tables", []),
                    sql_template=step_data.get("sql_hint", None),
                    depends_on=step_data.get("depends_on", []),
                    result_name=step_data.get("result_name", f"result_{i+1}")
                ))

            # Determine parallel groups (steps with no unmet dependencies run together)
            parallel_groups = self._compute_parallel_groups(steps)

            return QueryPlan(
                original_query=query,
                is_complex=True,
                steps=steps,
                parallel_groups=parallel_groups,
                final_combination_logic=plan_data.get("combination_logic", "")
            )

        except Exception as e:
            logger.error(f"Query planning failed: {e}")
            # Return single-step fallback
            return QueryPlan(
                original_query=query,
                is_complex=False,
                steps=[],
                parallel_groups=[],
                final_combination_logic=""
            )

    def _compute_parallel_groups(self, steps: List[QueryStep]) -> List[List[str]]:
        """
        Compute which steps can be executed in parallel.

        A step is ready once every entry in its ``depends_on`` list is
        satisfied. LLM-produced plans reference dependencies by
        ``result_name`` (see the planning prompt's example), so a completed
        step satisfies dependencies on both its step_id AND its result_name.
        (Previously only step_ids were tracked, so result_name dependencies
        never resolved and the cycle guard forced sequential execution.)
        """
        if not steps:
            return []

        groups: List[List[str]] = []
        satisfied = set()  # step_ids and result_names of completed steps
        remaining = {s.step_id: s for s in steps}

        while remaining:
            # Find steps whose dependencies are all satisfied
            ready = [
                step_id for step_id, step in remaining.items()
                if all(dep in satisfied for dep in step.depends_on)
            ]

            if not ready:
                # Unknown or cyclic dependency: force sequential progress on
                # the first remaining step to avoid an infinite loop.
                ready = list(remaining.keys())[:1]

            groups.append(ready)

            for step_id in ready:
                step = remaining.pop(step_id)
                satisfied.add(step.step_id)
                if step.result_name:
                    satisfied.add(step.result_name)

        return groups

    def create_simple_plan(self, query: str) -> QueryPlan:
        """Create a trivial single-step plan for non-complex queries."""
        return QueryPlan(
            original_query=query,
            is_complex=False,
            steps=[
                QueryStep(
                    step_id="step_1",
                    step_type=StepType.DATA_QUERY,
                    description="Execute query directly",
                    tables_needed=[],
                    depends_on=[]
                )
            ],
            parallel_groups=[["step_1"]]
        )
280
+
281
+
282
# Singleton accessor
_query_planner: Optional[QueryPlanner] = None


def get_query_planner() -> QueryPlanner:
    """Return the process-wide QueryPlanner, creating it on first access."""
    global _query_planner
    if _query_planner is not None:
        return _query_planner
    _query_planner = QueryPlanner()
    return _query_planner
backend/core/semantic_search.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Semantic Search Service for Dataset Discovery
3
+
4
+ Uses Gemini embeddings to find relevant datasets from a query,
5
+ enabling scalable discovery across 250+ datasets without context overflow.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import numpy as np
11
+ from pathlib import Path
12
+ from typing import Dict, List, Optional, Tuple
13
+ from google import genai
14
+ from google.genai import types
15
+ import os
16
+ from dotenv import load_dotenv
17
+
18
+ load_dotenv()
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class SemanticSearch:
    """
    Embedding-based semantic search for dataset discovery (singleton).

    Embeds dataset metadata (name, description, tags, columns) and finds
    the most relevant datasets for a user query using cosine similarity.
    Falls back to keyword-overlap matching when no API key is configured.
    """

    _instance = None
    EMBEDDINGS_FILE = Path(__file__).parent.parent / "data" / "embeddings.json"
    EMBEDDING_MODEL = "models/text-embedding-004"

    def __new__(cls):
        # Classic singleton: one search index per process.
        if cls._instance is None:
            cls._instance = super(SemanticSearch, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        if self.initialized:
            return

        self.embeddings: Dict[str, List[float]] = {}
        self.metadata_cache: Dict[str, str] = {}  # table_name -> embedded text

        # Initialize Gemini client. Pass the key explicitly so a
        # GEMINI_API_KEY-only environment works regardless of which env
        # vars the SDK resolves on its own (previously the fetched key
        # was checked but never handed to the client).
        api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
        if api_key:
            self.client = genai.Client(api_key=api_key)
        else:
            self.client = None
            logger.warning("No API key found. Semantic search will use fallback keyword matching.")

        self._load_embeddings()
        self.initialized = True

    def _load_embeddings(self) -> None:
        """Load cached embeddings from disk; start empty on any failure."""
        if self.EMBEDDINGS_FILE.exists():
            try:
                with open(self.EMBEDDINGS_FILE, 'r') as f:
                    data = json.load(f)
                self.embeddings = data.get("embeddings", {})
                self.metadata_cache = data.get("metadata", {})
                logger.info(f"Loaded {len(self.embeddings)} cached embeddings.")
            except Exception as e:
                logger.error(f"Failed to load embeddings: {e}")
                self.embeddings = {}
                self.metadata_cache = {}

    def _save_embeddings(self) -> None:
        """Persist the embeddings cache to disk (best effort)."""
        try:
            self.EMBEDDINGS_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(self.EMBEDDINGS_FILE, 'w') as f:
                json.dump({
                    "embeddings": self.embeddings,
                    "metadata": self.metadata_cache
                }, f)
            logger.info(f"Saved {len(self.embeddings)} embeddings to cache.")
        except Exception as e:
            logger.error(f"Failed to save embeddings: {e}")

    def _build_embedding_text(self, table_name: str, metadata: dict) -> str:
        """Build the text representation of a table that gets embedded."""
        parts = [f"Table: {table_name}"]

        # Description (prefer the LLM-enriched semantic one if available)
        desc = metadata.get("semantic_description") or metadata.get("description", "")
        if desc:
            parts.append(f"Description: {desc}")

        # Tags
        tags = metadata.get("tags", [])
        if tags:
            parts.append(f"Tags: {', '.join(tags)}")

        # Category
        category = metadata.get("category", "")
        if category:
            parts.append(f"Category: {category}")

        # Key columns (limit to first 15 for embedding efficiency),
        # dropping generic geometry/id columns that carry no meaning.
        columns = metadata.get("columns", [])
        meaningful_cols = [c for c in columns[:15] if c not in ['geom', 'geometry', 'id', 'fid']]
        if meaningful_cols:
            parts.append(f"Columns: {', '.join(meaningful_cols)}")

        # Data type
        data_type = metadata.get("data_type", "static")
        parts.append(f"Data type: {data_type}")

        return ". ".join(parts)

    def _embed_text(self, text: str) -> Optional[List[float]]:
        """Return the embedding vector for *text*, or None on failure/no client."""
        if not self.client:
            return None

        try:
            result = self.client.models.embed_content(
                model=self.EMBEDDING_MODEL,
                contents=text  # Note: 'contents' not 'content'
            )
            return result.embeddings[0].values
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            return None

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Cosine similarity between two vectors; 0.0 for zero-norm input."""
        a_np = np.array(a)
        b_np = np.array(b)

        dot_product = np.dot(a_np, b_np)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(dot_product / (norm_a * norm_b))

    def embed_table(self, table_name: str, metadata: dict) -> bool:
        """
        Embed a table's metadata for semantic search.

        Returns True if embedding was successful or already cached with
        identical text. Does not persist the cache; callers that batch
        (embed_all_tables) save once at the end.
        """
        text = self._build_embedding_text(table_name, metadata)

        # Check if already embedded with the same text
        if table_name in self.metadata_cache and self.metadata_cache[table_name] == text:
            return True

        embedding = self._embed_text(text)
        if embedding:
            self.embeddings[table_name] = embedding
            self.metadata_cache[table_name] = text
            return True

        return False

    def embed_all_tables(self, catalog: Dict[str, dict]) -> int:
        """
        Embed every table in *catalog*, persisting the cache if anything new
        was embedded. Returns the number of newly embedded tables.
        """
        new_count = 0

        for table_name, metadata in catalog.items():
            text = self._build_embedding_text(table_name, metadata)

            # Pre-check the cache here (embed_table would also return True
            # for cached entries, which must not count as "new").
            if table_name in self.metadata_cache and self.metadata_cache[table_name] == text:
                continue

            if self.embed_table(table_name, metadata):
                new_count += 1

        if new_count > 0:
            self._save_embeddings()
            logger.info(f"Embedded {new_count} new tables.")

        return new_count

    def search(self, query: str, top_k: int = 15) -> List[Tuple[str, float]]:
        """
        Find the most relevant tables for a query.

        Returns a list of (table_name, similarity_score) tuples sorted by
        descending relevance, at most *top_k* long.
        """
        if not self.embeddings:
            logger.warning("No embeddings available. Returning empty results.")
            return []

        # Embed the query
        query_embedding = self._embed_text(query)

        if not query_embedding:
            # Fallback to keyword matching
            return self._keyword_fallback(query, top_k)

        # Compute similarities against every cached table embedding
        scores = []
        for table_name, table_embedding in self.embeddings.items():
            score = self._cosine_similarity(query_embedding, table_embedding)
            scores.append((table_name, score))

        # Sort by similarity (descending)
        scores.sort(key=lambda x: -x[1])

        return scores[:top_k]

    def search_table_names(self, query: str, top_k: int = 15) -> List[str]:
        """Convenience wrapper returning just the table names."""
        results = self.search(query, top_k)
        return [name for name, _ in results]

    def _keyword_fallback(self, query: str, top_k: int) -> List[Tuple[str, float]]:
        """
        Simple keyword-overlap scoring used when embeddings are unavailable.
        Score = fraction of query terms found in the table's metadata text.
        """
        query_terms = query.lower().split()
        scores = []

        for table_name, text in self.metadata_cache.items():
            text_lower = text.lower()
            score = sum(1 for term in query_terms if term in text_lower)
            if score > 0:
                scores.append((table_name, score / len(query_terms)))

        scores.sort(key=lambda x: -x[1])
        return scores[:top_k]

    def get_stats(self) -> dict:
        """Return statistics about the semantic search index."""
        return {
            "total_tables": len(self.embeddings),
            "cache_file": str(self.EMBEDDINGS_FILE),
            "cache_exists": self.EMBEDDINGS_FILE.exists(),
            "client_available": self.client is not None
        }
248
+
249
+
250
# Singleton accessor
_semantic_search: Optional[SemanticSearch] = None


def get_semantic_search() -> SemanticSearch:
    """Return the process-wide SemanticSearch, creating it on first access."""
    global _semantic_search
    if _semantic_search is not None:
        return _semantic_search
    _semantic_search = SemanticSearch()
    return _semantic_search
backend/core/session_store.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Session Store Service
3
+
4
+ Thread-safe session-scoped storage for user layers and context.
5
+ Replaces global SESSION_LAYERS with per-session isolation.
6
+ """
7
+
8
+ import logging
9
+ import threading
10
+ from datetime import datetime, timedelta
11
+ from typing import Dict, List, Optional, Any
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class SessionStore:
18
+ """
19
+ Thread-safe session-scoped storage with TTL expiration.
20
+
21
+ Each session maintains its own:
22
+ - layers: Map layers created by the user
23
+ - context: Optional conversation context
24
+
25
+ Sessions expire after configurable TTL (default 2 hours).
26
+ """
27
+
28
+ _instance = None
29
+
30
+ def __new__(cls):
31
+ if cls._instance is None:
32
+ cls._instance = super(SessionStore, cls).__new__(cls)
33
+ cls._instance.initialized = False
34
+ return cls._instance
35
+
36
+ def __init__(self, ttl_hours: int = 2, max_layers_per_session: int = 15):
37
+ if self.initialized:
38
+ return
39
+
40
+ self._sessions: Dict[str, dict] = {}
41
+ self._lock = threading.Lock()
42
+ self.ttl = timedelta(hours=ttl_hours)
43
+ self.max_layers = max_layers_per_session
44
+ self.initialized = True
45
+
46
+ logger.info(f"SessionStore initialized with TTL={ttl_hours}h, max_layers={max_layers_per_session}")
47
+
48
+ def _get_or_create_session(self, session_id: str) -> dict:
49
+ """Get existing session or create new one."""
50
+ if session_id not in self._sessions:
51
+ self._sessions[session_id] = {
52
+ "layers": [],
53
+ "created": datetime.now(),
54
+ "accessed": datetime.now()
55
+ }
56
+ return self._sessions[session_id]
57
+
58
+ def get_layers(self, session_id: str) -> List[dict]:
59
+ """Get all layers for a session."""
60
+ with self._lock:
61
+ session = self._get_or_create_session(session_id)
62
+ session["accessed"] = datetime.now()
63
+ return session["layers"].copy()
64
+
65
+ def add_layer(self, session_id: str, layer: dict) -> None:
66
+ """
67
+ Add a layer to a session.
68
+
69
+ Enforces max_layers limit by removing oldest layers.
70
+ """
71
+ with self._lock:
72
+ session = self._get_or_create_session(session_id)
73
+ session["layers"].append(layer)
74
+ session["accessed"] = datetime.now()
75
+
76
+ # Enforce layer limit
77
+ while len(session["layers"]) > self.max_layers:
78
+ removed = session["layers"].pop(0)
79
+ logger.debug(f"Session {session_id[:8]}: removed oldest layer {removed.get('name', 'unknown')}")
80
+
81
+ def update_layer(self, session_id: str, layer_id: str, updates: dict) -> bool:
82
+ """
83
+ Update an existing layer in a session.
84
+
85
+ Returns True if layer was found and updated.
86
+ """
87
+ with self._lock:
88
+ session = self._sessions.get(session_id)
89
+ if not session:
90
+ return False
91
+
92
+ for layer in session["layers"]:
93
+ if layer.get("id") == layer_id:
94
+ layer.update(updates)
95
+ session["accessed"] = datetime.now()
96
+ return True
97
+
98
+ return False
99
+
100
+ def remove_layer(self, session_id: str, layer_id: str) -> bool:
101
+ """
102
+ Remove a layer from a session.
103
+
104
+ Returns True if layer was found and removed.
105
+ """
106
+ with self._lock:
107
+ session = self._sessions.get(session_id)
108
+ if not session:
109
+ return False
110
+
111
+ original_len = len(session["layers"])
112
+ session["layers"] = [l for l in session["layers"] if l.get("id") != layer_id]
113
+ session["accessed"] = datetime.now()
114
+
115
+ return len(session["layers"]) < original_len
116
+
117
+ def clear_session(self, session_id: str) -> None:
118
+ """Clear all data for a session."""
119
+ with self._lock:
120
+ if session_id in self._sessions:
121
+ del self._sessions[session_id]
122
+
123
+ def get_layer_by_index(self, session_id: str, index: int) -> Optional[dict]:
124
+ """Get a specific layer by 1-based index (for user references like 'Layer 1')."""
125
+ with self._lock:
126
+ session = self._sessions.get(session_id)
127
+ if not session:
128
+ return None
129
+
130
+ layers = session["layers"]
131
+ if 1 <= index <= len(layers):
132
+ return layers[index - 1].copy()
133
+
134
+ return None
135
+
136
+ def cleanup_expired(self) -> int:
137
+ """
138
+ Remove sessions older than TTL.
139
+
140
+ Returns number of expired sessions removed.
141
+ """
142
+ with self._lock:
143
+ now = datetime.now()
144
+ expired = [
145
+ sid for sid, data in self._sessions.items()
146
+ if now - data.get("accessed", data["created"]) > self.ttl
147
+ ]
148
+
149
+ for sid in expired:
150
+ del self._sessions[sid]
151
+
152
+ if expired:
153
+ logger.info(f"Cleaned up {len(expired)} expired sessions.")
154
+
155
+ return len(expired)
156
+
157
+ def get_stats(self) -> dict:
158
+ """Return statistics about active sessions."""
159
+ with self._lock:
160
+ total_layers = sum(len(s["layers"]) for s in self._sessions.values())
161
+
162
+ return {
163
+ "active_sessions": len(self._sessions),
164
+ "total_layers": total_layers,
165
+ "ttl_hours": self.ttl.total_seconds() / 3600,
166
+ "max_layers_per_session": self.max_layers
167
+ }
168
+
169
+
170
# Singleton accessor
_session_store: Optional[SessionStore] = None


def get_session_store() -> SessionStore:
    """Return the process-wide SessionStore, creating it on first access."""
    global _session_store
    if _session_store is not None:
        return _session_store
    _session_store = SessionStore()
    return _session_store
backend/data/catalog.json ADDED
@@ -0,0 +1,1290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pan_admin3": {
3
+ "path": "base/pan_admin3.geojson",
4
+ "description": "Data from base/pan_admin3.geojson",
5
+ "semantic_description": "This dataset contains the third-level administrative boundaries (corregimientos) of Panama, including their hierarchical relationships to districts and provinces. It provides nationwide coverage of 594 territorial units, making it essential for localized demographic analysis, regional planning, and mapping public service distribution.",
6
+ "tags": [
7
+ "administrative",
8
+ "government",
9
+ "geographic",
10
+ "spatial",
11
+ "boundaries"
12
+ ],
13
+ "data_type": "static",
14
+ "update_frequency": null,
15
+ "columns": [
16
+ "adm3_name",
17
+ "adm3_name1",
18
+ "adm3_name2",
19
+ "adm3_name3",
20
+ "adm3_pcode",
21
+ "adm2_name",
22
+ "adm2_name1",
23
+ "adm2_name2",
24
+ "adm2_name3",
25
+ "adm2_pcode",
26
+ "adm1_name",
27
+ "adm1_name1",
28
+ "adm1_name2",
29
+ "adm1_name3",
30
+ "adm1_pcode",
31
+ "adm0_name",
32
+ "adm0_name1",
33
+ "adm0_name2",
34
+ "adm0_name3",
35
+ "adm0_pcode",
36
+ "valid_on",
37
+ "valid_to",
38
+ "area_sqkm",
39
+ "version",
40
+ "lang",
41
+ "lang1",
42
+ "lang2",
43
+ "lang3",
44
+ "adm3_ref_name",
45
+ "center_lat",
46
+ "center_lon",
47
+ "geom"
48
+ ],
49
+ "row_count": 594,
50
+ "category": "base",
51
+ "format": "geojson",
52
+ "last_indexed": "2026-01-09T16:15:16.691836",
53
+ "last_enriched": "2026-01-09T16:36:08.469629"
54
+ },
55
+ "pan_adminpoints": {
56
+ "path": "base/pan_adminpoints.geojson",
57
+ "description": "Data from base/pan_adminpoints.geojson",
58
+ "semantic_description": "This dataset provides point locations for administrative centers across Panama, covering hierarchical levels from the national capital down to sub-district seats. It includes geographic coordinates and standardized naming conventions, making it a foundational tool for territorial planning and administrative spatial analysis.",
59
+ "tags": [
60
+ "administrative",
61
+ "points-of-interest",
62
+ "amenities",
63
+ "government",
64
+ "spatial",
65
+ "boundaries"
66
+ ],
67
+ "data_type": "static",
68
+ "update_frequency": null,
69
+ "columns": [
70
+ "admin_level",
71
+ "name",
72
+ "name1",
73
+ "name2",
74
+ "name3",
75
+ "x_coord",
76
+ "y_coord",
77
+ "adm4_name",
78
+ "adm4_name1",
79
+ "adm4_name2",
80
+ "adm4_name3",
81
+ "adm4_pcode",
82
+ "adm3_name",
83
+ "adm3_name1",
84
+ "adm3_name2",
85
+ "adm3_name3",
86
+ "adm3_pcode",
87
+ "adm2_name",
88
+ "adm2_name1",
89
+ "adm2_name2",
90
+ "adm2_name3",
91
+ "adm2_pcode",
92
+ "adm1_name",
93
+ "adm1_name1",
94
+ "adm1_name2",
95
+ "adm1_name3",
96
+ "adm1_pcode",
97
+ "adm0_name",
98
+ "adm0_name1",
99
+ "adm0_name2",
100
+ "adm0_name3",
101
+ "adm0_pcode",
102
+ "valid_on",
103
+ "valid_to",
104
+ "version",
105
+ "lang",
106
+ "lang1",
107
+ "lang2",
108
+ "lang3",
109
+ "geom"
110
+ ],
111
+ "row_count": 684,
112
+ "category": "base",
113
+ "format": "geojson",
114
+ "last_indexed": "2026-01-09T16:15:16.764000",
115
+ "last_enriched": "2026-01-09T16:36:12.240069"
116
+ },
117
+ "pan_admin2": {
118
+ "path": "base/pan_admin2.geojson",
119
+ "description": "Data from base/pan_admin2.geojson",
120
+ "semantic_description": "This dataset comprises the second-level administrative boundaries for Panama, specifically representing the country's 76 districts. It includes standardized names and hierarchical P-codes for districts, provinces, and the national level to ensure data interoperability. This base layer is primarily used for administrative mapping, regional statistical analysis, and territorial planning.",
121
+ "tags": [
122
+ "administrative",
123
+ "government",
124
+ "geographic",
125
+ "spatial",
126
+ "boundaries"
127
+ ],
128
+ "data_type": "static",
129
+ "update_frequency": null,
130
+ "columns": [
131
+ "adm2_name",
132
+ "adm2_name1",
133
+ "adm2_name2",
134
+ "adm2_name3",
135
+ "adm2_pcode",
136
+ "adm1_name",
137
+ "adm1_name1",
138
+ "adm1_name2",
139
+ "adm1_name3",
140
+ "adm1_pcode",
141
+ "adm0_name",
142
+ "adm0_name1",
143
+ "adm0_name2",
144
+ "adm0_name3",
145
+ "adm0_pcode",
146
+ "valid_on",
147
+ "valid_to",
148
+ "area_sqkm",
149
+ "version",
150
+ "lang",
151
+ "lang1",
152
+ "lang2",
153
+ "lang3",
154
+ "adm2_ref_name",
155
+ "center_lat",
156
+ "center_lon",
157
+ "geom"
158
+ ],
159
+ "row_count": 76,
160
+ "category": "base",
161
+ "format": "geojson",
162
+ "last_indexed": "2026-01-09T16:15:17.560205",
163
+ "last_enriched": "2026-01-09T16:36:16.456717"
164
+ },
165
+ "pan_admin0": {
166
+ "path": "base/pan_admin0.geojson",
167
+ "description": "Data from base/pan_admin0.geojson",
168
+ "semantic_description": "This dataset defines the national boundary of Panama, representing the country's primary administrative level (ADM0). It is typically used as a foundational layer for national-level spatial analysis, cartographic visualizations, and as a reference for countrywide area calculations and statistical aggregation.",
169
+ "tags": [
170
+ "administrative",
171
+ "government",
172
+ "geographic",
173
+ "spatial",
174
+ "boundaries"
175
+ ],
176
+ "data_type": "static",
177
+ "update_frequency": null,
178
+ "columns": [
179
+ "iso2",
180
+ "iso3",
181
+ "adm0_name",
182
+ "adm0_name1",
183
+ "adm0_name2",
184
+ "adm0_name3",
185
+ "adm0_pcode",
186
+ "valid_on",
187
+ "valid_to",
188
+ "version",
189
+ "area_sqkm",
190
+ "lang",
191
+ "lang1",
192
+ "lang2",
193
+ "lang3",
194
+ "adm0_ref_name",
195
+ "center_lat",
196
+ "center_lon",
197
+ "geom"
198
+ ],
199
+ "row_count": 1,
200
+ "category": "base",
201
+ "format": "geojson",
202
+ "last_indexed": "2026-01-09T16:15:18.159865",
203
+ "last_enriched": "2026-01-09T16:36:18.989777"
204
+ },
205
+ "pan_admin1": {
206
+ "path": "base/pan_admin1.geojson",
207
+ "description": "Data from base/pan_admin1.geojson",
208
+ "semantic_description": "This dataset defines the 13 primary administrative divisions of Panama, including its provinces and major indigenous territories. It provides standardized names, area measurements, and codes necessary for nationwide spatial analysis and regional reporting. The layer serves as a foundational component for administrative mapping and aggregating statistical data at the provincial level.",
209
+ "tags": [
210
+ "administrative",
211
+ "government",
212
+ "geographic",
213
+ "spatial",
214
+ "boundaries"
215
+ ],
216
+ "data_type": "static",
217
+ "update_frequency": null,
218
+ "columns": [
219
+ "adm1_name",
220
+ "adm1_name1",
221
+ "adm1_name2",
222
+ "adm1_name3",
223
+ "adm1_pcode",
224
+ "adm0_name",
225
+ "adm0_name1",
226
+ "adm0_name2",
227
+ "adm0_name3",
228
+ "adm0_pcode",
229
+ "valid_on",
230
+ "valid_to",
231
+ "area_sqkm",
232
+ "version",
233
+ "lang",
234
+ "lang1",
235
+ "lang2",
236
+ "lang3",
237
+ "adm1_ref_name",
238
+ "center_lat",
239
+ "center_lon",
240
+ "geom"
241
+ ],
242
+ "row_count": 13,
243
+ "category": "base",
244
+ "format": "geojson",
245
+ "last_indexed": "2026-01-09T16:15:18.936257",
246
+ "last_enriched": "2026-01-09T16:36:22.439266"
247
+ },
248
+ "pan_adminlines": {
249
+ "path": "base/pan_adminlines.geojson",
250
+ "description": "Data from base/pan_adminlines.geojson",
251
+ "semantic_description": "This dataset contains the linear administrative boundaries of Panama across various hierarchical levels, including province and district borders identified by standard P-codes. It serves as a foundational spatial layer for delineating jurisdictional limits and performing territorial analysis. The data is primarily used for cartographic visualization and spatial joins that require precise border definitions for administrative planning.",
252
+ "tags": [
253
+ "administrative",
254
+ "spatial",
255
+ "boundaries",
256
+ "government"
257
+ ],
258
+ "data_type": "static",
259
+ "update_frequency": null,
260
+ "columns": [
261
+ "adm_level",
262
+ "name",
263
+ "valid_on",
264
+ "valid_to",
265
+ "version",
266
+ "right_pcod",
267
+ "left_pcod",
268
+ "geom"
269
+ ],
270
+ "row_count": 2338,
271
+ "category": "base",
272
+ "format": "geojson",
273
+ "last_indexed": "2026-01-09T16:15:19.846930",
274
+ "last_enriched": "2026-01-09T16:36:26.375260"
275
+ },
276
+ "universities": {
277
+ "path": "osm/universities.geojson",
278
+ "description": "Data from osm/universities.geojson",
279
+ "semantic_description": "This dataset identifies 62 higher education institutions across Panama, including attributes for names, operators, and facility types. It is designed for spatial analysis of educational coverage, urban infrastructure planning, and socio-economic research.",
280
+ "tags": [
281
+ "higher-education",
282
+ "education",
283
+ "infrastructure",
284
+ "osm",
285
+ "spatial",
286
+ "facilities",
287
+ "points",
288
+ "panama"
289
+ ],
290
+ "data_type": "semi-static",
291
+ "update_frequency": null,
292
+ "columns": [
293
+ "name",
294
+ "osm_id",
295
+ "feature_type",
296
+ "operator",
297
+ "education_type",
298
+ "icon",
299
+ "geom"
300
+ ],
301
+ "row_count": 62,
302
+ "category": "osm",
303
+ "format": "geojson",
304
+ "last_indexed": "2026-01-09T16:15:19.856764",
305
+ "last_enriched": "2026-01-09T16:36:33.114270"
306
+ },
307
+ "panama_healthsites_geojson": {
308
+ "path": "hdx/health_facilities/panama-healthsites-geojson.geojson",
309
+ "description": "Data from hdx/panama-healthsites-geojson.geojson",
310
+ "semantic_description": "This dataset provides location and attribute data for 986 health facilities across Panama, including hospitals, laboratories, and clinics sourced from OpenStreetMap. It includes detailed information on operational status, bed capacity, and staffing levels, making it suitable for analyzing health infrastructure distribution and healthcare accessibility mapping.",
311
+ "tags": [
312
+ "health",
313
+ "facilities",
314
+ "geographic",
315
+ "spatial",
316
+ "infrastructure"
317
+ ],
318
+ "data_type": "semi-static",
319
+ "update_frequency": null,
320
+ "columns": [
321
+ "osm_id",
322
+ "osm_type",
323
+ "completeness",
324
+ "amenity",
325
+ "healthcare",
326
+ "name",
327
+ "operator",
328
+ "source",
329
+ "speciality",
330
+ "operator_type",
331
+ "operational_status",
332
+ "opening_hours",
333
+ "beds",
334
+ "staff_doctors",
335
+ "staff_nurses",
336
+ "health_amenity_type",
337
+ "dispensing",
338
+ "wheelchair",
339
+ "emergency",
340
+ "insurance",
341
+ "water_source",
342
+ "electricity",
343
+ "is_in_health_area",
344
+ "is_in_health_zone",
345
+ "url",
346
+ "addr_housenumber",
347
+ "addr_street",
348
+ "addr_postcode",
349
+ "addr_city",
350
+ "changeset_id",
351
+ "changeset_version",
352
+ "changeset_timestamp",
353
+ "uuid",
354
+ "geom"
355
+ ],
356
+ "row_count": 986,
357
+ "category": "hdx",
358
+ "format": "geojson",
359
+ "last_indexed": "2026-01-09T16:15:20.039814",
360
+ "last_enriched": "2026-01-09T16:35:58.888442"
361
+ },
362
+ "panama_healthsites_hxl_geojson": {
363
+ "path": "hdx/health_facilities/panama-healthsites-hxl-geojson.geojson",
364
+ "description": "Data from hdx/panama-healthsites-hxl-geojson.geojson",
365
+ "semantic_description": "This dataset comprises 986 health facilities across Panama, detailing hospitals, clinics, and laboratories with information on capacity, specialties, and operational status. It is designed for analyzing healthcare accessibility, resource distribution, and infrastructure coverage at a national level.",
366
+ "tags": [
367
+ "health",
368
+ "facilities",
369
+ "geographic",
370
+ "spatial",
371
+ "infrastructure"
372
+ ],
373
+ "data_type": "semi-static",
374
+ "update_frequency": null,
375
+ "columns": [
376
+ "osm_id",
377
+ "osm_type",
378
+ "completeness",
379
+ "#loc+amenity",
380
+ "#meta+healthcare",
381
+ "#loc +name",
382
+ "#meta +operator",
383
+ "#geo+bounds+url",
384
+ "#meta +speciality",
385
+ "#meta +operator_type",
386
+ "#contact +phone",
387
+ "#status+operational_status",
388
+ "#access +hours",
389
+ "#capacity +beds",
390
+ "#capacity +staff",
391
+ "#meta +health_amenity_type",
392
+ "#meta+dispensing",
393
+ "#meta+wheelchair",
394
+ "#meta+emergency",
395
+ "#meta+insurance",
396
+ "#meta+water_source",
397
+ "#meta+electricity",
398
+ "#meta+is_in_health_area",
399
+ "#meta+is_in_health_zone",
400
+ "#contact +url",
401
+ "addr_housenumber",
402
+ "addr_street",
403
+ "addr_postcode",
404
+ "addr_city",
405
+ "changeset_id",
406
+ "changeset_version",
407
+ "changeset_timestamp",
408
+ "#meta +id",
409
+ "geom"
410
+ ],
411
+ "row_count": 986,
412
+ "category": "hdx",
413
+ "format": "geojson",
414
+ "last_indexed": "2026-01-09T16:15:20.152069",
415
+ "last_enriched": "2026-01-09T16:36:36.834369"
416
+ },
417
+ "kontur_population": {
418
+ "description": "Population density grid for Panama at 400m H3 hexagon resolution. Based on GHSL, Facebook HRSL, and Microsoft Buildings data.",
419
+ "tags": [
420
+ "population",
421
+ "density",
422
+ "panama",
423
+ "h3",
424
+ "hexagon",
425
+ "kontur",
426
+ "demographics"
427
+ ],
428
+ "data_type": "vector",
429
+ "geometry_type": "polygon",
430
+ "semantic_description": "Population count per 400m H3 hexagonal grid cell. Use for population density analysis, demographic studies, and urban/rural classification.",
431
+ "path": "kontur/kontur_population_EPSG4326.gpkg",
432
+ "columns": [
433
+ "h3",
434
+ "population",
435
+ "geom"
436
+ ],
437
+ "row_count": 33114
438
+ },
439
+ "osm_roads": {
440
+ "description": "OpenStreetMap Road network with classification for Panama",
441
+ "tags": [
442
+ "osm",
443
+ "panama",
444
+ "roads"
445
+ ],
446
+ "data_type": "vector",
447
+ "geometry_type": "auto",
448
+ "path": "osm/roads.geojson"
449
+ },
450
+ "osm_pois": {
451
+ "description": "OpenStreetMap Points of interest (restaurants, shops, etc.) for Panama",
452
+ "tags": [
453
+ "osm",
454
+ "panama",
455
+ "pois"
456
+ ],
457
+ "data_type": "vector",
458
+ "geometry_type": "auto",
459
+ "path": "osm/pois.geojson"
460
+ },
461
+ "osm_pois_areas": {
462
+ "description": "OpenStreetMap POI areas (larger venues) for Panama",
463
+ "tags": [
464
+ "osm",
465
+ "panama",
466
+ "pois areas"
467
+ ],
468
+ "data_type": "vector",
469
+ "geometry_type": "auto",
470
+ "path": "osm/pois_areas.geojson"
471
+ },
472
+ "osm_buildings": {
473
+ "description": "OpenStreetMap Building footprints for Panama",
474
+ "tags": [
475
+ "osm",
476
+ "panama",
477
+ "buildings"
478
+ ],
479
+ "data_type": "vector",
480
+ "geometry_type": "auto",
481
+ "path": "osm/buildings.geojson"
482
+ },
483
+ "osm_landuse": {
484
+ "description": "OpenStreetMap Land use zones (residential, commercial, etc.) for Panama",
485
+ "tags": [
486
+ "osm",
487
+ "panama",
488
+ "landuse"
489
+ ],
490
+ "data_type": "vector",
491
+ "geometry_type": "auto",
492
+ "path": "osm/landuse.geojson"
493
+ },
494
+ "osm_natural_points": {
495
+ "description": "OpenStreetMap Natural features (trees, peaks) for Panama",
496
+ "tags": [
497
+ "osm",
498
+ "panama",
499
+ "natural points"
500
+ ],
501
+ "data_type": "vector",
502
+ "geometry_type": "auto",
503
+ "path": "osm/natural_points.geojson"
504
+ },
505
+ "osm_natural_areas": {
506
+ "description": "OpenStreetMap Natural areas (forests, parks) for Panama",
507
+ "tags": [
508
+ "osm",
509
+ "panama",
510
+ "natural areas"
511
+ ],
512
+ "data_type": "vector",
513
+ "geometry_type": "auto",
514
+ "path": "osm/natural_areas.geojson"
515
+ },
516
+ "osm_water_areas": {
517
+ "description": "OpenStreetMap Water bodies (lakes, reservoirs) for Panama",
518
+ "tags": [
519
+ "osm",
520
+ "panama",
521
+ "water areas"
522
+ ],
523
+ "data_type": "vector",
524
+ "geometry_type": "auto",
525
+ "path": "osm/water_areas.geojson"
526
+ },
527
+ "osm_waterways": {
528
+ "description": "OpenStreetMap Rivers and streams for Panama",
529
+ "tags": [
530
+ "osm",
531
+ "panama",
532
+ "waterways"
533
+ ],
534
+ "data_type": "vector",
535
+ "geometry_type": "auto",
536
+ "path": "osm/waterways.geojson"
537
+ },
538
+ "osm_railways": {
539
+ "description": "OpenStreetMap Railway lines for Panama",
540
+ "tags": [
541
+ "osm",
542
+ "panama",
543
+ "railways"
544
+ ],
545
+ "data_type": "vector",
546
+ "geometry_type": "auto",
547
+ "path": "osm/railways.geojson"
548
+ },
549
+ "osm_traffic": {
550
+ "description": "OpenStreetMap Traffic infrastructure (signals, crossings) for Panama",
551
+ "tags": [
552
+ "osm",
553
+ "panama",
554
+ "traffic"
555
+ ],
556
+ "data_type": "vector",
557
+ "geometry_type": "auto",
558
+ "path": "osm/traffic.geojson"
559
+ },
560
+ "osm_traffic_areas": {
561
+ "description": "OpenStreetMap Traffic areas (parking lots) for Panama",
562
+ "tags": [
563
+ "osm",
564
+ "panama",
565
+ "traffic areas"
566
+ ],
567
+ "data_type": "vector",
568
+ "geometry_type": "auto",
569
+ "path": "osm/traffic_areas.geojson"
570
+ },
571
+ "osm_transport": {
572
+ "description": "OpenStreetMap Transport points (bus stops, stations) for Panama",
573
+ "tags": [
574
+ "osm",
575
+ "panama",
576
+ "transport"
577
+ ],
578
+ "data_type": "vector",
579
+ "geometry_type": "auto",
580
+ "path": "osm/transport.geojson"
581
+ },
582
+ "osm_transport_areas": {
583
+ "description": "OpenStreetMap Transport areas (airports, ports) for Panama",
584
+ "tags": [
585
+ "osm",
586
+ "panama",
587
+ "transport areas"
588
+ ],
589
+ "data_type": "vector",
590
+ "geometry_type": "auto",
591
+ "path": "osm/transport_areas.geojson"
592
+ },
593
+ "osm_places": {
594
+ "description": "OpenStreetMap Place names (cities, towns, villages) for Panama",
595
+ "tags": [
596
+ "osm",
597
+ "panama",
598
+ "places"
599
+ ],
600
+ "data_type": "vector",
601
+ "geometry_type": "auto",
602
+ "path": "osm/places.geojson"
603
+ },
604
+ "osm_places_areas": {
605
+ "description": "OpenStreetMap Place areas for Panama",
606
+ "tags": [
607
+ "osm",
608
+ "panama",
609
+ "places areas"
610
+ ],
611
+ "data_type": "vector",
612
+ "geometry_type": "auto",
613
+ "path": "osm/places_areas.geojson"
614
+ },
615
+ "osm_places_of_worship": {
616
+ "description": "OpenStreetMap Places of worship for Panama",
617
+ "tags": [
618
+ "osm",
619
+ "panama",
620
+ "places of worship"
621
+ ],
622
+ "data_type": "vector",
623
+ "geometry_type": "auto",
624
+ "path": "osm/places_of_worship.geojson"
625
+ },
626
+ "osm_places_of_worship_areas": {
627
+ "description": "OpenStreetMap Places of worship (buildings) for Panama",
628
+ "tags": [
629
+ "osm",
630
+ "panama",
631
+ "places of worship areas"
632
+ ],
633
+ "data_type": "vector",
634
+ "geometry_type": "auto",
635
+ "path": "osm/places_of_worship_areas.geojson"
636
+ },
637
+ "roads": {
638
+ "path": "osm/roads.geojson",
639
+ "description": "Data from osm/roads.geojson",
640
+ "semantic_description": null,
641
+ "tags": [
642
+ "spatial",
643
+ "infrastructure",
644
+ "roads",
645
+ "transportation"
646
+ ],
647
+ "data_type": "semi-static",
648
+ "update_frequency": null,
649
+ "columns": [
650
+ "osm_id",
651
+ "code",
652
+ "fclass",
653
+ "name",
654
+ "ref",
655
+ "oneway",
656
+ "maxspeed",
657
+ "layer",
658
+ "bridge",
659
+ "tunnel",
660
+ "geom"
661
+ ],
662
+ "row_count": 118464,
663
+ "category": "osm",
664
+ "format": "geojson",
665
+ "last_indexed": "2026-01-09T18:18:59.409660"
666
+ },
667
+ "places_of_worship_areas": {
668
+ "path": "osm/places_of_worship_areas.geojson",
669
+ "description": "Data from osm/places_of_worship_areas.geojson",
670
+ "semantic_description": null,
671
+ "tags": [
672
+ "spatial"
673
+ ],
674
+ "data_type": "semi-static",
675
+ "update_frequency": null,
676
+ "columns": [
677
+ "osm_id",
678
+ "code",
679
+ "fclass",
680
+ "name",
681
+ "geom"
682
+ ],
683
+ "row_count": 694,
684
+ "category": "osm",
685
+ "format": "geojson",
686
+ "last_indexed": "2026-01-09T18:18:59.460933"
687
+ },
688
+ "transport": {
689
+ "path": "osm/transport.geojson",
690
+ "description": "Data from osm/transport.geojson",
691
+ "semantic_description": null,
692
+ "tags": [
693
+ "maritime",
694
+ "spatial",
695
+ "infrastructure",
696
+ "transportation"
697
+ ],
698
+ "data_type": "semi-static",
699
+ "update_frequency": null,
700
+ "columns": [
701
+ "osm_id",
702
+ "code",
703
+ "fclass",
704
+ "name",
705
+ "geom"
706
+ ],
707
+ "row_count": 1891,
708
+ "category": "osm",
709
+ "format": "geojson",
710
+ "last_indexed": "2026-01-09T18:18:59.506892"
711
+ },
712
+ "pois_areas": {
713
+ "path": "osm/pois_areas.geojson",
714
+ "description": "Data from osm/pois_areas.geojson",
715
+ "semantic_description": null,
716
+ "tags": [
717
+ "spatial",
718
+ "points-of-interest",
719
+ "amenities"
720
+ ],
721
+ "data_type": "semi-static",
722
+ "update_frequency": null,
723
+ "columns": [
724
+ "osm_id",
725
+ "code",
726
+ "fclass",
727
+ "name",
728
+ "geom"
729
+ ],
730
+ "row_count": 11583,
731
+ "category": "osm",
732
+ "format": "geojson",
733
+ "last_indexed": "2026-01-09T18:19:00.011175"
734
+ },
735
+ "railways": {
736
+ "path": "osm/railways.geojson",
737
+ "description": "Data from osm/railways.geojson",
738
+ "semantic_description": null,
739
+ "tags": [
740
+ "spatial"
741
+ ],
742
+ "data_type": "semi-static",
743
+ "update_frequency": null,
744
+ "columns": [
745
+ "osm_id",
746
+ "code",
747
+ "fclass",
748
+ "name",
749
+ "layer",
750
+ "bridge",
751
+ "tunnel",
752
+ "geom"
753
+ ],
754
+ "row_count": 296,
755
+ "category": "osm",
756
+ "format": "geojson",
757
+ "last_indexed": "2026-01-09T18:19:00.034635"
758
+ },
759
+ "pois": {
760
+ "path": "osm/pois.geojson",
761
+ "description": "Data from osm/pois.geojson",
762
+ "semantic_description": null,
763
+ "tags": [
764
+ "spatial",
765
+ "points-of-interest",
766
+ "amenities"
767
+ ],
768
+ "data_type": "semi-static",
769
+ "update_frequency": null,
770
+ "columns": [
771
+ "osm_id",
772
+ "code",
773
+ "fclass",
774
+ "name",
775
+ "geom"
776
+ ],
777
+ "row_count": 11129,
778
+ "category": "osm",
779
+ "format": "geojson",
780
+ "last_indexed": "2026-01-09T18:19:00.261571"
781
+ },
782
+ "natural_points": {
783
+ "path": "osm/natural_points.geojson",
784
+ "description": "Data from osm/natural_points.geojson",
785
+ "semantic_description": null,
786
+ "tags": [
787
+ "spatial",
788
+ "points-of-interest",
789
+ "amenities"
790
+ ],
791
+ "data_type": "semi-static",
792
+ "update_frequency": null,
793
+ "columns": [
794
+ "osm_id",
795
+ "code",
796
+ "fclass",
797
+ "name",
798
+ "geom"
799
+ ],
800
+ "row_count": 6500,
801
+ "category": "osm",
802
+ "format": "geojson",
803
+ "last_indexed": "2026-01-09T18:19:00.395667"
804
+ },
805
+ "traffic": {
806
+ "path": "osm/traffic.geojson",
807
+ "description": "Data from osm/traffic.geojson",
808
+ "semantic_description": null,
809
+ "tags": [
810
+ "spatial"
811
+ ],
812
+ "data_type": "semi-static",
813
+ "update_frequency": null,
814
+ "columns": [
815
+ "osm_id",
816
+ "code",
817
+ "fclass",
818
+ "name",
819
+ "geom"
820
+ ],
821
+ "row_count": 5902,
822
+ "category": "osm",
823
+ "format": "geojson",
824
+ "last_indexed": "2026-01-09T18:19:00.509922"
825
+ },
826
+ "traffic_areas": {
827
+ "path": "osm/traffic_areas.geojson",
828
+ "description": "Data from osm/traffic_areas.geojson",
829
+ "semantic_description": null,
830
+ "tags": [
831
+ "spatial"
832
+ ],
833
+ "data_type": "semi-static",
834
+ "update_frequency": null,
835
+ "columns": [
836
+ "osm_id",
837
+ "code",
838
+ "fclass",
839
+ "name",
840
+ "geom"
841
+ ],
842
+ "row_count": 3403,
843
+ "category": "osm",
844
+ "format": "geojson",
845
+ "last_indexed": "2026-01-09T18:19:00.682898"
846
+ },
847
+ "buildings": {
848
+ "path": "osm/buildings.geojson",
849
+ "description": "Data from osm/buildings.geojson",
850
+ "semantic_description": null,
851
+ "tags": [
852
+ "spatial",
853
+ "built-environment",
854
+ "infrastructure"
855
+ ],
856
+ "data_type": "semi-static",
857
+ "update_frequency": null,
858
+ "columns": [
859
+ "osm_id",
860
+ "code",
861
+ "fclass",
862
+ "name",
863
+ "type",
864
+ "geom"
865
+ ],
866
+ "row_count": 233936,
867
+ "category": "osm",
868
+ "format": "geojson",
869
+ "last_indexed": "2026-01-09T18:19:08.488004"
870
+ },
871
+ "places": {
872
+ "path": "osm/places.geojson",
873
+ "description": "Data from osm/places.geojson",
874
+ "semantic_description": null,
875
+ "tags": [
876
+ "spatial",
877
+ "population"
878
+ ],
879
+ "data_type": "semi-static",
880
+ "update_frequency": null,
881
+ "columns": [
882
+ "osm_id",
883
+ "code",
884
+ "fclass",
885
+ "population",
886
+ "name",
887
+ "geom"
888
+ ],
889
+ "row_count": 3683,
890
+ "category": "osm",
891
+ "format": "geojson",
892
+ "last_indexed": "2026-01-09T18:19:08.594144"
893
+ },
894
+ "places_of_worship": {
895
+ "path": "osm/places_of_worship.geojson",
896
+ "description": "Data from osm/places_of_worship.geojson",
897
+ "semantic_description": null,
898
+ "tags": [
899
+ "spatial"
900
+ ],
901
+ "data_type": "semi-static",
902
+ "update_frequency": null,
903
+ "columns": [
904
+ "osm_id",
905
+ "code",
906
+ "fclass",
907
+ "name",
908
+ "geom"
909
+ ],
910
+ "row_count": 228,
911
+ "category": "osm",
912
+ "format": "geojson",
913
+ "last_indexed": "2026-01-09T18:19:08.609384"
914
+ },
915
+ "natural_areas": {
916
+ "path": "osm/natural_areas.geojson",
917
+ "description": "Data from osm/natural_areas.geojson",
918
+ "semantic_description": null,
919
+ "tags": [
920
+ "spatial"
921
+ ],
922
+ "data_type": "semi-static",
923
+ "update_frequency": null,
924
+ "columns": [
925
+ "osm_id",
926
+ "code",
927
+ "fclass",
928
+ "name",
929
+ "geom"
930
+ ],
931
+ "row_count": 434,
932
+ "category": "osm",
933
+ "format": "geojson",
934
+ "last_indexed": "2026-01-09T18:19:08.673965"
935
+ },
936
+ "waterways": {
937
+ "path": "osm/waterways.geojson",
938
+ "description": "Data from osm/waterways.geojson",
939
+ "semantic_description": null,
940
+ "tags": [
941
+ "natural-resources",
942
+ "spatial",
943
+ "hydrology"
944
+ ],
945
+ "data_type": "semi-static",
946
+ "update_frequency": null,
947
+ "columns": [
948
+ "osm_id",
949
+ "code",
950
+ "fclass",
951
+ "width",
952
+ "name",
953
+ "geom"
954
+ ],
955
+ "row_count": 15532,
956
+ "category": "osm",
957
+ "format": "geojson",
958
+ "last_indexed": "2026-01-09T18:19:10.791546"
959
+ },
960
+ "water_areas": {
961
+ "path": "osm/water_areas.geojson",
962
+ "description": "Data from osm/water_areas.geojson",
963
+ "semantic_description": null,
964
+ "tags": [
965
+ "natural-resources",
966
+ "spatial",
967
+ "hydrology"
968
+ ],
969
+ "data_type": "semi-static",
970
+ "update_frequency": null,
971
+ "columns": [
972
+ "osm_id",
973
+ "code",
974
+ "fclass",
975
+ "name",
976
+ "geom"
977
+ ],
978
+ "row_count": 3733,
979
+ "category": "osm",
980
+ "format": "geojson",
981
+ "last_indexed": "2026-01-09T18:19:12.941528"
982
+ },
983
+ "landuse": {
984
+ "path": "osm/landuse.geojson",
985
+ "description": "Data from osm/landuse.geojson",
986
+ "semantic_description": null,
987
+ "tags": [
988
+ "spatial"
989
+ ],
990
+ "data_type": "semi-static",
991
+ "update_frequency": null,
992
+ "columns": [
993
+ "osm_id",
994
+ "code",
995
+ "fclass",
996
+ "name",
997
+ "geom"
998
+ ],
999
+ "row_count": 16075,
1000
+ "category": "osm",
1001
+ "format": "geojson",
1002
+ "last_indexed": "2026-01-09T18:19:15.893984"
1003
+ },
1004
+ "transport_areas": {
1005
+ "path": "osm/transport_areas.geojson",
1006
+ "description": "Data from osm/transport_areas.geojson",
1007
+ "semantic_description": null,
1008
+ "tags": [
1009
+ "maritime",
1010
+ "spatial",
1011
+ "infrastructure",
1012
+ "transportation"
1013
+ ],
1014
+ "data_type": "semi-static",
1015
+ "update_frequency": null,
1016
+ "columns": [
1017
+ "osm_id",
1018
+ "code",
1019
+ "fclass",
1020
+ "name",
1021
+ "geom"
1022
+ ],
1023
+ "row_count": 196,
1024
+ "category": "osm",
1025
+ "format": "geojson",
1026
+ "last_indexed": "2026-01-09T18:19:15.917475"
1027
+ },
1028
+ "places_areas": {
1029
+ "path": "osm/places_areas.geojson",
1030
+ "description": "Data from osm/places_areas.geojson",
1031
+ "semantic_description": null,
1032
+ "tags": [
1033
+ "spatial",
1034
+ "population"
1035
+ ],
1036
+ "data_type": "semi-static",
1037
+ "update_frequency": null,
1038
+ "columns": [
1039
+ "osm_id",
1040
+ "code",
1041
+ "fclass",
1042
+ "population",
1043
+ "name",
1044
+ "geom"
1045
+ ],
1046
+ "row_count": 239,
1047
+ "category": "osm",
1048
+ "format": "geojson",
1049
+ "last_indexed": "2026-01-09T18:19:16.220819"
1050
+ },
1051
+ "overture_places": {
1052
+ "path": "overture/overture_places.geojson",
1053
+ "description": "Points of Interest from Overture Maps (Places theme)",
1054
+ "semantic_description": "Comprehensive list of businesses and landmarks with names and categories. Use this for finding specific amenities, shops, or named locations.",
1055
+ "tags": [
1056
+ "overture",
1057
+ "places",
1058
+ "poi",
1059
+ "businesses",
1060
+ "landmarks",
1061
+ "spatial",
1062
+ "panama"
1063
+ ],
1064
+ "data_type": "static",
1065
+ "update_frequency": null,
1066
+ "columns": [
1067
+ "id",
1068
+ "version",
1069
+ "sources",
1070
+ "names",
1071
+ "categories",
1072
+ "basic_category",
1073
+ "taxonomy",
1074
+ "confidence",
1075
+ "websites",
1076
+ "socials",
1077
+ "emails",
1078
+ "phones",
1079
+ "brand",
1080
+ "addresses",
1081
+ "operating_status",
1082
+ "geom"
1083
+ ],
1084
+ "row_count": 33362,
1085
+ "category": "overture",
1086
+ "format": "geojson",
1087
+ "last_indexed": "2026-01-09T18:37:03.188928"
1088
+ },
1089
+ "overture_roads": {
1090
+ "path": "overture/overture_roads.geojson",
1091
+ "description": "Road network segments from Overture Maps",
1092
+ "semantic_description": "Road network segments including highways, streets, and paths. Contains road names and classification.",
1093
+ "tags": [
1094
+ "overture",
1095
+ "roads",
1096
+ "transportation",
1097
+ "infrastructure",
1098
+ "spatial",
1099
+ "panama"
1100
+ ],
1101
+ "data_type": "static",
1102
+ "update_frequency": null,
1103
+ "columns": [
1104
+ "id",
1105
+ "version",
1106
+ "sources",
1107
+ "subtype",
1108
+ "class",
1109
+ "names",
1110
+ "connectors",
1111
+ "routes",
1112
+ "subclass_rules",
1113
+ "access_restrictions",
1114
+ "level_rules",
1115
+ "destinations",
1116
+ "prohibited_transitions",
1117
+ "rail_flags",
1118
+ "road_surface",
1119
+ "road_flags",
1120
+ "speed_limits",
1121
+ "width_rules",
1122
+ "subclass",
1123
+ "geom"
1124
+ ],
1125
+ "row_count": 179610,
1126
+ "category": "overture",
1127
+ "format": "geojson",
1128
+ "last_indexed": "2026-01-09T18:37:18.729125"
1129
+ },
1130
+ "overture_buildings": {
1131
+ "path": "overture/overture_buildings.geojson",
1132
+ "description": "Building footprints from Overture Maps",
1133
+ "semantic_description": "Building footprints including Microsoft and OSM data. Useful for urban density, infrastructure planning, and built-environment analysis.",
1134
+ "tags": [
1135
+ "overture",
1136
+ "buildings",
1137
+ "footprints",
1138
+ "infrastructure",
1139
+ "spatial",
1140
+ "panama"
1141
+ ],
1142
+ "data_type": "static",
1143
+ "update_frequency": null,
1144
+ "columns": [
1145
+ "id",
1146
+ "version",
1147
+ "sources",
1148
+ "level",
1149
+ "subtype",
1150
+ "class",
1151
+ "height",
1152
+ "names",
1153
+ "has_parts",
1154
+ "is_underground",
1155
+ "num_floors",
1156
+ "min_height",
1157
+ "min_floor",
1158
+ "num_floors_underground",
1159
+ "facade_color",
1160
+ "facade_material",
1161
+ "roof_material",
1162
+ "roof_shape",
1163
+ "roof_direction",
1164
+ "roof_orientation",
1165
+ "roof_color",
1166
+ "roof_height",
1167
+ "geom"
1168
+ ],
1169
+ "row_count": 1888314,
1170
+ "category": "overture",
1171
+ "format": "geojson",
1172
+ "last_indexed": "2026-01-09T18:38:50.416300"
1173
+ },
1174
+ "panama_weather_stations": {
1175
+ "path": "climate/weather_stations.geojson",
1176
+ "description": "Major weather stations in Panama with average temperature and rainfall data.",
1177
+ "semantic_description": "This dataset contains the locations of major weather stations in Panama (Tocumen, David, Bocas del Toro, etc.). It includes attributes for average annual temperature (Celsius), annual rainfall (mm), and elevation. Use this for analyzing climatic differences across the country.",
1178
+ "tags": [
1179
+ "climate",
1180
+ "weather",
1181
+ "temperature",
1182
+ "rainfall",
1183
+ "stations"
1184
+ ],
1185
+ "data_type": "static",
1186
+ "category": "climate",
1187
+ "format": "geojson"
1188
+ },
1189
+ "panama_terrain_features": {
1190
+ "path": "terrain/simplified_terrain.geojson",
1191
+ "description": "Simplified terrain features including major peaks and mountain ranges.",
1192
+ "semantic_description": "A simplified dataset showing major terrain features of Panama, including Volc\u00e1n Bar\u00fa (highest peak) and the Central Cordillera. Contains points for peaks and lines for ranges, with elevation attributes.",
1193
+ "tags": [
1194
+ "terrain",
1195
+ "elevation",
1196
+ "mountains",
1197
+ "volcano",
1198
+ "geography"
1199
+ ],
1200
+ "data_type": "static",
1201
+ "category": "climate",
1202
+ "format": "geojson"
1203
+ },
1204
+ "panama_national_indicators": {
1205
+ "path": "socioeconomic/panama_national_indicators.geojson",
1206
+ "description": "National socio-economic indicators from World Bank (2000-2024)",
1207
+ "semantic_description": "Comprehensive national-level statistics for Panama including poverty rates, GDP, unemployment, health expenditure, maternal/child mortality, literacy rates, and school enrollment. Data sourced from World Bank Open Data API. Use this dataset for analyzing Panama's socio-economic development trends over time.",
1208
+ "tags": [
1209
+ "socioeconomic",
1210
+ "worldbank",
1211
+ "poverty",
1212
+ "gdp",
1213
+ "employment",
1214
+ "health",
1215
+ "education",
1216
+ "national",
1217
+ "panama"
1218
+ ],
1219
+ "data_type": "static",
1220
+ "category": "socioeconomic",
1221
+ "format": "geojson"
1222
+ },
1223
+ "province_socioeconomic": {
1224
+ "path": "socioeconomic/province_socioeconomic.geojson",
1225
+ "description": "Province-level socioeconomic indicators for Panama (2023)",
1226
+ "semantic_description": "Socioeconomic data at the province level including Multidimensional Poverty Index (MPI), population from Censo 2023, average income, and disability rates. Shows dramatic geographic inequality: Ng\u00e4be-Bugl\u00e9 comarca has 93.4% poverty vs 15% in Panam\u00e1 province. Use for analyzing regional disparities in poverty, development, and demographics.",
1227
+ "tags": [
1228
+ "socioeconomic",
1229
+ "poverty",
1230
+ "mpi",
1231
+ "census",
1232
+ "province",
1233
+ "admin1",
1234
+ "demographics",
1235
+ "inequality",
1236
+ "panama"
1237
+ ],
1238
+ "data_type": "static",
1239
+ "category": "socioeconomic",
1240
+ "format": "geojson"
1241
+ },
1242
+ "panama_airports": {
1243
+ "path": "global/airports/panama_airports.geojson",
1244
+ "description": "Panama airports from OurAirports global database (91 airports)",
1245
+ "semantic_description": "Comprehensive dataset of all airports in Panama including international, domestic, regional, and small airfields. Contains location, elevation, type (large/medium/small/heliport), runway information, and identifiers (ICAO, IATA codes). Updated daily from OurAirports open database. Use for aviation infrastructure analysis, accessibility studies, and transportation planning.",
1246
+ "tags": [
1247
+ "infrastructure",
1248
+ "transportation",
1249
+ "airports",
1250
+ "aviation",
1251
+ "panama",
1252
+ "ourairports"
1253
+ ],
1254
+ "data_type": "static",
1255
+ "category": "infrastructure",
1256
+ "format": "geojson",
1257
+ "source": "OurAirports (davidmegginson/ourairports-data)",
1258
+ "license": "Public Domain"
1259
+ },
1260
+ "censo_2023": {
1261
+ "path": "censo/censo_2023_enriched.csv",
1262
+ "description": "Panama Census 2023: Demographics & Housing (Corregimiento Level)",
1263
+ "semantic_description": "Detailed dataset from the 2023 National Census of Population and Housing (Part I & II). Contains granular data at the Corregimiento level (Admin 3) covering: housing types, water access, sanitation, electricity sources, internet/computer access, education levels, and population demographics. Enriched with 'adm3_pcode' to enable spatial joining with 'pan_admin3'. Use for demographic analysis, infrastructure planning, and social program targeting.",
1264
+ "tags": [
1265
+ "census",
1266
+ "demographics",
1267
+ "housing",
1268
+ "population",
1269
+ "water",
1270
+ "electricity",
1271
+ "education",
1272
+ "panama",
1273
+ "2023"
1274
+ ],
1275
+ "data_type": "static",
1276
+ "category": "socioeconomic",
1277
+ "format": "csv",
1278
+ "columns": [
1279
+ "adm3_pcode",
1280
+ "cod_corr",
1281
+ "nomb_prov",
1282
+ "nomb_dist",
1283
+ "nomb_corr",
1284
+ "v1_tipo_vivienda__individual_permanente",
1285
+ "v8_abastecimiento_de_agua__acueducto_publico_del_idaan",
1286
+ "p13_acceso_internet__si",
1287
+ "p3_edad_digitos__total"
1288
+ ]
1289
+ }
1290
+ }
backend/data/catalog_schema.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "title": "GeoQuery Data Catalog Entry",
4
+ "description": "Schema for dataset metadata in the GeoQuery platform catalog",
5
+ "type": "object",
6
+ "required": [
7
+ "path",
8
+ "columns",
9
+ "category",
10
+ "format"
11
+ ],
12
+ "properties": {
13
+ "path": {
14
+ "type": "string",
15
+ "description": "Relative path to the data file from the data directory"
16
+ },
17
+ "description": {
18
+ "type": "string",
19
+ "description": "Auto-generated basic description (e.g., 'Data from hdx/health.geojson')"
20
+ },
21
+ "semantic_description": {
22
+ "type": [
23
+ "string",
24
+ "null"
25
+ ],
26
+ "description": "LLM-generated rich description explaining the dataset's contents and use cases"
27
+ },
28
+ "tags": {
29
+ "type": "array",
30
+ "items": {
31
+ "type": "string"
32
+ },
33
+ "description": "Searchable tags for categorization (e.g., ['health', 'facilities', 'infrastructure'])"
34
+ },
35
+ "data_type": {
36
+ "type": "string",
37
+ "enum": [
38
+ "static",
39
+ "semi-static",
40
+ "realtime"
41
+ ],
42
+ "description": "How frequently the data changes",
43
+ "default": "static"
44
+ },
45
+ "update_frequency": {
46
+ "type": [
47
+ "string",
48
+ "null"
49
+ ],
50
+ "enum": [
51
+ null,
52
+ "yearly",
53
+ "monthly",
54
+ "weekly",
55
+ "daily",
56
+ "hourly",
57
+ "realtime"
58
+ ],
59
+ "description": "Expected update frequency for the dataset"
60
+ },
61
+ "columns": {
62
+ "type": "array",
63
+ "items": {
64
+ "type": "string"
65
+ },
66
+ "description": "List of column names in the dataset"
67
+ },
68
+ "row_count": {
69
+ "type": [
70
+ "integer",
71
+ "null"
72
+ ],
73
+ "description": "Number of features/rows in the dataset"
74
+ },
75
+ "category": {
76
+ "type": "string",
77
+ "description": "Source category (base, osm, hdx, inec, custom)"
78
+ },
79
+ "format": {
80
+ "type": "string",
81
+ "enum": [
82
+ "geojson",
83
+ "shapefile",
84
+ "geoparquet",
85
+ "csv"
86
+ ],
87
+ "description": "File format of the source data"
88
+ },
89
+ "geometry_type": {
90
+ "type": [
91
+ "string",
92
+ "null"
93
+ ],
94
+ "enum": [
95
+ null,
96
+ "Point",
97
+ "MultiPoint",
98
+ "LineString",
99
+ "MultiLineString",
100
+ "Polygon",
101
+ "MultiPolygon"
102
+ ],
103
+ "description": "Type of geometries in the dataset"
104
+ },
105
+ "bbox": {
106
+ "type": [
107
+ "array",
108
+ "null"
109
+ ],
110
+ "items": {
111
+ "type": "number"
112
+ },
113
+ "minItems": 4,
114
+ "maxItems": 4,
115
+ "description": "Bounding box [minLon, minLat, maxLon, maxLat]"
116
+ },
117
+ "source": {
118
+ "type": [
119
+ "string",
120
+ "null"
121
+ ],
122
+ "description": "Original source of the data (e.g., 'OpenStreetMap', 'INEC Census 2023')"
123
+ },
124
+ "license": {
125
+ "type": [
126
+ "string",
127
+ "null"
128
+ ],
129
+ "description": "Data license (e.g., 'ODbL', 'CC-BY-4.0', 'Public Domain')"
130
+ },
131
+ "last_indexed": {
132
+ "type": "string",
133
+ "format": "date-time",
134
+ "description": "ISO timestamp when the dataset was last indexed"
135
+ },
136
+ "last_enriched": {
137
+ "type": [
138
+ "string",
139
+ "null"
140
+ ],
141
+ "format": "date-time",
142
+ "description": "ISO timestamp when LLM enrichment was last run"
143
+ }
144
+ }
145
+ }
backend/data/censo/censo_2023_enriched.csv ADDED
The diff for this file is too large to render. See raw diff
 
backend/data/censo/censo_panama_2023_unificado.csv ADDED
The diff for this file is too large to render. See raw diff
 
backend/data/global/airports/panama_airports.geojson ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "FeatureCollection",
3
+ "name": "panama_airports",
4
+ "crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
5
+ "features": [
6
+ { "type": "Feature", "properties": { "id": 308731, "ident": "CZJ", "type": "small_airport", "name": "Corazón de Jesús Airport", "latitude_deg": 9.44686, "longitude_deg": -78.575678, "elevation_ft": 8.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": "CZJ", "gps_code": "MPCJ", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Coraz%C3%B3n_de_Jes%C3%BAs_Airport", "keywords": "Narganá, Usdup" }, "geometry": { "type": "Point", "coordinates": [ -78.575678, 9.44686 ] } },
7
+ { "type": "Feature", "properties": { "id": 309162, "ident": "GHE", "type": "small_airport", "name": "Garachiné Airport", "latitude_deg": 8.0644, "longitude_deg": -78.3673, "elevation_ft": 42.0, "continent": "SA", "iso_country": "PA", "iso_region": "PA-5", "municipality": "Garachiné", "scheduled_service": "no", "icao_code": null, "iata_code": "GHE", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Garachin%C3%A9_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.3673, 8.0644 ] } },
8
+ { "type": "Feature", "properties": { "id": 316549, "ident": "IVI", "type": "small_airport", "name": "Viveros Island Airport", "latitude_deg": 8.4693, "longitude_deg": -79.0016, "elevation_ft": 100.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Isla Viveros", "scheduled_service": "no", "icao_code": null, "iata_code": "IVI", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.0016, 8.4693 ] } },
9
+ { "type": "Feature", "properties": { "id": 5323, "ident": "MP01", "type": "small_airport", "name": "Finca Ceiba Airport", "latitude_deg": 8.3549995422363281, "longitude_deg": -82.836402893066406, "elevation_ft": 52.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Finca Jaguá", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP01", "local_code": "MP01", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.836402893066406, 8.354999542236328 ] } },
10
+ { "type": "Feature", "properties": { "id": 5324, "ident": "MP02", "type": "small_airport", "name": "Finca 45 Airport", "latitude_deg": 9.543330192565918, "longitude_deg": -82.733802795410156, "elevation_ft": 56.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Dos Caños", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP02", "local_code": "MP02", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.733802795410156, 9.543330192565918 ] } },
11
+ { "type": "Feature", "properties": { "id": 5325, "ident": "MP03", "type": "small_airport", "name": "La Cabezona Airport", "latitude_deg": 8.3457, "longitude_deg": -82.5042, "elevation_ft": 31.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Guarumal", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCB", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "MP03" }, "geometry": { "type": "Point", "coordinates": [ -82.5042, 8.3457 ] } },
12
+ { "type": "Feature", "properties": { "id": 5326, "ident": "MP17", "type": "small_airport", "name": "Finca 67 Airport", "latitude_deg": 9.4344100952148438, "longitude_deg": -82.499099731445312, "elevation_ft": 30.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Changuinola", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP17", "local_code": "MP17", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.499099731445312, 9.434410095214844 ] } },
13
+ { "type": "Feature", "properties": { "id": 5327, "ident": "MP18", "type": "small_airport", "name": "Guillermo Palm Jaén Airport", "latitude_deg": 8.50383, "longitude_deg": -80.360298, "elevation_ft": 282.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Penonomé", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPPN", "local_code": null, "home_link": "https://aerodromo-guillermo-palm-jaen.negocio.site/", "wikipedia_link": null, "keywords": "MP18" }, "geometry": { "type": "Point", "coordinates": [ -80.360298, 8.50383 ] } },
14
+ { "type": "Feature", "properties": { "id": 5330, "ident": "MP21", "type": "small_airport", "name": "Alvaro Berroa Airport", "latitude_deg": 8.7703895568847656, "longitude_deg": -82.664398193359375, "elevation_ft": 5000.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Nueva California", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP21", "local_code": "MP21", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.664398193359375, 8.770389556884766 ] } },
15
+ { "type": "Feature", "properties": { "id": 5331, "ident": "MP22", "type": "small_airport", "name": "Ingenio Santa Rosa Airport", "latitude_deg": 8.1952199935913086, "longitude_deg": -80.658699035644531, "elevation_ft": 109.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Ingenio Santa Rosa", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP22", "local_code": "MP22", "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.658699035644531, 8.195219993591309 ] } },
16
+ { "type": "Feature", "properties": { "id": 5332, "ident": "MP23", "type": "small_airport", "name": "Capt. Alex H. Bosquez Airport", "latitude_deg": 9.16628, "longitude_deg": -79.545205, "elevation_ft": 394.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Calzada Larga", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCL", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Calzada_Larga_Airport", "keywords": "MP23" }, "geometry": { "type": "Point", "coordinates": [ -79.545205, 9.16628 ] } },
17
+ { "type": "Feature", "properties": { "id": 5333, "ident": "MP24", "type": "small_airport", "name": "Captain Krish E. Persaud Airport", "latitude_deg": 8.58846, "longitude_deg": -79.889702, "elevation_ft": 141.0, "continent": null, "iso_country": "PA", "iso_region": "PA-10", "municipality": "Chame", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCM", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Chame_Airport", "keywords": "Chame Airfield #1, MP24" }, "geometry": { "type": "Point", "coordinates": [ -79.889702, 8.58846 ] } },
18
+ { "type": "Feature", "properties": { "id": 5334, "ident": "MP26", "type": "small_airport", "name": "Punta Cocos Airport", "latitude_deg": 8.22485, "longitude_deg": -78.904404, "elevation_ft": 66.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Punta Cocos", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPPU", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "MP26" }, "geometry": { "type": "Point", "coordinates": [ -78.904404, 8.22485 ] } },
19
+ { "type": "Feature", "properties": { "id": 5335, "ident": "MP27", "type": "small_airport", "name": "Deborah Airport", "latitude_deg": 9.51614, "longitude_deg": -82.595497, "elevation_ft": 20.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Guabito", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MP27", "local_code": "MP27", "home_link": null, "wikipedia_link": null, "keywords": "Guabito California" }, "geometry": { "type": "Point", "coordinates": [ -82.595497, 9.51614 ] } },
20
+ { "type": "Feature", "properties": { "id": 515607, "ident": "MPAG", "type": "small_airport", "name": "El Aguila Airstrip", "latitude_deg": 8.37168, "longitude_deg": -80.351676, "elevation_ft": 75.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "El Aguila", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPAG", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.351676, 8.37168 ] } },
21
+ { "type": "Feature", "properties": { "id": 4786, "ident": "MPBO", "type": "medium_airport", "name": "Bocas del Toro International Airport", "latitude_deg": 9.34085, "longitude_deg": -82.250801, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Isla Colón", "scheduled_service": "yes", "icao_code": "MPBO", "iata_code": "BOC", "gps_code": "MPBO", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Bocas_del_Toro_%22Isla_Colón%22_International_Airport", "keywords": "Jose Ezequiel Hall" }, "geometry": { "type": "Point", "coordinates": [ -82.250801, 9.34085 ] } },
22
+ { "type": "Feature", "properties": { "id": 4787, "ident": "MPCE", "type": "medium_airport", "name": "Alonso Valderrama Airport", "latitude_deg": 7.98784, "longitude_deg": -80.409837, "elevation_ft": 33.0, "continent": null, "iso_country": "PA", "iso_region": "PA-6", "municipality": "Chitré", "scheduled_service": "yes", "icao_code": "MPCE", "iata_code": "CTD", "gps_code": "MPCE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Chitré_Alonso_Valderrama_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.409837, 7.98784 ] } },
23
+ { "type": "Feature", "properties": { "id": 4788, "ident": "MPCH", "type": "medium_airport", "name": "Changuinola Captain Manuel Niño International Airport", "latitude_deg": 9.458962, "longitude_deg": -82.515062, "elevation_ft": 19.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "Changuinola", "scheduled_service": "yes", "icao_code": "MPCH", "iata_code": "CHX", "gps_code": "MPCH", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Changuinola_%22Capit%C3%A1n_Manuel_Ni%C3%B1o%22_International_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.515062, 9.458962 ] } },
24
+ { "type": "Feature", "properties": { "id": 4789, "ident": "MPDA", "type": "medium_airport", "name": "Enrique Malek International Airport", "latitude_deg": 8.391, "longitude_deg": -82.434998, "elevation_ft": 89.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "David", "scheduled_service": "yes", "icao_code": "MPDA", "iata_code": "DAV", "gps_code": "MPDA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Enrique_Malek_International_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.434998, 8.391 ] } },
25
+ { "type": "Feature", "properties": { "id": 4790, "ident": "MPEJ", "type": "medium_airport", "name": "Enrique Adolfo Jimenez Airport", "latitude_deg": 9.35664, "longitude_deg": -79.867401, "elevation_ft": 25.0, "continent": null, "iso_country": "PA", "iso_region": "PA-3", "municipality": "Colón", "scheduled_service": "yes", "icao_code": "MPEJ", "iata_code": "ONX", "gps_code": "MPEJ", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Enrique_Adolfo_Jim%C3%A9nez_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.867401, 9.35664 ] } },
26
+ { "type": "Feature", "properties": { "id": 525236, "ident": "MPFE", "type": "small_airport", "name": "Fernando Eleta Airport", "latitude_deg": 8.411389, "longitude_deg": -79.111115, "elevation_ft": 311.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Pedro de Cocal", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPFE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/en:Fernando%20Eleta%20Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.111115, 8.411389 ] } },
27
+ { "type": "Feature", "properties": { "id": 42190, "ident": "MPFS", "type": "small_airport", "name": "Fort Sherman Airport", "latitude_deg": 9.3650903701782244, "longitude_deg": -79.949798583984375, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-3", "municipality": "Fort Sherman", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPFS", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.949798583984375, 9.365090370178224 ] } },
28
+ { "type": "Feature", "properties": { "id": 30768, "ident": "MPHO", "type": "small_airport", "name": "Panamá Pacífico International Airport", "latitude_deg": 8.91479, "longitude_deg": -79.599602, "elevation_ft": 52.0, "continent": null, "iso_country": "PA", "iso_region": "PA-10", "municipality": "Panamá City", "scheduled_service": "yes", "icao_code": null, "iata_code": "BLB", "gps_code": "MPPA", "local_code": null, "home_link": "http://www.panamapacifico.com/", "wikipedia_link": "https://en.wikipedia.org/wiki/Panam%C3%A1_Pac%C3%ADfico_International_Airport", "keywords": "HOW, Howard Air Force Base, Panama Pacifico" }, "geometry": { "type": "Point", "coordinates": [ -79.599602, 8.91479 ] } },
29
+ { "type": "Feature", "properties": { "id": 316555, "ident": "MPI", "type": "small_airport", "name": "Mamitupu Airport", "latitude_deg": 9.1851, "longitude_deg": -77.9841, "elevation_ft": 25.0, "continent": "SA", "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mamitupu", "scheduled_service": "no", "icao_code": null, "iata_code": "MPI", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Mamitupu_Airport", "keywords": "Mamitupo" }, "geometry": { "type": "Point", "coordinates": [ -77.9841, 9.1851 ] } },
30
+ { "type": "Feature", "properties": { "id": 31937, "ident": "MPJE", "type": "small_airport", "name": "Jaqué Airport", "latitude_deg": 7.51778, "longitude_deg": -78.157204, "elevation_ft": 29.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Jaqué", "scheduled_service": "no", "icao_code": "MPJE", "iata_code": "JQE", "gps_code": "MPJE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Jaqué_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.157204, 7.51778 ] } },
31
+ { "type": "Feature", "properties": { "id": 346902, "ident": "MPMC", "type": "small_airport", "name": "Chame Mayor Airport", "latitude_deg": 8.591418, "longitude_deg": -79.869189, "elevation_ft": 79.0, "continent": null, "iso_country": "PA", "iso_region": "PA-10", "municipality": "Chame", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPMC", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.869189, 8.591418 ] } },
32
+ { "type": "Feature", "properties": { "id": 4791, "ident": "MPMG", "type": "medium_airport", "name": "Marcos A. Gelabert International Airport", "latitude_deg": 8.97334, "longitude_deg": -79.555603, "elevation_ft": 31.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Albrook", "scheduled_service": "yes", "icao_code": "MPMG", "iata_code": "PAC", "gps_code": "MPMG", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Albrook_%22Marcos_A._Gelabert%22_International_Airport", "keywords": "Balboa. Albrook AFS. MPLB" }, "geometry": { "type": "Point", "coordinates": [ -79.555603, 8.97334 ] } },
33
+ { "type": "Feature", "properties": { "id": 31939, "ident": "MPNU", "type": "small_airport", "name": "Augusto Vergara Airport", "latitude_deg": 7.8575, "longitude_deg": -80.276167, "elevation_ft": 49.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Los Santos", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPGU", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Augusto_Vergara_Airport", "keywords": "Guararé" }, "geometry": { "type": "Point", "coordinates": [ -80.276167, 7.8575 ] } },
34
+ { "type": "Feature", "properties": { "id": 42197, "ident": "MPOA", "type": "small_airport", "name": "Puerto Obaldía Airport", "latitude_deg": 8.668813, "longitude_deg": -77.417399, "elevation_ft": 223.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Puerto Obaldía", "scheduled_service": "no", "icao_code": "MPOA", "iata_code": "PUE", "gps_code": "MPOA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Puerto_Obaldia_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.417399, 8.668813 ] } },
35
+ { "type": "Feature", "properties": { "id": 346878, "ident": "MPPD", "type": "small_airport", "name": "Capt. J. Montenegro Airport", "latitude_deg": 7.534801, "longitude_deg": -80.043347, "elevation_ft": 148.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Pedasí", "scheduled_service": "yes", "icao_code": null, "iata_code": "PDM", "gps_code": "MPPD", "local_code": null, "home_link": null, "wikipedia_link": "https://es.wikipedia.org/wiki/Aeropuerto_Capit%C3%A1n_Justiniano_Montenegro", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.043347, 7.534801 ] } },
36
+ { "type": "Feature", "properties": { "id": 515602, "ident": "MPPT", "type": "small_airport", "name": "Punta Patiño Airstrip", "latitude_deg": 8.252816, "longitude_deg": -78.278618, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Punta Patiño", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPPT", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Punta_Pati%C3%B1o_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.278618, 8.252816 ] } },
37
+ { "type": "Feature", "properties": { "id": 4792, "ident": "MPSA", "type": "medium_airport", "name": "Ruben Cantu Airport", "latitude_deg": 8.0855998992919922, "longitude_deg": -80.945297241210938, "elevation_ft": 272.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Santiago", "scheduled_service": "no", "icao_code": "MPSA", "iata_code": "SYP", "gps_code": "MPSA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ruben_Cantu_Airport", "keywords": "Santiago" }, "geometry": { "type": "Point", "coordinates": [ -80.945297241210938, 8.085599899291992 ] } },
38
+ { "type": "Feature", "properties": { "id": 31940, "ident": "MPSM", "type": "small_airport", "name": "Scarlett Martinez International Airport", "latitude_deg": 8.3758802413940003, "longitude_deg": -80.127899169922003, "elevation_ft": 105.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Río Hato", "scheduled_service": "yes", "icao_code": "MPSM", "iata_code": "RIH", "gps_code": "MPSM", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/R%C3%ADo_Hato_Airport", "keywords": "MPRH, Río Hato Army Air Base, Captain Scarlett Martinez" }, "geometry": { "type": "Point", "coordinates": [ -80.127899169922003, 8.375880241394 ] } },
39
+ { "type": "Feature", "properties": { "id": 4793, "ident": "MPTO", "type": "large_airport", "name": "Tocumen International Airport", "latitude_deg": 9.07136, "longitude_deg": -79.383499, "elevation_ft": 135.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Tocumen", "scheduled_service": "yes", "icao_code": "MPTO", "iata_code": "PTY", "gps_code": "MPTO", "local_code": null, "home_link": "https://www.tocumenpanama.aero/", "wikipedia_link": "https://en.wikipedia.org/wiki/Tocumen_International_Airport", "keywords": "La Joya No 1" }, "geometry": { "type": "Point", "coordinates": [ -79.383499, 9.07136 ] } },
40
+ { "type": "Feature", "properties": { "id": 42187, "ident": "MPVR", "type": "small_airport", "name": "El Porvenir Airport", "latitude_deg": 9.559212, "longitude_deg": -78.946631, "elevation_ft": 17.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "El Porvenir", "scheduled_service": "no", "icao_code": "MPVR", "iata_code": "PVE", "gps_code": "MPVR", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/El_Porvenir_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.946631, 9.559212 ] } },
41
+ { "type": "Feature", "properties": { "id": 32008, "ident": "MPWN", "type": "small_airport", "name": "Wannukandi Airport", "latitude_deg": 9.273476, "longitude_deg": -78.139848, "elevation_ft": 6.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "San Blas", "scheduled_service": "no", "icao_code": "MPWN", "iata_code": "NBL", "gps_code": "MPWN", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Wannukandi_Airport", "keywords": "San Blas Airport" }, "geometry": { "type": "Point", "coordinates": [ -78.139848, 9.273476 ] } },
42
+ { "type": "Feature", "properties": { "id": 4794, "ident": "MPZL", "type": "small_airport", "name": "Finca 32 Airport", "latitude_deg": 9.4270896911621094, "longitude_deg": -82.562698364257812, "elevation_ft": 23.0, "continent": null, "iso_country": "PA", "iso_region": "PA-1", "municipality": "La Dalia", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPZL", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.562698364257812, 9.427089691162109 ] } },
43
+ { "type": "Feature", "properties": { "id": 315194, "ident": "OGM", "type": "small_airport", "name": "Ogobsucum Airport", "latitude_deg": 9.1383, "longitude_deg": -77.93385, "elevation_ft": 13.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ustupu", "scheduled_service": "no", "icao_code": null, "iata_code": "OGM", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ustupu-Ogobsucum_Airport", "keywords": "Ogobsucun, Ogubsucum, Ogubsucun, Ustupo" }, "geometry": { "type": "Point", "coordinates": [ -77.93385, 9.1383 ] } },
44
+ { "type": "Feature", "properties": { "id": 42182, "ident": "PA-0001", "type": "small_airport", "name": "Achutupu Airport", "latitude_deg": 9.188481, "longitude_deg": -77.994153, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mamitupu", "scheduled_service": "no", "icao_code": null, "iata_code": "ACU", "gps_code": "MPAC", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Achutupo_Airport", "keywords": "Achutupo" }, "geometry": { "type": "Point", "coordinates": [ -77.994153, 9.188481 ] } },
45
+ { "type": "Feature", "properties": { "id": 42183, "ident": "PA-0002", "type": "small_airport", "name": "Aguadulce Airport", "latitude_deg": 8.2516498565673828, "longitude_deg": -80.565399169921875, "elevation_ft": 104.0, "continent": null, "iso_country": "PA", "iso_region": "PA-2", "municipality": "Aguadulce", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.565399169921875, 8.251649856567383 ] } },
46
+ { "type": "Feature", "properties": { "id": 42184, "ident": "PA-0003", "type": "small_airport", "name": "Ailigandí Airport", "latitude_deg": 9.2226, "longitude_deg": -78.0236, "elevation_ft": 55.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Isla Lorenzo Bello", "scheduled_service": "no", "icao_code": null, "iata_code": "AIL", "gps_code": "MPAI", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ailigandí_Airport", "keywords": "Ailigandi, Alligandi" }, "geometry": { "type": "Point", "coordinates": [ -78.0236, 9.2226 ] } },
47
+ { "type": "Feature", "properties": { "id": 42185, "ident": "PA-0004", "type": "small_airport", "name": "Cartí Airport", "latitude_deg": 9.452863, "longitude_deg": -78.978917, "elevation_ft": 6.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Cartí Islands", "scheduled_service": "no", "icao_code": null, "iata_code": "CTE", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Cart%C3%AD_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.978917, 9.452863 ] } },
48
+ { "type": "Feature", "properties": { "id": 42186, "ident": "PA-0005", "type": "small_airport", "name": "Corazón de Jesús Airport", "latitude_deg": 9.0172195434570312, "longitude_deg": -77.980697631835938, "elevation_ft": 1008.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Nurna", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.980697631835938, 9.017219543457031 ] } },
49
+ { "type": "Feature", "properties": { "id": 42188, "ident": "PA-0006", "type": "small_airport", "name": "Finca Blanco Airport", "latitude_deg": 8.389832, "longitude_deg": -82.870847, "elevation_ft": 72.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Finca Blanco", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.870847, 8.389832 ] } },
50
+ { "type": "Feature", "properties": { "id": 42189, "ident": "PA-0007", "type": "small_airport", "name": "Finca Fátima Airport", "latitude_deg": 8.388027, "longitude_deg": -82.748509, "elevation_ft": 26.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Finca Fátima", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.748509, 8.388027 ] } },
51
+ { "type": "Feature", "properties": { "id": 42191, "ident": "PA-0008", "type": "small_airport", "name": "La Joya Airport", "latitude_deg": 9.1385602951049805, "longitude_deg": -79.240196228027344, "elevation_ft": 96.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "La Joya", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.240196228027344, 9.13856029510498 ] } },
52
+ { "type": "Feature", "properties": { "id": 42192, "ident": "PA-0009", "type": "small_airport", "name": "La Plantación Airport", "latitude_deg": 7.6628899574279794, "longitude_deg": -81.006103515625, "elevation_ft": 21.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "La Plantación", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.006103515625, 7.662889957427979 ] } },
53
+ { "type": "Feature", "properties": { "id": 42193, "ident": "PA-0010", "type": "small_airport", "name": "Mandinga Airport", "latitude_deg": 9.454635, "longitude_deg": -79.086507, "elevation_ft": 38.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mandinga", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.086507, 9.454635 ] } },
54
+ { "type": "Feature", "properties": { "id": 42194, "ident": "PA-0011", "type": "small_airport", "name": "Mulatupo Airport", "latitude_deg": 8.945487, "longitude_deg": -77.733486, "elevation_ft": 15.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mulatupo", "scheduled_service": "no", "icao_code": null, "iata_code": "MPP", "gps_code": "MPMU", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Mulatupo_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.733486, 8.945487 ] } },
55
+ { "type": "Feature", "properties": { "id": 42195, "ident": "PA-0012", "type": "closed", "name": "Narganá Airport", "latitude_deg": 9.444659, "longitude_deg": -78.588896, "elevation_ft": 7.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "NGN, Corazón de Jesús" }, "geometry": { "type": "Point", "coordinates": [ -78.588896, 9.444659 ] } },
56
+ { "type": "Feature", "properties": { "id": 42196, "ident": "PA-0013", "type": "small_airport", "name": "Playón Chico Airport", "latitude_deg": 9.30692, "longitude_deg": -78.235273, "elevation_ft": 18.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ukupseni", "scheduled_service": "no", "icao_code": null, "iata_code": "PYC", "gps_code": "MPPH", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Play%C3%B3n_Chico_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.235273, 9.30692 ] } },
57
+ { "type": "Feature", "properties": { "id": 42198, "ident": "PA-0014", "type": "small_airport", "name": "Río Azúcar Airport", "latitude_deg": 9.4247, "longitude_deg": -78.6269, "elevation_ft": 12.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Río Azúcar", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.6269, 9.4247 ] } },
58
+ { "type": "Feature", "properties": { "id": 42199, "ident": "PA-0015", "type": "small_airport", "name": "Rio Sidra Airport", "latitude_deg": 9.3167896270751953, "longitude_deg": -79.282997131347656, "elevation_ft": 2719.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Rio Sidra", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.282997131347656, 9.316789627075195 ] } },
59
+ { "type": "Feature", "properties": { "id": 42200, "ident": "PA-0016", "type": "small_airport", "name": "Río Tigre Airport", "latitude_deg": 9.2508802413940447, "longitude_deg": -78.498703002929688, "elevation_ft": 1095.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Río Tigre", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.498703002929688, 9.250880241394045 ] } },
60
+ { "type": "Feature", "properties": { "id": 42201, "ident": "PA-0017", "type": "small_airport", "name": "San Miguel Airport", "latitude_deg": 8.456507, "longitude_deg": -78.934214, "elevation_ft": 70.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Isla del Rey", "scheduled_service": "no", "icao_code": null, "iata_code": "NMG", "gps_code": "MPMI", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/San_Miguel_Airport,_Panama", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.934214, 8.456507 ] } },
61
+ { "type": "Feature", "properties": { "id": 42202, "ident": "PA-0018", "type": "small_airport", "name": "Tubualá Airport", "latitude_deg": 8.918601, "longitude_deg": -77.709182, "elevation_ft": 20.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Coetupo", "scheduled_service": "no", "icao_code": null, "iata_code": "TUW", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Tubual%C3%A1_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.709182, 8.918601 ] } },
62
+ { "type": "Feature", "properties": { "id": 42203, "ident": "PA-0019", "type": "small_airport", "name": "Tupile Airport", "latitude_deg": 9.2465801239013672, "longitude_deg": -78.362503051757812, "elevation_ft": 1374.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.362503051757812, 9.246580123901367 ] } },
63
+ { "type": "Feature", "properties": { "id": 342550, "ident": "PA-0020", "type": "small_airport", "name": "Coral Lodge Airport", "latitude_deg": 9.55488, "longitude_deg": -79.13786, "elevation_ft": 20.0, "continent": null, "iso_country": "PA", "iso_region": "PA-3", "municipality": "Santa Isabel", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.13786, 9.55488 ] } },
64
+ { "type": "Feature", "properties": { "id": 42205, "ident": "PA-0021", "type": "closed", "name": "Ailigandí North Airport", "latitude_deg": 9.23903, "longitude_deg": -78.03922, "elevation_ft": 19.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ailigandí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.03922, 9.23903 ] } },
65
+ { "type": "Feature", "properties": { "id": 42206, "ident": "PA-0022", "type": "small_airport", "name": "Yaviza Airport", "latitude_deg": 8.1528, "longitude_deg": -77.687, "elevation_ft": 75.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Yaviza", "scheduled_service": "no", "icao_code": null, "iata_code": "PYV", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Yaviza_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.687, 8.1528 ] } },
66
+ { "type": "Feature", "properties": { "id": 315017, "ident": "PA-0023", "type": "closed", "name": "Isla Tigre Airstrip", "latitude_deg": 9.4339, "longitude_deg": -78.5235, "elevation_ft": 7.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mamartupu", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.5235, 9.4339 ] } },
67
+ { "type": "Feature", "properties": { "id": 316550, "ident": "PA-0024", "type": "small_airport", "name": "Coiba Airport", "latitude_deg": 7.5068, "longitude_deg": -81.6981, "elevation_ft": 255.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Isla de Coiba", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.6981, 7.5068 ] } },
68
+ { "type": "Feature", "properties": { "id": 316551, "ident": "PA-0025", "type": "small_airport", "name": "Arenas Airport", "latitude_deg": 7.3713, "longitude_deg": -80.846, "elevation_ft": 85.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Arenas", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPAR", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.846, 7.3713 ] } },
69
+ { "type": "Feature", "properties": { "id": 316553, "ident": "PA-0026", "type": "small_airport", "name": "Tonosí Airport", "latitude_deg": 7.4148, "longitude_deg": -80.4466, "elevation_ft": 55.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Tonosí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.4466, 7.4148 ] } },
70
+ { "type": "Feature", "properties": { "id": 316554, "ident": "PA-0027", "type": "small_airport", "name": "Candelaria Airport", "latitude_deg": 7.7326, "longitude_deg": -80.1403, "elevation_ft": 65.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "La Candelaria", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.1403, 7.7326 ] } },
71
+ { "type": "Feature", "properties": { "id": 342551, "ident": "PA-0028", "type": "small_airport", "name": "Nusatupo Airport", "latitude_deg": 9.43392, "longitude_deg": -78.83173, "elevation_ft": 18.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Nusatupo", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.83173, 9.43392 ] } },
72
+ { "type": "Feature", "properties": { "id": 342552, "ident": "PA-0029", "type": "small_airport", "name": "Wannukandi Airport", "latitude_deg": 9.273166, "longitude_deg": -78.139873, "elevation_ft": 13.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Wannukandi", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.139873, 9.273166 ] } },
73
+ { "type": "Feature", "properties": { "id": 342553, "ident": "PA-0030", "type": "small_airport", "name": "Mansukun Airport", "latitude_deg": 9.05011, "longitude_deg": -77.80985, "elevation_ft": 10.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Mansukum", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.80985, 9.05011 ] } },
74
+ { "type": "Feature", "properties": { "id": 342554, "ident": "PA-0031", "type": "closed", "name": "Napakanti Airport", "latitude_deg": 9.012796, "longitude_deg": -77.802531, "elevation_ft": 66.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Napakanti", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.802531, 9.012796 ] } },
75
+ { "type": "Feature", "properties": { "id": 342555, "ident": "PA-0032", "type": "small_airport", "name": "Caledonia Airport", "latitude_deg": 8.90201, "longitude_deg": -77.69286, "elevation_ft": 3.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Suletupu", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPCA", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.69286, 8.90201 ] } },
76
+ { "type": "Feature", "properties": { "id": 430649, "ident": "PA-0033", "type": "heliport", "name": "Soloy Heliport", "latitude_deg": 8.4831, "longitude_deg": -82.0816, "elevation_ft": 424.0, "continent": null, "iso_country": "PA", "iso_region": "PA-NB", "municipality": "Soloy", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.0816, 8.4831 ] } },
77
+ { "type": "Feature", "properties": { "id": 505196, "ident": "PA-0034", "type": "closed", "name": "Aidirgandí Airport", "latitude_deg": 9.35515, "longitude_deg": -78.34587, "elevation_ft": 23.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Aidirgandí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.34587, 9.35515 ] } },
78
+ { "type": "Feature", "properties": { "id": 505212, "ident": "PA-0035", "type": "closed", "name": "Ingenio Las Cabras Airstrip", "latitude_deg": 7.90044, "longitude_deg": -80.540391, "elevation_ft": 112.0, "continent": null, "iso_country": "PA", "iso_region": "PA-6", "municipality": "Las Cabras", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.540391, 7.90044 ] } },
79
+ { "type": "Feature", "properties": { "id": 506050, "ident": "PA-0036", "type": "closed", "name": "Punta Hermosa Airstrip", "latitude_deg": 7.527853, "longitude_deg": -81.849575, "elevation_ft": 250.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Isla de Coiba", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.849575, 7.527853 ] } },
80
+ { "type": "Feature", "properties": { "id": 506051, "ident": "PA-0037", "type": "small_airport", "name": "Coibito Landing Airstrip", "latitude_deg": 7.639068, "longitude_deg": -81.702433, "elevation_ft": null, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Isla Rancheria", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.702433, 7.639068 ] } },
81
+ { "type": "Feature", "properties": { "id": 506052, "ident": "PA-0038", "type": "small_airport", "name": "Pixvae Airstrip", "latitude_deg": 7.841248, "longitude_deg": -81.567301, "elevation_ft": 56.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Pixvae", "scheduled_service": "yes", "icao_code": null, "iata_code": null, "gps_code": "MPPX", "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "Pixbae, Pifa, Piba, Pejibaye" }, "geometry": { "type": "Point", "coordinates": [ -81.567301, 7.841248 ] } },
82
+ { "type": "Feature", "properties": { "id": 506053, "ident": "PA-0039", "type": "closed", "name": "Filipinas Airstrip", "latitude_deg": 7.728211, "longitude_deg": -81.262396, "elevation_ft": 59.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Carrizal", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -81.262396, 7.728211 ] } },
83
+ { "type": "Feature", "properties": { "id": 506054, "ident": "PA-0040", "type": "closed", "name": "La Providencia Airstrip", "latitude_deg": 7.8878, "longitude_deg": -80.978748, "elevation_ft": 121.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Ponuga", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.978748, 7.8878 ] } },
84
+ { "type": "Feature", "properties": { "id": 506055, "ident": "PA-0041", "type": "closed", "name": "Limones Airstrip", "latitude_deg": 7.619267, "longitude_deg": -80.946937, "elevation_ft": 141.0, "continent": null, "iso_country": "PA", "iso_region": "PA-9", "municipality": "Limones", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -80.946937, 7.619267 ] } },
85
+ { "type": "Feature", "properties": { "id": 5322, "ident": "PA-0042", "type": "closed", "name": "Pedasí Airport", "latitude_deg": 7.55688, "longitude_deg": -80.0233, "elevation_ft": 16.0, "continent": null, "iso_country": "PA", "iso_region": "PA-7", "municipality": "Pedasí", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "PDM, Capt Justiniano Montenegro, MP00" }, "geometry": { "type": "Point", "coordinates": [ -80.0233, 7.55688 ] } },
86
+ { "type": "Feature", "properties": { "id": 32164, "ident": "PA-0043", "type": "closed", "name": "Captain Ramon Xatruch Airport", "latitude_deg": 8.40667, "longitude_deg": -78.141701, "elevation_ft": 30.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "La Palma", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "MPLP, MPLP, PLP" }, "geometry": { "type": "Point", "coordinates": [ -78.141701, 8.40667 ] } },
87
+ { "type": "Feature", "properties": { "id": 315016, "ident": "PA-0044", "type": "closed", "name": "Tupile Airport", "latitude_deg": 9.45, "longitude_deg": -78.566667, "elevation_ft": 5.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Isla Tupile", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "TUE" }, "geometry": { "type": "Point", "coordinates": [ -78.566667, 9.45 ] } },
88
+ { "type": "Feature", "properties": { "id": 30640, "ident": "PA-AML", "type": "small_airport", "name": "Puerto Armuelles Airport", "latitude_deg": 8.267667, "longitude_deg": -82.864537, "elevation_ft": 42.0, "continent": null, "iso_country": "PA", "iso_region": "PA-4", "municipality": "Puerto Armuelles", "scheduled_service": "no", "icao_code": null, "iata_code": "AML", "gps_code": null, "local_code": null, "home_link": "https://visitpuertoarmuelles.com/airport-update-for-puerto-armuelles", "wikipedia_link": null, "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -82.864537, 8.267667 ] } },
89
+ { "type": "Feature", "properties": { "id": 35194, "ident": "PA-BFQ", "type": "small_airport", "name": "Bahia Piña Airport", "latitude_deg": 7.58737, "longitude_deg": -78.179939, "elevation_ft": 14.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Puerto Piña", "scheduled_service": "yes", "icao_code": null, "iata_code": "BFQ", "gps_code": "MPPI", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Bah%C3%ADa_Pi%C3%B1a_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.179939, 7.58737 ] } },
90
+ { "type": "Feature", "properties": { "id": 35196, "ident": "PA-ELE", "type": "small_airport", "name": "EL Real Airport", "latitude_deg": 8.107235, "longitude_deg": -77.725545, "elevation_ft": 65.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "El Real de Santa María", "scheduled_service": "no", "icao_code": null, "iata_code": "ELE", "gps_code": "MPRE", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/El_Real_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -77.725545, 8.107235 ] } },
91
+ { "type": "Feature", "properties": { "id": 42181, "ident": "PA-MRF", "type": "small_airport", "name": "Miraflores Airport", "latitude_deg": 8.338889, "longitude_deg": -78.131944, "elevation_ft": 32.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Miraflores", "scheduled_service": "no", "icao_code": null, "iata_code": null, "gps_code": "MPMF", "local_code": "MRF", "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Miraflores_Airport,_Dari%C3%A9n", "keywords": "MPSE" }, "geometry": { "type": "Point", "coordinates": [ -78.131944, 8.338889 ] } },
92
+ { "type": "Feature", "properties": { "id": 35195, "ident": "PA-OTD", "type": "small_airport", "name": "Raul Arias Espinoza Airport", "latitude_deg": 8.62876, "longitude_deg": -79.034698, "elevation_ft": 43.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Contadora Island", "scheduled_service": "yes", "icao_code": null, "iata_code": "OTD", "gps_code": "MPRA", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Contadora_Airport", "keywords": "Contadora Airport" }, "geometry": { "type": "Point", "coordinates": [ -79.034698, 8.62876 ] } },
93
+ { "type": "Feature", "properties": { "id": 35197, "ident": "PA-SAX", "type": "small_airport", "name": "Sambú Airport", "latitude_deg": 8.026279, "longitude_deg": -78.209555, "elevation_ft": 32.0, "continent": null, "iso_country": "PA", "iso_region": "PA-5", "municipality": "Boca de Sábalo", "scheduled_service": "no", "icao_code": null, "iata_code": "SAX", "gps_code": "MPSB", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Sambú_Airport", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -78.209555, 8.026279 ] } },
94
+ { "type": "Feature", "properties": { "id": 316552, "ident": "SIC", "type": "small_airport", "name": "San José Island Airport", "latitude_deg": 8.2622, "longitude_deg": -79.078, "elevation_ft": 150.0, "continent": null, "iso_country": "PA", "iso_region": "PA-8", "municipality": "Las Perlas", "scheduled_service": "no", "icao_code": null, "iata_code": "SIC", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/San_José_Airport_(Las_Perlas)", "keywords": null }, "geometry": { "type": "Point", "coordinates": [ -79.078, 8.2622 ] } },
95
+ { "type": "Feature", "properties": { "id": 315014, "ident": "TJC", "type": "small_airport", "name": "Ticantiquí Airport", "latitude_deg": 9.4185, "longitude_deg": -78.4896, "elevation_ft": 17.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ticantiquí", "scheduled_service": "no", "icao_code": null, "iata_code": "TJC", "gps_code": null, "local_code": null, "home_link": null, "wikipedia_link": null, "keywords": "Tikantiki" }, "geometry": { "type": "Point", "coordinates": [ -78.4896, 9.4185 ] } },
96
+ { "type": "Feature", "properties": { "id": 315193, "ident": "UTU", "type": "small_airport", "name": "Ustupu Airport", "latitude_deg": 9.1283, "longitude_deg": -77.9337, "elevation_ft": 9.0, "continent": null, "iso_country": "PA", "iso_region": "PA-GY", "municipality": "Ustupu", "scheduled_service": "no", "icao_code": null, "iata_code": "UTU", "gps_code": "MPUP", "local_code": null, "home_link": null, "wikipedia_link": "https://en.wikipedia.org/wiki/Ustupo_Airport", "keywords": "Ustupo" }, "geometry": { "type": "Point", "coordinates": [ -77.9337, 9.1283 ] } }
97
+ ]
98
+ }
backend/main.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pathlib import Path
import os

from backend.core.database import init_db
from backend.api.api import api_router


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: initialize the database on startup.

    A database failure is deliberately non-fatal so the API can still
    serve mock/static responses without a working database.
    """
    # Startup
    try:
        await init_db()
    except Exception as e:
        print(f"WARNING: Database initialization failed. Running in MOCK mode. Error: {e}")
    yield
    # Shutdown: nothing to clean up.


app = FastAPI(
    title="GeoQuery API",
    description="Geospatial Analysis Agent API",
    version="0.1.0",
    lifespan=lifespan
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all for dev
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# FastAPI matches routes in declaration order, so the API router must be
# registered BEFORE the SPA catch-all route below; /api/v1/* is then never
# shadowed by /{full_path:path}.
app.include_router(api_router, prefix="/api/v1")

# Serve static files (frontend build output), if a build is present.
static_dir = Path(__file__).parent / "static"

if static_dir.exists():
    app.mount("/_next", StaticFiles(directory=static_dir / "_next"), name="next")

    @app.get("/{full_path:path}")
    async def serve_frontend(full_path: str):
        """Serve a static asset, falling back to index.html for SPA routes."""
        # Resolve the requested path and confine it to static_dir so that
        # crafted paths (e.g. "../../etc/passwd") cannot escape the
        # static root (path traversal guard).
        file_path = (static_dir / full_path).resolve()
        if file_path.is_relative_to(static_dir.resolve()) and file_path.is_file():
            return FileResponse(file_path)

        # Fallback to index.html for client-side (SPA) routing.
        index_path = static_dir / "index.html"
        if index_path.exists():
            return FileResponse(index_path)
        return {"error": "Frontend not found"}
else:
    @app.get("/")
    def read_root():
        """Minimal health endpoint when no frontend build is available."""
        return {"message": "GeoQuery API is running (Frontend not built)"}
backend/pyproject.toml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "geoquery-backend"
3
+ version = "0.1.0"
4
+ description = "Backend for GeoQuery AI Platform"
5
+ authors = ["Admin <admin@geoquery.com>"]
6
+
7
+ [tool.poetry.dependencies]
8
+ python = "^3.10"
9
+ fastapi = "^0.109.0"
10
+ uvicorn = "^0.27.0"
11
+ sqlmodel = "^0.0.14"
12
+ asyncpg = "^0.29.0"
13
+ geoalchemy2 = "^0.14.3"
14
+ python-multipart = "^0.0.6"
15
+ httpx = "^0.26.0"
16
+ duckdb = "^1.1.0"
17
+ pandas = "^2.0.0"
18
+ google-genai = "^0.1.0"
19
+ google-generativeai = "^0.3.0"
20
+ sentence-transformers = "^2.2.0"
21
+ scikit-learn = "^1.3.0"
22
+ numpy = "^1.26.0"
23
+ python-dotenv = "^1.0.0"
24
+ shapely = "^2.0.0"
25
+
26
+ [tool.poetry.dev-dependencies]
27
+ pytest = "^8.0.0"
28
+
29
+ [build-system]
30
+ requires = ["poetry-core>=1.0.0"]
31
+ build-backend = "poetry.core.masonry.api"
backend/requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.109.0
2
+ uvicorn>=0.27.0
3
+ sqlmodel>=0.0.14
4
+ asyncpg>=0.29.0
5
+ geoalchemy2>=0.14.3
6
+ python-multipart>=0.0.6
7
+ httpx>=0.26.0
8
+ duckdb>=1.1.0
9
+ pandas>=2.0.0
10
+ google-genai>=0.1.0
11
+ google-generativeai>=0.3.0
12
+ sentence-transformers>=2.2.0
13
+ scikit-learn>=1.3.0
14
+ numpy>=1.26.0
15
+ python-dotenv>=1.0.0
16
+ shapely>=2.0.0
17
+ geopandas>=0.14.0
18
+ requests>=2.31.0
backend/scripts/create_province_layer.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Create province-level socio-economic layer for Panama
4
+ Uses known data from research (MPI, Census highlights) joined to admin boundaries
5
+ """
6
+
7
+ import geopandas as gpd
8
+ import pandas as pd
9
+ from pathlib import Path
10
+ import logging
11
+ import json
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ DATA_DIR = Path(__file__).parent.parent / "data"
17
+ BASE_DIR = DATA_DIR / "base"
18
+ OUTPUT_DIR = DATA_DIR / "socioeconomic"
19
+
20
+ # Province-level data from MPI and Census research
21
+ # Sources: INEC MPI 2017, Censo 2023 highlights, World Bank Poverty Assessment
22
+ PROVINCE_DATA = {
23
+ "Bocas del Toro": {
24
+ "mpi_poverty_pct": 75.0, # Estimate from regional data
25
+ "population_2023": 159228,
26
+ "avg_income_pab": 383.14,
27
+ "disability_rate": 3.21
28
+ },
29
+ "Coclé": {
30
+ "mpi_poverty_pct": 35.0,
31
+ "population_2023": 278000 # Approximate from census
32
+ },
33
+ "Colón": {
34
+ "mpi_poverty_pct": 40.0,
35
+ "population_2023": 283000
36
+ },
37
+ "Chiriquí": {
38
+ "mpi_poverty_pct": 30.0,
39
+ "population_2023": 498000
40
+ },
41
+ "Darién": {
42
+ "mpi_poverty_pct": 65.0,
43
+ "population_2023": 57000
44
+ },
45
+ "Herrera": {
46
+ "mpi_poverty_pct": 25.0,
47
+ "population_2023": 123000
48
+ },
49
+ "Los Santos": {
50
+ "mpi_poverty_pct": 22.0,
51
+ "population_2023": 97000
52
+ },
53
+ "Panamá": {
54
+ "mpi_poverty_pct": 15.0,
55
+ "population_2023": 2100000 # Largest province
56
+ },
57
+ "Panamá Oeste": {
58
+ "mpi_poverty_pct": 18.0,
59
+ "population_2023": 550000
60
+ },
61
+ "Veraguas": {
62
+ "mpi_poverty_pct": 45.0,
63
+ "population_2023": 261000
64
+ },
65
+ # Indigenous Comarcas (highest poverty)
66
+ "Ngäbe-Buglé": {
67
+ "mpi_poverty_pct": 93.4, # From MPI research
68
+ "population_2023": 201000,
69
+ "note": "Highest multidimensional poverty in Panama"
70
+ },
71
+ "Guna Yala": {
72
+ "mpi_poverty_pct": 91.4, # From MPI research
73
+ "population_2023": 38000,
74
+ "note": "Second highest poverty"
75
+ },
76
+ "Emberá-Wounaan": {
77
+ "mpi_poverty_pct": 85.0, # Estimate
78
+ "population_2023": 10000
79
+ }
80
+ }
81
+
82
def load_admin1():
    """Read the Panama province (admin level 1) boundaries from disk.

    Returns:
        GeoDataFrame with one row per province boundary.
    """
    boundaries = gpd.read_file(BASE_DIR / "pan_admin1.geojson")
    logger.info(f"Loaded {len(boundaries)} province boundaries")
    return boundaries
88
+
89
def create_province_layer():
    """Build the province-level socioeconomic GeoJSON layer for Panama.

    Joins the hard-coded PROVINCE_DATA table onto the admin-1 boundary
    geometries (matching on province name, normalising known alias
    spellings first) and writes the result to
    OUTPUT_DIR/province_socioeconomic.geojson.

    Returns:
        Path to the written GeoJSON file.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load boundaries
    admin_gdf = load_admin1()

    # Flatten PROVINCE_DATA ({name: {metrics}}) into tabular records.
    data_records = []
    for province_name, data in PROVINCE_DATA.items():
        record = {"province_name": province_name, **data}
        data_records.append(record)

    data_df = pd.DataFrame(data_records)
    logger.info(f"Created data for {len(data_df)} provinces")

    # Join to boundaries - need to match names carefully.
    # admin_gdf has 'adm1_name' column; strip whitespace before matching.
    admin_gdf['province_clean'] = admin_gdf['adm1_name'].str.strip()

    # Map boundary-file spellings (e.g. "Kuna Yala", "Comarca Ngöbe-Buglé")
    # to the canonical keys used in PROVINCE_DATA.
    name_mapping = {
        "Ngöbe-Buglé": "Ngäbe-Buglé",
        "Ngöbe Buglé": "Ngäbe-Buglé",
        "Comarca Ngöbe-Buglé": "Ngäbe-Buglé",
        "Kuna Yala": "Guna Yala",
        "Comarca Guna Yala": "Guna Yala",
        "Comarca Kuna Yala": "Guna Yala",
        "Emberá": "Emberá-Wounaan",
        "Comarca Emberá-Wounaan": "Emberá-Wounaan",
        "Comarca Emberá": "Emberá-Wounaan"
    }

    admin_gdf['province_match'] = admin_gdf['province_clean'].replace(name_mapping)

    # Left join keeps every boundary even when it has no data row.
    merged_gdf = admin_gdf.merge(
        data_df,
        left_on='province_match',
        right_on='province_name',
        how='left'
    )

    # Check join success: mpi_poverty_pct is present in every PROVINCE_DATA
    # entry, so a null here means that boundary name found no match.
    matched = merged_gdf['mpi_poverty_pct'].notna().sum()
    logger.info(f"Successfully joined {matched}/{len(merged_gdf)} provinces")

    if matched < len(merged_gdf):
        unmatched = merged_gdf[merged_gdf['mpi_poverty_pct'].isna()]['adm1_name'].tolist()
        logger.warning(f"Unmatched provinces: {unmatched}")

    # Select output columns.
    # NOTE(review): assumes the boundary file provides 'adm1_pcode' and
    # 'area_sqkm' columns — confirm against pan_admin1.geojson schema.
    output_gdf = merged_gdf[[
        'adm1_name', 'adm1_pcode', 'area_sqkm',
        'mpi_poverty_pct', 'population_2023', 'avg_income_pab', 'disability_rate', 'note',
        'geometry'
    ]].copy()

    # Save as GeoJSON
    output_file = OUTPUT_DIR / "province_socioeconomic.geojson"
    output_gdf.to_file(output_file, driver='GeoJSON')

    logger.info(f"Created province layer: {output_file}")
    logger.info(f" - {matched} provinces with MPI data")
    logger.info(f" - {output_gdf['population_2023'].notna().sum()} with population")

    return output_file
156
+
157
def update_catalog(geojson_path):
    """Register the generated province layer in the dataset catalog.

    Reads DATA_DIR/catalog.json, adds/overwrites the
    "province_socioeconomic" entry, and writes the catalog back.

    Args:
        geojson_path: Path to the generated GeoJSON (must live under DATA_DIR,
            since the catalog stores the path relative to it).
    """
    catalog_path = DATA_DIR / "catalog.json"

    # Explicit UTF-8: the catalog contains non-ASCII text (e.g. "Ngäbe",
    # "Panamá"), and the platform default codec (cp1252 on Windows) would
    # corrupt or fail on it.
    with open(catalog_path, 'r', encoding='utf-8') as f:
        catalog = json.load(f)

    catalog["province_socioeconomic"] = {
        "path": str(geojson_path.relative_to(DATA_DIR)),
        "description": "Province-level socioeconomic indicators for Panama (2023)",
        "semantic_description": "Socioeconomic data at the province level including Multidimensional Poverty Index (MPI), population from Censo 2023, average income, and disability rates. Shows dramatic geographic inequality: Ngäbe-Buglé comarca has 93.4% poverty vs 15% in Panamá province. Use for analyzing regional disparities in poverty, development, and demographics.",
        "tags": [
            "socioeconomic",
            "poverty",
            "mpi",
            "census",
            "province",
            "admin1",
            "demographics",
            "inequality",
            "panama"
        ],
        "data_type": "static",
        "category": "socioeconomic",
        "format": "geojson"
    }

    with open(catalog_path, 'w', encoding='utf-8') as f:
        json.dump(catalog, f, indent=2)

    logger.info("Updated catalog.json")
188
+
189
def main():
    """Entry point: build the province layer, then register it in the catalog."""
    logger.info("Creating province socioeconomic layer...")
    layer_path = create_province_layer()
    update_catalog(layer_path)
    logger.info("Complete!")


if __name__ == "__main__":
    main()
backend/scripts/download_geofabrik.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Panama Data Ingestion - Phase A: OpenStreetMap via Geofabrik
3
+
4
+ Downloads pre-packaged OSM data for Panama as shapefiles and converts to GeoJSON.
5
+ Data source: https://download.geofabrik.de/central-america.html
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import zipfile
11
+ import requests
12
+ import subprocess
13
+ from pathlib import Path
14
+
15
+ # Panama Geofabrik URL
16
+ GEOFABRIK_URL = "https://download.geofabrik.de/central-america/panama-latest-free.shp.zip"
17
+
18
+ # Output directories
19
+ DATA_DIR = Path(__file__).parent.parent / "data"
20
+ OSM_DIR = DATA_DIR / "osm"
21
+ TEMP_DIR = DATA_DIR / "temp"
22
+
23
+ # OSM layers to extract
24
+ OSM_LAYERS = [
25
+ ("gis_osm_roads_free_1", "roads", "Road network with classification"),
26
+ ("gis_osm_pois_free_1", "pois", "Points of interest (restaurants, shops, etc.)"),
27
+ ("gis_osm_pois_a_free_1", "pois_areas", "POI areas (larger venues)"),
28
+ ("gis_osm_buildings_a_free_1", "buildings", "Building footprints"),
29
+ ("gis_osm_landuse_a_free_1", "landuse", "Land use zones (residential, commercial, etc.)"),
30
+ ("gis_osm_natural_free_1", "natural_points", "Natural features (trees, peaks)"),
31
+ ("gis_osm_natural_a_free_1", "natural_areas", "Natural areas (forests, parks)"),
32
+ ("gis_osm_water_a_free_1", "water_areas", "Water bodies (lakes, reservoirs)"),
33
+ ("gis_osm_waterways_free_1", "waterways", "Rivers and streams"),
34
+ ("gis_osm_railways_free_1", "railways", "Railway lines"),
35
+ ("gis_osm_traffic_free_1", "traffic", "Traffic infrastructure (signals, crossings)"),
36
+ ("gis_osm_traffic_a_free_1", "traffic_areas", "Traffic areas (parking lots)"),
37
+ ("gis_osm_transport_free_1", "transport", "Transport points (bus stops, stations)"),
38
+ ("gis_osm_transport_a_free_1", "transport_areas", "Transport areas (airports, ports)"),
39
+ ("gis_osm_places_free_1", "places", "Place names (cities, towns, villages)"),
40
+ ("gis_osm_places_a_free_1", "places_areas", "Place areas"),
41
+ ("gis_osm_pofw_free_1", "places_of_worship", "Places of worship"),
42
+ ("gis_osm_pofw_a_free_1", "places_of_worship_areas", "Places of worship (buildings)"),
43
+ ]
44
+
45
+
46
def download_file(url: str, dest: Path, *, timeout: float = 60.0) -> bool:
    """Download *url* to *dest* with a crude textual progress indicator.

    Args:
        url: HTTP(S) URL to fetch.
        dest: Destination file path (parent directory must already exist).
        timeout: Connect/read timeout in seconds. Fix: the original call had
            no timeout, so a stalled server hung the script indefinitely.

    Returns:
        True on success, False on any download error.
    """
    print(f"📥 Downloading {url}...")

    try:
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()

        # Content-Length may be absent; progress is only shown when known.
        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0

        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    pct = (downloaded / total_size) * 100
                    print(f"\r Progress: {pct:.1f}% ({downloaded // 1024 // 1024}MB)", end="")

        print(f"\n✅ Downloaded to {dest}")
        return True

    except Exception as e:
        print(f"❌ Download failed: {e}")
        return False
71
+
72
+
73
def convert_shp_to_geojson(shp_path: Path, geojson_path: Path) -> bool:
    """Reproject a shapefile to WGS84 and write it out as GeoJSON.

    Shells out to GDAL's ogr2ogr; returns True only when the conversion
    process exits with a zero status.
    """
    command = [
        "ogr2ogr",
        "-f", "GeoJSON",
        "-t_srs", "EPSG:4326",  # Ensure WGS84
        str(geojson_path),
        str(shp_path),
    ]
    try:
        proc = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        # GDAL is an external dependency — point the operator at the install.
        print("⚠️ ogr2ogr not found. Please install GDAL:")
        print(" brew install gdal # macOS")
        print(" apt install gdal-bin # Ubuntu")
        return False

    if proc.returncode != 0:
        print(f" ogr2ogr error: {proc.stderr}")
        return False
    return True
96
+
97
+
98
def extract_and_convert():
    """Download the Geofabrik archive, unzip it, and convert each known OSM
    shapefile layer to GeoJSON under OSM_DIR.

    Returns:
        True when at least one layer was converted successfully.
    """
    for directory in (OSM_DIR, TEMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    zip_path = TEMP_DIR / "panama-osm.zip"

    # Reuse a previously downloaded archive when present.
    if zip_path.exists():
        print(f"📦 Using cached {zip_path}")
    elif not download_file(GEOFABRIK_URL, zip_path):
        return False

    print(f"📂 Extracting to {TEMP_DIR}...")
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(TEMP_DIR)

    converted = 0
    for shp_name, output_name, _description in OSM_LAYERS:
        source = TEMP_DIR / f"{shp_name}.shp"
        target = OSM_DIR / f"{output_name}.geojson"

        # Some layers may be absent from a given Geofabrik extract.
        if not source.exists():
            print(f"⏭️ Skipping {shp_name} (not in download)")
            continue

        print(f"🔄 Converting {shp_name} → {output_name}.geojson...")

        if not convert_shp_to_geojson(source, target):
            print(f" ❌ Failed to convert {shp_name}")
            continue

        size_mb = target.stat().st_size / 1024 / 1024
        print(f" ✅ Created {target.name} ({size_mb:.1f}MB)")
        converted += 1

    print(f"\n🎉 Converted {converted}/{len(OSM_LAYERS)} OSM layers")
    return converted > 0
141
+
142
+
143
def register_in_catalog():
    """Add (or refresh) a catalog entry for every converted OSM layer."""
    import json

    catalog_path = DATA_DIR / "catalog.json"

    # Start from the existing catalog if one is present.
    catalog = {}
    if catalog_path.exists():
        with open(catalog_path) as f:
            catalog = json.load(f)

    for _shp_name, output_name, description in OSM_LAYERS:
        layer_file = OSM_DIR / f"{output_name}.geojson"
        if not layer_file.exists():
            continue

        table_name = f"osm_{output_name}"
        catalog[table_name] = {
            "source_file": f"osm/{output_name}.geojson",
            "source_type": "geojson",
            "description": f"OpenStreetMap {description} for Panama",
            "tags": ["osm", "panama", output_name.replace("_", " ")],
            "data_type": "vector",
            "geometry_type": "auto"  # Will be detected on load
        }
        print(f"📝 Registered {table_name}")

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)

    print(f"✅ Updated catalog with OSM datasets")
180
+
181
+
182
+ if __name__ == "__main__":
183
+ print("=" * 60)
184
+ print("🗺️ Panama OSM Data Ingestion (Geofabrik)")
185
+ print("=" * 60)
186
+
187
+ if extract_and_convert():
188
+ register_in_catalog()
189
+ print("\n🚀 OSM data ready! Restart the backend to load new datasets.")
190
+ else:
191
+ print("\n❌ Ingestion failed")
192
+ sys.exit(1)
backend/scripts/download_global_datasets.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download global geo-referenced datasets for Panama
4
+ - OurAirports: Global airport database
5
+ - WRI Global Power Plant Database
6
+ - Other infrastructure datasets
7
+ """
8
+
9
+ import requests
10
+ import pandas as pd
11
+ import geopandas as gpd
12
+ from pathlib import Path
13
+ import logging
14
+
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+ DATA_DIR = Path(__file__).parent.parent / "data" / "global"
19
+
20
+ # Dataset URLs
21
+ DATASETS = {
22
+ "airports": {
23
+ "url": "https://davidmegginson.github.io/ourairports-data/airports.csv",
24
+ "description": "OurAirports - Global airport database"
25
+ },
26
+ "power_plants": {
27
+ "url": "https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3/global_power_plant_database.csv",
28
+ "description": "WRI Global Power Plant Database v1.3"
29
+ }
30
+ }
31
+
32
def download_airports():
    """Download and process OurAirports data for Panama.

    Fetches the global airports CSV, filters rows to Panama
    (iso_country == 'PA'), and writes both the raw CSV and a WGS84 GeoJSON.

    Returns:
        Tuple of (geojson_path, feature_count) for the Panama subset.
    """
    logger.info("Downloading OurAirports global database...")

    url = DATASETS["airports"]["url"]
    # Fix: add a timeout so a stalled server cannot hang the script forever.
    response = requests.get(url, timeout=120)
    response.raise_for_status()

    # Save raw CSV
    output_dir = DATA_DIR / "airports"
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "airports_global.csv"
    with open(csv_path, 'wb') as f:
        f.write(response.content)

    logger.info(f"Saved raw airports data: {csv_path}")

    # Filter for Panama (iso_country = PA)
    df = pd.read_csv(csv_path)
    panama_df = df[df['iso_country'] == 'PA'].copy()

    logger.info(f"Found {len(panama_df)} airports in Panama")

    # Convert to GeoDataFrame in WGS84 (lon/lat columns from OurAirports)
    gdf = gpd.GeoDataFrame(
        panama_df,
        geometry=gpd.points_from_xy(panama_df.longitude_deg, panama_df.latitude_deg),
        crs="EPSG:4326"
    )

    # Save as GeoJSON
    geojson_path = output_dir / "panama_airports.geojson"
    gdf.to_file(geojson_path, driver='GeoJSON')

    logger.info(f"Created GeoJSON: {geojson_path}")
    return geojson_path, len(gdf)
69
+
70
def download_power_plants():
    """Download and process WRI Global Power Plant Database for Panama.

    Fetches the global power-plant CSV, filters to Panama
    (country == 'PAN'), and writes both the raw CSV and a WGS84 GeoJSON.

    Returns:
        Tuple of (geojson_path, feature_count) for the Panama subset.
    """
    logger.info("Downloading WRI Global Power Plant Database...")

    url = DATASETS["power_plants"]["url"]
    # Fix: add a timeout so a stalled server cannot hang the script forever.
    response = requests.get(url, timeout=120)
    response.raise_for_status()

    # Save raw CSV
    output_dir = DATA_DIR / "power_plants"
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / "power_plants_global.csv"
    with open(csv_path, 'wb') as f:
        f.write(response.content)

    logger.info(f"Saved raw power plants data: {csv_path}")

    # Filter for Panama (country = PAN)
    df = pd.read_csv(csv_path)
    panama_df = df[df['country'] == 'PAN'].copy()

    logger.info(f"Found {len(panama_df)} power plants in Panama")

    # Convert to GeoDataFrame in WGS84
    gdf = gpd.GeoDataFrame(
        panama_df,
        geometry=gpd.points_from_xy(panama_df.longitude, panama_df.latitude),
        crs="EPSG:4326"
    )

    # Save as GeoJSON
    geojson_path = output_dir / "panama_power_plants.geojson"
    gdf.to_file(geojson_path, driver='GeoJSON')

    logger.info(f"Created GeoJSON: {geojson_path}")
    return geojson_path, len(gdf)
107
+
108
def main():
    """Download each global dataset, logging a per-dataset summary."""
    logger.info("=== Global Dataset Download Starting ===")

    results = []

    # (catalog key, human-readable label, downloader) — keep labels matching
    # the original error messages exactly.
    for key, label, downloader in (
        ("airports", "airports", download_airports),
        ("power_plants", "power plants", download_power_plants),
    ):
        try:
            path, count = downloader()
            results.append({"dataset": key, "count": count, "path": path})
        except Exception as e:
            logger.error(f"Failed to download {label}: {e}")

    logger.info("\n=== Download Summary ===")
    for entry in results:
        logger.info(f" {entry['dataset']}: {entry['count']} features")

    logger.info("\n=== Complete ===")
    return results


if __name__ == "__main__":
    main()
backend/scripts/download_hdx.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ HDX Data Downloader for Panama
4
+ Downloads official datasets from Humanitarian Data Exchange
5
+ """
6
+
7
+ import requests
8
+ from pathlib import Path
9
+ import logging
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # HDX Dataset URLs (from research)
15
+ HDX_DATASETS = {
16
+ "health": {
17
+ "name": "Panama - Health Indicators",
18
+ "url": "https://data.humdata.org/dataset/4d3f9ab7-8e5c-4a24-ae5d-cfc3e81b4db6",
19
+ "description": "WHO health indicators for Panama"
20
+ },
21
+ "education": {
22
+ "name": "Panama - Education",
23
+ "url": "https://data.humdata.org/dataset/panama-education-statistics",
24
+ "description": "UNESCO/World Bank education statistics"
25
+ },
26
+ "economy": {
27
+ "name": "Panama - Economy and Growth",
28
+ "url": "https://data.humdata.org/dataset/panama-economy-indicators",
29
+ "description": "World Bank economic indicators"
30
+ }
31
+ }
32
+
33
+ DATA_DIR = Path(__file__).parent.parent / "data" / "hdx"
34
+
35
def download_hdx_dataset(dataset_key: str):
    """Download a dataset from HDX.

    NOTE(review): this is currently a stub — it fetches the dataset's HDX
    landing page (validating the URL) but does not yet parse out and
    download the individual resource files.

    Args:
        dataset_key: Key into HDX_DATASETS.

    Returns:
        The output directory Path on success, or None on failure.
    """
    dataset = HDX_DATASETS[dataset_key]
    logger.info(f"Downloading {dataset['name']}...")

    # Create output directory
    output_dir = DATA_DIR / dataset_key
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        # HDX datasets typically have resource download URLs; we would need
        # to parse the dataset page to get the actual download links.
        # Fix: bound the request so a stalled server cannot hang the script.
        response = requests.get(dataset['url'], timeout=60)
        response.raise_for_status()

        # Placeholder - a real implementation would:
        # 1. Parse the HDX page HTML to find CSV/Excel download links
        # 2. Download each resource file
        # 3. Save to output_dir

        logger.info(f"Downloaded to {output_dir}")
        return output_dir

    except Exception as e:
        logger.error(f"Failed to download {dataset['name']}: {e}")
        return None
61
+
62
def main():
    """Fetch every configured HDX dataset in turn."""
    logger.info("Starting HDX data download...")
    for dataset_key in HDX_DATASETS:
        download_hdx_dataset(dataset_key)
    logger.info("Download complete!")


if __name__ == "__main__":
    main()
backend/scripts/download_hdx_panama.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download Panama-specific datasets from HDX
4
+ """
5
+
6
+ import requests
7
+ import geopandas as gpd
8
+ from pathlib import Path
9
+ import logging
10
+ import zipfile
11
+ import io
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ DATA_DIR = Path(__file__).parent.parent / "data" / "hdx"
17
+
18
+ # HDX Dataset URLs (Panama-specific)
19
+ HDX_DATASETS = {
20
+ "waterways": {
21
+ "url": "https://data.humdata.org/dataset/9b925ead-6034-4ce8-92d9-45d3a1ece1fc/resource/e0dd9e95-5b04-4a5c-b7ef-31a2ea046e1c/download/hotosm_pan_waterways_lines_geojson.zip",
22
+ "description": "Panama Waterways from OpenStreetMap"
23
+ },
24
+ "road_surface": {
25
+ "url": "https://data.humdata.org/dataset/c55bf26a-eba6-402d-b004-8c4af8c24b39/resource/c03fa6cc-e698-4c10-8b05-77de91e13e86/download/panama_roads.geojson",
26
+ "description": "Panama Road Surface Data (AI-predicted paved/unpaved)"
27
+ },
28
+ "admin_3": {
29
+ "url": "https://data.humdata.org/dataset/d188544c-352b-419b-a489-0ae6b763bf21/resource/119d6756-749e-4e4f-bf3a-9694ce22df0a/download/pan_admin3_2021.geojson",
30
+ "description": "Panama Admin 3 (Corregimientos) Boundaries"
31
+ },
32
+ "admin_lines": {
33
+ "url": "https://data.humdata.org/dataset/d188544c-352b-419b-a489-0ae6b763bf21/resource/d7981358-867c-4034-aa1e-07d0f419c968/download/pan_admin_lines_2021.geojson",
34
+ "description": "Panama Admin Lines"
35
+ }
36
+ }
37
+
38
def download_and_extract_hdx(dataset_name, url, description):
    """Download an HDX dataset and normalize it to a GeoJSON file on disk.

    Handles both zipped GeoJSON bundles and direct GeoJSON downloads; the
    admin boundary layers are written into the shared data/base folder.

    Args:
        dataset_name: Key from HDX_DATASETS (also the output folder name).
        url: Resource download URL (either a .zip or a direct .geojson).
        description: Human-readable label used in log messages.

    Returns:
        (geojson_path, feature_count) on success, (None, 0) on failure.
    """
    logger.info(f"Downloading {description}...")

    output_dir = DATA_DIR / dataset_name
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Check if ZIP or direct GeoJSON
        if url.endswith('.zip'):
            # Extract ZIP
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                z.extractall(output_dir)
            logger.info(f"Extracted ZIP to {output_dir}")

            # Find GeoJSON file
            geojson_files = list(output_dir.glob("*.geojson"))
            if geojson_files:
                geojson_path = geojson_files[0]
                gdf = gpd.read_file(geojson_path)
                logger.info(f"Loaded {len(gdf)} features from {geojson_path.name}")
                return geojson_path, len(gdf)

            # Bug fix: the original fell off the end here and implicitly
            # returned a bare None, which broke the caller's
            # `path, count = ...` tuple unpacking with a TypeError.
            logger.error(f"No GeoJSON found in ZIP for {dataset_name}")
            return None, 0
        else:
            # Direct GeoJSON; admin layers live in the shared base folder.
            if dataset_name == "admin_3":
                geojson_path = DATA_DIR.parent / "base" / "pan_admin3.geojson"
            elif dataset_name == "admin_lines":
                geojson_path = DATA_DIR.parent / "base" / "pan_adminlines.geojson"
            else:
                # Default behavior
                geojson_path = output_dir / f"{dataset_name}.geojson"

            with open(geojson_path, 'wb') as f:
                f.write(response.content)

            gdf = gpd.read_file(geojson_path)
            logger.info(f"Loaded {len(gdf)} features")
            return geojson_path, len(gdf)

    except Exception as e:
        logger.error(f"Failed to download {dataset_name}: {e}")
        return None, 0
+
86
def main():
    """Download every configured HDX Panama dataset and log a summary."""
    logger.info("=== Downloading HDX Panama Datasets ===")

    results = []
    for dataset_name, meta in HDX_DATASETS.items():
        geojson_path, feature_count = download_and_extract_hdx(
            dataset_name, meta["url"], meta["description"]
        )
        if geojson_path:
            results.append(
                {"dataset": dataset_name, "count": feature_count, "path": geojson_path}
            )

    logger.info("\n=== Download Summary ===")
    for entry in results:
        logger.info(f" {entry['dataset']}: {entry['count']} features")

    return results


if __name__ == "__main__":
    main()
+ main()
backend/scripts/download_kontur.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Panama Data Ingestion - Phase A: Kontur Population
3
+
4
+ Downloads population density data from HDX (Humanitarian Data Exchange).
5
+ Data source: https://data.humdata.org/dataset/kontur-population-panama
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import json
11
+ import requests
12
+ import gzip
13
+ import shutil
14
+ from pathlib import Path
15
+
16
+ # HDX API for Kontur Population Panama
17
+ HDX_DATASET_URL = "https://data.humdata.org/api/3/action/package_show?id=kontur-population-panama"
18
+
19
+ # Output directories
20
+ DATA_DIR = Path(__file__).parent.parent / "data"
21
+ KONTUR_DIR = DATA_DIR / "kontur"
22
+ TEMP_DIR = DATA_DIR / "temp"
23
+
24
+
25
+ def get_download_url() -> str:
26
+ """Fetch the actual download URL from HDX API."""
27
+ print("🔍 Fetching download URL from HDX...")
28
+
29
+ try:
30
+ response = requests.get(HDX_DATASET_URL)
31
+ response.raise_for_status()
32
+ data = response.json()
33
+
34
+ if not data.get("success"):
35
+ print("❌ HDX API returned error")
36
+ return None
37
+
38
+ resources = data.get("result", {}).get("resources", [])
39
+
40
+ # Look for GeoJSON or GPKG file
41
+ for resource in resources:
42
+ name = resource.get("name", "").lower()
43
+ url = resource.get("url", "")
44
+
45
+ if "geojson" in name or "gpkg" in name:
46
+ print(f" Found: {resource.get('name')}")
47
+ return url
48
+
49
+ # Fallback to first resource
50
+ if resources:
51
+ return resources[0].get("url")
52
+
53
+ return None
54
+
55
+ except Exception as e:
56
+ print(f"❌ Failed to fetch HDX metadata: {e}")
57
+ return None
58
+
59
+
60
def download_file(url: str, dest: Path, *, timeout: float = 60.0) -> bool:
    """Download *url* to *dest* with a crude textual progress indicator.

    Args:
        url: HTTP(S) URL to fetch.
        dest: Destination file path (parent directory must already exist).
        timeout: Connect/read timeout in seconds. Fix: the original call had
            no timeout, so a stalled server hung the script indefinitely.

    Returns:
        True on success, False on any download error.
    """
    print(f"📥 Downloading from {url[:80]}...")

    try:
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()

        # Content-Length may be absent; progress is only shown when known.
        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0

        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    pct = (downloaded / total_size) * 100
                    print(f"\r Progress: {pct:.1f}% ({downloaded // 1024}KB)", end="")

        print(f"\n✅ Downloaded to {dest}")
        return True

    except Exception as e:
        print(f"❌ Download failed: {e}")
        return False
85
+
86
+
87
def decompress_if_needed(file_path: Path) -> Path:
    """Gunzip *file_path* if it carries a .gz suffix.

    Returns the decompressed path (suffix stripped) for gzip files, or the
    input path unchanged for anything else.
    """
    if file_path.suffix != '.gz':
        return file_path

    target = file_path.with_suffix('')
    print(f"📦 Decompressing {file_path.name}...")

    with gzip.open(file_path, 'rb') as compressed, open(target, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)

    return target
100
+
101
+
102
def download_population_data():
    """Fetch the Kontur population dataset for Panama.

    Resolves the resource URL via the HDX API (falling back to a known S3
    URL), downloads it unless already cached, decompresses if gzipped, and
    moves the result into KONTUR_DIR.

    Returns:
        The final data Path, or None when the download fails.
    """
    for directory in (KONTUR_DIR, TEMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    download_url = get_download_url()
    if not download_url:
        # Fallback to known URL pattern
        download_url = "https://geodata-eu-central-1-kontur-public.s3.amazonaws.com/kontur_datasets/kontur_population_PA_20231101.gpkg.gz"
        print(f"⚠️ Using fallback URL: {download_url}")

    # Cache the raw download under its original filename.
    temp_path = TEMP_DIR / download_url.split("/")[-1]

    if temp_path.exists():
        print(f"📦 Using cached {temp_path}")
    elif not download_file(download_url, temp_path):
        return None

    data_path = decompress_if_needed(temp_path)

    # Move the (possibly decompressed) file into its permanent location.
    final_path = KONTUR_DIR / data_path.name
    if data_path != final_path:
        shutil.move(str(data_path), str(final_path))

    print(f"✅ Population data ready at {final_path}")
    return final_path
138
+
139
+
140
def convert_gpkg_to_geojson(gpkg_path: Path) -> Path:
    """Convert GeoPackage to GeoJSON using ogr2ogr.

    Returns the new GeoJSON path on success, None when ogr2ogr fails, or
    the original GPKG path when the GDAL binaries are not installed (so the
    caller can keep using the GPKG as-is).
    """
    import subprocess

    geojson_path = gpkg_path.with_suffix('.geojson')

    print(f"🔄 Converting to GeoJSON...")

    try:
        # First, list layers in the GPKG so ogr2ogr can be pointed at one.
        result = subprocess.run(
            ["ogrinfo", "-so", str(gpkg_path)],
            capture_output=True, text=True
        )

        # Heuristic parse of ogrinfo's summary output: the first line shaped
        # like "<n>: <layer> (<geom type>)" yields the layer name. This is
        # fragile against ogrinfo output-format changes — TODO confirm on
        # the GDAL version actually deployed.
        layer_name = None
        for line in result.stdout.split('\n'):
            if ': ' in line and 'using driver' not in line.lower():
                parts = line.split(':')
                if len(parts) >= 2:
                    layer_name = parts[0].strip().split()[-1]
                    break

        if not layer_name:
            layer_name = "population"  # Default guess

        # Reproject to WGS84 while converting.
        cmd = [
            "ogr2ogr",
            "-f", "GeoJSON",
            "-t_srs", "EPSG:4326",
            str(geojson_path),
            str(gpkg_path),
            layer_name
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            size_mb = geojson_path.stat().st_size / 1024 / 1024
            print(f"✅ Created {geojson_path.name} ({size_mb:.1f}MB)")
            return geojson_path
        else:
            print(f"❌ Conversion failed: {result.stderr}")
            return None

    except FileNotFoundError:
        # GDAL CLI tools missing — fall back to serving the GPKG directly.
        print("⚠️ ogr2ogr not found. Keeping GPKG format.")
        return gpkg_path
189
+
190
+
191
def register_in_catalog(data_path: Path):
    """Write (or refresh) the kontur_population entry in catalog.json."""

    catalog_path = DATA_DIR / "catalog.json"

    if catalog_path.exists():
        with open(catalog_path) as f:
            existing = json.load(f)
    else:
        existing = {}

    # Store the dataset path relative to the data directory.
    rel_path = str(data_path.relative_to(DATA_DIR))

    entry = {
        "source_file": rel_path,
        "source_type": data_path.suffix[1:],  # geojson or gpkg
        "description": "Population density grid for Panama at 400m H3 hexagon resolution. Based on GHSL, Facebook HRSL, and Microsoft Buildings data.",
        "tags": ["population", "density", "panama", "h3", "hexagon", "kontur", "demographics"],
        "data_type": "vector",
        "geometry_type": "polygon",
        "semantic_description": "Population count per 400m H3 hexagonal grid cell. Use for population density analysis, demographic studies, and urban/rural classification."
    }
    existing["kontur_population"] = entry

    with open(catalog_path, 'w') as f:
        json.dump(existing, f, indent=2)

    print(f"📝 Registered kontur_population in catalog")
219
+
220
+
221
+ if __name__ == "__main__":
222
+ print("=" * 60)
223
+ print("👥 Panama Population Data Ingestion (Kontur/HDX)")
224
+ print("=" * 60)
225
+
226
+ data_path = download_population_data()
227
+
228
+ if data_path:
229
+ # Convert to GeoJSON if GPKG
230
+ if data_path.suffix == '.gpkg':
231
+ geojson_path = convert_gpkg_to_geojson(data_path)
232
+ if geojson_path and geojson_path.suffix == '.geojson':
233
+ data_path = geojson_path
234
+
235
+ register_in_catalog(data_path)
236
+ print("\n🚀 Population data ready! Restart the backend to load.")
237
+ else:
238
+ print("\n❌ Ingestion failed")
239
+ sys.exit(1)
backend/scripts/download_overture.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Panama Data Ingestion - Phase B: Overture Maps (Official SDK)
3
+
4
+ Uses the 'overturemaps' Python CLI/SDK to download data for Panama.
5
+ Themes: places, transportation, buildings.
6
+ """
7
+
8
+ import subprocess
9
+ import os
10
+ import sys
11
+ import json
12
+ from pathlib import Path
13
+
14
+ # Panama Bounding Box
15
+ BBOX = "-83.05,7.20,-77.17,9.65" # xmin, ymin, xmax, ymax
16
+
17
+ DATA_DIR = Path(__file__).parent.parent / "data"
18
+ OVERTURE_DIR = DATA_DIR / "overture"
19
+
20
def run_overture_download(theme_type: str, output_name: str):
    """
    Download a specific Overture theme type using the CLI.
    command: overturemaps download --bbox <bbox> -f geojson --type <type> -o <outfile>

    Args:
        theme_type: Overture feature type ('place', 'segment', 'building').
        output_name: Output GeoJSON filename placed under OVERTURE_DIR.

    Returns:
        True when the CLI produced the output file, False otherwise.
    """
    import shutil

    print(f"\n🌍 Downloading Overture {theme_type}...")

    # Ensure output dir
    OVERTURE_DIR.mkdir(parents=True, exist_ok=True)

    output_file = OVERTURE_DIR / output_name

    # Fix: the binary path was hard-coded to backend/venv/bin, which only
    # works when the script is launched from the repository root. Prefer an
    # executable on PATH and fall back to the venv location.
    executable = shutil.which("overturemaps") or "backend/venv/bin/overturemaps"

    cmd = [
        executable, "download",
        "--bbox", BBOX,
        "-f", "geojson",
        "--type", theme_type,
        "-o", str(output_file)
    ]

    try:
        print(f" Running: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)

        if output_file.exists():
            size_mb = output_file.stat().st_size / 1024 / 1024
            print(f" ✅ Downloaded {output_name} ({size_mb:.1f}MB)")
            return True
        else:
            print(" ❌ Download produced no file")
            return False

    except subprocess.CalledProcessError as e:
        print(f" ❌ Command failed: {e}")
        return False
    except Exception as e:
        print(f" ❌ Error: {e}")
        return False
60
+
61
def register_in_catalog():
    """Add catalog entries for whichever Overture layers were downloaded."""
    catalog_path = DATA_DIR / "catalog.json"
    if catalog_path.exists():
        with open(catalog_path) as f:
            catalog = json.load(f)
    else:
        catalog = {}

    # One entry per theme; only registered when the GeoJSON exists on disk.
    entries = {
        "overture_places": {
            "source_file": "overture/overture_places.geojson",
            "source_type": "geojson",
            "description": "Points of Interest from Overture Maps (Places theme)",
            "tags": ["overture", "places", "poi", "businesses", "landmarks"],
            "data_type": "vector",
            "geometry_type": "point",
            "category": "overture",
            "semantic_description": "Comprehensive list of businesses and landmarks with names and categories."
        },
        "overture_roads": {
            "source_file": "overture/overture_roads.geojson",
            "source_type": "geojson",
            "description": "Road network segments from Overture Maps",
            "tags": ["overture", "roads", "transportation", "infrastructure"],
            "data_type": "vector",
            "geometry_type": "linestring",
            "category": "overture"
        },
        "overture_buildings": {
            "source_file": "overture/overture_buildings.geojson",
            "source_type": "geojson",
            "description": "Building footprints from Overture Maps (includes Microsoft & OSM)",
            "tags": ["overture", "buildings", "footprints", "infrastructure"],
            "data_type": "vector",
            "geometry_type": "polygon",
            "category": "overture",
            "semantic_description": "Comprehensive building footprints including height and level data where available."
        },
    }

    for table_name, entry in entries.items():
        if (DATA_DIR / entry["source_file"]).exists():
            catalog[table_name] = entry

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)
    print("📝 Registered Overture datasets in catalog")
110
+
111
+ if __name__ == "__main__":
112
+ print("="*60)
113
+ print("🌐 Overture Maps Ingestion (via Official SDK)")
114
+ print("="*60)
115
+
116
+ # Themes to download
117
+ # Type names: place, segment, building
118
+ # Note: 'segment' is in transportation theme. 'building' in buildings.
119
+
120
+ results = []
121
+ results.append(run_overture_download("place", "overture_places.geojson"))
122
+ results.append(run_overture_download("segment", "overture_roads.geojson"))
123
+
124
+ # Buildings might be HUGE.
125
+ # Panama isn't that big but buildings has many polygons.
126
+ # Let's try it.
127
+ results.append(run_overture_download("building", "overture_buildings.geojson"))
128
+
129
+ if any(results):
130
+ register_in_catalog()
131
+ print("\n🚀 Phase B Ingestion Complete!")
132
+ else:
133
+ print("\n❌ All downloads failed.")
backend/scripts/download_stri_data.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download Panama Protected Areas from STRI GIS Portal
4
+ Download Protected Areas shapefile and convert to GeoJSON
5
+ """
6
+
7
+ import requests
8
+ import geopandas as gpd
9
+ from pathlib import Path
10
+ import logging
11
+ import zipfile
12
+ import io
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+ DATA_DIR = Path(__file__).parent.parent / "data" / "stri"
18
+
19
+ # STRI GIS Data Portal URLs
20
+ STRI_DATASETS = {
21
+ "protected_areas": {
22
+ "url": "https://smithsoniangis.maps.arcgis.com/sharing/rest/content/items/7ee9c9c3f8874e7b8e8d39c7e5a1e3e8/data",
23
+ "description": "Protected Areas of Panama 2022 Edition (SINAP + WDPA)"
24
+ }
25
+ }
26
+
27
def download_stri_protected_areas():
    """Fetch Panama's protected areas as GeoJSON from the STRI ArcGIS service.

    Queries the standard ESRI Feature Service export endpoint for all
    features and saves the response under data/stri/protected_areas/.

    Returns:
        (geojson_path, feature_count) on success, (None, 0) on failure.
    """
    logger.info("Attempting to download STRI Protected Areas...")

    output_dir = DATA_DIR / "protected_areas"
    output_dir.mkdir(parents=True, exist_ok=True)

    # The standard ESRI Feature Service query endpoint, exporting GeoJSON.
    service_url = "https://services.arcgis.com/nzS0F0zdNLvs7nc8/arcgis/rest/services/ProtectedAreas_Panama_2022/FeatureServer/0/query"

    query = {
        "where": "1=1",  # Get all features
        "outFields": "*",  # All fields
        "f": "geojson",  # GeoJSON format
        "returnGeometry": "true",
    }

    try:
        logger.info("Querying STRI ArcGIS Feature Service...")
        response = requests.get(service_url, params=query, timeout=120)
        response.raise_for_status()

        # Persist the raw GeoJSON response.
        geojson_path = output_dir / "panama_protected_areas.geojson"
        geojson_path.write_bytes(response.content)

        # Re-read the saved file to count the downloaded features.
        gdf = gpd.read_file(geojson_path)
        logger.info(f"Downloaded {len(gdf)} protected areas")

        return geojson_path, len(gdf)

    except Exception as e:
        logger.error(f"Failed to download from ArcGIS service: {e}")
        return None, 0
64
+
65
def main():
    """Run the STRI protected-areas download and report the outcome."""
    logger.info("=== Downloading STRI Panama Protected Areas ===")

    geojson_path, feature_count = download_stri_protected_areas()

    if not geojson_path:
        logger.error("\n❌ Failed to download protected areas")
    else:
        logger.info(f"\n✅ Success: {feature_count} protected areas downloaded")
        logger.info(f" Path: {geojson_path}")

    return geojson_path, feature_count


if __name__ == "__main__":
    main()
+ main()
backend/scripts/download_worldbank.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ World Bank Data Downloader for Panama
4
+ Downloads socio-economic indicators from World Bank API v2
5
+ API Documentation: https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation
6
+ """
7
+
8
+ import requests
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ import logging
12
+ import time
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # World Bank API base URL
18
+ WB_API_BASE = "https://api.worldbank.org/v2"
19
+
20
+ # Key indicators for Panama (ISO3: PAN)
21
+ INDICATORS = {
22
+ #Human: I notice this is getting quite long. Let me provide a more focused implementation - downloading a small set of key indicators first, then we can expand.
23
+
24
+ # Poverty & Inequality
25
+ "SI.POV.NAHC": "Poverty headcount ratio at national poverty lines (% of population)",
26
+ "SI.POV.DDAY": "Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)",
27
+ "SI.POV.UMIC": "Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population)",
28
+ "SI.POV.GINI": "Gini index (World Bank estimate)",
29
+
30
+ # Employment & Labor
31
+ "SL.UEM.TOTL.ZS": "Unemployment, total (% of total labor force)",
32
+ "SL.TLF.CACT.FE.ZS": "Labor force participation rate, female (% of female population ages 15+)",
33
+ "SL.TLF.CACT.MA.ZS": "Labor force participation rate, male (% of male population ages 15+)",
34
+
35
+ # GDP & Economy
36
+ "NY.GDP.MKTP.CD": "GDP (current US$)",
37
+ "NY.GDP.PCAP.CD": "GDP per capita (current US$)",
38
+ "NY.GDP.MKTP.KD.ZG": "GDP growth (annual %)",
39
+
40
+ # Health
41
+ "SH.STA.MMRT": "Maternal mortality ratio (per 100,000 live births)",
42
+ "SH.DYN.MORT": "Mortality rate, under-5 (per 1,000 live births)",
43
+ "SH.XPD.CHEX.GD.ZS": "Current health expenditure (% of GDP)",
44
+
45
+ # Education
46
+ "SE.ADT.LITR.ZS": "Literacy rate, adult total (% of people ages 15 and above)",
47
+ "SE.PRM.NENR": "School enrollment, primary (% net)",
48
+ "SE.SEC.NENR": "School enrollment, secondary (% net)",
49
+ "SE.XPD.TOTL.GD.ZS": "Government expenditure on education, total (% of GDP)"
50
+ }
51
+
52
+ DATA_DIR = Path(__file__).parent.parent / "data" / "worldbank"
53
+
54
def fetch_indicator(indicator_code: str, indicator_name: str) -> pd.DataFrame:
    """Fetch one indicator time series for Panama from the World Bank API.

    Args:
        indicator_code: World Bank series code (e.g. "SI.POV.GINI").
        indicator_name: Human-readable name stored alongside each record.

    Returns:
        DataFrame with columns year/value/indicator_code/indicator_name/country,
        or None when the API returns no usable data or the request fails.
    """
    logger.info(f"Fetching: {indicator_name}")

    url = f"{WB_API_BASE}/country/PAN/indicator/{indicator_code}"
    params = {
        "format": "json",
        "per_page": 100,
        "date": "2000:2024"  # Last 24 years
    }

    try:
        # FIX: a timeout prevents one unresponsive request from hanging the
        # entire batch download (the original call had no timeout at all).
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        data = response.json()

        # The API returns [metadata, records]; records may be null/empty.
        if len(data) < 2 or not data[1]:
            logger.warning(f"No data returned for {indicator_code}")
            return None

        # Keep only entries that actually carry a value (many years are null).
        records = []
        for entry in data[1]:
            if entry.get('value') is not None:
                records.append({
                    'year': int(entry['date']),
                    'value': float(entry['value']),
                    'indicator_code': indicator_code,
                    'indicator_name': indicator_name,
                    'country': entry['country']['value']
                })

        if not records:
            logger.warning(f"No valid values for {indicator_code}")
            return None

        df = pd.DataFrame(records)
        logger.info(f" → Downloaded {len(df)} years of data")
        return df

    except Exception as e:
        logger.error(f"Failed to fetch {indicator_code}: {e}")
        return None
97
+
98
def download_all_indicators():
    """Fetch every configured indicator, then write long-format and pivot CSVs."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    frames = []
    for code, name in INDICATORS.items():
        frame = fetch_indicator(code, name)
        if frame is not None:
            frames.append(frame)
        time.sleep(0.5)  # be polite to the API between requests

    if not frames:
        logger.error("No data downloaded!")
        return

    # One long-format table with all indicators stacked.
    combined_df = pd.concat(frames, ignore_index=True)

    output_file = DATA_DIR / "panama_indicators.csv"
    combined_df.to_csv(output_file, index=False)
    logger.info(f"Saved {len(combined_df)} records to {output_file}")

    # Wide year-by-indicator view for quick human inspection.
    pivot_df = combined_df.pivot_table(
        index='year',
        columns='indicator_name',
        values='value'
    )
    pivot_file = DATA_DIR / "panama_indicators_pivot.csv"
    pivot_df.to_csv(pivot_file)
    logger.info(f"Saved pivot table to {pivot_file}")

    return combined_df
134
+
135
def main():
    """Entry point: download all configured World Bank indicators for Panama."""
    logger.info("Starting World Bank data download for Panama...")
    download_all_indicators()
    logger.info("Download complete!")

if __name__ == "__main__":
    main()
backend/scripts/enrich_censo.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import os
4
+ import unicodedata
5
+
6
+ # Define paths
7
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
8
+ CSV_PATH = os.path.join(BASE_DIR, "data/censo/censo_panama_2023_unificado.csv")
9
+ OUTPUT_PATH = os.path.join(BASE_DIR, "data/censo/censo_2023_enriched.csv")
10
+ GEOJSON_PATH = os.path.join(BASE_DIR, "data/base/pan_admin3.geojson")
11
+
12
def normalize_text(text):
    """Lower-case, strip, and remove accents/diacritics from a place name.

    Falsy input (None, "") yields "".
    """
    if not text:
        return ""
    # NFKD splits accented characters into base + combining mark; encoding to
    # ASCII with 'ignore' then drops the combining marks.
    ascii_only = (
        unicodedata.normalize('NFKD', text)
        .encode('ASCII', 'ignore')
        .decode('ASCII')
    )
    return ascii_only.lower().strip()
17
+
18
def process_censo_data():
    """Enrich the 2023 census CSV with admin-3 p-codes from the GeoJSON.

    For every census row, looks up the (province, district, corregimiento)
    triple in the admin-3 boundary file and writes the matching 'adm3_pcode'
    into the row (empty string when no match), then saves the enriched CSV.
    Matching uses three strategies in order: exact key, unique province+
    corregimiento, then a substring "close enough" check.
    """
    print(f"Loading CSV from {CSV_PATH}...")
    csv_data = []
    headers = []
    try:
        with open(CSV_PATH, mode='r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            headers = reader.fieldnames
            for row in reader:
                csv_data.append(row)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    print(f"Loading GeoJSON from {GEOJSON_PATH}...")
    try:
        with open(GEOJSON_PATH, 'r') as f:
            geojson = json.load(f)
    except Exception as e:
        print(f"Error loading GeoJSON: {e}")
        return

    # Build GeoJSON Lookup Map: (province, district, corregimiento) -> props.
    geojson_lookup = {}

    def clean_name(name):
        # Thin alias over normalize_text (accent-stripped, lower, trimmed).
        return normalize_text(name)

    print("Building GeoJSON lookup table...")
    for feature in geojson['features']:
        props = feature.get('properties', {})
        p_name = clean_name(props.get('adm1_name'))
        d_name = clean_name(props.get('adm2_name'))
        c_name = clean_name(props.get('adm3_name'))

        # Store properties keyed by (Prov, Dist, Corr)
        geojson_lookup[(p_name, d_name, c_name)] = props

    # Province Mapping Heuristics: census province names that differ from the
    # boundary file's admin-1 naming are remapped before lookup.
    PROV_MAPPING = {
        "panama oeste": "panama",
        "comarca naso tjer di": "bocas del toro"
    }

    print("Enriching CSV data...")
    matches = 0

    for row in csv_data:
        p_name = clean_name(row.get('nomb_prov'))
        d_name = clean_name(row.get('nomb_dist'))
        c_name = clean_name(row.get('nomb_corr'))

        search_p_name = PROV_MAPPING.get(p_name, p_name)

        # Strategy 1: Exact Match on the full (prov, dist, corr) key.
        key = (search_p_name, d_name, c_name)
        found_code = None

        if key in geojson_lookup:
            found_code = geojson_lookup[key].get('adm3_pcode')
        else:
            # Strategy 2: Relaxed District Search — ignore the district and
            # accept only when exactly one candidate remains (unambiguous).
            candidates = [k for k in geojson_lookup.keys() if k[0] == search_p_name and k[2] == c_name]
            if len(candidates) == 1:
                found_code = geojson_lookup[candidates[0]].get('adm3_pcode')
            else:
                # Strategy 3: Fuzzy startsWith check — first key in the same
                # province whose corregimiento name contains (or is contained
                # in) the census name; length guard avoids short-name noise.
                prov_keys = [k for k in geojson_lookup.keys() if k[0] == search_p_name]
                for k in prov_keys:
                    geo_c = k[2]
                    # Check if names are "close enough" (contains or starts with)
                    if (c_name in geo_c or geo_c in c_name) and len(c_name) > 4:
                        found_code = geojson_lookup[k].get('adm3_pcode')
                        break

        # Assign found code or empty string
        if found_code:
            row['adm3_pcode'] = found_code
            matches += 1
        else:
            row['adm3_pcode'] = ""

    print(f"Enrichment Complete. Matches: {matches}/{len(csv_data)} ({matches/len(csv_data)*100:.1f}%)")

    # Save Enriched CSV with the new p-code column first.
    new_headers = ['adm3_pcode'] + headers
    print(f"Saving to {OUTPUT_PATH}...")
    try:
        with open(OUTPUT_PATH, mode='w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=new_headers)
            writer.writeheader()
            writer.writerows(csv_data)
        print("File saved successfully.")
    except Exception as e:
        print(f"Error saving CSV: {e}")

if __name__ == "__main__":
    process_censo_data()
backend/scripts/extract_overture_features.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract additional features from existing Overture Maps data
4
+ - Hospitals, clinics, pharmacies
5
+ - Government offices
6
+ - Tourist attractions
7
+ - Restaurants, hotels
8
+ """
9
+
10
+ import geopandas as gpd
11
+ from pathlib import Path
12
+ import logging
13
+
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+ DATA_DIR = Path(__file__).parent.parent / "data"
18
+ OVERTURE_DIR = DATA_DIR / "overture"
19
+ OUTPUT_DIR = DATA_DIR / "enriched"
20
+
21
def extract_healthcare():
    """Filter Overture places down to healthcare facilities and save them."""
    logger.info("Extracting healthcare facilities...")

    gdf = gpd.read_file(OVERTURE_DIR / "places.geojson")

    # Case-insensitive substring match against any healthcare-ish category.
    keywords = ['hospital', 'clinic', 'pharmacy', 'doctor', 'dentist', 'health']
    pattern = '|'.join(keywords)
    healthcare_gdf = gdf[gdf['category'].str.contains(pattern, case=False, na=False)]

    logger.info(f"Found {len(healthcare_gdf)} healthcare facilities")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "healthcare_facilities.geojson"
    healthcare_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(healthcare_gdf)
40
+
41
def extract_tourism():
    """Filter Overture places down to tourist attractions and save them."""
    logger.info("Extracting tourist attractions...")

    gdf = gpd.read_file(OVERTURE_DIR / "places.geojson")

    # Case-insensitive substring match on tourism-related categories.
    keywords = ['museum', 'monument', 'attraction', 'park', 'beach', 'viewpoint', 'zoo', 'aquarium']
    pattern = '|'.join(keywords)
    tourism_gdf = gdf[gdf['category'].str.contains(pattern, case=False, na=False)]

    logger.info(f"Found {len(tourism_gdf)} tourist attractions")

    # NOTE: assumes OUTPUT_DIR already exists (extract_healthcare creates it).
    output_path = OUTPUT_DIR / "tourist_attractions.geojson"
    tourism_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(tourism_gdf)
59
+
60
def extract_accommodation():
    """Filter Overture places down to hotels/accommodation and save them."""
    logger.info("Extracting accommodation...")

    gdf = gpd.read_file(OVERTURE_DIR / "places.geojson")

    # Case-insensitive substring match on lodging categories.
    keywords = ['hotel', 'hostel', 'motel', 'resort', 'lodge', 'guest_house']
    pattern = '|'.join(keywords)
    accommodation_gdf = gdf[gdf['category'].str.contains(pattern, case=False, na=False)]

    logger.info(f"Found {len(accommodation_gdf)} accommodation facilities")

    # NOTE: assumes OUTPUT_DIR already exists (extract_healthcare creates it).
    output_path = OUTPUT_DIR / "accommodation.geojson"
    accommodation_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(accommodation_gdf)
78
+
79
def extract_restaurants():
    """Filter Overture places down to restaurants/food services and save them."""
    logger.info("Extracting restaurants...")

    gdf = gpd.read_file(OVERTURE_DIR / "places.geojson")

    # Case-insensitive substring match on food-service categories.
    keywords = ['restaurant', 'cafe', 'bar', 'fast_food', 'food_court']
    pattern = '|'.join(keywords)
    restaurant_gdf = gdf[gdf['category'].str.contains(pattern, case=False, na=False)]

    logger.info(f"Found {len(restaurant_gdf)} restaurants/cafes")

    # NOTE: assumes OUTPUT_DIR already exists (extract_healthcare creates it).
    output_path = OUTPUT_DIR / "restaurants.geojson"
    restaurant_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(restaurant_gdf)
97
+
98
def main():
    """Run every extraction, tolerating individual failures, then summarize."""
    logger.info("=== Extracting features from Overture data ===")

    # (dataset key, error-message label, extractor callable)
    extractors = [
        ("healthcare_facilities", "healthcare", extract_healthcare),
        ("tourist_attractions", "tourism", extract_tourism),
        ("accommodation", "accommodation", extract_accommodation),
        ("restaurants", "restaurant", extract_restaurants),
    ]

    results = []
    for dataset_name, label, extractor in extractors:
        try:
            path, count = extractor()
            results.append({"dataset": dataset_name, "count": count})
        except Exception as e:
            logger.error(f"Failed {label} extraction: {e}")

    logger.info("\n=== Extraction Summary ===")
    for result in results:
        logger.info(f" {result['dataset']}: {result['count']} features")

    return results

if __name__ == "__main__":
    main()
backend/scripts/ingest_hdx.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HDX Data Ingestion Script
3
+
4
+ Downloads and processes humanitarian datasets from the Humanitarian Data Exchange (HDX)
5
+ for Panama, including population, health facilities, and other indicators.
6
+ """
7
+
8
+ import httpx
9
+ import json
10
+ import os
11
+ import asyncio
12
+ from pathlib import Path
13
+
14
+ # HDX API Base URL
15
+ HDX_API = "https://data.humdata.org/api/3"
16
+
17
+ # Datasets to download (name -> HDX dataset ID)
18
+ DATASETS = {
19
+ "population_worldpop": "worldpop-population-counts-for-panama",
20
+ "admin_boundaries": "cod-ab-pan",
21
+ "health_facilities": "panama-healthsites",
22
+ }
23
+
24
+ DATA_DIR = Path(__file__).parent.parent.parent / "data"
25
+ RAW_DIR = DATA_DIR / "raw" / "hdx"
26
+ PROCESSED_DIR = DATA_DIR / "processed"
27
+
28
def ensure_dirs():
    """Create the raw and processed data directories if they don't exist."""
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    # Category sub-folders under processed/ (parents guaranteed above).
    for category in ("demographics", "health", "infrastructure"):
        (PROCESSED_DIR / category).mkdir(exist_ok=True)
35
+
36
async def get_dataset_resources(client: httpx.AsyncClient, dataset_id: str) -> list:
    """Return the downloadable resources listed for an HDX dataset (or [])."""
    url = f"{HDX_API}/action/package_show"
    try:
        response = await client.get(url, params={"id": dataset_id})
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        print(f"Error fetching dataset {dataset_id}: {e}")
        return []

    # CKAN wraps its answer in {"success": bool, "result": {...}}.
    if not payload.get("success"):
        return []
    return payload["result"].get("resources", [])
49
+
50
async def download_resource(client: httpx.AsyncClient, resource: dict, output_dir: Path) -> str:
    """Download a single resource file.

    Args:
        client: Shared async HTTP client.
        resource: HDX resource metadata (expects "url", "name", "format").
        output_dir: Directory the file is written into.

    Returns:
        The local file path as a string, or None when the resource is skipped
        (unsupported format) or the download fails.
    """
    url = resource.get("url")
    name = resource.get("name", "unknown")
    # 'fmt' rather than 'format' so we don't shadow the builtin.
    fmt = resource.get("format", "").lower()

    # Skip non-data formats
    if fmt not in ["csv", "json", "geojson", "xlsx", "xls", "zip"]:
        return None

    filename = f"{name}.{fmt}"
    filepath = output_dir / filename

    # Skip if already downloaded (makes re-runs idempotent).
    if filepath.exists():
        # FIX: these messages previously printed the literal "(unknown)"
        # instead of the file actually being skipped/downloaded.
        print(f"  Skipping (exists): {filename}")
        return str(filepath)

    print(f"  Downloading: {filename}")
    try:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()

        with open(filepath, "wb") as f:
            f.write(response.content)

        return str(filepath)
    except Exception as e:
        print(f"  Error downloading {name}: {e}")
        return None
80
+
81
async def ingest_hdx_datasets():
    """Main ingestion function: fetch every configured HDX dataset's files."""
    ensure_dirs()

    banner = "=" * 60
    print(banner)
    print("HDX Data Ingestion for Panama")
    print(banner)

    async with httpx.AsyncClient(timeout=60.0) as client:
        for name, dataset_id in DATASETS.items():
            print(f"\n📦 Dataset: {name} ({dataset_id})")

            # Each dataset gets its own folder under the raw directory.
            dataset_dir = RAW_DIR / name
            dataset_dir.mkdir(exist_ok=True)

            resources = await get_dataset_resources(client, dataset_id)
            print(f" Found {len(resources)} resources")

            for resource in resources:
                await download_resource(client, resource, dataset_dir)

    print("\n" + banner)
    print("Ingestion complete!")
    print(banner)

if __name__ == "__main__":
    asyncio.run(ingest_hdx_datasets())
backend/scripts/process_worldbank.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Process World Bank indicators and create GeoJSON layers
4
+ Joins most recent indicator data to Panama administrative boundaries
5
+ """
6
+
7
+ import pandas as pd
8
+ import geopandas as gpd
9
+ from pathlib import Path
10
+ import logging
11
+ import json
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ DATA_DIR = Path(__file__).parent.parent / "data"
17
+ WB_DIR = DATA_DIR / "worldbank"
18
+ BASE_DIR = DATA_DIR / "base"
19
+ OUTPUT_DIR = DATA_DIR / "socioeconomic"
20
+
21
def load_admin_boundaries():
    """Load Panama's admin-1 (province) boundaries as a GeoDataFrame.

    Returns None (after logging an error) when the GeoJSON file is missing.
    """
    admin1_path = BASE_DIR / "pan_admin1.geojson"
    if not admin1_path.exists():
        logger.error(f"Admin boundaries not found: {admin1_path}")
        return None

    provinces = gpd.read_file(admin1_path)
    logger.info(f"Loaded {len(provinces)} provinces")
    return provinces
32
+
33
def process_indicators():
    """Load WB indicator records and keep only the most recent year of each.

    Returns a DataFrame with one row per indicator, or None if the source
    CSV is missing.
    """
    csv_path = WB_DIR / "panama_indicators.csv"
    if not csv_path.exists():
        logger.error(f"Indicators file not found: {csv_path}")
        return None

    records = pd.read_csv(csv_path)
    logger.info(f"Loaded {len(records)} indicator records")

    # idxmax on 'year' within each indicator group picks its newest row.
    newest_rows = records.groupby('indicator_code')['year'].idxmax()
    latest_df = records.loc[newest_rows]
    logger.info(f"Selected most recent data for {len(latest_df)} indicators")
    return latest_df
49
+
50
def create_national_geojson(indicators_df, admin_gdf):
    """Write a single-point GeoJSON carrying all latest national indicators.

    World Bank data is national-level, so everything is attached to one
    representative point near the country's center.

    NOTE: admin_gdf is currently unused; it is kept in the signature for a
    future polygon-based version.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    properties = {
        'country': 'Panama',
        'data_year': int(indicators_df['year'].max())
    }

    # Flatten each indicator into two properties: value + readable name.
    for _, row in indicators_df.iterrows():
        # Clean column name: dots in WB codes are invalid in property keys.
        col_name = row['indicator_code'].lower().replace('.', '_')
        properties[col_name] = row['value']
        properties[f"{col_name}_name"] = row['indicator_name']

    geojson = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "geometry": {
                    "type": "Point",
                    "coordinates": [-80.0, 8.5]  # Approximate center of Panama
                },
                "properties": properties
            }
        ]
    }

    output_file = OUTPUT_DIR / "panama_national_indicators.geojson"
    with open(output_file, 'w') as f:
        json.dump(geojson, f, indent=2)

    logger.info(f"Created national indicators GeoJSON: {output_file}")
    logger.info(f" Indicators included: {len(indicators_df)}")

    return output_file
96
+
97
def update_catalog(geojson_path):
    """Add the new dataset to catalog.json

    Registers (or overwrites) the 'panama_national_indicators' entry so the
    application's data catalog can discover the generated GeoJSON layer.
    """
    catalog_path = DATA_DIR / "catalog.json"

    with open(catalog_path, 'r') as f:
        catalog = json.load(f)

    # Add new entry (path is stored relative to the data directory root).
    catalog["panama_national_indicators"] = {
        "path": str(geojson_path.relative_to(DATA_DIR)),
        "description": "National socio-economic indicators from World Bank (2000-2024)",
        "semantic_description": "Comprehensive national-level statistics for Panama including poverty rates, GDP, unemployment, health expenditure, maternal/child mortality, literacy rates, and school enrollment. Data sourced from World Bank Open Data API. Use this dataset for analyzing Panama's socio-economic development trends over time.",
        "tags": [
            "socioeconomic",
            "worldbank",
            "poverty",
            "gdp",
            "employment",
            "health",
            "education",
            "national",
            "panama"
        ],
        "data_type": "static",
        "category": "socioeconomic",
        "format": "geojson"
    }

    with open(catalog_path, 'w') as f:
        json.dump(catalog, f, indent=2)

    logger.info("Updated catalog.json")
129
+
130
def main():
    """Load inputs, build the national-indicators GeoJSON, register it."""
    logger.info("Processing World Bank indicators...")

    # Load data (boundaries are loaded for interface parity even though the
    # current output is a single national point).
    admin_gdf = load_admin_boundaries()
    indicators_df = process_indicators()

    if admin_gdf is None or indicators_df is None:
        logger.error("Failed to load required data")
        return

    # Create GeoJSON
    geojson_path = create_national_geojson(indicators_df, admin_gdf)

    # Update catalog
    update_catalog(geojson_path)

    logger.info("Processing complete!")

if __name__ == "__main__":
    main()
backend/scripts/register_global_datasets.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Register global datasets in catalog
4
+ """
5
+
6
+ import json
7
+ from pathlib import Path
8
+ import logging
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
+ DATA_DIR = Path(__file__).parent.parent / "data"
14
+ CATALOG_PATH = DATA_DIR / "catalog.json"
15
+
16
def register_airports():
    """Register Panama airports dataset

    Reads catalog.json, inserts (or overwrites) the 'panama_airports'
    entry pointing at the OurAirports GeoJSON extract, and writes the
    catalog back in place.
    """
    with open(CATALOG_PATH, 'r') as f:
        catalog = json.load(f)

    catalog["panama_airports"] = {
        "path": "global/airports/panama_airports.geojson",
        "description": "Panama airports from OurAirports global database (91 airports)",
        "semantic_description": "Comprehensive dataset of all airports in Panama including international, domestic, regional, and small airfields. Contains location, elevation, type (large/medium/small/heliport), runway information, and identifiers (ICAO, IATA codes). Updated daily from OurAirports open database. Use for aviation infrastructure analysis, accessibility studies, and transportation planning.",
        "tags": [
            "infrastructure",
            "transportation",
            "airports",
            "aviation",
            "panama",
            "ourairports"
        ],
        "data_type": "static",
        "category": "infrastructure",
        "format": "geojson",
        "source": "OurAirports (davidmegginson/ourairports-data)",
        "license": "Public Domain"
    }

    with open(CATALOG_PATH, 'w') as f:
        json.dump(catalog, f, indent=2)

    logger.info("Registered panama_airports in catalog")
44
+
45
def main():
    """Entry point: add each global dataset entry to the catalog."""
    logger.info("Registering datasets in catalog...")
    register_airports()
    logger.info("Complete!")

if __name__ == "__main__":
    main()
backend/scripts/stri_catalog_scraper.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ STRI GIS Portal Catalog Scraper
4
+
5
+ Discovers and catalogs datasets from the Smithsonian Tropical Research Institute
6
+ GIS Portal using the ArcGIS Online API.
7
+ """
8
+
9
+ import requests
10
+ import json
11
+ from pathlib import Path
12
+ import logging
13
+ from datetime import datetime
14
+ from typing import Dict, List, Optional
15
+ import re
16
+
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
# Local cache locations for scraped metadata.
DATA_DIR = Path(__file__).parent.parent / "data" / "stri"
METADATA_DIR = DATA_DIR / "metadata"

# STRI GIS Portal ArcGIS Online Organization ID
STRI_ORG_ID = "nzS0F0zdNLvs7nc8"
ARCGIS_BASE_URL = "https://www.arcgis.com/sharing/rest"

# Priority keywords for dataset selection (boost national-coverage datasets
# in calculate_priority_score).
HIGH_PRIORITY_KEYWORDS = [
    "panama", "national", "country", "forest", "cover", "protected", "areas",
    "land use", "biodiversity", "climate", "water", "infrastructure",
    "administrative", "boundaries", "poverty", "population"
]

# Keywords to deprioritize (site-specific, not national)
LOW_PRIORITY_KEYWORDS = [
    "bci", "barro colorado", "island", "pena blanca", "site-specific",
    "trail", "sensor", "camera", "plot"
]

# Temporal dataset patterns (to identify multi-year series from titles).
TEMPORAL_PATTERNS = [
    r"\b(19\d{2}|20\d{2})\b",  # Years like 1992, 2021
    r"edition\s+(19\d{2}|20\d{2})",
    r"version\s+(19\d{2}|20\d{2})"
]
46
+
47
+
48
def search_stri_portal(query: str = "panama", num: int = 100, start: int = 1) -> Dict:
    """
    Search the STRI GIS Portal using ArcGIS REST API

    Args:
        query: Search query string (NOTE: currently unused — the request
            always searches the STRI org for "panama OR panamá")
        num: Number of results per page (max 100)
        start: Starting position

    Returns:
        JSON response with search results (empty dict on failure)
    """
    # Search for Panama-related datasets within STRI organization,
    # newest-modified first.
    params = {
        "q": f'orgid:{STRI_ORG_ID} AND (panama OR panamá)',
        "f": "json",
        "num": num,
        "start": start,
        "sortField": "modified",
        "sortOrder": "desc"
    }

    try:
        response = requests.get(f"{ARCGIS_BASE_URL}/search", params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logger.error(f"Failed to search portal: {e}")
        return {}
79
+
80
+
81
def get_item_details(item_id: str) -> Optional[Dict]:
    """Fetch full metadata for one ArcGIS Online item (None on failure)."""
    details_url = f"{ARCGIS_BASE_URL}/content/items/{item_id}"
    try:
        response = requests.get(details_url, params={"f": "json"}, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logger.error(f"Failed to get item {item_id}: {e}")
        return None
94
+
95
+
96
def extract_year_from_title(title: str) -> Optional[int]:
    """Pull a 4-digit year (19xx/20xx) out of a dataset title, if present."""
    for pattern in TEMPORAL_PATTERNS:
        match = re.search(pattern, title, re.IGNORECASE)
        if not match:
            continue
        # Grouped patterns capture the year; bare matches use the full text.
        candidate = match.group(1) if match.lastindex else match.group(0)
        try:
            return int(candidate)
        except ValueError:
            continue
    return None
107
+
108
+
109
def calculate_priority_score(item: Dict) -> float:
    """
    Score a portal item for cataloging priority.

    Rewards national-coverage keywords, queryable service types, temporal
    series, and recent updates; penalizes site-specific datasets.
    """
    score = 50.0  # Baseline

    # Normalize the searchable text; missing fields collapse to "".
    title = (item.get("title") or "").lower()
    description = (item.get("description") or "").lower()
    tags = " ".join(item.get("tags") or []).lower()
    item_type = item.get("type", "")

    haystack = f"{title} {description} {tags}"

    # +5 per national-relevance keyword hit.
    score += 5 * sum(1 for keyword in HIGH_PRIORITY_KEYWORDS if keyword in haystack)

    # -15 per site-specific keyword hit (these are rarely country-wide).
    score -= 15 * sum(1 for keyword in LOW_PRIORITY_KEYWORDS if keyword in haystack)

    # Prefer Feature Services (queryable GIS data) over plain map services.
    if "Feature Service" in item_type:
        score += 20
    elif "Map Service" in item_type:
        score += 10

    # Titles carrying a year belong to temporal series — boost them.
    if extract_year_from_title(title):
        score += 10

    # Freshness bonus: up to +10 for recently modified items.
    modified = item.get("modified", 0)
    if modified:
        # 'modified' is epoch milliseconds; measure years elapsed since 2020.
        ms_per_year = 365.25 * 24 * 60 * 60 * 1000
        years_since_2020 = (modified - 1577836800000) / ms_per_year
        score += min(years_since_2020 * 2, 10)

    return score
154
+
155
+
156
def build_rest_endpoint(item: Dict) -> Optional[str]:
    """Derive a queryable layer-0 REST URL for a Feature Service item.

    Returns None for non-Feature-Service items or when no URL can be built.
    """
    if "Feature Service" not in item.get("type", ""):
        return None

    url = item.get("url")
    if url and "/FeatureServer" in url:
        # A URL ending at ".../FeatureServer" needs a layer index appended;
        # anything else already points at a concrete layer.
        if url.endswith(("FeatureServer", "FeatureServer/")):
            return f"{url.rstrip('/')}/0"
        return url

    # Fallback: construct the canonical hosted-service URL from the item ID.
    item_id = item.get("id")
    if item_id:
        return f"https://services.arcgis.com/{STRI_ORG_ID}/arcgis/rest/services/{item_id}/FeatureServer/0"

    return None
177
+
178
+
179
def catalog_datasets(max_datasets: int = 100) -> List[Dict]:
    """
    Scrape the STRI portal and build a prioritized catalog

    Pages through the org's search results, keeps only Feature Service
    items, scores each one, and returns them sorted best-first.

    Args:
        max_datasets: Maximum number of datasets to retrieve

    Returns:
        List of dataset metadata dictionaries
    """
    datasets = []
    start = 1
    batch_size = 100

    logger.info("Scraping STRI GIS Portal...")

    while len(datasets) < max_datasets:
        logger.info(f"Fetching items {start} to {start + batch_size - 1}...")

        results = search_stri_portal(num=batch_size, start=start)

        # Stop on API failure or an unexpected response shape.
        if not results or "results" not in results:
            break

        items = results["results"]

        # Empty page means the result set is exhausted.
        if not items:
            break

        for item in items:
            # Focus on Feature Services (queryable geospatial data)
            if "Feature Service" not in item.get("type", ""):
                continue

            # Calculate priority
            priority = calculate_priority_score(item)

            # Extract year if temporal
            year = extract_year_from_title(item.get("title", ""))

            # Build REST endpoint
            rest_endpoint = build_rest_endpoint(item)

            dataset = {
                "id": item.get("id"),
                "title": item.get("title"),
                "description": item.get("description", ""),
                "type": item.get("type"),
                "tags": item.get("tags", []),
                "modified": item.get("modified"),
                # 'modified' is epoch milliseconds; keep an ISO copy too.
                "modified_date": datetime.fromtimestamp(
                    item.get("modified", 0) / 1000
                ).isoformat() if item.get("modified") else None,
                "url": item.get("url"),
                "rest_endpoint": rest_endpoint,
                "year": year,
                "priority_score": round(priority, 2)
            }

            datasets.append(dataset)

        # Check if there are more results
        if start + batch_size > results.get("total", 0):
            break

        start += batch_size

    # Sort by priority score
    datasets.sort(key=lambda x: x["priority_score"], reverse=True)

    logger.info(f"Found {len(datasets)} Feature Service datasets")

    return datasets[:max_datasets]
252
+
253
+
254
def identify_temporal_groups(datasets: List[Dict]) -> Dict[str, List[Dict]]:
    """
    Group datasets by base name to identify temporal series

    Returns:
        Dictionary mapping base name to list of datasets with years
    """
    grouped: Dict[str, List[Dict]] = {}

    for entry in datasets:
        # Only datasets with a detected year can belong to a time series.
        if entry["year"] is None:
            continue

        # Strip the year (and "edition"/"version" markers) from the title
        # to recover the series' base name.
        stem = re.sub(r'\b(19\d{2}|20\d{2})\b', '', entry["title"])
        stem = re.sub(r'\s+', ' ', stem).strip()
        stem = re.sub(r'edition|version', '', stem, flags=re.IGNORECASE).strip()

        grouped.setdefault(stem, []).append(entry)

    # Keep only genuine series (more than one year), ordered chronologically.
    return {
        name: sorted(members, key=lambda d: d["year"])
        for name, members in grouped.items()
        if len(members) > 1
    }
286
+
287
+
288
def save_catalog(datasets: List[Dict], temporal_groups: Dict[str, List[Dict]]):
    """Save catalog and temporal groups to JSON files"""
    METADATA_DIR.mkdir(parents=True, exist_ok=True)

    # Main catalog file.
    catalog_path = METADATA_DIR / "stri_catalog.json"
    catalog_payload = {
        "generated_at": datetime.now().isoformat(),
        "total_datasets": len(datasets),
        "datasets": datasets,
    }
    with open(catalog_path, 'w') as f:
        json.dump(catalog_payload, f, indent=2)

    logger.info(f"Saved catalog to {catalog_path}")

    # Temporal-series groups are only written when at least one exists.
    if not temporal_groups:
        return

    temporal_path = METADATA_DIR / "stri_temporal_groups.json"
    groups_payload = {
        "generated_at": datetime.now().isoformat(),
        "num_groups": len(temporal_groups),
        "groups": temporal_groups,
    }
    with open(temporal_path, 'w') as f:
        json.dump(groups_payload, f, indent=2)

    logger.info(f"Saved {len(temporal_groups)} temporal groups to {temporal_path}")
314
+
315
+
316
def main():
    """Entry point: scrape the portal, group temporal series, save, summarize."""
    logger.info("=== STRI GIS Portal Catalog Scraper ===")

    # Scrape, group, and persist in sequence.
    datasets = catalog_datasets(max_datasets=100)
    temporal_groups = identify_temporal_groups(datasets)
    save_catalog(datasets, temporal_groups)

    # Summary banner.
    logger.info("\n" + "="*60)
    logger.info(f"✅ Cataloged {len(datasets)} datasets")
    logger.info(f"📊 Found {len(temporal_groups)} temporal dataset groups")

    if temporal_groups:
        logger.info("\nTemporal Groups:")
        for base_name, group in list(temporal_groups.items())[:5]:
            years = [d["year"] for d in group]
            logger.info(f" - {base_name}: {years}")

    logger.info("\nTop 10 Priority Datasets:")
    for rank, dataset in enumerate(datasets[:10], 1):
        logger.info(f" {rank}. [{dataset['priority_score']:.1f}] {dataset['title']}")

    logger.info("="*60)
345
+
346
+
347
+ if __name__ == "__main__":
348
+ main()
backend/scripts/update_embeddings.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Update Embeddings for Semantic Search
3
+
4
+ Refreshes the embeddings.json index with any new tables in the catalog.
5
+ """
6
+
7
+ import sys
8
+ import asyncio
9
+ import logging
10
+ from pathlib import Path
11
+
12
+ # Add project root to path
13
+ sys.path.append(str(Path(__file__).parent.parent.parent))
14
+
15
+ from backend.core.data_catalog import get_data_catalog
16
+ from backend.core.semantic_search import get_semantic_search
17
+
18
def update_embeddings():
    """Refresh the semantic-search index with any newly cataloged tables."""
    print("="*60)
    # Reload so newly registered tables are visible before embedding.
    data_catalog = get_data_catalog()
    data_catalog.load_catalog()

    searcher = get_semantic_search()

    print(f"Catalog size: {len(data_catalog.catalog)} tables")
    print(f"Existing embeddings: {len(searcher.embeddings)}")

    print("\nGenerating embeddings for new tables...")
    added = searcher.embed_all_tables(data_catalog.catalog)

    print(f"\n✅ Embedded {added} new tables.")
    print(f"Total embedded: {len(searcher.embeddings)}")
34
+
35
+ if __name__ == "__main__":
36
+ update_embeddings()
37
+
backend/scripts/validate_censo.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import os
4
+ import unicodedata
5
+
6
+ # Define paths
7
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
8
+ CSV_PATH = os.path.join(BASE_DIR, "data/censo/censo_panama_2023_unificado.csv")
9
+ GEOJSON_PATH = os.path.join(BASE_DIR, "data/base/pan_admin3.geojson")
10
+
11
def normalize_text(text):
    """Lowercase, trim, and strip accents from *text* ("" for falsy input)."""
    if not text:
        return ""
    # NFKD decomposition splits accented characters into base letter plus
    # combining marks, which the ASCII encode/ignore round-trip then drops.
    ascii_form = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    return ascii_form.strip().lower()
17
+
18
def validate_censo_integration():
    """Validate that 2023 censo CSV rows can be joined to admin3 GeoJSON.

    Loads both files, builds a normalized (province, district, corregimiento)
    lookup from the GeoJSON, then tries to match every CSV row through a
    cascade of heuristics (exact -> relaxed district -> fuzzy prefix) and
    prints a match-rate report. Matched rows get a 'geo_match_id' p-code;
    unmatched rows get a 'lookup_key' for diagnostics.
    """
    print(f"Loading CSV from {CSV_PATH}...")
    csv_data = []
    try:
        with open(CSV_PATH, mode='r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                csv_data.append(row)
    except Exception as e:
        # Best-effort script: report and bail rather than raise.
        print(f"Error loading CSV: {e}")
        return

    print(f"Loading GeoJSON from {GEOJSON_PATH}...")
    try:
        with open(GEOJSON_PATH, 'r') as f:
            geojson = json.load(f)
    except Exception as e:
        print(f"Error loading GeoJSON: {e}")
        return

    # Build GeoJSON Lookup Map: (norm_prov, norm_dist, norm_corr) -> properties
    geojson_lookup = {}

    # Helper to handle common name variations found in Panama data
    # (can add more rules as we discover mismatches)
    def clean_name(name):
        # Currently just accent-stripping + lowercasing via normalize_text.
        n = normalize_text(name)
        # remove "distrito de", "comarca", etc if needed
        return n

    print("Building GeoJSON lookup table...")
    for feature in geojson['features']:
        props = feature.get('properties', {})
        p_name = clean_name(props.get('adm1_name'))
        d_name = clean_name(props.get('adm2_name'))
        c_name = clean_name(props.get('adm3_name'))

        key = (p_name, d_name, c_name)
        if key in geojson_lookup:
            # Duplicate keys mean two features normalize to the same triple;
            # the later feature silently wins below.
            print(f"Duplicate key in GeoJSON: {key}")
        geojson_lookup[key] = props

    print(f"GeoJSON lookup size: {len(geojson_lookup)}")

    # Heuristics for Province Mapping (New -> Old)
    # The CSV uses post-reform province names; the GeoJSON predates them.
    PROV_MAPPING = {
        "panama oeste": "panama",
        "comarca naso tjer di": "bocas del toro" # Naso was part of Bocas
    }

    print("\nValidating CSV via Name Matching with Heuristics...")

    matches = []
    mismatches = []

    for row in csv_data:
        # CSV headers: nomb_prov, nomb_dist, nomb_corr
        p_name = clean_name(row.get('nomb_prov'))
        d_name = clean_name(row.get('nomb_dist'))
        c_name = clean_name(row.get('nomb_corr'))

        # Apply Province Mapping
        search_p_name = PROV_MAPPING.get(p_name, p_name)

        # 1. Try Exact Match (with mapped province)
        key = (search_p_name, d_name, c_name)
        if key in geojson_lookup:
            matches.append(row)
            row['geo_match_id'] = geojson_lookup[key].get('adm3_pcode')
            continue

        # 2. Relaxed District Match: Search in Province
        # Find any entry in this province with the same corregimiento name
        candidates = [k for k in geojson_lookup.keys() if k[0] == search_p_name and k[2] == c_name]

        if len(candidates) == 1:
            # Single match found in another district!
            match_key = candidates[0]
            matches.append(row)
            row['geo_match_id'] = geojson_lookup[match_key].get('adm3_pcode')
            # print(f"Relaxed Match: {c_name} (CSV Dist: {d_name}) -> (Geo Dist: {match_key[1]})")
            continue
        elif len(candidates) > 1:
            # Ambiguous (same corregimiento name in multiple districts of same province - rare but possible)
            # print(f"Ambiguous: {c_name} found in districts {[k[1] for k in candidates]}")
            pass

        # 3. Fuzzy/Typo Fixes (Specific hardcodes for common mismatch types if needed)
        # E.g. "El Hato de San Juan de Dios" vs "El Hato de San Juan"
        # We can perform a primitive "contains" check

        best_candidate = None
        # Get all corregimientos in this province
        prov_corrs = [k for k in geojson_lookup.keys() if k[0] == search_p_name]

        for k in prov_corrs:
            geo_c = k[2]
            # Check if one contains the other
            # Length guard (> 4) avoids spurious matches on very short names.
            if (c_name in geo_c or geo_c in c_name) and len(c_name) > 4 and len(geo_c) > 4:
                # Check if starts matching
                if c_name.startswith(geo_c) or geo_c.startswith(c_name):
                    # First qualifying candidate wins (iteration order of the
                    # lookup dict) — order-dependent by design.
                    best_candidate = k
                    break

        if best_candidate:
            matches.append(row)
            row['geo_match_id'] = geojson_lookup[best_candidate].get('adm3_pcode')
            # print(f"Fuzzy Match: '{c_name}' ~= '{best_candidate[2]}'")
            continue

        # No match
        mismatches.append(row)
        row['lookup_key'] = (search_p_name, d_name, c_name)

    print(f"Total rows in CSV: {len(csv_data)}")
    print(f"Matches found: {len(matches)}")
    print(f"Mismatches found: {len(mismatches)}")
    # NOTE(review): raises ZeroDivisionError if the CSV is empty — acceptable
    # for a validation script, but worth confirming.
    print(f"Match Rate: {len(matches)/len(csv_data)*100:.1f}%")

    if mismatches:
        print("\nMismatch Details (First 20):")
        print(f"{'CSV Key (Prov, Dist, Corr)':<60} {'Closest Match?':<20}")
        print("-" * 85)
        for row in mismatches[:20]:
            key = row['lookup_key']
            print(f"{str(key):<60}")

    # Analyze mismatches by Province
    print("\nAnalyzing remaining mismatches by Province:")
    prov_mismatches = {}
    for row in mismatches:
        p = row['nomb_prov']
        prov_mismatches[p] = prov_mismatches.get(p, 0) + 1
    for p, count in prov_mismatches.items():
        print(f"{p}: {count}")
153
+
154
+ if __name__ == "__main__":
155
+ validate_censo_integration()
backend/services/data_loader.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Loader Service for Panama Geographic Data
3
+
4
+ Loads GeoJSON files from the data/raw directory and provides
5
+ query capabilities for the LLM to search and filter features.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ from typing import List, Dict, Any, Optional
11
+ from functools import lru_cache
12
+
13
+
14
class PanamaDataLoader:
    """
    Singleton service to load and query Panama geographic data.
    Loads data once on first access and caches in memory.

    The four admin levels are lists of GeoJSON Feature dicts, populated
    from files under data/raw on first construction.
    """

    _instance = None      # singleton holder
    _data_loaded = False  # guard so the files are parsed only once

    # Data storage (class-level caches shared by the singleton)
    admin0: List[Dict[str, Any]] = []  # Country
    admin1: List[Dict[str, Any]] = []  # Provinces (13)
    admin2: List[Dict[str, Any]] = []  # Districts (76)
    admin3: List[Dict[str, Any]] = []  # Corregimientos (594)

    def __new__(cls):
        # Classic singleton: always hand back the same instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every PanamaDataLoader() call, so the class-level
        # flag keeps the (expensive) file loading to a single pass.
        if not PanamaDataLoader._data_loaded:
            self._load_data()
            PanamaDataLoader._data_loaded = True

    def _get_data_path(self) -> str:
        """Get the path to the data/raw directory."""
        # Navigate from backend/services to project root
        current_dir = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(os.path.dirname(current_dir))
        return os.path.join(project_root, "data", "raw")

    def _load_geojson(self, filename: str) -> List[Dict[str, Any]]:
        """Load a GeoJSON file and return its features ([] on any failure)."""
        filepath = os.path.join(self._get_data_path(), filename)

        if not os.path.exists(filepath):
            print(f"Warning: {filepath} not found")
            return []

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            features = data.get('features', [])
            # Fix: log the actual filename instead of a hard-coded
            # "(unknown)" placeholder, so load output is diagnosable.
            print(f"  Loaded {len(features)} features from {filename}")
            return features
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            return []

    def _load_data(self):
        """Load all GeoJSON data files into the class-level caches."""
        print("=" * 50)
        print("Loading Panama Geographic Data...")
        print("=" * 50)

        self.admin0 = self._load_geojson("pan_admin0.geojson")
        self.admin1 = self._load_geojson("pan_admin1.geojson")
        self.admin2 = self._load_geojson("pan_admin2.geojson")
        self.admin3 = self._load_geojson("pan_admin3.geojson")

        total = len(self.admin0) + len(self.admin1) + len(self.admin2) + len(self.admin3)
        print(f"Total features loaded: {total}")
        print("=" * 50)

    def get_schema_context(self) -> str:
        """Return schema description for LLM context."""
        return """
Panama Geographic Data (HDX Administrative Boundaries):

1. admin0 (Country Level)
   - adm0_name: "Panamá"
   - adm0_pcode: "PA"
   - area_sqkm: country area in square kilometers
   - geometry: MultiPolygon

2. admin1 (Provinces - 13 total)
   - adm1_name: Province name (e.g., "Bocas del Toro", "Panamá", "Colón")
   - adm1_pcode: Province code (e.g., "PA01", "PA08")
   - adm0_name: "Panamá"
   - area_sqkm: province area
   - center_lat, center_lon: centroid coordinates
   - geometry: MultiPolygon

3. admin2 (Districts - 76 total)
   - adm2_name: District name
   - adm2_pcode: District code (e.g., "PA0101")
   - adm1_name: Parent province name
   - adm1_pcode: Parent province code
   - area_sqkm: district area
   - center_lat, center_lon: centroid coordinates
   - geometry: MultiPolygon

4. admin3 (Corregimientos - 594 total)
   - adm3_name: Corregimiento name
   - adm3_pcode: Corregimiento code (e.g., "PA010101")
   - adm2_name: Parent district name
   - adm2_pcode: Parent district code
   - adm1_name: Parent province name
   - area_sqkm: corregimiento area
   - center_lat, center_lon: centroid coordinates
   - geometry: MultiPolygon

Notes:
- All geometries use WGS84 (EPSG:4326) coordinate system
- P-codes follow ISO 3166-2 format
- Valid as of 2021-10-20
"""

    def get_data_citations(self, admin_levels: List[str]) -> List[str]:
        """Return citations for the queried data.

        Falls back to a generic HDX citation when no known level is given.
        """
        citations = []
        level_names = {
            "admin0": "Panama Country Boundary",
            "admin1": "Panama Provinces",
            "admin2": "Panama Districts",
            "admin3": "Panama Corregimientos"
        }

        for level in admin_levels:
            if level in level_names:
                citations.append(f"{level_names[level]} (HDX COD-AB, 2021)")

        return citations if citations else ["Panama Administrative Boundaries (HDX COD-AB, 2021)"]

    def search_by_name(
        self,
        name: str,
        admin_level: Optional[str] = None,
        limit: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Search for features by name (case-insensitive partial match).

        Args:
            name: Search term
            admin_level: Optional filter ("admin1", "admin2", "admin3")
            limit: Maximum results to return

        Returns:
            List of {"level": ..., "feature": ...} dicts, at most `limit`.
        """
        name_lower = name.lower()
        results = []

        # Restrict to one level when requested, otherwise scan all three
        # subnational levels.
        levels_to_search = []
        if admin_level:
            levels_to_search = [(admin_level, getattr(self, admin_level, []))]
        else:
            levels_to_search = [
                ("admin1", self.admin1),
                ("admin2", self.admin2),
                ("admin3", self.admin3)
            ]

        for level_name, features in levels_to_search:
            for feature in features:
                props = feature.get("properties", {})

                # Check various name fields; first hit wins per feature.
                for key in ["adm1_name", "adm2_name", "adm3_name", "adm0_name"]:
                    value = props.get(key, "")
                    if value and name_lower in value.lower():
                        results.append({
                            "level": level_name,
                            "feature": feature
                        })
                        break

                if len(results) >= limit:
                    break

            if len(results) >= limit:
                break

        return results

    def get_all_provinces(self) -> List[Dict[str, Any]]:
        """Get all provinces (admin1)."""
        return self.admin1

    def get_all_districts(self, province_pcode: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get all districts, optionally filtered by province."""
        if province_pcode:
            return [
                f for f in self.admin2
                if f.get("properties", {}).get("adm1_pcode") == province_pcode
            ]
        return self.admin2

    def get_all_corregimientos(
        self,
        district_pcode: Optional[str] = None,
        province_pcode: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Get all corregimientos, optionally filtered.

        District filter takes precedence over the province filter when both
        are supplied.
        """
        results = self.admin3

        if district_pcode:
            results = [
                f for f in results
                if f.get("properties", {}).get("adm2_pcode") == district_pcode
            ]
        elif province_pcode:
            results = [
                f for f in results
                if f.get("properties", {}).get("adm1_pcode") == province_pcode
            ]

        return results

    def get_by_pcode(self, pcode: str) -> Optional[Dict[str, Any]]:
        """Get a feature by its P-code (case-insensitive), or None.

        The admin level is inferred from the P-code length:
        2 = country, 4 = province, 6 = district, 8 = corregimiento.
        """
        pcode_upper = pcode.upper()

        if len(pcode_upper) == 2:  # Country
            for f in self.admin0:
                if f.get("properties", {}).get("adm0_pcode") == pcode_upper:
                    return f
        elif len(pcode_upper) == 4:  # Province
            for f in self.admin1:
                if f.get("properties", {}).get("adm1_pcode") == pcode_upper:
                    return f
        elif len(pcode_upper) == 6:  # District
            for f in self.admin2:
                if f.get("properties", {}).get("adm2_pcode") == pcode_upper:
                    return f
        elif len(pcode_upper) == 8:  # Corregimiento
            for f in self.admin3:
                if f.get("properties", {}).get("adm3_pcode") == pcode_upper:
                    return f

        return None

    def to_geojson(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Convert a list of features to a GeoJSON FeatureCollection.

        Accepts both raw Feature dicts and the {"level", "feature"} wrappers
        produced by search_by_name.
        """
        clean_features = []
        for f in features:
            if "feature" in f:
                clean_features.append(f["feature"])
            else:
                clean_features.append(f)

        return {
            "type": "FeatureCollection",
            "features": clean_features
        }
260
+
261
+
262
+ # Singleton instance
263
+ _data_loader: Optional[PanamaDataLoader] = None
264
+
265
+
266
def get_data_loader() -> PanamaDataLoader:
    """Return the process-wide PanamaDataLoader, constructing it lazily."""
    global _data_loader
    if _data_loader is None:
        # First access: building the loader triggers the one-time file load.
        _data_loader = PanamaDataLoader()
    return _data_loader
backend/services/executor.py ADDED
@@ -0,0 +1,860 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query Executor Service
3
+
4
+ Handles query processing with intent detection, data querying, and response generation.
5
+ Uses semantic search for scalable dataset discovery and session-scoped layer storage.
6
+ """
7
+
8
+ from backend.core.llm_gateway import LLMGateway
9
+ from backend.services.data_loader import get_data_loader
10
+ from backend.core.geo_engine import get_geo_engine
11
+ from backend.services.response_formatter import ResponseFormatter
12
+ from backend.core.session_store import get_session_store
13
+ from backend.core.semantic_search import get_semantic_search
14
+ from backend.core.data_catalog import get_data_catalog
15
+ from backend.core.query_planner import get_query_planner
16
+ from typing import List, Dict, Any, Optional
17
+ import json
18
+ import datetime
19
+ import uuid
20
+ import logging
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Default session ID for backward compatibility
25
+ DEFAULT_SESSION_ID = "default-session"
26
+
27
+
28
+ class QueryExecutor:
29
+ def __init__(self):
30
+ self.llm = LLMGateway()
31
+ self.data_loader = get_data_loader()
32
+ self.geo_engine = get_geo_engine()
33
+ self.session_store = get_session_store()
34
+ self.semantic_search = get_semantic_search()
35
+ self.catalog = get_data_catalog()
36
+ self.query_planner = get_query_planner()
37
+
38
    def _get_schema_context(self) -> str:
        """Returns the database schema for the LLM context."""
        # Delegates to the data loader, which owns the schema description.
        return self.data_loader.get_schema_context()
41
+
42
    async def process_query_with_context(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
        """
        Orchestrates the full query processing flow with conversation context.

        Classifies the user's intent via the LLM, then dispatches to the
        matching handler; any unrecognized intent falls back to general chat.
        """
        # 1. Detect intent
        intent = await self.llm.detect_intent(query, history)
        print(f"[GeoQuery] Detected intent: {intent}")

        # 2. Route based on intent
        if intent == "GENERAL_CHAT":
            return await self._handle_general_chat(query, history)
        elif intent in ["DATA_QUERY", "MAP_REQUEST"]:
            # Always include map for data queries - the visual is helpful
            return await self._handle_data_query(query, history, include_map=True)
        elif intent == "SPATIAL_OP":
            return await self._handle_spatial_op(query, history)
        elif intent == "STAT_QUERY":
            return await self._handle_stat_query(query, history)
        else:
            # Unknown intent label from the LLM: fall back to chat.
            return await self._handle_general_chat(query, history)
62
+
63
+ async def process_query_stream(self, query: str, history: List[Dict[str, str]]):
64
+ """
65
+ Streamable version of process_query_with_context.
66
+ Yields: {"event": "status"|"thought"|"chunk"|"result", "data": ...}
67
+ """
68
+
69
+ # 1. Intent Detection with Thoughts
70
+ yield {"event": "status", "data": json.dumps({"status": "🧠 Understanding intent..."})}
71
+
72
+ intent = "GENERAL_CHAT"
73
+ intent_buffer = ""
74
+
75
+ try:
76
+ async for chunk in self.llm.stream_intent(query, history):
77
+ if chunk["type"] == "thought":
78
+ yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
79
+ elif chunk["type"] == "content":
80
+ intent_buffer += chunk["text"]
81
+ except Exception as e:
82
+ print(f"Intent stream error: {e}")
83
+
84
+ intent = intent_buffer.strip().upper()
85
+ if not intent:
86
+ intent = "GENERAL_CHAT"
87
+
88
+ # Clean up intent string
89
+ for valid in ["GENERAL_CHAT", "DATA_QUERY", "MAP_REQUEST", "SPATIAL_OP", "STAT_QUERY"]:
90
+ if valid in intent:
91
+ intent = valid
92
+ break
93
+
94
+ yield {"event": "intent", "data": json.dumps({"intent": intent})}
95
+ print(f"[GeoQuery] Detected intent: {intent}")
96
+
97
+ if intent == "GENERAL_CHAT":
98
+ async for chunk in self.llm.generate_response_stream(query, history):
99
+ # Transform to frontend protocol
100
+ if chunk.get("type") == "content":
101
+ yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk.get("text")})}
102
+ elif chunk.get("type") == "thought":
103
+ yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk.get("content")})}
104
+
105
+ # Send final result to clear loading status
106
+ yield {"event": "result", "data": json.dumps({"response": ""})}
107
+ return
108
+
109
+ # Handle Data/Map/Stat Queries together via a unified stream handler
110
+
111
+ if intent in ["DATA_QUERY", "MAP_REQUEST", "STAT_QUERY"]:
112
+ include_map = intent != "STAT_QUERY"
113
+ session_id = DEFAULT_SESSION_ID # TODO: Get from request context
114
+
115
+ # 0. Check query complexity
116
+ complexity = self.query_planner.detect_complexity(query)
117
+
118
+ if complexity["is_complex"]:
119
+ yield {"event": "status", "data": json.dumps({"status": "���� Complex query detected, planning steps..."})}
120
+ logger.info(f"Complex query detected: {complexity['reason']}")
121
+
122
+ # Use multi-step executor
123
+ async for event in self._execute_multi_step_query(query, history, include_map, session_id):
124
+ yield event
125
+ return
126
+
127
+ # Simple query - continue with existing flow
128
+ # 0. Semantic Discovery (scalable pre-filter)
129
+ yield {"event": "status", "data": json.dumps({"status": "📚 Searching data catalog..."})}
130
+
131
+ # Use semantic search to find top candidates
132
+ candidate_tables = self.semantic_search.search_table_names(query, top_k=15)
133
+
134
+ if candidate_tables:
135
+ # Get focused summaries for LLM refinement
136
+ candidate_summaries = self.catalog.get_summaries_for_tables(candidate_tables)
137
+ else:
138
+ # Fallback to all summaries (legacy behavior for small catalogs)
139
+ candidate_summaries = self.catalog.get_all_table_summaries()
140
+
141
+ # 1. LLM refines from candidates
142
+ yield {"event": "status", "data": json.dumps({"status": "🔍 Identifying relevant tables..."})}
143
+ relevant_tables = await self.llm.identify_relevant_tables(query, candidate_summaries)
144
+
145
+ # 2. Lazy Load
146
+ if relevant_tables:
147
+ yield {"event": "status", "data": json.dumps({"status": f"💾 Loading tables: {', '.join(relevant_tables)}..."})}
148
+
149
+ feature_tables = []
150
+ for table in relevant_tables:
151
+ if self.geo_engine.ensure_table_loaded(table):
152
+ feature_tables.append(table)
153
+
154
+ # 3. Schema
155
+ table_schema = self.geo_engine.get_table_schemas()
156
+
157
+ # 4. Generate SQL (Streaming Thoughts!)
158
+ yield {"event": "status", "data": json.dumps({"status": "✍️ Writing SQL query..."})}
159
+
160
+ sql_buffer = ""
161
+ async for chunk in self.llm.stream_analytical_sql(query, table_schema, history):
162
+ if chunk["type"] == "thought":
163
+ yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
164
+ elif chunk["type"] == "content":
165
+ sql_buffer += chunk["text"]
166
+
167
+ sql = sql_buffer.replace("```sql", "").replace("```", "").strip()
168
+
169
+ # 5. Check for DATA_UNAVAILABLE error from LLM
170
+ if "DATA_UNAVAILABLE" in sql or sql.startswith("-- ERROR"):
171
+ yield {"event": "status", "data": json.dumps({"status": "ℹ️ Data not available"})}
172
+
173
+ requested = "the requested data"
174
+ available = "administrative boundaries (provinces, districts, corregimientos)"
175
+
176
+ for line in sql.split("\n"):
177
+ if "Requested:" in line:
178
+ requested = line.split("Requested:")[-1].strip()
179
+ elif "Available:" in line:
180
+ available = line.split("Available:")[-1].strip()
181
+
182
+ error_response = f"""I couldn't find data for **{requested}** in the current database.
183
+
184
+ **Available datasets include:**
185
+ - {available}
186
+
187
+ If you need additional data, please let me know and I can help you understand what's currently available or suggest alternative queries."""
188
+
189
+ yield {
190
+ "event": "result",
191
+ "data": json.dumps({
192
+ "response": error_response,
193
+ "sql_query": sql,
194
+ "geojson": None,
195
+ "data_citations": [],
196
+ "chart_data": None,
197
+ "raw_data": []
198
+ })
199
+ }
200
+ return
201
+
202
+ # 6. Execute query
203
+ yield {"event": "status", "data": json.dumps({"status": "⚡ Executing query..."})}
204
+
205
+ geojson = None
206
+ features = []
207
+ error_message = None
208
+
209
+ try:
210
+ geojson = self.geo_engine.execute_spatial_query(sql)
211
+ features = geojson.get("features", [])
212
+ yield {"event": "status", "data": json.dumps({"status": f"✅ Found {len(features)} results"})}
213
+ except Exception as e:
214
+ error_message = str(e)
215
+ yield {"event": "status", "data": json.dumps({"status": "⚠️ Query error, attempting repair..."})}
216
+ try:
217
+ sql = await self.llm.correct_sql(query, sql, error_message, str(table_schema))
218
+ geojson = self.geo_engine.execute_spatial_query(sql)
219
+ features = geojson.get("features", [])
220
+ error_message = None
221
+ except Exception as e2:
222
+ print(f"Repair failed: {e2}")
223
+
224
+ if error_message:
225
+ yield {
226
+ "event": "result",
227
+ "data": json.dumps({
228
+ "response": f"I was unable to process your request because the data query failed. \n\nError details: {error_message}",
229
+ "sql_query": sql,
230
+ "geojson": None,
231
+ "data_citations": [],
232
+ "chart_data": None,
233
+ "raw_data": []
234
+ })
235
+ }
236
+ return
237
+
238
+ # 7. Post-process using ResponseFormatter
239
+ citations = ResponseFormatter.generate_citations(relevant_tables, features)
240
+
241
+ # Chart
242
+ chart_data = ResponseFormatter.generate_chart_data(sql, features)
243
+ if intent == "STAT_QUERY" and not chart_data and features:
244
+ chart_data = ResponseFormatter.generate_chart_data("GROUP BY forced", features)
245
+
246
+ # Raw Data
247
+ raw_data = ResponseFormatter.prepare_raw_data(features)
248
+
249
+ # Map Config
250
+ if include_map and features and geojson:
251
+ # Generate AI layer name
252
+ layer_info = await self.llm.generate_layer_name(query, sql)
253
+ layer_name_ai = layer_info.get("name", "Map Layer")
254
+ layer_emoji = layer_info.get("emoji", "📍")
255
+ point_style = layer_info.get("pointStyle", None)
256
+ geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)
257
+
258
+ try:
259
+ table_name = self.geo_engine.register_layer(layer_id, geojson)
260
+ self.session_store.add_layer(session_id, {
261
+ "id": layer_id,
262
+ "name": layer_name,
263
+ "table_name": table_name,
264
+ "timestamp": datetime.datetime.now().isoformat()
265
+ })
266
+ except Exception as e:
267
+ logger.warning(f"Failed to register layer: {e}")
268
+
269
+ # 8. Explanation (Streaming!)
270
+ yield {"event": "status", "data": json.dumps({"status": "💬 Generating explanation..."})}
271
+
272
+ data_summary = ResponseFormatter.generate_data_summary(features)
273
+
274
+ explanation_buffer = ""
275
+
276
+ async for chunk in self.llm.stream_explanation(query, sql, data_summary, history):
277
+ if chunk["type"] == "thought":
278
+ yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
279
+ elif chunk["type"] == "content":
280
+ explanation_buffer += chunk["text"]
281
+ yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk["text"]})}
282
+
283
+ # 9. Final Result Event
284
+ yield {"event": "result", "data": json.dumps({
285
+ "response": explanation_buffer,
286
+ "sql_query": sql,
287
+ "geojson": geojson if include_map and features else None,
288
+ "chart_data": chart_data,
289
+ "raw_data": raw_data,
290
+ "data_citations": citations
291
+ })}
292
+
293
+ elif intent == "SPATIAL_OP":
294
+ yield {"event": "status", "data": json.dumps({"status": "📐 Preparing spatial operation..."})}
295
+ session_id = DEFAULT_SESSION_ID # TODO: Get from request context
296
+
297
+ # 0. Semantic Discovery for base tables
298
+ candidate_tables = self.semantic_search.search_table_names(query, top_k=15)
299
+ if candidate_tables:
300
+ candidate_summaries = self.catalog.get_summaries_for_tables(candidate_tables)
301
+ else:
302
+ candidate_summaries = self.catalog.get_all_table_summaries()
303
+
304
+ # 1. Identify relevant base tables from query
305
+ relevant_tables = await self.llm.identify_relevant_tables(query, candidate_summaries)
306
+
307
+ # 2. Lazy load those tables
308
+ for table in relevant_tables:
309
+ self.geo_engine.ensure_table_loaded(table)
310
+
311
+ # 3. Get schema of loaded base tables
312
+ base_table_schema = self.geo_engine.get_table_schemas()
313
+
314
+ # 4. Prepare Layer Context (user-created layers from session)
315
+ session_layers = self.session_store.get_layers(session_id)
316
+ layer_context = "User-Created Layers:\n"
317
+ if not session_layers:
318
+ layer_context += "(No user layers created yet.)\n"
319
+ else:
320
+ for i, layer in enumerate(session_layers):
321
+ layer_context += f"Layer {i+1}: {layer['name']} (Table: {layer['table_name']})\n"
322
+
323
+ # 5. Combine both contexts for LLM
324
+ full_context = f"{base_table_schema}\n\n{layer_context}"
325
+
326
+ # 6. Generate Spatial SQL
327
+ yield {"event": "status", "data": json.dumps({"status": "✍️ Writing spatial SQL..."})}
328
+ sql = await self.llm.generate_spatial_sql(query, full_context, history)
329
+
330
+ # 7. Execute
331
+ yield {"event": "status", "data": json.dumps({"status": "⚙️ Processing geometry..."})}
332
+ error_message = None
333
+ geojson = None
334
+ features = []
335
+
336
+ try:
337
+ geojson = self.geo_engine.execute_spatial_query(sql)
338
+ features = geojson.get("features", [])
339
+ yield {"event": "status", "data": json.dumps({"status": f"✅ Result contains {len(features)} features"})}
340
+ except Exception as e:
341
+ error_message = str(e)
342
+ yield {"event": "status", "data": json.dumps({"status": "⚠️ Spatial error, attempting repair..."})}
343
+ try:
344
+ sql = await self.llm.correct_sql(query, sql, error_message, full_context)
345
+ geojson = self.geo_engine.execute_spatial_query(sql)
346
+ features = geojson.get("features", [])
347
+ error_message = None
348
+ except Exception as e2:
349
+ yield {
350
+ "event": "result",
351
+ "data": json.dumps({
352
+ "response": f"I tried to perform the spatial operation but encountered an error: {str(e)}\n\nQuery: {sql}",
353
+ "sql_query": sql,
354
+ "geojson": None,
355
+ "data_citations": [],
356
+ "chart_data": None,
357
+ "raw_data": []
358
+ })
359
+ }
360
+ return
361
+
362
+ # 4. Result Processing
363
+ if features:
364
+ # Generate AI layer name
365
+ layer_info = await self.llm.generate_layer_name(query, sql)
366
+ layer_name_ai = layer_info.get("name", "Map Layer")
367
+ layer_emoji = layer_info.get("emoji", "📍")
368
+ point_style = layer_info.get("pointStyle", None)
369
+ geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)
370
+
371
+ try:
372
+ table_name = self.geo_engine.register_layer(layer_id, geojson)
373
+ self.session_store.add_layer(session_id, {
374
+ "id": layer_id,
375
+ "name": layer_name,
376
+ "table_name": table_name,
377
+ "timestamp": datetime.datetime.now().isoformat()
378
+ })
379
+ except Exception as e:
380
+ logger.warning(f"Failed to register layer: {e}")
381
+
382
+ # 5. Explanation
383
+ yield {"event": "status", "data": json.dumps({"status": "💬 Explaining results..."})}
384
+ data_summary = f"Spatial operation resulted in {len(features)} features."
385
+
386
+ explanation_buffer = ""
387
+ async for chunk in self.llm.stream_explanation(query, sql, data_summary, history):
388
+ if chunk["type"] == "thought":
389
+ yield {"event": "chunk", "data": json.dumps({"type": "thought", "content": chunk["text"]})}
390
+ elif chunk["type"] == "content":
391
+ explanation_buffer += chunk["text"]
392
+ yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk["text"]})}
393
+
394
+ # 6. Final Result
395
+ yield {"event": "result", "data": json.dumps({
396
+ "response": explanation_buffer,
397
+ "sql_query": sql,
398
+ "geojson": geojson,
399
+ "chart_data": None,
400
+ "raw_data": [], # Spatial ops usually visual
401
+ "data_citations": []
402
+ })}
403
+ return
404
+
405
+ else:
406
+ # Fallback
407
+ yield {"event": "chunk", "data": json.dumps({"type": "text", "content": "I'm not sure how to handle this query yet."})}
408
+
409
+ async def _handle_general_chat(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
410
+ """Handles general conversational queries."""
411
+ # Add schema context to help the LLM answer questions about the data
412
+ enhanced_query = f"""The user is asking about Panama geographic data.
413
+
414
+ Available data: {len(self.data_loader.admin1)} provinces, {len(self.data_loader.admin2)} districts, {len(self.data_loader.admin3)} corregimientos.
415
+
416
+ User question: {query}
417
+
418
+ Respond helpfully as GeoQuery, the territorial intelligence assistant."""
419
+
420
+ response = await self.llm.generate_response(enhanced_query, history)
421
+
422
+ return {
423
+ "response": response,
424
+ "sql_query": None,
425
+ "geojson": None,
426
+ "data_citations": [],
427
+ "intent": "GENERAL_CHAT"
428
+ }
429
+
430
async def _handle_data_query(self, query: str, history: List[Dict[str, str]], include_map: bool = True) -> Dict[str, Any]:
    """
    Handles data queries using text-to-SQL with SOTA Smart Discovery.

    Pipeline:
      0. Fetch the data catalog.
      1. Ask the LLM which catalog tables are relevant to the query.
      2. Lazily load those tables into the geo engine.
      3. Collect the schema context (falling back to catalog metadata when
         the engine schema looks empty).
      4. Generate analytical SQL with the LLM.
      5. Execute it in DuckDB, with one LLM-driven repair attempt on failure.
      6-10. Build citations, explanation, optional map layer, chart and raw rows.

    Args:
        query: Natural-language user request.
        history: Prior chat turns (role/content dicts) forwarded to the LLM.
        include_map: When True, the result is styled and registered as a map
            layer and the GeoJSON is returned; when False only tabular output.

    Returns:
        Response payload with keys "response", "sql_query", "geojson",
        "data_citations", "chart_data", "raw_data", "intent".
    """
    print(f"[GeoQuery] Starting Data Query: {query}")

    # 0. Get Catalog (local import; presumably avoids a circular import -- confirm)
    from backend.core.data_catalog import get_data_catalog
    catalog = get_data_catalog()

    # 1. Smart Discovery: Identify relevant tables
    summaries = catalog.get_all_table_summaries()

    # Ask LLM which tables are relevant
    relevant_tables = await self.llm.identify_relevant_tables(query, summaries)

    # 2. Lazy Loading -- only the tables the LLM deemed relevant get loaded
    feature_tables = []
    for table in relevant_tables:
        if self.geo_engine.ensure_table_loaded(table):
            feature_tables.append(table)
        else:
            print(f"[GeoQuery] Warning: Could not load relevant table '{table}'")

    # 3. Get schema context (now includes the newly loaded tables)
    table_schema = self.geo_engine.get_table_schemas()

    # Fallback for empty schema (heuristic: < 50 chars means nothing useful loaded)
    if len(table_schema) < 50:
        print("[GeoQuery] GeoEngine schema empty. Fetching from Catalog Metadata.")
        fallback_tables = list(set(feature_tables + ["pan_admin1", "pan_admin2", "pan_admin3"]))
        table_schema = catalog.get_specific_table_schemas(fallback_tables)

    # 4. Generate real SQL using LLM
    print(f"[GeoQuery] Generating SQL with context size: {len(table_schema)} chars")
    sql = await self.llm.generate_analytical_sql(query, table_schema, history)

    # Check for SQL generation errors (the gateway signals failure via a "-- Error" prefix)
    if sql.startswith("-- Error"):
        available_data = ", ".join(feature_tables) if feature_tables else "Administrative Boundaries"
        return {
            "response": f"I couldn't find the specific data you asked for. I have access to: {available_data}. \n\nOriginal request: {query}",
            "sql_query": sql,
            "intent": "DATA_QUERY"
        }

    # 5. Execute SQL in DuckDB
    error_message = None
    try:
        geojson = self.geo_engine.execute_spatial_query(sql)
        features = geojson.get("features", [])
        print(f"[GeoQuery] Query returned {len(features)} features")
    except Exception as e:
        error_message = str(e)
        print(f"[GeoQuery] SQL execution error: {error_message}")

        # Self-Correction Loop: give the LLM exactly one chance to repair the SQL
        try:
            sql = await self.llm.correct_sql(query, sql, error_message, str(table_schema))
            geojson = self.geo_engine.execute_spatial_query(sql)
            features = geojson.get("features", [])
            error_message = None
        except Exception as e2:
            # Repair failed too: surface both errors to the user and bail out.
            return {
                "response": f"The SQL query failed to execute even after an automatic repair attempt.\nOriginal Error: {error_message}\nRepair Error: {str(e2)}",
                "sql_query": sql,
                "intent": "DATA_QUERY"
            }

    # 6. Post-Process via ResponseFormatter
    citations = ResponseFormatter.generate_citations(relevant_tables, features)
    data_summary = ResponseFormatter.generate_data_summary(features)

    # 7. Generate explanation
    explanation = await self.llm.generate_explanation(query, sql, data_summary, history)

    # 8. Add Layer Metadata to GeoJSON and REGISTER in GeoEngine
    if include_map and features:
        # Generate AI layer name (display name, emoji and point-styling hint)
        layer_info = await self.llm.generate_layer_name(query, sql)
        layer_name_ai = layer_info.get("name", "Map Layer")
        layer_emoji = layer_info.get("emoji", "📍")
        point_style = layer_info.get("pointStyle", None)
        geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)

        # Registration is best-effort: a failure here must not lose the query result.
        try:
            table_name = self.geo_engine.register_layer(layer_id, geojson)
            self.session_store.add_layer(DEFAULT_SESSION_ID, {
                "id": layer_id,
                "name": layer_name,
                "table_name": table_name,
                "timestamp": datetime.datetime.now().isoformat()
            })
        except Exception as e:
            logger.warning(f"Failed to register layer in GeoEngine: {e}")

    # 9. Auto-generate Chart
    chart_data = ResponseFormatter.generate_chart_data(sql, features)

    # 10. Prepare Raw Data
    raw_data = ResponseFormatter.prepare_raw_data(features)

    return {
        "response": explanation,
        "sql_query": sql,
        "geojson": geojson if include_map and features else None,
        "data_citations": citations,
        "chart_data": chart_data,
        "raw_data": raw_data,
        # NOTE(review): with include_map=True the intent is reported as
        # "MAP_REQUEST" even though this is the data-query path -- confirm intended.
        "intent": "DATA_QUERY" if not include_map else "MAP_REQUEST"
    }
541
+
542
async def _handle_spatial_op(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    Handles spatial operations (Difference, Intersection, etc) using GeoEngine.

    Builds a combined context from the base-table schemas plus any
    user-created session layers, asks the LLM for spatial SQL, executes it
    (with one repair attempt), and returns the styled result.

    Args:
        query: Natural-language user request.
        history: Prior chat turns (role/content dicts) forwarded to the LLM.

    Returns:
        Response payload with "response", "sql_query", "geojson",
        "data_citations" and "intent" keys.
    """
    # 0. Get data catalog for relevant tables (local import; presumably avoids
    # a circular import -- confirm)
    from backend.core.data_catalog import get_data_catalog
    catalog = get_data_catalog()
    summaries = catalog.get_all_table_summaries()

    # 1. Identify relevant base tables from query
    relevant_tables = await self.llm.identify_relevant_tables(query, summaries)

    # 2. Lazy load those tables
    for table in relevant_tables:
        self.geo_engine.ensure_table_loaded(table)

    # 3. Get schema of loaded base tables
    base_table_schema = self.geo_engine.get_table_schemas()

    # 4. Prepare Layer Context (user-created layers from session)
    session_layers = self.session_store.get_layers(DEFAULT_SESSION_ID)
    layer_context = "User-Created Layers:\n"
    if not session_layers:
        layer_context += "(No user layers created yet.)\n"
    else:
        for i, layer in enumerate(session_layers):
            layer_context += f"Layer {i+1}: {layer['name']} (Table: {layer['table_name']})\n"

    # 5. Combine both contexts for LLM
    full_context = f"{base_table_schema}\n\n{layer_context}"

    # 6. Generate Spatial SQL
    sql = await self.llm.generate_spatial_sql(query, full_context, history)

    # 7. Execute (with one LLM-driven repair attempt on failure)
    error_message = None
    geojson = None
    features = []

    try:
        geojson = self.geo_engine.execute_spatial_query(sql)
        features = geojson.get("features", [])
    except Exception as e:
        error_message = str(e)
        try:
            sql = await self.llm.correct_sql(query, sql, error_message, full_context)
            geojson = self.geo_engine.execute_spatial_query(sql)
            features = geojson.get("features", [])
            error_message = None
        except Exception as e2:
            # Surface the original error (the repair error is implied by the retry failing).
            return {
                "response": f"I tried to perform the spatial operation but encountered an error: {str(e)}\n\nQuery: {sql}",
                "sql_query": sql,
                "intent": "SPATIAL_OP"
            }

    # 8. Result Processing: style the result and register it as a session layer
    if features:
        # Generate AI layer name
        layer_info = await self.llm.generate_layer_name(query, sql)
        layer_name_ai = layer_info.get("name", "Map Layer")
        layer_emoji = layer_info.get("emoji", "📍")
        point_style = layer_info.get("pointStyle", None)
        geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(query, geojson, features, layer_name_ai, layer_emoji, point_style)

        # FIX: registration is best-effort (consistent with _handle_data_query) --
        # a registration failure must not discard an otherwise-successful result.
        try:
            table_name = self.geo_engine.register_layer(layer_id, geojson)
            self.session_store.add_layer(DEFAULT_SESSION_ID, {
                "id": layer_id,
                "name": layer_name,
                "table_name": table_name,
                "timestamp": datetime.datetime.now().isoformat()
            })
        except Exception as reg_err:
            logger.warning(f"Failed to register layer: {reg_err}")

    data_summary = f"Spatial operation resulted in {len(features)} features."
    explanation = await self.llm.generate_explanation(query, sql, data_summary, history)

    return {
        "response": explanation,
        "sql_query": sql,
        "geojson": geojson,
        "data_citations": [],
        "intent": "SPATIAL_OP"
    }
622
+
623
+ async def _handle_stat_query(self, query: str, history: List[Dict[str, str]]) -> Dict[str, Any]:
624
+ """
625
+ Handles statistical queries where charts/tables are more important than maps.
626
+ """
627
+ # Reuse data query logic but without map emphasis
628
+ result = await self._handle_data_query(query, history, include_map=False)
629
+ result["intent"] = "STAT_QUERY"
630
+
631
+ # Ensure chart data is present if possible
632
+ if not result.get("chart_data") and result.get("raw_data"):
633
+ # Force chart attempt
634
+ features_mock = [{"properties": d} for d in result["raw_data"]]
635
+ result["chart_data"] = ResponseFormatter.generate_chart_data(result.get("sql_query", ""), features_mock)
636
+
637
+ return result
638
+
639
async def _execute_multi_step_query(
    self,
    query: str,
    history: List[Dict[str, str]],
    include_map: bool,
    session_id: str
):
    """
    Execute a complex query by breaking it into multiple steps.

    Yields streaming events ({"event": ..., "data": <json str>}) throughout
    the process: status updates, "thought"/"text" chunks, and one final
    "result" payload.

    Flow:
      1. Semantic search proposes candidate tables.
      2. The query planner builds a step plan; if the query turns out not to
         be complex, fall back to a single-shot SQL path.
      3. Show the plan to the user as a "thought" chunk.
      4. Load every table any step needs.
      5. Execute each planned group in order (steps within a group run
         sequentially), with one SQL-repair attempt per failing step.
      6. Stream a combined explanation, pick the richest intermediate result
         for the map/chart, and emit the final result event.

    Args:
        query: Natural-language user request.
        history: Prior chat turns forwarded to the LLM.
        include_map: Whether to build and register a map layer from results.
        session_id: Session identifier used for layer registration.
    """
    # FIX: removed an unused local `import asyncio` (nothing in this method uses it).

    # 1. Get candidate tables for planning
    yield {"event": "status", "data": json.dumps({"status": "📚 Discovering relevant datasets..."})}

    candidate_tables = self.semantic_search.search_table_names(query, top_k=20)
    if not candidate_tables:
        # No semantic hits: consider the entire catalog.
        candidate_tables = list(self.catalog.catalog.keys())

    # 2. Plan the query
    yield {"event": "status", "data": json.dumps({"status": "📋 Creating execution plan..."})}

    plan = await self.query_planner.plan_query(query, candidate_tables, self.llm)

    if not plan.is_complex or not plan.steps:
        # Fallback to simple execution
        yield {"event": "status", "data": json.dumps({"status": "📚 Executing as simple query..."})}
        # Re-route to simple path by manually calling the logic
        candidate_summaries = self.catalog.get_summaries_for_tables(candidate_tables)
        relevant_tables = await self.llm.identify_relevant_tables(query, candidate_summaries)

        for table in relevant_tables:
            self.geo_engine.ensure_table_loaded(table)

        table_schema = self.geo_engine.get_table_schemas()

        yield {"event": "status", "data": json.dumps({"status": "✍️ Writing SQL query..."})}
        sql = await self.llm.generate_analytical_sql(query, table_schema, history)
        # Strip markdown code fences the LLM may wrap the SQL in.
        sql = sql.replace("```sql", "").replace("```", "").strip()

        try:
            geojson = self.geo_engine.execute_spatial_query(sql)
            features = geojson.get("features", [])
        except Exception as e:
            yield {"event": "result", "data": json.dumps({
                "response": f"Query execution failed: {str(e)}",
                "sql_query": sql
            })}
            return

        data_summary = ResponseFormatter.generate_data_summary(features)
        explanation = await self.llm.generate_explanation(query, sql, data_summary, history)

        yield {"event": "result", "data": json.dumps({
            "response": explanation,
            "sql_query": sql,
            "geojson": geojson if include_map and features else None,
            "chart_data": ResponseFormatter.generate_chart_data(sql, features),
            "raw_data": ResponseFormatter.prepare_raw_data(features),
            "data_citations": []
        })}
        return

    # 3. Show plan to user
    step_descriptions = [f"Step {i+1}: {s.description}" for i, s in enumerate(plan.steps)]
    yield {"event": "chunk", "data": json.dumps({
        "type": "thought",
        "content": f"Planning multi-step execution:\n" + "\n".join(step_descriptions)
    })}

    # 4. Load all needed tables
    all_tables = set()
    for step in plan.steps:
        all_tables.update(step.tables_needed)

    if all_tables:
        yield {"event": "status", "data": json.dumps({"status": f"💾 Loading {len(all_tables)} datasets..."})}
        for table in all_tables:
            self.geo_engine.ensure_table_loaded(table)

    # 5. Execute steps by parallel groups
    intermediate_results = {}
    all_features = []
    all_sql = []

    for group_idx, group in enumerate(plan.parallel_groups):
        group_steps = [s for s in plan.steps if s.step_id in group]

        yield {"event": "status", "data": json.dumps({
            "status": f"⚡ Executing step group {group_idx + 1}/{len(plan.parallel_groups)}..."
        })}

        # Execute steps in this group (could be parallel, but sequential for simplicity)
        for step in group_steps:
            yield {"event": "status", "data": json.dumps({
                "status": f"🔄 {step.description}..."
            })}

            # Generate SQL for this step (re-read schemas: earlier steps may
            # have registered new tables)
            table_schema = self.geo_engine.get_table_schemas()

            # Build step-specific prompt
            step_query = f"""Execute this step: {step.description}

Original user request: {query}

SQL Hint: {step.sql_template or 'None'}

Previous step results available: {list(intermediate_results.keys())}"""

            sql = await self.llm.generate_analytical_sql(step_query, table_schema, history)
            sql = sql.replace("```sql", "").replace("```", "").strip()

            # Skip if LLM returned an error
            if "DATA_UNAVAILABLE" in sql or sql.startswith("-- ERROR"):
                logger.warning(f"Step {step.step_id} indicated data unavailable")
                intermediate_results[step.result_name] = {"features": [], "sql": sql}
                continue

            try:
                geojson = self.geo_engine.execute_spatial_query(sql)
                features = geojson.get("features", [])

                intermediate_results[step.result_name] = {
                    "features": features,
                    "sql": sql,
                    "geojson": geojson
                }
                all_features.extend(features)
                all_sql.append(f"-- {step.description}\n{sql}")

                yield {"event": "status", "data": json.dumps({
                    "status": f"✅ Step got {len(features)} results"
                })}

            except Exception as e:
                logger.error(f"Step {step.step_id} failed: {e}")
                # Try to repair (one LLM-driven correction attempt per step)
                try:
                    sql = await self.llm.correct_sql(step_query, sql, str(e), table_schema)
                    geojson = self.geo_engine.execute_spatial_query(sql)
                    features = geojson.get("features", [])
                    intermediate_results[step.result_name] = {
                        "features": features,
                        "sql": sql,
                        "geojson": geojson
                    }
                    all_features.extend(features)
                    all_sql.append(f"-- {step.description} (repaired)\n{sql}")
                except Exception as e2:
                    # Record the failure but keep going: later steps and the
                    # combined summary can still be produced.
                    logger.error(f"Step repair also failed: {e2}")
                    intermediate_results[step.result_name] = {"features": [], "sql": sql, "error": str(e2)}

    # 6. Generate final combined result
    yield {"event": "status", "data": json.dumps({"status": "💬 Generating combined analysis..."})}

    # Summarize intermediate results for explanation
    result_summary = []
    for name, result in intermediate_results.items():
        features = result.get("features", [])
        result_summary.append(f"{name}: {len(features)} records")

    combined_summary = f"""Multi-step query completed with {len(plan.steps)} steps.

Results:
{chr(10).join(result_summary)}

Combination logic: {plan.final_combination_logic}"""

    # Get combined explanation (streamed as "text" chunks; thoughts dropped here)
    explanation_buffer = ""
    async for chunk in self.llm.stream_explanation(query, "\n\n".join(all_sql), combined_summary, history):
        if chunk["type"] == "content":
            explanation_buffer += chunk["text"]
            yield {"event": "chunk", "data": json.dumps({"type": "text", "content": chunk["text"]})}

    # Find the best geojson to display (use the one with most features)
    best_geojson = None
    best_features = []
    for name, result in intermediate_results.items():
        features = result.get("features", [])
        if len(features) > len(best_features):
            best_features = features
            best_geojson = result.get("geojson")

    # Generate layer if we have features
    if include_map and best_features and best_geojson:
        layer_info = await self.llm.generate_layer_name(query, all_sql[0] if all_sql else "")
        layer_name_ai = layer_info.get("name", "Multi-Step Result")
        layer_emoji = layer_info.get("emoji", "📊")
        best_geojson, layer_id, layer_name = ResponseFormatter.format_geojson_layer(
            query, best_geojson, best_features, layer_name_ai, layer_emoji
        )

        # Registration is best-effort; a failure must not lose the result.
        try:
            table_name = self.geo_engine.register_layer(layer_id, best_geojson)
            self.session_store.add_layer(session_id, {
                "id": layer_id,
                "name": layer_name,
                "table_name": table_name,
                "timestamp": datetime.datetime.now().isoformat()
            })
        except Exception as e:
            logger.warning(f"Failed to register multi-step layer: {e}")

    # Generate chart from combined results
    chart_data = ResponseFormatter.generate_chart_data("\n".join(all_sql), best_features)
    raw_data = ResponseFormatter.prepare_raw_data(best_features)

    # Final result
    yield {"event": "result", "data": json.dumps({
        "response": explanation_buffer,
        "sql_query": "\n\n".join(all_sql),
        "geojson": best_geojson if include_map and best_features else None,
        "chart_data": chart_data,
        "raw_data": raw_data,
        "data_citations": [],
        "multi_step": True,
        "steps_executed": len(plan.steps)
    })}
backend/services/orchestrator.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Any, Dict, Optional

from backend.services.executor import QueryExecutor
3
+
4
class OrchestratorAgent:
    """Thin facade that routes chat queries to the underlying QueryExecutor."""

    def __init__(self):
        # The executor owns the actual query pipeline (LLM, geo engine, sessions).
        self.executor = QueryExecutor()

    async def process_query(self, query: str, history: Optional[list[Dict[str, str]]] = None, model: Optional[str] = None) -> Dict[str, Any]:
        """
        Delegate *query* (plus optional chat *history*) to the QueryExecutor.

        Args:
            query: The user's natural-language request.
            history: Prior chat turns as role/content dicts; treated as empty
                when omitted. (FIX: annotation made Optional to match the
                None default.)
            model: Currently unused; reserved for per-request model selection.
                For now, we rely on the default configured in LLMGateway.

        Returns:
            The executor's response payload dict.
        """
        return await self.executor.process_query_with_context(query, history or [])
backend/services/response_formatter.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Response Formatter Service
3
+
4
+ Handles formatting of query results into citations, charts, GeoJSON layers, and raw data for the frontend.
5
+ Separates presentation logic from execution logic.
6
+ """
7
+
8
+ from typing import List, Dict, Any, Optional
9
+ import uuid
10
+
11
+ class ResponseFormatter:
12
+ @staticmethod
13
+ def generate_citations(tables: List[str], features: Optional[List[Dict]] = None) -> List[str]:
14
+ """Generates readable citations based on table names and returned features."""
15
+ citations = []
16
+ processed = set()
17
+
18
+ # Check explicit table list
19
+ for table in tables:
20
+ table = table.lower()
21
+ if table in processed: continue
22
+
23
+ if "universit" in table:
24
+ citations.append("Universities Data (OpenStreetMap, 2024)")
25
+ elif "school" in table or "education" in table:
26
+ citations.append("Education Facilities (OpenStreetMap, 2024)")
27
+ elif "hospital" in table or "health" in table:
28
+ citations.append("Health Facilities (OpenStreetMap, 2024)")
29
+ elif "airport" in table:
30
+ citations.append("Airports Data (OpenStreetMap, 2024)")
31
+ elif "road" in table:
32
+ citations.append("Road Network (OpenStreetMap, 2024)")
33
+ elif "population" in table or "census" in table:
34
+ citations.append("Panama Census Data (INEC, 2023)")
35
+ elif "admin" in table or "boundar" in table:
36
+ if "Admin Boundaries" not in processed:
37
+ citations.append("Panama Administrative Boundaries (HDX COD-AB, 2021)")
38
+ processed.add("Admin Boundaries")
39
+ continue
40
+
41
+ processed.add(table)
42
+
43
+ # Fallback check on features if no specific tables cited but admin data returned
44
+ if not citations and features:
45
+ if any(k.startswith("adm") for k in features[0].get("properties", {}).keys()):
46
+ citations.append("Panama Administrative Boundaries (HDX COD-AB, 2021)")
47
+
48
+ return list(set(citations))
49
+
50
+ @staticmethod
51
+ def generate_chart_data(sql: str, features: List[Dict]) -> Optional[Dict[str, Any]]:
52
+ """
53
+ Generates Chart.js compatible data structure if the query looks aggregative.
54
+ """
55
+ if not features:
56
+ return None
57
+
58
+ # Heuristic: If GROUP BY or ORDER BY ... LIMIT is used, likely suitable for charting
59
+ # Or if explicitly requested via intent (logic handled in caller, but we check SQL signature here too)
60
+
61
+ # Try to find string (label) and number (value) in properties
62
+ try:
63
+ chart_items = []
64
+ x_key = "name"
65
+ y_key = "value"
66
+ x_label = "Feature"
67
+ y_label = "Value"
68
+
69
+ # 1. Analyze properties to find X (Label) and Y (Value)
70
+ if features:
71
+ sample_props = features[0].get("properties", {})
72
+
73
+ # Exclude system keys
74
+ valid_keys = [k for k in sample_props.keys() if k not in ["geom", "geometry", "style", "layer_name", "layer_id", "choropleth", "fillColor", "color"]]
75
+
76
+ # Find Y (Value) - First numeric column
77
+ for k in valid_keys:
78
+ if isinstance(sample_props[k], (int, float)) and not k.endswith("_id") and not k.endswith("_code"):
79
+ y_key = k
80
+ y_label = k.replace("_", " ").title()
81
+ if "sqkm" in k: y_label = "Area (km²)"
82
+ elif "pop" in k: y_label = "Population"
83
+ elif "count" in k: y_label = "Count"
84
+ break
85
+
86
+ # Find X (Label) - First string column (excluding IDs if possible)
87
+ for k in valid_keys:
88
+ if isinstance(sample_props[k], str) and "name" in k:
89
+ x_key = k
90
+ x_label = k.replace("_", " ").title().replace("Name", "").strip() or "Region"
91
+ break
92
+
93
+ # 2. Build Data
94
+ for f in features:
95
+ props = f.get("properties", {})
96
+ label = props.get(x_key)
97
+ value = props.get(y_key)
98
+
99
+ if label is not None and value is not None:
100
+ chart_items.append({"name": str(label), "value": value})
101
+
102
+ if chart_items:
103
+ # auto-sort descending
104
+ chart_items.sort(key=lambda x: x["value"], reverse=True)
105
+
106
+ return {
107
+ "type": "bar",
108
+ "title": f"{y_label} by {x_label}",
109
+ "data": chart_items[:15], # Limit to top 15 for readability
110
+ "xKey": "name",
111
+ "yKey": "value",
112
+ "xAxisLabel": x_label,
113
+ "yAxisLabel": y_label
114
+ }
115
+ except Exception as e:
116
+ print(f"Error generating chart data: {e}")
117
+ return None
118
+
119
+ return None
120
+
121
+ @staticmethod
122
+ def prepare_raw_data(features: List[Dict]) -> List[Dict]:
123
+ """Cleans feature properties for display in the raw data table."""
124
+ raw_data = []
125
+ if not features:
126
+ return raw_data
127
+
128
+ for f in features:
129
+ props = f.get("properties", {}).copy()
130
+ # Serialize
131
+ props = ResponseFormatter._serialize_properties(props)
132
+
133
+ # Remove system/visual properties
134
+ for key in ["geom", "geometry", "style", "layer_name", "layer_id", "choropleth", "fillColor", "color"]:
135
+ props.pop(key, None)
136
+ raw_data.append(props)
137
+
138
+ return raw_data
139
+
140
+ @staticmethod
141
+ def format_geojson_layer(query: str, geojson: Dict[str, Any], features: List[Dict], layer_name: str, layer_emoji: str = "📍", point_style: Optional[str] = None, admin_levels: Optional[List[str]] = None) -> tuple[Dict[str, Any], str, str]:
142
+ """
143
+ styles the GeoJSON layer and generates metadata (ID, Name, Choropleth).
144
+
145
+ Args:
146
+ point_style: "icon" for emoji markers, "circle" for simple colored circles, None for auto-detect
147
+ """
148
+
149
+ # 0. Serialize properties to avoid datetime errors
150
+ if features:
151
+ for f in features:
152
+ if "properties" in f:
153
+ f["properties"] = ResponseFormatter._serialize_properties(f["properties"])
154
+
155
+ # 2. Random/Distinct Colors
156
+ # Palette of distinct colors (avoiding pure blue which is default)
157
+ palette = [
158
+ "#E63946", # Red
159
+ "#F4A261", # Orange
160
+ "#2A9D8F", # Teal
161
+ "#E9C46A", # Yellow
162
+ "#9C6644", # Brown
163
+ "#D62828", # Dark Red
164
+ "#8338EC", # Purple
165
+ "#3A86FF", # Blue-ish (but distinct)
166
+ "#FB5607", # Orange-Red
167
+ "#FF006E", # Pink
168
+ ]
169
+
170
+ # Deterministic color based on query hash to keep it stable for same query
171
+ color_idx = abs(hash(query)) % len(palette)
172
+ layer_color = palette[color_idx]
173
+
174
+ # Choropleth Logic
175
+ # 1. Identify valid numeric column
176
+ choropleth_col = None
177
+ if features:
178
+ sample = features[0].get("properties", {})
179
+ valid_numerics = [
180
+ k for k, v in sample.items()
181
+ if isinstance(v, (int, float))
182
+ and k not in ["layer_id", "style"]
183
+ and not k.endswith("_code")
184
+ and not k.endswith("_id")
185
+ ]
186
+
187
+ # Prioritize 'population', 'area', 'count'
188
+ priority_cols = ["population", "pop", "count", "num", "density", "area_sqkm", "area"]
189
+
190
+ for p in priority_cols:
191
+ matches = [c for c in valid_numerics if p in c]
192
+ if matches:
193
+ choropleth_col = matches[0]
194
+ break
195
+
196
+ # Fallback to first numeric
197
+ if not choropleth_col and valid_numerics:
198
+ choropleth_col = valid_numerics[0]
199
+
200
+ # 2. Enable if appropriate
201
+ if choropleth_col:
202
+ # Check if values actually vary
203
+ values = [f["properties"].get(choropleth_col, 0) for f in features]
204
+ if len(set(values)) > 1:
205
+ geojson["properties"]["choropleth"] = {
206
+ "enabled": True,
207
+ "palette": "viridis",
208
+ "column": choropleth_col,
209
+ "scale": "log" if "pop" in choropleth_col or "density" in choropleth_col else "linear"
210
+ }
211
+ else:
212
+ # Apply random color if NOT a choropleth
213
+ geojson["properties"]["style"] = {
214
+ "color": layer_color,
215
+ "fillColor": layer_color,
216
+ "opacity": 0.8,
217
+ "fillOpacity": 0.4
218
+ }
219
+
220
+ layer_id = str(uuid.uuid4())[:8]
221
+ geojson["properties"]["layer_name"] = layer_name
222
+ geojson["properties"]["layer_id"] = layer_id
223
+
224
+ # Add Point Marker Configuration
225
+ # Use pointStyle to determine whether to show icon or circle
226
+ marker_icon = None
227
+ marker_style = "circle" # default
228
+
229
+ if point_style == "icon":
230
+ # Use emoji icon for categorical POI
231
+ marker_icon = layer_emoji
232
+ marker_style = "icon"
233
+ elif point_style == "circle":
234
+ # Use simple circle for large datasets or density viz
235
+ marker_icon = None
236
+ marker_style = "circle"
237
+ else:
238
+ # Auto-detect: default to icon for now (backward compatibility)
239
+ marker_icon = layer_emoji
240
+ marker_style = "icon"
241
+
242
+ geojson["properties"]["pointMarker"] = {
243
+ "icon": marker_icon,
244
+ "style": marker_style,
245
+ "color": layer_color,
246
+ "size": 32
247
+ }
248
+
249
+ return geojson, layer_id, layer_name
250
+
251
+ @staticmethod
252
+ def generate_data_summary(features: List[Dict]) -> str:
253
+ """Generates a text summary of the features for the LLM explanation context."""
254
+ if features:
255
+ sample_names = []
256
+ for f in features[:5]:
257
+ props = f.get("properties", {})
258
+ name = props.get("adm3_name") or props.get("adm2_name") or props.get("adm1_name") or props.get("name") or "Feature"
259
+ area = props.get("area_sqkm")
260
+ if area:
261
+ sample_names.append(f"{name} ({float(area):.1f} km²)")
262
+ else:
263
+ sample_names.append(name)
264
+ return f"Found {len(features)} features. Sample: {', '.join(sample_names)}"
265
+ return f"Found {len(features)} features. Sample: {', '.join(sample_names)}"
266
+ else:
267
+ return "No features found matching the query."
268
+
269
+ @staticmethod
270
+ def _serialize_properties(properties: Dict[str, Any]) -> Dict[str, Any]:
271
+ """Recursively converts datetime/date objects to strings for JSON serialization."""
272
+ from datetime import datetime, date
273
+
274
+ serialized = {}
275
+ for k, v in properties.items():
276
+ if isinstance(v, (datetime, date)):
277
+ serialized[k] = v.isoformat()
278
+ elif isinstance(v, dict):
279
+ serialized[k] = ResponseFormatter._serialize_properties(v)
280
+ elif isinstance(v, list):
281
+ serialized[k] = [
282
+ x.isoformat() if isinstance(x, (datetime, date)) else x
283
+ for x in v
284
+ ]
285
+ else:
286
+ serialized[k] = v
287
+ return serialized
docker-compose.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ app:
5
+ build: .
6
+ image: geoquery:latest
7
+ ports:
8
+ - "8000:8000"
9
+ environment:
10
+ - GEMINI_API_KEY=${GEMINI_API_KEY}
11
+ volumes:
12
+ # Optional: Mount data directory if you want to persist changes or add data
13
+ # - ./backend/data:/app/backend/data
14
+ - ./backend/data/custom:/app/backend/data/custom