Spaces:
Running
Running
Daniel Wiesmann commited on
Commit ·
ad5ec6b
1
Parent(s): 24c2cc8
Intitial commit
Browse files- .gitignore +138 -0
- README.md +67 -0
- demo_app.py +217 -0
- ingest/convert_natural_earth.py +187 -0
- pyproject.toml +23 -0
- src/gazet/__init__.py +4 -0
- src/gazet/__main__.py +6 -0
- src/gazet/api.py +147 -0
- src/gazet/config.py +74 -0
- src/gazet/export.py +65 -0
- src/gazet/lm.py +112 -0
- src/gazet/schemas.py +301 -0
- src/gazet/search.py +94 -0
- src/gazet/sql.py +91 -0
- uv.lock +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Imagery
|
| 2 |
+
*.SAFE*
|
| 3 |
+
*.TIF
|
| 4 |
+
*.tif
|
| 5 |
+
|
| 6 |
+
# Byte-compiled / optimized / DLL files
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
|
| 11 |
+
# C extensions
|
| 12 |
+
*.so
|
| 13 |
+
|
| 14 |
+
# Distribution / packaging
|
| 15 |
+
.Python
|
| 16 |
+
build/
|
| 17 |
+
develop-eggs/
|
| 18 |
+
dist/
|
| 19 |
+
downloads/
|
| 20 |
+
eggs/
|
| 21 |
+
.eggs/
|
| 22 |
+
lib/
|
| 23 |
+
lib64/
|
| 24 |
+
parts/
|
| 25 |
+
sdist/
|
| 26 |
+
var/
|
| 27 |
+
wheels/
|
| 28 |
+
pip-wheel-metadata/
|
| 29 |
+
share/python-wheels/
|
| 30 |
+
*.egg-info/
|
| 31 |
+
.installed.cfg
|
| 32 |
+
*.egg
|
| 33 |
+
MANIFEST
|
| 34 |
+
|
| 35 |
+
# PyInstaller
|
| 36 |
+
# Usually these files are written by a python script from a template
|
| 37 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 38 |
+
*.manifest
|
| 39 |
+
*.spec
|
| 40 |
+
|
| 41 |
+
# Installer logs
|
| 42 |
+
pip-log.txt
|
| 43 |
+
pip-delete-this-directory.txt
|
| 44 |
+
|
| 45 |
+
# Unit test / coverage reports
|
| 46 |
+
htmlcov/
|
| 47 |
+
.tox/
|
| 48 |
+
.nox/
|
| 49 |
+
.coverage
|
| 50 |
+
.coverage.*
|
| 51 |
+
.cache
|
| 52 |
+
nosetests.xml
|
| 53 |
+
coverage.xml
|
| 54 |
+
*.cover
|
| 55 |
+
*.py,cover
|
| 56 |
+
.hypothesis/
|
| 57 |
+
.pytest_cache/
|
| 58 |
+
|
| 59 |
+
# Translations
|
| 60 |
+
*.mo
|
| 61 |
+
*.pot
|
| 62 |
+
|
| 63 |
+
# Django stuff:
|
| 64 |
+
*.log
|
| 65 |
+
local_settings.py
|
| 66 |
+
db.sqlite3
|
| 67 |
+
db.sqlite3-journal
|
| 68 |
+
|
| 69 |
+
# Flask stuff:
|
| 70 |
+
instance/
|
| 71 |
+
.webassets-cache
|
| 72 |
+
|
| 73 |
+
# Scrapy stuff:
|
| 74 |
+
.scrapy
|
| 75 |
+
|
| 76 |
+
# Sphinx documentation
|
| 77 |
+
docs/_build/
|
| 78 |
+
|
| 79 |
+
# PyBuilder
|
| 80 |
+
target/
|
| 81 |
+
|
| 82 |
+
# Jupyter Notebook
|
| 83 |
+
.ipynb_checkpoints
|
| 84 |
+
|
| 85 |
+
# IPython
|
| 86 |
+
profile_default/
|
| 87 |
+
ipython_config.py
|
| 88 |
+
|
| 89 |
+
# pyenv
|
| 90 |
+
.python-version
|
| 91 |
+
|
| 92 |
+
# pipenv
|
| 93 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 94 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 95 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 96 |
+
# install all needed dependencies.
|
| 97 |
+
#Pipfile.lock
|
| 98 |
+
|
| 99 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 100 |
+
__pypackages__/
|
| 101 |
+
|
| 102 |
+
# Celery stuff
|
| 103 |
+
celerybeat-schedule
|
| 104 |
+
celerybeat.pid
|
| 105 |
+
|
| 106 |
+
# SageMath parsed files
|
| 107 |
+
*.sage.py
|
| 108 |
+
|
| 109 |
+
# Environments
|
| 110 |
+
.env
|
| 111 |
+
.venv
|
| 112 |
+
env/
|
| 113 |
+
venv/
|
| 114 |
+
ENV/
|
| 115 |
+
env.bak/
|
| 116 |
+
venv.bak/
|
| 117 |
+
|
| 118 |
+
# Spyder project settings
|
| 119 |
+
.spyderproject
|
| 120 |
+
.spyproject
|
| 121 |
+
|
| 122 |
+
# Rope project settings
|
| 123 |
+
.ropeproject
|
| 124 |
+
|
| 125 |
+
# mkdocs documentation
|
| 126 |
+
/site
|
| 127 |
+
|
| 128 |
+
# mypy
|
| 129 |
+
.mypy_cache/
|
| 130 |
+
.dmypy.json
|
| 131 |
+
dmypy.json
|
| 132 |
+
|
| 133 |
+
# Pyre type checker
|
| 134 |
+
.pyre/
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
data/
|
| 138 |
+
output/
|
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# gazet
|
| 2 |
+
|
| 3 |
+
Lean natural-language geocoder with GIS operations over Overture and Natural Earth parquet datasets. In an industry trending toward ever-larger models and heavier infrastructure, gazet takes the opposite path: small language models, DuckDB, and local Parquet files — no PostGIS, no cloud geocoding APIs, no bloat.
|
| 4 |
+
|
| 5 |
+
Name inspired by [Gazetteer](https://en.wikipedia.org/wiki/Gazetteer). A gazetteer is a geographical dictionary or directory used in conjunction with a map or atlas.
|
| 6 |
+
|
| 7 |
+
## Modules
|
| 8 |
+
|
| 9 |
+
| Module | Contents |
|
| 10 |
+
| --- | --- |
|
| 11 |
+
| `config.py` | data paths, model name, SQL schema description |
|
| 12 |
+
| `schemas.py` | `SUBTYPES`, `COUNTRIES`, `Place`, `PlacesResult` |
|
| 13 |
+
| `lm.py` | DSPy signatures + LM init (`extract`, `write_sql`) |
|
| 14 |
+
| `search.py` | fuzzy search against `divisions_area` / `natural_earth` |
|
| 15 |
+
| `sql.py` | code-act SQL generation loop |
|
| 16 |
+
| `export.py` | GeoJSON FeatureCollection writer |
|
| 17 |
+
| `api.py` | FastAPI app with `/search?q=...` returning a JSON object that wraps the GeoJSON FeatureCollection, plus `/search/stream` for NDJSON progress events |
|
| 18 |
+
|
| 19 |
+
## Local setup
|
| 20 |
+
|
| 21 |
+
Install python dependencies
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
uv sync --extra dev --extra demo
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
Ensure you are logged in to Ollama to use remote models.
|
| 28 |
+
|
| 29 |
+
## Usage
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
python -m gazet
|
| 33 |
+
# then GET http://localhost:8000/search?q=Border%20between%20Loja%20and%20Piura
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
### API + Streamlit demo
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
uv run uvicorn gazet.api:app --reload # API on :8000
|
| 40 |
+
uv run streamlit run demo_app.py # demo UI
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Data preparation
|
| 44 |
+
|
| 45 |
+
1. Download Overture divisions data
|
| 46 |
+
2. Download the 10m physical layer from [Natural Earth](https://www.naturalearthdata.com/downloads/10m-physical-vectors/)
|
| 47 |
+
3. Unzip the data
|
| 48 |
+
4. Convert natural earth data to parquet
|
| 49 |
+
|
| 50 |
+
Example for downloading overture
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
aws s3 sync \
|
| 54 |
+
  s3://overturemaps-us-west-2/release/2026-02-18.0/theme=divisions/type=division_area/ data/overture/divisions_area
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
Example for running conversion script for natural earth
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
python -m ingest.convert_natural_earth ~/Downloads/10m_physical
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## Design notes
|
| 64 |
+
|
| 65 |
+
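- `api.py` exposes GET `/search?q=<query>`, which returns a JSON object containing the GeoJSON FeatureCollection (`geojson`), the executed SQL (`sql`), the extracted `places`, and the candidate table; GET `/search/stream?q=<query>` streams the same stages as NDJSON events.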
|
| 66 |
+
- LM is initialised at import time in `lm.py`, suitable for a long-lived server process.
|
| 67 |
+
- Data lives in `data/overture/` and `data/natural_earth_geoparquet/` (not tracked in git).
|
demo_app.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Demo Streamlit app for gazet API. Run API first: uv run uvicorn gazet.api:app --reload"""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import math
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import requests
|
| 8 |
+
import streamlit as st
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
import pydeck as pdk
|
| 12 |
+
except ImportError:
|
| 13 |
+
pdk = None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _coords_from_geom(geom):
|
| 17 |
+
"""Yield (lng, lat) from a GeoJSON geometry."""
|
| 18 |
+
if geom is None:
|
| 19 |
+
return
|
| 20 |
+
t = geom.get("type")
|
| 21 |
+
coords = geom.get("coordinates")
|
| 22 |
+
if not coords:
|
| 23 |
+
return
|
| 24 |
+
if t == "Point":
|
| 25 |
+
yield coords
|
| 26 |
+
elif t in ("LineString", "MultiPoint"):
|
| 27 |
+
for c in coords:
|
| 28 |
+
yield c
|
| 29 |
+
elif t == "Polygon":
|
| 30 |
+
for ring in coords:
|
| 31 |
+
for c in ring:
|
| 32 |
+
yield c
|
| 33 |
+
elif t in ("MultiLineString", "MultiPolygon"):
|
| 34 |
+
for part in coords:
|
| 35 |
+
for c in part if t == "MultiLineString" else part[0]:
|
| 36 |
+
yield c
|
| 37 |
+
elif t == "GeometryCollection":
|
| 38 |
+
for g in geom.get("geometries", []):
|
| 39 |
+
yield from _coords_from_geom(g)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def bbox_from_geojson(geojson):
    """Return ``(min_lng, min_lat, max_lng, max_lat)`` for a FeatureCollection.

    Features may be plain dicts or objects exposing a ``geometry`` attribute.
    Returns ``None`` when no feature contributes any coordinate.
    """
    lngs, lats = [], []
    for feature in geojson.get("features", []):
        if isinstance(feature, dict):
            geom = feature.get("geometry")
        else:
            geom = getattr(feature, "geometry", None)
        for position in _coords_from_geom(geom):
            # Index instead of unpacking so positions with an extra
            # (altitude) element do not raise.
            lngs.append(position[0])
            lats.append(position[1])
    if not lngs:
        return None
    return min(lngs), min(lats), max(lngs), max(lats)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def view_state_for_bbox(bbox, padding_zoom=0.8):
    """Build a pydeck ``ViewState`` centred on ``bbox`` and zoomed to fit it.

    ``bbox`` is ``(min_lng, min_lat, max_lng, max_lat)``. The zoom is
    ``log2(360 / span) - padding_zoom``, where ``span`` is the larger of the
    longitude and latitude extents in degrees, clamped to the range 0-18.
    """
    west, south, east, north = bbox
    center_lat = (south + north) / 2
    center_lng = (west + east) / 2
    # The 1e-6 floor keeps the division finite for degenerate (point) boxes.
    widest_span = max(east - west, north - south, 1e-6)
    fitted_zoom = math.log2(360 / widest_span) - padding_zoom
    clamped_zoom = max(0, min(18, fitted_zoom))
    return pdk.ViewState(latitude=center_lat, longitude=center_lng, zoom=clamped_zoom)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _render_map(geojson, placeholder):
    """Draw ``geojson`` into the Streamlit ``placeholder``.

    Nothing is rendered for an empty FeatureCollection. When pydeck could not
    be imported, the raw GeoJSON is shown as JSON instead of a map.
    """
    feature_count = len(geojson.get("features", []))
    if not feature_count:
        return

    if not pdk:
        # Fallback when the optional pydeck dependency is missing.
        with placeholder.container():
            st.json(geojson)
        return

    geo_layer = pdk.Layer(
        "GeoJsonLayer",
        data=geojson,
        get_fill_color=[40, 180, 160, 200],
        get_line_color=[125, 211, 192, 255],
        get_line_width=2,
        pickable=True,
    )

    bbox = bbox_from_geojson(geojson)
    if bbox:
        initial_view = view_state_for_bbox(bbox)
    else:
        # No coordinates found: fall back to a world view.
        initial_view = pdk.ViewState(latitude=0, longitude=0, zoom=1)

    deck = pdk.Deck(
        layers=[geo_layer],
        initial_view_state=initial_view,
        map_style=None,
        tooltip={"text": "{name}"},
    )
    with placeholder.container():
        st.pydeck_chart(deck, use_container_width=True, height=500)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# Base URL of the gazet API this demo talks to.
API = "http://127.0.0.1:8000"
# One-click example queries shown under the search box.
EXAMPLES = [
    "Angola and Mozambique",
    "Mediterranean Sea",
    "A 0.01 degree buffer around the border between Loja and Piura",
    "The part of Ecuador that is in the Amazon Basin",
    "The northern half of India",
]

st.set_page_config(page_title="Gazet", page_icon="🌍", layout="wide")

st.title("Gazet")
st.caption("Natural-language geo search · click an example or type your own")

# `run_q` holds the query to execute on this script run (set by a button click).
if "run_q" not in st.session_state:
    st.session_state.run_q = None

col1, col2 = st.columns([1, 2])

# Left column: free-text query box, search button and example buttons.
with col1:
    inp_col, btn_col = st.columns([5, 1])
    with inp_col:
        q = st.text_input(
            "Query",
            placeholder="e.g. Southern half of Florida",
            label_visibility="collapsed",
        )
    with btn_col:
        search_clicked = st.button("Search", type="primary")
    if search_clicked and q:
        st.session_state.run_q = q
    for ex in EXAMPLES:
        if st.button(ex, key=ex):
            st.session_state.run_q = ex

# Right column: run the pending query and render each streamed stage.
with col2:
    to_run = st.session_state.run_q
    if to_run:
        # Consume the pending query so it runs once per click.
        st.session_state.run_q = None

        status_ph = st.empty()
        map_ph = st.empty()
        places_ph = st.empty()
        candidates_ph = st.empty()
        sql_ph = st.empty()

        status_ph.info("Extracting places…")

        try:
            with requests.get(
                f"{API}/search/stream", params={"q": to_run}, stream=True, timeout=120
            ) as r:
                r.raise_for_status()

                # The endpoint streams NDJSON: one JSON event per non-empty line.
                for line in r.iter_lines():
                    if not line:
                        continue
                    event = json.loads(line)
                    kind = event["type"]

                    if kind == "places":
                        places = event["data"].get("places", [])
                        status_ph.info("Fuzzy-matching candidates…")
                        if places:
                            places_table = pd.DataFrame(places).rename(
                                columns={
                                    "place": "Place",
                                    "country": "Country",
                                    "subtype": "Subtype",
                                }
                            )
                            with places_ph.container(), st.expander(
                                "Extracted place names", expanded=True
                            ):
                                st.dataframe(
                                    places_table,
                                    use_container_width=True,
                                    hide_index=True,
                                )

                    elif kind == "candidates":
                        status_ph.info("Generating SQL…")
                        with candidates_ph.container(), st.expander(
                            "Candidate datasets", expanded=True
                        ):
                            st.dataframe(
                                pd.DataFrame(event["data"]),
                                use_container_width=True,
                                hide_index=True,
                            )

                    elif kind == "sql_attempt":
                        iteration = event.get("iteration", "")
                        status_ph.info(f"Running SQL (attempt {iteration})…")
                        with sql_ph.container(), st.expander("SQL", expanded=True):
                            st.code(event["data"], language="sql")

                    elif kind == "sql_error":
                        status_ph.warning(
                            f"SQL error on attempt {event.get('iteration', '')}, retrying… "
                            f"`{event['data'][:120]}`"
                        )

                    elif kind == "geojson":
                        geojson = event["data"]
                        n = len(geojson.get("features", []))
                        status_ph.success(f"**{to_run}** → {n} feature(s)")
                        _render_map(geojson, map_ph)

                    elif kind == "error":
                        status_ph.error(event["data"])

        except requests.RequestException as e:
            status_ph.error(
                f"API error: {e}. Is the API running? `uv run uvicorn gazet.api:app --reload`"
            )
|
ingest/convert_natural_earth.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Convert Natural Earth shapefiles to a single GeoParquet with Overture-compatible schema.
|
| 2 |
+
|
| 3 |
+
Input: directory of *.shp files (passed as CLI argument)
|
| 4 |
+
|
| 5 |
+
Output: path to write the .parquet file (passed as CLI argument, default: data/natural_earth_geoparquet/ne_geography.parquet)
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import pathlib
|
| 10 |
+
|
| 11 |
+
import geopandas as gpd
|
| 12 |
+
import pandas as pd
|
| 13 |
+
|
| 14 |
+
# Where the combined GeoParquet is written when no -o/--output is given.
DEFAULT_OUTPUT = (
    pathlib.Path("data") / "natural_earth_geoparquet" / "ne_geography.parquet"
)

# Stems (or substrings) to skip — pure cartographic / utility layers with no
# geographic search value, or point layers that need a separate schema.
SKIP_PATTERNS = (
    "graticules",  # cartographic grid lines
    "_label_points",  # point label layers
    "_scale_rank",  # scale-rank rendering duplicates (base layers kept)
)
# Layers skipped by exact file stem.
SKIP_EXACT = {
    "ne_10m_land_ocean_seams",
    "ne_10m_wgs84_bounding_box",
    "ne_10m_geography_regions_points",
    "ne_10m_geography_regions_elevation_points",
}

# Language suffixes of the `name_<lang>` columns copied into the names struct.
# Order is significant: it is also the lookup order in `_pick_name_col`.
LANG_COLS = [
    "ar", "bn", "de", "en", "es", "fr", "el", "hi", "hu",
    "id", "it", "ja", "ko", "nl", "pl", "pt", "ru", "sv",
    "tr", "vi", "zh", "fa", "he", "uk", "ur", "zht",
]  # fmt: skip
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _names_struct(gdf: gpd.GeoDataFrame, name_col: str) -> pd.Series:
    """Build a names struct column matching Overture's names STRUCT(primary, ...).

    Each row becomes a dict with a ``primary`` key (taken from ``name_col``)
    and one key per entry of ``LANG_COLS`` (taken from ``name_<lang>``).
    Missing, empty and NaN-like values are stored as ``None``.

    Args:
        gdf: Frame holding the name columns.
        name_col: Column used for the ``primary`` name.

    Returns:
        A Series of dicts, one per row of ``gdf``.
    """

    def _clean(value: object) -> str | None:
        # NaN is truthy, so it has to be rejected explicitly; the string
        # checks also catch values that were stringified earlier.
        if value is None or pd.isna(value) or not value:
            return None
        text = str(value)
        return None if text in ("", "nan", "None") else text

    def _row(row: pd.Series) -> dict:
        entry: dict[str, str | None] = {"primary": _clean(row.get(name_col))}
        for lang in LANG_COLS:
            entry[lang] = _clean(row.get(f"name_{lang}"))
        return entry

    return gdf.apply(_row, axis=1)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _pick_name_col(gdf: gpd.GeoDataFrame) -> str | None:
    """Pick the best name column of ``gdf``, matching case-insensitively.

    Preference order: ``name``; then ``name_<suffix>`` for each suffix in
    ``["en", "name"] + LANG_COLS``; then the first column whose name starts
    with ``name``. The actual column label is returned (not its lowercased
    form), so the result can index ``gdf`` whatever the column casing is.

    Returns:
        The chosen column label, or ``None`` when no column qualifies.
    """
    # Lowercased name -> first column carrying it.
    by_lower: dict[str, str] = {}
    for col in gdf.columns:
        by_lower.setdefault(str(col).lower(), col)

    if "name" in by_lower:
        return by_lower["name"]
    for suffix in ["en", "name"] + LANG_COLS:
        candidate = f"name_{suffix}"
        if candidate in by_lower:
            return by_lower[candidate]
    for col in gdf.columns:
        if str(col).lower().startswith("name"):
            return col
    return None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _load_shapefile(src: pathlib.Path, source_key: str) -> gpd.GeoDataFrame:
    """Read one shapefile and project it onto the shared output columns.

    Column names are lowercased first. The output carries ``id``,
    ``source_layer``, ``names``, ``subtype``, ``class``, ``country``,
    ``region``, ``admin_level``, ``is_land``, ``is_territorial`` and
    ``geometry``; columns with no source attribute are filled with nulls.

    Args:
        src: Path of the ``.shp`` file.
        source_key: Layer identifier (the caller passes the file stem).

    Returns:
        A GeoDataFrame in the source file's CRS.
    """
    gdf = gpd.read_file(src)
    gdf.columns = [col.lower() for col in gdf.columns]
    row_count = len(gdf)
    present = set(gdf.columns)

    def _null_strings():
        return pd.array([None] * row_count, dtype=pd.StringDtype())

    # id: ne_id if present, else "<source>_<row position>"
    if "ne_id" in present:
        ids = "ne_" + gdf["ne_id"].astype(str)
    else:
        ids = pd.Series([f"ne_10m_{source_key}_{i}" for i in range(row_count)])

    name_col = _pick_name_col(gdf)
    if name_col is not None:
        names = _names_struct(gdf, name_col)
    else:
        blank_names = {"primary": None, **{lang: None for lang in LANG_COLS}}
        names = pd.Series([blank_names] * row_count)

    # subtype: featurecla if present, else the source key
    if "featurecla" in present:
        subtype = gdf["featurecla"]
    else:
        subtype = pd.Series([source_key] * row_count)

    columns = {
        "id": ids,
        "source_layer": pd.array([source_key] * row_count, dtype=pd.StringDtype()),
        "names": names,
        "subtype": subtype,
        "class": _null_strings(),
        "country": gdf["sov_a3"] if "sov_a3" in present else _null_strings(),
        "region": gdf["region"] if "region" in present else _null_strings(),
        "admin_level": pd.array([None] * row_count, dtype=pd.Int32Dtype()),
        "is_land": _infer_is_land(source_key, gdf),
        "is_territorial": pd.array([None] * row_count, dtype=pd.BooleanDtype()),
        "geometry": gdf.geometry,
    }
    return gpd.GeoDataFrame(columns, crs=gdf.crs)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _infer_is_land(source_key: str, gdf: gpd.GeoDataFrame) -> pd.Series:
|
| 135 |
+
"""Infer is_land from source name when possible."""
|
| 136 |
+
n = len(gdf)
|
| 137 |
+
ocean_marine = ("ocean", "marine", "bathymetry", "coastline", "seams", "reefs")
|
| 138 |
+
if any(x in source_key for x in ocean_marine):
|
| 139 |
+
return pd.Series([False] * n)
|
| 140 |
+
if "land" in source_key or "lakes" in source_key or "regions" in source_key:
|
| 141 |
+
return pd.Series([True] * n)
|
| 142 |
+
return pd.array([None] * n, dtype=pd.BooleanDtype())
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
if __name__ == "__main__":
    # CLI entry point: convert every non-utility shapefile in a directory
    # into one combined GeoParquet file.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "shp_dir", type=pathlib.Path, help="Directory containing *.shp files"
    )
    parser.add_argument(
        "-o",
        "--output",
        type=pathlib.Path,
        default=DEFAULT_OUTPUT,
        help=f"Output .parquet path (default: {DEFAULT_OUTPUT})",
    )
    args = parser.parse_args()

    all_shp = sorted(args.shp_dir.glob("*.shp"))
    if not all_shp:
        raise SystemExit(f"No .shp files in {args.shp_dir}")

    def _should_skip(stem: str) -> bool:
        """Return True for layers listed in SKIP_EXACT or matching SKIP_PATTERNS."""
        if stem in SKIP_EXACT:
            return True
        return any(p in stem for p in SKIP_PATTERNS)

    # Single pass so the skip predicate runs once per file.
    shp_files = []
    skipped = []
    for shp_path in all_shp:
        if _should_skip(shp_path.stem):
            skipped.append(shp_path.stem)
        else:
            shp_files.append(shp_path)
    if skipped:
        print(f"Skipping {len(skipped)} utility layers: {', '.join(skipped)}\n")

    # Without this guard pd.concat([]) and frames[0] below would fail with an
    # unhelpful traceback.
    if not shp_files:
        raise SystemExit(
            f"All .shp files in {args.shp_dir} are utility layers; nothing to convert"
        )

    frames = []
    for path in shp_files:
        source_key = path.stem  # e.g. ne_10m_geography_marine_polys
        gdf = _load_shapefile(path, source_key)
        frames.append(gdf)
        print(f" {path.name}: {len(gdf)} features")

    # The combined frame takes the CRS of the first layer.
    combined = gpd.GeoDataFrame(
        pd.concat(frames, ignore_index=True),
        crs=frames[0].crs,
    )

    args.output.parent.mkdir(parents=True, exist_ok=True)
    combined.to_parquet(args.output)
    print(f"\nSaved {len(combined)} features → {args.output}")
|
pyproject.toml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "gazet"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Lean natural-language geocoder with GIS operations over Overture and Natural Earth parquet datasets"
|
| 9 |
+
requires-python = ">=3.13.0, <3.14"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"duckdb>=1.4.4",
|
| 12 |
+
"fastapi>=0.115",
|
| 13 |
+
"uvicorn[standard]>=0.32",
|
| 14 |
+
"dspy>=3.1.3",
|
| 15 |
+
"pandas>=2.2",
|
| 16 |
+
"pydantic>=2.0",
|
| 17 |
+
"pyarrow>=17.0.0",
|
| 18 |
+
"geopandas>=1.1.2",
|
| 19 |
+
]
|
| 20 |
+
optional-dependencies = { demo = ["streamlit", "requests", "pydeck"], dev = ["ruff"] }
|
| 21 |
+
|
| 22 |
+
[tool.hatch.build.targets.wheel]
|
| 23 |
+
packages = ["src/gazet"]
|
src/gazet/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .api import app
|
| 2 |
+
from .schemas import Place, PlacesResult
|
| 3 |
+
|
| 4 |
+
__all__ = ["app", "Place", "PlacesResult"]
|
src/gazet/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uvicorn
|
| 2 |
+
|
| 3 |
+
from .api import app
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
    # `python -m gazet`: serve the FastAPI app with uvicorn on all
    # interfaces (0.0.0.0), port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
src/gazet/api.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Any, Generator
|
| 3 |
+
|
| 4 |
+
import duckdb
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from fastapi import FastAPI, HTTPException
|
| 7 |
+
from fastapi.responses import StreamingResponse
|
| 8 |
+
|
| 9 |
+
from .export import to_feature_collection
|
| 10 |
+
from .lm import extract
|
| 11 |
+
from .search import search_divisions_area, search_natural_earth
|
| 12 |
+
from .sql import run_geo_sql_loop
|
| 13 |
+
|
| 14 |
+
app = FastAPI()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _df_to_records(df: pd.DataFrame) -> list[dict[str, Any]]:
|
| 18 |
+
"""Convert DataFrame to list of dicts for JSON; handle non-JSON-serializable types."""
|
| 19 |
+
return df.replace({float("nan"): None}).to_dict(orient="records")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _run_stream(query: str) -> Generator[str, None, None]:
    """Yield NDJSON lines as each stage of the search completes.

    Event ``type`` values (in order of emission):
    - ``places``      – extracted place names
    - ``candidates``  – merged fuzzy-match table
    - ``sql_attempt`` – SQL generated in the current loop iteration
    - ``sql_error``   – execution/generation error in the current iteration
    - ``geojson``     – final FeatureCollection
    - ``error``       – fatal error (no result)

    Every yielded string is one JSON object followed by a newline. A DuckDB
    connection is opened per call and closed when the generator finishes or
    is closed early.
    """

    def _ndjson(payload: dict[str, Any]) -> str:
        return json.dumps(payload) + "\n"

    pred = extract(query=query)
    print("extract result:", pred.result)
    places_result = pred.result

    yield _ndjson({"type": "places", "data": places_result.model_dump()})

    con = duckdb.connect()
    try:
        # Extension setup sits inside the try so the connection is closed
        # even when INSTALL/LOAD fails.
        con.execute("INSTALL spatial")
        con.execute("LOAD spatial")

        all_candidates: list[pd.DataFrame] = []
        for place in places_result.places:
            for search_fn in (search_divisions_area, search_natural_earth):
                df = search_fn(con, place)
                if not df.empty:
                    all_candidates.append(df)

        if not all_candidates:
            yield _ndjson({"type": "error", "data": "No candidates found"})
            return

        # One row per (source, id); best similarity first, ties broken by
        # ascending admin_level.
        candidates_df = (
            pd.concat(all_candidates, ignore_index=True)
            .drop_duplicates(subset=["source", "id"])
            .sort_values(["similarity", "admin_level"], ascending=[False, True])
            .reset_index(drop=True)
        )

        yield _ndjson({"type": "candidates", "data": _df_to_records(candidates_df)})

        # Loop event type -> key of the loop event that becomes "data".
        forwarded = {"sql_attempt": "sql", "sql_error": "error"}
        result_df: pd.DataFrame | None = None
        for event in run_geo_sql_loop(con, query, candidates_df):
            kind = event["type"]
            if kind in forwarded:
                yield _ndjson(
                    {
                        "type": kind,
                        "data": event[forwarded[kind]],
                        "iteration": event["iteration"],
                    }
                )
            elif kind == "result":
                result_df = event["df"]

        if result_df is None or result_df.empty:
            yield _ndjson({"type": "error", "data": "No result from SQL"})
            return

        yield _ndjson({"type": "geojson", "data": to_feature_collection(result_df)})

    finally:
        con.close()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
@app.get("/search/stream")
def search_stream(q: str) -> StreamingResponse:
    """Stream the search stages for ``q`` as NDJSON, one JSON event per line."""
    ndjson_lines = _run_stream(q)
    return StreamingResponse(ndjson_lines, media_type="application/x-ndjson")
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@app.get("/search", response_model=None)
def search(q: str) -> dict[str, Any]:
    """Run geo search for a natural-language query (non-streaming).

    Drains the same event stream as ``/search/stream`` and keeps the latest
    value of each stage.

    Returns:
        A dict with ``geojson`` (the FeatureCollection), ``sql`` (the last
        SQL attempt), ``places`` (the extraction result) and
        ``dataframes.candidates`` (candidate rows as JSON-serializable
        records).

    Raises:
        HTTPException: 404 when the stream produced no ``geojson`` event. The
            detail is the streamed ``error`` message when there was one,
            otherwise ``"No result"``.
    """
    places: dict = {}
    candidates: list = []
    sql = ""
    geojson: dict | None = None
    failure: str | None = None

    for line in _run_stream(q):
        if not line.strip():
            continue
        event = json.loads(line)
        kind = event["type"]
        if kind == "places":
            places = event["data"]
        elif kind == "candidates":
            candidates = event["data"]
        elif kind == "sql_attempt":
            sql = event["data"]
        elif kind == "geojson":
            geojson = event["data"]
        elif kind == "error":
            # Keep the reason so the 404 below can say why there is no result.
            failure = event["data"]

    if geojson is None:
        raise HTTPException(status_code=404, detail=failure or "No result")

    return {
        "geojson": geojson,
        "sql": sql,
        "places": places,
        "dataframes": {"candidates": candidates},
    }
|
src/gazet/config.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
# The data directory sits at the project root (gazet/data/), three directory
# levels above this module, not inside the package itself.
_PROJECT_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent
_DATA_DIR = _PROJECT_ROOT / "data"

# Parquet locations handed to DuckDB's read_parquet (the first one is a glob).
DIVISIONS_AREA_PATH = str(_DATA_DIR / "overture/divisions_area/*.parquet")
NATURAL_EARTH_PATH = str(_DATA_DIR / "natural_earth_geoparquet/ne_geography.parquet")

# Model tag used by the LM client. Alternatives kept for quick switching:
# MODEL = "qwen3.5:cloud"
# MODEL = "granite4:350m"
# MODEL = "gemma3:12b-cloud"
# MODEL = "qwen3.5:397b-cloud"
# MODEL = "qwen3:4b"
# MODEL = "qwen3-coder-next:cloud"
# MODEL = "deepseek-coder:1.3b"
MODEL = "gpt-oss:20b-cloud"

# Upper bound on generate-and-execute rounds in the SQL loop.
MAX_SQL_ITERATIONS = 5

# Schema description given to the SQL-writing model. This is prompt text:
# changing its wording changes what the model is told.
SCHEMA_INFO = f"""
Available DuckDB datasets (read via read_parquet):

1. divisions_area — Overture polygon/multipolygon admin boundaries
path: '{DIVISIONS_AREA_PATH}'
columns:
id VARCHAR -- unique feature id (use this to filter precisely)
names STRUCT("primary" VARCHAR, ...)
country VARCHAR -- ISO 3166-1 alpha-2
subtype VARCHAR -- country | region | dependency | county | macrohood |
localadmin | locality | neighborhood | microhood
class VARCHAR
region VARCHAR -- region code (e.g. 'EC-L' for Loja Ecuador)
admin_level INTEGER
division_id VARCHAR
is_land BOOLEAN
is_territorial BOOLEAN
geometry GEOMETRY -- boundary polygon/multipolygon (WKB, spatial ext loaded)

2. natural_earth — Natural Earth geography polygons (oceans, seas, terrain regions, islands)
path: '{NATURAL_EARTH_PATH}'
columns:
id VARCHAR -- unique feature id prefixed 'ne_'
names STRUCT("primary" VARCHAR, ...)
subtype VARCHAR -- e.g. 'ocean', 'sea', 'bay', 'Terrain area', 'Island group'
class VARCHAR
country VARCHAR
region VARCHAR
admin_level INTEGER
is_land BOOLEAN
is_territorial BOOLEAN
geometry GEOMETRY -- polygon/multipolygon (WKB, spatial ext loaded)

Spatial extension is already loaded — use ST_AsGeoJSON(geometry) or ST_AsText(geometry).
To access names use: names."primary"

The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
Use the matching path for each candidate's source when querying.

Example patterns:
-- single region boundary from divisions_area
SELECT id, names."primary" AS name, ST_AsGeoJSON(geometry) AS geojson
FROM read_parquet('{DIVISIONS_AREA_PATH}')
WHERE id = '<candidate_id>'

-- feature from natural_earth
SELECT id, names."primary" AS name, ST_AsGeoJSON(geometry) AS geojson
FROM read_parquet('{NATURAL_EARTH_PATH}')
WHERE id = '<candidate_id>'

-- shared border between two adjacent regions
WITH a AS (SELECT geometry FROM read_parquet('{DIVISIONS_AREA_PATH}') WHERE id = '<id_a>'),
b AS (SELECT geometry FROM read_parquet('{DIVISIONS_AREA_PATH}') WHERE id = '<id_b>')
SELECT ST_AsGeoJSON(ST_Intersection(a.geometry, b.geometry)) AS border
FROM a, b
"""
|
src/gazet/export.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pathlib
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _is_geojson_col(series: pd.Series) -> bool:
|
| 9 |
+
"""Heuristic: a string column whose non-null values start with '{"type":'."""
|
| 10 |
+
sample = series.dropna().head(5)
|
| 11 |
+
return (
|
| 12 |
+
sample.apply(
|
| 13 |
+
lambda v: isinstance(v, str) and v.lstrip().startswith('{"type":')
|
| 14 |
+
).all()
|
| 15 |
+
and len(sample) > 0
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def save_geojson(
    result_df: pd.DataFrame, query: str, output_dir: pathlib.Path | str = "."
) -> pathlib.Path:
    """Wrap *result_df* into a GeoJSON FeatureCollection and save it to disk.

    Any column whose values are GeoJSON geometry strings (output of
    ST_AsGeoJSON) is used as the feature geometry; remaining columns become
    properties. If multiple geometry columns exist the first one wins.

    Args:
        result_df: Query result to export.
        query: Original query text; a lower-cased, underscore-separated slug
            of it becomes the file name.
        output_dir: Directory to write into; created if missing.

    Returns:
        Path of the written ``.geojson`` file.
    """
    output_dir = pathlib.Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # A query with no word characters would otherwise produce the hidden file
    # ".geojson", so fall back to a fixed name.
    slug = re.sub(r"[^\w]+", "_", query.lower()).strip("_") or "result"
    out_path = output_dir / f"{slug}.geojson"

    fc = _to_feature_collection(result_df)
    out_path.write_text(json.dumps(fc, indent=2), encoding="utf-8")
    print(f"\nSaved {len(fc['features'])} feature(s) → {out_path.resolve()}")
    return out_path
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def to_feature_collection(result_df: pd.DataFrame) -> dict:
    """Public wrapper: turn a result DataFrame into a GeoJSON FeatureCollection dict."""
    collection = _to_feature_collection(result_df)
    return collection
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _is_missing(value: object) -> bool:
    """Return True only for scalar null markers (None, NaN, NaT, pd.NA)."""
    # pd.isna on a list/array returns an array, which cannot be used in a
    # boolean context; containers are therefore never treated as missing.
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _json_safe(value: object) -> object:
    """Convert numpy scalars/arrays to plain Python objects; pass others through."""
    to_list = getattr(value, "tolist", None)
    return to_list() if callable(to_list) else value


def _to_feature_collection(result_df: pd.DataFrame) -> dict:
    """Build a GeoJSON FeatureCollection dict with one Feature per row.

    The first column detected by ``_is_geojson_col`` supplies the geometry.
    Every other column, including any further GeoJSON-looking columns, is
    copied into ``properties`` unless its value is null. Rows whose geometry
    cell is empty or not parseable as JSON get ``"geometry": None``.
    """
    columns = list(result_df.columns)
    geom_cols = [c for c in columns if _is_geojson_col(result_df[c])]
    geom_col = geom_cols[0] if geom_cols else None
    # Same ordering as before: plain columns first, then the extra geometry
    # columns (kept as their raw JSON strings).
    prop_cols = [c for c in columns if c not in geom_cols] + geom_cols[1:]

    features = []
    for _, row in result_df.iterrows():
        geometry = None
        if geom_col is not None:
            raw = row[geom_col]
            if isinstance(raw, str) and raw:
                try:
                    geometry = json.loads(raw)
                except json.JSONDecodeError:
                    # Best effort: keep the feature, just without a geometry.
                    geometry = None

        properties = {}
        for col in prop_cols:
            value = row[col]
            if _is_missing(value):
                continue
            if isinstance(value, (bytes, bytearray, memoryview)):
                # Binary blobs (e.g. a raw WKB geometry column) cannot be
                # represented in JSON, so they are left out of properties.
                continue
            properties[col] = _json_safe(value)

        features.append(
            {"type": "Feature", "geometry": geometry, "properties": properties}
        )
    return {"type": "FeatureCollection", "features": features}
|
src/gazet/lm.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dspy
|
| 2 |
+
|
| 3 |
+
from .config import MODEL
|
| 4 |
+
from .schemas import PlacesResult
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# NOTE(review): per the DSPy docs the signature docstring is used as the task
# instructions for the LM, so it is runtime text — edit its wording with care.
class ExtractPlaces(dspy.Signature):
    """Extract place names from a query.

    Data is available from overture and natural earth datasets.

    Overture has divisions and natural earth has physical features.
    - divisions are administrative units like countries, states, counties, cities, towns, villages, etc.
    - physical features are natural features like oceans, seas, lakes, rivers, mountains, etc.

    When extracting a place name, you can use the overture divisions or natural earth physical features.
    - If the user mentions an overture division, use the overture divisions.
    - If the user mentions a natural earth physical feature, use the natural earth physical features.
    - If the user mentions a place name that is not in the overture divisions or natural earth physical features, return the place name as is.

    Where possible and relevant, also extract the ISO country code for each place.

    Do not repeat the same place name in the result.

    If the user does not explicitly mention a country, dont add the country code to the result.

    If the user does not mention an admin level, dont add the subtype to the result.

    If the query asks for some kind of subdivision (e.g. 'municipalities in Bern', 'States in Brazil'),
    return the subdivision type in the places result.

    When identifying a place name from the user's query, also infer the most appropriate
    Overture division subtype from the list below. Only include a subtype if the query
    makes it reasonably clear what geographic level is intended. If ambiguous, omit it.

    SUBTYPES:
    - country : Sovereign nation. E.g. "France", "Brazil"
    - dependency : Territory dependent on a country but not a full sub-region. E.g. "Puerto Rico", "Guam"
    - region : Largest admin unit within a country; state, province, canton, etc. E.g. "California", "Alberta", "Bavaria"
    - county : Second-level admin subdivision within a region. E.g. "Kings County", "Kent"
    - localadmin : A governing layer (common in Europe) that contains localities which have no authority of their own. E.g. a French commune or Belgian municipality. Use when the place is clearly an admin unit but not a city itself.
    - locality : A populated place — city, town, village. The most common subtype for named settlements. E.g. "Lisbon", "Taipei", "Salt Lake City"
    - macrohood : A large super-neighborhood grouping smaller neighborhoods. E.g. "BoCoCa" in Brooklyn
    - neighborhood : A named community area within a city or town. E.g. "Cobble Hill", "Alfama"
    - microhood : A mini-neighborhood within a neighborhood. Very fine-grained, rarely referenced explicitly.

    HIERARCHY (coarse to fine):
    country → dependency / region → county → localadmin → locality → macrohood → neighborhood → microhood

    GUIDANCE:
    - "Paris" with no qualifier → locality
    - "Île-de-France" or "Catalonia" → region
    - "the 11th arrondissement" → neighborhood (or localadmin)
    - "Greater London" style phrasing → county or region depending on context
    - If the user says "neighborhood in X" or "district of X" → neighborhood
    - Default to locality for any named city/town if unsure
    - Omit subtype entirely if the query gives no signal (e.g. bare coordinates or a POI name)
    """

    # Input: the raw user query.
    query: str = dspy.InputField(desc="Natural language query mentioning one or more place names")
    # Output: structured places, validated against the PlacesResult model.
    result: PlacesResult = dspy.OutputField(desc="Extracted places with optional country codes and optional subtype")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# NOTE(review): per the DSPy docs the signature docstring is used as the task
# instructions for the LM, so it is runtime text — edit its wording with care.
class WriteGeoSQL(dspy.Signature):
    """Write a DuckDB SQL SELECT query that extracts the geometry answering a geo query.

    You are given:
    - The user's original natural language query
    - A schema description of the available Overture divisions parquet datasets
    - A table of fuzzy-matched candidate divisions with their IDs and metadata

    Write a single, read-only DuckDB SQL SELECT statement that returns the geometry
    (and key attributes like name, subtype, country) that best answers the query.
    Use candidate IDs from the candidates table to filter precisely — avoid full scans
    when you have exact IDs. The spatial extension is already loaded; use
    ST_AsGeoJSON(geometry) for geometry output.

    The user might ask for GIS operations such as intersections, buffering, or
    sections of geometries. You can use the spatial extension to perform these operations.

    Return ONLY the SQL — no markdown fences, no explanation.
    """

    # Inputs describing the task.
    user_query: str = dspy.InputField(desc="Original natural language geo query")
    schema: str = dspy.InputField(desc="Available datasets, column types, and example patterns")
    candidates: str = dspy.InputField(desc="Fuzzy-matched candidate divisions (id, name, country, subtype, similarity, ...)")

    # Feedback from the previous round of the retry loop (both empty on the first try).
    previous_sql: str = dspy.InputField(desc="SQL from the previous attempt — empty string if this is the first try")
    execution_error: str = dspy.InputField(desc="Error raised by the previous SQL — empty string if no error yet")

    # Output: the SQL text to execute.
    sql: str = dspy.OutputField(desc="Valid read-only DuckDB SQL SELECT statement, no markdown fences")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# LM client: the configured MODEL addressed through the "ollama_chat/" prefix
# at a localhost endpoint, with temperature 0.
lm = dspy.LM(
    f"ollama_chat/{MODEL}",
    api_base="http://localhost:11434",
    api_key="",
    temperature=0,
)
dspy.configure(lm=lm)

# Module-level predictors, one per signature, imported by other modules.
extract = dspy.Predict(ExtractPlaces)
write_sql = dspy.Predict(WriteGeoSQL)
|
src/gazet/schemas.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal, Optional, List
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Allowed values for a place's subtype.
SUBTYPES = Literal[
    "country", "region", "dependency", "county", "macrohood",
    "localadmin", "locality", "neighborhood", "microhood",
]

# Allowed two-letter country codes, one row per initial letter.
COUNTRIES = Literal[
    "AD", "AE", "AF", "AG", "AI", "AL", "AM", "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", "BW", "BY", "BZ",
    "CA", "CC", "CD", "CF", "CG", "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CP", "CR", "CU", "CV", "CW", "CX", "CY", "CZ",
    "DE", "DJ", "DK", "DM", "DO", "DZ",
    "EC", "EE", "EG", "ER", "ES", "ET",
    "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", "GW", "GY",
    "HK", "HM", "HN", "HR", "HT", "HU",
    "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS", "IT",
    "JE", "JM", "JO", "JP",
    "KE", "KG", "KH", "KI", "KM", "KN", "KP", "KR", "KW", "KY", "KZ",
    "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY",
    "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", "MT", "MU", "MV", "MW", "MX", "MY", "MZ",
    "NA", "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", "NR", "NU", "NZ",
    "OM",
    "PA", "PE", "PF", "PG", "PH", "PK", "PL", "PM", "PN", "PR", "PT", "PW", "PY",
    "QA",
    "RE", "RO", "RS", "RU", "RW",
    "SA", "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", "SX", "SY", "SZ",
    "TC", "TD", "TF", "TG", "TH", "TJ", "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", "TW", "TZ",
    "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU",
    "WF", "WS",
    "XA", "XB", "XC", "XD", "XE", "XG", "XH", "XI", "XJ", "XK", "XL", "XM", "XN", "XO", "XP", "XQ", "XR", "XS", "XT", "XU", "XW", "XX", "XY", "XZ",
    "YE", "YT",
    "ZA", "ZM", "ZW",
]
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# One place mentioned in a query. Documented with comments rather than a
# docstring so the model's generated schema stays unchanged.
class Place(BaseModel):
    # Place name as extracted from the query text.
    place: str
    # Optional country code; must be one of COUNTRIES when present.
    country: Optional[COUNTRIES] = None
    # Optional subtype; must be one of SUBTYPES when present.
    subtype: Optional[SUBTYPES] = None
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# Structured output of place extraction. Documented with comments rather than
# a docstring so the model's generated schema stays unchanged.
class PlacesResult(BaseModel):
    # Every place found in the query.
    places: List[Place]
    # Optional result-level subtype, separate from each Place's own subtype.
    subtype: Optional[SUBTYPES] = None
|
src/gazet/search.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
from .config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH
|
| 5 |
+
from .schemas import Place
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _fuzzy_search(
    con: duckdb.DuckDBPyConnection,
    path: str,
    source: str,
    place: Place,
    extra_select: str = "",
    limit: int = 5,
    is_overture: bool = False,
) -> pd.DataFrame:
    """Rank rows of a parquet dataset by Levenshtein similarity to a place name.

    Similarity is ``1 - distance / max(len(name), len(query), 1)`` computed on
    ``names."primary"``. Country and subtype filters are applied only when
    *is_overture* is set and the corresponding field on *place* is filled.

    Args:
        con: Open DuckDB connection.
        path: Parquet path (or glob) to read.
        source: Label written to the ``source`` column of the result.
        place: Place to look up.
        extra_select: Extra column expression appended to the SELECT list.
        limit: Maximum number of rows returned.
        is_overture: Enable the country/subtype filters.

    Returns:
        The best matches, with ``source`` as the first column.
    """
    use_country = bool(is_overture and place.country)
    use_subtype = bool(is_overture and place.subtype)
    country_filter = "AND country = ?" if use_country else ""
    subtype_filter = "AND subtype = ?" if use_subtype else ""

    # Bind values in the order the ? placeholders appear in the statement.
    params: list = [place.place, place.place, path]
    if use_country:
        params.append(place.country)
    if use_subtype:
        params.append(place.subtype)
    params.append(limit)

    extra_clause = f", {extra_select}" if extra_select else ""
    statement = f"""
        SELECT
            id,
            names."primary" AS name,
            country,
            subtype,
            class,
            region,
            admin_level,
            is_land,
            is_territorial{extra_clause},
            1.0 - (levenshtein(lower(names."primary"), lower(?))::float
                / greatest(length(names."primary"), length(?), 1)) AS similarity
        FROM read_parquet(?)
        WHERE names."primary" IS NOT NULL AND trim(names."primary") != ''
        {country_filter}
        {subtype_filter}
        ORDER BY similarity DESC, admin_level ASC
        LIMIT ?
        """
    matches = con.execute(statement, params).fetchdf()
    matches.insert(0, "source", source)

    # Console trace of what was searched and what came back.
    country_tag = f" [{place.country}]" if place.country else ""
    label = f'"{place.place}"' + country_tag
    if matches.empty:
        print(f"\n{source} – {label}: no matches")
        return matches
    print(f"\n{source} – {label} (top {len(matches)} by name similarity):")
    print(matches.to_string(index=False))
    return matches
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def search_divisions_area(
    con: duckdb.DuckDBPyConnection, place: Place, limit: int = 5
) -> pd.DataFrame:
    """Fuzzy-match *place* against divisions_area (Overture admin boundaries).

    Runs ``_fuzzy_search`` with the country/subtype filters enabled and
    ``division_id`` added to the selected columns.
    """
    return _fuzzy_search(
        con,
        path=DIVISIONS_AREA_PATH,
        source="divisions_area",
        place=place,
        extra_select="division_id",
        limit=limit,
        is_overture=True,
    )
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def search_natural_earth(
    con: duckdb.DuckDBPyConnection, place: Place, limit: int = 5
) -> pd.DataFrame:
    """Fuzzy-match *place* against the Natural Earth geography polygons.

    Runs ``_fuzzy_search`` with its defaults, so no country or subtype filter
    is applied.
    """
    return _fuzzy_search(
        con,
        path=NATURAL_EARTH_PATH,
        source="natural_earth",
        place=place,
        limit=limit,
    )
|
src/gazet/sql.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Any, Generator, Optional
|
| 3 |
+
|
| 4 |
+
import duckdb
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from .config import MAX_SQL_ITERATIONS, SCHEMA_INFO
|
| 8 |
+
from .lm import write_sql
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _strip_fences(sql: Optional[str]) -> str:
|
| 12 |
+
"""Remove markdown code fences that the LM may wrap the SQL in."""
|
| 13 |
+
if not sql:
|
| 14 |
+
return ""
|
| 15 |
+
sql = re.sub(r"^```\w*\s*\n?", "", sql.strip())
|
| 16 |
+
sql = re.sub(r"\n?```\s*$", "", sql)
|
| 17 |
+
return sql.strip()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _read_only_violation(sql: str) -> str:
    """Return why *sql* must not be executed, or ``""`` if it looks read-only.

    This is a cheap first line of defence for model-written SQL, not a
    sandbox: it accepts one statement whose first keyword is SELECT, WITH or
    FROM. It is deliberately strict, so a ``;`` inside a string literal is
    also rejected; the caller feeds the reason back to the model for a retry.
    """
    statement = sql.strip().rstrip(";").rstrip()
    if ";" in statement:
        return "Only a single SQL statement is allowed; remove the extra statements."
    # Skip leading whitespace, comments and opening parentheses to reach the
    # first keyword.
    head = re.sub(r"^(?:\s+|--[^\n]*|/\*.*?\*/|\()+", "", statement, flags=re.DOTALL)
    keyword = re.match(r"[A-Za-z_]+", head)
    if keyword is None or keyword.group(0).lower() not in ("select", "with", "from"):
        return "Only read-only SELECT queries are allowed; rewrite the query as a SELECT."
    return ""


def run_geo_sql_loop(
    con: duckdb.DuckDBPyConnection,
    user_query: str,
    candidates_df: pd.DataFrame,
    max_iterations: int = MAX_SQL_ITERATIONS,
) -> Generator[dict[str, Any], None, None]:
    """Code-act loop: ask the LM for SQL, run it, and retry on failure.

    Each round passes the previous SQL and its error back to the model. The
    loop stops at the first query that returns at least one row, or after
    *max_iterations* rounds.

    Event types yielded:
    - ``sql_attempt`` – ``{"type": "sql_attempt", "sql": str, "iteration": int}``
    - ``sql_error`` – ``{"type": "sql_error", "error": str, "iteration": int}``
    - ``result`` – ``{"type": "result", "df": DataFrame | None, "sql": str}``

    A ``result`` event is always the last event; its ``df`` is ``None`` when
    there were no candidates or every round failed. SQL that is not a single
    read-only query is reported as a ``sql_error`` and never executed.
    """
    if candidates_df.empty:
        print("\n[SQL-Act] No candidates to work with — skipping.")
        yield {"type": "result", "df": None, "sql": ""}
        return

    candidates_str = candidates_df.to_string(index=False)
    previous_sql = ""
    error = ""

    for iteration in range(1, max_iterations + 1):
        print(f"\n{'=' * 60}")
        print(f"[SQL-Act] Iteration {iteration}/{max_iterations}")

        try:
            pred = write_sql(
                user_query=user_query,
                schema=SCHEMA_INFO,
                candidates=candidates_str,
                previous_sql=previous_sql,
                execution_error=error,
            )
            sql = _strip_fences(pred.sql)
        except Exception as exc:
            # Generation failures count as a failed round so the loop still
            # terminates after max_iterations.
            error = f"LM generation failed: {exc}"
            print(f"Generation error: {error}")
            yield {"type": "sql_error", "error": error, "iteration": iteration}
            continue

        if not sql:
            error = "LM returned an empty SQL response."
            print(f"Generation error: {error}")
            yield {"type": "sql_error", "error": error, "iteration": iteration}
            continue

        print(f"\nGenerated SQL:\n{sql}\n")
        yield {"type": "sql_attempt", "sql": sql, "iteration": iteration}

        # Refuse to run anything that is not a single read-only query; the
        # reason goes back to the model like any other execution error.
        violation = _read_only_violation(sql)
        if violation:
            error = violation
            previous_sql = sql
            print(f"Rejected SQL: {error}")
            yield {"type": "sql_error", "error": error, "iteration": iteration}
            continue

        try:
            result_df = con.execute(sql).fetchdf()
            if result_df.empty:
                error = "The query executed successfully but returned no rows. Revise the query to return at least one result."
                previous_sql = sql
                print(f"Empty result: {error}")
                yield {"type": "sql_error", "error": error, "iteration": iteration}
                continue
            print(f"Result ({len(result_df)} row(s)):")
            print(result_df.to_string(index=False, max_colwidth=120))
            yield {"type": "result", "df": result_df, "sql": sql}
            return
        except Exception as exc:
            error = str(exc)
            previous_sql = sql
            print(f"Execution error: {error}")
            yield {"type": "sql_error", "error": error, "iteration": iteration}

    print(
        f"\n[SQL-Act] Exhausted {max_iterations} iterations without a successful query."
    )
    yield {"type": "result", "df": None, "sql": ""}
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|