Spaces:

developmentseed
/

gazet

Running

App Files Files Community

Daniel Wiesmann committed on Mar 6

Commit

ad5ec6b

1 Parent(s): 24c2cc8

Initial commit

Browse files

Files changed (15) hide show

.gitignore +138 -0
README.md +67 -0
demo_app.py +217 -0
ingest/convert_natural_earth.py +187 -0
pyproject.toml +23 -0
src/gazet/__init__.py +4 -0
src/gazet/__main__.py +6 -0
src/gazet/api.py +147 -0
src/gazet/config.py +74 -0
src/gazet/export.py +65 -0
src/gazet/lm.py +112 -0
src/gazet/schemas.py +301 -0
src/gazet/search.py +94 -0
src/gazet/sql.py +91 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,138 @@

+# Imagery
+*.SAFE*
+*.TIF
+*.tif
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+data/
+output/

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+# gazet
+Lean natural-language geocoder with GIS operations over Overture and Natural Earth parquet datasets. In an industry trending toward ever-larger models and heavier infrastructure, gazet takes the opposite path: small language models, DuckDB, and local Parquet files — no PostGIS, no cloud geocoding APIs, no bloat.
+Name inspired by [Gazetteer](https://en.wikipedia.org/wiki/Gazetteer). A gazetteer is a geographical dictionary or directory used in conjunction with a map or atlas.
+## Modules
+| Module | Contents |
+| --- | --- |
+| `config.py` | data paths, model name, SQL schema description |
+| `schemas.py` | `SUBTYPES`, `COUNTRIES`, `Place`, `PlacesResult` |
+| `lm.py` | DSPy signatures + LM init (`extract`, `write_sql`) |
+| `search.py` | fuzzy search against `divisions_area` / `natural_earth` |
+| `sql.py` | code-act SQL generation loop |
+| `export.py` | GeoJSON FeatureCollection writer |
+| `api.py` | FastAPI app with `/search?q=...` returning GeoJSON FeatureCollection |
+## Local setup
+Install python dependencies
+```bash
+uv sync --extra dev --extra demo
+```
+Ensure you are logged in to Ollama to use remote models.
+## Usage
+```bash
+python -m gazet
+# then GET http://localhost:8000/search?q=Border%20between%20Loja%20and%20Piura
+```
+### API + Streamlit demo
+```bash
+uv run uvicorn gazet.api:app --reload   # API on :8000
+uv run streamlit run demo_app.py   # demo UI
+```
+## Data preparation
+1. Download Overture divisions data
+2. Download the 10m physical layer from [Natural Earth](https://www.naturalearthdata.com/downloads/10m-physical-vectors/)
+3. Unzip the data
+4. Convert natural earth data to parquet
+Example for downloading overture
+```bash
+aws s3 sync s3://overturemaps-us-west-2/release/2026-02-18.0/theme=divisions/type=division_area/ data/overture/divisions_area
+```
+Example for running conversion script for natural earth
+```bash
+python -m ingest.convert_natural_earth ~/Downloads/10m_physical
+```
+## Design notes
+- `api.py` exposes GET `/search?q=<query>`; returns GeoJSON FeatureCollection and logs intermediate output.
+- LM is initialised at import time in `lm.py`, suitable for a long-lived server process.
+- Data lives in `data/overture/` and `data/natural_earth_geoparquet/` (not tracked in git).

demo_app.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""Demo Streamlit app for gazet API. Run API first: uv run uvicorn gazet.api:app --reload"""
+import json
+import math
+import pandas as pd
+import requests
+import streamlit as st
+try:
+    import pydeck as pdk
+except ImportError:
+    pdk = None
+def _coords_from_geom(geom):
+    """Yield (lng, lat) from a GeoJSON geometry."""
+    if geom is None:
+        return
+    t = geom.get("type")
+    coords = geom.get("coordinates")
+    if not coords:
+        return
+    if t == "Point":
+        yield coords
+    elif t in ("LineString", "MultiPoint"):
+        for c in coords:
+            yield c
+    elif t == "Polygon":
+        for ring in coords:
+            for c in ring:
+                yield c
+    elif t in ("MultiLineString", "MultiPolygon"):
+        for part in coords:
+            for c in part if t == "MultiLineString" else part[0]:
+                yield c
+    elif t == "GeometryCollection":
+        for g in geom.get("geometries", []):
+            yield from _coords_from_geom(g)
+def bbox_from_geojson(geojson):
+    """Return (min_lng, min_lat, max_lng, max_lat) or None if no coordinates."""
+    lngs, lats = [], []
+    for f in geojson.get("features", []):
+        geom = (
+            f.get("geometry") if isinstance(f, dict) else getattr(f, "geometry", None)
+        )
+        for lng, lat in _coords_from_geom(geom):
+            lngs.append(lng)
+            lats.append(lat)
+    if not lngs:
+        return None
+    return min(lngs), min(lats), max(lngs), max(lats)
+def view_state_for_bbox(bbox, padding_zoom=0.8):
+    """Return pydeck ViewState (lat, lon, zoom) to fit bbox (min_lng, min_lat, max_lng, max_lat)."""
+    min_lng, min_lat, max_lng, max_lat = bbox
+    lat = (min_lat + max_lat) / 2
+    lng = (min_lng + max_lng) / 2
+    lon_span = max(max_lng - min_lng, 1e-6)
+    lat_span = max(max_lat - min_lat, 1e-6)
+    span_deg = max(lon_span, lat_span)
+    zoom = math.log2(360 / span_deg) - padding_zoom
+    zoom = max(0, min(18, zoom))
+    return pdk.ViewState(latitude=lat, longitude=lng, zoom=zoom)
+def _render_map(geojson, placeholder):
+    n = len(geojson.get("features", []))
+    if pdk and n:
+        layer = pdk.Layer(
+            "GeoJsonLayer",
+            data=geojson,
+            get_fill_color=[40, 180, 160, 200],
+            get_line_color=[125, 211, 192, 255],
+            get_line_width=2,
+            pickable=True,
+        )
+        bbox = bbox_from_geojson(geojson)
+        view = (
+            view_state_for_bbox(bbox)
+            if bbox
+            else pdk.ViewState(latitude=0, longitude=0, zoom=1)
+        )
+        with placeholder.container():
+            st.pydeck_chart(
+                pdk.Deck(
+                    layers=[layer],
+                    initial_view_state=view,
+                    map_style=None,
+                    tooltip={"text": "{name}"},
+                ),
+                use_container_width=True,
+                height=500,
+            )
+    elif n:
+        with placeholder.container():
+            st.json(geojson)
+# Base URL of the gazet API this demo sends its requests to.
+API = "http://127.0.0.1:8000"
+# Canned queries rendered as one-click buttons under the search box.
+EXAMPLES = [
+    "Angola and Mozambique",
+    "Mediterranean Sea",
+    "A 0.01 degree buffer around the border between Loja and Piura",
+    "The part of Ecuador that is in the Amazon Basin",
+    "The northern half of India",
+]
+st.set_page_config(page_title="Gazet", page_icon="🌍", layout="wide")
+st.title("Gazet")
+st.caption("Natural-language geo search · click an example or type your own")
+# run_q holds the query to execute during this script run; None means nothing is pending.
+if "run_q" not in st.session_state:
+    st.session_state.run_q = None
+# Left column: query input and example buttons. Right column: progress and results.
+col1, col2 = st.columns([1, 2])
+with col1:
+    inp_col, btn_col = st.columns([5, 1])
+    with inp_col:
+        q = st.text_input(
+            "Query",
+            placeholder="e.g. Southern half of Florida",
+            label_visibility="collapsed",
+        )
+    with btn_col:
+        search_clicked = st.button("Search", type="primary")
+    # Only schedule a run when the button was pressed with a non-empty query.
+    if search_clicked and q:
+        st.session_state.run_q = q
+    for ex in EXAMPLES:
+        # The example text doubles as the widget key.
+        if st.button(ex, key=ex):
+            st.session_state.run_q = ex
+with col2:
+    to_run = st.session_state.run_q
+    if to_run:
+        # Clear the pending query before running so it is not picked up again.
+        st.session_state.run_q = None
+        # One placeholder per output section, created up front in display order.
+        status_ph = st.empty()
+        map_ph = st.empty()
+        places_ph = st.empty()
+        candidates_ph = st.empty()
+        sql_ph = st.empty()
+        status_ph.info("Extracting places…")
+        try:
+            # Consume the streaming endpoint; each non-empty line is one JSON event.
+            with requests.get(
+                f"{API}/search/stream", params={"q": to_run}, stream=True, timeout=120
+            ) as r:
+                r.raise_for_status()
+                for raw in r.iter_lines():
+                    if not raw:
+                        continue
+                    event = json.loads(raw)
+                    t = event["type"]
+                    if t == "places":
+                        places = event["data"].get("places", [])
+                        # The status text announces the stage that follows this event.
+                        status_ph.info("Fuzzy-matching candidates…")
+                        if places:
+                            with places_ph.container():
+                                with st.expander(
+                                    "Extracted place names", expanded=True
+                                ):
+                                    st.dataframe(
+                                        pd.DataFrame(places).rename(
+                                            columns={
+                                                "place": "Place",
+                                                "country": "Country",
+                                                "subtype": "Subtype",
+                                            }
+                                        ),
+                                        use_container_width=True,
+                                        hide_index=True,
+                                    )
+                    elif t == "candidates":
+                        status_ph.info("Generating SQL…")
+                        with candidates_ph.container():
+                            with st.expander("Candidate datasets", expanded=True):
+                                st.dataframe(
+                                    pd.DataFrame(event["data"]),
+                                    use_container_width=True,
+                                    hide_index=True,
+                                )
+                    elif t == "sql_attempt":
+                        iteration = event.get("iteration", "")
+                        status_ph.info(f"Running SQL (attempt {iteration})…")
+                        # Writing into the same placeholder replaces the previously shown SQL.
+                        with sql_ph.container():
+                            with st.expander("SQL", expanded=True):
+                                st.code(event["data"], language="sql")
+                    elif t == "sql_error":
+                        # Show only the first 120 characters of the error text.
+                        status_ph.warning(
+                            f"SQL error on attempt {event.get('iteration', '')}, retrying… "
+                            f"`{event['data'][:120]}`"
+                        )
+                    elif t == "geojson":
+                        geojson = event["data"]
+                        n = len(geojson.get("features", []))
+                        status_ph.success(f"**{to_run}** → {n} feature(s)")
+                        _render_map(geojson, map_ph)
+                    elif t == "error":
+                        status_ph.error(event["data"])
+        except requests.RequestException as e:
+            # Covers connection failures, timeouts and the HTTP errors raised by raise_for_status().
+            status_ph.error(
+                f"API error: {e}. Is the API running? `uv run uvicorn gazet.api:app --reload`"
+            )

ingest/convert_natural_earth.py ADDED Viewed

	@@ -0,0 +1,187 @@

+"""Convert Natural Earth shapefiles to a single GeoParquet with Overture-compatible schema.
+Input: directory of *.shp files (passed as CLI argument)
+Output: path to write the .parquet file (passed as CLI argument, default: data/natural_earth_geoparquet/ne_geography.parquet)
+"""
+import argparse
+import pathlib
+import geopandas as gpd
+import pandas as pd
+# Output path used when -o/--output is not given on the command line.
+DEFAULT_OUTPUT = pathlib.Path("data/natural_earth_geoparquet/ne_geography.parquet")
+# Stems (or substrings) to skip — pure cartographic / utility layers with no
+# geographic search value, or point layers that need a separate schema.
+SKIP_PATTERNS = (
+    "graticules",  # cartographic grid lines
+    "_label_points",  # point label layers
+    "_scale_rank",  # scale-rank rendering duplicates (base layers kept)
+)
+# Shapefile stems skipped only on an exact match.
+SKIP_EXACT = {
+    "ne_10m_land_ocean_seams",
+    "ne_10m_wgs84_bounding_box",
+    "ne_10m_geography_regions_points",
+    "ne_10m_geography_regions_elevation_points",
+}
+# Language suffixes: each entry <lang> is looked up as a "name_<lang>" column
+# and becomes a key of the per-feature names struct.
+LANG_COLS = [
+    "ar",
+    "bn",
+    "de",
+    "en",
+    "es",
+    "fr",
+    "el",
+    "hi",
+    "hu",
+    "id",
+    "it",
+    "ja",
+    "ko",
+    "nl",
+    "pl",
+    "pt",
+    "ru",
+    "sv",
+    "tr",
+    "vi",
+    "zh",
+    "fa",
+    "he",
+    "uk",
+    "ur",
+    "zht",
+]
+def _names_struct(gdf: gpd.GeoDataFrame, name_col: str) -> pd.Series:
+    """Build a names struct column matching Overture's names STRUCT(primary, ...)."""
+    def _row(row: pd.Series) -> dict:
+        entry: dict[str, str | None] = {"primary": row.get(name_col) or None}
+        for lang in LANG_COLS:
+            val = row.get(f"name_{lang}")
+            entry[lang] = (
+                str(val) if val and str(val) not in ("", "nan", "None") else None
+            )
+        return entry
+    return gdf.apply(_row, axis=1)
+def _pick_name_col(gdf: gpd.GeoDataFrame) -> str | None:
+    """Pick best name column: 'name' or first name_* or 'NAME' etc."""
+    cols = [c.lower() for c in gdf.columns]
+    if "name" in cols:
+        return "name"
+    for lang in ["en", "name"] + LANG_COLS:
+        cand = f"name_{lang}"
+        if cand in cols:
+            return cand
+    for c in gdf.columns:
+        if c.lower().startswith("name"):
+            return c
+    return None
+def _load_shapefile(src: pathlib.Path, source_key: str) -> gpd.GeoDataFrame:
+    """Load any Natural Earth shapefile and normalize to Overture-like schema."""
+    gdf = gpd.read_file(src)
+    gdf.columns = [c.lower() for c in gdf.columns]
+    n = len(gdf)
+    # id: ne_id if present else source_index
+    if "ne_id" in gdf.columns:
+        ids = "ne_" + gdf["ne_id"].astype(str)
+    else:
+        ids = pd.Series([f"ne_10m_{source_key}_{i}" for i in range(n)])
+    name_col = _pick_name_col(gdf)
+    if name_col is None:
+        names = pd.Series([{"primary": None, **{lang: None for lang in LANG_COLS}}] * n)
+    else:
+        names = _names_struct(gdf, name_col)
+    # subtype: featurecla or source key
+    if "featurecla" in gdf.columns:
+        subtype = gdf["featurecla"]
+    else:
+        subtype = pd.Series([source_key] * n)
+    return gpd.GeoDataFrame(
+        {
+            "id": ids,
+            "source_layer": pd.array([source_key] * n, dtype=pd.StringDtype()),
+            "names": names,
+            "subtype": subtype,
+            "class": pd.array([None] * n, dtype=pd.StringDtype()),
+            "country": gdf.get("sov_a3", pd.array([None] * n, dtype=pd.StringDtype()))
+            if "sov_a3" in gdf.columns
+            else pd.array([None] * n, dtype=pd.StringDtype()),
+            "region": gdf.get("region", pd.array([None] * n, dtype=pd.StringDtype())),
+            "admin_level": pd.array([None] * n, dtype=pd.Int32Dtype()),
+            "is_land": _infer_is_land(source_key, gdf),
+            "is_territorial": pd.array([None] * n, dtype=pd.BooleanDtype()),
+            "geometry": gdf.geometry,
+        },
+        crs=gdf.crs,
+    )
+def _infer_is_land(source_key: str, gdf: gpd.GeoDataFrame) -> pd.Series:
+    """Infer is_land from source name when possible."""
+    n = len(gdf)
+    ocean_marine = ("ocean", "marine", "bathymetry", "coastline", "seams", "reefs")
+    if any(x in source_key for x in ocean_marine):
+        return pd.Series([False] * n)
+    if "land" in source_key or "lakes" in source_key or "regions" in source_key:
+        return pd.Series([True] * n)
+    return pd.array([None] * n, dtype=pd.BooleanDtype())
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "shp_dir", type=pathlib.Path, help="Directory containing *.shp files"
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=pathlib.Path,
+        default=DEFAULT_OUTPUT,
+        help=f"Output .parquet path (default: {DEFAULT_OUTPUT})",
+    )
+    args = parser.parse_args()
+    all_shp = sorted(args.shp_dir.glob("*.shp"))
+    if not all_shp:
+        raise SystemExit(f"No .shp files in {args.shp_dir}")
+    def _should_skip(stem: str) -> bool:
+        if stem in SKIP_EXACT:
+            return True
+        return any(p in stem for p in SKIP_PATTERNS)
+    shp_files = [p for p in all_shp if not _should_skip(p.stem)]
+    skipped = [p.stem for p in all_shp if _should_skip(p.stem)]
+    if skipped:
+        print(f"Skipping {len(skipped)} utility layers: {', '.join(skipped)}\n")
+    frames = []
+    for path in shp_files:
+        source_key = path.stem  # e.g. ne_10m_geography_marine_polys
+        gdf = _load_shapefile(path, source_key)
+        frames.append(gdf)
+        print(f"  {path.name}: {len(gdf)} features")
+    combined = gpd.GeoDataFrame(
+        pd.concat(frames, ignore_index=True),
+        crs=frames[0].crs,
+    )
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    combined.to_parquet(args.output)
+    print(f"\nSaved {len(combined)} features → {args.output}")

pyproject.toml ADDED Viewed

	@@ -0,0 +1,23 @@

+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "gazet"
+version = "0.1.0"
+description = "Lean natural-language geocoder with GIS operations over Overture and Natural Earth parquet datasets"
+requires-python = ">=3.13.0, <3.14"
+dependencies = [
+    "duckdb>=1.4.4",
+    "fastapi>=0.115",
+    "uvicorn[standard]>=0.32",
+    "dspy>=3.1.3",
+    "pandas>=2.2",
+    "pydantic>=2.0",
+    "pyarrow>=17.0.0",
+    "geopandas>=1.1.2",
+]
+optional-dependencies = { demo = ["streamlit", "requests", "pydeck"], dev = ["ruff"] }
+[tool.hatch.build.targets.wheel]
+packages = ["src/gazet"]

src/gazet/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .api import app
+from .schemas import Place, PlacesResult
+__all__ = ["app", "Place", "PlacesResult"]

src/gazet/__main__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import uvicorn
+from .api import app
+# Serve the API with uvicorn when this module is executed as a script.
+if __name__ == "__main__":
+    # Listens on all interfaces (0.0.0.0), port 8000.
+    uvicorn.run(app, host="0.0.0.0", port=8000)

src/gazet/api.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import json
+from typing import Any, Generator
+import duckdb
+import pandas as pd
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from .export import to_feature_collection
+from .lm import extract
+from .search import search_divisions_area, search_natural_earth
+from .sql import run_geo_sql_loop
+app = FastAPI()
+def _df_to_records(df: pd.DataFrame) -> list[dict[str, Any]]:
+    """Convert DataFrame to list of dicts for JSON; handle non-JSON-serializable types."""
+    return df.replace({float("nan"): None}).to_dict(orient="records")
+def _run_stream(query: str) -> Generator[str, None, None]:
+    """Yield NDJSON lines as each stage of the search completes.
+    Event ``type`` values (in order of emission):
+    - ``places``      – extracted place names
+    - ``candidates``  – merged fuzzy-match table
+    - ``sql_attempt`` – SQL generated in the current loop iteration
+    - ``sql_error``   – execution/generation error in the current iteration
+    - ``geojson``     – final FeatureCollection
+    - ``error``       – fatal error (no result)
+    """
+    pred = extract(query=query)
+    print("extract result:", pred.result)
+    places_result = pred.result
+    yield json.dumps({"type": "places", "data": places_result.model_dump()}) + "\n"
+    con = duckdb.connect()
+    con.execute("INSTALL spatial")
+    con.execute("LOAD spatial")
+    try:
+        all_candidates: list[pd.DataFrame] = []
+        for place in places_result.places:
+            for search_fn in (search_divisions_area, search_natural_earth):
+                df = search_fn(con, place)
+                if not df.empty:
+                    all_candidates.append(df)
+        if not all_candidates:
+            yield json.dumps({"type": "error", "data": "No candidates found"}) + "\n"
+            return
+        candidates_df = (
+            pd.concat(all_candidates, ignore_index=True)
+            .drop_duplicates(subset=["source", "id"])
+            .sort_values(["similarity", "admin_level"], ascending=[False, True])
+            .reset_index(drop=True)
+        )
+        yield (
+            json.dumps({"type": "candidates", "data": _df_to_records(candidates_df)})
+            + "\n"
+        )
+        result_df: pd.DataFrame | None = None
+        for event in run_geo_sql_loop(con, query, candidates_df):
+            if event["type"] == "sql_attempt":
+                yield (
+                    json.dumps(
+                        {
+                            "type": "sql_attempt",
+                            "data": event["sql"],
+                            "iteration": event["iteration"],
+                        }
+                    )
+                    + "\n"
+                )
+            elif event["type"] == "sql_error":
+                yield (
+                    json.dumps(
+                        {
+                            "type": "sql_error",
+                            "data": event["error"],
+                            "iteration": event["iteration"],
+                        }
+                    )
+                    + "\n"
+                )
+            elif event["type"] == "result":
+                result_df = event["df"]
+        if result_df is None or result_df.empty:
+            yield json.dumps({"type": "error", "data": "No result from SQL"}) + "\n"
+            return
+        yield (
+            json.dumps({"type": "geojson", "data": to_feature_collection(result_df)})
+            + "\n"
+        )
+    finally:
+        con.close()
+@app.get("/search/stream")
+def search_stream(q: str) -> StreamingResponse:
+    """Stream search progress as NDJSON (one JSON object per line)."""
+    return StreamingResponse(_run_stream(q), media_type="application/x-ndjson")
+@app.get("/search", response_model=None)
+def search(q: str) -> dict[str, Any]:
+    """Run geo search for natural-language query (non-streaming).
+    Returns GeoJSON FeatureCollection, the executed SQL, and the identified
+    dataframes (candidates) as JSON-serializable records.
+    """
+    places: dict = {}
+    candidates: list = []
+    sql = ""
+    geojson: dict | None = None
+    for line in _run_stream(q):
+        if not line.strip():
+            continue
+        event = json.loads(line)
+        t = event["type"]
+        if t == "places":
+            places = event["data"]
+        elif t == "candidates":
+            candidates = event["data"]
+        elif t == "sql_attempt":
+            sql = event["data"]
+        elif t == "geojson":
+            geojson = event["data"]
+    if geojson is None:
+        raise HTTPException(status_code=404, detail="No result")
+    return {
+        "geojson": geojson,
+        "sql": sql,
+        "places": places,
+        "dataframes": {"candidates": candidates},
+    }

src/gazet/config.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import pathlib
+# Data lives at project root (gazet/data/), not inside the package
+# Three directory levels above this file, then "data".
+_DATA_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "data"
+# Glob over every parquet file of the Overture divisions_area download.
+DIVISIONS_AREA_PATH = str(_DATA_DIR / "overture/divisions_area/*.parquet")
+# Single parquet file holding the converted Natural Earth layers.
+NATURAL_EARTH_PATH = str(_DATA_DIR / "natural_earth_geoparquet/ne_geography.parquet")
+# Model identifier; exactly one MODEL line is active, the others are alternatives
+# kept for switching. NOTE(review): these look like Ollama model tags — confirm.
+# MODEL = "qwen3.5:cloud"
+# MODEL = "granite4:350m"
+# MODEL = "gemma3:12b-cloud"
+# MODEL = "qwen3.5:397b-cloud"
+MODEL = "gpt-oss:20b-cloud"
+# MODEL = "qwen3:4b"
+# MODEL = "qwen3-coder-next:cloud"
+# MODEL = "deepseek-coder:1.3b"
+# NOTE(review): presumably the retry cap of the SQL generation loop — confirm in sql.py.
+MAX_SQL_ITERATIONS = 5
+# Plain-text description of both datasets with the resolved parquet paths
+# interpolated. NOTE(review): presumably handed to the LM as schema context —
+# treat the text as runtime input, not as documentation.
+SCHEMA_INFO = f"""
+Available DuckDB datasets (read via read_parquet):
+1. divisions_area  — Overture polygon/multipolygon admin boundaries
+   path: '{DIVISIONS_AREA_PATH}'
+   columns:
+     id VARCHAR              -- unique feature id (use this to filter precisely)
+     names STRUCT("primary" VARCHAR, ...)
+     country VARCHAR         -- ISO 3166-1 alpha-2
+     subtype VARCHAR         -- country | region | dependency | county | macrohood |
+                               localadmin | locality | neighborhood | microhood
+     class VARCHAR
+     region VARCHAR          -- region code (e.g. 'EC-L' for Loja Ecuador)
+     admin_level INTEGER
+     division_id VARCHAR
+     is_land BOOLEAN
+     is_territorial BOOLEAN
+     geometry GEOMETRY       -- boundary polygon/multipolygon (WKB, spatial ext loaded)
+2. natural_earth  — Natural Earth geography polygons (oceans, seas, terrain regions, islands)
+   path: '{NATURAL_EARTH_PATH}'
+   columns:
+     id VARCHAR              -- unique feature id prefixed 'ne_'
+     names STRUCT("primary" VARCHAR, ...)
+     subtype VARCHAR         -- e.g. 'ocean', 'sea', 'bay', 'Terrain area', 'Island group'
+     class VARCHAR
+     country VARCHAR
+     region VARCHAR
+     admin_level INTEGER
+     is_land BOOLEAN
+     is_territorial BOOLEAN
+     geometry GEOMETRY       -- polygon/multipolygon (WKB, spatial ext loaded)
+Spatial extension is already loaded — use ST_AsGeoJSON(geometry) or ST_AsText(geometry).
+To access names use: names."primary"
+The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
+Use the matching path for each candidate's source when querying.
+Example patterns:
+  -- single region boundary from divisions_area
+  SELECT id, names."primary" AS name, ST_AsGeoJSON(geometry) AS geojson
+  FROM read_parquet('{DIVISIONS_AREA_PATH}')
+  WHERE id = '<candidate_id>'
+  -- feature from natural_earth
+  SELECT id, names."primary" AS name, ST_AsGeoJSON(geometry) AS geojson
+  FROM read_parquet('{NATURAL_EARTH_PATH}')
+  WHERE id = '<candidate_id>'
+  -- shared border between two adjacent regions
+  WITH a AS (SELECT geometry FROM read_parquet('{DIVISIONS_AREA_PATH}') WHERE id = '<id_a>'),
+       b AS (SELECT geometry FROM read_parquet('{DIVISIONS_AREA_PATH}') WHERE id = '<id_b>')
+  SELECT ST_AsGeoJSON(ST_Intersection(a.geometry, b.geometry)) AS border
+  FROM a, b
+"""

src/gazet/export.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import json
+import pathlib
+import re
+import pandas as pd
+def _is_geojson_col(series: pd.Series) -> bool:
+    """Heuristic: a string column whose non-null values start with '{"type":'."""
+    sample = series.dropna().head(5)
+    return (
+        sample.apply(
+            lambda v: isinstance(v, str) and v.lstrip().startswith('{"type":')
+        ).all()
+        and len(sample) > 0
+    )
+def save_geojson(
+    result_df: pd.DataFrame, query: str, output_dir: pathlib.Path | str = "."
+) -> pathlib.Path:
+    """Wrap result_df into a GeoJSON FeatureCollection and save to disk.
+    Any column whose values are GeoJSON geometry strings (output of ST_AsGeoJSON)
+    is used as the feature geometry; remaining columns become properties.
+    If multiple geometry columns exist the first one wins.
+    """
+    output_dir = pathlib.Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    slug = re.sub(r"[^\w]+", "_", query.lower()).strip("_")
+    out_path = output_dir / f"{slug}.geojson"
+    fc = _to_feature_collection(result_df)
+    out_path.write_text(json.dumps(fc, indent=2))
+    print(f"\nSaved {len(fc['features'])} feature(s) → {out_path.resolve()}")
+    return out_path
+def to_feature_collection(result_df: pd.DataFrame) -> dict:
+    """Build a GeoJSON FeatureCollection dict from a result DataFrame.
+    Public wrapper that delegates directly to ``_to_feature_collection``.
+    """
+    return _to_feature_collection(result_df)
+def _to_feature_collection(result_df: pd.DataFrame) -> dict:
+    geom_cols = [c for c in result_df.columns if _is_geojson_col(result_df[c])]
+    prop_cols = [c for c in result_df.columns if c not in geom_cols]
+    features = []
+    for _, row in result_df.iterrows():
+        geometry = None
+        if geom_cols:
+            raw = row[geom_cols[0]]
+            if raw and isinstance(raw, str):
+                try:
+                    geometry = json.loads(raw)
+                except json.JSONDecodeError:
+                    pass
+        properties = {c: row[c] for c in prop_cols if pd.notna(row[c])}
+        for c in geom_cols[1:]:
+            if pd.notna(row[c]):
+                properties[c] = row[c]
+        features.append(
+            {"type": "Feature", "geometry": geometry, "properties": properties}
+        )
+    return {"type": "FeatureCollection", "features": features}

src/gazet/lm.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import dspy
+from .config import MODEL
+from .schemas import PlacesResult
+# Signature for place extraction: one free-text query in, one structured
+# PlacesResult out.
+# NOTE(review): DSPy presumably sends the class docstring to the model as its
+# instructions — treat it as runtime prompt text and edit it with the same
+# care as code.
+class ExtractPlaces(dspy.Signature):
+    """Extract place names from a query.
+    Data is available from overture and natural earth datasets.
+    Overture has divisions and natural earth has physical features.
+    - divisions are administrative units like countries, states, counties, cities, towns, villages, etc.
+    - physical features are natural features like oceans, seas, lakes, rivers, mountains, etc.
+    When extracting a place name, you can use the overture divisions or natural earth physical features.
+    - If the user mentions an overture division, use the overture divisions.
+    - If the user mentions a natural earth physical feature, use the natural earth physical features.
+    - If the user mentions a place name that is not in the overture divisions or natural earth physical features, return the place name as is.
+    Where possible and relevant, also extract the ISO country code for each place.
+    Do not repeat the same place name in the result.
+    If the user does not explicitly mention a country, dont add the country code to the result.
+    If the user does not mention an admin level, dont add the subtype to the result.
+    If the query asks for some kind of subdivision (e.g. 'municipalities in Bern', 'States in Brazil'),
+    return the subdivision type in the places result.
+    When identifying a place name from the user's query, also infer the most appropriate
+    Overture division subtype from the list below. Only include a subtype if the query
+    makes it reasonably clear what geographic level is intended. If ambiguous, omit it.
+    SUBTYPES:
+    - country      : Sovereign nation. E.g. "France", "Brazil"
+    - dependency   : Territory dependent on a country but not a full sub-region. E.g. "Puerto Rico", "Guam"
+    - region       : Largest admin unit within a country; state, province, canton, etc. E.g. "California", "Alberta", "Bavaria"
+    - county       : Second-level admin subdivision within a region. E.g. "Kings County", "Kent"
+    - localadmin   : A governing layer (common in Europe) that contains localities which have no authority of their own. E.g. a French commune or Belgian municipality. Use when the place is clearly an admin unit but not a city itself.
+    - locality     : A populated place — city, town, village. The most common subtype for named settlements. E.g. "Lisbon", "Taipei", "Salt Lake City"
+    - macrohood    : A large super-neighborhood grouping smaller neighborhoods. E.g. "BoCoCa" in Brooklyn
+    - neighborhood : A named community area within a city or town. E.g. "Cobble Hill", "Alfama"
+    - microhood    : A mini-neighborhood within a neighborhood. Very fine-grained, rarely referenced explicitly.
+    HIERARCHY (coarse to fine):
+    country → dependency / region → county → localadmin → locality → macrohood → neighborhood → microhood
+    GUIDANCE:
+    - "Paris" with no qualifier → locality
+    - "Île-de-France" or "Catalonia" → region
+    - "the 11th arrondissement" → neighborhood (or localadmin)
+    - "Greater London" style phrasing → county or region depending on context
+    - If the user says "neighborhood in X" or "district of X" → neighborhood
+    - Default to locality for any named city/town if unsure
+    - Omit subtype entirely if the query gives no signal (e.g. bare coordinates or a POI name)
+    """
+    # Input: the user's raw natural-language query.
+    query: str = dspy.InputField(
+        desc="Natural language query mentioning one or more place names"
+    )
+    # Output: structured extraction result (PlacesResult is imported from .schemas).
+    result: PlacesResult = dspy.OutputField(
+        desc="Extracted places with optional country codes and optional subtype"
+    )
+class WriteGeoSQL(dspy.Signature):
+    """Write a DuckDB SQL SELECT query that extracts the geometry answering a geo query.
+    You are given:
+    - The user's original natural language query
+    - A schema description of the available Overture divisions parquet datasets
+    - A table of fuzzy-matched candidate divisions with their IDs and metadata
+    Write a single, read-only DuckDB SQL SELECT statement that returns the geometry
+    (and key attributes like name, subtype, country) that best answers the query.
+    Use candidate IDs from the candidates table to filter precisely — avoid full scans
+    when you have exact IDs. The spatial extension is already loaded; use
+    ST_AsGeoJSON(geometry) for geometry output.
+    The user might ask for GIS operations such as intersections, buffering, or
+    sections of geometries. You can use the spatial extension to perform these operations.
+    Return ONLY the SQL — no markdown fences, no explanation.
+    """
+    # NOTE(review): DSPy presumably sends the docstring above and the `desc`
+    # strings below to the LM as instructions, so treat any edit to them as a
+    # prompt change rather than a documentation change — confirm in DSPy docs.
+    # Inputs describing the task itself.
+    user_query: str = dspy.InputField(desc="Original natural language geo query")
+    # NOTE(review): the name `schema` may shadow an attribute inherited from the
+    # base class (assuming dspy.Signature is pydantic-based) — TODO confirm
+    # whether this emits a warning; callers pass it by keyword as `schema=`.
+    schema: str = dspy.InputField(
+        desc="Available datasets, column types, and example patterns"
+    )
+    candidates: str = dspy.InputField(
+        desc="Fuzzy-matched candidate divisions (id, name, country, subtype, similarity, ...)"
+    )
+    # Feedback from the previous attempt; both are empty strings on the first try.
+    previous_sql: str = dspy.InputField(
+        desc="SQL from the previous attempt — empty string if this is the first try"
+    )
+    execution_error: str = dspy.InputField(
+        desc="Error raised by the previous SQL — empty string if no error yet"
+    )
+    # The single output: the SQL text to execute.
+    sql: str = dspy.OutputField(
+        desc="Valid read-only DuckDB SQL SELECT statement, no markdown fences"
+    )
+# Shared LM client. The model id is "ollama_chat/" followed by MODEL, the
+# endpoint is a hard-coded localhost URL, the API key is empty and the
+# temperature is 0.
+# NOTE(review): presumably this targets a local Ollama server — confirm before
+# running in an environment where the LM is hosted elsewhere.
+lm = dspy.LM(
+    f"ollama_chat/{MODEL}", api_base="http://localhost:11434", api_key="", temperature=0
+)
+# Register the client with DSPy. This is a module-level statement, so it runs
+# as a side effect of importing this module.
+dspy.configure(lm=lm)
+# Predictors built from the two signatures; `write_sql` is imported by sql.py.
+extract = dspy.Predict(ExtractPlaces)
+write_sql = dspy.Predict(WriteGeoSQL)

src/gazet/schemas.py ADDED Viewed

	@@ -0,0 +1,301 @@

+from typing import Literal, Optional, List
+from pydantic import BaseModel
+SUBTYPES = Literal[
+    "country",
+    "region",
+    "dependency",
+    "county",
+    "macrohood",
+    "localadmin",
+    "locality",
+    "neighborhood",
+    "microhood",
+]
+COUNTRIES = Literal[
+    "AD",
+    "AE",
+    "AF",
+    "AG",
+    "AI",
+    "AL",
+    "AM",
+    "AO",
+    "AQ",
+    "AR",
+    "AS",
+    "AT",
+    "AU",
+    "AW",
+    "AX",
+    "AZ",
+    "BA",
+    "BB",
+    "BD",
+    "BE",
+    "BF",
+    "BG",
+    "BH",
+    "BI",
+    "BJ",
+    "BL",
+    "BM",
+    "BN",
+    "BO",
+    "BQ",
+    "BR",
+    "BS",
+    "BT",
+    "BV",
+    "BW",
+    "BY",
+    "BZ",
+    "CA",
+    "CC",
+    "CD",
+    "CF",
+    "CG",
+    "CH",
+    "CI",
+    "CK",
+    "CL",
+    "CM",
+    "CN",
+    "CO",
+    "CP",
+    "CR",
+    "CU",
+    "CV",
+    "CW",
+    "CX",
+    "CY",
+    "CZ",
+    "DE",
+    "DJ",
+    "DK",
+    "DM",
+    "DO",
+    "DZ",
+    "EC",
+    "EE",
+    "EG",
+    "ER",
+    "ES",
+    "ET",
+    "FI",
+    "FJ",
+    "FK",
+    "FM",
+    "FO",
+    "FR",
+    "GA",
+    "GB",
+    "GD",
+    "GE",
+    "GF",
+    "GG",
+    "GH",
+    "GI",
+    "GL",
+    "GM",
+    "GN",
+    "GP",
+    "GQ",
+    "GR",
+    "GS",
+    "GT",
+    "GU",
+    "GW",
+    "GY",
+    "HK",
+    "HM",
+    "HN",
+    "HR",
+    "HT",
+    "HU",
+    "ID",
+    "IE",
+    "IL",
+    "IM",
+    "IN",
+    "IO",
+    "IQ",
+    "IR",
+    "IS",
+    "IT",
+    "JE",
+    "JM",
+    "JO",
+    "JP",
+    "KE",
+    "KG",
+    "KH",
+    "KI",
+    "KM",
+    "KN",
+    "KP",
+    "KR",
+    "KW",
+    "KY",
+    "KZ",
+    "LA",
+    "LB",
+    "LC",
+    "LI",
+    "LK",
+    "LR",
+    "LS",
+    "LT",
+    "LU",
+    "LV",
+    "LY",
+    "MA",
+    "MC",
+    "MD",
+    "ME",
+    "MF",
+    "MG",
+    "MH",
+    "MK",
+    "ML",
+    "MM",
+    "MN",
+    "MO",
+    "MP",
+    "MQ",
+    "MR",
+    "MS",
+    "MT",
+    "MU",
+    "MV",
+    "MW",
+    "MX",
+    "MY",
+    "MZ",
+    "NA",
+    "NC",
+    "NE",
+    "NF",
+    "NG",
+    "NI",
+    "NL",
+    "NO",
+    "NP",
+    "NR",
+    "NU",
+    "NZ",
+    "OM",
+    "PA",
+    "PE",
+    "PF",
+    "PG",
+    "PH",
+    "PK",
+    "PL",
+    "PM",
+    "PN",
+    "PR",
+    "PT",
+    "PW",
+    "PY",
+    "QA",
+    "RE",
+    "RO",
+    "RS",
+    "RU",
+    "RW",
+    "SA",
+    "SB",
+    "SC",
+    "SD",
+    "SE",
+    "SG",
+    "SH",
+    "SI",
+    "SJ",
+    "SK",
+    "SL",
+    "SM",
+    "SN",
+    "SO",
+    "SR",
+    "SS",
+    "ST",
+    "SV",
+    "SX",
+    "SY",
+    "SZ",
+    "TC",
+    "TD",
+    "TF",
+    "TG",
+    "TH",
+    "TJ",
+    "TK",
+    "TL",
+    "TM",
+    "TN",
+    "TO",
+    "TR",
+    "TT",
+    "TV",
+    "TW",
+    "TZ",
+    "UA",
+    "UG",
+    "UM",
+    "US",
+    "UY",
+    "UZ",
+    "VA",
+    "VC",
+    "VE",
+    "VG",
+    "VI",
+    "VN",
+    "VU",
+    "WF",
+    "WS",
+    "XA",
+    "XB",
+    "XC",
+    "XD",
+    "XE",
+    "XG",
+    "XH",
+    "XI",
+    "XJ",
+    "XK",
+    "XL",
+    "XM",
+    "XN",
+    "XO",
+    "XP",
+    "XQ",
+    "XR",
+    "XS",
+    "XT",
+    "XU",
+    "XW",
+    "XX",
+    "XY",
+    "XZ",
+    "YE",
+    "YT",
+    "ZA",
+    "ZM",
+    "ZW",
+]
+class Place(BaseModel):
+    # One place mention: the name plus two optional qualifiers.
+    # Documented with comments rather than a docstring so the model definition
+    # itself stays unchanged.
+    # Place name text; search.py uses it as the fuzzy-match query string.
+    place: str
+    # Optional code restricted to COUNTRIES; None when not provided.
+    country: Optional[COUNTRIES] = None
+    # Optional value restricted to SUBTYPES; None when not provided.
+    subtype: Optional[SUBTYPES] = None
+class PlacesResult(BaseModel):
+    # Container for every place extracted from a single query.
+    # Individual place mentions, each with its own optional country/subtype.
+    places: List[Place]
+    # Optional result-level subtype, separate from each Place's own subtype.
+    # NOTE(review): presumably the subdivision type a query asks for
+    # (e.g. "States in Brazil") — confirm against the extraction prompt.
+    subtype: Optional[SUBTYPES] = None

src/gazet/search.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import duckdb
+import pandas as pd
+from .config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH
+from .schemas import Place
+def _fuzzy_search(
+    con: duckdb.DuckDBPyConnection,
+    path: str,
+    source: str,
+    place: Place,
+    extra_select: str = "",
+    limit: int = 5,
+    is_overture: bool = False,
+) -> pd.DataFrame:
+    """Generic Levenshtein fuzzy search against any parquet with a names.primary column."""
+    country_filter = ""
+    country_params: list = []
+    if is_overture and place.country:
+        country_filter = "AND country = ?"
+        country_params = [place.country]
+    subtype_filter = ""
+    subtype_params: list = []
+    if is_overture and place.subtype:
+        subtype_filter = "AND subtype = ?"
+        subtype_params = [place.subtype]
+    params = (
+        [place.place, place.place, path] + country_params + subtype_params + [limit]
+    )
+    extra_clause = f", {extra_select}" if extra_select else ""
+    rel = con.execute(
+        f"""
+        SELECT
+            id,
+            names."primary" AS name,
+            country,
+            subtype,
+            class,
+            region,
+            admin_level,
+            is_land,
+            is_territorial{extra_clause},
+            1.0 - (levenshtein(lower(names."primary"), lower(?))::float
+                   / greatest(length(names."primary"), length(?), 1)) AS similarity
+        FROM read_parquet(?)
+        WHERE names."primary" IS NOT NULL AND trim(names."primary") != ''
+        {country_filter}
+        {subtype_filter}
+        ORDER BY similarity DESC, admin_level ASC
+        LIMIT ?
+        """,
+        params,
+    )
+    df = rel.fetchdf()
+    df.insert(0, "source", source)
+    label = f'"{place.place}"' + (f" [{place.country}]" if place.country else "")
+    if df.empty:
+        print(f"\n{source} – {label}: no matches")
+    else:
+        print(f"\n{source} – {label} (top {len(df)} by name similarity):")
+        print(df.to_string(index=False))
+    return df
+def search_divisions_area(
+    con: duckdb.DuckDBPyConnection, place: Place, limit: int = 5
+) -> pd.DataFrame:
+    """Fuzzy-match a place against divisions_area (Overture admin boundaries)."""
+    return _fuzzy_search(
+        con,
+        DIVISIONS_AREA_PATH,
+        "divisions_area",
+        place,
+        extra_select="division_id",
+        limit=limit,
+        is_overture=True,
+    )
+def search_natural_earth(
+    con: duckdb.DuckDBPyConnection, place: Place, limit: int = 5
+) -> pd.DataFrame:
+    """Fuzzy-match a place against Natural Earth geography polygons."""
+    return _fuzzy_search(
+        con,
+        NATURAL_EARTH_PATH,
+        "natural_earth",
+        place,
+        limit=limit,
+    )

src/gazet/sql.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import re
+from typing import Any, Generator, Optional
+import duckdb
+import pandas as pd
+from .config import MAX_SQL_ITERATIONS, SCHEMA_INFO
+from .lm import write_sql
+def _strip_fences(sql: Optional[str]) -> str:
+    """Remove markdown code fences that the LM may wrap the SQL in."""
+    if not sql:
+        return ""
+    sql = re.sub(r"^```\w*\s*\n?", "", sql.strip())
+    sql = re.sub(r"\n?```\s*$", "", sql)
+    return sql.strip()
+def run_geo_sql_loop(
+    con: duckdb.DuckDBPyConnection,
+    user_query: str,
+    candidates_df: pd.DataFrame,
+    max_iterations: int = MAX_SQL_ITERATIONS,
+) -> Generator[dict[str, Any], None, None]:
+    """Code-act loop yielding progress events.
+    Event types:
+    - ``sql_attempt``  – ``{"type": "sql_attempt", "sql": str, "iteration": int}``
+    - ``sql_error``    – ``{"type": "sql_error", "error": str, "iteration": int}``
+    - ``result``       – ``{"type": "result", "df": DataFrame | None, "sql": str}``
+    """
+    if candidates_df.empty:
+        print("\n[SQL-Act] No candidates to work with — skipping.")
+        yield {"type": "result", "df": None, "sql": ""}
+        return
+    candidates_str = candidates_df.to_string(index=False)
+    previous_sql = ""
+    error = ""
+    for iteration in range(1, max_iterations + 1):
+        print(f"\n{'=' * 60}")
+        print(f"[SQL-Act] Iteration {iteration}/{max_iterations}")
+        try:
+            pred = write_sql(
+                user_query=user_query,
+                schema=SCHEMA_INFO,
+                candidates=candidates_str,
+                previous_sql=previous_sql,
+                execution_error=error,
+            )
+            sql = _strip_fences(pred.sql)
+        except Exception as exc:
+            error = f"LM generation failed: {exc}"
+            print(f"Generation error: {error}")
+            yield {"type": "sql_error", "error": error, "iteration": iteration}
+            continue
+        if not sql:
+            error = "LM returned an empty SQL response."
+            print(f"Generation error: {error}")
+            yield {"type": "sql_error", "error": error, "iteration": iteration}
+            continue
+        print(f"\nGenerated SQL:\n{sql}\n")
+        yield {"type": "sql_attempt", "sql": sql, "iteration": iteration}
+        try:
+            result_df = con.execute(sql).fetchdf()
+            if result_df.empty:
+                error = "The query executed successfully but returned no rows. Revise the query to return at least one result."
+                previous_sql = sql
+                print(f"Empty result: {error}")
+                yield {"type": "sql_error", "error": error, "iteration": iteration}
+                continue
+            print(f"Result ({len(result_df)} row(s)):")
+            print(result_df.to_string(index=False, max_colwidth=120))
+            yield {"type": "result", "df": result_df, "sql": sql}
+            return
+        except Exception as exc:
+            error = str(exc)
+            previous_sql = sql
+            print(f"Execution error: {error}")
+            yield {"type": "sql_error", "error": error, "iteration": iteration}
+    print(
+        f"\n[SQL-Act] Exhausted {max_iterations} iterations without a successful query."
+    )
+    yield {"type": "result", "df": None, "sql": ""}

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff