gazet / dataset /scripts /sql_templates.py
srmsoumya's picture
fix: reduce templates to country, region & county for mvp
3ba9557
"""
SQL template definitions for synthetic data generation.
Geometry output convention
--------------------------
Every final SELECT wraps geometry with ST_AsGeoJSON():
ST_AsGeoJSON(geometry) AS geometry
This returns a GeoJSON string instead of raw WKB bytes, which is directly
JSON-serialisable and matches what the serving stack expects.
CTEs that compute intermediate geometries (used only for spatial predicates
or ST_Area) keep the column as raw GEOMETRY so DuckDB spatial functions work.
Buffer distance convention
--------------------------
All buffer templates use {buffer_m} (metres) in SQL: geometry / 111_320.
Question hints use {buffer_label} β€” a human-readable string like "50 km" or
"500 m" β€” generated by the handler from a combined (metres, label) choice list.
This keeps km and metre queries in a single template instead of separate ones.
Mixed-source candidates
-----------------------
generate_samples.py pads every candidate list with 50 % cross-source
distractors so the model always sees both source values and learns the
correct parquet path from the candidates table.
Template families
-----------------
direct_lookup Simple single-feature fetch by ID.
disambiguation "Place, Container" queries like "Puri, Odisha" β€” lookup by
ID after resolving an ambiguous name via containing region
or country mentioned in the query.
adjacency ST_Touches β€” features sharing a border.
multi_adjacency Features that simultaneously touch TWO anchors.
containment ST_Within / ST_Contains β€” hierarchical nesting.
intersection ST_Intersects β€” overlapping or crossing features.
buffer ST_Buffer β€” proximity zones in km or metres.
chained Containment + EXISTS/NOT EXISTS sea predicate.
difference ST_Difference β€” geometry subtraction.
border_corridor Buffered ST_Intersection of a shared border.
set_operations ST_Union_Agg β€” merging multiple geometries.
partial_selection Bbox clipping β€” directional halves or feature clips.
aggregation TOP-N by area with ORDER BY.
window_function ROW_NUMBER() OVER (PARTITION BY) β€” per-group ranking.
attribute_filter Pure attribute predicates: is_land, country, etc.
"""
from dataclasses import dataclass
from typing import List, Literal
_SUBTYPE_ALIASES = {
"country": ["countries", "nations"],
"region": ["states", "provinces", "oblasts", "departments", "prefectures", "cantons"],
"county": ["districts", "municipalities", "boroughs", "communes", "parishes"],
}
def _join_anchor_exprs(num_anchors: int) -> str:
"""Join anchor placeholders into a natural-language list."""
names = [f"{{anchor_{i}_name}}" for i in range(1, num_anchors + 1)]
if num_anchors == 1:
return names[0]
if num_anchors == 2:
return f"{names[0]} or {names[1]}"
return ", ".join(names[:-1]) + f", or {names[-1]}"
def _buffer_question_hints(target_subtype: str, anchor_name_expr: str) -> List[str]:
"""Build varied buffer question hints for one target subtype."""
aliases = _SUBTYPE_ALIASES[target_subtype]
return [
f"which {aliases[0]} are within {{buffer_label}} of {anchor_name_expr}?",
f"{aliases[0]} within {{buffer_label}} of {anchor_name_expr}",
f"find {aliases[0]} within a {{buffer_label}} radius of {anchor_name_expr}",
f"which {aliases[1]} are within {{buffer_label}} of {anchor_name_expr}?",
f"show {aliases[0]} within {{buffer_label}} of {anchor_name_expr}",
f"what {aliases[0]} fall within {{buffer_label}} of {anchor_name_expr}?",
f"areas close to {anchor_name_expr} within {{buffer_label}}",
]
def _make_buffer_templates() -> List["SQLTemplate"]:
"""Generate subtype-specific and multi-anchor buffer templates."""
templates: List[SQLTemplate] = []
for template_id, target_subtype in (
("buffer_04", "country"),
("buffer_05", "region"),
("buffer_06", "county"),
):
templates.append(
SQLTemplate(
template_id=template_id,
family="buffer",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype=target_subtype,
requires_buffer=True,
sql_template=(
"WITH a AS ("
" SELECT ST_Buffer(geometry, {buffer_m} / 111320.0) AS geom"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geom)"
),
question_hints=_buffer_question_hints(target_subtype, "{anchor_name}"),
)
)
for template_id, num_anchors, target_subtype in (
("buffer_07", 2, "region"),
("buffer_08", 3, "country"),
("buffer_09", 4, "region"),
("buffer_10", 5, "county"),
):
anchor_ids = ", ".join(f"'{{anchor_id_{i}}}'" for i in range(1, num_anchors + 1))
name_expr = _join_anchor_exprs(num_anchors)
templates.append(
SQLTemplate(
template_id=template_id,
family="buffer",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=num_anchors,
target_subtype=target_subtype,
requires_buffer=True,
sql_template=(
"WITH buf AS ("
" SELECT ST_Union_Agg(ST_Buffer(geometry, {buffer_m} / 111320.0)) AS geom"
" FROM read_parquet('divisions_area')"
f" WHERE id IN ({anchor_ids})"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, buf"
f" WHERE b.id NOT IN ({anchor_ids})"
" AND b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, buf.geom)"
),
question_hints=_buffer_question_hints(target_subtype, name_expr),
)
)
return templates
def _make_country_in_templates() -> List["SQLTemplate"]:
"""Generate repeated country-IN set-operation templates."""
specs = (
("contain_multi_01", 2, "region"),
("contain_multi_02", 3, "region"),
("contain_multi_03", 2, "county"),
)
templates: List[SQLTemplate] = []
for template_id, num_anchors, target_subtype in specs:
countries = ", ".join(f"'{{country_{i}}}'" for i in range(1, num_anchors + 1))
anchor_names = _join_anchor_exprs(num_anchors).replace(" or ", " and ")
aliases = _SUBTYPE_ALIASES[target_subtype]
question_hints = [
f"{aliases[0]} of {anchor_names}",
f"all {aliases[0]} in {anchor_names}",
f"show {aliases[0]} across {anchor_names}",
f"find {aliases[0]} across {anchor_names}",
f"list {aliases[0]} in {anchor_names}",
f"areas of {anchor_names}",
]
if len(aliases) > 1:
question_hints.extend(
[
f"{aliases[1]} of {anchor_names}",
f"all {aliases[1]} in {anchor_names}",
]
)
templates.append(
SQLTemplate(
template_id=template_id,
family="set_operations",
sql_difficulty="medium-hard",
anchor_source="divisions_area",
num_anchors=num_anchors,
target_subtype=target_subtype,
sql_template=(
"SELECT id, names.\"primary\" AS name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry"
" FROM read_parquet('divisions_area')"
f" WHERE country IN ({countries})"
" AND subtype = '{target_subtype}'"
),
question_hints=question_hints,
)
)
return templates
@dataclass
class SQLTemplate:
"""SQL template for synthetic data generation."""
template_id: str
family: str
sql_difficulty: Literal["easy", "medium", "medium-hard", "hard"]
anchor_source: Literal["divisions_area", "natural_earth", "mixed"]
num_anchors: int
sql_template: str
question_hints: List[str]
target_subtype: str = None
requires_buffer: bool = False
requires_aggregation: bool = False
# ---------------------------------------------------------------------------
# Template catalog
# ---------------------------------------------------------------------------
TEMPLATES = [
# ── DIRECT LOOKUP ────────────────────────────────────────────────────────
SQLTemplate(
template_id="lookup_01",
family="direct_lookup",
sql_difficulty="easy",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"SELECT ST_AsGeoJSON(geometry) AS geometry,"
" names.\"primary\" AS name, id, subtype, country"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
),
question_hints=[
"show me {anchor_name}",
"get the boundary of {anchor_name}",
"find {anchor_name}",
"where is {anchor_name}?",
"outline of {anchor_name}",
"map {anchor_name}",
"what does {anchor_name} look like",
"i need the shape of {anchor_name}",
"pull up {anchor_name}",
"can you show {anchor_name}",
"map of {anchor_name}",
"{anchor_name} boundary",
"locate {anchor_name}",
],
),
SQLTemplate(
template_id="lookup_02",
family="direct_lookup",
sql_difficulty="easy",
anchor_source="natural_earth",
num_anchors=1,
sql_template=(
"SELECT ST_AsGeoJSON(geometry) AS geometry,"
" names.\"primary\" AS name, id, subtype"
" FROM read_parquet('natural_earth')"
" WHERE id = '{anchor_id}'"
),
question_hints=[
"show me the {anchor_name}",
"get the {anchor_name}",
"find the {anchor_name}",
"where is the {anchor_name}?",
"extent of the {anchor_name}",
"geometry of the {anchor_name}",
"display the {anchor_name}",
"pull up the {anchor_name}",
"i want to see the {anchor_name}",
"map the {anchor_name}",
"how big is the {anchor_name}?",
"outline of the {anchor_name}",
"show the shape of the {anchor_name}",
"trace the {anchor_name}",
"map out the {anchor_name}",
"where exactly is the {anchor_name}",
],
),
# ── DISAMBIGUATION ──────────────────────────────────────────────────────
# "Puri, Odisha", "Lisbon, Portugal", "Goa, India" β€” a common real-world
# query pattern where users give a place plus its containing region or
# country to disambiguate same-name localities.
# SQL is a plain lookup by id (disambiguation happens at candidate-pick
# time). Candidates include same-name localities in other regions plus
# the container, so the model must read the CSV to choose correctly.
#
# disambiguate_01: county scoped by its region / country
# disambiguate_02: county scoped by its country
# disambiguate_03: region scoped by its country
SQLTemplate(
template_id="disambiguate_01",
family="disambiguation",
sql_difficulty="easy",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"SELECT ST_AsGeoJSON(geometry) AS geometry,"
" names.\"primary\" AS name, id, subtype, country, region"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
),
question_hints=[
"{anchor_name}, {container_name}",
"{anchor_name} in {container_name}",
"the {anchor_name} that's in {container_name}",
"show me {anchor_name}, {container_name}",
"where is {anchor_name}, {container_name}?",
"map of {anchor_name} ({container_name})",
"{anchor_name} ({container_name})",
"{anchor_name} {container_name}",
"pull up {anchor_name} in {container_name}",
"find {anchor_name} in {container_name}",
"locate {anchor_name} in {container_name}",
"need {anchor_name} from {container_name}",
"show {anchor_name} under {container_name}",
"{anchor_name} near {container_name}",
],
),
SQLTemplate(
template_id="disambiguate_02",
family="disambiguation",
sql_difficulty="easy",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"SELECT ST_AsGeoJSON(geometry) AS geometry,"
" names.\"primary\" AS name, id, subtype, country"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
),
question_hints=[
"{anchor_name}, {container_name}",
"{anchor_name} in {container_name}",
"{anchor_name}, {container_name}.",
"show me {anchor_name}, {container_name}",
"where is {anchor_name} in {container_name}?",
"the {anchor_name} that's in {container_name}",
"map of {anchor_name}, {container_name}",
"pull up {anchor_name} ({container_name})",
"find {anchor_name} in {container_name}",
"{anchor_name} {container_name}",
"locate {anchor_name} in {container_name}",
"need the {anchor_name} in {container_name}",
"show {anchor_name} from {container_name}",
"bring up {anchor_name}, {container_name}",
],
),
SQLTemplate(
template_id="disambiguate_03",
family="disambiguation",
sql_difficulty="easy",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"SELECT ST_AsGeoJSON(geometry) AS geometry,"
" names.\"primary\" AS name, id, subtype, country"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
),
question_hints=[
"{anchor_name}, {container_name}",
"{anchor_name} state of {container_name}",
"the {anchor_name} region in {container_name}",
"show me {anchor_name}, {container_name}",
"where is {anchor_name} in {container_name}?",
"map of {anchor_name}, {container_name}",
"{anchor_name} ({container_name})",
"{anchor_name} province of {container_name}",
"pull up {anchor_name} in {container_name}",
"find {anchor_name} {container_name}",
"locate {anchor_name} within {container_name}",
"show the {anchor_name} part of {container_name}",
"need {anchor_name} from {container_name}",
"bring up {anchor_name} in {container_name}",
"{anchor_name} prefecture of {container_name}",
"{anchor_name} oblast of {container_name}",
"{anchor_name} department of {container_name}",
"{anchor_name} canton in {container_name}",
],
),
# ── ADJACENCY ────────────────────────────────────────────────────────────
SQLTemplate(
template_id="adj_01",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND ST_Touches(a.geometry, b.geometry)"
),
question_hints=[
"what places touch {anchor_name}",
"list everything adjacent to {anchor_name}",
"what shares a border with {anchor_name}",
"neighbours of {anchor_name}",
"what's next to {anchor_name}",
"what surrounds {anchor_name}?",
"places next to {anchor_name}",
"everything bordering {anchor_name}",
"show adjacent places to {anchor_name}",
"areas touching {anchor_name}",
"find the neighbours of {anchor_name}",
"bordering places for {anchor_name}",
"places that meet {anchor_name}",
],
),
SQLTemplate(
template_id="adj_02",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Touches(a.geometry, b.geometry)"
),
question_hints=[
"which {target_subtype}s border {anchor_name}?",
"what {target_subtype}s share a border with {anchor_name}",
"{target_subtype}s that touch {anchor_name}",
"neighbouring {target_subtype}s of {anchor_name}",
"which {target_subtype}s are adjacent to {anchor_name}?",
"{target_subtype}s along the {anchor_name} border",
"find {target_subtype}s next to {anchor_name}",
"show {target_subtype}s bordering {anchor_name}",
"{target_subtype}s beside {anchor_name}",
"all {target_subtype}s touching {anchor_name}",
"{target_subtype}s meeting {anchor_name}",
"which states border {anchor_name}?",
"which provinces share a border with {anchor_name}?",
"states adjacent to {anchor_name}",
"neighbouring provinces of {anchor_name}",
"which prefectures border {anchor_name}?",
"oblasts bordering {anchor_name}",
"departments that touch {anchor_name}",
"cantons adjacent to {anchor_name}",
],
),
SQLTemplate(
template_id="adj_03",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="sea",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(a.geometry, n.geometry)"
),
question_hints=[
"which seas touch {anchor_name}?",
"what seas border {anchor_name}?",
"which bodies of water is {anchor_name} next to?",
"what ocean or sea borders {anchor_name}",
"which oceans touch {anchor_name}?",
"what coastline does {anchor_name} have?",
"which water bodies does {anchor_name} border?",
"does {anchor_name} have sea access?",
"what ocean is {anchor_name} on?",
"is {anchor_name} on the coast?",
"what sea is off the coast of {anchor_name}?",
"which ocean lies off {anchor_name}?",
"what water is {anchor_name} on the shore of?",
"which sea or ocean is {anchor_name} along?",
],
),
SQLTemplate(
template_id="adj_04",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Touches(a.geometry, b.geometry)"
),
question_hints=[
"neighbouring counties of {anchor_name}",
"neighbouring districts of {anchor_name}",
"which counties border {anchor_name}?",
"which districts border {anchor_name}?",
"counties adjacent to {anchor_name}",
"districts next to {anchor_name}",
"counties sharing a border with {anchor_name}",
"what counties touch {anchor_name}?",
"nearby counties of {anchor_name}",
"counties along the {anchor_name} boundary",
"municipalities bordering {anchor_name}",
"which boroughs are adjacent to {anchor_name}?",
"communes that touch {anchor_name}",
"parishes next to {anchor_name}",
"neighbouring municipalities of {anchor_name}",
],
),
SQLTemplate(
template_id="adj_05",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Touches(a.geometry, b.geometry)"
),
question_hints=[
"which countries border {anchor_name}?",
"what countries share a border with {anchor_name}?",
"countries adjacent to {anchor_name}",
"neighbouring countries of {anchor_name}",
"which nations border {anchor_name}?",
"what nations are next to {anchor_name}?",
"nations sharing a border with {anchor_name}",
"countries that touch {anchor_name}",
"find countries bordering {anchor_name}",
"show countries adjacent to {anchor_name}",
"all countries touching {anchor_name}",
"which nations are {anchor_name}'s neighbours?",
],
),
SQLTemplate(
template_id="adj_06",
family="adjacency",
sql_difficulty="medium",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which countries border the {anchor_name}?",
"what countries are next to the {anchor_name}?",
"countries along the {anchor_name}",
"which nations touch the {anchor_name}?",
"countries adjacent to the {anchor_name}",
"which countries lie on the {anchor_name}?",
],
),
SQLTemplate(
template_id="adj_07",
family="adjacency",
sql_difficulty="medium",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which regions border the {anchor_name}?",
"what regions are next to the {anchor_name}?",
"regions along the {anchor_name}",
"which states touch the {anchor_name}?",
"provinces adjacent to the {anchor_name}",
"which regions lie on the {anchor_name}?",
],
),
SQLTemplate(
template_id="adj_08",
family="adjacency",
sql_difficulty="medium",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which counties border the {anchor_name}?",
"what districts are next to the {anchor_name}?",
"counties along the {anchor_name}",
"which districts touch the {anchor_name}?",
"municipalities adjacent to the {anchor_name}",
"which counties lie on the {anchor_name}?",
],
),
# ── NATURAL EARTH ADJACENCY ─────────────────────────────────────────────
# Division anchor, natural_earth targets. Handler formats anchor_id and
# target_subtype but the SQL hardcodes NE subtypes (like adj_03).
SQLTemplate(
template_id="adj_09",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="river",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE n.subtype IN ('river', 'lake', 'basin')"
" AND ST_Intersects(a.geometry, n.geometry)"
),
question_hints=[
"what rivers or lakes are in {anchor_name}?",
"natural water features of {anchor_name}",
"which rivers flow through {anchor_name}?",
"lakes and rivers within {anchor_name}",
"water features inside {anchor_name}",
"what bodies of water cross {anchor_name}?",
"rivers of {anchor_name}",
"show me the lakes in {anchor_name}",
"what rivers run through {anchor_name}?",
"which lakes lie in {anchor_name}?",
"what waterways are in {anchor_name}?",
"which basins, lakes, or rivers are in {anchor_name}?",
],
),
SQLTemplate(
template_id="adj_10",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="range/mtn",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE n.subtype = 'range/mtn'"
" AND ST_Intersects(a.geometry, n.geometry)"
),
question_hints=[
"what mountain ranges are in {anchor_name}?",
"which mountain ranges cross {anchor_name}?",
"mountains of {anchor_name}",
"mountain regions in {anchor_name}",
"what hills are in {anchor_name}?",
"which hills cross {anchor_name}?",
"ghats of {anchor_name}",
"what ghats are in {anchor_name}?",
"highlands in {anchor_name}",
"mountain belts within {anchor_name}",
],
),
SQLTemplate(
template_id="adj_11",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="plateau",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE n.subtype = 'plateau'"
" AND ST_Intersects(a.geometry, n.geometry)"
),
question_hints=[
"what plateaus are in {anchor_name}?",
"which plateaus cross {anchor_name}?",
"uplands in {anchor_name}",
"what uplands are in {anchor_name}?",
"tablelands of {anchor_name}",
"plateau regions within {anchor_name}",
"show plateaus in {anchor_name}",
"find uplands of {anchor_name}",
"give me plateau areas in {anchor_name}",
"{anchor_name} plateaus and uplands",
"what tablelands are there in {anchor_name}?",
],
),
SQLTemplate(
template_id="adj_12",
family="adjacency",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="landform",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE n.subtype IN ('plain', 'lowland', 'basin', 'valley', 'depression', 'gorge')"
" AND ST_Intersects(a.geometry, n.geometry)"
),
question_hints=[
"what plains are in {anchor_name}?",
"basins and valleys of {anchor_name}",
"which basins are in {anchor_name}?",
"what valleys cross {anchor_name}?",
"lowlands in {anchor_name}",
"major landforms in {anchor_name}",
"plains, basins, and valleys within {anchor_name}",
"show me the main landforms in {anchor_name}",
"landforms of {anchor_name}",
"find plains and basins in {anchor_name}",
"{anchor_name} valleys and lowlands",
],
),
# ── MULTI-ADJACENCY ──────────────────────────────────────────────────────
SQLTemplate(
template_id="multi_adj_01",
family="multi_adjacency",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=2,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_1}'"
"),"
" b AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_2}'"
")"
" SELECT c.id, c.names.\"primary\" AS name, c.subtype, c.country,"
" ST_AsGeoJSON(c.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS c, a, b"
" WHERE c.id NOT IN ('{anchor_id_1}', '{anchor_id_2}')"
" AND c.subtype = '{target_subtype}'"
" AND ST_Touches(c.geometry, a.geometry)"
" AND ST_Touches(c.geometry, b.geometry)"
),
question_hints=[
"which {target_subtype}s border both {anchor_1_name} and {anchor_2_name}?",
"what {target_subtype}s touch both {anchor_1_name} and {anchor_2_name}?",
"{target_subtype}s adjacent to both {anchor_1_name} and {anchor_2_name}",
"common {target_subtype} neighbours of {anchor_1_name} and {anchor_2_name}",
"show {target_subtype}s touching both {anchor_1_name} and {anchor_2_name}",
"{target_subtype}s bordering both {anchor_1_name} and {anchor_2_name}",
"shared {target_subtype} neighbours of {anchor_1_name} and {anchor_2_name}",
"find {target_subtype}s adjacent to both {anchor_1_name} and {anchor_2_name}",
"which states border both {anchor_1_name} and {anchor_2_name}?",
"which provinces are adjacent to both {anchor_1_name} and {anchor_2_name}?",
"states touching both {anchor_1_name} and {anchor_2_name}",
],
),
SQLTemplate(
template_id="multi_adj_02",
family="multi_adjacency",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=2,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_1}'"
"),"
" b AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_2}'"
")"
" SELECT c.id, c.names.\"primary\" AS name, c.subtype, c.country,"
" ST_AsGeoJSON(c.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS c, a, b"
" WHERE c.id NOT IN ('{anchor_id_1}', '{anchor_id_2}')"
" AND c.subtype = '{target_subtype}'"
" AND ST_Touches(c.geometry, a.geometry)"
" AND ST_Touches(c.geometry, b.geometry)"
),
question_hints=[
"which countries border both {anchor_1_name} and {anchor_2_name}?",
"what nations share a border with both {anchor_1_name} and {anchor_2_name}?",
"countries adjacent to both {anchor_1_name} and {anchor_2_name}",
"which country touches both {anchor_1_name} and {anchor_2_name}?",
"nations bordering both {anchor_1_name} and {anchor_2_name}",
"show countries neighbouring both {anchor_1_name} and {anchor_2_name}",
"find countries that border both {anchor_1_name} and {anchor_2_name}",
"common country neighbours of {anchor_1_name} and {anchor_2_name}",
"which nation lies between {anchor_1_name} and {anchor_2_name}?",
"countries sharing a border with both {anchor_1_name} and {anchor_2_name}",
],
),
SQLTemplate(
template_id="multi_adj_03",
family="multi_adjacency",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=2,
target_subtype="county",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_1}'"
"),"
" b AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_2}'"
")"
" SELECT c.id, c.names.\"primary\" AS name, c.subtype, c.country,"
" ST_AsGeoJSON(c.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS c, a, b"
" WHERE c.id NOT IN ('{anchor_id_1}', '{anchor_id_2}')"
" AND c.subtype = '{target_subtype}'"
" AND ST_Touches(c.geometry, a.geometry)"
" AND ST_Touches(c.geometry, b.geometry)"
),
question_hints=[
"which {target_subtype}s border both {anchor_1_name} and {anchor_2_name}?",
"what {target_subtype}s touch both {anchor_1_name} and {anchor_2_name}?",
"{target_subtype}s adjacent to both {anchor_1_name} and {anchor_2_name}",
"common {target_subtype} neighbours of {anchor_1_name} and {anchor_2_name}",
"show {target_subtype}s touching both {anchor_1_name} and {anchor_2_name}",
"which districts border both {anchor_1_name} and {anchor_2_name}?",
"districts lying between {anchor_1_name} and {anchor_2_name}",
"find districts adjacent to both {anchor_1_name} and {anchor_2_name}",
"municipalities bordering both {anchor_1_name} and {anchor_2_name}",
"which boroughs touch both {anchor_1_name} and {anchor_2_name}?",
],
),
# ── CONTAINMENT ──────────────────────────────────────────────────────────
SQLTemplate(
template_id="contain_01",
family="containment",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, a.geometry)"
),
question_hints=[
"what {target_subtype}s are in {anchor_name}?",
"which {target_subtype}s fall within {anchor_name}?",
"list all {target_subtype}s inside {anchor_name}",
"{target_subtype}s contained by {anchor_name}",
"all {target_subtype}s within {anchor_name}",
"{target_subtype}s of {anchor_name}",
"show every {target_subtype} in {anchor_name}",
"show {target_subtype}s inside {anchor_name}",
"find {target_subtype}s in {anchor_name}",
"give me the {target_subtype}s in {anchor_name}",
"{anchor_name} {target_subtype}s",
"which {target_subtype}s does {anchor_name} contain?",
"what all {target_subtype}s are there in {anchor_name}?",
"{target_subtype}s under {anchor_name}",
"what districts are in {anchor_name}?",
"which districts fall within {anchor_name}?",
"districts of {anchor_name}",
"show all districts inside {anchor_name}",
"list municipalities in {anchor_name}",
"boroughs of {anchor_name}",
"communes inside {anchor_name}",
"parishes in {anchor_name}",
],
),
SQLTemplate(
template_id="contain_02",
family="containment",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Contains(b.geometry, a.geometry)"
),
question_hints=[
"what country contains {anchor_name}?",
"which country is {anchor_name} in?",
"what country does {anchor_name} belong to?",
"which nation contains {anchor_name}?",
"{anchor_name} is part of which country?",
"where is {anchor_name}",
"what country is {anchor_name} in",
"{anchor_name} belongs to which country?",
"show country for {anchor_name}",
"find the country of {anchor_name}",
"which country does {anchor_name} fall in?",
"{anchor_name} under which country",
"tell me the country for {anchor_name}",
"which nation does {anchor_name} belong to?",
"what nation is {anchor_name} in?",
],
),
SQLTemplate(
template_id="contain_03",
family="containment",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, a.geometry)"
),
question_hints=[
"what {target_subtype}s are in {anchor_name}?",
"list all {target_subtype}s in {anchor_name}",
"which {target_subtype}s does {anchor_name} have?",
"show {target_subtype}s of {anchor_name}",
"all {target_subtype}s within {anchor_name}",
"{anchor_name} {target_subtype}s",
"give me the {target_subtype}s of {anchor_name}",
"what states are in {anchor_name}?",
"list all states of {anchor_name}",
"which provinces does {anchor_name} have?",
"show the provinces of {anchor_name}",
"all states within {anchor_name}",
"provinces of {anchor_name}",
"what oblasts are in {anchor_name}?",
"departments of {anchor_name}",
"prefectures in {anchor_name}",
"cantons of {anchor_name}",
],
),
SQLTemplate(
template_id="contain_04",
family="containment",
sql_difficulty="medium",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, a.geometry)"
),
question_hints=[
"which {target_subtype}s are in the {anchor_name}?",
"what {target_subtype}s fall within the {anchor_name}?",
"{target_subtype}s inside the {anchor_name}",
"admin {target_subtype}s within the {anchor_name}",
"all regions inside the {anchor_name}",
"what {target_subtype}s does the {anchor_name} contain?",
"{target_subtype}s covered by the {anchor_name}",
"which regions are in the {anchor_name} basin?",
"what admin regions lie within the {anchor_name}?",
"which provinces are inside the {anchor_name}?",
"show the regions in the {anchor_name}",
"find provinces inside the {anchor_name}",
"give me admin regions within the {anchor_name}",
"regions belonging to the {anchor_name}",
"areas contained in the {anchor_name}",
"which states are in the {anchor_name}?",
"states inside the {anchor_name}",
"which prefectures lie within the {anchor_name}?",
"oblasts in the {anchor_name}",
"departments inside the {anchor_name}",
],
),
# ── NATURAL EARTH CONTAINMENT ───────────────────────────────────────────
# contain_05: NE anchor (sea/gulf/bay), find countries that touch it.
# Uses containment handler via containment_pairs.
SQLTemplate(
template_id="contain_05",
family="containment",
sql_difficulty="medium",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which countries border the {anchor_name}?",
"what countries are along the {anchor_name}?",
"countries surrounding the {anchor_name}",
"nations on the {anchor_name}",
"which countries touch the {anchor_name}?",
"countries with coastline on the {anchor_name}",
"what nations lie on the {anchor_name}?",
"which countries are on the coast of the {anchor_name}?",
"what countries lie around the {anchor_name}?",
"which nations have shores on the {anchor_name}?",
"what countries front the {anchor_name}?",
],
),
# ── INTERSECTION ─────────────────────────────────────────────────────────
SQLTemplate(
template_id="intersect_01",
family="intersection",
sql_difficulty="medium-hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which {target_subtype}s intersect {anchor_name}?",
"what {target_subtype}s overlap with {anchor_name}?",
"{target_subtype}s that cross into {anchor_name}",
"which {target_subtype}s overlap {anchor_name}?",
"{target_subtype}s partially inside {anchor_name}",
"what {target_subtype}s extend into {anchor_name}?",
"show {target_subtype}s overlapping {anchor_name}",
"find {target_subtype}s crossing {anchor_name}",
"{target_subtype}s meeting {anchor_name}",
"areas intersecting {anchor_name}",
"{anchor_name} overlapping {target_subtype}s",
"which {target_subtype}s are partly in {anchor_name}?",
"which states intersect {anchor_name}?",
"which provinces overlap {anchor_name}?",
"states crossing into {anchor_name}",
"provinces partly inside {anchor_name}",
],
),
SQLTemplate(
template_id="intersect_02",
family="intersection",
sql_difficulty="medium-hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which {target_subtype}s intersect {anchor_name}?",
"what {target_subtype}s overlap with {anchor_name}?",
"{target_subtype}s crossing into {anchor_name}",
"which {target_subtype}s are partly in {anchor_name}?",
"show {target_subtype}s overlapping {anchor_name}",
"find {target_subtype}s crossing {anchor_name}",
"{target_subtype}s that overlap {anchor_name}",
"which districts intersect {anchor_name}?",
"what districts overlap with {anchor_name}?",
"districts crossing into {anchor_name}",
"which municipalities are partly in {anchor_name}?",
"boroughs overlapping {anchor_name}",
"communes crossing {anchor_name}",
],
),
SQLTemplate(
template_id="intersect_03",
family="intersection",
sql_difficulty="medium-hard",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which countries does the {anchor_name} pass through?",
"what countries does the {anchor_name} cross?",
"countries that overlap the {anchor_name}",
"which countries touch the {anchor_name}?",
"nations intersected by the {anchor_name}",
"which nations does the {anchor_name} cross?",
"countries along the {anchor_name}",
"what countries does the {anchor_name} cover?",
"countries the {anchor_name} spans across",
"what countries is the {anchor_name} in?",
"which countries lie along the {anchor_name}?",
"what countries does the {anchor_name} run through?",
"which countries border the {anchor_name}?",
],
),
# ── NATURAL EARTH INTERSECTION ──────────────────────────────────────────
# intersect_04: NE anchor, finding overlapping regions (vs countries in
# intersect_03). Uses cross_source_relations handler.
# intersect_05: division anchor, finding NE features that overlap it.
# Uses intersection_pairs handler (extra NE subtypes ignored in SQL).
SQLTemplate(
template_id="intersect_04",
family="intersection",
sql_difficulty="medium-hard",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which regions does the {anchor_name} pass through?",
"what admin regions overlap with the {anchor_name}?",
"regions that the {anchor_name} crosses",
"admin areas intersected by the {anchor_name}",
"what provinces does the {anchor_name} span?",
"regions along the {anchor_name}",
"which provinces overlap the {anchor_name}?",
"which regions is the {anchor_name} in?",
"what states does the {anchor_name} run through?",
"which provinces lie along the {anchor_name}?",
"show regions crossed by the {anchor_name}",
"find administrative regions along the {anchor_name}",
"give me the regions touched by the {anchor_name}",
"regions of the {anchor_name}",
"which states does the {anchor_name} pass through?",
"what oblasts does the {anchor_name} cross?",
"which departments does the {anchor_name} run through?",
"prefectures along the {anchor_name}",
],
),
SQLTemplate(
template_id="intersect_05",
family="intersection",
sql_difficulty="medium-hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE ST_Intersects(n.geometry, a.geometry)"
),
question_hints=[
"what natural features intersect {anchor_name}?",
"natural features that overlap {anchor_name}",
"which geographic features cross {anchor_name}?",
"everything natural that touches {anchor_name}",
"what geographic features does {anchor_name} contain?",
"natural features within or crossing {anchor_name}",
"show natural features overlapping {anchor_name}",
"find the natural features in or across {anchor_name}",
"{anchor_name} intersecting natural features",
"what physical features are associated with {anchor_name}?",
],
),
SQLTemplate(
template_id="intersect_06",
family="intersection",
sql_difficulty="medium-hard",
anchor_source="natural_earth",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, a.geometry)"
),
question_hints=[
"which districts does the {anchor_name} pass through?",
"what districts does the {anchor_name} cross?",
"districts intersected by the {anchor_name}",
"which counties does the {anchor_name} flow through?",
"what counties overlap the {anchor_name}?",
"districts along the {anchor_name}",
"which districts are crossed by the {anchor_name}?",
"what districts is the {anchor_name} in?",
"which counties lie along the {anchor_name}?",
"what districts does the {anchor_name} run through?",
"show districts crossed by the {anchor_name}",
"find counties along the {anchor_name}",
"give me the districts touched by the {anchor_name}",
"districts of the {anchor_name}",
"which municipalities does the {anchor_name} pass through?",
"boroughs along the {anchor_name}",
"communes crossed by the {anchor_name}",
"parishes the {anchor_name} runs through",
],
),
# ── BUFFER ───────────────────────────────────────────────────────────────
# CTE computes the buffered geometry (raw) for the spatial join.
# Final SELECT wraps the result features with ST_AsGeoJSON.
SQLTemplate(
template_id="buffer_01",
family="buffer",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
requires_buffer=True,
sql_template=(
"WITH a AS ("
" SELECT ST_Buffer(geometry, {buffer_m} / 111320.0) AS geom"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.id != '{anchor_id}'"
" AND ST_Intersects(b.geometry, a.geom)"
),
question_hints=[
"what's within {buffer_label} of {anchor_name}?",
"admin units within {buffer_label} of {anchor_name}",
"features within a {buffer_label} radius of {anchor_name}",
"places within {buffer_label} of {anchor_name}",
"{buffer_label} buffer around {anchor_name}",
"what falls within {buffer_label} of {anchor_name}?",
"everything within {buffer_label} of {anchor_name}",
],
),
SQLTemplate(
template_id="buffer_02",
family="buffer",
sql_difficulty="hard",
anchor_source="natural_earth",
num_anchors=1,
requires_buffer=True,
sql_template=(
"WITH a AS ("
" SELECT ST_Buffer(geometry, {buffer_m} / 111320.0) AS geom"
" FROM read_parquet('natural_earth')"
" WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE ST_Intersects(b.geometry, a.geom)"
),
question_hints=[
"what admin units are within {buffer_label} of the {anchor_name}?",
"countries within {buffer_label} of the {anchor_name}",
"regions within {buffer_label} of the {anchor_name}",
"what falls within {buffer_label} of the {anchor_name}?",
"admin divisions within a {buffer_label} radius of the {anchor_name}",
"places within {buffer_label} of the {anchor_name}",
"what places are near the {anchor_name}?",
"what admin areas are close to the {anchor_name}?",
"which regions are around the {anchor_name}?",
"what lies within {buffer_label} of the shoreline of the {anchor_name}?",
"show places around the {anchor_name}",
"find areas near the {anchor_name}",
"admin units close to the {anchor_name}",
"what is around the {anchor_name} within {buffer_label}",
"give me nearby admin regions for the {anchor_name}",
],
),
SQLTemplate(
template_id="buffer_03",
family="buffer",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
requires_buffer=True,
sql_template=(
"SELECT ST_AsGeoJSON(ST_Buffer(geometry, {buffer_m} / 111320.0)) AS geometry"
" FROM read_parquet('divisions_area')"
" WHERE id = '{anchor_id}'"
),
question_hints=[
"{buffer_label} buffer around {anchor_name}",
"draw a {buffer_label} buffer around {anchor_name}",
"show the {buffer_label} buffer around {anchor_name}",
"create a {buffer_label} radius around {anchor_name}",
"map a {buffer_label} zone around {anchor_name}",
"outline the {buffer_label} buffer around {anchor_name}",
"buffer {anchor_name} by {buffer_label}",
"show radius {buffer_label} from {anchor_name}",
"{anchor_name} with a {buffer_label} buffer",
],
),
# Subtype-specific and multi-anchor buffer templates are generated by helper
# functions to keep the catalog compact while preserving template IDs.
*_make_buffer_templates(),
# ── NATURAL EARTH BUFFER ────────────────────────────────────────────────
# buffer_11: NE anchor, find other NE features within a buffer distance.
# Uses buffer handler for natural_earth.
SQLTemplate(
template_id="buffer_11",
family="buffer",
sql_difficulty="hard",
anchor_source="natural_earth",
num_anchors=1,
requires_buffer=True,
sql_template=(
"WITH a AS ("
" SELECT ST_Buffer(geometry, {buffer_m} / 111320.0) AS geom"
" FROM read_parquet('natural_earth')"
" WHERE id = '{anchor_id}'"
")"
" SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
" ST_AsGeoJSON(n.geometry) AS geometry"
" FROM read_parquet('natural_earth') AS n, a"
" WHERE ST_Intersects(n.geometry, a.geom)"
),
question_hints=[
"natural features within {buffer_label} of the {anchor_name}",
"what's within {buffer_label} of the {anchor_name}?",
"geographic features near the {anchor_name} within {buffer_label}",
"everything within {buffer_label} of the {anchor_name}",
"what natural features are close to the {anchor_name}?",
"{buffer_label} radius around the {anchor_name}",
"what natural features are around the {anchor_name}?",
"what lies near the {anchor_name}?",
"which features are close to the {anchor_name}?",
"what natural features are near the shoreline of the {anchor_name}?",
"show nearby natural features for the {anchor_name}",
"find features around the {anchor_name}",
"give me natural features near the {anchor_name}",
"features around the {anchor_name}",
"what is close to the {anchor_name} within {buffer_label}",
],
),
# ── CHAINED ──────────────────────────────────────────────────────────────
# Containment + EXISTS/NOT EXISTS ocean/sea.
# CTE holds raw geometry for ST_Within; final SELECT wraps with ST_AsGeoJSON.
SQLTemplate(
template_id="chained_01",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"coastal {target_subtype}s of {anchor_name}",
"{target_subtype}s in {anchor_name} with sea access",
"which {target_subtype}s in {anchor_name} are on the coast?",
"seaside {target_subtype}s within {anchor_name}",
"{target_subtype}s in {anchor_name} bordering the sea",
"oceanfront {target_subtype}s in {anchor_name}",
"which {target_subtype}s in {anchor_name} have a coastline?",
"show coastal {target_subtype}s in {anchor_name}",
"find {target_subtype}s of {anchor_name} on the shore",
"{target_subtype}s of {anchor_name} by the sea",
"which {target_subtype}s of {anchor_name} touch the ocean?",
"coastline {target_subtype}s in {anchor_name}",
"coastal districts of {anchor_name}",
"which districts of {anchor_name} are on the coast?",
"seaside municipalities of {anchor_name}",
"coastal boroughs in {anchor_name}",
],
),
SQLTemplate(
template_id="chained_02",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Intersects(b.geometry, region.geometry)"
" AND NOT EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"landlocked {target_subtype}s in {anchor_name}",
"which {target_subtype}s in {anchor_name} have no sea access?",
"{target_subtype}s in {anchor_name} that are landlocked",
"{target_subtype}s in {anchor_name} with no coastline",
"which {target_subtype}s within {anchor_name} are landlocked?",
"interior {target_subtype}s of {anchor_name} with no ocean border",
"show inland {target_subtype}s in {anchor_name}",
"find non-coastal {target_subtype}s of {anchor_name}",
"{target_subtype}s of {anchor_name} away from the sea",
"which {target_subtype}s in {anchor_name} are not coastal?",
"inner {target_subtype}s of {anchor_name}",
"inland districts of {anchor_name}",
"which districts in {anchor_name} are not on the coast?",
"non-coastal municipalities of {anchor_name}",
"interior boroughs of {anchor_name}",
],
),
SQLTemplate(
template_id="chained_03",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH country AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, country"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, country.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('range/mtn', 'island group', 'peninsula', 'depression')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"{target_subtype}s in {anchor_name} on a terrain feature or island",
"{target_subtype}s of {anchor_name} on a peninsula or island group",
"{target_subtype}s within {anchor_name} on notable landforms",
"island and peninsula {target_subtype}s of {anchor_name}",
"show {target_subtype}s in {anchor_name} on major landforms",
"find {target_subtype}s of {anchor_name} on islands or peninsulas",
"{target_subtype}s in {anchor_name} on terrain regions",
"states of {anchor_name} on a peninsula or island",
"island states of {anchor_name}",
"provinces of {anchor_name} on major landforms",
"which regions of {anchor_name} are on a peninsula?",
],
),
# ── DIFFERENCE ───────────────────────────────────────────────────────────
# CTEs hold raw geometry; ST_Difference result wrapped with ST_AsGeoJSON.
SQLTemplate(
template_id="diff_01",
family="difference",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=2,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_1}'"
"),"
" b AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_2}'"
")"
" SELECT ST_AsGeoJSON(ST_Difference(a.geometry, b.geometry)) AS geometry"
" FROM a, b"
" WHERE ST_Intersects(a.geometry, b.geometry)"
),
question_hints=[
"{anchor_1_name} excluding {anchor_2_name}",
"{anchor_1_name} minus {anchor_2_name}",
"the part of {anchor_1_name} that is not in {anchor_2_name}",
"{anchor_1_name} without the {anchor_2_name} area",
"remove {anchor_2_name} from {anchor_1_name}",
"{anchor_1_name} with {anchor_2_name} cut out",
"subtract {anchor_2_name} from {anchor_1_name}",
"what's left of {anchor_1_name} after removing {anchor_2_name}?",
"difference between {anchor_1_name} and {anchor_2_name}",
"keep only {anchor_1_name} outside {anchor_2_name}",
"cut {anchor_2_name} out of {anchor_1_name}",
"show {anchor_1_name} without {anchor_2_name}",
],
),
SQLTemplate(
template_id="diff_02",
family="difference",
sql_difficulty="hard",
anchor_source="mixed",
num_anchors=2,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
"),"
" b AS ("
" SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{clip_feature_id}'"
")"
" SELECT ST_AsGeoJSON(ST_Difference(a.geometry, b.geometry)) AS geometry"
" FROM a, b"
" WHERE ST_Intersects(a.geometry, b.geometry)"
),
question_hints=[
"the part of {anchor_name} outside the {clip_feature_name}",
"{anchor_name} excluding the {clip_feature_name}",
"{anchor_name} minus the {clip_feature_name}",
"parts of {anchor_name} not covered by the {clip_feature_name}",
"{anchor_name} with the {clip_feature_name} removed",
"what's left of {anchor_name} after removing the {clip_feature_name}?",
"show me {anchor_name} excluding the {clip_feature_name}",
"keep only the part of {anchor_name} outside the {clip_feature_name}",
"cut the {clip_feature_name} out of {anchor_name}",
"difference of {anchor_name} and the {clip_feature_name}",
"{anchor_name} after subtracting the {clip_feature_name}",
],
),
# ── BORDER CORRIDOR ──────────────────────────────────────────────────────
# Intermediate intersection kept raw; final buffer wrapped with ST_AsGeoJSON.
SQLTemplate(
template_id="corridor_01",
family="border_corridor",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=2,
requires_buffer=True,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_1}'"
"),"
" b AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id_2}'"
"),"
" border AS ("
" SELECT ST_Intersection(a.geometry, b.geometry) AS line"
" FROM a, b"
" WHERE ST_Intersects(a.geometry, b.geometry)"
")"
" SELECT ST_AsGeoJSON(ST_Buffer(border.line, {buffer_km} * 1000.0 / 111320.0)) AS geometry"
" FROM border"
" WHERE border.line IS NOT NULL"
),
question_hints=[
"{buffer_km} km zone along the border between {anchor_1_name} and {anchor_2_name}",
"the {buffer_km} km border corridor between {anchor_1_name} and {anchor_2_name}",
"area within {buffer_km} km of the {anchor_1_name}-{anchor_2_name} border",
"the region straddling the border of {anchor_1_name} and {anchor_2_name} within {buffer_km} km",
"{buffer_km} km on either side of the {anchor_1_name} and {anchor_2_name} border",
"buffer the {anchor_1_name}-{anchor_2_name} boundary by {buffer_km} km",
"show the border zone between {anchor_1_name} and {anchor_2_name}",
"map the corridor along the {anchor_1_name}-{anchor_2_name} border",
"find the area near the border of {anchor_1_name} and {anchor_2_name}",
"give me the border buffer for {anchor_1_name} and {anchor_2_name}",
"border area of {anchor_1_name} and {anchor_2_name} within {buffer_km} km",
],
),
# ── SET OPERATIONS ───────────────────────────────────────────────────────
# union_01 / union_02: 2-anchor and filtered-containment unions.
# union_03: 3-anchor union β€” trains the model on IN-clause with 3 IDs.
# contain_multi: subtype within multiple countries via country IN clause.
SQLTemplate(
template_id="union_01",
family="set_operations",
sql_difficulty="medium-hard",
anchor_source="divisions_area",
num_anchors=2,
sql_template=(
"SELECT ST_AsGeoJSON(ST_Union_Agg(geometry)) AS geometry,"
" array_agg(names.\"primary\") AS names"
" FROM read_parquet('divisions_area')"
" WHERE id IN ('{anchor_id_1}', '{anchor_id_2}')"
),
question_hints=[
"the combined area of {anchor_1_name} and {anchor_2_name}",
"union of {anchor_1_name} and {anchor_2_name}",
"merge {anchor_1_name} and {anchor_2_name}",
"{anchor_1_name} and {anchor_2_name} together",
"combined geometry of {anchor_1_name} and {anchor_2_name}",
],
),
SQLTemplate(
template_id="union_02",
family="set_operations",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
requires_aggregation=True,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT ST_AsGeoJSON(ST_Union_Agg(b.geometry)) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, a.geometry)"
),
question_hints=[
"merge all {target_subtype}s in {anchor_name} into one geometry",
"combined geometry of all {target_subtype}s in {anchor_name}",
"union of all {target_subtype}s within {anchor_name}",
"all {target_subtype}s of {anchor_name} merged together",
"the overall extent of {target_subtype}s in {anchor_name}",
"show one merged shape for all {target_subtype}s in {anchor_name}",
"dissolve all {target_subtype}s in {anchor_name}",
"make one geometry from all {target_subtype}s in {anchor_name}",
"single outline of all {target_subtype}s in {anchor_name}",
"combine the {target_subtype}s of {anchor_name} into one area",
"merge all districts in {anchor_name} into one shape",
"dissolve all municipalities in {anchor_name}",
"combine all boroughs of {anchor_name} into one geometry",
],
),
SQLTemplate(
template_id="union_03",
family="set_operations",
sql_difficulty="medium-hard",
anchor_source="divisions_area",
num_anchors=3,
sql_template=(
"SELECT ST_AsGeoJSON(ST_Union_Agg(geometry)) AS geometry,"
" array_agg(names.\"primary\") AS names"
" FROM read_parquet('divisions_area')"
" WHERE id IN ('{anchor_id_1}', '{anchor_id_2}', '{anchor_id_3}')"
),
question_hints=[
"show me {anchor_1_name}, {anchor_2_name} and {anchor_3_name}",
"the combined area of {anchor_1_name}, {anchor_2_name} and {anchor_3_name}",
"union of {anchor_1_name}, {anchor_2_name} and {anchor_3_name}",
"merge {anchor_1_name}, {anchor_2_name} and {anchor_3_name}",
"{anchor_1_name}, {anchor_2_name} and {anchor_3_name} together",
"display {anchor_1_name}, {anchor_2_name} and {anchor_3_name}",
],
),
*_make_country_in_templates(),
# ── PARTIAL SELECTION ────────────────────────────────────────────────────
# Bbox clip CTEs use raw geometry; ST_Intersection result wrapped.
SQLTemplate(
template_id="partial_01",
family="partial_selection",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
"),"
" bbox AS ("
" SELECT ST_XMin(geometry) AS xmin, ST_XMax(geometry) AS xmax,"
" ST_YMin(geometry) AS ymin, ST_YMax(geometry) AS ymax FROM a"
"),"
" clip AS ("
" SELECT ST_MakeEnvelope(xmin, (ymin + ymax) / 2.0, xmax, ymax) AS half_geom FROM bbox"
")"
" SELECT ST_AsGeoJSON(ST_Intersection(a.geometry, clip.half_geom)) AS geometry"
" FROM a, clip"
),
question_hints=[
"the northern half of {anchor_name}",
"northern part of {anchor_name}",
"the top half of {anchor_name}",
"northern portion of {anchor_name}",
"upper half of {anchor_name}",
"top side of {anchor_name}",
"show north half of {anchor_name}",
"cut {anchor_name} to the north half",
"only the northern side of {anchor_name}",
"north part of {anchor_name}",
],
),
SQLTemplate(
template_id="partial_02",
family="partial_selection",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
"),"
" bbox AS ("
" SELECT ST_XMin(geometry) AS xmin, ST_XMax(geometry) AS xmax,"
" ST_YMin(geometry) AS ymin, ST_YMax(geometry) AS ymax FROM a"
"),"
" clip AS ("
" SELECT ST_MakeEnvelope(xmin, ymin, xmax, (ymin + ymax) / 2.0) AS half_geom FROM bbox"
")"
" SELECT ST_AsGeoJSON(ST_Intersection(a.geometry, clip.half_geom)) AS geometry"
" FROM a, clip"
),
question_hints=[
"the southern half of {anchor_name}",
"southern part of {anchor_name}",
"the bottom half of {anchor_name}",
"southern portion of {anchor_name}",
"lower half of {anchor_name}",
"bottom side of {anchor_name}",
"show south half of {anchor_name}",
"cut {anchor_name} to the south half",
"only the southern side of {anchor_name}",
"south part of {anchor_name}",
],
),
SQLTemplate(
template_id="partial_03",
family="partial_selection",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
"),"
" bbox AS ("
" SELECT ST_XMin(geometry) AS xmin, ST_XMax(geometry) AS xmax,"
" ST_YMin(geometry) AS ymin, ST_YMax(geometry) AS ymax FROM a"
"),"
" clip AS ("
" SELECT ST_MakeEnvelope((xmin + xmax) / 2.0, ymin, xmax, ymax) AS half_geom FROM bbox"
")"
" SELECT ST_AsGeoJSON(ST_Intersection(a.geometry, clip.half_geom)) AS geometry"
" FROM a, clip"
),
question_hints=[
"the eastern half of {anchor_name}",
"eastern part of {anchor_name}",
"the right half of {anchor_name}",
"eastern portion of {anchor_name}",
"east side of {anchor_name}",
"show east half of {anchor_name}",
"cut {anchor_name} to the east half",
"only the eastern side of {anchor_name}",
"right side of {anchor_name}",
],
),
SQLTemplate(
template_id="partial_04",
family="partial_selection",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
"),"
" bbox AS ("
" SELECT ST_XMin(geometry) AS xmin, ST_XMax(geometry) AS xmax,"
" ST_YMin(geometry) AS ymin, ST_YMax(geometry) AS ymax FROM a"
"),"
" clip AS ("
" SELECT ST_MakeEnvelope(xmin, ymin, (xmin + xmax) / 2.0, ymax) AS half_geom FROM bbox"
")"
" SELECT ST_AsGeoJSON(ST_Intersection(a.geometry, clip.half_geom)) AS geometry"
" FROM a, clip"
),
question_hints=[
"the western half of {anchor_name}",
"western part of {anchor_name}",
"the left half of {anchor_name}",
"western portion of {anchor_name}",
"west side of {anchor_name}",
"show west half of {anchor_name}",
"cut {anchor_name} to the west half",
"only the western side of {anchor_name}",
"left side of {anchor_name}",
],
),
SQLTemplate(
template_id="partial_05",
family="partial_selection",
sql_difficulty="hard",
anchor_source="mixed",
num_anchors=2,
sql_template=(
"WITH a AS ("
" SELECT geometry AS g1 FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
"),"
" b AS ("
" SELECT geometry AS g2 FROM read_parquet('natural_earth') WHERE id = '{clip_feature_id}'"
")"
" SELECT ST_AsGeoJSON(ST_Intersection(a.g1, b.g2)) AS geometry"
" FROM a, b"
" WHERE ST_Intersects(a.g1, b.g2)"
),
question_hints=[
"the part of {anchor_name} that overlaps the {clip_feature_name}",
"{anchor_name} within the {clip_feature_name}",
"the portion of {anchor_name} inside the {clip_feature_name}",
"part of the {clip_feature_name} in {anchor_name}",
"part of {anchor_name} in the {clip_feature_name}",
"clip {anchor_name} to the {clip_feature_name}",
"{anchor_name} clipped to the {clip_feature_name}",
"{clip_feature_name} inside {anchor_name}",
"parts of {anchor_name} covered by the {clip_feature_name}",
"show me where {anchor_name} and the {clip_feature_name} overlap",
"keep only the part of {anchor_name} in the {clip_feature_name}",
"intersection of {anchor_name} and the {clip_feature_name}",
"give me the overlap between {anchor_name} and the {clip_feature_name}",
"only the shared area of {anchor_name} and the {clip_feature_name}",
],
),
# ── AGGREGATION ──────────────────────────────────────────────────────────
# ST_Area uses raw geometry in the ORDER BY; final SELECT wraps output.
SQLTemplate(
template_id="agg_01",
family="aggregation",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype=None, # filled at generation time: county or region
requires_aggregation=True,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name,"
" ST_AsGeoJSON(b.geometry) AS geometry,"
" ST_Area(b.geometry) AS area"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE ST_Within(b.geometry, a.geometry)"
" AND b.subtype = '{target_subtype}'"
" ORDER BY area DESC"
" LIMIT {top_n}"
),
question_hints=[
"top {top_n} largest {target_subtype}s in {anchor_name}",
"biggest {top_n} {target_subtype}s in {anchor_name}",
"{top_n} largest {target_subtype}s inside {anchor_name}",
"the {top_n} biggest {target_subtype}s within {anchor_name}",
"largest {target_subtype} in {anchor_name}",
"which {target_subtype} in {anchor_name} has the most area?",
"show the biggest {target_subtype}s in {anchor_name}",
"list the largest {target_subtype}s for {anchor_name}",
"give me the biggest {target_subtype}s in {anchor_name}",
"{anchor_name} largest {target_subtype}s",
],
),
SQLTemplate(
template_id="agg_02",
family="aggregation",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype=None, # filled at generation time: county or region
requires_aggregation=True,
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name,"
" ST_AsGeoJSON(b.geometry) AS geometry,"
" ST_Area(b.geometry) AS area"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE ST_Within(b.geometry, a.geometry)"
" AND b.subtype = '{target_subtype}'"
" ORDER BY area ASC"
" LIMIT {top_n}"
),
question_hints=[
"top {top_n} smallest {target_subtype}s in {anchor_name}",
"smallest {top_n} {target_subtype}s in {anchor_name}",
"{top_n} smallest {target_subtype}s inside {anchor_name}",
"the {top_n} tiniest {target_subtype}s within {anchor_name}",
"smallest {target_subtype} in {anchor_name}",
"which {target_subtype} in {anchor_name} has the least area?",
"show the smallest {target_subtype}s in {anchor_name}",
"list the smallest {target_subtype}s for {anchor_name}",
"give me the tiniest {target_subtype}s in {anchor_name}",
"{anchor_name} smallest {target_subtype}s",
],
),
SQLTemplate(
template_id="agg_03",
family="aggregation",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype=None, # filled at generation time: county or region
requires_aggregation=True,
sql_template=(
"SELECT id, names.\"primary\" AS name,"
" ST_AsGeoJSON(geometry) AS geometry,"
" ST_Area(geometry) AS area"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
" ORDER BY area DESC"
" LIMIT {top_n}"
),
question_hints=[
"top {top_n} largest {target_subtype}s in {anchor_name}",
"{top_n} biggest {target_subtype}s in {anchor_name}",
"largest {top_n} {target_subtype}s in {anchor_name}",
"the {top_n} largest {target_subtype}s in {anchor_name}",
"biggest {target_subtype} in {anchor_name}",
"which {target_subtype} in {anchor_name} is the largest?",
"show the largest {target_subtype}s in {anchor_name}",
"list the biggest {target_subtype}s in {anchor_name}",
"give me the largest {target_subtype}s for {anchor_name}",
"{anchor_name} biggest {target_subtype}s",
],
),
SQLTemplate(
template_id="agg_04",
family="aggregation",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype=None, # filled at generation time: county or region
requires_aggregation=True,
sql_template=(
"SELECT id, names.\"primary\" AS name,"
" ST_AsGeoJSON(geometry) AS geometry,"
" ST_Area(geometry) AS area"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
" ORDER BY area ASC"
" LIMIT {top_n}"
),
question_hints=[
"top {top_n} smallest {target_subtype}s in {anchor_name}",
"{top_n} smallest {target_subtype}s in {anchor_name}",
"smallest {top_n} {target_subtype}s in {anchor_name}",
"the {top_n} smallest {target_subtype}s in {anchor_name}",
"smallest {target_subtype} in {anchor_name}",
"which {target_subtype} in {anchor_name} is the smallest?",
"show the smallest {target_subtype}s in {anchor_name}",
"list the smallest {target_subtype}s in {anchor_name}",
"give me the smallest {target_subtype}s for {anchor_name}",
"{anchor_name} tiniest {target_subtype}s",
],
),
# ── WINDOW FUNCTION ──────────────────────────────────────────────────────
# CTE keeps raw geometry for ST_Area; final SELECT wraps with ST_AsGeoJSON.
SQLTemplate(
template_id="window_01",
family="window_function",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
requires_aggregation=True,
sql_template=(
"WITH ranked AS ("
" SELECT id, names.\"primary\" AS name, subtype, country, region, geometry,"
" ST_Area(geometry) AS area,"
" ROW_NUMBER() OVER (PARTITION BY region ORDER BY ST_Area(geometry) DESC) AS rn"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
")"
" SELECT id, name, subtype, country, region,"
" ST_AsGeoJSON(geometry) AS geometry, area"
" FROM ranked"
" WHERE rn = 1"
),
question_hints=[
"the largest {target_subtype} in each region of {anchor_name}",
"biggest {target_subtype} per region in {anchor_name}",
"largest {target_subtype} for every region of {anchor_name}",
"the biggest {target_subtype} in each province of {anchor_name}",
"show the largest {target_subtype} in every region of {anchor_name}",
"list biggest {target_subtype} by region in {anchor_name}",
"for each region in {anchor_name}, give the largest {target_subtype}",
"largest {target_subtype}s grouped by region in {anchor_name}",
"one biggest {target_subtype} for each region of {anchor_name}",
"the largest district in each state of {anchor_name}",
"biggest district per province in {anchor_name}",
"the largest municipality in each region of {anchor_name}",
"biggest borough per state in {anchor_name}",
],
),
SQLTemplate(
template_id="window_02",
family="window_function",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
requires_aggregation=True,
sql_template=(
"WITH ranked AS ("
" SELECT id, names.\"primary\" AS name, subtype, country, region, geometry,"
" ST_Area(geometry) AS area,"
" ROW_NUMBER() OVER (PARTITION BY region ORDER BY ST_Area(geometry) ASC) AS rn"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
")"
" SELECT id, name, subtype, country, region,"
" ST_AsGeoJSON(geometry) AS geometry, area"
" FROM ranked"
" WHERE rn = 1"
),
question_hints=[
"the smallest {target_subtype} in each region of {anchor_name}",
"smallest {target_subtype} per region in {anchor_name}",
"tiniest {target_subtype} for every region of {anchor_name}",
"the smallest {target_subtype} in each province of {anchor_name}",
"show the smallest {target_subtype} in every region of {anchor_name}",
"list smallest {target_subtype} by region in {anchor_name}",
"for each region in {anchor_name}, give the smallest {target_subtype}",
"smallest {target_subtype}s grouped by region in {anchor_name}",
"one tiniest {target_subtype} for each region of {anchor_name}",
"the smallest district in each state of {anchor_name}",
"smallest district per province in {anchor_name}",
"the smallest municipality in each region of {anchor_name}",
"tiniest borough per state in {anchor_name}",
],
),
SQLTemplate(
template_id="window_03",
family="window_function",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
requires_aggregation=True,
sql_template=(
"WITH ranked AS ("
" SELECT id, names.\"primary\" AS name, subtype, country, geometry,"
" ST_Area(geometry) AS area,"
" ROW_NUMBER() OVER (PARTITION BY country ORDER BY ST_Area(geometry) DESC) AS rn"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
")"
" SELECT id, name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry, area"
" FROM ranked"
" WHERE rn = 1"
),
question_hints=[
"the largest {target_subtype} in {anchor_name}",
"which {target_subtype} of {anchor_name} has the most area?",
"biggest {target_subtype} in {anchor_name}",
"what is the largest {target_subtype} of {anchor_name}?",
"show the biggest {target_subtype} of {anchor_name}",
"the largest state in {anchor_name}",
"which state of {anchor_name} is the biggest?",
"the largest province in {anchor_name}",
"biggest state of {anchor_name}",
"what is the largest province of {anchor_name}?",
"the largest oblast in {anchor_name}",
"biggest department of {anchor_name}",
"which prefecture of {anchor_name} has the most area?",
],
),
SQLTemplate(
template_id="window_04",
family="window_function",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
requires_aggregation=True,
sql_template=(
"WITH ranked AS ("
" SELECT id, names.\"primary\" AS name, subtype, country, geometry,"
" ST_Area(geometry) AS area,"
" ROW_NUMBER() OVER (PARTITION BY country ORDER BY ST_Area(geometry) ASC) AS rn"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
")"
" SELECT id, name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry, area"
" FROM ranked"
" WHERE rn = 1"
),
question_hints=[
"the smallest {target_subtype} in {anchor_name}",
"which {target_subtype} of {anchor_name} has the least area?",
"tiniest {target_subtype} in {anchor_name}",
"what is the smallest {target_subtype} of {anchor_name}?",
"show the smallest {target_subtype} of {anchor_name}",
"the smallest state in {anchor_name}",
"which state of {anchor_name} is the smallest?",
"the smallest province in {anchor_name}",
"tiniest state of {anchor_name}",
"what is the smallest province of {anchor_name}?",
"the smallest oblast in {anchor_name}",
"smallest department of {anchor_name}",
"which prefecture of {anchor_name} has the least area?",
],
),
# ── ATTRIBUTE FILTER ─────────────────────────────────────────────────────
# No spatial op β€” pure WHERE on is_land / is_territorial / country.
SQLTemplate(
template_id="attr_01",
family="attribute_filter",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"SELECT id, names.\"primary\" AS name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND is_land = TRUE"
" AND subtype = '{target_subtype}'"
),
question_hints=[
"land {target_subtype}s of {anchor_name}",
"which {target_subtype}s of {anchor_name} are on land?",
"{anchor_name}'s land-based {target_subtype}s",
"show the land {target_subtype}s of {anchor_name}",
"find land {target_subtype}s of {anchor_name}",
"give me land-based {target_subtype}s in {anchor_name}",
"which {target_subtype}s of {anchor_name} are land-based?",
"non-island {target_subtype}s of {anchor_name}",
"land districts of {anchor_name}",
"which districts of {anchor_name} are on land?",
"land-based municipalities of {anchor_name}",
"mainland boroughs of {anchor_name}",
],
),
SQLTemplate(
template_id="attr_02",
family="attribute_filter",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"SELECT id, names.\"primary\" AS name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND is_territorial = TRUE"
" AND subtype = '{target_subtype}'"
),
question_hints=[
"territorial {target_subtype}s of {anchor_name}",
"official territorial divisions of {anchor_name}",
"recognised territorial {target_subtype}s belonging to {anchor_name}",
"which territorial regions does {anchor_name} have?",
"show territorial {target_subtype}s of {anchor_name}",
"find official territorial regions of {anchor_name}",
"give me recognised territorial {target_subtype}s of {anchor_name}",
"territorial regions under {anchor_name}",
"{anchor_name} official territorial {target_subtype}s",
"official states of {anchor_name}",
"recognised provinces of {anchor_name}",
"territorial provinces of {anchor_name}",
"official oblasts of {anchor_name}",
"recognised departments of {anchor_name}",
],
),
SQLTemplate(
template_id="attr_03",
family="attribute_filter",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"SELECT id, names.\"primary\" AS name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
" AND is_land = FALSE"
),
question_hints=[
"offshore {target_subtype}s of {anchor_name}",
"{target_subtype}s of {anchor_name} that are not on land",
"water-associated {target_subtype}s of {anchor_name}",
"marine or offshore {target_subtype}s of {anchor_name}",
"show offshore {target_subtype}s of {anchor_name}",
"find {target_subtype}s of {anchor_name} in water",
"give me non-land {target_subtype}s of {anchor_name}",
"water-side {target_subtype}s of {anchor_name}",
"{anchor_name} {target_subtype}s not on land",
"offshore states of {anchor_name}",
"island provinces of {anchor_name}",
"non-land regions of {anchor_name}",
],
),
SQLTemplate(
template_id="attr_04",
family="attribute_filter",
sql_difficulty="medium",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="country",
sql_template=(
"SELECT id, names.\"primary\" AS name, subtype, country,"
" ST_AsGeoJSON(geometry) AS geometry"
" FROM read_parquet('divisions_area')"
" WHERE country = '{country}'"
" AND subtype = '{target_subtype}'"
" AND is_territorial = TRUE"
),
question_hints=[
"show the recognised boundary of {anchor_name}",
"find the official territorial extent of {anchor_name}",
"get the internationally recognised boundary of {anchor_name}",
"map the sovereign boundary of {anchor_name}",
"show the official country geometry for {anchor_name}",
"display the territorial country entry for {anchor_name}",
"give me the recognised country outline of {anchor_name}",
"official boundary of {anchor_name}",
],
),
# ── NATURAL EARTH CHAINED ───────────────────────────────────────────────
# chained_04: localities in a region that intersect a river or lake.
# chained_05: localities in a region that lie on a mountain range.
SQLTemplate(
template_id="chained_04",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('river', 'lake', 'basin')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"riverside {target_subtype}s in {anchor_name}",
"{target_subtype}s in {anchor_name} near a river or lake",
"which {target_subtype}s in {anchor_name} are on a waterway?",
"lakeside or riverside {target_subtype}s within {anchor_name}",
"{target_subtype}s in {anchor_name} that touch a river",
"which {target_subtype}s in {anchor_name} are on a lake?",
"waterfront {target_subtype}s of {anchor_name}",
"show water-side {target_subtype}s in {anchor_name}",
"find {target_subtype}s of {anchor_name} by rivers or lakes",
"{target_subtype}s of {anchor_name} on the water",
"riverside states of {anchor_name}",
"states of {anchor_name} near a river or lake",
"provinces of {anchor_name} along a waterway",
"which regions of {anchor_name} have a major river?",
],
),
SQLTemplate(
template_id="chained_05",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('range/mtn', 'depression')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"mountain {target_subtype}s in {anchor_name}",
"{target_subtype}s in {anchor_name} on a mountain range",
"which {target_subtype}s in {anchor_name} are in the mountains?",
"highland {target_subtype}s within {anchor_name}",
"{target_subtype}s of {anchor_name} in mountainous terrain",
"{target_subtype}s in {anchor_name} near a mountain range",
"show mountain {target_subtype}s in {anchor_name}",
"find {target_subtype}s of {anchor_name} in hilly terrain",
"{target_subtype}s of {anchor_name} by the mountains",
"mountain states of {anchor_name}",
"states of {anchor_name} in mountainous terrain",
"highland provinces of {anchor_name}",
"which regions of {anchor_name} have a mountain range?",
],
),
# ── CHAINED (county-level) ──────────────────────────────────────────────
# Same spatial patterns as chained_01..05 but targeting counties/districts
# so the model learns "coastal districts of X", "riverside counties", etc.
SQLTemplate(
template_id="chained_06",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"coastal {target_subtype}s of {anchor_name}",
"which districts of {anchor_name} are on the coast?",
"{target_subtype}s in {anchor_name} that border the sea",
"seaside {target_subtype}s within {anchor_name}",
"{target_subtype}s of {anchor_name} with ocean access",
"which {target_subtype}s in {anchor_name} touch the sea?",
"maritime {target_subtype}s of {anchor_name}",
"show coastal districts of {anchor_name}",
"find {target_subtype}s of {anchor_name} by the sea",
"give me shoreline {target_subtype}s in {anchor_name}",
"{target_subtype}s of {anchor_name} on the coast",
],
),
SQLTemplate(
template_id="chained_07",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND NOT EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"landlocked {target_subtype}s of {anchor_name}",
"which districts of {anchor_name} have no coastline?",
"interior {target_subtype}s within {anchor_name}",
"{target_subtype}s in {anchor_name} with no sea access",
"non-coastal {target_subtype}s of {anchor_name}",
"inland {target_subtype}s of {anchor_name}",
"show inland districts of {anchor_name}",
"find non-coastal {target_subtype}s in {anchor_name}",
"give me inner {target_subtype}s of {anchor_name}",
"{target_subtype}s of {anchor_name} away from the coast",
],
),
SQLTemplate(
template_id="chained_08",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('river', 'lake', 'basin')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"riverside {target_subtype}s of {anchor_name}",
"which districts of {anchor_name} have a river or lake?",
"{target_subtype}s in {anchor_name} on a waterway",
"lakeside {target_subtype}s within {anchor_name}",
"{target_subtype}s of {anchor_name} along a river",
"which {target_subtype}s in {anchor_name} border a lake?",
"show water-side {target_subtype}s of {anchor_name}",
"find {target_subtype}s in {anchor_name} near rivers and lakes",
"give me riverside districts of {anchor_name}",
"{target_subtype}s of {anchor_name} by the water",
],
),
SQLTemplate(
template_id="chained_09",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="county",
sql_template=(
"WITH region AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, region"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, region.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('range/mtn', 'depression')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"mountain {target_subtype}s of {anchor_name}",
"which districts of {anchor_name} are in the mountains?",
"{target_subtype}s in {anchor_name} on a mountain range",
"highland {target_subtype}s within {anchor_name}",
"{target_subtype}s of {anchor_name} in mountainous terrain",
"which {target_subtype}s in {anchor_name} have mountain ranges?",
"show mountain districts of {anchor_name}",
"find {target_subtype}s of {anchor_name} in hilly areas",
"give me highland {target_subtype}s in {anchor_name}",
"{target_subtype}s of {anchor_name} by the mountains",
],
),
# chained_10 / chained_11: coastal and inland REGIONS of a country.
# Same pattern as chained_06/07 but with target_subtype='region' and
# container forced to a country so phrasings like "coastal states of
# India" / "inland provinces of Kenya" work correctly.
SQLTemplate(
template_id="chained_10",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH country AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, country"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, country.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"coastal states of {anchor_name}",
"coastal regions of {anchor_name}",
"coastal provinces of {anchor_name}",
"which states of {anchor_name} are on the coast?",
"regions of {anchor_name} with sea access",
"states of {anchor_name} that border the ocean",
"maritime states of {anchor_name}",
"seaside regions of {anchor_name}",
"which provinces of {anchor_name} touch the sea?",
"states of {anchor_name} along the coast",
"show coastal regions of {anchor_name}",
"find seaside states of {anchor_name}",
"give me provinces of {anchor_name} on the coast",
"{anchor_name} regions by the sea",
"which oblasts of {anchor_name} are on the coast?",
"coastal departments of {anchor_name}",
"coastal prefectures of {anchor_name}",
"which cantons of {anchor_name} have sea access?",
],
),
SQLTemplate(
template_id="chained_11",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="region",
sql_template=(
"WITH country AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, country"
" WHERE b.subtype = '{target_subtype}'"
" AND ST_Within(b.geometry, country.geometry)"
" AND NOT EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"landlocked states of {anchor_name}",
"inland regions of {anchor_name}",
"non-coastal states of {anchor_name}",
"which states of {anchor_name} have no coast?",
"inland provinces of {anchor_name}",
"regions of {anchor_name} without sea access",
"interior states of {anchor_name}",
"states of {anchor_name} that don't border the ocean",
"show inland regions of {anchor_name}",
"find non-coastal states of {anchor_name}",
"give me inner provinces of {anchor_name}",
"{anchor_name} regions away from the sea",
"landlocked oblasts of {anchor_name}",
"inland departments of {anchor_name}",
"interior prefectures of {anchor_name}",
"inland cantons of {anchor_name}",
],
),
# chained_12 / chained_13: coastal and landlocked COUNTRIES adjacent to anchor.
# Anchor is any country; finds its neighbours that are coastal / landlocked.
SQLTemplate(
template_id="chained_12",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = 'country'"
" AND b.id != '{anchor_id}'"
" AND ST_Touches(a.geometry, b.geometry)"
" AND EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"which countries bordering {anchor_name} are coastal?",
"coastal countries adjacent to {anchor_name}",
"which neighboring countries of {anchor_name} have sea access?",
"countries that touch {anchor_name} and border the ocean",
"which nations next to {anchor_name} are on the coast?",
"find coastal nations bordering {anchor_name}",
"countries neighboring {anchor_name} with a coastline",
"which countries near {anchor_name} touch the sea?",
],
),
SQLTemplate(
template_id="chained_13",
family="chained",
sql_difficulty="hard",
anchor_source="divisions_area",
num_anchors=1,
target_subtype="country",
sql_template=(
"WITH a AS ("
" SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
")"
" SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
" ST_AsGeoJSON(b.geometry) AS geometry"
" FROM read_parquet('divisions_area') AS b, a"
" WHERE b.subtype = 'country'"
" AND b.id != '{anchor_id}'"
" AND ST_Touches(a.geometry, b.geometry)"
" AND NOT EXISTS ("
" SELECT 1 FROM read_parquet('natural_earth') AS n"
" WHERE n.subtype IN ('ocean', 'sea')"
" AND ST_Intersects(b.geometry, n.geometry)"
" )"
),
question_hints=[
"which countries bordering {anchor_name} are landlocked?",
"landlocked countries adjacent to {anchor_name}",
"which neighboring countries of {anchor_name} have no sea access?",
"countries that touch {anchor_name} and are landlocked",
"which nations next to {anchor_name} have no coastline?",
"find landlocked nations bordering {anchor_name}",
"countries neighboring {anchor_name} with no ocean access",
"which countries near {anchor_name} are inland?",
],
),
]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def get_templates_by_family(family: str) -> List[SQLTemplate]:
"""Return all templates for a specific task family."""
return [t for t in TEMPLATES if t.family == family]
def get_template_by_id(template_id: str) -> SQLTemplate:
"""Return a template by its ID, raising ValueError if not found."""
for t in TEMPLATES:
if t.template_id == template_id:
return t
raise ValueError(f"Template '{template_id}' not found")
if __name__ == "__main__":
families: dict = {}
for t in TEMPLATES:
families[t.family] = families.get(t.family, 0) + 1
print("SQL Template Catalog")
print("=" * 60)
for family, count in sorted(families.items()):
print(f"{family:20s}: {count:2d} templates")
print(f"{'TOTAL':20s}: {len(TEMPLATES):2d} templates")
# Verify every template's final SELECT wraps geometry with ST_AsGeoJSON
print()
print("Geometry output check (all should show ST_AsGeoJSON)")
print("=" * 60)
for t in TEMPLATES:
has_geojson = "ST_AsGeoJSON" in t.sql_template
status = "OK" if has_geojson else "MISSING"
print(f" {t.template_id:20s}: {status}")