Spaces:

developmentseed
/

gazet

Running

App Files Files Community

gazet / dataset /scripts /generate_samples.py

srmsoumya

Randomize candidate dataset order

582d1ab 10 days ago

raw

history blame contribute delete

79.7 kB

	"""
	Generate synthetic training samples for text-to-SQL task.

	This script:
	1. Loads relation tables and entity inventories
	2. For each SQL template, samples valid anchors
	3. Renders and executes SQL to verify it works
	4. Builds candidate lists with controlled distractors
	5. Generates natural language questions using an LLM
	6. Saves complete training samples

	Output:
	- output/samples/sample_*.json (individual samples)
	- output/dataset_raw.jsonl (all samples)
	"""

	import json
	import random
	import re
	import warnings
	from pathlib import Path
	from typing import List, Dict, Any, Optional
	from concurrent.futures import ProcessPoolExecutor, as_completed
	from functools import partial

	import duckdb
	import pandas as pd
	from pydantic import BaseModel

	# Suppress warnings
	warnings.filterwarnings('ignore')

	from gazet.config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH

	# Fixed paths embedded in every training SQL string.
	# The model learns these short, stable strings rather than machine-specific
	# local paths. At inference, sql.py's _rewrite_data_paths substitutes them
	# with the actual runtime paths from gazet.config.
	_DIVISIONS_SQL_PATH = 'divisions_area'
	_NATURAL_EARTH_SQL_PATH = 'natural_earth'


	def _for_execution(sql: str) -> str:
	    """Return ``sql`` with the symbolic parquet placeholders made executable.

	    Only the exact ``read_parquet('<placeholder>')`` spelling is rewritten,
	    so a bare ``'divisions_area'`` string elsewhere in the query is left
	    untouched. The placeholders come from the module constants above so the
	    rendered training SQL and this rewrite cannot drift apart.

	    Args:
	        sql: SQL text rendered from a template with symbolic paths.

	    Returns:
	        The same SQL with each placeholder replaced by the corresponding
	        local path from ``gazet.config``.
	    """
	    # Order matters only if a real path itself contained a placeholder;
	    # divisions_area is substituted first, then natural_earth.
	    substitutions = (
	        (_DIVISIONS_SQL_PATH, DIVISIONS_AREA_PATH),
	        (_NATURAL_EARTH_SQL_PATH, NATURAL_EARTH_PATH),
	    )
	    for placeholder, real_path in substitutions:
	        sql = sql.replace(
	            f"read_parquet('{placeholder}')",
	            f"read_parquet('{real_path}')",
	        )
	    return sql

	# Configurable parameters (can be overridden by CLI)
	# NOTE(review): none of these four names is read in the part of the file
	# visible here; the per-line notes below are inferred from the names and
	# imports only — confirm against main() / the CLI parser.
	TARGET_COUNTS = None # Will be set in main() or by CLI
	# Presumably the ProcessPoolExecutor pool size (the module imports it) — confirm.
	MAX_WORKERS = 8
	# Presumably scales how many attempts are made per requested sample — confirm.
	RETRY_MULTIPLIER = 2
	# Presumably switches between appending to and overwriting existing output — confirm.
	APPEND_MODE = False


	# Family-independent paraphrase rules, tried for every question.
	# Each entry is a (label, regex pattern, replacement options) tuple, which is
	# how _diversify_question_surface unpacks it. That function matches the
	# pattern case-insensitively, substitutes only the first occurrence, and
	# picks one replacement option at random.
	_GENERIC_SURFACE_RULES = [
	("spelling_neighboring", r"\bneighbouring\b", ["neighboring"]),
	("spelling_neighbors", r"\bneighbours\b", ["neighbors"]),
	("expand_whats", r"\bwhat's\b", ["what is"]),
	("show_me", r"\bshow me\b", ["show", "display"]),
	("give_me", r"\bgive me\b", ["show", "list"]),
	("pull_up", r"\bpull up\b", ["show", "display"]),
	("find_to_show", r"\bfind\b", ["show", "locate"]),
	("kilometers_variant", r"\bkilometers\b", ["km"]),
	("metres_variant", r"\bmetres\b", ["meters"]),
	("recognised_variant", r"\brecognised\b", ["recognized"]),
	]

	# Paraphrase rules that only apply to questions of one template family.
	# Keys are template family names; each value is a list of
	# (label, regex pattern, replacement options) tuples with the same shape as
	# _GENERIC_SURFACE_RULES. Replacement options are passed to re.sub, so
	# ``\1`` / ``\2`` refer to groups captured by the pattern.
	_FAMILY_SURFACE_RULES = {
	    "adjacency": [
	        ("which_border_to_next_to", r"\bwhich (.+?) border (.+)\?", [r"which \1 are next to \2?", r"which \1 are adjacent to \2?"]),
	        ("bordering_to_next_to", r"\bbordering (.+)", [r"next to \1", r"adjacent to \1"]),
	        ("touching_to_next_to", r"\btouching (.+)", [r"next to \1"]),
	        ("share_border_to_adjacent", r"share a border with", ["are adjacent to", "are next to"]),
	        ("adjacent_to_next_to", r"adjacent to", ["next to"]),
	    ],
	    "multi_adjacency": [
	        ("which_border_both_to_next_to", r"\bwhich (.+?) border both (.+)\?", [r"which \1 are next to both \2?", r"which \1 are adjacent to both \2?"]),
	        ("touch_both_to_next_to", r"touch both", ["are next to both"]),
	        ("adjacent_both_to_next_to", r"adjacent to both", ["next to both"]),
	    ],
	    "containment": [
	        ("within_to_inside", r"\bwithin\b", ["inside", "in"]),
	        ("inside_to_in", r"\binside\b", ["in"]),
	        ("belonging_to_in", r"belonging to", ["in"]),
	        ("contain_to_have", r"\bcontain\b", ["have"]),
	    ],
	    "intersection": [
	        ("which_intersect_to_overlap", r"\bwhich (.+?) intersect (.+)\?", [r"which \1 overlap \2?"]),
	        ("overlap_with_to_intersect", r"overlap with", ["intersect"]),
	        ("crossing_to_overlapping", r"crossing into", ["overlapping"]),
	        ("partly_in_to_overlap", r"partly in", ["overlapping"]),
	    ],
	    "buffer": [
	        # The unit alternation must use an unescaped ``|``; ``\|`` would only
	        # match a literal pipe character and the rule would never fire on
	        # real questions such as "within 50 km of ...".
	        ("within_distance_to_from", r"within ([0-9]+\s*(?:km|m)) of", [r"up to \1 from", r"at a distance of \1 from"]),
	        ("buffer_to_radius", r"\bbuffer\b", ["radius", "zone"]),
	        ("close_to_near", r"close to", ["near"]),
	        ("around_to_near", r"what is around", ["what is near"]),
	    ],
	    "chained": [
	        ("coastal_to_seaside", r"\bcoastal\b", ["seaside", "maritime"]),
	        ("landlocked_to_inland", r"\blandlocked\b", ["inland"]),
	        ("sea_access_to_coast", r"sea access", ["a coastline"]),
	    ],
	    "difference": [
	        ("part_to_portion", r"\bpart of\b", ["portion of", "section of"]),
	        ("outside_to_excluding", r"\boutside\b", ["excluding"]),
	    ],
	    "border_corridor": [
	        ("zone_to_buffer", r"\bzone\b", ["buffer", "corridor"]),
	        # NOTE(review): the "along the" option drops the captured distance
	        # from the question; confirm that is intended for templates whose
	        # SQL depends on that distance.
	        ("within_distance_to_along", r"within ([0-9]+ km) of the", [r"along the", r"up to \1 from the"]),
	    ],
	    "set_operations": [
	        ("combined_to_merged", r"combined", ["merged"]),
	        ("union_of_to_merged_area", r"\bunion of\b", ["merged area of", "combined area of"]),
	        ("merge_to_combine", r"\bmerge\b", ["combine"]),
	        ("together_to_combined", r"\btogether\b", ["combined"]),
	    ],
	    "partial_selection": [
	        ("part_to_portion", r"\bpart of\b", ["portion of", "section of"]),
	        ("half_to_side", r"\bhalf\b", ["side"]),
	    ],
	    "aggregation": [
	        ("largest_to_biggest", r"\blargest\b", ["biggest"]),
	        ("smallest_to_tiniest", r"\bsmallest\b", ["tiniest"]),
	    ],
	    "window_function": [
	        ("largest_to_biggest", r"\blargest\b", ["biggest"]),
	        ("smallest_to_tiniest", r"\bsmallest\b", ["tiniest"]),
	    ],
	    "attribute_filter": [
	        ("official_to_recognized", r"\bofficial\b", ["recognized", "recognized territorial"]),
	        ("land_based_to_on_land", r"land-based", ["on-land", "on land"]),
	        ("sovereign_to_official", r"\bsovereign\b", ["official"]),
	    ],
	    "direct_lookup": [
	        ("where_is_to_show", r"\bwhere is\b", ["show", "locate"]),
	        ("map_of_to_outline", r"\bmap of\b", ["outline of"]),
	    ],
	    "disambiguation": [
	        ("show_me_to_find", r"\bshow me\b", ["find", "show"]),
	        ("pull_up_to_find", r"\bpull up\b", ["find", "show"]),
	    ],
	}


	def _diversify_question_surface(question: str, family: str) -> tuple[str, List[str]]:
	    """Lightly paraphrase ``question`` so templates are harder to memorize.

	    Roughly 35% of calls (and every empty question) return the input
	    unchanged. Otherwise one or two rewrite passes run; each pass collects
	    every (rule, replacement option) pair whose pattern matches the current
	    text, picks one at random and substitutes its first occurrence. The
	    rewrites are shallow, local word swaps so the question should keep
	    describing the same SQL.

	    Args:
	        question: Question text rendered from a template hint.
	        family: Template family; selects the family-specific rules that are
	            tried in addition to the generic ones.

	    Returns:
	        ``(text, labels)`` where ``labels`` lists ``"<family>:<rule label>"``
	        for every rewrite that actually changed the text (generic rules are
	        also prefixed with ``family``).
	    """
	    if not question:
	        return question, []
	    if random.random() < 0.35:
	        return question, []

	    candidate_rules = _GENERIC_SURFACE_RULES + _FAMILY_SURFACE_RULES.get(family, [])
	    current = question
	    labels: List[str] = []
	    # Coin flip between a single pass and two passes.
	    passes = 1 if random.random() >= 0.5 else 2

	    for _ in range(passes):
	        # One option per replacement string, so rules offering more
	        # alternatives are proportionally more likely to be drawn.
	        options = [
	            (label, pattern, replacement)
	            for label, pattern, replacements in candidate_rules
	            if re.search(pattern, current, flags=re.IGNORECASE)
	            for replacement in replacements
	        ]
	        if not options:
	            break

	        label, pattern, replacement = random.choice(options)
	        substituted = re.sub(pattern, replacement, current, count=1, flags=re.IGNORECASE)
	        if substituted != current:
	            # Collapse any doubled whitespace the substitution left behind.
	            current = re.sub(r"\s+", " ", substituted).strip()
	            labels.append(f"{family}:{label}")

	    return current, labels


	# Import templates from same directory
	# The relative import means this module has to be loaded as part of its
	# package rather than run as a standalone file.
	from . import sql_templates
	# Module-level aliases for the names used from sql_templates. SQLTemplate is
	# the annotation of the ``template`` argument of generate_template_based_sample;
	# TEMPLATES and get_templates_by_family are not referenced in the part of
	# the file visible here.
	TEMPLATES = sql_templates.TEMPLATES
	SQLTemplate = sql_templates.SQLTemplate
	get_templates_by_family = sql_templates.get_templates_by_family


	# Default natural_earth subtypes for direct-lookup anchors:
	# generate_template_based_sample falls back to this set when the template id
	# has no entry in _NE_TEMPLATE_SUBTYPES.
	_NE_NAMED_LOOKUP_SUBTYPES = {
	'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay',
	'island group', 'peninsula', 'strait', 'range/mtn', 'depression',
	}

	# Allowed natural_earth subtypes per template id.
	# generate_template_based_sample looks a template up here (via .get) to
	# restrict which natural_earth features may be sampled as the anchor; a
	# template id missing from this mapping yields None, i.e. no subtype filter
	# (direct lookups instead fall back to _NE_NAMED_LOOKUP_SUBTYPES).
	_NE_TEMPLATE_SUBTYPES = {
	'lookup_02': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'island group', 'peninsula', 'strait', 'range/mtn', 'depression'},
	'adj_03': {'sea', 'ocean'},
	'adj_09': {'river', 'lake', 'basin'},
	'adj_10': {'range/mtn', 'peninsula', 'depression'},
	'adj_06': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'strait', 'range/mtn', 'peninsula', 'depression', 'plateau', 'plain', 'lowland', 'valley', 'gorge'},
	'adj_07': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'strait', 'range/mtn', 'peninsula', 'depression', 'plateau', 'plain', 'lowland', 'valley', 'gorge'},
	'adj_08': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'strait', 'range/mtn', 'peninsula', 'depression', 'plateau', 'plain', 'lowland', 'valley', 'gorge'},
	'contain_04': {'sea', 'ocean', 'gulf', 'bay', 'basin', 'island group', 'peninsula', 'range/mtn', 'depression'},
	'contain_05': {'sea', 'ocean', 'gulf', 'bay', 'strait'},
	'intersect_03': {'river', 'lake', 'basin', 'gulf', 'bay', 'strait', 'range/mtn', 'peninsula', 'depression'},
	'intersect_04': {'river', 'lake', 'basin', 'gulf', 'bay', 'strait', 'range/mtn', 'peninsula', 'depression'},
	'intersect_06': {'river', 'lake', 'basin', 'gulf', 'bay', 'strait', 'range/mtn', 'peninsula', 'depression'},
	'buffer_02': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'island group', 'peninsula', 'strait', 'range/mtn', 'depression'},
	'buffer_11': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'island group', 'peninsula', 'strait', 'range/mtn', 'depression'},
	'chained_03': {'island group', 'peninsula', 'range/mtn', 'depression'},
	'chained_04': {'river', 'lake', 'basin'},
	'chained_05': {'range/mtn', 'depression'},
	'chained_08': {'river', 'lake', 'basin'},
	'chained_09': {'range/mtn', 'depression'},
	'partial_05': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'island group', 'peninsula', 'strait', 'range/mtn', 'depression'},
	'diff_02': {'sea', 'ocean', 'lake', 'river', 'basin', 'gulf', 'bay', 'island group', 'peninsula', 'strait', 'range/mtn', 'depression'},
	}


	class Candidate(BaseModel):
	"""Candidate entity for grounding.

	One entry of the candidate list attached to a training sample: either the
	true anchor or a distractor returned by the fuzzy name search.
	"""
	# Position label within one candidate list ("c1", "c2", ...); the list
	# builders in this module reassign it after merging or deduplicating.
	candidate_id: str
	# Dataset the entity was read from; this module uses "divisions_area" and
	# "natural_earth".
	source: str
	# Entity id as stored in the source parquet file.
	id: str
	# Primary name (names."primary" in the source data).
	name: str
	subtype: Optional[str] = None
	# country / region / admin_level are left as None for true anchors read
	# from natural_earth, whose lookup query does not select them.
	country: Optional[str] = None
	region: Optional[str] = None
	admin_level: Optional[int] = None
	# Jaro-Winkler similarity to the anchor name for distractors; set to 1.0
	# for the true anchor.
	similarity: float = 0.0


	class TrainingSample(BaseModel):
	"""Complete training sample.

	Bundles the natural-language question with the candidate list shown to the
	model and the expected output.
	"""
	id: str
	question: str
	# Candidate entities (true anchors plus distractors) for this question.
	candidates: List[Candidate]
	# NOTE(review): the key schemas of ``target`` and ``metadata`` are defined
	# where samples are assembled, outside the part of the file visible here.
	target: Dict[str, Any]
	metadata: Dict[str, Any]


	def load_relation_tables(intermediate_dir: Path, quiet: bool = False) -> Dict[str, pd.DataFrame]:
	    """Read every ``*.parquet`` file directly inside ``intermediate_dir``.

	    Args:
	        intermediate_dir: Directory holding the precomputed relation tables.
	        quiet: When true, skip the per-table row-count line printed to stdout.

	    Returns:
	        Mapping from file stem (file name without ``.parquet``) to the loaded
	        DataFrame. Empty when the directory contains no parquet files.
	    """
	    loaded: Dict[str, pd.DataFrame] = {}

	    for parquet_path in intermediate_dir.glob("*.parquet"):
	        table_name = parquet_path.stem
	        frame = pd.read_parquet(parquet_path)
	        loaded[table_name] = frame
	        if quiet:
	            continue
	        print(f" {table_name}: {len(frame)} rows")

	    return loaded


	def sample_adjacency_anchor(
	    adjacency_df: pd.DataFrame,
	    target_subtype: Optional[str] = None,
	    anchor_subtypes: Optional[List[str]] = None,
	) -> Optional[Dict[str, Any]]:
	    """Draw one random adjacency pair, optionally constrained by subtype.

	    ``target_subtype`` is a hard filter: if no row has that neighbour
	    subtype the function returns ``None``. ``anchor_subtypes`` is a soft
	    preference: rows whose anchor subtype is in the list are used when any
	    exist, otherwise the anchor filter is ignored and sampling falls back to
	    the rows that passed the target filter.

	    Args:
	        adjacency_df: Table of adjacency pairs.
	        target_subtype: Required subtype of the neighbouring feature.
	        anchor_subtypes: Preferred subtypes of the anchor feature.

	    Returns:
	        Dict with the anchor and target fields of the sampled row, or
	        ``None`` if nothing can be sampled. ``anchor_country`` and the
	        ``target_*`` values are ``None`` when the table lacks that column.
	    """
	    if adjacency_df.empty:
	        return None

	    pool = adjacency_df
	    if target_subtype is not None:
	        pool = pool[pool['target_subtype'] == target_subtype]
	        if pool.empty:
	            return None
	    if anchor_subtypes is not None:
	        preferred = pool[pool['anchor_subtype'].isin(anchor_subtypes)]
	        pool = pool if preferred.empty else preferred

	    picked = pool.sample(n=1).iloc[0]

	    # Anchor columns are mandatory (KeyError if absent); the rest are optional.
	    result: Dict[str, Any] = {
	        column: picked[column]
	        for column in ('anchor_id', 'anchor_name', 'anchor_subtype')
	    }
	    for column in ('anchor_country', 'target_id', 'target_name', 'target_subtype'):
	        result[column] = picked.get(column)
	    return result


	def sample_intersection_anchor(intersection_df: pd.DataFrame) -> Optional[Dict[str, Any]]:
	    """Draw one random row from the intersection-pair table.

	    Args:
	        intersection_df: Table of intersecting feature pairs.

	    Returns:
	        Dict with the anchor and target fields of the sampled row, or
	        ``None`` for an empty table. The ``target_*`` values are ``None``
	        when the table lacks that column.
	    """
	    if intersection_df.empty:
	        return None

	    picked = intersection_df.sample(n=1).iloc[0]

	    result: Dict[str, Any] = {}
	    for column in ('anchor_id', 'anchor_name', 'anchor_subtype'):
	        result[column] = picked[column]
	    for column in ('target_id', 'target_name', 'target_subtype'):
	        result[column] = picked.get(column)
	    return result


	def sample_containment_anchor(containment_df: pd.DataFrame) -> Optional[Dict[str, Any]]:
	    """Draw one random (container, contained) pair.

	    Both sides of the pair are returned so a caller that also needs the
	    contained entity (for example to clip the container by it) does not
	    have to sample a second time.

	    Args:
	        containment_df: Table of containment pairs.

	    Returns:
	        Dict with the ``container_*`` fields followed by the ``contained_*``
	        fields, or ``None`` for an empty table.
	    """
	    if containment_df.empty:
	        return None

	    picked = containment_df.sample(n=1).iloc[0]
	    return {
	        f'{side}_{field}': picked[f'{side}_{field}']
	        for side in ('container', 'contained')
	        for field in ('id', 'name', 'subtype')
	    }


	def sample_disambiguation_anchor(
	    containment_df: pd.DataFrame,
	    contained_subtypes: List[str],
	    container_subtypes: List[str],
	) -> Optional[Dict[str, Any]]:
	    """Draw a (contained, container) pair restricted to the given subtypes.

	    Serves "Puri, Odisha"-style disambiguation templates: the contained
	    entity is what the question asks for and the container supplies the
	    disambiguating context.

	    Args:
	        containment_df: Table of containment pairs.
	        contained_subtypes: Allowed subtypes for the contained entity.
	        container_subtypes: Allowed subtypes for the container entity.

	    Returns:
	        Dict with the ``contained_*`` fields followed by the ``container_*``
	        fields, or ``None`` when the table is empty or no pair matches both
	        subtype filters.
	    """
	    if containment_df.empty:
	        return None

	    contained_ok = containment_df['contained_subtype'].isin(contained_subtypes)
	    container_ok = containment_df['container_subtype'].isin(container_subtypes)
	    eligible = containment_df[contained_ok & container_ok]
	    if eligible.empty:
	        return None

	    picked = eligible.sample(n=1).iloc[0]
	    return {
	        f'{side}_{field}': picked[f'{side}_{field}']
	        for side in ('contained', 'container')
	        for field in ('id', 'name', 'subtype')
	    }


	def sample_cross_source_anchor(
	    cross_source_df: pd.DataFrame,
	    natural_subtypes: Optional[set[str]] = None,
	    relation_types: Optional[set[str]] = None,
	) -> Optional[Dict[str, Any]]:
	    """Draw one random division / natural-feature relation.

	    Args:
	        cross_source_df: Table relating division rows to natural-feature rows.
	        natural_subtypes: If given, keep only rows whose ``natural_subtype``
	            is in this set.
	        relation_types: If given, keep only rows whose ``relation_type`` is
	            in this set.

	    Returns:
	        Dict with the division fields, the natural-feature fields and the
	        relation type of the sampled row, or ``None`` when the table is
	        empty or no row survives the filters.
	    """
	    if cross_source_df.empty:
	        return None

	    # Apply whichever filters were supplied, in a fixed order.
	    pool = cross_source_df
	    for column, allowed in (
	        ('natural_subtype', natural_subtypes),
	        ('relation_type', relation_types),
	    ):
	        if allowed is not None:
	            pool = pool[pool[column].isin(allowed)]
	    if pool.empty:
	        return None

	    picked = pool.sample(n=1).iloc[0]
	    columns = (
	        'division_id', 'division_name', 'division_subtype',
	        'natural_id', 'natural_name', 'natural_subtype',
	        'relation_type',
	    )
	    return {column: picked[column] for column in columns}


	def _merge_candidate_lists(
	*lists: List[Candidate],
	max_total: int = 10,
	) -> List[Candidate]:
	"""Merge N candidate lists, deduplicate by id, reassign candidate_ids.

	Interleaves the lists so each anchor is represented before any anchor
	gets a second candidate — matching the grouped-then-interleaved order
	that inference produces.
	"""
	from itertools import zip_longest

	seen: set = set()
	merged: List[Candidate] = []
	for row in zip_longest(*lists):
	for c in row:
	if c is None:
	continue
	if c.id not in seen:
	merged.append(c)
	seen.add(c.id)
	if len(merged) >= max_total:
	break
	if len(merged) >= max_total:
	break
	for i, c in enumerate(merged, 1):
	c.candidate_id = f"c{i}"
	return merged


	def _dedupe_country_candidates(
	candidates: List[Candidate],
	max_total: Optional[int] = None,
	) -> List[Candidate]:
	"""Deduplicate country candidates by country code, preserving first match.

	This is useful for templates whose SQL uses ``country IN (...)`` rather
	than candidate IDs. Overture can contain multiple country-level rows for
	the same ISO code, which weakens grounding if they all remain in the list.
	"""
	deduped: List[Candidate] = []
	seen_keys: set[tuple[str, str]] = set()
	for cand in candidates:
	if cand.subtype == "country" and cand.country:
	key = ("country", cand.country)
	else:
	key = ("id", cand.id)
	if key in seen_keys:
	continue
	deduped.append(cand)
	seen_keys.add(key)
	if max_total is not None and len(deduped) >= max_total:
	break

	for i, cand in enumerate(deduped, 1):
	cand.candidate_id = f"c{i}"
	return deduped


	def build_candidate_list(
	con: duckdb.DuckDBPyConnection,
	anchor_id: str,
	anchor_name: str,
	anchor_source: str,
	num_candidates: int = 10,
	difficulty: str = "medium"
	) -> List[Candidate]:
	"""Build candidate list with true anchor + distractors.

	The anchor row is looked up by id in the parquet file for
	``anchor_source`` and becomes the first candidate with similarity 1.0.
	The remaining slots are filled with name-similar distractors from
	``build_distractors``. Duplicated ids are dropped, the list is cut to
	``num_candidates`` and candidate ids are renumbered ``c1``, ``c2``, ...

	Args:
	con: Open DuckDB connection used to read the parquet sources.
	anchor_id: Id of the true entity.
	anchor_name: Name used to rank distractors by string similarity.
	anchor_source: "divisions_area" reads the divisions parquet; any other
	value reads the natural_earth parquet.
	num_candidates: Maximum length of the returned list.
	difficulty: Forwarded to ``build_distractors``, whose visible body does
	not read it.

	Returns:
	Candidates with the true anchor first.

	NOTE(review): ``.iloc[0]`` raises IndexError when ``anchor_id`` is not
	present in the source; nothing in this function guards against that.
	"""

	# Helper to convert pandas NA to None
	def safe_get(row, key, default=None):
	val = row.get(key, default)
	return None if pd.isna(val) else val

	# Get the true anchor
	if anchor_source == "divisions_area":
	query = """
	SELECT
	id,
	names."primary" AS name,
	subtype,
	country,
	region,
	admin_level
	FROM read_parquet(?)
	WHERE id = ?
	"""
	anchor_row = con.execute(query, [DIVISIONS_AREA_PATH, anchor_id]).fetchdf().iloc[0]
	else:
	# Only id, name and subtype are selected here, so country, region and
	# admin_level resolve to None through safe_get below.
	query = """
	SELECT
	id,
	names."primary" AS name,
	subtype
	FROM read_parquet(?)
	WHERE id = ?
	"""
	anchor_row = con.execute(query, [NATURAL_EARTH_PATH, anchor_id]).fetchdf().iloc[0]

	# The id passed in by the caller is used as-is; only the descriptive
	# fields come from the row that was read.
	true_candidate = Candidate(
	candidate_id="c1",
	source=anchor_source,
	id=anchor_id,
	name=safe_get(anchor_row, 'name'),
	subtype=safe_get(anchor_row, 'subtype'),
	country=safe_get(anchor_row, 'country'),
	region=safe_get(anchor_row, 'region'),
	admin_level=safe_get(anchor_row, 'admin_level'),
	similarity=1.0,
	)

	# One slot is reserved for the true anchor.
	distractors = build_distractors(
	con,
	anchor_name,
	anchor_source,
	anchor_id,
	num_candidates - 1,
	difficulty,
	)

	# Deduplicate by underlying entity id while preserving order.
	# Some parquet sources contain repeated rows for the same feature id,
	# which can otherwise leak duplicate candidates into the dataset.
	candidates: List[Candidate] = []
	seen_ids: set[str] = set()
	for cand in [true_candidate] + distractors:
	if cand.id in seen_ids:
	continue
	candidates.append(cand)
	seen_ids.add(cand.id)
	if len(candidates) >= num_candidates:
	break

	# Renumber so candidate ids are contiguous after deduplication.
	for i, cand in enumerate(candidates, 1):
	cand.candidate_id = f"c{i}"

	return candidates


	def build_distractors(
	con: duckdb.DuckDBPyConnection,
	anchor_name: str,
	anchor_source: str,
	exclude_id: str,
	num_distractors: int,
	difficulty: str,
	cross_source_ratio: float = 0.5,
	) -> List[Candidate]:
	"""Build distractor candidates using fuzzy search.

	Always includes candidates from both sources so the model sees mixed
	``source`` values in every training example — matching the inference
	behaviour where search.py queries divisions_area AND natural_earth equally
	(5 results each per place).

	Args:
	con: Open DuckDB connection used to read the parquet sources.
	anchor_name: Name the distractors are ranked against.
	anchor_source: Source of the true anchor; "divisions_area" makes
	natural_earth the other source, any other value the reverse.
	exclude_id: Id of the true anchor, excluded from the same-source query.
	num_distractors: Total number of distractors requested.
	difficulty: Not read anywhere in this function.
	cross_source_ratio: Fraction of distractors drawn from the other
	source. Defaults to 0.5 (50/50 split) to match inference exactly.

	Returns:
	Same-source distractors followed by cross-source distractors, all with
	the placeholder candidate_id "temp".
	"""

	# Convert pandas missing values (NA / NaN / None) to None.
	def safe_get(row, key, default=None):
	val = row.get(key, default)
	return None if pd.isna(val) else val

	# Return the ``n`` rows of the parquet file at ``path`` whose primary name
	# is most similar to ``anchor_name`` (Jaro-Winkler on lower-cased names).
	# ROW_NUMBER ... PARTITION BY id with rn = 1 keeps one row per id; rows
	# with the excluded id, a NULL or blank name, or a NULL geometry are
	# dropped first.
	# NOTE(review): country, region and admin_level are selected for both
	# sources, so this assumes the natural_earth file also has those columns.
	def _query_source(path: str, src_name: str, n: int, excl_id: str) -> List[Candidate]:
	query = """
	WITH ranked AS (
	SELECT
	id,
	names."primary" AS name,
	subtype,
	country,
	region,
	admin_level,
	jaro_winkler_similarity(lower(names."primary"), lower(?)) AS similarity,
	ROW_NUMBER() OVER (
	PARTITION BY id
	ORDER BY jaro_winkler_similarity(lower(names."primary"), lower(?)) DESC
	) AS rn
	FROM read_parquet(?)
	WHERE id != ?
	AND names."primary" IS NOT NULL
	AND trim(names."primary") != ''
	AND geometry IS NOT NULL
	)
	SELECT
	id,
	name,
	subtype,
	country,
	region,
	admin_level,
	similarity
	FROM ranked
	WHERE rn = 1
	ORDER BY similarity DESC
	LIMIT ?
	"""
	# Bind order follows the ? placeholders: name (select), name (window
	# order), parquet path, excluded id, limit.
	df = con.execute(query, [anchor_name, anchor_name, path, excl_id, n]).fetchdf()
	results = []
	for _, row in df.iterrows():
	results.append(Candidate(
	candidate_id="temp",
	source=src_name,
	id=row["id"],
	name=safe_get(row, "name"),
	subtype=safe_get(row, "subtype"),
	country=safe_get(row, "country"),
	region=safe_get(row, "region"),
	admin_level=safe_get(row, "admin_level"),
	similarity=float(row["similarity"]),
	))
	return results

	# At least one distractor always comes from the other source. round()
	# rounds halves to the nearest even integer, so 9 distractors at ratio 0.5
	# split into 4 cross-source and 5 same-source.
	# NOTE(review): num_distractors = 0 gives same_n = -1, which is passed to
	# LIMIT as-is — confirm callers never request zero distractors.
	cross_n = max(1, round(num_distractors * cross_source_ratio))
	same_n = num_distractors - cross_n

	# The cross-source query passes "" as the excluded id, i.e. it excludes
	# nothing — presumably because the anchor id cannot occur in the other
	# source.
	if anchor_source == "divisions_area":
	same = _query_source(DIVISIONS_AREA_PATH, "divisions_area", same_n, exclude_id)
	cross = _query_source(NATURAL_EARTH_PATH, "natural_earth", cross_n, "")
	else:
	same = _query_source(NATURAL_EARTH_PATH, "natural_earth", same_n, exclude_id)
	cross = _query_source(DIVISIONS_AREA_PATH, "divisions_area", cross_n, "")

	return same + cross


	def sample_random_entity(
	    con: duckdb.DuckDBPyConnection,
	    inventory_df: pd.DataFrame,
	    source: str,
	    subtypes: Optional[set[str]] = None,
	    countries: Optional[set[str]] = None,
	) -> Optional[Dict[str, Any]]:
	    """Pick one random entity from an inventory table.

	    Args:
	        con: Not read by this function.
	        inventory_df: Entity inventory with at least ``id`` and ``name``
	            columns.
	        source: Label copied into the result's ``source`` field.
	        subtypes: If given, keep only rows whose ``subtype`` is in this set.
	        countries: If given, keep only rows whose ``country`` is in this
	            set; ignored when the inventory has no ``country`` column.

	    Returns:
	        Dict with ``id``, ``name``, ``subtype``, ``country`` and ``source``,
	        or ``None`` when the inventory is empty or no row survives the
	        filters. ``subtype`` and ``country`` are ``None`` when the inventory
	        lacks that column.
	    """
	    if inventory_df.empty:
	        return None

	    pool = inventory_df
	    if subtypes is not None:
	        pool = pool[pool['subtype'].isin(subtypes)]
	    has_country_column = 'country' in pool.columns
	    if countries is not None and has_country_column:
	        pool = pool[pool['country'].isin(countries)]
	    if pool.empty:
	        return None

	    picked = pool.sample(n=1).iloc[0]
	    entity: Dict[str, Any] = {'id': picked['id'], 'name': picked['name']}
	    entity['subtype'] = picked.get('subtype')
	    entity['country'] = picked.get('country')
	    entity['source'] = source
	    return entity


	def generate_template_based_sample(
	con: duckdb.DuckDBPyConnection,
	template: SQLTemplate,
	tables: Dict[str, pd.DataFrame],
	sample_id: str
	) -> Optional[TrainingSample]:
	"""Generate a sample based on a SQL template."""

	# Sample anchor based on template requirements
	if template.family == "direct_lookup":
	# Just pick a random entity
	if template.anchor_source == "divisions_area":
	anchor = sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	else:
	anchor = sample_random_entity(
	con,
	tables['natural_earth_inventory'],
	'natural_earth',
	subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id, _NE_NAMED_LOOKUP_SUBTYPES),
	)

	if not anchor:
	return None

	# Render SQL
	sql = template.sql_template.format(
	anchor_id=anchor['id']
	)

	# Build candidates
	candidates = build_candidate_list(
	con, anchor['id'], anchor['name'], anchor['source'],
	num_candidates=10, difficulty="easy"
	)

	# Question
	question = random.choice(template.question_hints).format(anchor_name=anchor['name'])

	elif template.family == "disambiguation":
	# "Puri, Odisha" style: pick a (contained, container) pair whose
	# subtypes match the template, build candidates that include the
	# container + same-name distractors so the model must read the CSV
	# to pick the right entry.
	_disambig_subtypes = {
	"disambiguate_01": (["county"], ["region", "country"]),
	"disambiguate_02": (["county"], ["country"]),
	"disambiguate_03": (["region"], ["country"]),
	}
	contained_sts, container_sts = _disambig_subtypes.get(
	template.template_id, (["county"], ["country"])
	)

	pair = sample_disambiguation_anchor(
	tables["containment_pairs"], contained_sts, container_sts
	)
	if not pair:
	return None

	candidates = build_candidate_list(
	con, pair["contained_id"], pair["contained_name"], "divisions_area",
	num_candidates=10, difficulty="hard"
	)

	# Ensure the container is among the candidates so the model can
	# ground the disambiguation context (e.g. "Odisha").
	if not any(c.id == pair["container_id"] for c in candidates):
	container_rows = con.execute(
	'SELECT id, names."primary" AS name, subtype, country, region, admin_level '
	'FROM read_parquet(?) WHERE id = ? LIMIT 1',
	[DIVISIONS_AREA_PATH, pair["container_id"]]
	).fetchdf()
	if container_rows.empty:
	return None
	crow = container_rows.iloc[0]

	def _nn(v):
	return None if pd.isna(v) else v

	container_cand = Candidate(
	candidate_id="temp",
	source="divisions_area",
	id=pair["container_id"],
	name=_nn(crow["name"]),
	subtype=_nn(crow["subtype"]),
	country=_nn(crow["country"]),
	region=_nn(crow["region"]),
	admin_level=_nn(crow["admin_level"]),
	similarity=0.95,
	)
	# Insert the container right after the true target and drop the
	# last filler distractor so the total stays at 10.
	candidates = [candidates[0], container_cand] + candidates[1:-1]
	for i, c in enumerate(candidates, 1):
	c.candidate_id = f"c{i}"

	sql = template.sql_template.format(anchor_id=pair["contained_id"])

	question = random.choice(template.question_hints).format(
	anchor_name=pair["contained_name"],
	container_name=pair["container_name"],
	)

	# Only the contained entity is the query target — the container is
	# disambiguation context and stays in candidates but NOT in
	# selected_candidates. The model learns to use the container row of
	# the CSV (via country/region columns) to pick the right same-name
	# county or region.
	anchor = {"id": pair["contained_id"], "name": pair["contained_name"]}

	elif template.family == "adjacency":
	# adj_03/09/10/11/12: division anchor -> natural_earth targets.
	# adj_06/07/08: natural_earth anchor -> admin targets.
	# Use cross_source_relations so anchors are guaranteed to intersect.
	_NE_TARGET_ADJ_SUBTYPES = {
	"adj_03": ("ocean", "sea"),
	"adj_09": ("river", "lake", "basin"),
	"adj_10": ("range/mtn",),
	"adj_11": ("plateau",),
	"adj_12": ("plain", "lowland", "basin", "valley", "depression", "gorge"),
	}
	if template.template_id in _NE_TARGET_ADJ_SUBTYPES:
	cs_df = tables.get('cross_source_relations', pd.DataFrame())
	if cs_df.empty:
	return None
	ne_types = _NE_TARGET_ADJ_SUBTYPES[template.template_id]
	filtered = cs_df[cs_df['natural_subtype'].isin(ne_types)]
	if filtered.empty:
	return None
	row = filtered.sample(n=1).iloc[0]
	anchor = {
	'anchor_id': row['division_id'],
	'anchor_name': row['division_name'],
	'anchor_subtype': row['division_subtype'],
	'target_subtype': row['natural_subtype'],
	'anchor_source': 'divisions_area',
	}
	elif template.anchor_source == "natural_earth":
	cs_anchor = sample_cross_source_anchor(
	tables.get('cross_source_relations', pd.DataFrame()),
	natural_subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id),
	)
	if not cs_anchor:
	return None
	anchor = {
	'anchor_id': cs_anchor['natural_id'],
	'anchor_name': cs_anchor['natural_name'],
	'target_subtype': template.target_subtype,
	'anchor_source': 'natural_earth',
	}
	else:
	# divisions_area self-join adjacency.
	_ADJ_ANCHOR_SUBTYPES = {
	"adj_02": ["country", "region"],
	"adj_04": ["region"],
	"adj_05": ["country"],
	}
	filter_subtype = (
	template.target_subtype
	if '{target_subtype}' in template.sql_template
	else None
	)
	anchor = sample_adjacency_anchor(
	tables['adjacency_pairs'],
	target_subtype=filter_subtype,
	anchor_subtypes=_ADJ_ANCHOR_SUBTYPES.get(template.template_id),
	)
	if anchor:
	anchor['anchor_source'] = 'divisions_area'
	if not anchor:
	return None

	sql = template.sql_template.format(
	anchor_id=anchor['anchor_id'],
	target_subtype=anchor.get('target_subtype', ''),
	)

	candidates = build_candidate_list(
	con, anchor['anchor_id'], anchor['anchor_name'], anchor.get('anchor_source', 'divisions_area'),
	num_candidates=10, difficulty="medium"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=anchor['anchor_name'],
	target_subtype=anchor.get('target_subtype', ''),
	)

	elif template.family == "containment":
	if template.anchor_source == "natural_earth":
	# contain_04 / contain_05: NE anchor (sea, desert, etc.).
	# Use cross_source_relations so the anchor exists in natural_earth
	# and is guaranteed to intersect divisions_area features.
	cs_anchor = sample_cross_source_anchor(
	tables.get('cross_source_relations', pd.DataFrame()),
	natural_subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id),
	)
	if not cs_anchor:
	return None
	anchor_id = cs_anchor['natural_id']
	anchor_name = cs_anchor['natural_name']
	target_subtype = template.target_subtype or 'country'

	sql = template.sql_template.format(
	anchor_id=anchor_id,
	target_subtype=target_subtype,
	)
	candidates = build_candidate_list(
	con, anchor_id, anchor_name, 'natural_earth',
	num_candidates=10, difficulty="medium"
	)
	question = random.choice(template.question_hints).format(
	anchor_name=anchor_name,
	target_subtype=target_subtype,
	)
	anchor = {'id': anchor_id, 'name': anchor_name}

	elif template.template_id == "contain_02":
	# "What country contains X?" - anchor is the CONTAINED entity;
	# result is the country that ST_Contains it.
	# Guard against stale relation tables by only allowing contained
	# subtypes that exist in the simplified admin schema.
	df = tables['containment_pairs']
	df = df[
	(df['container_subtype'] == 'country')
	& (df['contained_subtype'].isin(['region', 'county']))
	]
	pair = sample_containment_anchor(df)
	if not pair:
	return None

	sql = template.sql_template.format(
	anchor_id=pair['contained_id'],
	target_subtype='country',
	)
	candidates = build_candidate_list(
	con, pair['contained_id'], pair['contained_name'], 'divisions_area',
	num_candidates=10, difficulty="medium"
	)
	question = random.choice(template.question_hints).format(
	anchor_name=pair['contained_name'],
	target_subtype='country',
	)
	anchor = {'id': pair['contained_id'], 'name': pair['contained_name']}

	elif template.template_id == "contain_03":
	# "What regions are in country X?" - anchor is a country, target is regions.
	df = tables['containment_pairs']
	df = df[
	(df['container_subtype'] == 'country')
	& (df['contained_subtype'] == 'region')
	]
	pair = sample_containment_anchor(df)
	if not pair:
	return None

	sql = template.sql_template.format(
	anchor_id=pair['container_id'],
	target_subtype='region',
	)
	candidates = build_candidate_list(
	con, pair['container_id'], pair['container_name'], 'divisions_area',
	num_candidates=10, difficulty="medium"
	)
	question = random.choice(template.question_hints).format(
	anchor_name=pair['container_name'],
	target_subtype='region',
	)
	anchor = {'id': pair['container_id'], 'name': pair['container_name']}

	else:
	# contain_01: standard containment.
	# Enforce hierarchy: county must be inside region or country, never
	# inside another county. Filter container_subtype accordingly.
	# Also filter contained_subtype to match template.target_subtype so
	# hardcoded vocab hints (e.g. "districts") always align with the SQL.
	_VALID_CONTAINERS = {
	"county": ["region", "country"],
	"region": ["country"],
	}
	df = tables['containment_pairs']
	if template.target_subtype:
	filtered = df[df['contained_subtype'] == template.target_subtype]
	if not filtered.empty:
	df = filtered
	valid_containers = _VALID_CONTAINERS.get(template.target_subtype)
	if valid_containers:
	filtered = df[df['container_subtype'].isin(valid_containers)]
	if not filtered.empty:
	df = filtered
	anchor = sample_containment_anchor(df)
	if not anchor:
	return None

	target_subtype = template.target_subtype or anchor['contained_subtype']

	sql = template.sql_template.format(
	anchor_id=anchor['container_id'],
	target_subtype=target_subtype,
	)
	candidates = build_candidate_list(
	con, anchor['container_id'], anchor['container_name'], 'divisions_area',
	num_candidates=10, difficulty="medium"
	)
	question = random.choice(template.question_hints).format(
	anchor_name=anchor['container_name'],
	target_subtype=target_subtype,
	)

	elif template.family == "intersection":
	if template.anchor_source == "natural_earth":
	anchor = sample_cross_source_anchor(
	tables['cross_source_relations'],
	natural_subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id),
	)
	if not anchor:
	return None

	target_subtype = template.target_subtype or 'country'

	sql = template.sql_template.format(
	anchor_id=anchor['natural_id'],
	target_subtype=target_subtype,
	)

	candidates = build_candidate_list(
	con, anchor['natural_id'], anchor['natural_name'], 'natural_earth',
	num_candidates=10, difficulty="medium"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=anchor['natural_name'],
	target_subtype=target_subtype,
	)
	else:
	# Same-source intersection.
	# If the template pins a target_subtype (e.g. intersect_02 targets county),
	# filter intersection_pairs so the sampled pair is guaranteed to match.
	idf = tables['intersection_pairs']
	if template.target_subtype and not idf.empty:
	filtered = idf[idf['target_subtype'] == template.target_subtype]
	if filtered.empty:
	return None
	idf = filtered
	anchor = sample_intersection_anchor(idf)
	if not anchor:
	return None

	target_subtype = template.target_subtype or anchor.get('target_subtype') or 'region'

	sql = template.sql_template.format(
	anchor_id=anchor['anchor_id'],
	target_subtype=target_subtype
	)

	candidates = build_candidate_list(
	con, anchor['anchor_id'], anchor['anchor_name'], 'divisions_area',
	num_candidates=10, difficulty="medium"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=anchor['anchor_name'],
	target_subtype=target_subtype
	)

	elif template.family == "set_operations":
	if template.template_id == "union_03":
	# 3-anchor union by ID — candidates: 3 per anchor (9 total)
	anchors = [
	sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	for _ in range(3)
	]
	if any(a is None for a in anchors):
	return None
	anchor1, anchor2, anchor3 = anchors

	sql = template.sql_template.format(
	anchor_id_1=anchor1['id'],
	anchor_id_2=anchor2['id'],
	anchor_id_3=anchor3['id'],
	)

	per_anchor = 3
	cands = [
	build_candidate_list(con, a['id'], a['name'], 'divisions_area',
	num_candidates=per_anchor, difficulty="medium")
	for a in anchors
	]
	candidates = _merge_candidate_lists(*cands, max_total=9)

	question = random.choice(template.question_hints).format(
	anchor_1_name=anchor1['name'],
	anchor_2_name=anchor2['name'],
	anchor_3_name=anchor3['name'],
	)

	elif template.template_id in ("contain_multi_01", "contain_multi_02", "contain_multi_03"):
	# country IN clause — 2 or 3 anchors, each contributes its country code.
	# Sample unique countries so the query actually teaches a multi-country
	# pattern rather than repeating the same ISO code multiple times.
	num_a = 3 if template.template_id == "contain_multi_02" else 2
	country_inventory = tables['divisions_area_inventory']
	country_inventory = country_inventory[
	(country_inventory['subtype'] == 'country')
	& country_inventory['country'].notna()
	].drop_duplicates(subset=['country'])
	if len(country_inventory) < num_a:
	return None

	sampled = country_inventory.sample(n=num_a, replace=False)
	anchors = [
	{
	'id': row['id'],
	'name': row['name'],
	'subtype': row.get('subtype'),
	'country': row.get('country'),
	'source': 'divisions_area',
	}
	for _, row in sampled.iterrows()
	]

	countries = [a.get('country') or 'US' for a in anchors]
	target_subtype = template.target_subtype or 'region'
	per_anchor = 3 if num_a == 3 else 4

	fmt_kwargs = dict(
	target_subtype=target_subtype,
	)
	for i, c in enumerate(countries, 1):
	fmt_kwargs[f'country_{i}'] = c

	sql = template.sql_template.format(**fmt_kwargs)

	cands = [
	build_candidate_list(con, a['id'], a['name'], 'divisions_area',
	num_candidates=per_anchor, difficulty="medium")
	for a in anchors
	]
	candidates = _dedupe_country_candidates(
	_merge_candidate_lists(cands, max_total=num_a per_anchor),
	max_total=num_a * per_anchor,
	)

	q_kwargs = dict(target_subtype=target_subtype)
	for i, a in enumerate(anchors, 1):
	q_kwargs[f'anchor_{i}_name'] = a['name']

	question = random.choice(template.question_hints).format(**q_kwargs)

	elif template.template_id == "union_02":
	# Filtered union: ST_Union_Agg of contained sub-features.
	# Pin to template.target_subtype so hardcoded vocabulary hints
	# (e.g. "districts") always match the SQL subtype.
	df = tables['containment_pairs']
	if template.target_subtype:
	filtered = df[df['contained_subtype'] == template.target_subtype]
	if not filtered.empty:
	df = filtered
	pair = sample_containment_anchor(df)
	if not pair:
	return None

	target_subtype = template.target_subtype or pair.get('contained_subtype', 'county')
	sql = template.sql_template.format(
	anchor_id=pair['container_id'],
	target_subtype=target_subtype,
	)

	candidates = build_candidate_list(
	con, pair['container_id'], pair['container_name'], 'divisions_area',
	num_candidates=10, difficulty="medium"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=pair['container_name'],
	target_subtype=target_subtype,
	)

	else:
	# union_01: 2-anchor union by ID — candidates: 5 per anchor
	anchor1 = sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	anchor2 = sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	if not anchor1 or not anchor2:
	return None

	sql = template.sql_template.format(
	anchor_id_1=anchor1['id'],
	anchor_id_2=anchor2['id'],
	)

	cands1 = build_candidate_list(
	con, anchor1['id'], anchor1['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	cands2 = build_candidate_list(
	con, anchor2['id'], anchor2['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates = _merge_candidate_lists(cands1, cands2, max_total=10)

	question = random.choice(template.question_hints).format(
	anchor_1_name=anchor1['name'],
	anchor_2_name=anchor2['name'],
	)

	elif template.family == "buffer":
	# Buffer operations use metre distances in SQL and a human-readable
	# buffer_label in questions, e.g. (1000, "1 km") or (250, "250 m").
	# The template SQL divides by 111 320 to approximate metres in degrees.
	_buffer_choices = [
	(100, "100 m"),
	(250, "250 m"),
	(500, "500 m"),
	(1000, "1 km"),
	(2000, "2 km"),
	(5000, "5 km"),
	(10000, "10 km"),
	(25000, "25 km"),
	(50000, "50 km"),
	(100000, "100 km"),
	(200000, "200 km"),
	]

	if template.num_anchors == 1:
	if template.anchor_source == "natural_earth":
	anchor = sample_random_entity(
	con,
	tables['natural_earth_inventory'],
	'natural_earth',
	subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id, _NE_NAMED_LOOKUP_SUBTYPES),
	)
	else:
	anchor = sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	if not anchor:
	return None

	buffer_m, buffer_label = random.choice(_buffer_choices)
	fmt_kwargs = dict(anchor_id=anchor['id'], buffer_m=buffer_m)
	q_kwargs = dict(anchor_name=anchor['name'], buffer_label=buffer_label)

	if template.target_subtype:
	fmt_kwargs['target_subtype'] = template.target_subtype
	q_kwargs['target_subtype'] = template.target_subtype

	sql = template.sql_template.format(**fmt_kwargs)

	candidates = build_candidate_list(
	con, anchor['id'], anchor['name'], anchor['source'],
	num_candidates=10, difficulty="medium"
	)

	question = random.choice(template.question_hints).format(**q_kwargs)
	else:
	# Multi-anchor buffer (2–5 places): union of individual buffers.
	num_a = template.num_anchors
	anchors = []
	for _ in range(num_a):
	a = sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	if not a:
	return None
	anchors.append(a)

	buffer_m, buffer_label = random.choice(_buffer_choices[:7])

	fmt_kwargs = {f'anchor_id_{i+1}': a['id'] for i, a in enumerate(anchors)}
	fmt_kwargs['buffer_m'] = buffer_m
	if template.target_subtype:
	fmt_kwargs['target_subtype'] = template.target_subtype

	sql = template.sql_template.format(**fmt_kwargs)

	# Build one candidate list per anchor then merge.
	per_anchor_n = max(2, 10 // num_a)
	cand_lists = [
	build_candidate_list(
	con, a['id'], a['name'], 'divisions_area',
	num_candidates=per_anchor_n, difficulty="medium",
	)
	for a in anchors
	]
	candidates = _merge_candidate_lists(*cand_lists)

	q_kwargs = {f'anchor_{i+1}_name': a['name'] for i, a in enumerate(anchors)}
	q_kwargs['buffer_label'] = buffer_label
	if template.target_subtype:
	q_kwargs['target_subtype'] = template.target_subtype

	question = random.choice(template.question_hints).format(**q_kwargs)

	elif template.family == "partial_selection":
	# Partial selection (northern half, clipping, etc.)
	anchor = sample_random_entity(con, tables['divisions_area_inventory'], 'divisions_area')
	if not anchor:
	return None

	if template.num_anchors == 1:
	sql = template.sql_template.format(
	anchor_id=anchor['id'],
	)
	question = random.choice(template.question_hints).format(
	anchor_name=anchor['name'],
	)
	candidates = build_candidate_list(
	con, anchor['id'], anchor['name'], 'divisions_area',
	num_candidates=10, difficulty="hard",
	)
	else:
	# Mixed-source clip: division intersected with a natural_earth feature.
	# Use cross_source_relations so the pair is guaranteed to intersect —
	# random sampling almost never produces an intersecting pair.
	cs_anchor = sample_cross_source_anchor(
	tables.get('cross_source_relations', pd.DataFrame()),
	natural_subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id),
	)
	if not cs_anchor:
	return None
	clip_feature = {
	'id': cs_anchor['natural_id'],
	'name': cs_anchor['natural_name'],
	'source': 'natural_earth',
	}
	# Override the division anchor with the paired division so the
	# ST_Intersects check in the SQL is guaranteed to pass.
	anchor = {
	'id': cs_anchor['division_id'],
	'name': cs_anchor['division_name'],
	'source': 'divisions_area',
	}

	sql = template.sql_template.format(
	anchor_id=anchor['id'],
	clip_feature_id=clip_feature['id'],
	)
	question = random.choice(template.question_hints).format(
	anchor_name=anchor['name'],
	clip_feature_name=clip_feature['name'],
	)
	# Build candidates for BOTH anchors so the model sees both IDs
	# in context and learns to pick the right one for each placeholder.
	div_cands = build_candidate_list(
	con, anchor['id'], anchor['name'], 'divisions_area',
	num_candidates=5, difficulty="hard",
	)
	ne_cands = build_candidate_list(
	con, clip_feature['id'], clip_feature['name'], 'natural_earth',
	num_candidates=5, difficulty="hard",
	)
	candidates = _merge_candidate_lists(div_cands, ne_cands, max_total=10)

	elif template.family == "aggregation":
	# Teach the model to distinguish singular superlatives ("the largest")
	# from explicit top-N requests ("top 5 largest").
	top_n = random.choice([1, 3, 5, 10])
	target_subtype = random.choice(['county', 'region'])
	singular_hints = [h for h in template.question_hints if '{top_n}' not in h]
	plural_hints = [h for h in template.question_hints if '{top_n}' in h]
	question_hint_pool = singular_hints if top_n == 1 and singular_hints else plural_hints or template.question_hints

	if template.template_id in ['agg_03', 'agg_04']:
	# Country-level aggregation: SQL uses country code, so the anchor
	# in the question must also be a country.
	anchor = sample_random_entity(
	con,
	tables['divisions_area_inventory'],
	'divisions_area',
	subtypes={'country'},
	)
	if not anchor:
	return None

	country = anchor.get('country') or 'US'

	sql = template.sql_template.format(
	country=country,
	target_subtype=target_subtype,
	top_n=top_n,
	)

	candidates = build_candidate_list(
	con, anchor['id'], anchor['name'], 'divisions_area',
	num_candidates=10, difficulty="hard"
	)

	question = random.choice(question_hint_pool).format(
	top_n=top_n,
	target_subtype=target_subtype,
	anchor_name=anchor['name'],
	)
	else:
	# Containment-based aggregation: anchor is the container region.
	anchor = sample_containment_anchor(tables['containment_pairs'])
	if not anchor:
	return None

	sql = template.sql_template.format(
	anchor_id=anchor['container_id'],
	target_subtype=target_subtype,
	top_n=top_n,
	)

	candidates = build_candidate_list(
	con, anchor['container_id'], anchor['container_name'], 'divisions_area',
	num_candidates=10, difficulty="hard"
	)

	question = random.choice(question_hint_pool).format(
	top_n=top_n,
	target_subtype=target_subtype,
	anchor_name=anchor['container_name'],
	)

	elif template.family == "chained":
	# chained_12/13: country-level coastal/landlocked via adjacency.
	# The SQL uses ST_Touches (not containment), so bypass the containment
	# pair sampling and use adjacency_pairs with country-level anchors.
	if template.template_id in {"chained_12", "chained_13"}:
	adj_df = tables.get('adjacency_pairs', pd.DataFrame())
	country_adj = (
	adj_df[
	(adj_df['anchor_subtype'] == 'country')
	& (adj_df['target_subtype'] == 'country')
	]
	if not adj_df.empty else pd.DataFrame()
	)
	if country_adj.empty:
	return None
	pair = sample_adjacency_anchor(country_adj)
	if not pair:
	return None

	sql = template.sql_template.format(anchor_id=pair['anchor_id'])
	candidates = build_candidate_list(
	con, pair['anchor_id'], pair['anchor_name'], 'divisions_area',
	num_candidates=10, difficulty="hard"
	)
	question = random.choice(template.question_hints).format(
	anchor_name=pair['anchor_name']
	)
	anchor = {'id': pair['anchor_id'], 'name': pair['anchor_name']}

	else:
	# Use pre-filtered coastal/landlocked containment pairs so the SQL
	# verification step doesn't constantly return empty results.
	_COASTAL_CHAINED = {"chained_01", "chained_06", "chained_10"}
	_LANDLOCKED_CHAINED = {"chained_02", "chained_07", "chained_11"}
	if template.template_id in _COASTAL_CHAINED:
	table_key = 'coastal_containment_pairs'
	elif template.template_id in _LANDLOCKED_CHAINED:
	table_key = 'landlocked_containment_pairs'
	else:
	table_key = 'containment_pairs'

	df = tables.get(table_key, tables['containment_pairs'])

	# When the template pins a target_subtype (e.g. chained_06 wants
	# counties), only consider pairs whose contained entity already
	# matches — guarantees the sampled container holds at least one
	# entity of the right subtype so the SQL filter returns rows.
	if template.target_subtype:
	df = df[df['contained_subtype'] == template.target_subtype]

	# chained_10/11 additionally need a country-level container so
	# phrasings like "coastal states of India" line up.
	if template.template_id in {"chained_10", "chained_11"}:
	df = df[df['container_subtype'] == 'country']

	anchor = sample_containment_anchor(df)
	if not anchor:
	return None

	target_subtype = template.target_subtype or anchor.get('contained_subtype', 'county')

	sql = template.sql_template.format(
	anchor_id=anchor['container_id'],
	target_subtype=target_subtype,
	)

	candidates = build_candidate_list(
	con, anchor['container_id'], anchor['container_name'], 'divisions_area',
	num_candidates=10, difficulty="hard"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=anchor['container_name'],
	target_subtype=target_subtype,
	)

	elif template.family == "multi_adjacency":
	# Use common_neighbor_pairs so anchor1 and anchor2 are guaranteed to
	# share at least one touching neighbour — SQL will return non-empty.
	# Filter by both anchor subtypes AND shared_neighbor_subtype so the
	# sampled pair is geographically coherent with the template intent:
	# multi_adj_01: region anchors → region result
	# multi_adj_02: country anchors → country result
	# multi_adj_03: region anchors → county result
	_MULTI_ADJ_ANCHOR_SUBTYPES = {
	"multi_adj_01": ("region", "region"),
	"multi_adj_02": ("country", "country"),
	"multi_adj_03": ("region", "region"),
	}
	cn_df = tables.get('common_neighbor_pairs', pd.DataFrame())
	if cn_df.empty:
	return None
	if template.target_subtype and 'shared_neighbor_subtype' in cn_df.columns:
	filtered = cn_df[cn_df['shared_neighbor_subtype'] == template.target_subtype]
	if not filtered.empty:
	cn_df = filtered
	if template.template_id in _MULTI_ADJ_ANCHOR_SUBTYPES and 'anchor_subtype_1' in cn_df.columns:
	a1_st, a2_st = _MULTI_ADJ_ANCHOR_SUBTYPES[template.template_id]
	filtered = cn_df[
	(cn_df['anchor_subtype_1'] == a1_st) &
	(cn_df['anchor_subtype_2'] == a2_st)
	]
	if not filtered.empty:
	cn_df = filtered
	row = cn_df.sample(n=1).iloc[0]
	anchor1 = {'id': row['anchor_id_1'], 'name': row['anchor_name_1'], 'source': 'divisions_area'}
	anchor2 = {'id': row['anchor_id_2'], 'name': row['anchor_name_2'], 'source': 'divisions_area'}

	target_subtype = template.target_subtype or row.get('shared_neighbor_subtype', 'region')

	sql = template.sql_template.format(
	anchor_id_1=anchor1['id'],
	anchor_id_2=anchor2['id'],
	target_subtype=target_subtype,
	)

	candidates1 = build_candidate_list(
	con, anchor1['id'], anchor1['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates2 = build_candidate_list(
	con, anchor2['id'], anchor2['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates = _merge_candidate_lists(candidates1, candidates2)

	question = random.choice(template.question_hints).format(
	anchor_1_name=anchor1['name'],
	anchor_2_name=anchor2['name'],
	target_subtype=target_subtype,
	)

	elif template.family == "difference":
	if template.anchor_source == "mixed":
	# divisions_area anchor differenced against a natural_earth feature.
	# Use cross_source_relations so the pair is guaranteed to intersect
	# (ST_Difference on non-intersecting geometries is always equal to
	# the original geometry — a trivial and uninformative sample).
	cs_anchor = sample_cross_source_anchor(
	tables.get('cross_source_relations', pd.DataFrame()),
	natural_subtypes=_NE_TEMPLATE_SUBTYPES.get(template.template_id),
	)
	if not cs_anchor:
	return None
	anchor = {
	'id': cs_anchor['division_id'],
	'name': cs_anchor['division_name'],
	'source': 'divisions_area',
	}
	clip_feature = {
	'id': cs_anchor['natural_id'],
	'name': cs_anchor['natural_name'],
	'source': 'natural_earth',
	}

	sql = template.sql_template.format(
	anchor_id=anchor['id'],
	clip_feature_id=clip_feature['id'],
	)
	question = random.choice(template.question_hints).format(
	anchor_name=anchor['name'],
	clip_feature_name=clip_feature['name'],
	)
	# Build candidates for BOTH anchors — model must see both IDs
	# to correctly assign anchor_id vs clip_feature_id in the SQL.
	div_cands = build_candidate_list(
	con, anchor['id'], anchor['name'], 'divisions_area',
	num_candidates=5, difficulty="hard",
	)
	ne_cands = build_candidate_list(
	con, clip_feature['id'], clip_feature['name'], 'natural_earth',
	num_candidates=5, difficulty="hard",
	)
	candidates = _merge_candidate_lists(div_cands, ne_cands, max_total=10)

	else:
	# Two divisions_area anchors: use both ends of a containment
	# pair so the contained entity is guaranteed to intersect the
	# container. ST_Difference(container, contained) yields the
	# portion of the container outside the contained piece.
	pair = sample_containment_anchor(tables['containment_pairs'])
	if not pair:
	return None

	anchor1 = {'id': pair['container_id'], 'name': pair['container_name']}
	anchor2 = {'id': pair['contained_id'], 'name': pair['contained_name']}

	sql = template.sql_template.format(
	anchor_id_1=anchor1['id'],
	anchor_id_2=anchor2['id'],
	)

	candidates1 = build_candidate_list(
	con, anchor1['id'], anchor1['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates2 = build_candidate_list(
	con, anchor2['id'], anchor2['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates = _merge_candidate_lists(candidates1, candidates2)

	question = random.choice(template.question_hints).format(
	anchor_1_name=anchor1['name'],
	anchor_2_name=anchor2['name'],
	)

	elif template.family == "border_corridor":
	# Buffered border zone — needs two anchors that actually touch.
	pair = sample_adjacency_anchor(tables['adjacency_pairs'])
	if not pair:
	return None

	anchor1 = {'id': pair['anchor_id'], 'name': pair['anchor_name']}
	anchor2 = {'id': pair['target_id'], 'name': pair['target_name']}

	buffer_val = random.choice([5, 10, 25, 50])

	sql = template.sql_template.format(
	anchor_id_1=anchor1['id'],
	anchor_id_2=anchor2['id'],
	buffer_km=buffer_val,
	)

	candidates1 = build_candidate_list(
	con, anchor1['id'], anchor1['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates2 = build_candidate_list(
	con, anchor2['id'], anchor2['name'], 'divisions_area',
	num_candidates=5, difficulty="medium"
	)
	candidates = _merge_candidate_lists(candidates1, candidates2)

	question = random.choice(template.question_hints).format(
	anchor_1_name=anchor1['name'],
	anchor_2_name=anchor2['name'],
	buffer_km=buffer_val,
	)

	elif template.family == "window_function":
	anchor = sample_random_entity(
	con,
	tables['divisions_area_inventory'],
	'divisions_area',
	subtypes={'country'},
	)
	if not anchor:
	return None

	country = anchor.get('country') or 'US'
	target_subtype = template.target_subtype or 'county'

	sql = template.sql_template.format(
	country=country,
	target_subtype=target_subtype,
	)

	candidates = build_candidate_list(
	con, anchor['id'], anchor['name'], 'divisions_area',
	num_candidates=10, difficulty="hard"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=anchor['name'],
	target_subtype=target_subtype,
	)

	elif template.family == "attribute_filter":
	anchor = sample_random_entity(
	con,
	tables['divisions_area_inventory'],
	'divisions_area',
	subtypes={'country'},
	)
	if not anchor:
	return None

	country = anchor.get('country') or 'US'
	target_subtype = template.target_subtype or 'region'

	sql = template.sql_template.format(
	country=country,
	target_subtype=target_subtype,
	)

	candidates = build_candidate_list(
	con, anchor['id'], anchor['name'], 'divisions_area',
	num_candidates=10, difficulty="medium"
	)

	question = random.choice(template.question_hints).format(
	anchor_name=anchor['name'],
	target_subtype=target_subtype,
	country=country,
	)

	else:
	# Skip unsupported families
	return None

	# Execute SQL to verify
	try:
	result = con.execute(_for_execution(sql)).fetchdf()
	if result.empty:
	return None
	except Exception as e:
	# Errors are tracked in worker return, no need to print
	return None

	# Collect every anchor ID that appears in the generated SQL so we can
	# mark them as the "selected" candidates in the training sample.
	_multi_anchor_families = {"set_operations", "multi_adjacency", "difference", "border_corridor", "buffer"}

	# Mixed partial_selection (partial_05) and mixed difference (diff_02) each
	# have two anchors from different sources — both must be marked selected.
	_is_mixed_two_anchor = (
	template.anchor_source == "mixed" and template.num_anchors == 2
	)

	if _is_mixed_two_anchor:
	# partial_05 / diff_02: anchor (division) + clip_feature (natural_earth)
	mixed_ids = {anchor.get("id", ""), clip_feature.get("id", "")}
	selected_candidate_ids = [c.candidate_id for c in candidates if c.id in mixed_ids]

	elif template.family in _multi_anchor_families and template.num_anchors >= 2:
	anchor_ids: set = set()
	for var in ("anchor1", "anchor2", "anchor3"):
	obj = locals().get(var)
	if obj:
	anchor_ids.add(obj.get("id", ""))
	if "anchors" in locals():
	for a in locals()["anchors"]:
	if a:
	anchor_ids.add(a.get("id", ""))
	selected_candidate_ids = [c.candidate_id for c in candidates if c.id in anchor_ids]

	else:
	anchor_id_to_find = (
	anchor.get('anchor_id')
	or anchor.get('container_id')
	or anchor.get('natural_id')
	or anchor.get('id')
	)
	selected_candidate_ids = [c.candidate_id for c in candidates if c.id == anchor_id_to_find]

	question, surface_variants = _diversify_question_surface(question, template.family)

	return TrainingSample(
	id=sample_id,
	question=question,
	candidates=candidates,
	target={
	"selected_candidates": selected_candidate_ids,
	"sql": sql,
	},
	metadata={
	"task_family": template.family,
	"sql_difficulty": template.sql_difficulty,
	"grounding_difficulty": "medium",
	"template_id": template.template_id,
	"num_candidates": len(candidates),
	"anchor_source": template.anchor_source,
	"sql_verified": True,
	"surface_variants": surface_variants,
	}
	)


	def generate_sample_batch_worker(args):
	    """Process one batch of work items using a single DuckDB connection.

	    DuckDB, its spatial extension and the relation tables are initialised
	    ONCE per batch; the work items are then processed sequentially against
	    that shared state.

	    Args:
	        args: ``(work_items, intermediate_dir_str)`` tuple. ``work_items`` is
	            a list of ``(family, template_dict, sample_id, _)`` tuples and
	            ``intermediate_dir_str`` is the directory handed to
	            ``load_relation_tables``, passed as a string.

	    Returns:
	        A list with one ``(sample, family, template_id, error)`` tuple per
	        work item. ``sample`` is ``None`` when generation returned nothing
	        (``error`` is ``"Empty result"``) or raised (``error`` is the
	        exception text).
	    """
	    from pathlib import Path

	    work_items, intermediate_dir_str = args

	    # Convert string back to Path
	    intermediate_dir = Path(intermediate_dir_str)

	    # Initialize DuckDB ONCE for the entire batch
	    con = duckdb.connect()
	    try:
	        con.execute("SET enable_progress_bar=false")
	        con.execute("INSTALL spatial")
	        con.execute("LOAD spatial")

	        # Load relation tables ONCE
	        tables = load_relation_tables(intermediate_dir, quiet=True)

	        # Process all items in batch
	        results = []
	        for family, template_dict, sample_id, _ in work_items:
	            try:
	                # Rebuild the template inside the try so that one malformed
	                # template dict is reported as a per-item failure instead
	                # of aborting every remaining item in the batch.
	                template = sql_templates.SQLTemplate(**template_dict)
	                sample = generate_template_based_sample(con, template, tables, sample_id)
	            except Exception as e:
	                results.append((None, family, template_dict.get('template_id', 'unknown'), str(e)))
	                continue

	            if sample:
	                results.append((sample, family, template.template_id, None))
	            else:
	                results.append((None, family, template.template_id, "Empty result"))
	    finally:
	        # Release the connection even when setup or table loading fails.
	        con.close()

	    return results


	def generate_batch_core(
	    work_items: List[tuple],
	    intermediate_dir: str,
	) -> List[Dict[str, Any]]:
	    """Standalone batch worker usable from Modal or any remote context.

	    Data paths are resolved via GAZET_DATA_DIR env var (set in Modal image).
	    One DuckDB connection (with the spatial extension loaded) and one copy of
	    the relation tables are shared by every work item in the batch.

	    Args:
	        work_items: List of (family, template_dict, sample_id, _) tuples
	        intermediate_dir: Path to intermediate dir with relation parquets

	    Returns:
	        List of dicts with keys: sample (dict or None), family, template_id, error.
	        ``error`` is None on success, "Empty result" when no sample was
	        produced, or the exception text when generation raised.
	    """
	    from pathlib import Path as _Path
	    intermediate = _Path(intermediate_dir)

	    con = duckdb.connect()
	    try:
	        con.execute("SET enable_progress_bar=false")
	        con.execute("INSTALL spatial")
	        con.execute("LOAD spatial")

	        tables = load_relation_tables(intermediate, quiet=True)

	        results = []
	        for family, template_dict, sample_id, _ in work_items:
	            try:
	                # Template construction lives inside the try so a malformed
	                # template dict becomes a per-item error record rather than
	                # an exception that discards the whole batch.
	                template = sql_templates.SQLTemplate(**template_dict)
	                sample = generate_template_based_sample(con, template, tables, sample_id)
	            except Exception as e:
	                results.append({
	                    "sample": None,
	                    "family": family,
	                    "template_id": template_dict.get('template_id', 'unknown'),
	                    "error": str(e),
	                })
	                continue

	            results.append({
	                # model_dump() turns the sample into a plain dict for the caller.
	                "sample": sample.model_dump() if sample else None,
	                "family": family,
	                "template_id": template.template_id,
	                "error": None if sample else "Empty result",
	            })
	    finally:
	        # Always release the connection, including when setup fails.
	        con.close()

	    return results


	def prepare_work_items(
	    target_counts: Dict[str, int],
	    retry_multiplier: int = 2,
	    start_counter: int = 1,
	    intermediate_dir_str: str = "",
	) -> List[tuple]:
	    """Build the shuffled list of work items for sample generation.

	    Each family's target is split evenly over that family's templates (the
	    first ``remainder`` templates receive one extra), and every share is
	    multiplied by ``retry_multiplier`` to leave headroom for failed attempts.
	    Sample IDs are numbered consecutively starting at ``start_counter``.

	    Returns a list of (family, template_dict, sample_id, intermediate_dir_str)
	    tuples. Shared by the local main() and the Modal orchestrator.
	    """
	    items = []
	    next_id = start_counter

	    for family, wanted in target_counts.items():
	        if wanted == 0:
	            continue

	        matching = [tpl for tpl in TEMPLATES if tpl.family == family]
	        if not matching:
	            print(f"No templates found for {family}, skipping...")
	            continue

	        # Even split across templates so every template_id gets a guaranteed
	        # share; a uniform random pick previously under-represented rare
	        # variants such as partial_05 / diff_02.
	        base_share, extra = divmod(wanted, len(matching))

	        for position, tpl in enumerate(matching):
	            share = base_share + 1 if position < extra else base_share
	            # Plain-dict form of the template; workers rebuild it from this.
	            spec = {
	                'template_id': tpl.template_id,
	                'family': tpl.family,
	                'sql_difficulty': tpl.sql_difficulty,
	                'anchor_source': tpl.anchor_source,
	                'num_anchors': tpl.num_anchors,
	                'sql_template': tpl.sql_template,
	                'question_hints': tpl.question_hints,
	                'target_subtype': tpl.target_subtype,
	                'requires_buffer': tpl.requires_buffer,
	                'requires_aggregation': tpl.requires_aggregation
	            }
	            attempts = share * retry_multiplier
	            for _ in range(attempts):
	                sample_id = f"sample_{next_id:06d}"
	                items.append((family, spec, sample_id, intermediate_dir_str))
	                next_id += 1

	    random.shuffle(items)
	    return items


	def main():
	    """Generate training samples and save them as JSONL.

	    Reads the module-level TARGET_COUNTS, MAX_WORKERS, RETRY_MULTIPLIER and
	    APPEND_MODE settings, builds the shuffled work list, processes it in
	    worker processes (one batch per worker) and writes every successful
	    sample to ``output/dataset_raw.jsonl`` under the script's parent
	    directory. In append mode, existing samples are kept and new sample IDs
	    continue after the highest existing numeric ID.
	    """
	    global TARGET_COUNTS, MAX_WORKERS, RETRY_MULTIPLIER, APPEND_MODE

	    # Setup paths
	    script_dir = Path(__file__).parent
	    intermediate_dir = script_dir.parent / "intermediate"
	    output_dir = script_dir.parent / "output"

	    output_dir.mkdir(exist_ok=True, parents=True)

	    # Load relation tables once to check availability (workers load their
	    # own copies, so the return value is not needed here).
	    print("Loading relation tables...")
	    load_relation_tables(intermediate_dir, quiet=False)

	    # Use configured target counts or defaults
	    if TARGET_COUNTS is None:
	        target_counts = {
	            'direct_lookup': 100,
	            'adjacency': 150,
	            'multi_adjacency': 75,
	            'containment': 100,
	            'intersection': 100,
	            'buffer': 100,
	            'chained': 150,
	            'difference': 75,
	            'border_corridor': 75,
	            'set_operations': 150,
	            'partial_selection': 75,
	            'aggregation': 100,
	            'window_function': 75,
	            'attribute_filter': 75,
	        }
	    else:
	        target_counts = TARGET_COUNTS

	    # Load existing samples if in append mode
	    existing_samples = []
	    jsonl_file = output_dir / "dataset_raw.jsonl"

	    if APPEND_MODE and jsonl_file.exists():
	        print(f"\nAppend mode: Loading existing samples from {jsonl_file}")
	        with open(jsonl_file, 'r', encoding='utf-8') as f:
	            for line in f:
	                if line.strip():
	                    existing_samples.append(json.loads(line))
	        print(f" Found {len(existing_samples)} existing samples")

	        # Determine starting sample counter: continue after the highest
	        # numeric suffix among existing "sample_NNNNNN" IDs.
	        max_existing_id = max(
	            (
	                int(s['id'].split('_')[1])
	                for s in existing_samples
	                if s['id'].startswith('sample_')
	            ),
	            default=0,
	        )
	        sample_counter = max_existing_id + 1
	    else:
	        sample_counter = 1

	    # Prepare work items using shared helper
	    work_items = prepare_work_items(
	        target_counts=target_counts,
	        retry_multiplier=RETRY_MULTIPLIER,
	        start_counter=sample_counter,
	        intermediate_dir_str=str(intermediate_dir),
	    )
	    starting_sample_counter = sample_counter

	    # Partition work items into batches (one per worker)
	    num_workers = min(MAX_WORKERS, len(work_items))
	    if num_workers == 0:
	        print("No work items to process")
	        return
	    # Ceiling division so that at most num_workers batches are produced.
	    batch_size = (len(work_items) + num_workers - 1) // num_workers
	    batches = [
	        (work_items[i:i + batch_size], str(intermediate_dir))
	        for i in range(0, len(work_items), batch_size)
	    ]

	    # Generate samples in parallel (one batch per worker)
	    active_families = len([f for f in target_counts.values() if f > 0])
	    print(f"\nGenerating {len(work_items)} samples across {active_families} families...")
	    print(f" Split into {len(batches)} batches of ~{batch_size} items (1 DuckDB init per batch)")
	    if APPEND_MODE and existing_samples:
	        # Use the same zero-padding as the generated IDs (sample_NNNNNN) so
	        # the message names an ID that actually exists in the dataset.
	        print(f"Appending: starting from sample_{starting_sample_counter:06d}")

	    all_samples = []
	    family_progress = {f: {'success': 0, 'failed': 0} for f in target_counts.keys() if target_counts[f] > 0}

	    with ProcessPoolExecutor(max_workers=num_workers) as executor:
	        # Submit one batch per worker
	        futures = {executor.submit(generate_sample_batch_worker, batch): i for i, batch in enumerate(batches)}

	        # Collect results as batches complete
	        batches_done = 0
	        for future in as_completed(futures):
	            try:
	                batch_results = future.result()
	                for sample, family, template_id, error in batch_results:
	                    if sample:
	                        all_samples.append(sample)
	                        family_progress[family]['success'] += 1
	                    else:
	                        family_progress[family]['failed'] += 1
	            except Exception as e:
	                # A crashed batch is reported and skipped; its items are not
	                # counted as either success or failure.
	                print(f"\n Batch failed: {e}")

	            batches_done += 1
	            total_done = sum(p['success'] + p['failed'] for p in family_progress.values())
	            print(f"\r Progress: {total_done}/{len(work_items)} samples ({batches_done}/{len(batches)} batches) ", end='', flush=True)

	    print()  # New line after progress

	    # Show distribution (keep all samples, no filtering)
	    print("\nResults by family:")
	    for family in sorted(family_progress.keys()):
	        success = family_progress[family]['success']
	        failed = family_progress[family]['failed']
	        target = target_counts.get(family, 0)
	        total = success + failed
	        success_rate = (success / total * 100) if total > 0 else 0
	        print(f" {family:20s}: {success:3d} success / {failed:3d} failed ({success_rate:5.1f}% success rate, target: {target})")

	    # Save combined JSONL (skip individual JSON files for speed at scale)
	    print(f"\nSaving {len(all_samples)} new samples...")
	    appending = bool(APPEND_MODE and existing_samples)
	    if appending:
	        print(f"Appending to existing dataset ({len(existing_samples)} existing samples)")
	    # Append after the existing lines, or overwrite the dataset otherwise.
	    with open(jsonl_file, 'a' if appending else 'w', encoding='utf-8') as f:
	        for sample in all_samples:
	            f.write(json.dumps(sample.model_dump()) + '\n')
	    # existing_samples is empty whenever the file was overwritten.
	    total_samples = len(existing_samples) + len(all_samples)

	    print(f"\nGenerated {len(all_samples)} new samples")
	    print(f"Total dataset size: {total_samples} samples")
	    print(f" Dataset: {jsonl_file}")


	# Script entry point: run the generator only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()