srmsoumya committed on
Commit
cb256f8
·
1 Parent(s): c77ca5f

Add a couple of more sql templates for natural earth & chained queries

Browse files
dataset/README.md CHANGED
@@ -3,8 +3,11 @@
3
  Generates synthetic training data for fine-tuning the geocoding model.
4
  Two datasets come out of one pipeline run:
5
 
6
- - **SQL generation** — `(question + candidates) → DuckDB SQL`
7
- - **Place extraction** — `question → place names JSON`
 
 
 
8
 
9
  ---
10
 
@@ -106,16 +109,38 @@ After running, your training files are at:
106
  ```
107
  dataset/output/runs/{run_name}/
108
  sql/
109
- train.jsonl ← use this to fine-tune the SQL generation model
110
  val.jsonl
111
  test.jsonl
112
  places/
113
- train.jsonl ← use this to fine-tune the place extraction model
114
  val.jsonl
115
  test.jsonl
116
- stats.json ← sample counts by family
 
 
 
 
 
 
 
 
 
 
 
 
117
  ```
118
 
 
 
 
 
 
 
 
 
 
 
119
  ---
120
 
121
  ## When to regenerate from scratch
@@ -125,7 +150,8 @@ Change `run_name` and run without `--append` whenever you:
125
  - Change any SQL templates (`sql_templates.py`)
126
  - Add new template families
127
  - Change the candidate format or count
128
- - Change the training prompt/completion format
 
129
 
130
  Use `--append` only when you're adding more samples of the same type
131
  (e.g. adding more countries to an existing run with identical templates).
 
3
  Generates synthetic training data for fine-tuning the geocoding model.
4
  Two datasets come out of one pipeline run:
5
 
6
+ - **SQL generation** — `(question + candidates) -> DuckDB SQL`
7
+ - **Place extraction** — `question -> place names JSON`
8
+
9
+ Both tasks export in **conversation format** (`messages` list of
10
+ system/user/assistant turns), ready for chat-template fine-tuning.
11
 
12
  ---
13
 
 
109
  ```
110
  dataset/output/runs/{run_name}/
111
  sql/
112
+ train.jsonl <- fine-tune the SQL generation model
113
  val.jsonl
114
  test.jsonl
115
  places/
116
+ train.jsonl <- fine-tune the place extraction model
117
  val.jsonl
118
  test.jsonl
119
+ stats.json <- sample counts by family
120
+ ```
121
+
122
+ Each JSONL row is a conversation-format dict:
123
+
124
+ ```json
125
+ {
126
+ "messages": [
127
+ {"role": "system", "content": "..."},
128
+ {"role": "user", "content": "..."},
129
+ {"role": "assistant", "content": "..."}
130
+ ]
131
+ }
132
  ```
133
 
134
+ **SQL task**: the system prompt includes the full two-table schema inside
135
+ `<SCHEMA>` tags. The user prompt contains only `<CANDIDATES>` CSV and
136
+ `<USER_QUERY>`. The assistant response is pretty-printed SQL (via `sqlparse`).
137
+ All parquet paths are symbolic (`divisions_area` / `natural_earth`), never
138
+ runtime-specific.
139
+
140
+ **Places task**: the system prompt includes output format, extraction rules,
141
+ and the full list of Overture subtypes. The assistant response is a JSON
142
+ object with a `places` array.
143
+
144
  ---
145
 
146
  ## When to regenerate from scratch
 
150
  - Change any SQL templates (`sql_templates.py`)
151
  - Add new template families
152
  - Change the candidate format or count
153
+ - Change the system/user prompt structure or content
154
+ - Change the export format (e.g. prompt/completion to messages)
155
 
156
  Use `--append` only when you're adding more samples of the same type
157
  (e.g. adding more countries to an existing run with identical templates).
dataset/config.yaml CHANGED
@@ -34,7 +34,7 @@ sample_targets:
34
  generation:
35
  max_workers: 8 # Number of parallel workers
36
  retry_multiplier: 2 # Generate 2x samples to account for failures
37
- append_mode: true # If true, append to existing dataset instead of overwriting
38
 
39
  # Auto-scaling configuration
40
  # Relation limits are automatically calculated: target * retry_multiplier * safety_factor
@@ -66,4 +66,4 @@ output:
66
  # Run name — used to version exported splits so re-runs never overwrite previous data.
67
  # Change this whenever you regenerate from scratch (e.g. after template changes).
68
  # Exported files land in: output/runs/{run_name}/
69
- run_name: "v3-symbolic-paths"
 
34
  generation:
35
  max_workers: 8 # Number of parallel workers
36
  retry_multiplier: 2 # Generate 2x samples to account for failures
37
+ append_mode: false # Set false for clean regeneration after template/format changes
38
 
39
  # Auto-scaling configuration
40
  # Relation limits are automatically calculated: target * retry_multiplier * safety_factor
 
66
  # Run name — used to version exported splits so re-runs never overwrite previous data.
67
  # Change this whenever you regenerate from scratch (e.g. after template changes).
68
  # Exported files land in: output/runs/{run_name}/
69
+ run_name: "v4-conversation-format"
dataset/scripts/sql_templates.py CHANGED
@@ -84,12 +84,16 @@ TEMPLATES = [
84
  "Show me {anchor_name}",
85
  "Get the boundary of {anchor_name}",
86
  "Find {anchor_name}",
87
- "Show the geometry of {anchor_name}",
88
  "Where is {anchor_name}?",
89
  "Give me the outline of {anchor_name}",
90
- "Fetch {anchor_name}",
91
  "Display {anchor_name} on a map",
92
  "What does {anchor_name} look like?",
 
 
 
 
 
 
93
  ],
94
  ),
95
 
@@ -112,8 +116,12 @@ TEMPLATES = [
112
  "Where is the {anchor_name}?",
113
  "Show the extent of the {anchor_name}",
114
  "Give me the geometry of the {anchor_name}",
115
- "Fetch the {anchor_name}",
116
  "Display the {anchor_name}",
 
 
 
 
 
117
  ],
118
  ),
119
 
@@ -142,7 +150,9 @@ TEMPLATES = [
142
  "What shares a border with {anchor_name}?",
143
  "Neighbours of {anchor_name}",
144
  "What is adjacent to {anchor_name}?",
145
- "All places that share a boundary with {anchor_name}",
 
 
146
  ],
147
  ),
148
 
@@ -170,8 +180,8 @@ TEMPLATES = [
170
  "{target_subtype}s that touch {anchor_name}",
171
  "Neighbouring {target_subtype}s of {anchor_name}",
172
  "Which {target_subtype}s are adjacent to {anchor_name}?",
173
- "States bordering {anchor_name}",
174
- "Countries that share a boundary with {anchor_name}",
175
  ],
176
  ),
177
 
@@ -200,6 +210,8 @@ TEMPLATES = [
200
  "Which oceans touch {anchor_name}?",
201
  "What coastline does {anchor_name} have?",
202
  "Which water bodies does {anchor_name} border?",
 
 
203
  ],
204
  ),
205
 
@@ -229,8 +241,8 @@ TEMPLATES = [
229
  "Which regions border both {anchor_1_name} and {anchor_2_name}?",
230
  "What places touch both {anchor_1_name} and {anchor_2_name}?",
231
  "Regions adjacent to both {anchor_1_name} and {anchor_2_name}",
232
- "Which states share a border with both {anchor_1_name} and {anchor_2_name}?",
233
- "Countries that are neighbours of both {anchor_1_name} and {anchor_2_name}",
234
  ],
235
  ),
236
 
@@ -260,8 +272,8 @@ TEMPLATES = [
260
  "List all {target_subtype}s inside {anchor_name}",
261
  "{target_subtype}s contained by {anchor_name}",
262
  "All {target_subtype}s within the boundaries of {anchor_name}",
263
- "Cities in {anchor_name}",
264
- "Towns inside {anchor_name}",
265
  ],
266
  ),
267
 
@@ -288,6 +300,9 @@ TEMPLATES = [
288
  "Which country is {anchor_name} in?",
289
  "What country does {anchor_name} belong to?",
290
  "Which nation contains {anchor_name}?",
 
 
 
291
  ],
292
  ),
293
 
@@ -314,6 +329,8 @@ TEMPLATES = [
314
  "{target_subtype}s inside the {anchor_name}",
315
  "Administrative {target_subtype}s within the {anchor_name}",
316
  "All regions contained by the {anchor_name}",
 
 
317
  ],
318
  ),
319
 
@@ -342,6 +359,8 @@ TEMPLATES = [
342
  "What {target_subtype}s overlap with {anchor_name}?",
343
  "{target_subtype}s that cross into {anchor_name}",
344
  "Which {target_subtype}s overlap {anchor_name}?",
 
 
345
  ],
346
  ),
347
 
@@ -368,10 +387,10 @@ TEMPLATES = [
368
  "Countries that overlap with the {anchor_name}",
369
  "Which countries touch the {anchor_name}?",
370
  "Nations intersected by the {anchor_name}",
371
- "Countries the {anchor_name} flows through",
372
  "Which nations does the {anchor_name} cross?",
373
  "Countries along the {anchor_name}",
374
- "States the {anchor_name} runs through",
 
375
  ],
376
  ),
377
 
@@ -523,9 +542,9 @@ TEMPLATES = [
523
  "{target_subtype}s in {anchor_name} with sea access",
524
  "Which {target_subtype}s in {anchor_name} are on the coast?",
525
  "Seaside {target_subtype}s within {anchor_name}",
526
- "Coastal towns of {anchor_name}",
527
- "Which towns in {anchor_name} touch the ocean?",
528
  "{target_subtype}s in {anchor_name} bordering the sea",
 
 
529
  ],
530
  ),
531
 
@@ -555,9 +574,9 @@ TEMPLATES = [
555
  "Landlocked {target_subtype}s in {anchor_name}",
556
  "Which {target_subtype}s in {anchor_name} have no sea access?",
557
  "{target_subtype}s in {anchor_name} that are landlocked",
558
- "Countries in {anchor_name} with no coastline",
559
- "Which countries near {anchor_name} are landlocked?",
560
- "Landlocked nations in the region of {anchor_name}",
561
  ],
562
  ),
563
 
@@ -584,10 +603,10 @@ TEMPLATES = [
584
  " )"
585
  ),
586
  question_hints=[
587
- "{target_subtype}s in {anchor_name} in a terrain or island area",
588
- "Hill or mountain {target_subtype}s within {anchor_name}",
589
- "{target_subtype}s of {anchor_name} on terrain features",
590
- "Island or highland {target_subtype}s of {anchor_name}",
591
  ],
592
  ),
593
 
@@ -618,6 +637,8 @@ TEMPLATES = [
618
  "{anchor_1_name} without the {anchor_2_name} area",
619
  "Remove {anchor_2_name} from {anchor_1_name}",
620
  "{anchor_1_name} with {anchor_2_name} cut out",
 
 
621
  ],
622
  ),
623
 
@@ -643,6 +664,8 @@ TEMPLATES = [
643
  "{anchor_name} excluding the {clip_feature_name}",
644
  "{anchor_name} minus the {clip_feature_name}",
645
  "The land area of {anchor_name} not covered by the {clip_feature_name}",
 
 
646
  ],
647
  ),
648
 
@@ -678,6 +701,7 @@ TEMPLATES = [
678
  "Area within {buffer_km} km of the {anchor_1_name}-{anchor_2_name} border",
679
  "The region straddling the border of {anchor_1_name} and {anchor_2_name} within {buffer_km} km",
680
  "{buffer_km} km on either side of the {anchor_1_name} and {anchor_2_name} border",
 
681
  ],
682
  ),
683
 
@@ -1198,6 +1222,394 @@ TEMPLATES = [
1198
  ],
1199
  ),
1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1201
  ]
1202
 
1203
 
 
84
  "Show me {anchor_name}",
85
  "Get the boundary of {anchor_name}",
86
  "Find {anchor_name}",
 
87
  "Where is {anchor_name}?",
88
  "Give me the outline of {anchor_name}",
 
89
  "Display {anchor_name} on a map",
90
  "What does {anchor_name} look like?",
91
+ "I need the shape of {anchor_name}",
92
+ "Pull up {anchor_name}",
93
+ "Can you show {anchor_name}?",
94
+ "Map of {anchor_name}",
95
+ "{anchor_name} boundary",
96
+ "Locate {anchor_name} for me",
97
  ],
98
  ),
99
 
 
116
  "Where is the {anchor_name}?",
117
  "Show the extent of the {anchor_name}",
118
  "Give me the geometry of the {anchor_name}",
 
119
  "Display the {anchor_name}",
120
+ "Pull up the {anchor_name}",
121
+ "I want to see the {anchor_name}",
122
+ "Map the {anchor_name}",
123
+ "How big is the {anchor_name}?",
124
+ "Outline of the {anchor_name}",
125
  ],
126
  ),
127
 
 
150
  "What shares a border with {anchor_name}?",
151
  "Neighbours of {anchor_name}",
152
  "What is adjacent to {anchor_name}?",
153
+ "What surrounds {anchor_name}?",
154
+ "Places next to {anchor_name}",
155
+ "Everything bordering {anchor_name}",
156
  ],
157
  ),
158
 
 
180
  "{target_subtype}s that touch {anchor_name}",
181
  "Neighbouring {target_subtype}s of {anchor_name}",
182
  "Which {target_subtype}s are adjacent to {anchor_name}?",
183
+ "{target_subtype}s along the {anchor_name} border",
184
+ "Find {target_subtype}s next to {anchor_name}",
185
  ],
186
  ),
187
 
 
210
  "Which oceans touch {anchor_name}?",
211
  "What coastline does {anchor_name} have?",
212
  "Which water bodies does {anchor_name} border?",
213
+ "Does {anchor_name} have access to the sea?",
214
+ "What ocean is {anchor_name} on?",
215
  ],
216
  ),
217
 
 
241
  "Which regions border both {anchor_1_name} and {anchor_2_name}?",
242
  "What places touch both {anchor_1_name} and {anchor_2_name}?",
243
  "Regions adjacent to both {anchor_1_name} and {anchor_2_name}",
244
+ "What lies between {anchor_1_name} and {anchor_2_name}?",
245
+ "Common neighbours of {anchor_1_name} and {anchor_2_name}",
246
  ],
247
  ),
248
 
 
272
  "List all {target_subtype}s inside {anchor_name}",
273
  "{target_subtype}s contained by {anchor_name}",
274
  "All {target_subtype}s within the boundaries of {anchor_name}",
275
+ "{target_subtype}s of {anchor_name}",
276
+ "Show every {target_subtype} in {anchor_name}",
277
  ],
278
  ),
279
 
 
300
  "Which country is {anchor_name} in?",
301
  "What country does {anchor_name} belong to?",
302
  "Which nation contains {anchor_name}?",
303
+ "{anchor_name} is part of which country?",
304
+ "Where does {anchor_name} fall geographically?",
305
+ "What country is {anchor_name} located in?",
306
  ],
307
  ),
308
 
 
329
  "{target_subtype}s inside the {anchor_name}",
330
  "Administrative {target_subtype}s within the {anchor_name}",
331
  "All regions contained by the {anchor_name}",
332
+ "What {target_subtype}s does the {anchor_name} contain?",
333
+ "{target_subtype}s covered by the {anchor_name}",
334
  ],
335
  ),
336
 
 
359
  "What {target_subtype}s overlap with {anchor_name}?",
360
  "{target_subtype}s that cross into {anchor_name}",
361
  "Which {target_subtype}s overlap {anchor_name}?",
362
+ "{target_subtype}s partially inside {anchor_name}",
363
+ "What {target_subtype}s extend into {anchor_name}?",
364
  ],
365
  ),
366
 
 
387
  "Countries that overlap with the {anchor_name}",
388
  "Which countries touch the {anchor_name}?",
389
  "Nations intersected by the {anchor_name}",
 
390
  "Which nations does the {anchor_name} cross?",
391
  "Countries along the {anchor_name}",
392
+ "What countries does the {anchor_name} cover?",
393
+ "Countries that the {anchor_name} spans across",
394
  ],
395
  ),
396
 
 
542
  "{target_subtype}s in {anchor_name} with sea access",
543
  "Which {target_subtype}s in {anchor_name} are on the coast?",
544
  "Seaside {target_subtype}s within {anchor_name}",
 
 
545
  "{target_subtype}s in {anchor_name} bordering the sea",
546
+ "Oceanfront {target_subtype}s in {anchor_name}",
547
+ "Which {target_subtype}s in {anchor_name} have a coastline?",
548
  ],
549
  ),
550
 
 
574
  "Landlocked {target_subtype}s in {anchor_name}",
575
  "Which {target_subtype}s in {anchor_name} have no sea access?",
576
  "{target_subtype}s in {anchor_name} that are landlocked",
577
+ "{target_subtype}s in {anchor_name} with no coastline",
578
+ "Which {target_subtype}s within {anchor_name} are landlocked?",
579
+ "Interior {target_subtype}s of {anchor_name} with no ocean border",
580
  ],
581
  ),
582
 
 
603
  " )"
604
  ),
605
  question_hints=[
606
+ "{target_subtype}s in {anchor_name} on a terrain feature or island",
607
+ "{target_subtype}s of {anchor_name} on a peninsula or island group",
608
+ "{target_subtype}s within {anchor_name} on notable landforms",
609
+ "Island and peninsula {target_subtype}s of {anchor_name}",
610
  ],
611
  ),
612
 
 
637
  "{anchor_1_name} without the {anchor_2_name} area",
638
  "Remove {anchor_2_name} from {anchor_1_name}",
639
  "{anchor_1_name} with {anchor_2_name} cut out",
640
+ "Subtract {anchor_2_name} from {anchor_1_name}",
641
+ "What is left of {anchor_1_name} after removing {anchor_2_name}?",
642
  ],
643
  ),
644
 
 
664
  "{anchor_name} excluding the {clip_feature_name}",
665
  "{anchor_name} minus the {clip_feature_name}",
666
  "The land area of {anchor_name} not covered by the {clip_feature_name}",
667
+ "{anchor_name} with the {clip_feature_name} removed",
668
+ "What remains of {anchor_name} after removing the {clip_feature_name}?",
669
  ],
670
  ),
671
 
 
701
  "Area within {buffer_km} km of the {anchor_1_name}-{anchor_2_name} border",
702
  "The region straddling the border of {anchor_1_name} and {anchor_2_name} within {buffer_km} km",
703
  "{buffer_km} km on either side of the {anchor_1_name} and {anchor_2_name} border",
704
+ "Buffer the {anchor_1_name}-{anchor_2_name} boundary by {buffer_km} km",
705
  ],
706
  ),
707
 
 
1222
  ],
1223
  ),
1224
 
1225
+ # ── NATURAL EARTH ADJACENCY ─────────────────────────────────────────────
1226
+ # Division anchor, natural_earth targets. Handler formats anchor_id and
1227
+ # target_subtype but the SQL hardcodes NE subtypes (like adj_03).
1228
+
1229
+ SQLTemplate(
1230
+ template_id="adj_04",
1231
+ family="adjacency",
1232
+ sql_difficulty="medium",
1233
+ anchor_source="divisions_area",
1234
+ num_anchors=1,
1235
+ target_subtype="river",
1236
+ sql_template=(
1237
+ "WITH a AS ("
1238
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1239
+ ")"
1240
+ " SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
1241
+ " ST_AsGeoJSON(n.geometry) AS geometry"
1242
+ " FROM read_parquet('natural_earth') AS n, a"
1243
+ " WHERE n.subtype IN ('River', 'Lake', 'Basin')"
1244
+ " AND ST_Intersects(a.geometry, n.geometry)"
1245
+ ),
1246
+ question_hints=[
1247
+ "What rivers or lakes are in {anchor_name}?",
1248
+ "Natural water features of {anchor_name}",
1249
+ "Which rivers flow through {anchor_name}?",
1250
+ "Lakes and rivers within {anchor_name}",
1251
+ "Water features inside {anchor_name}",
1252
+ "What bodies of water cross {anchor_name}?",
1253
+ "Rivers of {anchor_name}",
1254
+ "Show me the lakes in {anchor_name}",
1255
+ ],
1256
+ ),
1257
+
1258
+ SQLTemplate(
1259
+ template_id="adj_05",
1260
+ family="adjacency",
1261
+ sql_difficulty="medium",
1262
+ anchor_source="divisions_area",
1263
+ num_anchors=1,
1264
+ target_subtype="range",
1265
+ sql_template=(
1266
+ "WITH a AS ("
1267
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1268
+ ")"
1269
+ " SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
1270
+ " ST_AsGeoJSON(n.geometry) AS geometry"
1271
+ " FROM read_parquet('natural_earth') AS n, a"
1272
+ " WHERE n.subtype IN ('Range/Mts', 'Terrain area', 'Peninsula', 'Depression')"
1273
+ " AND ST_Intersects(a.geometry, n.geometry)"
1274
+ ),
1275
+ question_hints=[
1276
+ "What mountain ranges are in {anchor_name}?",
1277
+ "Terrain features of {anchor_name}",
1278
+ "Which mountain ranges cross {anchor_name}?",
1279
+ "Landforms inside {anchor_name}",
1280
+ "Peninsulas and ranges in {anchor_name}",
1281
+ "Geographic features within {anchor_name}",
1282
+ "Mountains of {anchor_name}",
1283
+ "What terrain does {anchor_name} contain?",
1284
+ ],
1285
+ ),
1286
+
1287
+ # ── NATURAL EARTH INTERSECTION ──────────────────────────────────────────
1288
+ # intersect_03: NE anchor, finding overlapping regions (vs countries in
1289
+ # intersect_02). Uses cross_source_relations handler.
1290
+ # intersect_04: division anchor, finding NE features that overlap it.
1291
+ # Uses intersection_pairs handler (extra NE subtypes ignored in SQL).
1292
+
1293
+ SQLTemplate(
1294
+ template_id="intersect_03",
1295
+ family="intersection",
1296
+ sql_difficulty="medium-hard",
1297
+ anchor_source="natural_earth",
1298
+ num_anchors=1,
1299
+ target_subtype="region",
1300
+ sql_template=(
1301
+ "WITH a AS ("
1302
+ " SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
1303
+ ")"
1304
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1305
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1306
+ " FROM read_parquet('divisions_area') AS b, a"
1307
+ " WHERE b.subtype = '{target_subtype}'"
1308
+ " AND ST_Intersects(b.geometry, a.geometry)"
1309
+ ),
1310
+ question_hints=[
1311
+ "Which regions does the {anchor_name} pass through?",
1312
+ "What administrative regions overlap with the {anchor_name}?",
1313
+ "Regions that the {anchor_name} crosses",
1314
+ "Administrative areas intersected by the {anchor_name}",
1315
+ "What provinces does the {anchor_name} span?",
1316
+ "Regions along the {anchor_name}",
1317
+ "Which provinces overlap the {anchor_name}?",
1318
+ ],
1319
+ ),
1320
+
1321
+ SQLTemplate(
1322
+ template_id="intersect_04",
1323
+ family="intersection",
1324
+ sql_difficulty="medium-hard",
1325
+ anchor_source="divisions_area",
1326
+ num_anchors=1,
1327
+ target_subtype="region",
1328
+ sql_template=(
1329
+ "WITH a AS ("
1330
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1331
+ ")"
1332
+ " SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
1333
+ " ST_AsGeoJSON(n.geometry) AS geometry"
1334
+ " FROM read_parquet('natural_earth') AS n, a"
1335
+ " WHERE ST_Intersects(n.geometry, a.geometry)"
1336
+ ),
1337
+ question_hints=[
1338
+ "What natural features intersect {anchor_name}?",
1339
+ "Natural earth features that overlap {anchor_name}",
1340
+ "Which geographic features cross {anchor_name}?",
1341
+ "Everything from natural earth that touches {anchor_name}",
1342
+ "What geographic features does {anchor_name} contain?",
1343
+ "Natural features within or crossing {anchor_name}",
1344
+ ],
1345
+ ),
1346
+
1347
+ # ── NATURAL EARTH CHAINED ───────────────────────────────────────────────
1348
+ # chained_04: localities in a region that intersect a river or lake.
1349
+ # chained_05: localities in a region that lie on a mountain range.
1350
+
1351
+ SQLTemplate(
1352
+ template_id="chained_04",
1353
+ family="chained",
1354
+ sql_difficulty="hard",
1355
+ anchor_source="divisions_area",
1356
+ num_anchors=1,
1357
+ target_subtype="locality",
1358
+ sql_template=(
1359
+ "WITH region AS ("
1360
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1361
+ ")"
1362
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1363
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1364
+ " FROM read_parquet('divisions_area') AS b, region"
1365
+ " WHERE b.subtype = '{target_subtype}'"
1366
+ " AND ST_Within(b.geometry, region.geometry)"
1367
+ " AND EXISTS ("
1368
+ " SELECT 1 FROM read_parquet('natural_earth') AS n"
1369
+ " WHERE n.subtype IN ('River', 'Lake', 'Basin')"
1370
+ " AND ST_Intersects(b.geometry, n.geometry)"
1371
+ " )"
1372
+ ),
1373
+ question_hints=[
1374
+ "Riverside {target_subtype}s in {anchor_name}",
1375
+ "{target_subtype}s in {anchor_name} near a river or lake",
1376
+ "Which {target_subtype}s in {anchor_name} are on a waterway?",
1377
+ "Lakeside or riverside {target_subtype}s within {anchor_name}",
1378
+ "{target_subtype}s in {anchor_name} that touch a river",
1379
+ "Which {target_subtype}s in {anchor_name} are on a lake?",
1380
+ "Waterfront {target_subtype}s of {anchor_name}",
1381
+ ],
1382
+ ),
1383
+
1384
+ SQLTemplate(
1385
+ template_id="chained_05",
1386
+ family="chained",
1387
+ sql_difficulty="hard",
1388
+ anchor_source="divisions_area",
1389
+ num_anchors=1,
1390
+ target_subtype="locality",
1391
+ sql_template=(
1392
+ "WITH region AS ("
1393
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1394
+ ")"
1395
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1396
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1397
+ " FROM read_parquet('divisions_area') AS b, region"
1398
+ " WHERE b.subtype = '{target_subtype}'"
1399
+ " AND ST_Within(b.geometry, region.geometry)"
1400
+ " AND EXISTS ("
1401
+ " SELECT 1 FROM read_parquet('natural_earth') AS n"
1402
+ " WHERE n.subtype IN ('Range/Mts', 'Depression')"
1403
+ " AND ST_Intersects(b.geometry, n.geometry)"
1404
+ " )"
1405
+ ),
1406
+ question_hints=[
1407
+ "Mountain {target_subtype}s in {anchor_name}",
1408
+ "{target_subtype}s in {anchor_name} on a mountain range",
1409
+ "Which {target_subtype}s in {anchor_name} are in the mountains?",
1410
+ "Highland {target_subtype}s within {anchor_name}",
1411
+ "{target_subtype}s of {anchor_name} in mountainous terrain",
1412
+ "{target_subtype}s in {anchor_name} near a mountain range",
1413
+ ],
1414
+ ),
1415
+
1416
+ # ── CHAINED (county-level) ──────────────────────────────────────────────
1417
+ # Same spatial patterns as chained_01..05 but targeting counties/districts
1418
+ # so the model learns "coastal districts of X", "riverside counties", etc.
1419
+
1420
+ SQLTemplate(
1421
+ template_id="chained_06",
1422
+ family="chained",
1423
+ sql_difficulty="hard",
1424
+ anchor_source="divisions_area",
1425
+ num_anchors=1,
1426
+ target_subtype="county",
1427
+ sql_template=(
1428
+ "WITH region AS ("
1429
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1430
+ ")"
1431
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1432
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1433
+ " FROM read_parquet('divisions_area') AS b, region"
1434
+ " WHERE b.subtype = '{target_subtype}'"
1435
+ " AND ST_Within(b.geometry, region.geometry)"
1436
+ " AND EXISTS ("
1437
+ " SELECT 1 FROM read_parquet('natural_earth') AS n"
1438
+ " WHERE n.subtype IN ('ocean', 'sea')"
1439
+ " AND ST_Intersects(b.geometry, n.geometry)"
1440
+ " )"
1441
+ ),
1442
+ question_hints=[
1443
+ "Coastal {target_subtype}s of {anchor_name}",
1444
+ "Which districts of {anchor_name} are on the coast?",
1445
+ "{target_subtype}s in {anchor_name} that border the sea",
1446
+ "Seaside {target_subtype}s within {anchor_name}",
1447
+ "{target_subtype}s of {anchor_name} with ocean access",
1448
+ "Which {target_subtype}s in {anchor_name} touch the sea?",
1449
+ "Maritime {target_subtype}s of {anchor_name}",
1450
+ ],
1451
+ ),
1452
+
1453
+ SQLTemplate(
1454
+ template_id="chained_07",
1455
+ family="chained",
1456
+ sql_difficulty="hard",
1457
+ anchor_source="divisions_area",
1458
+ num_anchors=1,
1459
+ target_subtype="county",
1460
+ sql_template=(
1461
+ "WITH region AS ("
1462
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1463
+ ")"
1464
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1465
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1466
+ " FROM read_parquet('divisions_area') AS b, region"
1467
+ " WHERE b.subtype = '{target_subtype}'"
1468
+ " AND ST_Within(b.geometry, region.geometry)"
1469
+ " AND NOT EXISTS ("
1470
+ " SELECT 1 FROM read_parquet('natural_earth') AS n"
1471
+ " WHERE n.subtype IN ('ocean', 'sea')"
1472
+ " AND ST_Intersects(b.geometry, n.geometry)"
1473
+ " )"
1474
+ ),
1475
+ question_hints=[
1476
+ "Landlocked {target_subtype}s of {anchor_name}",
1477
+ "Which districts of {anchor_name} have no coastline?",
1478
+ "Interior {target_subtype}s within {anchor_name}",
1479
+ "{target_subtype}s in {anchor_name} with no sea access",
1480
+ "Non-coastal {target_subtype}s of {anchor_name}",
1481
+ "Inland {target_subtype}s of {anchor_name}",
1482
+ ],
1483
+ ),
1484
+
1485
+ SQLTemplate(
1486
+ template_id="chained_08",
1487
+ family="chained",
1488
+ sql_difficulty="hard",
1489
+ anchor_source="divisions_area",
1490
+ num_anchors=1,
1491
+ target_subtype="county",
1492
+ sql_template=(
1493
+ "WITH region AS ("
1494
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1495
+ ")"
1496
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1497
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1498
+ " FROM read_parquet('divisions_area') AS b, region"
1499
+ " WHERE b.subtype = '{target_subtype}'"
1500
+ " AND ST_Within(b.geometry, region.geometry)"
1501
+ " AND EXISTS ("
1502
+ " SELECT 1 FROM read_parquet('natural_earth') AS n"
1503
+ " WHERE n.subtype IN ('River', 'Lake', 'Basin')"
1504
+ " AND ST_Intersects(b.geometry, n.geometry)"
1505
+ " )"
1506
+ ),
1507
+ question_hints=[
1508
+ "Riverside {target_subtype}s of {anchor_name}",
1509
+ "Which districts of {anchor_name} have a river or lake?",
1510
+ "{target_subtype}s in {anchor_name} on a waterway",
1511
+ "Lakeside {target_subtype}s within {anchor_name}",
1512
+ "{target_subtype}s of {anchor_name} along a river",
1513
+ "Which {target_subtype}s in {anchor_name} border a lake?",
1514
+ ],
1515
+ ),
1516
+
1517
+ SQLTemplate(
1518
+ template_id="chained_09",
1519
+ family="chained",
1520
+ sql_difficulty="hard",
1521
+ anchor_source="divisions_area",
1522
+ num_anchors=1,
1523
+ target_subtype="county",
1524
+ sql_template=(
1525
+ "WITH region AS ("
1526
+ " SELECT geometry FROM read_parquet('divisions_area') WHERE id = '{anchor_id}'"
1527
+ ")"
1528
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype, b.country,"
1529
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1530
+ " FROM read_parquet('divisions_area') AS b, region"
1531
+ " WHERE b.subtype = '{target_subtype}'"
1532
+ " AND ST_Within(b.geometry, region.geometry)"
1533
+ " AND EXISTS ("
1534
+ " SELECT 1 FROM read_parquet('natural_earth') AS n"
1535
+ " WHERE n.subtype IN ('Range/Mts', 'Depression')"
1536
+ " AND ST_Intersects(b.geometry, n.geometry)"
1537
+ " )"
1538
+ ),
1539
+ question_hints=[
1540
+ "Mountain {target_subtype}s of {anchor_name}",
1541
+ "Which districts of {anchor_name} are in the mountains?",
1542
+ "{target_subtype}s in {anchor_name} on a mountain range",
1543
+ "Highland {target_subtype}s within {anchor_name}",
1544
+ "{target_subtype}s of {anchor_name} in mountainous terrain",
1545
+ "Which {target_subtype}s in {anchor_name} have mountain ranges?",
1546
+ ],
1547
+ ),
1548
+
1549
+ # ── NATURAL EARTH CONTAINMENT ───────────────────────────────────────────
1550
+ # contain_04: NE anchor (sea/gulf/bay), find countries that touch it.
1551
+ # Uses containment handler via containment_pairs.
1552
+
1553
+ SQLTemplate(
1554
+ template_id="contain_04",
1555
+ family="containment",
1556
+ sql_difficulty="medium",
1557
+ anchor_source="natural_earth",
1558
+ num_anchors=1,
1559
+ target_subtype="country",
1560
+ sql_template=(
1561
+ "WITH a AS ("
1562
+ " SELECT geometry FROM read_parquet('natural_earth') WHERE id = '{anchor_id}'"
1563
+ ")"
1564
+ " SELECT b.id, b.names.\"primary\" AS name, b.subtype,"
1565
+ " ST_AsGeoJSON(b.geometry) AS geometry"
1566
+ " FROM read_parquet('divisions_area') AS b, a"
1567
+ " WHERE b.subtype = '{target_subtype}'"
1568
+ " AND ST_Intersects(b.geometry, a.geometry)"
1569
+ ),
1570
+ question_hints=[
1571
+ "Which countries border the {anchor_name}?",
1572
+ "What countries are along the {anchor_name}?",
1573
+ "Countries surrounding the {anchor_name}",
1574
+ "Nations on the {anchor_name}",
1575
+ "Which countries touch the {anchor_name}?",
1576
+ "Countries with coastline on the {anchor_name}",
1577
+ "What nations lie on the {anchor_name}?",
1578
+ ],
1579
+ ),
1580
+
1581
+ # ── NATURAL EARTH BUFFER ────────────────────────────────────────────────
1582
+ # buffer_05: NE anchor, find other NE features within a buffer distance.
1583
+ # Uses buffer handler for natural_earth.
1584
+
1585
+ SQLTemplate(
1586
+ template_id="buffer_05",
1587
+ family="buffer",
1588
+ sql_difficulty="hard",
1589
+ anchor_source="natural_earth",
1590
+ num_anchors=1,
1591
+ requires_buffer=True,
1592
+ sql_template=(
1593
+ "WITH a AS ("
1594
+ " SELECT ST_Buffer(geometry, {buffer_km} * 1000.0 / 111320.0) AS geom"
1595
+ " FROM read_parquet('natural_earth')"
1596
+ " WHERE id = '{anchor_id}'"
1597
+ ")"
1598
+ " SELECT n.id, n.names.\"primary\" AS name, n.subtype,"
1599
+ " ST_AsGeoJSON(n.geometry) AS geometry"
1600
+ " FROM read_parquet('natural_earth') AS n, a"
1601
+ " WHERE ST_Intersects(n.geometry, a.geom)"
1602
+ ),
1603
+ question_hints=[
1604
+ "Natural features within {buffer_km} km of the {anchor_name}",
1605
+ "What is within {buffer_km} km of the {anchor_name}?",
1606
+ "Geographic features near the {anchor_name} within {buffer_km} km",
1607
+ "Everything within {buffer_km} km of the {anchor_name}",
1608
+ "What natural features are close to the {anchor_name}?",
1609
+ "{buffer_km} km radius around the {anchor_name}",
1610
+ ],
1611
+ ),
1612
+
1613
  ]
1614
 
1615