Spaces:

developmentseed
/

gazet

Sleeping

App Files Community

srmsoumya committed on Apr 16

Commit

c77ca5f

1 Parent(s): 2bf5583

chore: clean sql generation, use conversation format, move prompts from user to system

Browse files

Files changed (8) hide show

dataset/scripts/convert_to_conversation_format.py +0 -134
dataset/scripts/export_training_data.py +55 -25
dataset/scripts/validate_dataset.py +14 -0
finetune/eval_cli.py +5 -9
finetune/nlg.py +41 -51
finetune/prompts.py +18 -25
finetune/train_modal_qwen35.py +2 -3
src/gazet/lm.py +23 -17

dataset/scripts/convert_to_conversation_format.py DELETED Viewed

@@ -1,134 +0,0 @@
-#!/usr/bin/env python3
-"""Convert prompt-completion format to conversation format.
-Reads SQL and places JSONL from a run directory and converts to a single
-"messages" list format suitable for various downstream uses.
-Input format (current):
-{
-  "prompt": [
-    {"role": "system", "content": "..."},
-    {"role": "user", "content": "..."}
-  ],
-  "completion": [
-    {"role": "assistant", "content": "..."}
-  ],
-  "metadata": {...}
-}
-Output format:
-{
-  "messages": [
-    {"role": "system", "content": "..."},
-    {"role": "user", "content": "..."},
-    {"role": "assistant", "content": "..."}
-  ]
-}
-Saves to JSONL files:
-  - train_conversation_sql.jsonl
-  - val_conversation_sql.jsonl
-  - test_conversation_sql.jsonl
-  - train_conversation_places.jsonl
-  - val_conversation_places.jsonl
-  - test_conversation_places.jsonl
-Usage with datasets library:
-    from datasets import load_dataset
-    train_sql = load_dataset(
-        "json",
-        data_files="dataset/output/conversations/train_conversation_sql.jsonl",
-        split="train"
-    )
-    # Access messages:
-    print(train_sql[0]["messages"])
-"""
-import argparse
-import json
-from pathlib import Path
-def load_jsonl(path: Path) -> list[dict]:
-    rows = []
-    with open(path) as f:
-        for line in f:
-            line = line.strip()
-            if line:
-                rows.append(json.loads(line))
-    return rows
-def to_conversation_format(sample: dict) -> dict:
-    """Convert prompt+completion format to messages format."""
-    return {
-        "messages": sample["prompt"] + sample["completion"],
-    }
-def process_task(run_dir: Path, task: str, output_dir: Path):
-    """Process all splits for a single task (sql or places)."""
-    task_dir = run_dir / task
-    for split in ["train", "val", "test"]:
-        input_path = task_dir / f"{split}.jsonl"
-        if not input_path.exists():
-            print(f"  Skipping {task}/{split}: {input_path} not found")
-            continue
-        samples = load_jsonl(input_path)
-        conversations = [to_conversation_format(s) for s in samples]
-        output_path = output_dir / f"{split}_conversation_{task}.jsonl"
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as f:
-            for conv in conversations:
-                f.write(json.dumps(conv, ensure_ascii=False) + "\n")
-        print(f"  {task}/{split}: {len(conversations)} samples → {output_path}")
-def main():
-    parser = argparse.ArgumentParser(
-        description="Convert prompt-completion format to conversation format"
-    )
-    parser.add_argument(
-        "--run-dir",
-        type=Path,
-        default=Path("dataset/output/runs/v3-symbolic-paths"),
-        help="Path to run directory containing sql/ and places/ subdirectories",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=Path,
-        default=Path("dataset/output/conversations"),
-        help="Output directory for JSONL files",
-    )
-    args = parser.parse_args()
-    run_dir = args.run_dir
-    output_dir = args.output_dir
-    if not run_dir.exists():
-        print(f"Error: Run directory not found: {run_dir}")
-        return 1
-    print(f"Converting from: {run_dir}")
-    print(f"Output directory: {output_dir}")
-    print()
-    for task in ["sql", "places"]:
-        print(f"Processing {task}:")
-        process_task(run_dir, task, output_dir)
-    print()
-    print("Conversion complete!")
-    print(f"Output files in: {output_dir}/")
-if __name__ == "__main__":
-    main()

dataset/scripts/export_training_data.py CHANGED Viewed

@@ -27,8 +27,11 @@ from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 import yaml
 # ---------------------------------------------------------------------------
 # Loading
@@ -89,16 +92,12 @@ def stratified_split(
 # Conversational prompt-completion: model sees system + user, generates SQL.
 # ---------------------------------------------------------------------------
-_SQL_SYSTEM = (
-    "You are a text to SQL query translator that helps in natural language geocoding."
-)
-_CANDIDATES_COLS = [
-    "source", "id", "name", "subtype", "country", "region",
-    "admin_level", "similarity",
-]
-_SCHEMA = """1. divisions_area  -- Overture polygon/multipolygon admin boundaries
    query: read_parquet('divisions_area')
    columns:
      id VARCHAR              -- unique feature id
@@ -127,11 +126,17 @@ _SCHEMA = """1. divisions_area  -- Overture polygon/multipolygon admin boundarie
      is_land BOOLEAN
      is_territorial BOOLEAN
      geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
 The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
 Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly.
 Use ST_AsGeoJSON(geometry) for all geometry outputs."""
 def _candidates_csv(candidates: List[Dict]) -> str:
     import io
@@ -149,26 +154,42 @@ def _candidates_csv(candidates: List[Dict]) -> str:
     return buf.getvalue().strip()
 def sample_to_sql_pair(sample: Dict[str, Any]) -> Optional[Dict]:
     """Convert a raw sample to a conversational prompt-completion pair for SQL generation."""
     sql = sample.get("target", {}).get("sql", "").strip()
     if not sql:
         return None
     user_content = (
-        "GIVEN the <SCHEMA_DETAILS>, <CANDIDATES> and <USER_QUERY>, "
-        "generate the corresponding SQL command to retrieve the desired geometry.\n\n"
-        f"<SCHEMA_DETAILS>\n{_SCHEMA}\n</SCHEMA_DETAILS>\n\n"
         f"<CANDIDATES>\n{_candidates_csv(sample.get('candidates', []))}\n</CANDIDATES>\n\n"
         f"<USER_QUERY>\n{sample['question']}\n</USER_QUERY>"
     )
     return {
-        "prompt": [
-            {"role": "system", "content": _SQL_SYSTEM},
-            {"role": "user",   "content": user_content},
-        ],
-        "completion": [
             {"role": "assistant", "content": sql},
         ],
         "metadata": sample.get("metadata", {}),
@@ -180,10 +201,21 @@ def sample_to_sql_pair(sample: Dict[str, Any]) -> Optional[Dict]:
 # Derived from the same SQL samples: selected_candidates → PlacesResult JSON.
 # ---------------------------------------------------------------------------
-_PLACE_SYSTEM = (
-    "You are a geographic entity extractor. "
-    "Extract place names from the query and return valid JSON only."
-)
 # Overture division subtypes — used to filter out natural_earth candidates
 # from the place extraction output (NE features don't have these subtypes).
@@ -241,11 +273,9 @@ def sample_to_place_pair(sample: Dict[str, Any]) -> Optional[Dict]:
     completion_json = json.dumps({"places": places}, ensure_ascii=False)
     return {
-        "prompt": [
-            {"role": "system", "content": _PLACE_SYSTEM},
-            {"role": "user",   "content": sample["question"]},
-        ],
-        "completion": [
             {"role": "assistant", "content": completion_json},
         ],
         "metadata": sample.get("metadata", {}),

 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
+import sqlparse
 import yaml
+from gazet.config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH
 # ---------------------------------------------------------------------------
 # Loading
 # Conversational prompt-completion: model sees system + user, generates SQL.
 # ---------------------------------------------------------------------------
+_SQL_SYSTEM = """You are a text to SQL query translator that helps in natural language geocoding.
+You have access to two DuckDB parquet tables. Given a set of candidate entities and a user query, generate the SQL to retrieve the desired geometry.
+<SCHEMA>
+1. divisions_area  -- Overture polygon/multipolygon admin boundaries
    query: read_parquet('divisions_area')
    columns:
      id VARCHAR              -- unique feature id
      is_land BOOLEAN
      is_territorial BOOLEAN
      geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
+</SCHEMA>
 The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
 Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly.
 Use ST_AsGeoJSON(geometry) for all geometry outputs."""
+_CANDIDATES_COLS = [
+    "source", "id", "name", "subtype", "country", "region",
+    "admin_level", "similarity",
+]
 def _candidates_csv(candidates: List[Dict]) -> str:
     import io
     return buf.getvalue().strip()
+def _to_symbolic_sql(sql: str) -> str:
+    """Normalize any hardcoded or runtime paths back to symbolic names."""
+    sql = sql.replace(DIVISIONS_AREA_PATH, "divisions_area")
+    sql = sql.replace(NATURAL_EARTH_PATH, "natural_earth")
+    sql = sql.replace("/data/overture/division_area/*.parquet",          "divisions_area")
+    sql = sql.replace("/data/overture/divisions_area/*.parquet",         "divisions_area")
+    sql = sql.replace("/data/natural_earth_geoparquet/ne_geography.parquet", "natural_earth")
+    return sql
+def _format_sql(sql: str) -> str:
+    """Pretty-print SQL so the model learns clean, readable style."""
+    return sqlparse.format(
+        sql,
+        reindent=True,
+        keyword_case="upper",
+        indent_width=4,
+    ).strip()
 def sample_to_sql_pair(sample: Dict[str, Any]) -> Optional[Dict]:
     """Convert a raw sample to a conversational prompt-completion pair for SQL generation."""
     sql = sample.get("target", {}).get("sql", "").strip()
     if not sql:
         return None
+    sql = _format_sql(_to_symbolic_sql(sql))
     user_content = (
         f"<CANDIDATES>\n{_candidates_csv(sample.get('candidates', []))}\n</CANDIDATES>\n\n"
         f"<USER_QUERY>\n{sample['question']}\n</USER_QUERY>"
     )
     return {
+        "messages": [
+            {"role": "system",    "content": _SQL_SYSTEM},
+            {"role": "user",      "content": user_content},
             {"role": "assistant", "content": sql},
         ],
         "metadata": sample.get("metadata", {}),
 # Derived from the same SQL samples: selected_candidates → PlacesResult JSON.
 # ---------------------------------------------------------------------------
+_PLACE_SYSTEM = """You are a geographic entity extractor. Extract place names from the user query and return valid JSON only.
+OUTPUT FORMAT:
+{"places": [{"place": "<name>", "country": "<ISO-2>", "subtype": "<subtype>"}]}
+"country" and "subtype" are optional; omit if not applicable.
+RULES:
+- Only extract places explicitly mentioned. Never infer or expand (e.g. "states of India" -> extract "India" only).
+- No duplicate place names.
+- "country": ISO 3166-1 alpha-2. Include only if explicitly mentioned or unambiguous.
+- "subtype": include only when the geographic level is clear from the query.
+SUBTYPES:
+country, dependency, region, county, localadmin, locality, macrohood, neighborhood, microhood
+- Default to locality for cities/towns; omit for physical features (oceans, rivers, mountains)."""
 # Overture division subtypes — used to filter out natural_earth candidates
 # from the place extraction output (NE features don't have these subtypes).
     completion_json = json.dumps({"places": places}, ensure_ascii=False)
     return {
+        "messages": [
+            {"role": "system",    "content": _PLACE_SYSTEM},
+            {"role": "user",      "content": sample["question"]},
             {"role": "assistant", "content": completion_json},
         ],
         "metadata": sample.get("metadata", {}),

dataset/scripts/validate_dataset.py CHANGED Viewed

@@ -50,6 +50,18 @@ def _resolve_paths(sql: str) -> str:
     return sql
 def validate_sql(con: duckdb.DuckDBPyConnection, sql: str) -> tuple[bool, str]:
     """Validate that SQL executes without error.
@@ -120,6 +132,8 @@ def validate_sample_worker(sample: Dict[str, Any]) -> Tuple[str, bool, List[str]
     try:
         is_valid, issues = validate_sample(con, sample)
         con.close()
         return (sample['id'], is_valid, issues, sample if is_valid else None)
     except Exception as e:
         con.close()

     return sql
+def _to_symbolic_sql(sql: str) -> str:
+    """Normalize any hardcoded or runtime paths back to symbolic names for storage."""
+    # Current local runtime paths
+    sql = sql.replace(DIVISIONS_AREA_PATH, "divisions_area")
+    sql = sql.replace(NATURAL_EARTH_PATH, "natural_earth")
+    # Legacy Docker paths
+    sql = sql.replace("/data/overture/division_area/*.parquet",          "divisions_area")
+    sql = sql.replace("/data/overture/divisions_area/*.parquet",         "divisions_area")
+    sql = sql.replace("/data/natural_earth_geoparquet/ne_geography.parquet", "natural_earth")
+    return sql
 def validate_sql(con: duckdb.DuckDBPyConnection, sql: str) -> tuple[bool, str]:
     """Validate that SQL executes without error.
     try:
         is_valid, issues = validate_sample(con, sample)
         con.close()
+        if is_valid:
+            sample['target']['sql'] = _to_symbolic_sql(sample['target']['sql'])
         return (sample['id'], is_valid, issues, sample if is_valid else None)
     except Exception as e:
         con.close()

finetune/eval_cli.py CHANGED Viewed

@@ -83,19 +83,15 @@ def load_samples(run_dir: Path, task: str) -> list[dict]:
 def build_raw_prompt(sample: dict) -> str:
-    """Reconstruct the plain prompt string from message-list format.
-    sample["prompt"] is [{role:system, content:...}, {role:user, content:...}].
-    Joins them with a blank line — same format used during training.
-    """
-    return sample["prompt"][0]["content"] + "\n\n" + sample["prompt"][1]["content"]
 def run_sample(sample: dict, task: str, total: int, index: int, verbose: bool = False) -> None:
-    expected = sample["completion"][0]["content"]
-    messages = sample["prompt"]
-    user_content = sample["prompt"][1]["content"]
     if "<USER_QUERY>" in user_content:
         question = user_content.split("<USER_QUERY>")[-1].split("</USER_QUERY>")[0].strip()
     else:

 def build_raw_prompt(sample: dict) -> str:
+    """Reconstruct the plain prompt string from messages format (all turns except assistant)."""
+    return "\n\n".join(m["content"] for m in sample["messages"][:-1])
 def run_sample(sample: dict, task: str, total: int, index: int, verbose: bool = False) -> None:
+    expected = sample["messages"][-1]["content"]
+    messages = sample["messages"][:-1]
+    user_content = sample["messages"][-2]["content"]
     if "<USER_QUERY>" in user_content:
         question = user_content.split("<USER_QUERY>")[-1].split("</USER_QUERY>")[0].strip()
     else:

finetune/nlg.py CHANGED Viewed

@@ -66,49 +66,54 @@ from trl import SFTConfig, SFTTrainer
 LOGGER = logging.getLogger("nlg")
-SYSTEM_PROMPT = (
-    "You are a text to SQL query translator that helps in natural language geocoding."
-)
-USER_PROMPT_TEMPLATE = """GIVEN the <SCHEMA_DETAILS>, <CANDIDATES> and <USER_QUERY>, generate the corresponding SQL command to retrieve the desired geometry.
-<SCHEMA_DETAILS>
-{schema_details}
-</SCHEMA_DETAILS>
-<CANDIDATES>
-{candidates_csv}
-</CANDIDATES>
-<USER_QUERY>
-{question}
-</USER_QUERY>
-"""
-DEFAULT_SCHEMA_DETAILS = """1. divisions_area  — Overture polygon/multipolygon admin boundaries
-   path: '/data/overture/division_area/*.parquet'
    columns:
-     id VARCHAR
      names STRUCT("primary" VARCHAR, ...)
-     country VARCHAR
-     subtype VARCHAR
      class VARCHAR
      region VARCHAR
      admin_level INTEGER
      division_id VARCHAR
      is_land BOOLEAN
      is_territorial BOOLEAN
-     geometry GEOMETRY
-2. natural_earth  — Natural Earth geography polygons
-   path: '/data/natural_earth_geoparquet/ne_geography.parquet'
    columns:
-     id VARCHAR
-     name VARCHAR
-     featurecla VARCHAR
-     scalerank INTEGER
-     min_zoom DOUBLE
-     geometry GEOMETRY"""
 @dataclass
@@ -126,12 +131,6 @@ def setup_logging(verbose: bool = False) -> None:
     )
-def read_text(path: Optional[str], default: str) -> str:
-    if not path:
-        return default
-    return Path(path).read_text(encoding="utf-8")
 def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
     df = pd.DataFrame(list(candidates))
     if "candidate_id" in df.columns:
@@ -139,15 +138,14 @@ def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
     return df.to_csv(index=False)
-def build_user_prompt(question: str, candidates: Sequence[Dict[str, Any]], schema_details: str) -> str:
     return USER_PROMPT_TEMPLATE.format(
-        schema_details=schema_details.strip(),
         candidates_csv=candidates_to_csv(candidates).strip(),
         question=question.strip(),
     )
-def make_messages(sample: Dict[str, Any], schema_details: str) -> Dict[str, Any]:
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
@@ -155,7 +153,6 @@ def make_messages(sample: Dict[str, Any], schema_details: str) -> Dict[str, Any]
             "content": build_user_prompt(
                 question=sample["question"],
                 candidates=sample["candidates"],
-                schema_details=schema_details,
             ),
         },
     ]
@@ -178,10 +175,10 @@ def load_jsonl_splits(
     return load_dataset("json", data_files=data_files)
-def format_dataset_for_sft(dataset: DatasetDict, schema_details: str) -> DatasetDict:
     formatted = DatasetDict()
     for split, ds in dataset.items():
-        formatted[split] = ds.map(lambda row: make_messages(row, schema_details))
     return formatted
@@ -271,9 +268,8 @@ def build_lora_config(args: argparse.Namespace) -> LoraConfig:
 def train(args: argparse.Namespace) -> None:
     set_seed(args.seed)
-    schema_details = read_text(args.schema_file, DEFAULT_SCHEMA_DETAILS)
     raw_ds = load_jsonl_splits(args.train_jsonl, args.val_jsonl, args.test_jsonl)
-    ds = format_dataset_for_sft(raw_ds, schema_details)
     if args.max_train_samples is not None:
         ds["train"] = ds["train"].select(range(min(args.max_train_samples, len(ds["train"]))))
@@ -362,7 +358,6 @@ def generate_sql(
     tokenizer,
     question: str,
     candidates: Sequence[Dict[str, Any]],
-    schema_details: str,
     max_new_tokens: int = 256,
     do_sample: bool = False,
     temperature: float = 0.1,
@@ -371,7 +366,6 @@ def generate_sql(
 ) -> GenerationResult:
     messages = make_messages(
         {"question": question, "candidates": list(candidates), "target": {}},
-        schema_details,
     )["messages"]
     prompt = render_prompt(tokenizer, messages)
     inputs = tokenizer.apply_chat_template(
@@ -446,7 +440,6 @@ def execute_sqlite(sql: str, sqlite_db: str, limit: Optional[int] = None) -> Tup
 def cmd_generate(args: argparse.Namespace) -> None:
-    schema_details = read_text(args.schema_file, DEFAULT_SCHEMA_DETAILS)
     question = read_question(args)
     candidates = read_candidates(args)
     model, tokenizer = load_model_for_inference(
@@ -463,7 +456,6 @@ def cmd_generate(args: argparse.Namespace) -> None:
         tokenizer=tokenizer,
         question=question,
         candidates=candidates,
-        schema_details=schema_details,
         max_new_tokens=args.max_new_tokens,
         do_sample=args.do_sample,
         temperature=args.temperature,
@@ -511,7 +503,6 @@ def build_parser() -> argparse.ArgumentParser:
     train_p.add_argument("--train-jsonl", required=True)
     train_p.add_argument("--val-jsonl")
     train_p.add_argument("--test-jsonl")
-    train_p.add_argument("--schema-file")
     train_p.add_argument("--output-dir", required=True)
     train_p.add_argument("--max-train-samples", type=int)
     train_p.add_argument("--max-eval-samples", type=int)
@@ -552,7 +543,6 @@ def build_parser() -> argparse.ArgumentParser:
     gen_p.add_argument("--model-path")
     gen_p.add_argument("--base-model")
     gen_p.add_argument("--adapter-path")
-    gen_p.add_argument("--schema-file")
     gen_p.add_argument("--question")
     gen_p.add_argument("--candidates-json")
     gen_p.add_argument("--sample-jsonl")

 LOGGER = logging.getLogger("nlg")
+SYSTEM_PROMPT = """You are a text to SQL query translator that helps in natural language geocoding.
+You have access to two DuckDB parquet tables. Given a set of candidate entities and a user query, generate the SQL to retrieve the desired geometry.
+<SCHEMA>
+1. divisions_area  -- Overture polygon/multipolygon admin boundaries
+   query: read_parquet('divisions_area')
    columns:
+     id VARCHAR              -- unique feature id
      names STRUCT("primary" VARCHAR, ...)
+     country VARCHAR         -- ISO 3166-1 alpha-2
+     subtype VARCHAR         -- country | region | dependency | county | localadmin |
+                               locality | macrohood | neighborhood | microhood
      class VARCHAR
      region VARCHAR
      admin_level INTEGER
      division_id VARCHAR
      is_land BOOLEAN
      is_territorial BOOLEAN
+     geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
+2. natural_earth  -- Natural Earth geography polygons (oceans, seas, rivers, terrain)
+   query: read_parquet('natural_earth')
    columns:
+     id VARCHAR              -- unique feature id prefixed 'ne_'
+     names STRUCT("primary" VARCHAR, ...)
+     country VARCHAR
+     subtype VARCHAR         -- e.g. 'ocean', 'sea', 'bay', 'Terrain area', 'Island group'
+     class VARCHAR
+     region VARCHAR
+     admin_level INTEGER
+     is_land BOOLEAN
+     is_territorial BOOLEAN
+     geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
+</SCHEMA>
+The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
+Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly.
+Use ST_AsGeoJSON(geometry) for all geometry outputs."""
+USER_PROMPT_TEMPLATE = """<CANDIDATES>
+{candidates_csv}
+</CANDIDATES>
+<USER_QUERY>
+{question}
+</USER_QUERY>
+"""
 @dataclass
     )
 def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
     df = pd.DataFrame(list(candidates))
     if "candidate_id" in df.columns:
     return df.to_csv(index=False)
+def build_user_prompt(question: str, candidates: Sequence[Dict[str, Any]]) -> str:
     return USER_PROMPT_TEMPLATE.format(
         candidates_csv=candidates_to_csv(candidates).strip(),
         question=question.strip(),
     )
+def make_messages(sample: Dict[str, Any]) -> Dict[str, Any]:
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
             "content": build_user_prompt(
                 question=sample["question"],
                 candidates=sample["candidates"],
             ),
         },
     ]
     return load_dataset("json", data_files=data_files)
+def format_dataset_for_sft(dataset: DatasetDict) -> DatasetDict:
     formatted = DatasetDict()
     for split, ds in dataset.items():
+        formatted[split] = ds.map(make_messages)
     return formatted
 def train(args: argparse.Namespace) -> None:
     set_seed(args.seed)
     raw_ds = load_jsonl_splits(args.train_jsonl, args.val_jsonl, args.test_jsonl)
+    ds = format_dataset_for_sft(raw_ds)
     if args.max_train_samples is not None:
         ds["train"] = ds["train"].select(range(min(args.max_train_samples, len(ds["train"]))))
     tokenizer,
     question: str,
     candidates: Sequence[Dict[str, Any]],
     max_new_tokens: int = 256,
     do_sample: bool = False,
     temperature: float = 0.1,
 ) -> GenerationResult:
     messages = make_messages(
         {"question": question, "candidates": list(candidates), "target": {}},
     )["messages"]
     prompt = render_prompt(tokenizer, messages)
     inputs = tokenizer.apply_chat_template(
 def cmd_generate(args: argparse.Namespace) -> None:
     question = read_question(args)
     candidates = read_candidates(args)
     model, tokenizer = load_model_for_inference(
         tokenizer=tokenizer,
         question=question,
         candidates=candidates,
         max_new_tokens=args.max_new_tokens,
         do_sample=args.do_sample,
         temperature=args.temperature,
     train_p.add_argument("--train-jsonl", required=True)
     train_p.add_argument("--val-jsonl")
     train_p.add_argument("--test-jsonl")
     train_p.add_argument("--output-dir", required=True)
     train_p.add_argument("--max-train-samples", type=int)
     train_p.add_argument("--max-eval-samples", type=int)
     gen_p.add_argument("--model-path")
     gen_p.add_argument("--base-model")
     gen_p.add_argument("--adapter-path")
     gen_p.add_argument("--question")
     gen_p.add_argument("--candidates-json")
     gen_p.add_argument("--sample-jsonl")

finetune/prompts.py CHANGED Viewed

@@ -6,35 +6,21 @@ from typing import Any, Dict, Sequence
 import pandas as pd
-SYSTEM_PROMPT = (
-    "You are a text to SQL query translator that helps in natural language geocoding."
-)
-USER_PROMPT_TEMPLATE = """GIVEN the <SCHEMA_DETAILS>, <CANDIDATES> and <USER_QUERY>, generate the corresponding SQL command to retrieve the desired geometry.
-<SCHEMA_DETAILS>
-{schema_details}
-</SCHEMA_DETAILS>
-<CANDIDATES>
-{candidates_csv}
-</CANDIDATES>
-<USER_QUERY>
-{question}
-</USER_QUERY>
-"""
-DEFAULT_SCHEMA_DETAILS = """1. divisions_area  -- Overture polygon/multipolygon admin boundaries
    query: read_parquet('divisions_area')
    columns:
-     id VARCHAR              -- unique feature id (use to filter precisely)
      names STRUCT("primary" VARCHAR, ...)
      country VARCHAR         -- ISO 3166-1 alpha-2
      subtype VARCHAR         -- country | region | dependency | county | localadmin |
                                locality | macrohood | neighborhood | microhood
      class VARCHAR
-     region VARCHAR          -- region code e.g. 'IN-OR'
      admin_level INTEGER
      division_id VARCHAR
      is_land BOOLEAN
@@ -54,9 +40,20 @@ DEFAULT_SCHEMA_DETAILS = """1. divisions_area  -- Overture polygon/multipolygon
      is_land BOOLEAN
      is_territorial BOOLEAN
      geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
 The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
-Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly."""
 def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
@@ -69,10 +66,8 @@ def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
 def build_user_prompt(
     question: str,
     candidates: Sequence[Dict[str, Any]],
-    schema_details: str,
 ) -> str:
     return USER_PROMPT_TEMPLATE.format(
-        schema_details=schema_details.strip(),
         candidates_csv=candidates_to_csv(candidates).strip(),
         question=question.strip(),
     )
@@ -80,12 +75,10 @@ def build_user_prompt(
 def make_prompt_completion(
     sample: Dict[str, Any],
-    schema_details: str,
 ) -> Dict[str, str]:
     prompt = SYSTEM_PROMPT + "\n\n" + build_user_prompt(
         question=sample["question"],
         candidates=sample["candidates"],
-        schema_details=schema_details,
     )
     completion = sample.get("target", {}).get("sql", "")
     return {"prompt": prompt, "completion": completion}

 import pandas as pd
+SYSTEM_PROMPT = """You are a text to SQL query translator that helps in natural language geocoding.
+You have access to two DuckDB parquet tables. Given a set of candidate entities and a user query, generate the SQL to retrieve the desired geometry.
+<SCHEMA>
+1. divisions_area  -- Overture polygon/multipolygon admin boundaries
    query: read_parquet('divisions_area')
    columns:
+     id VARCHAR              -- unique feature id
      names STRUCT("primary" VARCHAR, ...)
      country VARCHAR         -- ISO 3166-1 alpha-2
      subtype VARCHAR         -- country | region | dependency | county | localadmin |
                                locality | macrohood | neighborhood | microhood
      class VARCHAR
+     region VARCHAR
      admin_level INTEGER
      division_id VARCHAR
      is_land BOOLEAN
      is_land BOOLEAN
      is_territorial BOOLEAN
      geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
+</SCHEMA>
 The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
+Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly.
+Use ST_AsGeoJSON(geometry) for all geometry outputs."""
+USER_PROMPT_TEMPLATE = """<CANDIDATES>
+{candidates_csv}
+</CANDIDATES>
+<USER_QUERY>
+{question}
+</USER_QUERY>
+"""
 def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
 def build_user_prompt(
     question: str,
     candidates: Sequence[Dict[str, Any]],
 ) -> str:
     return USER_PROMPT_TEMPLATE.format(
         candidates_csv=candidates_to_csv(candidates).strip(),
         question=question.strip(),
     )
 def make_prompt_completion(
     sample: Dict[str, Any],
 ) -> Dict[str, str]:
     prompt = SYSTEM_PROMPT + "\n\n" + build_user_prompt(
         question=sample["question"],
         candidates=sample["candidates"],
     )
     completion = sample.get("target", {}).get("sql", "")
     return {"prompt": prompt, "completion": completion}

finetune/train_modal_qwen35.py CHANGED Viewed

@@ -123,8 +123,7 @@ def _load_data(run_dir: str, tokenizer, max_train_samples=None, max_eval_samples
     """Load JSONL data and apply Qwen3.5 chat template.
     Each sample must have:
-      prompt: list of {role, content} dicts (system + user)
-      completion: list of {role, content} dicts (assistant)
     The chat template produces the full ChatML string including the assistant turn.
     train_on_responses_only then masks everything except the assistant response.
@@ -143,7 +142,7 @@ def _load_data(run_dir: str, tokenizer, max_train_samples=None, max_eval_samples
     def to_message(sample: dict) -> dict:
         text = tokenizer.apply_chat_template(
-            sample["prompt"] + sample["completion"],
             tokenize=False,
             add_generation_prompt=False,
         )

     """Load JSONL data and apply Qwen3.5 chat template.
     Each sample must have:
+      messages: list of {role, content} dicts (system + user + assistant)
     The chat template produces the full ChatML string including the assistant turn.
     train_on_responses_only then masks everything except the assistant response.
     def to_message(sample: dict) -> dict:
         text = tokenizer.apply_chat_template(
+            sample["messages"],
             tokenize=False,
             add_generation_prompt=False,
         )

src/gazet/lm.py CHANGED Viewed

@@ -176,11 +176,12 @@ write_sql = SQLWriter(lm=sql_generation_lm)
 # ── GGUF SQL generation via llama-server ──────────────────────────────────────
-_SYSTEM_PROMPT = (
-    "You are a text to SQL query translator that helps in natural language geocoding."
-)
-_SCHEMA_DETAILS = """1. divisions_area  -- Overture polygon/multipolygon admin boundaries
    query: read_parquet('divisions_area')
    columns:
      id VARCHAR              -- unique feature id
@@ -209,18 +210,13 @@ _SCHEMA_DETAILS = """1. divisions_area  -- Overture polygon/multipolygon admin b
      is_land BOOLEAN
      is_territorial BOOLEAN
      geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
 The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
 Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly.
 Use ST_AsGeoJSON(geometry) for all geometry outputs."""
-_USER_PROMPT_TEMPLATE = """GIVEN the <SCHEMA_DETAILS>, <CANDIDATES> and <USER_QUERY>, generate the corresponding SQL command to retrieve the desired geometry.
-<SCHEMA_DETAILS>
-{schema_details}
-</SCHEMA_DETAILS>
-<CANDIDATES>
 {candidates_csv}
 </CANDIDATES>
@@ -269,10 +265,21 @@ def _llama_chat_complete(messages: list[dict]) -> str:
     return resp.json()["choices"][0]["message"]["content"]
-_PLACES_SYSTEM_PROMPT = (
-    "You are a geographic entity extractor. "
-    "Extract place names from the query and return valid JSON only."
-)
 def generate_places(user_query: str) -> PlacesResult:
@@ -307,7 +314,7 @@ def generate_sql(user_query: str, candidates_df: pd.DataFrame) -> str:
     """Generate SQL from a natural language query using the finetuned GGUF model.
     Uses the same prompt format the model was trained on:
-    SYSTEM_PROMPT + USER_PROMPT_TEMPLATE with schema, candidates CSV, and question.
     Single-shot — no retry loop (the finetuned model can't improve from error feedback).
     """
     # Keep only columns the model was trained on
@@ -316,7 +323,6 @@ def generate_sql(user_query: str, candidates_df: pd.DataFrame) -> str:
     candidates_csv = candidates_df[cols].to_csv(index=False)
     user_prompt = _USER_PROMPT_TEMPLATE.format(
-        schema_details=_SCHEMA_DETAILS.strip(),
         candidates_csv=candidates_csv.strip(),
         question=user_query.strip(),
     )

 # ── GGUF SQL generation via llama-server ──────────────────────────────────────
+_SYSTEM_PROMPT = """You are a text to SQL query translator that helps in natural language geocoding.
+You have access to two DuckDB parquet tables. Given a set of candidate entities and a user query, generate the SQL to retrieve the desired geometry.
+<SCHEMA>
+1. divisions_area  -- Overture polygon/multipolygon admin boundaries
    query: read_parquet('divisions_area')
    columns:
      id VARCHAR              -- unique feature id
      is_land BOOLEAN
      is_territorial BOOLEAN
      geometry GEOMETRY       -- WGS-84 polygon/multipolygon (spatial ext loaded)
+</SCHEMA>
 The candidates table has a 'source' column: 'divisions_area' or 'natural_earth'.
 Use read_parquet('divisions_area') or read_parquet('natural_earth') accordingly.
 Use ST_AsGeoJSON(geometry) for all geometry outputs."""
+_USER_PROMPT_TEMPLATE = """<CANDIDATES>
 {candidates_csv}
 </CANDIDATES>
     return resp.json()["choices"][0]["message"]["content"]
+_PLACES_SYSTEM_PROMPT = """You are a geographic entity extractor. Extract place names from the user query and return valid JSON only.
+OUTPUT FORMAT:
+{"places": [{"place": "<name>", "country": "<ISO-2>", "subtype": "<subtype>"}]}
+"country" and "subtype" are optional; omit if not applicable.
+RULES:
+- Only extract places explicitly mentioned. Never infer or expand (e.g. "states of India" -> extract "India" only).
+- No duplicate place names.
+- "country": ISO 3166-1 alpha-2. Include only if explicitly mentioned or unambiguous.
+- "subtype": include only when the geographic level is clear from the query.
+SUBTYPES:
+country, dependency, region, county, localadmin, locality, macrohood, neighborhood, microhood
+- Default to locality for cities/towns; omit for physical features (oceans, rivers, mountains)."""
 def generate_places(user_query: str) -> PlacesResult:
     """Generate SQL from a natural language query using the finetuned GGUF model.
     Uses the same prompt format the model was trained on:
+    SYSTEM_PROMPT (includes schema) + USER_PROMPT_TEMPLATE with candidates CSV and question.
     Single-shot — no retry loop (the finetuned model can't improve from error feedback).
     """
     # Keep only columns the model was trained on
     candidates_csv = candidates_df[cols].to_csv(index=False)
     user_prompt = _USER_PROMPT_TEMPLATE.format(
         candidates_csv=candidates_csv.strip(),
         question=user_query.strip(),
     )