Spaces:

developmentseed
/

gazet

Running

App Files Files Community

srmsoumya committed on Apr 2

Commit

599b4c3

1 Parent(s): c88725f

Add scripts to train & infer model on modal

Browse files

Files changed (8) hide show

finetune/__init__.py +1 -0
finetune/check_token_lengths.py +173 -0
finetune/config.py +76 -0
finetune/data.py +45 -0
finetune/eval_demo.py +329 -0
finetune/infer_modal.py +236 -0
finetune/prompts.py +83 -0
finetune/train_modal.py +279 -0

finetune/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

finetune/check_token_lengths.py ADDED Viewed

	@@ -0,0 +1,173 @@

+"""Check token lengths of training samples to validate max_length setting.
+Usage
+-----
+modal run finetune/check_token_lengths.py \
+    --train-jsonl /data/train.jsonl \
+    --val-jsonl /data/val.jsonl \
+    --base-model google/gemma-3-270m-it
+"""
+from __future__ import annotations
+import modal
# Modal application that hosts the token-length analysis function.
app = modal.App("gazet-check-token-lengths")

# Container image, built step by step: Python 3.11 base, the packages needed
# for tokenizing/formatting, the local `finetune` package, and HF_HOME set to
# a directory below the /mnt/gazet volume mount.
check_image = modal.Image.debian_slim(python_version="3.11")
check_image = check_image.pip_install(
    "datasets>=3.0",
    "pandas>=2.2",
    "transformers>=4.46",
)
check_image = check_image.add_local_python_source("finetune", copy=True)
check_image = check_image.env({"HF_HOME": "/mnt/gazet/model_cache"})

# Named volume "gazet" (created on first use), mounted at /mnt/gazet.
gazet_vol = modal.Volume.from_name("gazet", create_if_missing=True)
VOLUMES = {"/mnt/gazet": gazet_vol}
@app.function(
    image=check_image,
    volumes=VOLUMES,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def analyze_token_lengths(
    train_jsonl: str,
    val_jsonl: str | None,
    base_model: str,
    schema_file: str | None = None,
):
    """Print token-length statistics for the formatted SFT samples.

    Every sample is rendered as ``prompt + completion``, tokenized with the
    tokenizer of ``base_model`` and summarised per split and combined. A
    ``max_length`` recommendation is printed at the end.

    Args:
        train_jsonl: Path of the training JSONL file.
        val_jsonl: Optional path of the validation JSONL file.
        base_model: Name or path handed to ``AutoTokenizer.from_pretrained``.
        schema_file: Optional text file whose content replaces
            ``DEFAULT_SCHEMA_DETAILS`` in the prompt.
    """
    from transformers import AutoTokenizer

    from finetune.data import format_dataset_for_sft, load_jsonl_splits, read_text
    from finetune.prompts import DEFAULT_SCHEMA_DETAILS

    rule = "=" * 60

    print(f"Loading tokenizer: {base_model}")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    print(f"Loading dataset from {train_jsonl}")
    schema_details = read_text(schema_file, DEFAULT_SCHEMA_DETAILS)
    ds = load_jsonl_splits(train_jsonl, val_jsonl)
    formatted = format_dataset_for_sft(ds, schema_details)

    def compute_lengths(split_name: str, dataset):
        """Tokenize one split, print its statistics and return sorted lengths."""
        print(f"\n{rule}")
        print(f"Analyzing {split_name} split ({len(dataset)} samples)")
        print(rule)
        lengths = sorted(
            len(tokenizer.encode(row["prompt"] + row["completion"]))
            for row in dataset
        )
        n = len(lengths)
        if n == 0:
            # min()/max() and the percentage maths below are undefined for an
            # empty split, so report it and carry on with the other splits.
            print("\nNo samples in this split; skipping statistics.")
            return lengths

        print("\nToken length statistics:")
        print(f"  Samples:  {n:,}")
        print(f"  Min:      {lengths[0]:,}")
        print(f"  Max:      {lengths[-1]:,}")
        print(f"  Mean:     {sum(lengths)/n:.0f}")
        print(f"  Median:   {lengths[n//2]:,}")
        print(f"  P90:      {lengths[int(n*0.90)]:,}")
        print(f"  P95:      {lengths[int(n*0.95)]:,}")
        print(f"  P99:      {lengths[int(n*0.99)]:,}")

        # (inclusive upper limit, label); each bucket starts above the
        # previous bucket's limit.
        buckets = [
            (512, "0-512"),
            (1024, "513-1024"),
            (2048, "1025-2048"),
            (4096, "2049-4096"),
            (8192, "4097-8192"),
            (float("inf"), "8193+"),
        ]
        print("\nFrequency distribution:")
        prev_limit = 0
        for limit, label in buckets:
            count = sum(1 for length in lengths if prev_limit < length <= limit)
            pct = 100 * count / n
            bar = "█" * int(pct / 2)
            print(f"  {label:>12}: {count:5,} ({pct:5.1f}%) {bar}")
            prev_limit = limit

        print("\nSamples exceeding thresholds:")
        for threshold in [1024, 2048, 4096, 8192]:
            count = sum(1 for length in lengths if length > threshold)
            pct = 100 * count / n
            print(f"  > {threshold:5,}: {count:5,} ({pct:5.1f}%)")
        return lengths

    train_lengths = compute_lengths("train", formatted["train"])
    if "val" in formatted:
        val_lengths = compute_lengths("val", formatted["val"])
    else:
        val_lengths = []

    all_lengths = sorted(train_lengths + val_lengths)
    total = len(all_lengths)
    if all_lengths:
        print(f"\n{rule}")
        print("COMBINED STATISTICS")
        print(rule)
        print(f"  Total samples: {total:,}")
        print(f"  Max length:    {all_lengths[-1]:,}")
        print(f"  P99:           {all_lengths[int(total*0.99)]:,}")
        for threshold in [1024, 2048, 4096]:
            count = sum(1 for length in all_lengths if length > threshold)
            pct = 100 * count / total
            # Only the 2048 threshold is flagged with a warning.
            status = "⚠️  WARNING" if count > 0 and threshold == 2048 else "✓ OK"
            print(f"  > {threshold:5,}: {count:5,} ({pct:5.1f}%) {status}")

    print(f"\n{rule}")
    print("RECOMMENDATIONS")
    print(rule)
    if not all_lengths:
        # Nothing was measured, so any max_length advice would be made up.
        print("No samples found; no recommendation.")
        print()
        return

    max_len = all_lengths[-1]
    over_2048 = sum(1 for length in all_lengths if length > 2048)
    if max_len <= 1024:
        print("✓ All samples fit within 1024 tokens")
        print("  Recommended max_length: 1024")
    elif max_len <= 2048:
        print("✓ All samples fit within 2048 tokens")
        print("  Recommended max_length: 2048")
    elif over_2048 < total * 0.01:
        print(f"⚠️  {over_2048} samples ({100*over_2048/total:.1f}%) exceed 2048 tokens")
        print("  Options:")
        print("    1. Keep max_length=2048 (truncates <1% of samples)")
        print("    2. Increase to max_length=4096 (uses more GPU memory)")
        print("    3. Reduce candidate rows in preprocessing")
    else:
        print(f"⚠️  {over_2048} samples ({100*over_2048/total:.1f}%) exceed 2048 tokens")
        print(f"  Recommended max_length: {max_len} (or reduce candidate rows)")
    print()
@app.local_entrypoint()
def main(
    train_jsonl: str = "/mnt/gazet/data/output/train.jsonl",
    val_jsonl: str | None = "/mnt/gazet/data/output/val.jsonl",
    base_model: str = "google/gemma-3-270m-it",
    schema_file: str | None = None,
):
    """Local entrypoint: echo the settings, then run the analysis remotely."""
    banner = [
        "Checking token lengths for:",
        f"  Model: {base_model}",
        f"  Train: {train_jsonl}",
    ]
    if val_jsonl:
        banner.append(f"  Val:   {val_jsonl}")
    for line in banner:
        print(line)

    analyze_token_lengths.remote(train_jsonl, val_jsonl, base_model, schema_file)
    print("Analysis complete!")

finetune/config.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""Training configuration for text-to-SQL LoRA finetuning."""
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import List, Optional
# Default module names for TrainingConfig.target_modules, which the training
# script forwards to peft's LoraConfig(target_modules=...).
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
@dataclass
class TrainingConfig:
    """Hyperparameters and paths for one LoRA SFT run.

    Every field has a default, so ``TrainingConfig()`` is a complete
    configuration. If ``experiment_name`` is left as ``None`` a name of the
    form ``<model>-r<lora_r>-<YYYYmmdd-HHMMSS>`` is generated on construction.
    """

    # --- Model ---
    base_model: str = "google/gemma-3-270m-it"

    # --- Dataset (paths on the Modal volume) ---
    train_jsonl: str = "/mnt/gazet/data/output/train.jsonl"
    val_jsonl: Optional[str] = "/mnt/gazet/data/output/val.jsonl"
    test_jsonl: Optional[str] = "/mnt/gazet/data/output/test.jsonl"
    schema_file: Optional[str] = None
    max_train_samples: Optional[int] = None
    max_eval_samples: Optional[int] = None

    # --- LoRA ---
    lora_r: int = 16
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    # default_factory gives each instance its own copy of the module list.
    target_modules: List[str] = field(default_factory=lambda: list(LORA_TARGET_MODULES))

    # --- Training ---
    num_train_epochs: int = 2
    per_device_train_batch_size: int = 12
    per_device_eval_batch_size: int = 12
    gradient_accumulation_steps: int = 2
    gradient_checkpointing: bool = True
    optim: str = "adamw_torch_fused"
    learning_rate: float = 1e-4
    max_grad_norm: float = 0.7
    warmup_steps: int = 50
    lr_scheduler_type: str = "constant"
    weight_decay: float = 0.0
    packing: bool = False
    max_length: int = 2048

    # --- Logging / saving ---
    logging_steps: int = 10
    save_strategy: str = "steps"
    save_steps: int = 300
    eval_strategy: str = "steps"
    eval_steps: int = 100
    report_to: str = "trackio"
    trackio_space_id: Optional[str] = "srmsoumya/gazet-trackio"
    project: str = "gazet-nlg"

    # --- SFT-specific ---
    completion_only_loss: bool = True
    dataset_num_proc: Optional[int] = 8

    # --- Experiment ---
    seed: int = 42
    experiment_name: Optional[str] = None
    merge_after_training: bool = True

    def __post_init__(self):
        """Fill in ``experiment_name`` from model, LoRA rank and time if unset."""
        if self.experiment_name is not None:
            return
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        # Keep only the part after the last "/" of the model identifier.
        short_name = self.base_model.rsplit("/", 1)[-1]
        self.experiment_name = f"{short_name}-r{self.lora_r}-{stamp}"

finetune/data.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""Dataset loading and SFT formatting for text-to-SQL finetuning."""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Dict, Optional
+from datasets import DatasetDict, load_dataset
+from finetune.prompts import DEFAULT_SCHEMA_DETAILS, make_prompt_completion
# Module-level logger, registered under the fixed name "nlg.data".
LOGGER = logging.getLogger("nlg.data")
def read_text(path: Optional[str], default: str) -> str:
    """Return the UTF-8 text stored at ``path``.

    When ``path`` is ``None`` or empty, ``default`` is returned instead and no
    file is touched.
    """
    if path:
        return Path(path).read_text(encoding="utf-8")
    return default
def load_jsonl_splits(
    train_jsonl: str,
    val_jsonl: Optional[str] = None,
    test_jsonl: Optional[str] = None,
) -> DatasetDict:
    """Load the JSONL files into a ``DatasetDict`` keyed by split name.

    The ``train`` split is always present; ``val`` and ``test`` are added only
    when a non-empty path is supplied for them.
    """
    optional_splits = (("val", val_jsonl), ("test", test_jsonl))
    data_files: Dict[str, str] = {"train": train_jsonl}
    data_files.update({name: path for name, path in optional_splits if path})
    LOGGER.info("Loading dataset splits: %s", data_files)
    return load_dataset("json", data_files=data_files)
def format_dataset_for_sft(
    dataset: DatasetDict,
    schema_details: str = DEFAULT_SCHEMA_DETAILS,
) -> DatasetDict:
    """Map every split through ``make_prompt_completion``.

    Returns a new ``DatasetDict`` with the same split names, where each row
    has been passed to ``make_prompt_completion(row, schema_details)``.
    """

    def to_prompt_completion(row):
        return make_prompt_completion(row, schema_details)

    return DatasetDict(
        {split: ds.map(to_prompt_completion) for split, ds in dataset.items()}
    )

finetune/eval_demo.py ADDED Viewed

	@@ -0,0 +1,329 @@

+"""Streamlit eval viewer: compare expected vs predicted SQL and view results on a map.
+Usage: streamlit run finetune/eval_demo.py
+"""
+import difflib
+import json
+import math
+import os
+import pathlib
+import duckdb
+import numpy as np
+import pandas as pd
+import pydeck as pdk
+import sqlparse
+import streamlit as st
# Repository root: two levels above this file.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]

# Data directory; the GAZET_DATA_DIR environment variable overrides the
# default of <project root>/data.
_default_data_dir = str(PROJECT_ROOT / "data")
DATA_DIR = pathlib.Path(os.environ.get("GAZET_DATA_DIR", _default_data_dir))

# Eval result files are always looked up below the project root.
EVAL_DIR = PROJECT_ROOT.joinpath("data", "eval_results")
def load_eval_results(path):
    """Parse the JSON eval report stored at ``path`` and return it.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine running the viewer.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def rewrite_data_paths(sql):
    """Point every ``/data/`` occurrence in ``sql`` at the local ``DATA_DIR``."""
    local_prefix = f"{DATA_DIR}/"
    return local_prefix.join(sql.split("/data/"))
def format_sql(sql):
    """Return ``sql`` re-indented by sqlparse with upper-cased keywords."""
    options = {"reindent": True, "keyword_case": "upper"}
    return sqlparse.format(sql, **options)
def sql_diff_html(expected, predicted):
    """Build an HTML side-by-side diff table of two SQL strings.

    Both statements are pretty-printed with ``format_sql`` first so the diff
    is line based; the table shows all lines, not just changed context.
    """
    left, right = (format_sql(text).splitlines() for text in (expected, predicted))
    table_builder = difflib.HtmlDiff(tabsize=2, wrapcolumn=80)
    return table_builder.make_table(
        left,
        right,
        fromdesc="Expected",
        todesc="Predicted",
        context=False,
    )
def get_duckdb_connection():
    """Open a DuckDB connection with the ``spatial`` extension loaded."""
    connection = duckdb.connect()
    for statement in ("INSTALL spatial", "LOAD spatial"):
        connection.execute(statement)
    return connection
def execute_sql(con, sql):
    """Run ``sql`` and return a DataFrame with geometries as GeoJSON strings.

    The statement is first inspected through ``con.sql`` to learn its column
    names and types, then wrapped in an outer SELECT that converts every
    GEOMETRY column to simplified GeoJSON text and passes other columns
    through unchanged.

    Args:
        con: Connection exposing ``sql()`` and ``execute()``.
        sql: A single SELECT statement; trailing semicolons are tolerated.

    Returns:
        The result of ``fetchdf()`` on the wrapped query.
    """
    # A trailing ';' is fine for a standalone statement but a syntax error
    # once the statement is embedded as a subquery below.
    sql = sql.strip().rstrip("; \t\r\n")
    rel = con.sql(sql)

    select_parts = []
    for col, dtype in zip(rel.columns, rel.dtypes):
        # Double any embedded quote so the name stays one quoted identifier.
        ident = '"' + str(col).replace('"', '""') + '"'
        if "GEOMETRY" in str(dtype).upper():
            select_parts.append(
                f"ST_AsGeoJSON(ST_SimplifyPreserveTopology({ident}, 0.001)) AS {ident}"
            )
        else:
            select_parts.append(ident)

    # The newline before ')' keeps a trailing '--' comment in ``sql`` from
    # swallowing the closing parenthesis.
    wrapped = f"SELECT {', '.join(select_parts)} FROM ({sql}\n)"
    return con.execute(wrapped).fetchdf()
def _is_notna(val):
    """Tell whether ``val`` carries data.

    Lists, tuples and numpy arrays count as present when they are non-empty;
    everything else is delegated to ``pd.notna``.
    """
    container_types = (list, tuple, np.ndarray)
    return len(val) > 0 if isinstance(val, container_types) else pd.notna(val)
def _to_python(val):
    """Turn numpy scalars/arrays into plain Python values for JSON output.

    Values of any other type are returned unchanged.
    """
    # Checked in order; the first matching numpy type decides the conversion.
    converters = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
        (np.bool_, bool),
    )
    for numpy_type, convert in converters:
        if isinstance(val, numpy_type):
            return convert(val)
    return val
def to_feature_collection(result_df):
    """Convert a query result into a GeoJSON FeatureCollection dict.

    A column is treated as geometry when the string values among its first
    five rows all look like GeoJSON objects (start with ``{"type":``). Only
    the first such column supplies the feature geometry; all non-geometry
    columns become properties, with NA values left out.
    """

    def looks_like_geojson(column):
        sample = [v for v in result_df[column].head(5) if isinstance(v, str)]
        return bool(sample) and all(
            v.lstrip().startswith('{"type":') for v in sample
        )

    geom_cols = [c for c in result_df.columns if looks_like_geojson(c)]
    prop_cols = [c for c in result_df.columns if c not in geom_cols]

    features = []
    for _, record in result_df.iterrows():
        geometry = None
        if geom_cols:
            raw = record[geom_cols[0]]
            if isinstance(raw, str) and raw:
                geometry = json.loads(raw)
        properties = {
            c: _to_python(record[c]) for c in prop_cols if _is_notna(record[c])
        }
        features.append(
            {"type": "Feature", "geometry": geometry, "properties": properties}
        )
    return {"type": "FeatureCollection", "features": features}
def bbox_from_geojson(geojson):
    """Return ``(min_lng, min_lat, max_lng, max_lat)`` over all features.

    Features without a geometry are ignored; ``None`` is returned when no
    coordinate is found at all.
    """
    points = [
        coord
        for feature in geojson.get("features", [])
        if feature.get("geometry")
        for coord in _extract_coords(feature["geometry"])
    ]
    if not points:
        return None
    lngs = [point[0] for point in points]
    lats = [point[1] for point in points]
    return min(lngs), min(lats), max(lngs), max(lats)
def _extract_coords(geom):
    """Yield every coordinate of a GeoJSON geometry dict, flattened.

    Handles the seven GeoJSON geometry types; GeometryCollection members are
    walked recursively. Unknown types yield nothing.
    """
    kind = geom.get("type", "")
    coords = geom.get("coordinates", [])

    if kind == "GeometryCollection":
        for member in geom.get("geometries", []):
            yield from _extract_coords(member)
    elif kind == "Point":
        yield coords
    elif kind in ("LineString", "MultiPoint"):
        yield from coords
    elif kind in ("Polygon", "MultiLineString"):
        # Both are a list of coordinate sequences (rings or lines).
        for sequence in coords:
            yield from sequence
    elif kind == "MultiPolygon":
        for polygon in coords:
            for ring in polygon:
                yield from ring
def _centroids_from_geojson(geojson):
    """Return one ``{"lng", "lat"}`` marker per feature that has coordinates.

    The marker position is the arithmetic mean of the feature's vertices as
    produced by ``_extract_coords``.
    """
    markers = []
    for feature in geojson.get("features", []):
        geometry = feature.get("geometry")
        if not geometry:
            continue
        vertices = list(_extract_coords(geometry))
        if not vertices:
            continue
        count = len(vertices)
        markers.append(
            {
                "lng": sum(vertex[0] for vertex in vertices) / count,
                "lat": sum(vertex[1] for vertex in vertices) / count,
            }
        )
    return markers
def render_map(geojson, color, key):
    """Show ``geojson`` as a pydeck map in the current Streamlit container.

    Args:
        geojson: FeatureCollection dict to draw.
        color: ``[r, g, b, a]`` list used as fill colour.
        key: Streamlit widget key for the chart.

    An info message replaces the map when there are no features.
    """
    feature_count = len(geojson.get("features", []))
    if feature_count == 0:
        st.info("Query returned no features.")
        return

    polygon_layer = pdk.Layer(
        "GeoJsonLayer",
        data=geojson,
        get_fill_color=color,
        get_line_color=[100, 100, 100, 200],
        get_line_width=2,
        pickable=True,
    )
    layers = [polygon_layer]

    bbox = bbox_from_geojson(geojson)
    if not bbox:
        view = pdk.ViewState(latitude=0, longitude=0, zoom=1)
    else:
        west, south, east, north = bbox
        # Derive a zoom level from the larger side of the bounding box,
        # clamped to the 0..18 range.
        span = max(east - west, north - south, 1e-6)
        zoom = max(0, min(18, math.log2(360 / span) - 0.8))
        # Add scatter markers when polygons would be too small to see
        if zoom < 4:
            centroids = _centroids_from_geojson(geojson)
            if centroids:
                marker_layer = pdk.Layer(
                    "ScatterplotLayer",
                    data=centroids,
                    get_position=["lng", "lat"],
                    get_fill_color=color[:3] + [220],
                    get_radius=50000,
                    radius_min_pixels=6,
                    pickable=True,
                )
                layers.append(marker_layer)
        view = pdk.ViewState(
            latitude=(south + north) / 2,
            longitude=(west + east) / 2,
            zoom=zoom,
        )

    deck = pdk.Deck(layers=layers, initial_view_state=view, map_style=None)
    st.pydeck_chart(deck, width="stretch", height=400, key=key)
# --- App ---
st.set_page_config(page_title="Eval Viewer", layout="wide")
st.title("Eval Viewer")

# Pick one of the eval-*.json reports found in EVAL_DIR.
eval_files = sorted(EVAL_DIR.glob("eval-*.json"))
if not eval_files:
    st.error(f"No eval result files found in {EVAL_DIR}")
    st.stop()

selected_file = st.sidebar.selectbox(
    "Eval file",
    eval_files,
    format_func=lambda p: p.stem,
)
data = load_eval_results(selected_file)
summary = data["summary"]
results = data["results"]

st.sidebar.markdown(f"""
**Model**: `{summary.get('label', '')}`
**Exact match**: {summary['exact_matches']}/{summary['num_samples']} ({summary['exact_match_rate']:.1%})
""")

# Optional filter on the exact-match flag; "All" keeps every result.
filter_option = st.sidebar.radio("Filter", ["All", "Matches only", "Mismatches only"])
wanted_match = {"Matches only": True, "Mismatches only": False}.get(filter_option)
if wanted_match is not None:
    results = [r for r in results if bool(r["exact_match"]) is wanted_match]
if not results:
    st.warning("No results match the current filter.")
    st.stop()

questions = [f"[{r['index']}] {r['question']}" for r in results]
selected_idx = st.selectbox(
    "Select a query",
    range(len(questions)),
    format_func=lambda i: questions[i],
)
row = results[selected_idx]

if row["exact_match"]:
    match_label, match_color = "MATCH", "green"
else:
    match_label, match_color = "MISMATCH", "red"
st.markdown(f"### :{match_color}[{match_label}]")

# Formatted SQL side-by-side
sql_panels = (("**Expected SQL**", "expected_sql"), ("**Predicted SQL**", "predicted_sql"))
for column, (heading, sql_key) in zip(st.columns(2), sql_panels):
    with column:
        st.markdown(heading)
        st.code(format_sql(row[sql_key]), language="sql")

# Diff view
if not row["exact_match"]:
    with st.expander("SQL Diff", expanded=True):
        diff_html = sql_diff_html(row["expected_sql"], row["predicted_sql"])
        diff_css = """
        <style>
        .diff_add { background-color: rgba(40, 167, 69, 0.15); }
        .diff_sub { background-color: rgba(220, 53, 69, 0.15); }
        .diff_chg { background-color: rgba(255, 193, 7, 0.15); }
        .diff_header { background-color: rgba(128, 128, 128, 0.1); font-weight: bold; }
        table.diff { border-collapse: collapse; width: 100%; font-family: monospace; color: inherit; }
        table.diff td, table.diff th { padding: 4px 8px; border: 1px solid rgba(128, 128, 128, 0.2); }
        </style>
        """
        st.html(f"{diff_css}<div style='overflow-x:auto; font-size:13px;'>{diff_html}</div>")

# Auto-execute both SQLs and show maps
con = get_duckdb_connection()
map_panels = (
    ("**Expected result**", "expected_sql", [40, 180, 160, 140], "map_expected"),
    ("**Predicted result**", "predicted_sql", [180, 80, 60, 140], "map_predicted"),
)
for column, (heading, sql_key, fill_color, map_key) in zip(st.columns(2), map_panels):
    with column:
        st.markdown(heading)
        sql = rewrite_data_paths(row[sql_key])
        # Any failure is shown in the panel so the other panel still renders.
        try:
            df = execute_sql(con, sql)
            geojson = to_feature_collection(df)
            render_map(geojson, fill_color, key=map_key)
            with st.expander("Result table"):
                st.dataframe(df, width="stretch")
        except Exception as e:
            st.error(f"Execution error: {e}")
con.close()

finetune/infer_modal.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""Modal eval script: run a model on the test set and save results.
+Usage
+-----
+# Eval finetuned model (uses raw prompt-completion format):
+modal run finetune/infer_modal.py --label finetuned
+# Eval base model (uses chat template so the model understands the instruction):
+modal run finetune/infer_modal.py \
+    --model-path google/gemma-3-270m-it \
+    --label base \
+    --use-chat-template
+# Limit samples:
+modal run finetune/infer_modal.py --max-samples 50
+"""
+from __future__ import annotations
+import json
+import pathlib
+from datetime import datetime
+from typing import Optional
+import modal
# Modal application that hosts the evaluation functions.
app = modal.App("gazet-nlg-eval")

# Container image, built step by step: Python 3.11 base, inference
# dependencies, the local `finetune` package, and HF_HOME set to a directory
# below the /mnt/gazet volume mount.
infer_image = modal.Image.debian_slim(python_version="3.11")
infer_image = infer_image.pip_install(
    "accelerate>=1.0",
    "pandas>=2.2",
    "torch>=2.4",
    "transformers>=4.46",
)
infer_image = infer_image.add_local_python_source("finetune", copy=True)
infer_image = infer_image.env({"HF_HOME": "/mnt/gazet/model_cache"})

# Named volume "gazet" (created on first use), mounted at /mnt/gazet.
gazet_vol = modal.Volume.from_name("gazet", create_if_missing=True)
VOLUMES = {"/mnt/gazet": gazet_vol}

# Model evaluated when --model-path is not given.
DEFAULT_MODEL_PATH = "/mnt/gazet/checkpoints/gemma-3-270m-it-r16-20260331-134642/merged"
def postprocess_sql(text: str) -> str:
    """Extract the bare SQL statement from raw model output.

    Removes a Markdown code fence around the query. The ``sql`` language tag
    is matched case-insensitively, so "```SQL" no longer leaves the tag at the
    start of the result. Text before an opening "```sql" fence and everything
    from the closing fence onwards is dropped.

    Args:
        text: Decoded model generation.

    Returns:
        The query with fences and surrounding whitespace removed.
    """
    import re

    cleaned = text.strip()
    # re.ASCII keeps the case-insensitive match limited to plain s/q/l.
    fence = re.search(r"```sql", cleaned, flags=re.IGNORECASE | re.ASCII)
    if fence:
        cleaned = cleaned[fence.end():]
    if cleaned.startswith("```"):
        # Untagged opening fence.
        cleaned = cleaned[3:]
    if "```" in cleaned:
        # Cut at the closing fence; anything after it is commentary.
        cleaned = cleaned.split("```", 1)[0]
    return cleaned.strip()
@app.function(
    image=infer_image,
    gpu="L40S",
    volumes=VOLUMES,
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=60 * 60,
)
def run_eval(
    model_path: str,
    label: str,
    samples: list[dict],
    output_path: str,
    max_new_tokens: int = 512,
    batch_size: int = 16,
    use_chat_template: bool = False,
):
    """Generate SQL for every sample, score exact matches, save a JSON report.

    Args:
        model_path: Name or path given to ``from_pretrained``.
        label: Free-form tag stored in the report summary.
        samples: Rows with ``question``, ``candidates`` and optionally
            ``target.sql``.
        output_path: Destination of the JSON report on the volume.
        max_new_tokens: Generation budget per sample.
        batch_size: Number of prompts generated together.
        use_chat_template: Wrap the prompt in the tokenizer's chat template
            as a single user turn instead of passing it raw.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from finetune.prompts import SYSTEM_PROMPT, build_user_prompt, DEFAULT_SCHEMA_DETAILS

    print(f"Loading model [{label}]: {model_path}")
    print(f"Chat template: {use_chat_template}")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Left padding: every row's generated tokens then start at the same
    # offset (the padded input length), which the decoding below relies on.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
        device_map="auto",
    )
    model.eval()

    def render_prompt(sample):
        """Build the full prompt text for one sample."""
        user_content = build_user_prompt(
            question=sample["question"],
            candidates=sample["candidates"],
            schema_details=DEFAULT_SCHEMA_DETAILS,
        )
        body = SYSTEM_PROMPT + "\n\n" + user_content
        if not use_chat_template:
            return body
        return tokenizer.apply_chat_template(
            [{"role": "user", "content": body}],
            tokenize=False,
            add_generation_prompt=True,
        )

    # Build all prompts upfront
    prompts = [render_prompt(sample) for sample in samples]

    # Batched inference
    all_predictions = []
    num_batches = (len(prompts) + batch_size - 1) // batch_size
    for batch_idx in range(num_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, len(prompts))
        batch_prompts = prompts[start:end]

        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048,
        ).to(model.device)
        input_len = inputs["input_ids"].shape[1]

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        for sequence in outputs[: len(batch_prompts)]:
            # Everything after the padded prompt is newly generated text.
            generated = tokenizer.decode(sequence[input_len:], skip_special_tokens=True)
            all_predictions.append(postprocess_sql(generated))
        print(f"Batch {batch_idx+1}/{num_batches} done ({end}/{len(prompts)} samples)")

    # Build results
    results = []
    for i, sample in enumerate(samples):
        expected = sample.get("target", {}).get("sql", "")
        predicted = all_predictions[i]
        results.append({
            "index": i,
            "question": sample["question"],
            "candidates": sample["candidates"],
            "expected_sql": expected,
            "predicted_sql": predicted,
            "exact_match": predicted.strip() == expected.strip(),
        })

    total = len(results)
    matches = sum(1 for entry in results if entry["exact_match"])
    exact_match_rate = matches / total if total else 0

    output = {
        "summary": {
            "label": label,
            "model_path": model_path,
            "num_samples": total,
            "exact_matches": matches,
            "exact_match_rate": exact_match_rate,
            "timestamp": datetime.now().isoformat(),
        },
        "results": results,
    }

    path = pathlib.Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        json.dump(output, f, indent=2)
    # Persist the written report to the Modal volume.
    gazet_vol.commit()

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"[{label}] {matches}/{total} exact matches ({100*exact_match_rate:.1f}%)")
    print(f"Results saved to {output_path}")
    print(rule)
@app.function(
    image=infer_image,
    volumes=VOLUMES,
)
def read_test_data(test_jsonl: str) -> list[dict]:
    """Read test JSONL from the volume.

    Each non-blank line is parsed as one JSON document. Blank lines (for
    example a trailing empty line) are skipped instead of raising
    ``JSONDecodeError``. The file is decoded as UTF-8.
    """
    with open(test_jsonl, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
@app.local_entrypoint()
def main(
    model_path: str = DEFAULT_MODEL_PATH,
    label: str = "finetuned",
    test_jsonl: str = "/mnt/gazet/data/output/test.jsonl",
    max_samples: Optional[int] = None,
    max_new_tokens: int = 512,
    batch_size: int = 16,
    use_chat_template: bool = False,
    output_dir: str = "/mnt/gazet/eval_results",
):
    """Local entrypoint: load the test set remotely, then run the evaluation.

    The report is written to ``<output_dir>/eval-<label>.json``.
    """
    for line in (
        f"Model: {model_path}",
        f"Label: {label}",
        f"Chat template: {use_chat_template}",
        "Loading test data...",
    ):
        print(line)

    samples = read_test_data.remote(test_jsonl)
    if max_samples:
        # A limit of None or 0 keeps the full test set.
        samples = samples[:max_samples]
    print(f"Eval samples: {len(samples)}")

    output_file = f"{output_dir}/eval-{label}.json"
    run_eval.remote(
        model_path,
        label,
        samples,
        output_file,
        max_new_tokens,
        batch_size,
        use_chat_template,
    )

finetune/prompts.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""Prompt templates and message formatting for natural language geocoding."""
+from __future__ import annotations
+from typing import Any, Dict, Sequence
+import pandas as pd
# Instruction that prefixes every prompt.
SYSTEM_PROMPT = "You are a text to SQL query translator that helps in natural language geocoding."

# Per-sample prompt body; filled via str.format with the placeholders
# schema_details, candidates_csv and question.
USER_PROMPT_TEMPLATE = """GIVEN the <SCHEMA_DETAILS>, <CANDIDATES> and <USER_QUERY>, generate the corresponding SQL command to retrieve the desired geometry.
<SCHEMA_DETAILS>
{schema_details}
</SCHEMA_DETAILS>
<CANDIDATES>
{candidates_csv}
</CANDIDATES>
<USER_QUERY>
{question}
</USER_QUERY>
"""

# Schema description used when no schema file is supplied.
DEFAULT_SCHEMA_DETAILS = """1. divisions_area  -- Overture polygon/multipolygon admin boundaries
   path: '/data/overture/division_area/*.parquet'
   columns:
     id VARCHAR
     names STRUCT("primary" VARCHAR, ...)
     country VARCHAR
     subtype VARCHAR
     class VARCHAR
     region VARCHAR
     admin_level INTEGER
     division_id VARCHAR
     is_land BOOLEAN
     is_territorial BOOLEAN
     geometry GEOMETRY
2. natural_earth  -- Natural Earth geography polygons
   path: '/data/natural_earth_geoparquet/ne_geography.parquet'
   columns:
     id VARCHAR
     name VARCHAR
     featurecla VARCHAR
     scalerank INTEGER
     min_zoom DOUBLE
     geometry GEOMETRY"""
def candidates_to_csv(candidates: Sequence[Dict[str, Any]]) -> str:
    """Serialize candidate rows to CSV text without the ``candidate_id`` column.

    The header row is included and the DataFrame index is left out.
    """
    table = pd.DataFrame(list(candidates))
    # errors="ignore" makes the drop a no-op when the column is absent.
    table = table.drop(columns=["candidate_id"], errors="ignore")
    return table.to_csv(index=False)
def build_user_prompt(
    question: str,
    candidates: Sequence[Dict[str, Any]],
    schema_details: str,
) -> str:
    """Fill ``USER_PROMPT_TEMPLATE`` with schema, candidate CSV and question.

    Each of the three parts is stripped of surrounding whitespace first.
    """
    fields = {
        "schema_details": schema_details.strip(),
        "candidates_csv": candidates_to_csv(candidates).strip(),
        "question": question.strip(),
    }
    return USER_PROMPT_TEMPLATE.format(**fields)
def make_prompt_completion(
    sample: Dict[str, Any],
    schema_details: str,
) -> Dict[str, str]:
    """Turn one dataset row into a ``{"prompt", "completion"}`` pair.

    The prompt is ``SYSTEM_PROMPT``, a blank line, then the user prompt built
    from the row's ``question`` and ``candidates``. The completion is
    ``sample["target"]["sql"]``.

    A missing or null ``target`` or ``sql`` yields an empty completion, so
    the completion is always a string.
    """
    prompt = SYSTEM_PROMPT + "\n\n" + build_user_prompt(
        question=sample["question"],
        candidates=sample["candidates"],
        schema_details=schema_details,
    )
    # `or` also covers keys that are present but hold None.
    target = sample.get("target") or {}
    completion = target.get("sql") or ""
    return {"prompt": prompt, "completion": completion}

finetune/train_modal.py ADDED Viewed

	@@ -0,0 +1,279 @@

+"""Modal training script for text-to-SQL LoRA finetuning.
+Usage
+-----
+modal run finetune/train_modal.py \
+    --train-jsonl /data/train.jsonl \
+    --val-jsonl /data/val.jsonl \
+    --base-model google/gemma-3-1b-it
+All CLI arguments map to TrainingConfig fields. Run with --help for details.
+"""
+from __future__ import annotations
+import pathlib
+from typing import Optional
+import modal
# Modal application that hosts the training function.
app = modal.App("gazet-nlg-finetune")

# Resources requested for the training function.
GPU_TYPE = "A100-80GB"  # "L40S"
TIMEOUT_HOURS = 6
MAX_RETRIES = 1

# Container image, built step by step: Python 3.11 base, training
# dependencies, the local `finetune` package, then environment variables.
train_image = modal.Image.debian_slim(python_version="3.11")
train_image = train_image.pip_install(
    "accelerate>=1.0",
    "datasets>=3.0",
    "hf-transfer>=0.1",
    "huggingface_hub>=0.25",
    "jinja2>=3.0",
    "pandas>=2.2",
    "peft>=0.13",
    "torch>=2.4",
    "trackio[gpu]",
    "transformers>=4.46",
    "trl>=0.12",
)
train_image = train_image.add_local_python_source("finetune", copy=True)
train_image = train_image.env(
    {"HF_HOME": "/mnt/gazet/model_cache", "HF_HUB_ENABLE_HF_TRANSFER": "1"}
)

# Heavy dependencies are imported under the image's import context manager.
with train_image.imports():
    import torch
    from datasets import DatasetDict
    from peft import LoraConfig
    from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
    from trl import SFTConfig, SFTTrainer

# Named volume "gazet" (created on first use), mounted at /mnt/gazet.
gazet_vol = modal.Volume.from_name("gazet", create_if_missing=True)
VOLUMES = {"/mnt/gazet": gazet_vol}
def _load_tokenizer(model_name: str):
    """Load the tokenizer for ``model_name`` and make sure it has a pad token.

    When the tokenizer defines no pad token, the EOS token is reused for it.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    needs_pad_token = tok.pad_token is None
    if needs_pad_token:
        tok.pad_token = tok.eos_token
    return tok
def _load_model(model_name: str):
    """Load ``model_name`` as a causal LM in bfloat16 with SDPA attention."""
    load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "sdpa",
        "device_map": "auto",
    }
    return AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
def _build_lora_config(config) -> LoraConfig:
    """Translate the LoRA fields of ``config`` into a peft ``LoraConfig``.

    Rank, alpha, dropout and target modules come from ``config``; bias
    handling and task type are fixed to ``"none"`` and ``"CAUSAL_LM"``.
    """
    from_config = {
        "r": config.lora_r,
        "lora_alpha": config.lora_alpha,
        "lora_dropout": config.lora_dropout,
        "target_modules": config.target_modules,
    }
    return LoraConfig(bias="none", task_type="CAUSAL_LM", **from_config)
def _load_and_format_dataset(config) -> DatasetDict:
    """Load the JSONL splits, format them for SFT and apply sample limits.

    ``config.max_train_samples`` caps the ``train`` split and
    ``config.max_eval_samples`` caps the ``val`` split (when present); a
    limit of ``None`` leaves the split untouched.
    """
    from finetune.data import (
        format_dataset_for_sft,
        load_jsonl_splits,
        read_text,
    )
    from finetune.prompts import DEFAULT_SCHEMA_DETAILS

    def first_rows(split, limit):
        """Keep at most ``limit`` leading rows of ``split``."""
        return split.select(range(min(limit, len(split))))

    schema_details = read_text(config.schema_file, DEFAULT_SCHEMA_DETAILS)
    raw_ds = load_jsonl_splits(config.train_jsonl, config.val_jsonl, config.test_jsonl)
    ds = format_dataset_for_sft(raw_ds, schema_details)

    if config.max_train_samples is not None:
        ds["train"] = first_rows(ds["train"], config.max_train_samples)
    if "val" in ds and config.max_eval_samples is not None:
        ds["val"] = first_rows(ds["val"], config.max_eval_samples)
    return ds
+def _find_latest_checkpoint(checkpoint_dir: pathlib.Path) -> str | None:
+    if not checkpoint_dir.exists():
+        return None
+    checkpoints = list(checkpoint_dir.glob("checkpoint-*"))
+    if not checkpoints:
+        return None
+    latest = max(checkpoints, key=lambda p: int(p.name.split("-")[1]))
+    print(f"Found existing checkpoint: {latest}")
+    return str(latest)
+@app.function(
+    image=train_image,
+    gpu=GPU_TYPE,
+    volumes=VOLUMES,
+    secrets=[modal.Secret.from_name("huggingface-secret")],
+    timeout=TIMEOUT_HOURS * 60 * 60,
+    retries=modal.Retries(initial_delay=0.0, max_retries=MAX_RETRIES),
+)
+def finetune(config_dict: dict):
+    """Run LoRA SFT training inside a Modal container."""
+    from finetune.config import TrainingConfig
+    config = TrainingConfig(**config_dict)
+    set_seed(config.seed)
+    experiment_dir = pathlib.Path("/mnt/gazet/checkpoints") / config.experiment_name
+    experiment_dir.mkdir(parents=True, exist_ok=True)
+    print(f"Experiment: {config.experiment_name}")
+    print(f"Model: {config.base_model}")
+    # Model and tokenizer
+    tokenizer = _load_tokenizer(config.base_model)
+    model = _load_model(config.base_model)
+    # Dataset
+    ds = _load_and_format_dataset(config)
+    print(f"Train samples: {len(ds['train']):,}")
+    if "val" in ds:
+        print(f"Val samples: {len(ds['val']):,}")
+    # LoRA
+    peft_config = _build_lora_config(config)
+    # SFT config
+    sft_args = SFTConfig(
+        output_dir=str(experiment_dir),
+        max_length=config.max_length,
+        packing=config.packing,
+        num_train_epochs=config.num_train_epochs,
+        per_device_train_batch_size=config.per_device_train_batch_size,
+        per_device_eval_batch_size=config.per_device_eval_batch_size,
+        gradient_accumulation_steps=config.gradient_accumulation_steps,
+        gradient_checkpointing=config.gradient_checkpointing,
+        optim=config.optim,
+        logging_steps=config.logging_steps,
+        save_strategy=config.save_strategy,
+        save_steps=config.save_steps,
+        eval_strategy=config.eval_strategy,
+        eval_steps=config.eval_steps,
+        learning_rate=config.learning_rate,
+        bf16=True,
+        max_grad_norm=config.max_grad_norm,
+        warmup_steps=config.warmup_steps,
+        lr_scheduler_type=config.lr_scheduler_type,
+        weight_decay=config.weight_decay,
+        report_to=config.report_to,
+        trackio_space_id=config.trackio_space_id,
+        project=config.project,
+        completion_only_loss=config.completion_only_loss,
+        dataset_num_proc=config.dataset_num_proc,
+        seed=config.seed,
+    )
+    trainer = SFTTrainer(
+        model=model,
+        args=sft_args,
+        train_dataset=ds["train"],
+        eval_dataset=ds.get("val"),
+        peft_config=peft_config,
+        processing_class=tokenizer,
+    )
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"Total parameters: {total_params:,}")
+    print(f"Trainable parameters: {trainable_params:,}")
+    # Resume from checkpoint if available (handles preemption)
+    resume_from = _find_latest_checkpoint(experiment_dir)
+    if resume_from:
+        print(f"Resuming from {resume_from}")
+    trainer.train(resume_from_checkpoint=resume_from)
+    # Save final adapter + tokenizer
+    print(f"Saving adapter to {experiment_dir}")
+    trainer.save_model(str(experiment_dir))
+    tokenizer.save_pretrained(str(experiment_dir))
+    gazet_vol.commit()
+    # Optionally merge adapter into base model
+    if config.merge_after_training:
+        _merge_and_save(config, experiment_dir)
+    print(f"Training complete: {config.experiment_name}")
+    return config.experiment_name
+def _merge_and_save(config, experiment_dir: pathlib.Path):
+    from peft import PeftModel
+    merged_dir = experiment_dir / "merged"
+    merged_dir.mkdir(parents=True, exist_ok=True)
+    base = AutoModelForCausalLM.from_pretrained(
+        config.base_model,
+        device_map="cpu",
+    )
+    peft_model = PeftModel.from_pretrained(base, str(experiment_dir))
+    merged = peft_model.merge_and_unload()
+    merged.save_pretrained(str(merged_dir), safe_serialization=True, max_shard_size="2GB")
+    tokenizer = _load_tokenizer(config.base_model)
+    tokenizer.save_pretrained(str(merged_dir))
+    gazet_vol.commit()
+    print(f"Merged model saved to {merged_dir}")
+# ---------------------------------------------------------------------------
+# Local entrypoint
+# ---------------------------------------------------------------------------
+@app.local_entrypoint()
+def main(
+    base_model: Optional[str] = None,
+    experiment_name: Optional[str] = None,
+    per_device_train_batch_size: Optional[int] = None,
+    max_train_samples: Optional[int] = None,
+    max_eval_samples: Optional[int] = None,
+    num_train_epochs: Optional[int] = None,
+    lora_r: Optional[int] = None,
+    max_length: Optional[int] = None,
+):
+    from finetune.config import TrainingConfig
+    overrides = {
+        k: v for k, v in dict(
+            base_model=base_model,
+            experiment_name=experiment_name,
+            per_device_train_batch_size=per_device_train_batch_size,
+            max_train_samples=max_train_samples,
+            max_eval_samples=max_eval_samples,
+            num_train_epochs=num_train_epochs,
+            lora_r=lora_r,
+            max_length=max_length,
+        ).items() if v is not None
+    }
+    config = TrainingConfig(**overrides)
+    print(f"Starting experiment: {config.experiment_name}")
+    print(f"Model: {config.base_model}")
+    print(f"LoRA: r={config.lora_r}, alpha={config.lora_alpha}")
+    effective_batch = config.per_device_train_batch_size * config.gradient_accumulation_steps
+    print(f"Effective batch size: {effective_batch}")
+    result = finetune.remote(config.__dict__)
+    print(f"Training complete: {result}")