cl-ds / scripts /generate_all.py
j14i's picture
977 CL macro transformation examples: CL-native pipeline with SBCL verification
d69fc90 verified
#!/usr/bin/env python3
"""End-to-end pipeline: CL-native macro discovery + expansion + classification.
1. For each library, runs SBCL with generate.lisp to produce verified JSONL
2. Merges with macro definitions from Phase 1 extractions
3. Classifies and builds train/val/test splits
"""
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from cl_macros.classifier import classify_all, quality_report, detect_techniques, detect_category, assess_complexity, assess_capture_risk
from cl_macros.dataset import build_dataset, save_dataset, build_training_record
from cl_macros.ext.library_index import LibraryIndex
from cl_macros.knowledge_base import ALL_EXAMPLES
from cl_macros.schema import (
TransformationExample,
Complexity,
Source,
)
# Executable name handed to subprocess when launching SBCL.
SBCL = "sbcl"

# Lisp driver script loaded into SBCL; located under src/cl_macros/verif
# relative to the directory two levels above this file.
GENERATE_LISP = Path(__file__).parent.parent.joinpath(
    "src", "cl_macros", "verif", "generate.lisp"
)

# Relative directory (resolved against the current working directory) that
# receives the per-library JSONL files and the generation stats file.
OUTPUT_DIR = Path("data", "generated")
def run_sbcl(lib_name: str, output_path: Path) -> bool:
    """Run SBCL with generate.lisp for one library.

    Loads ``GENERATE_LISP`` into a non-interactive SBCL process and evaluates
    ``(lol-gen:main "<lib_name>" "<output_path>")``.

    Args:
        lib_name: Library name passed as the first argument of ``lol-gen:main``.
        output_path: Path passed as the second argument of ``lol-gen:main``.

    Returns:
        True when SBCL exits with status 0. False when it exits non-zero,
        cannot be started, or exceeds the 300-second timeout.
    """

    def lisp_string(text: str) -> str:
        # Inside a Lisp string literal, backslash and double quote must be
        # escaped with a backslash; everything else is taken literally.
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    form = f"(lol-gen:main {lisp_string(lib_name)} {lisp_string(str(output_path))})"
    command = [
        SBCL, "--noinform", "--non-interactive",
        "--load", str(GENERATE_LISP),
        "--eval", form,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True, text=True, errors="replace", timeout=300,
        )
    except subprocess.TimeoutExpired:
        # A slow library must not abort the caller; report failure instead.
        print(f"  SBCL timed out for {lib_name}", file=sys.stderr)
        return False
    except OSError as err:
        # Raised when the SBCL executable is missing or cannot be executed.
        print(f"  Could not start {SBCL}: {err}", file=sys.stderr)
        return False

    if result.returncode != 0 and result.stderr:
        # Surface the end of SBCL's error output so failures are diagnosable.
        tail = result.stderr.strip().splitlines()[-5:]
        for line in tail:
            print(f"  sbcl: {line}", file=sys.stderr)
    return result.returncode == 0
def load_macro_defs(lib_name: str) -> dict[str, str]:
    """Load macro definitions from Phase 1 extraction files.

    Reads ``data/extractions/<lib_name>_extractions.jsonl`` (relative to the
    current working directory), one JSON object per line.

    Args:
        lib_name: Library name used to build the extraction file name.

    Returns:
        Mapping from upper-cased ``macro_name`` to ``macro_definition``.
        Records missing either field are ignored; a later record with the
        same name overwrites an earlier one. Empty when the file is absent.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ext_path = Path("data/extractions") / f"{lib_name}_extractions.jsonl"
    defs: dict[str, str] = {}
    if not ext_path.exists():
        return defs

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(ext_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            rec = json.loads(line)
            name = rec.get("macro_name", "")
            full = rec.get("macro_definition", "")
            if name and full:
                defs[name.upper()] = full
    return defs
def source_for_lib(lib_name: str) -> Source:
    """Return the ``Source`` member that corresponds to a library name.

    Library names without a dedicated member resolve to
    ``Source.OTHER_LIBRARY``.
    """
    by_library = dict(
        [
            ("alexandria", Source.ALEXANDRIA),
            ("serapeum", Source.SERAPEUM),
            ("anaphora", Source.ANAPHORA),
            ("iterate", Source.ITERATE),
            ("trivia", Source.TRIVIA),
            ("arrow-macros", Source.ARROW_MACROS),
            ("modf", Source.MODF),
            ("access", Source.ACCESS),
            ("for", Source.FOR),
            ("cl-interpol", Source.CL_INTERPOL),
            ("screamer", Source.SCREAMER),
            ("coalton", Source.COALTON),
            ("nhooks", Source.NHOOKS),
            ("generic-cl", Source.GENERIC_CL),
        ]
    )
    if lib_name in by_library:
        return by_library[lib_name]
    return Source.OTHER_LIBRARY
def build_examples(records: list[dict], lib_name: str) -> list[TransformationExample]:
    """Convert raw SBCL output to TransformationExample objects.

    Only records whose ``status`` is ``"verified"`` are considered. A record
    is dropped when its expansion equals its call form (ignoring surrounding
    whitespace) or when an example with the same ``(macro_name, expanded)``
    pair has already been produced.

    Args:
        records: Parsed JSONL records; each verified record must contain
            ``macro_name`` and ``call_form`` and may contain ``expanded``.
        lib_name: Library the records belong to; used for ids, the source
            enum and the macro-definition lookup.

    Returns:
        The examples built from the surviving records, in input order.

    Raises:
        KeyError: If a verified record lacks ``macro_name`` or ``call_form``.
    """
    macro_defs = load_macro_defs(lib_name)
    source = source_for_lib(lib_name)  # same for every record of this library
    examples: list[TransformationExample] = []
    seen: set[tuple[str, str]] = set()

    for rec in records:
        if rec.get("status") != "verified":
            continue

        macro_name = rec["macro_name"]
        call_form = rec["call_form"]
        expanded = rec.get("expanded", "")

        # Skip trivial expansions (same as input). This must happen before
        # the dedup bookkeeping: otherwise a trivial record would register
        # its key and suppress a later, valid record with the same expansion.
        if call_form.strip() == expanded.strip():
            continue

        # Deduplicate: skip identical (macro_name, expanded) pairs
        key = (macro_name, expanded)
        if key in seen:
            continue
        seen.add(key)

        # Use the extracted definition when available, else a stub defmacro.
        macro_def = macro_defs.get(macro_name.upper(), f"(defmacro {macro_name} ...)")

        examples.append(
            TransformationExample(
                id=f"{lib_name}-{macro_name}-{len(examples)}",
                before_code=call_form,
                problem_pattern=f"Macro call that should be transformed by {macro_name}",
                macro_definition=macro_def,
                after_expansion=expanded,
                # Placeholder; the original author marks this as auto-detected
                # (presumably by the later classification step -- confirm).
                macro_category=None,
                technique=[],
                source=source,
                # Placeholder; marked as auto-detected by the original author.
                complexity=Complexity.BASIC,
                library_name=lib_name,
                macro_name=macro_name,
                is_verified=True,
                macroexpand_1_result=expanded,
                formulation="macro-from-usage",
            )
        )
    return examples
def main() -> None:
    """Run the generation pipeline from the command line.

    Steps:
      1. Select libraries: the single ``--library`` if given, otherwise every
         library that ``LibraryIndex.list_libraries`` returns for ``--tier``.
      2. For each library, run SBCL unless its ``<name>_generated.jsonl``
         already exists in ``OUTPUT_DIR``, then build examples from it.
      3. Classify all examples (hand-curated ``ALL_EXAMPLES`` plus generated),
         build train/val/test splits and save them under ``data/splits``.
      4. Write per-library counts and the quality report to
         ``OUTPUT_DIR / "generation_stats.json"``.
    """
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--tier", default="tier1")
    ap.add_argument("--library", default=None)
    args = ap.parse_args()

    idx = LibraryIndex()
    if args.library:
        lib = idx.get(args.library)
        if lib is None:
            # Defensive: fail with a usage error rather than an
            # AttributeError later on ``lib.name``.
            ap.error(f"unknown library: {args.library}")
        libs = [lib]
    else:
        libs = idx.list_libraries(args.tier)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    all_examples = list(ALL_EXAMPLES)  # preserve hand-curated
    stats = {}

    for lib in libs:
        print(f"\n--- {lib.name} ({lib.tier}) ---")
        output_path = OUTPUT_DIR / f"{lib.name}_generated.jsonl"

        # NOTE(review): only existence is checked, so an output file left
        # behind by a failed or interrupted SBCL run is reused next time.
        if not output_path.exists():
            print(" Running SBCL...")
            if not run_sbcl(lib.name, output_path):
                print(f" SBCL FAILED for {lib.name}")
                continue
            if not output_path.exists():
                # SBCL reported success but produced nothing to read.
                print(f" SBCL FAILED for {lib.name}")
                continue

        # Read results
        with open(output_path, encoding="utf-8") as f:
            records = [json.loads(line) for line in f if line.strip()]

        verified = sum(1 for r in records if r.get("status") == "verified")
        errors = len(records) - verified
        print(f" Records: {len(records)} ({verified} verified, {errors} errors)")

        examples = build_examples(records, lib.name)
        print(f" Valid examples: {len(examples)}")
        all_examples.extend(examples)
        stats[lib.name] = len(examples)

    # Classify
    print("\n=== Classification ===")
    print(f"Total examples: {len(all_examples)}")
    classified = classify_all(all_examples)
    report = quality_report(classified)
    print(f"Mean score: {report['mean_score']:.3f}")
    print(f"Categories: {report['category_distribution']}")

    # Build splits
    print("\n=== Building splits ===")
    train, val, test = build_dataset(classified, min_quality=0.3)
    print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
    split_dir = Path("data/splits")
    paths = save_dataset(train, val, test, split_dir)
    for name, p in paths.items():
        print(f" {name}: {p}")

    # Stats
    stat_path = OUTPUT_DIR / "generation_stats.json"
    with open(stat_path, "w", encoding="utf-8") as f:
        json.dump({"stats": stats, "report": report}, f, indent=2)
    print(f"\nStats: {stat_path}")


if __name__ == "__main__":
    main()