| |
| """End-to-end pipeline: CL-native macro discovery + expansion + classification. |
| |
| 1. For each library, runs SBCL with generate.lisp to produce verified JSONL |
| 2. Merges with macro definitions from Phase 1 extractions |
| 3. Classifies and builds train/val/test splits |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import subprocess |
| import sys |
| from pathlib import Path |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent / "src")) |
|
|
| from cl_macros.classifier import classify_all, quality_report, detect_techniques, detect_category, assess_complexity, assess_capture_risk |
| from cl_macros.dataset import build_dataset, save_dataset, build_training_record |
| from cl_macros.ext.library_index import LibraryIndex |
| from cl_macros.knowledge_base import ALL_EXAMPLES |
| from cl_macros.schema import ( |
| TransformationExample, |
| Complexity, |
| Source, |
| ) |
|
|
| SBCL = "sbcl" |
| GENERATE_LISP = Path(__file__).parent.parent / "src" / "cl_macros" / "verif" / "generate.lisp" |
| OUTPUT_DIR = Path("data/generated") |
|
|
|
|
def run_sbcl(lib_name: str, output_path: Path) -> bool:
    """Run SBCL with generate.lisp to produce verified JSONL for one library.

    Args:
        lib_name: Library name handed to the Lisp entry point ``lol-gen:main``.
        output_path: Path where the Lisp side writes its JSONL records.

    Returns:
        True when SBCL exits with status 0; False on a non-zero exit,
        a timeout (300 s), or when the ``sbcl`` binary is not installed.
        Previously a timeout or missing binary raised and aborted the whole
        pipeline loop in main(); a per-library failure should not do that.
    """
    try:
        result = subprocess.run(
            [
                SBCL, "--noinform", "--non-interactive",
                "--load", str(GENERATE_LISP),
                # NOTE(review): lib_name is interpolated into a Lisp form;
                # assumes library names never contain '"' — confirm upstream.
                "--eval", f'(lol-gen:main "{lib_name}" "{output_path}")',
            ],
            capture_output=True, text=True, timeout=300,
        )
    except subprocess.TimeoutExpired:
        # Hung SBCL: treat like any other failed run for this library.
        return False
    except FileNotFoundError:
        # SBCL binary not on PATH.
        return False
    return result.returncode == 0
|
|
|
|
def load_macro_defs(lib_name: str) -> dict[str, str]:
    """Load macro name -> full definition from a Phase 1 extraction file.

    Reads ``data/extractions/<lib_name>_extractions.jsonl`` (one JSON object
    per line with ``macro_name`` and ``macro_definition`` fields).

    Args:
        lib_name: Library whose extractions to load.

    Returns:
        Mapping of upper-cased macro name to its full definition text.
        Empty dict when the extraction file does not exist.
    """
    ext_path = Path("data/extractions") / f"{lib_name}_extractions.jsonl"
    defs: dict[str, str] = {}
    if not ext_path.exists():
        return defs
    with open(ext_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines would crash json.loads; skip them, matching
                # how main() reads the generated JSONL files.
                continue
            rec = json.loads(line)
            name = rec.get("macro_name", "")
            full = rec.get("macro_definition", "")
            if name and full:
                # Upper-cased keys: CL symbol names are case-insensitive
                # and the SBCL output reports them upper-case.
                defs[name.upper()] = full
    return defs
|
|
|
|
def source_for_lib(lib_name: str) -> Source:
    """Resolve a library name to its Source enum member.

    Any name not in the known table falls back to Source.OTHER_LIBRARY.
    """
    known_sources = {
        "alexandria": Source.ALEXANDRIA,
        "serapeum": Source.SERAPEUM,
        "anaphora": Source.ANAPHORA,
        "iterate": Source.ITERATE,
        "trivia": Source.TRIVIA,
        "arrow-macros": Source.ARROW_MACROS,
        "modf": Source.MODF,
        "access": Source.ACCESS,
        "for": Source.FOR,
        "cl-interpol": Source.CL_INTERPOL,
        "screamer": Source.SCREAMER,
        "coalton": Source.COALTON,
        "nhooks": Source.NHOOKS,
        "generic-cl": Source.GENERIC_CL,
    }
    try:
        return known_sources[lib_name]
    except KeyError:
        return Source.OTHER_LIBRARY
|
|
|
|
def build_examples(records: list[dict], lib_name: str) -> list[TransformationExample]:
    """Turn raw SBCL expansion records into TransformationExample objects.

    Keeps only records whose status is "verified", drops duplicate
    (macro, expansion) pairs, and drops no-op expansions where the call
    form equals its own expansion.
    """
    definitions = load_macro_defs(lib_name)
    lib_source = source_for_lib(lib_name)
    out: list[TransformationExample] = []
    seen_keys: set[tuple[str, str]] = set()

    for record in records:
        if record.get("status") != "verified":
            continue

        name = record["macro_name"]
        call = record["call_form"]
        expansion = record.get("expanded", "")

        # Deduplicate on (macro, expansion): the same expansion reached via
        # different call forms adds no new signal.
        dedupe_key = (name, expansion)
        if dedupe_key in seen_keys:
            continue
        seen_keys.add(dedupe_key)

        # A call that expands to itself teaches nothing about the macro.
        if call.strip() == expansion.strip():
            continue

        out.append(
            TransformationExample(
                id=f"{lib_name}-{name}-{len(out)}",
                before_code=call,
                problem_pattern=f"Macro call that should be transformed by {name}",
                # Fall back to a stub definition when Phase 1 has no
                # extraction for this macro.
                macro_definition=definitions.get(name.upper(), f"(defmacro {name} ...)"),
                after_expansion=expansion,
                macro_category=None,
                technique=[],
                source=lib_source,
                complexity=Complexity.BASIC,
                library_name=lib_name,
                macro_name=name,
                is_verified=True,
                macroexpand_1_result=expansion,
                formulation="macro-from-usage",
            )
        )

    return out
|
|
|
|
def main():
    """CLI entry point: generate, classify, and split the macro dataset.

    Flow:
      1. Select libraries (one via --library, or a whole tier via --tier).
      2. For each library, run SBCL unless its JSONL output already exists,
         then convert verified records into TransformationExamples.
      3. Classify all examples (seed knowledge base + generated), build
         train/val/test splits, and write generation stats to disk.
    """
    import argparse
    ap = argparse.ArgumentParser()
    # --tier picks a group from the library index; ignored when --library
    # names a single library explicitly.
    ap.add_argument("--tier", default="tier1")
    ap.add_argument("--library", default=None)
    args = ap.parse_args()

    idx = LibraryIndex()
    if args.library:
        # NOTE(review): assumes idx.get() returns a library object (never
        # None) for a user-supplied name — confirm LibraryIndex's contract.
        libs = [idx.get(args.library)]
    else:
        libs = idx.list_libraries(args.tier)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Start from the hand-curated seed examples, then append generated ones.
    all_examples = list(ALL_EXAMPLES)
    stats = {}  # lib name -> number of valid examples produced

    for lib in libs:
        print(f"\n--- {lib.name} ({lib.tier}) ---")
        output_path = OUTPUT_DIR / f"{lib.name}_generated.jsonl"

        # Existing output acts as a cache: delete the file to force a re-run.
        if not output_path.exists():
            print(f" Running SBCL...")
            if not run_sbcl(lib.name, output_path):
                print(f" SBCL FAILED for {lib.name}")
                continue

        # One JSON object per non-blank line.
        with open(output_path) as f:
            records = [json.loads(line) for line in f if line.strip()]

        verified = sum(1 for r in records if r.get("status") == "verified")
        errors = sum(1 for r in records if r.get("status") != "verified")
        print(f" Records: {len(records)} ({verified} verified, {errors} errors)")

        examples = build_examples(records, lib.name)
        print(f" Valid examples: {len(examples)}")
        all_examples.extend(examples)
        stats[lib.name] = len(examples)

    print(f"\n=== Classification ===")
    print(f"Total examples: {len(all_examples)}")
    classified = classify_all(all_examples)
    report = quality_report(classified)
    print(f"Mean score: {report['mean_score']:.3f}")
    print(f"Categories: {report['category_distribution']}")

    # Examples scoring below min_quality are excluded from the splits.
    print(f"\n=== Building splits ===")
    train, val, test = build_dataset(classified, min_quality=0.3)
    print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

    split_dir = Path("data/splits")
    paths = save_dataset(train, val, test, split_dir)
    for name, p in paths.items():
        print(f" {name}: {p}")

    # Persist per-library counts plus the quality report for later inspection.
    stat_path = OUTPUT_DIR / "generation_stats.json"
    with open(stat_path, "w") as f:
        json.dump({"stats": stats, "report": report}, f, indent=2)
    print(f"\nStats: {stat_path}")


if __name__ == "__main__":
    main()
|
|