File size: 6,488 Bytes

d69fc90

#!/usr/bin/env python3
"""End-to-end pipeline: CL-native macro discovery + expansion + classification.

1. For each library, runs SBCL with generate.lisp to produce verified JSONL
2. Merges with macro definitions from Phase 1 extractions
3. Classifies and builds train/val/test splits
"""

from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from cl_macros.classifier import classify_all, quality_report, detect_techniques, detect_category, assess_complexity, assess_capture_risk
from cl_macros.dataset import build_dataset, save_dataset, build_training_record
from cl_macros.ext.library_index import LibraryIndex
from cl_macros.knowledge_base import ALL_EXAMPLES
from cl_macros.schema import (
    TransformationExample,
    Complexity,
    Source,
)

# Command name used to launch Steel Bank Common Lisp as a subprocess.
SBCL = "sbcl"
# Lisp driver script, located relative to this file: ../src/cl_macros/verif/generate.lisp
GENERATE_LISP = Path(__file__).parent.parent.joinpath(
    "src", "cl_macros", "verif", "generate.lisp"
)
# Directory (relative to the current working directory) for generated JSONL and stats.
OUTPUT_DIR = Path("data", "generated")


def _lisp_string(value: object) -> str:
    """Render *value* as a Common Lisp string literal (escaping ``\\`` and ``"``)."""
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def run_sbcl(lib_name: str, output_path: Path) -> bool:
    """Run SBCL with generate.lisp for one library.

    Loads ``GENERATE_LISP`` and evaluates ``(lol-gen:main <lib_name> <output_path>)``
    in a non-interactive SBCL process with a 300-second timeout.

    Args:
        lib_name: Library name passed as the first argument to ``lol-gen:main``.
        output_path: Path passed as the second argument to ``lol-gen:main``.

    Returns:
        True when SBCL exits with status 0. False when it exits non-zero, when
        the timeout expires, or when the SBCL executable cannot be started.
    """
    # Both arguments end up inside Lisp string literals; without escaping, a
    # backslash (e.g. a Windows path) or a double quote would corrupt the form.
    form = f"(lol-gen:main {_lisp_string(lib_name)} {_lisp_string(output_path)})"
    command = [
        SBCL, "--noinform", "--non-interactive",
        "--load", str(GENERATE_LISP),
        "--eval", form,
    ]

    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=300,
        )
    except subprocess.TimeoutExpired:
        # Report as an ordinary failure so one slow library does not abort the
        # caller's loop over the remaining libraries.
        print(f"  SBCL timed out for {lib_name}", file=sys.stderr)
        return False
    except OSError as exc:
        # Raised e.g. when the SBCL executable is not installed / not on PATH.
        print(f"  Could not start {SBCL}: {exc}", file=sys.stderr)
        return False

    if result.returncode != 0:
        # Surface the end of SBCL's stderr; otherwise the captured output is lost.
        detail = result.stderr.strip()
        if detail:
            print(f"  SBCL stderr (tail): {detail[-500:]}", file=sys.stderr)
        return False
    return True


def load_macro_defs(lib_name: str) -> dict[str, str]:
    """Load macro definitions from Phase 1 extraction files.

    Reads ``data/extractions/<lib_name>_extractions.jsonl`` (relative to the
    current working directory), one JSON object per line.

    Args:
        lib_name: Library name used to build the extraction file name.

    Returns:
        Mapping from upper-cased ``macro_name`` to ``macro_definition``. Records
        missing either field (or with an empty value) are ignored. Returns an
        empty dict when the extraction file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ext_path = Path("data/extractions") / f"{lib_name}_extractions.jsonl"
    defs: dict[str, str] = {}
    if not ext_path.exists():
        return defs

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(ext_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank/trailing lines are not records; json.loads("") would raise.
                continue
            rec = json.loads(line)
            name = rec.get("macro_name", "")
            full = rec.get("macro_definition", "")
            if name and full:
                # Keys are upper-cased so lookups can be case-insensitive.
                defs[name.upper()] = full
    return defs


def source_for_lib(lib_name: str) -> Source:
    """Return the ``Source`` member for a library name.

    Names are matched exactly (case-sensitive, hyphenated form). Any name not
    in the table below yields ``Source.OTHER_LIBRARY``.
    """
    known_sources = dict(
        [
            ("alexandria", Source.ALEXANDRIA),
            ("serapeum", Source.SERAPEUM),
            ("anaphora", Source.ANAPHORA),
            ("iterate", Source.ITERATE),
            ("trivia", Source.TRIVIA),
            ("arrow-macros", Source.ARROW_MACROS),
            ("modf", Source.MODF),
            ("access", Source.ACCESS),
            ("for", Source.FOR),
            ("cl-interpol", Source.CL_INTERPOL),
            ("screamer", Source.SCREAMER),
            ("coalton", Source.COALTON),
            ("nhooks", Source.NHOOKS),
            ("generic-cl", Source.GENERIC_CL),
        ]
    )
    fallback = Source.OTHER_LIBRARY
    try:
        return known_sources[lib_name]
    except KeyError:
        return fallback


def build_examples(records: list[dict], lib_name: str) -> list[TransformationExample]:
    """Turn raw SBCL output records into ``TransformationExample`` objects.

    Only records whose ``status`` is ``"verified"`` are considered. A record is
    dropped when an earlier record had the same (macro_name, expanded) pair, or
    when its expansion equals its call form after stripping whitespace.

    Args:
        records: Parsed JSONL records; verified ones must carry ``macro_name``
            and ``call_form`` (``expanded`` defaults to an empty string).
        lib_name: Library the records belong to; used for ids, the source
            mapping and the macro-definition lookup.

    Returns:
        The examples, in input order. Category, technique and complexity are
        filled with placeholder values here.
    """
    definitions = load_macro_defs(lib_name)
    built = []
    emitted_pairs = set()

    verified_only = (r for r in records if r.get("status") == "verified")
    for record in verified_only:
        name = record["macro_name"]
        invocation = record["call_form"]
        expansion = record.get("expanded", "")

        # Identical (macro_name, expanded) pairs are emitted at most once.
        pair = (name, expansion)
        if pair in emitted_pairs:
            continue
        emitted_pairs.add(pair)

        # An expansion equal to its own input carries no transformation.
        is_trivial = invocation.strip() == expansion.strip()
        if is_trivial:
            continue

        # Fall back to a stub definition when Phase 1 has none for this macro.
        definition = definitions.get(name.upper(), f"(defmacro {name} ...)")

        built.append(
            TransformationExample(
                id=f"{lib_name}-{name}-{len(built)}",
                before_code=invocation,
                problem_pattern=f"Macro call that should be transformed by {name}",
                macro_definition=definition,
                after_expansion=expansion,
                macro_category=None,  # auto-detect
                technique=[],
                source=source_for_lib(lib_name),
                complexity=Complexity.BASIC,  # auto-detect
                library_name=lib_name,
                macro_name=name,
                is_verified=True,
                macroexpand_1_result=expansion,
                formulation="macro-from-usage",
            )
        )

    return built


def main():
    """Command-line entry point for the generation pipeline.

    Options:
        --tier     Tier passed to ``LibraryIndex.list_libraries`` (default "tier1").
        --library  Single library to process; when given, ``--tier`` is not used.

    For each selected library, SBCL is run only if its
    ``<name>_generated.jsonl`` file is not already in ``OUTPUT_DIR``. The records
    are converted to examples and appended to the hand-curated ``ALL_EXAMPLES``.
    The combined set is classified, split into train/val/test under
    ``data/splits``, and a summary is written to
    ``OUTPUT_DIR/generation_stats.json``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tier", default="tier1")
    parser.add_argument("--library", default=None)
    opts = parser.parse_args()

    index = LibraryIndex()
    selected = (
        [index.get(opts.library)]
        if opts.library
        else index.list_libraries(opts.tier)
    )

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Start from the hand-curated examples so they are always preserved.
    combined = list(ALL_EXAMPLES)
    per_library_counts = {}

    for lib in selected:
        print(f"\n--- {lib.name} ({lib.tier}) ---")
        generated_file = OUTPUT_DIR / f"{lib.name}_generated.jsonl"

        # An existing output file is treated as a cache; SBCL is not re-run.
        if not generated_file.exists():
            print("  Running SBCL...")
            succeeded = run_sbcl(lib.name, generated_file)
            if not succeeded:
                print(f"  SBCL FAILED for {lib.name}")
                continue

        # Read results, ignoring blank lines.
        records = []
        with open(generated_file) as handle:
            for raw_line in handle:
                if raw_line.strip():
                    records.append(json.loads(raw_line))

        n_verified = 0
        for rec in records:
            if rec.get("status") == "verified":
                n_verified += 1
        n_errors = len(records) - n_verified
        print(f"  Records: {len(records)} ({n_verified} verified, {n_errors} errors)")

        lib_examples = build_examples(records, lib.name)
        print(f"  Valid examples: {len(lib_examples)}")
        combined.extend(lib_examples)
        per_library_counts[lib.name] = len(lib_examples)

    # Classify
    print("\n=== Classification ===")
    print(f"Total examples: {len(combined)}")
    classified = classify_all(combined)
    report = quality_report(classified)
    print(f"Mean score: {report['mean_score']:.3f}")
    print(f"Categories: {report['category_distribution']}")

    # Build splits
    print("\n=== Building splits ===")
    train, val, test = build_dataset(classified, min_quality=0.3)
    print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

    written = save_dataset(train, val, test, Path("data/splits"))
    for split_name, split_path in written.items():
        print(f"  {split_name}: {split_path}")

    # Stats
    stats_file = OUTPUT_DIR / "generation_stats.json"
    summary = {"stats": per_library_counts, "report": report}
    with open(stats_file, "w") as out:
        json.dump(summary, out, indent=2)
    print(f"\nStats: {stats_file}")


if __name__ == "__main__":
    main()