# Source: cl-ds/scripts/extract_all.py
# Committer: j14i
# Commit message: 977 CL macro transformation examples: CL-native pipeline with SBCL verification
# Commit: d69fc90 (verified)
#!/usr/bin/env python3
"""Phase 1: Extract macro definitions and call sites from all libraries.
Usage: python3 scripts/extract_all.py [--tier tier1] [--library alexandria]
"""
from __future__ import annotations
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from cl_macros.ext.call_site_scanner import CallSiteScanner
from cl_macros.ext.defmacro_parser import DefmacroParser
from cl_macros.ext.library_index import LibraryIndex
from cl_macros.ext.source_fetcher import SourceFetcher
EXTRACTIONS_DIR = Path("data/extractions")
def extract_library(lib, fetcher, parser, scanner, output_dir: Path):
    """Extract macro definitions and their call sites from one library.

    Fetches the library source, parses every Lisp file for macro
    definitions, scans the same files for call sites of each macro, and
    writes two JSONL files into ``output_dir``:
    ``<lib.name>_extractions.jsonl`` and ``<lib.name>_call_sites.jsonl``.

    Args:
        lib: Library entry; ``name``, ``tier``, ``description`` and
            ``systems`` are read here.
        fetcher: Provides ``fetch(lib)`` and ``get_all_lisp_files(lib)``.
        parser: Provides ``extract_file(path)`` and an ``errors`` list.
        scanner: Provides ``find_call_sites(macro_name, files, macro_defs)``.
            The instance passed by the caller is the one used.
        output_dir: Directory for the JSONL output; created if missing.

    Returns:
        ``(macro_count, call_site_count)``. ``(0, 0)`` when fetching the
        source fails and the library is skipped (nothing is written then).
    """
    print(f"\n{'='*60}")
    print(f"Extracting from: {lib.name} ({lib.tier})")
    print(f" {lib.description}")

    # Deliberate best-effort: one unfetchable library must not abort the
    # whole run, so any fetch failure is reported and the library skipped.
    try:
        root = fetcher.fetch(lib)
    except Exception as e:
        print(f" SKIPPED: {e}")
        return 0, 0
    print(f" source: {root}")

    files = fetcher.get_all_lisp_files(lib)
    print(f" lisp files: {len(files)}")

    # Extract macro definitions
    macro_defs = []
    for lisp_file in files:
        macro_defs.extend(parser.extract_file(lisp_file))

    # NOTE(review): the caller shares one parser across libraries; if
    # parser.errors is never reset this count is cumulative -- confirm.
    if parser.errors:
        print(f" parser errors: {len(parser.errors)}")
        for err in parser.errors[:3]:
            print(f" {err}")
    print(f" macros extracted: {len(macro_defs)}")

    # Extract call sites for each macro, using the scanner supplied by the
    # caller (previously a fresh CallSiteScanner silently replaced it).
    call_site_records = []
    for mdef in macro_defs:
        for site in scanner.find_call_sites(mdef.macro_name, files, macro_defs):
            call_site_records.append({
                "library": lib.name,
                "macro_name": site.macro_name,
                "source_file": site.source_file,
                "line_number": site.line_number,
                "call_form": site.call_form,
                "context_lines": site.context_lines,
            })
    total_call_sites = len(call_site_records)
    print(f" call sites found: {total_call_sites}")

    # Build extraction records; all records of one run share a timestamp.
    ts = datetime.now(timezone.utc).isoformat()
    system_name = lib.systems[0] if lib.systems else lib.name
    out_records = [
        {
            "id": f"{lib.name}-{mdef.macro_name}",
            "library_name": lib.name,
            "system_name": system_name,
            "source_file": str(mdef.source_file) if mdef.source_file else "",
            "macro_name": mdef.macro_name,
            "macro_definition": mdef.full_form,
            "form_type": mdef.form_type,
            "docstring": mdef.docstring,
            "args": mdef.args,
            "extracted_at": ts,
            "status": "extracted",
        }
        for mdef in macro_defs
    ]

    output_dir.mkdir(parents=True, exist_ok=True)

    # Write extractions (one JSON object per line)
    ext_path = output_dir / f"{lib.name}_extractions.jsonl"
    with open(ext_path, "w", encoding="utf-8") as out:
        for rec in out_records:
            out.write(json.dumps(rec) + "\n")

    # Write call sites (one JSON object per line)
    cs_path = output_dir / f"{lib.name}_call_sites.jsonl"
    with open(cs_path, "w", encoding="utf-8") as out:
        for rec in call_site_records:
            out.write(json.dumps(rec) + "\n")

    return len(macro_defs), total_call_sites
def main():
    """Run Phase 1 extraction for the selected libraries and write a manifest.

    Command-line options:
        --tier: tier passed to ``LibraryIndex.list_libraries`` (default
            ``tier1``); used when ``--library`` is not given.
        --library: name of a single library to look up in the index; the
            process exits with status 1 if the index does not contain it.
        --output-dir: directory for per-library JSONL files and
            ``extraction_manifest.json`` (default ``EXTRACTIONS_DIR``).

    A library counts as successful when at least one macro was extracted
    from it; otherwise it is counted as failed.
    """
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--tier", default="tier1",
                    help="library tier to extract (ignored with --library)")
    ap.add_argument("--library", default=None,
                    help="extract only this library from the index")
    ap.add_argument("--output-dir", default=str(EXTRACTIONS_DIR),
                    help="directory for extraction output")
    args = ap.parse_args()

    output_dir = Path(args.output_dir)
    idx = LibraryIndex()
    fetcher = SourceFetcher()
    parser = DefmacroParser()

    if args.library:
        lib = idx.get(args.library)
        if lib is None:
            print(f"Library '{args.library}' not found in index")
            sys.exit(1)
        libs = [lib]
    else:
        libs = idx.list_libraries(args.tier)

    print(f"Extracting from {len(libs)} libraries (tier={args.tier})")
    print(f"Output: {output_dir}")

    total_macros = 0
    total_sites = 0
    successes = 0
    failures = 0
    for lib in libs:
        macros, sites = extract_library(lib, fetcher, parser, CallSiteScanner(), output_dir)
        total_macros += macros
        total_sites += sites
        if macros > 0:
            successes += 1
        else:
            failures += 1

    # Write manifest
    manifest = {
        "extracted_at": datetime.now(timezone.utc).isoformat(),
        "tier": args.tier,
        "libraries_total": len(libs),
        "libraries_successful": successes,
        "libraries_failed": failures,
        "macros_total": total_macros,
        "call_sites_total": total_sites,
        "output_dir": str(output_dir),
    }
    # extract_library only creates the directory after a successful fetch,
    # so with zero libraries or all of them skipped it may not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = output_dir / "extraction_manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    print(f"\n{'='*60}")
    print("PHASE 1 COMPLETE")
    print(f" Libraries: {successes}/{len(libs)} successful")
    print(f" Macros extracted: {total_macros}")
    print(f" Call sites found: {total_sites}")
    print(f" Manifest: {manifest_path}")
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()