# cl-ds / src / cl_macros / schema.py
# j14i's picture
# 977 CL macro transformation examples: CL-native pipeline with SBCL verification
# d69fc90 verified
"""Dataset schema for CL macro transformation pairs — using stdlib dataclasses."""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
class MacroCategory(str, Enum):
    """Closed set of category labels for a macro transformation example.

    Members also subclass ``str``, so each one compares equal to its raw
    value (``MacroCategory.DSL == "dsl"``) and ``json.dumps`` emits it as a
    plain string. Look a member up from its value with
    ``MacroCategory("dsl")``.
    """

    CONTROL_FLOW = "control-flow"
    DSL = "dsl"
    ANAPHORIC = "anaphoric"
    CAPTURE_MANAGEMENT = "capture-management"
    CODE_WALKING = "code-walking"
    RECURSIVE_EXPANSION = "recursive-expansion"
    DISPATCH = "dispatch"
    READ_MACRO = "read-macro"
    INDIRECTION = "indirection"
    SCOPE = "scope"
    EFFICIENCY = "efficiency"
    COMPILER_MACRO = "compiler-macro"
class Complexity(str, Enum):
    """Three-level complexity label for an example.

    Members subclass ``str`` and therefore compare equal to their raw
    values (``"basic"``, ``"intermediate"``, ``"advanced"``).
    """

    BASIC = "basic"
    INTERMEDIATE = "intermediate"
    ADVANCED = "advanced"
class Source(str, Enum):
    """Label for where an example originates.

    Most values look like the names of books or Common Lisp libraries;
    ``SYNTHETIC`` and ``OTHER_LIBRARY`` are the generic entries. Members
    subclass ``str`` and compare equal to their raw values.
    """

    LET_OVER_LAMBDA = "let-over-lambda"
    SYNTHETIC = "synthetic"
    CL_UTILITIES = "cl-utilities"
    ALEXANDRIA = "alexandria"
    ON_LISP = "on-lisp"
    SERAPEUM = "serapeum"
    ITERATE = "iterate"
    ANAPHORA = "anaphora"
    TRIVIA = "trivia"
    ARROW_MACROS = "arrow-macros"
    MODF = "modf"
    ACCESS = "access"
    FOR = "for"
    CL_INTERPOL = "cl-interpol"
    SCREAMER = "screamer"
    COALTON = "coalton"
    NHOOKS = "nhooks"
    GENERIC_CL = "generic-cl"
    OTHER_LIBRARY = "other-library"
class MacroTechnique(str, Enum):
    """Label for a macro-writing technique used by an example.

    Members subclass ``str`` and compare equal to their raw values. The
    member name and the value differ for ``TAGBODY``, whose value is
    ``"tagbody-go"``.
    """

    NESTED_BACKQUOTE = "nested-backquote"
    GENSYM = "gensym"
    ONCE_ONLY = "once-only"
    RECURSIVE_EXPANSION = "recursive-expansion"
    CODE_WALKING = "code-walking"
    DLAMBDA = "dlambda"
    ANAPHOR = "anaphor"
    READER = "reader"
    COMPILER_MACRO = "compiler-macro"
    MACROLET = "macrolet"
    SYMBOL_MACROLET = "symbol-macrolet"
    DEFSETF = "defsetf"
    TAGBODY = "tagbody-go"
    PARSER = "parser"
PARSER = "parser"
@dataclass
class TransformationExample:
    """A single macro transformation pair for fine-tuning.

    The first six fields are required and positional; every other field
    has a default. List-valued fields use ``default_factory`` so each
    instance gets its own list.
    """

    # --- Required fields -------------------------------------------------
    id: str
    # NOTE(review): the four code fields are presumably Lisp source text
    # (code before the macro, the pattern it addresses, the macro itself,
    # and the expanded result) -- confirm against the producers.
    before_code: str
    problem_pattern: str
    macro_definition: str
    after_expansion: str
    macro_category: MacroCategory

    # --- Classification --------------------------------------------------
    technique: list[MacroTechnique] = field(default_factory=list)
    source: Source = Source.LET_OVER_LAMBDA
    source_chapter: Optional[str] = None
    complexity: Complexity = Complexity.BASIC
    has_capture_risk: bool = False
    requires_gensyms: bool = False

    # --- Library / macro identification ----------------------------------
    library_name: str = ""
    macro_name: str = ""
    call_sites: list[str] = field(default_factory=list)

    # --- Verification ----------------------------------------------------
    # NOTE(review): presumably filled in by the SBCL verification step --
    # confirm against the pipeline.
    macroexpand_1_result: Optional[str] = None
    is_verified: bool = False

    # --- Dataset metadata ------------------------------------------------
    formulation: str = "macro-from-usage"
    nl_description: Optional[str] = None
    is_synthetic: bool = False
    quality_score: Optional[float] = None
    commentary: Optional[str] = None
@dataclass
class DatasetRecord:
    """Full dataset record with instructions for fine-tuning.

    ``instruction``, ``input`` and ``output`` are required; the remaining
    fields are metadata with empty or ``None`` defaults.
    """

    instruction: str
    input: str
    output: str
    category: str = ""
    technique: str = ""
    complexity: str = ""
    quality_score: Optional[float] = None

    def model_dump_json(self) -> str:
        """Return this record serialized as a JSON object string.

        The keys are written in field-declaration order using
        ``json.dumps`` defaults, so non-ASCII characters are escaped and a
        ``None`` quality score is written as ``null``.

        NOTE(review): the method name matches pydantic's
        ``model_dump_json`` -- presumably so pydantic-style call sites
        keep working; confirm against callers.
        """
        payload = {
            "instruction": self.instruction,
            "input": self.input,
            "output": self.output,
            "category": self.category,
            "technique": self.technique,
            "complexity": self.complexity,
            "quality_score": self.quality_score,
        }
        return json.dumps(payload)
@dataclass
class RawExtractionRecord:
    """Intermediate record produced by Phase 1 extraction.

    Only ``id`` and ``library_name`` are required. Every other field
    defaults to an empty string or ``None``, except ``form_type``
    (``"defmacro"``) and ``status`` (``"extracted"``).
    """

    # --- Required fields -------------------------------------------------
    id: str
    library_name: str

    # --- Location of the extracted form ----------------------------------
    system_name: str = ""
    source_file: str = ""

    # --- The extracted form itself ---------------------------------------
    macro_name: str = ""
    macro_definition: str = ""
    form_type: str = "defmacro"
    docstring: Optional[str] = None
    args: str = ""

    # --- Bookkeeping -----------------------------------------------------
    # NOTE(review): extracted_at is a free-form string; the timestamp
    # format is not visible here -- confirm against the extractor.
    extracted_at: str = ""
    status: str = "extracted"