"""Dataset schema for CL macro transformation pairs — using stdlib dataclasses.""" from __future__ import annotations import json from dataclasses import dataclass, field from enum import Enum from typing import Optional class MacroCategory(str, Enum): CONTROL_FLOW = "control-flow" DSL = "dsl" ANAPHORIC = "anaphoric" CAPTURE_MANAGEMENT = "capture-management" CODE_WALKING = "code-walking" RECURSIVE_EXPANSION = "recursive-expansion" DISPATCH = "dispatch" READ_MACRO = "read-macro" INDIRECTION = "indirection" SCOPE = "scope" EFFICIENCY = "efficiency" COMPILER_MACRO = "compiler-macro" class Complexity(str, Enum): BASIC = "basic" INTERMEDIATE = "intermediate" ADVANCED = "advanced" class Source(str, Enum): LET_OVER_LAMBDA = "let-over-lambda" SYNTHETIC = "synthetic" CL_UTILITIES = "cl-utilities" ALEXANDRIA = "alexandria" ON_LISP = "on-lisp" SERAPEUM = "serapeum" ITERATE = "iterate" ANAPHORA = "anaphora" TRIVIA = "trivia" ARROW_MACROS = "arrow-macros" MODF = "modf" ACCESS = "access" FOR = "for" CL_INTERPOL = "cl-interpol" SCREAMER = "screamer" COALTON = "coalton" NHOOKS = "nhooks" GENERIC_CL = "generic-cl" OTHER_LIBRARY = "other-library" class MacroTechnique(str, Enum): NESTED_BACKQUOTE = "nested-backquote" GENSYM = "gensym" ONCE_ONLY = "once-only" RECURSIVE_EXPANSION = "recursive-expansion" CODE_WALKING = "code-walking" DLAMBDA = "dlambda" ANAPHOR = "anaphor" READER = "reader" COMPILER_MACRO = "compiler-macro" MACROLET = "macrolet" SYMBOL_MACROLET = "symbol-macrolet" DEFSETF = "defsetf" TAGBODY = "tagbody-go" PARSER = "parser" @dataclass class TransformationExample: """A single macro transformation pair for fine-tuning.""" id: str before_code: str problem_pattern: str macro_definition: str after_expansion: str macro_category: MacroCategory technique: list[MacroTechnique] = field(default_factory=list) source: Source = Source.LET_OVER_LAMBDA source_chapter: Optional[str] = None complexity: Complexity = Complexity.BASIC has_capture_risk: bool = False requires_gensyms: bool = False library_name: str = "" macro_name: str = "" call_sites: list[str] = field(default_factory=list) macroexpand_1_result: Optional[str] = None is_verified: bool = False formulation: str = "macro-from-usage" nl_description: Optional[str] = None is_synthetic: bool = False quality_score: Optional[float] = None commentary: Optional[str] = None @dataclass class DatasetRecord: """Full dataset record with instructions for fine-tuning.""" instruction: str input: str output: str category: str = "" technique: str = "" complexity: str = "" quality_score: Optional[float] = None def model_dump_json(self) -> str: return json.dumps( { "instruction": self.instruction, "input": self.input, "output": self.output, "category": self.category, "technique": self.technique, "complexity": self.complexity, "quality_score": self.quality_score, } ) @dataclass class RawExtractionRecord: """Intermediate record produced by Phase 1 extraction.""" id: str library_name: str system_name: str = "" source_file: str = "" macro_name: str = "" macro_definition: str = "" form_type: str = "defmacro" docstring: Optional[str] = None args: str = "" extracted_at: str = "" status: str = "extracted"