"""Dataset schema for CL macro transformation pairs — using stdlib dataclasses."""
|
|
from __future__ import annotations

import json
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Optional
|
|
|
|
class MacroCategory(str, Enum):
    """Category labels used for ``TransformationExample.macro_category``.

    The ``str`` mix-in makes every member a real string: a member compares
    equal to its hyphenated value, and ``MacroCategory("dsl")`` looks a
    member up by that value.
    """

    CONTROL_FLOW = "control-flow"
    DSL = "dsl"
    ANAPHORIC = "anaphoric"
    CAPTURE_MANAGEMENT = "capture-management"
    CODE_WALKING = "code-walking"
    RECURSIVE_EXPANSION = "recursive-expansion"
    DISPATCH = "dispatch"
    READ_MACRO = "read-macro"
    INDIRECTION = "indirection"
    SCOPE = "scope"
    EFFICIENCY = "efficiency"
    COMPILER_MACRO = "compiler-macro"
|
|
|
|
class Complexity(str, Enum):
    """Complexity levels used for ``TransformationExample.complexity``.

    Members are ``str`` instances, so each one compares equal to its
    lowercase value.
    """

    BASIC = "basic"
    INTERMEDIATE = "intermediate"
    ADVANCED = "advanced"
|
|
|
|
class Source(str, Enum):
    """Origin labels used for ``TransformationExample.source``.

    Members are ``str`` instances, so each one compares equal to its
    hyphenated value. ``OTHER_LIBRARY`` is the catch-all entry.
    """

    LET_OVER_LAMBDA = "let-over-lambda"
    SYNTHETIC = "synthetic"
    CL_UTILITIES = "cl-utilities"
    ALEXANDRIA = "alexandria"
    ON_LISP = "on-lisp"
    SERAPEUM = "serapeum"
    ITERATE = "iterate"
    ANAPHORA = "anaphora"
    TRIVIA = "trivia"
    ARROW_MACROS = "arrow-macros"
    MODF = "modf"
    ACCESS = "access"
    FOR = "for"
    CL_INTERPOL = "cl-interpol"
    SCREAMER = "screamer"
    COALTON = "coalton"
    NHOOKS = "nhooks"
    GENERIC_CL = "generic-cl"
    OTHER_LIBRARY = "other-library"
|
|
|
|
class MacroTechnique(str, Enum):
    """Technique labels; ``TransformationExample.technique`` holds a list of them.

    Members are ``str`` instances, so each one compares equal to its
    hyphenated value.
    """

    NESTED_BACKQUOTE = "nested-backquote"
    GENSYM = "gensym"
    ONCE_ONLY = "once-only"
    RECURSIVE_EXPANSION = "recursive-expansion"
    CODE_WALKING = "code-walking"
    DLAMBDA = "dlambda"
    ANAPHOR = "anaphor"
    READER = "reader"
    COMPILER_MACRO = "compiler-macro"
    MACROLET = "macrolet"
    SYMBOL_MACROLET = "symbol-macrolet"
    DEFSETF = "defsetf"
    # The stored value is "tagbody-go", not just the lowercased member name.
    TAGBODY = "tagbody-go"
    PARSER = "parser"
|
|
|
|
@dataclass
class TransformationExample:
    """A single macro transformation pair for fine-tuning.

    The first six fields are required; every later field has a default.
    List-valued fields use ``default_factory`` so that instances never
    share one list object.
    """

    # Required fields (no defaults).
    id: str
    before_code: str
    problem_pattern: str
    macro_definition: str
    after_expansion: str
    macro_category: MacroCategory

    # Enum-typed metadata.
    technique: list[MacroTechnique] = field(default_factory=list)
    source: Source = Source.LET_OVER_LAMBDA
    source_chapter: Optional[str] = None
    complexity: Complexity = Complexity.BASIC

    # Boolean flags, all off by default.
    has_capture_risk: bool = False
    requires_gensyms: bool = False

    # Free-form string metadata; empty string / empty list means "not set".
    library_name: str = ""
    macro_name: str = ""
    call_sites: list[str] = field(default_factory=list)
    macroexpand_1_result: Optional[str] = None
    is_verified: bool = False
    formulation: str = "macro-from-usage"
    nl_description: Optional[str] = None
    is_synthetic: bool = False
    quality_score: Optional[float] = None
    commentary: Optional[str] = None
|
|
|
|
@dataclass
class DatasetRecord:
    """Full dataset record with instructions for fine-tuning.

    ``instruction``, ``input`` and ``output`` are required; the remaining
    fields default to an empty string or ``None``.
    """

    instruction: str
    input: str
    output: str
    category: str = ""
    technique: str = ""
    complexity: str = ""
    quality_score: Optional[float] = None

    def model_dump_json(self) -> str:
        """Return this record serialized as a JSON object string.

        Keys are the dataclass field names in declaration order. The call
        uses ``json.dumps`` defaults, so non-ASCII characters are escaped
        and a ``None`` quality score is written as ``null``.
        """
        # asdict() walks the declared fields, so the output cannot drift
        # out of sync with the field list above.
        return json.dumps(asdict(self))
|
|
|
|
@dataclass
class RawExtractionRecord:
    """Intermediate record produced by Phase 1 extraction.

    Only ``id`` and ``library_name`` are required. Every other field
    defaults to an empty string, except ``form_type`` ("defmacro"),
    ``status`` ("extracted") and ``docstring`` (``None``).
    """

    id: str
    library_name: str
    system_name: str = ""
    source_file: str = ""
    macro_name: str = ""
    macro_definition: str = ""
    form_type: str = "defmacro"
    docstring: Optional[str] = None
    args: str = ""
    extracted_at: str = ""
    status: str = "extracted"
|
|