j14i committed on
Commit
794d2a5
·
verified ·
1 Parent(s): a383597

CL macro fine-tuning dataset: data, config, docs

Browse files
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ *.egg
9
+ *env
README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bsd-2-clause
3
+ task_categories:
4
+ - text-generation
5
+ language:
6
+ - en
7
+ tags:
8
+ - common-lisp
9
+ - macros
10
+ - code-generation
11
+ - program-transformation
12
+ pretty_name: Common Lisp Macro Transformations
13
+ size_categories:
14
+ - n<1K
15
+ ---
16
+
17
+ # Common Lisp Macro Transformations
18
+
19
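+ A fine-tuning dataset for training models to generate Common Lisp macros. Each example pairs the repetitive **before-code** (`input`) with the **macro definition** (`output`) that abstracts it; the after-expansion is implied by the macro rather than stored as a separate field.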
20
+
21
+ ## Idea
22
+
23
+ Instead of fine-tuning a model to "write code", fine-tune it to generate **CL macros** — code that writes code. The model learns to recognize AST patterns and generate transformations, not final output.
24
+
25
+ ## Sources
26
+
27
+ - **Let Over Lambda** — Doug Hoyte's production macro collection (thephoeron/let-over-lambda)
28
+ - **On Lisp** — Paul Graham's classic Common Lisp macro utilities
29
+
30
+ ## Dataset Structure
31
+
32
+ Each record contains:
33
+ - `instruction` — Task description with the code pattern to address
34
+ - `input` — The "before" code showing the pattern that needs a macro
35
+ - `output` — The `defmacro` form that solves it
36
+ - `category` — Macro category (capture-management, anaphoric, dispatch, control-flow, dsl, compiler-macro, efficiency, scope)
37
+ - `technique` — Comma-separated techniques used (gensym, nested-backquote, dlambda, anaphor, code-walking, symbol-macrolet, defsetf, tagbody-go, once-only, macrolet, compiler-macro, recursive-expansion)
38
+ - `complexity` — basic, intermediate, or advanced
39
+ - `quality_score` — Classifier score from 0.0 to 1.0
40
+
41
+ ## Categories
42
+
43
+ | Category | Description | Examples |
44
+ |---|---|---|
45
+ | capture-management | Hygienic macro writing utilities | defmacro/g!, defmacro!, with-gensyms |
46
+ | anaphoric | Deliberate variable capture for conciseness | aif, alambda, alet, aand |
47
+ | dispatch | Keyword-based dispatch and inter-closure protocols | dlambda, pandoriclet, with-pandoric |
48
+ | control-flow | New evaluation semantics via macros | nlet-tail, condlet, if-match, choose |
49
+ | dsl | Domain-specific embedded languages | defunits, _f (generalized setf), dbind |
50
+ | compiler-macro | Compile-time optimization of function calls | fformat compiler macro |
51
+ | efficiency | Performance-oriented macro techniques | sortf (sorting networks) |
52
+ | scope | Lexical scope manipulation | pandoric-eval |
53
+
54
+ ## Use for Fine-tuning
55
+
56
+ The data is in instruction-input-output JSONL format, ready for fine-tuning:
57
+
58
+ ```python
59
+ from datasets import load_dataset
60
+ ds = load_dataset("j14i/cl-macros", split="train")
61
+ ```
62
+
63
+ Target model size: ≤ 30B parameters (the domain is narrow — pattern matching on ASTs and transformations — so a smaller model suffices).
data/splits/example_scores.jsonl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id": "lol-defmacro-g", "score": 0.9874999999999999, "category": "capture-management", "complexity": "advanced"}
2
+ {"id": "lol-defmacro-bang", "score": 0.9624999999999999, "category": "capture-management", "complexity": "advanced"}
3
+ {"id": "lol-aif", "score": 0.9125, "category": "anaphoric", "complexity": "basic"}
4
+ {"id": "lol-alambda", "score": 0.9125, "category": "anaphoric", "complexity": "basic"}
5
+ {"id": "lol-dlambda", "score": 0.95, "category": "dispatch", "complexity": "intermediate"}
6
+ {"id": "lol-alet", "score": 0.95, "category": "anaphoric", "complexity": "intermediate"}
7
+ {"id": "lol-pandoriclet", "score": 0.975, "category": "dispatch", "complexity": "advanced"}
8
+ {"id": "lol-with-pandoric", "score": 0.9824999999999999, "category": "dispatch", "complexity": "advanced"}
9
+ {"id": "lol-nlet-tail", "score": 0.975, "category": "control-flow", "complexity": "advanced"}
10
+ {"id": "lol-sortf", "score": 0.9624999999999999, "category": "efficiency", "complexity": "advanced"}
11
+ {"id": "lol-fformat", "score": 0.97, "category": "compiler-macro", "complexity": "intermediate"}
12
+ {"id": "lol-defun-bang", "score": 0.975, "category": "capture-management", "complexity": "intermediate"}
13
+ {"id": "lol-plambda", "score": 0.9375, "category": "dispatch", "complexity": "advanced"}
14
+ {"id": "lol-if-match", "score": 1.0, "category": "control-flow", "complexity": "advanced"}
15
+ {"id": "lol-pandoric-eval", "score": 0.9824999999999999, "category": "scope", "complexity": "advanced"}
16
+ {"id": "onlisp-with-gensyms", "score": 0.9375, "category": "capture-management", "complexity": "basic"}
17
+ {"id": "onlisp-condlet", "score": 0.9625, "category": "control-flow", "complexity": "intermediate"}
18
+ {"id": "onlisp-generalized-setf", "score": 0.95, "category": "dsl", "complexity": "intermediate"}
19
+ {"id": "onlisp-dbind", "score": 0.9625, "category": "dsl", "complexity": "intermediate"}
20
+ {"id": "onlisp-choose", "score": 0.9375, "category": "control-flow", "complexity": "advanced"}
21
+ {"id": "onlisp-cps", "score": 0.9375, "category": "control-flow", "complexity": "advanced"}
22
+ {"id": "onlisp-aand", "score": 0.9624999999999999, "category": "anaphoric", "complexity": "basic"}
data/splits/quality_report.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_examples": 22,
3
+ "mean_score": 0.958409090909091,
4
+ "min_score": 0.9125,
5
+ "max_score": 1.0,
6
+ "below_threshold": 0,
7
+ "category_distribution": {
8
+ "capture-management": 4,
9
+ "anaphoric": 4,
10
+ "dispatch": 4,
11
+ "control-flow": 5,
12
+ "efficiency": 1,
13
+ "compiler-macro": 1,
14
+ "scope": 1,
15
+ "dsl": 2
16
+ },
17
+ "complexity_distribution": {
18
+ "advanced": 11,
19
+ "basic": 4,
20
+ "intermediate": 7
21
+ },
22
+ "source_distribution": {
23
+ "let-over-lambda": 15,
24
+ "on-lisp": 7
25
+ }
26
+ }
data/splits/test.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; Macro with double-evaluation bug\n(defmacro square (x)\n `(* ,x ,x))\n;; (square (incf y)) evaluates (incf y) TWICE\n\n;; Manual fix requires gensym + let boilerplate every time\n(defmacro square (x)\n (let ((g (gensym)))\n `(let ((,g ,x))\n (* ,g ,g))))\n\nPattern to recognize: Macro arguments evaluated multiple times (double evaluation) \u2014 need automatic once-only that also provides both original form and evaluated value", "input": ";; Macro with double-evaluation bug\n(defmacro square (x)\n `(* ,x ,x))\n;; (square (incf y)) evaluates (incf y) TWICE\n\n;; Manual fix requires gensym + let boilerplate every time\n(defmacro square (x)\n (let ((g (gensym)))\n `(let ((,g ,x))\n (* ,g ,g))))", "output": "(defun o!-symbol-p (s)\n (and (symbolp s)\n (> (length (symbol-name s)) 2)\n (string= (symbol-name s)\n \"O!\"\n :start1 0\n :end1 2)))\n\n(defun o!-symbol-to-g!-symbol (s)\n (symb \"G!\"\n (subseq (symbol-name s) 2)))\n\n(defmacro defmacro! (name args &rest body)\n (let* ((os (remove-if-not #'o!-symbol-p (flatten args)))\n (gs (mapcar #'o!-symbol-to-g!-symbol os)))\n (multiple-value-bind (body declarations docstring)\n (parse-body body :documentation t)\n `(defmacro/g! ,name ,args\n ,@(when docstring\n (list docstring))\n ,@declarations\n `(let ,(mapcar #'list (list ,@gs) (list ,@os))\n ,(progn ,@body))))))", "category": "capture-management", "technique": "once-only,gensym", "complexity": "advanced", "quality_score": 0.9624999999999999}
2
+ {"instruction": "Define a macro that eliminates this repetitive pattern:\n;; Building a closure with mutable behavior requires manual forwarding\n(let ((counter 0)\n (impl (lambda (n) (incf counter n))))\n (lambda (&rest args)\n (apply impl args)))\n\n;; To change behavior: build an entirely new closure\n\nIssue: Need mutable closure behavior changeable at runtime, with implicit 'this' pointing to current implementation", "input": ";; Building a closure with mutable behavior requires manual forwarding\n(let ((counter 0)\n (impl (lambda (n) (incf counter n))))\n (lambda (&rest args)\n (apply impl args)))\n\n;; To change behavior: build an entirely new closure", "output": "(defmacro alet (letargs &rest body)\n `(let ((this) ,@letargs)\n (setq this ,@(last body))\n ,@(butlast body)\n (lambda (&rest params)\n (apply this params))))", "category": "anaphoric", "technique": "anaphor", "complexity": "intermediate", "quality_score": 0.95}
3
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; Regex matching with captures requires manual multiple-value-bind\n(multiple-value-bind (matchp captures)\n (cl-ppcre:scan-to-strings \"_(\\\\w+)@(\\\\w+)\\\\.com_\" email)\n (if matchp\n (let ((user (aref captures 0))\n (domain (aref captures 1)))\n (format t \"~a at ~a~%\" user domain))\n :invalid))\n\n;; Verbose: bind result, extract captures by index, branch - repetitive\n\nPattern to recognize: Regex matching with named captures requires verbose destructuring \u2014 want $1, $2, ... anaphors bound to capture groups", "input": ";; Regex matching with captures requires manual multiple-value-bind\n(multiple-value-bind (matchp captures)\n (cl-ppcre:scan-to-strings \"_(\\\\w+)@(\\\\w+)\\\\.com_\" email)\n (if matchp\n (let ((user (aref captures 0))\n (domain (aref captures 1)))\n (format t \"~a at ~a~%\" user domain))\n :invalid))\n\n;; Verbose: bind result, extract captures by index, branch - repetitive", "output": "#+cl-ppcre\n(defun dollar-symbol-p (s)\n (and (symbolp s)\n (> (length (symbol-name s)) 1)\n (string= (symbol-name s)\n \"$\"\n :start1 0\n :end1 1)\n (ignore-errors (parse-integer (subseq (symbol-name s) 1)))))\n\n(defmacro! if-match ((match-regex str) then &optional else)\n (let* ((dollars (remove-duplicates\n (remove-if-not #'dollar-symbol-p\n (flatten then))))\n (top (or (car (sort (mapcar #'dollar-symbol-p dollars) #'>))\n 0)))\n `(multiple-value-bind (,g!matches ,g!captures) (,match-regex ,str)\n (declare (ignorable ,g!matches ,g!captures))\n (let ((,g!captures-len (length ,g!captures)))\n (declare (ignorable ,g!captures-len))\n (symbol-macrolet ,(mapcar #`(,(symb \"$\" a1)\n (if (< ,g!captures-len ,a1)\n (error \"Too few matchs: ~a unbound.\" ,(mkstr \"$\" a1))\n (aref ,g!captures ,(1- a1))))\n (loop for i from 1 to top collect i))\n (if ,g!matches\n ,then\n ,else))))))", "category": "control-flow", "technique": "symbol-macrolet,gensym,code-walking", "complexity": "advanced", "quality_score": 1.0}
4
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; Exporting closure variables requires building pandoric protocol manually\n(let ((a 0))\n (let ((b 1))\n (let (this self)\n (setq\n this (lambda (n) (incf a n) (setq b (* b n)))\n self (dlambda\n (:pandoric-get (sym)\n (case sym\n ((a) a)\n ((b) b)\n (t (error \"Unknown: ~a\" sym))))\n (:pandoric-set (sym val)\n (case sym\n ((a) (setq a val))\n ((b) (setq b val))))\n (t (&rest args) (apply this args)))))))\n\n;; Manual self construction for every closure is painful\n\nPattern to recognize: Need to export closure variables through pandoric protocol, but manually building the self dlambda for each set of variables is repetitive", "input": ";; Exporting closure variables requires building pandoric protocol manually\n(let ((a 0))\n (let ((b 1))\n (let (this self)\n (setq\n this (lambda (n) (incf a n) (setq b (* b n)))\n self (dlambda\n (:pandoric-get (sym)\n (case sym\n ((a) a)\n ((b) b)\n (t (error \"Unknown: ~a\" sym))))\n (:pandoric-set (sym val)\n (case sym\n ((a) (setq a val))\n ((b) (setq b val))))\n (t (&rest args) (apply this args)))))))\n\n;; Manual self construction for every closure is painful", "output": "(defmacro plambda (largs pargs &rest body)\n (let ((pargs (mapcar #'list pargs)))\n `(let (this self)\n (setq\n this (lambda ,largs ,@body)\n self (dlambda\n (:pandoric-get (sym)\n ,(pandoriclet-get pargs))\n (:pandoric-set (sym val)\n ,(pandoriclet-set pargs))\n (t (&rest args)\n (apply this args)))))))", "category": "dispatch", "technique": "dlambda,nested-backquote", "complexity": "advanced", "quality_score": 0.9375}
data/splits/train.jsonl ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"instruction": "Write a Common Lisp macro to handle this code pattern:\n;; Conditional logic with different variable bindings per branch\n;; requires deeply nested lets inside each cond clause\n(cond ((> x 0)\n (let ((order :ascending)\n (bound (sqrt x)))\n (process order bound)))\n ((< x 0)\n (let ((order :descending)\n (bound (sqrt (- x))))\n (process order bound)))\n (t\n (let ((order :zero)\n (bound 0))\n (process order bound))))\n\n;; Repetitive: same body structure, different bindings per branch\n\nProblem: Cond with per-branch variable bindings leads to duplicated body code \u2014 want shared body with branch-specific bindings", "input": ";; Conditional logic with different variable bindings per branch\n;; requires deeply nested lets inside each cond clause\n(cond ((> x 0)\n (let ((order :ascending)\n (bound (sqrt x)))\n (process order bound)))\n ((< x 0)\n (let ((order :descending)\n (bound (sqrt (- x))))\n (process order bound)))\n (t\n (let ((order :zero)\n (bound 0))\n (process order bound))))\n\n;; Repetitive: same body structure, different bindings per branch", "output": "(defmacro condlet (clauses &body body)\n (let ((bodfn (gensym))\n (vars (mapcar #'(lambda (v) (cons v (gensym)))\n (remove-duplicates\n (mapcar #'car\n (mappend #'cdr clauses))))))\n `(labels ((,bodfn ,(mapcar #'car vars)\n ,@body))\n (cond ,@(mapcar #'(lambda (cl)\n (condlet-clause vars cl bodfn))\n clauses)))))", "category": "control-flow", "technique": "gensym,nested-backquote", "complexity": "intermediate", "quality_score": 0.9625}
2
+ {"instruction": "Write a Common Lisp macro to handle this code pattern:\n;; Manual destructuring from sequences is verbose\n(let ((lst (compute-list)))\n (let ((a (elt lst 0))\n (b (elt lst 1))\n (c (subseq lst 2)))\n (process a b c)))\n\n;; Nested structures are even worse:\n(let ((tree (compute-tree)))\n (let ((x (elt tree 0))\n (inner (elt tree 1)))\n (let ((y (elt inner 0))\n (z (elt inner 1)))\n (compute x y z))))\n\nProblem: Manual sequence destructuring with elt/subseq is error-prone \u2014 want pattern-based destructuring like ML/Haskell", "input": ";; Manual destructuring from sequences is verbose\n(let ((lst (compute-list)))\n (let ((a (elt lst 0))\n (b (elt lst 1))\n (c (subseq lst 2)))\n (process a b c)))\n\n;; Nested structures are even worse:\n(let ((tree (compute-tree)))\n (let ((x (elt tree 0))\n (inner (elt tree 1)))\n (let ((y (elt inner 0))\n (z (elt inner 1)))\n (compute x y z))))", "output": "(defmacro dbind (pat seq &body body)\n (let ((gseq (gensym)))\n `(let ((,gseq ,seq))\n ,(dbind-ex (destruc pat gseq #'atom) body))))", "category": "dsl", "technique": "gensym,nested-backquote", "complexity": "intermediate", "quality_score": 0.9625}
3
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; Depth-first search with manual backtracking\n(defun search (node)\n (if (goal-p node)\n node\n (let ((children (expand node)))\n (when children\n (let ((result (search (car children))))\n (if result\n result\n (when (cdr children)\n (search (cadr children)))))))))\n;; Manual backtracking is error-prone and obscures the search logic\n\nPattern to recognize: Search/backtracking algorithms require manual state management \u2014 want implicit backtracking via nondeterministic choice", "input": ";; Depth-first search with manual backtracking\n(defun search (node)\n (if (goal-p node)\n node\n (let ((children (expand node)))\n (when children\n (let ((result (search (car children))))\n (if result\n result\n (when (cdr children)\n (search (cadr children)))))))))\n;; Manual backtracking is error-prone and obscures the search logic", "output": "(defparameter *paths* nil)\n(defconstant failsym '@)\n\n(defmacro choose (&rest choices)\n (if choices\n `(progn\n ,@(mapcar #'(lambda (c)\n `(push #'(lambda () ,c) *paths*))\n (reverse (cdr choices)))\n ,(car choices))\n '(fail)))\n\n(defmacro choose-bind (var choices &body body)\n `(cb #'(lambda (,var) ,@body) ,choices))\n\n(defun cb (fn choices)\n (if choices\n (progn\n (if (cdr choices)\n (push #'(lambda () (cb fn (cdr choices)))\n *paths*))\n (funcall fn (car choices)))\n (fail)))\n\n(defun fail ()\n (if *paths*\n (funcall (pop *paths*))\n failsym))", "category": "control-flow", "technique": "nested-backquote", "complexity": "advanced", "quality_score": 0.9375}
4
+ {"instruction": "Create a Common Lisp macro for the following transformation. Before code:\n;; Functions that build code also need gensyms for hygiene\n(defun build-setter (var)\n (let ((g (gensym)))\n `(lambda (,g)\n (setf ,var ,g))))\n;; Gensym boilerplate in code-generating functions\n\nWhat needs to change: Functions that generate code also need gensyms \u2014 want same G! prefix convention extended to defun", "input": ";; Functions that build code also need gensyms for hygiene\n(defun build-setter (var)\n (let ((g (gensym)))\n `(lambda (,g)\n (setf ,var ,g))))\n;; Gensym boilerplate in code-generating functions", "output": "(defmacro defun! (name args &body body)\n (let ((syms (remove-duplicates\n (remove-if-not #'g!-symbol-p\n (flatten body)))))\n (multiple-value-bind (body declarations docstring)\n (parse-body body :documentation t)\n `(defun ,name ,args\n ,@(when docstring\n (list docstring))\n ,@declarations\n (let ,(mapcar (lambda (s)\n `(,s (gensym ,(subseq (symbol-name s)\n 2))))\n syms)\n ,@body)))))", "category": "capture-management", "technique": "gensym,code-walking", "complexity": "intermediate", "quality_score": 0.975}
5
+ {"instruction": "Write a Common Lisp macro to handle this code pattern:\n;; Threading a value through multiple dependent checks\n(let ((user (find-user id)))\n (if user\n (let ((orders (get-orders user)))\n (if orders\n (let ((latest (first orders)))\n (if latest\n (process latest)\n :no-orders))\n :no-orders))\n :no-user))\n;; Deeply nested ifs obscure the linear flow\n\nProblem: Threading a value through dependent operations creates deep if-nesting \u2014 want linear, pipeline-like syntax", "input": ";; Threading a value through multiple dependent checks\n(let ((user (find-user id)))\n (if user\n (let ((orders (get-orders user)))\n (if orders\n (let ((latest (first orders)))\n (if latest\n (process latest)\n :no-orders))\n :no-orders))\n :no-user))\n;; Deeply nested ifs obscure the linear flow", "output": "(defmacro aand (&rest args)\n (cond ((null args) t)\n ((null (cdr args)) (car args))\n (t `(aif ,(car args) (aand ,@(cdr args))))))", "category": "anaphoric", "technique": "anaphor,recursive-expansion", "complexity": "basic", "quality_score": 0.9624999999999999}
6
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; Manual gensym binding is verbose\n(let ((g1 (gensym))\n (g2 (gensym))\n (g3 (gensym)))\n `(let ((,g1 ,x)\n (,g2 ,y))\n (f ,g1 ,g2 ,g3)))\n;; The let-bindings for gensyms obscure the macro logic\n\nPattern to recognize: Manual gensym creation with let-bindings is verbose \u2014 want concise declaration of gensym-intended symbols", "input": ";; Manual gensym binding is verbose\n(let ((g1 (gensym))\n (g2 (gensym))\n (g3 (gensym)))\n `(let ((,g1 ,x)\n (,g2 ,y))\n (f ,g1 ,g2 ,g3)))\n;; The let-bindings for gensyms obscure the macro logic", "output": "(defmacro with-gensyms (syms &body body)\n `(let ,(mapcar #'(lambda (s)\n `(,s (gensym)))\n syms)\n ,@body))", "category": "capture-management", "technique": "gensym", "complexity": "basic", "quality_score": 0.9375}
7
+ {"instruction": "Write a Common Lisp macro to handle this code pattern:\n;; Sorting multiple places requires manual compare-and-swap\n(when (> b a) (rotatef a b))\n(when (> c a) (rotatef a c))\n(when (> c b) (rotatef b c))\n;; This is a sorting network but manually written - error-prone\n;; For more than 3 elements, the network is complex\n\nProblem: Sorting multiple values in-place requires compare-and-swap networks \u2014 want automatic sorting network generation", "input": ";; Sorting multiple places requires manual compare-and-swap\n(when (> b a) (rotatef a b))\n(when (> c a) (rotatef a c))\n(when (> c b) (rotatef b c))\n;; This is a sorting network but manually written - error-prone\n;; For more than 3 elements, the network is complex", "output": "(defun build-batcher-sn (n)\n (let* (network\n (tee (ceiling (log n 2)))\n (p (ash 1 (- tee 1))))\n (loop while (> p 0) do\n (let ((q (ash 1 (- tee 1)))\n (r 0)\n (d p))\n (loop while (> d 0) do\n (loop for i from 0 to (- n d 1) do\n (if (= (logand i p) r)\n (push (list i (+ i d))\n network)))\n (setf d (- q p)\n q (ash q -1)\n r p)))\n (setf p (ash p -1)))\n (nreverse network)))\n\n(defmacro! sortf (comparator &rest places)\n (if places\n `(tagbody\n ,@(mapcar\n #`(let ((,g!a #1=,(nth (car a1) places))\n (,g!b #2=,(nth (cadr a1) places)))\n (if (,comparator ,g!b ,g!a)\n (setf #1# ,g!b\n #2# ,g!a)))\n (build-batcher-sn (length places))))))", "category": "efficiency", "technique": "gensym,tagbody-go", "complexity": "advanced", "quality_score": 0.9624999999999999}
8
+ {"instruction": "You are a Common Lisp macro expert. Given this code pattern:\n;; Recursive factorial - stack overflow on large N in CL\n;; (CL does not guarantee tail-call optimization)\n(defun fact (n)\n (labels ((iter (n acc)\n (if (zerop n)\n acc\n (iter (- n 1) (* n acc)))))\n (iter n 1)))\n;; (fact 1000000) => stack overflow on most implementations\n\nWrite a macro that addresses: Need guaranteed tail-call elimination for recursive algorithms \u2014 CL doesn't mandate TCO", "input": ";; Recursive factorial - stack overflow on large N in CL\n;; (CL does not guarantee tail-call optimization)\n(defun fact (n)\n (labels ((iter (n acc)\n (if (zerop n)\n acc\n (iter (- n 1) (* n acc)))))\n (iter n 1)))\n;; (fact 1000000) => stack overflow on most implementations", "output": "(defmacro! nlet-tail (n letargs &body body)\n (let ((gs (loop for i in letargs\n collect (gensym))))\n `(macrolet\n ((,n ,gs\n `(progn\n (psetq\n ,@(apply #'nconc\n (mapcar\n #'list\n ',(mapcar #'car letargs)\n (list ,@gs))))\n (go ,',g!n))))\n (block ,g!b\n (let ,letargs\n (tagbody\n ,g!n (return-from\n ,g!b (progn ,@body))))))))", "category": "control-flow", "technique": "macrolet,tagbody-go,gensym", "complexity": "advanced", "quality_score": 0.975}
9
+ {"instruction": "Create a Common Lisp macro for the following transformation. Before code:\n;; Manual dispatch by keyword with multiple lambdas is repetitive\n(let ((count 0))\n (setf (symbol-function 'counter)\n (lambda (&rest args)\n (case (car args)\n (:inc (apply (lambda (n) (incf count n)) (cdr args)))\n (:dec (apply (lambda (n) (decf count n)) (cdr args)))\n (:reset (progn (setf count 0) nil))\n (t (error \"Unknown: ~a\" (car args)))))))\n\n;; Every method: keyword, lambda wrapper, arg destructuring - repetitive\n\nWhat needs to change: Repeated keyword-dispatch with automatic argument destructuring \u2014 each keyword maps to a lambda with its own parameter list", "input": ";; Manual dispatch by keyword with multiple lambdas is repetitive\n(let ((count 0))\n (setf (symbol-function 'counter)\n (lambda (&rest args)\n (case (car args)\n (:inc (apply (lambda (n) (incf count n)) (cdr args)))\n (:dec (apply (lambda (n) (decf count n)) (cdr args)))\n (:reset (progn (setf count 0) nil))\n (t (error \"Unknown: ~a\" (car args)))))))\n\n;; Every method: keyword, lambda wrapper, arg destructuring - repetitive", "output": "(defmacro! dlambda (&rest ds)\n `(lambda (&rest ,g!args)\n (case (car ,g!args)\n ,@(mapcar\n (lambda (d)\n `(,(if (eq t (car d))\n t\n (list (car d)))\n (apply (lambda ,@(cdr d))\n ,(if (eq t (car d))\n g!args\n `(cdr ,g!args)))))\n ds))))", "category": "dispatch", "technique": "dlambda,gensym", "complexity": "intermediate", "quality_score": 0.95}
10
+ {"instruction": "You are a Common Lisp macro expert. Given this code pattern:\n;; eval loses lexical environment - can't access surrounding bindings\n(let ((x 1))\n (eval '(+ 1 x))) ;; Error: X is unbound!\n\n;; Need: way to tunnel lexical variables into eval\n\nWrite a macro that addresses: eval cannot access lexical environment \u2014 want a 'tunnel' that bridges lexical scope into eval's dynamic world", "input": ";; eval loses lexical environment - can't access surrounding bindings\n(let ((x 1))\n (eval '(+ 1 x))) ;; Error: X is unbound!\n\n;; Need: way to tunnel lexical variables into eval", "output": "(defvar pandoric-eval-tunnel)\n\n(defmacro pandoric-eval (vars expr)\n `(let ((pandoric-eval-tunnel\n (plambda () ,vars t)))\n (eval `(with-pandoric\n ,',vars pandoric-eval-tunnel\n ,,expr))))", "category": "scope", "technique": "dlambda,symbol-macrolet", "complexity": "advanced", "quality_score": 0.9824999999999999}
11
+ {"instruction": "Create a Common Lisp macro for the following transformation. Before code:\n;; Closure variables are opaque - can't inspect or modify them\n(let ((acc 0) (mul 1))\n (lambda (n)\n (incf acc n)\n (setf mul (* mul n))))\n\n;; To access acc/mul: must add specific getter/setter methods\n;; This doesn't scale - every variable needs its own accessor\n\nWhat needs to change: Closure internals are opaque \u2014 want a general protocol for reading/writing any closed-over variable", "input": ";; Closure variables are opaque - can't inspect or modify them\n(let ((acc 0) (mul 1))\n (lambda (n)\n (incf acc n)\n (setf mul (* mul n))))\n\n;; To access acc/mul: must add specific getter/setter methods\n;; This doesn't scale - every variable needs its own accessor", "output": "(defun pandoriclet-get (letargs)\n `(case sym\n ,@(mapcar #`((,(car a1)) ,(car a1))\n letargs)\n (t (error\n \"Unknown pandoric get: ~a\"\n sym))))\n\n(defun pandoriclet-set (letargs)\n `(case sym\n ,@(mapcar #`((,(car a1))\n (setq ,(car a1) val))\n letargs)\n (t (error\n \"Unknown pandoric set: ~a\"\n sym val))))\n\n(defmacro pandoriclet (letargs &rest body)\n (let ((letargs (cons\n '(this)\n (let-binding-transform\n letargs))))\n `(let (,@letargs)\n (setq this ,@(last body))\n ,@(butlast body)\n (dlambda\n (:pandoric-get (sym)\n ,(pandoriclet-get letargs))\n (:pandoric-set (sym val)\n ,(pandoriclet-set letargs))\n (t (&rest args)\n (apply this args))))))", "category": "dispatch", "technique": "dlambda,nested-backquote,anaphor", "complexity": "advanced", "quality_score": 0.975}
12
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; format with constant control strings interprets directives at runtime\n(format nil \"Value: ~a, Name: ~a\" val name)\n;; Every call: parse the ~a directives, resolve them, dispatch formatting\n;; The format string is known at compile time but still interpreted at runtime\n\nPattern to recognize: format with compile-time-constant control strings wastes time interpreting directives at runtime \u2014 want compile-time compilation of format strings", "input": ";; format with constant control strings interprets directives at runtime\n(format nil \"Value: ~a, Name: ~a\" val name)\n;; Every call: parse the ~a directives, resolve them, dispatch formatting\n;; The format string is known at compile time but still interpreted at runtime", "output": "(defun fformat (&rest all)\n (apply #'format all))\n\n(define-compiler-macro fformat\n (&whole form\n stream fmt &rest args)\n (if (constantp fmt)\n (if stream\n `(funcall (formatter ,fmt)\n ,stream ,@args)\n (let ((g!stream (gensym \"stream\")))\n `(with-output-to-string (,g!stream)\n (funcall (formatter ,fmt)\n ,g!stream ,@args))))\n form))", "category": "compiler-macro", "technique": "compiler-macro,gensym", "complexity": "intermediate", "quality_score": 0.97}
13
+ {"instruction": "Create a Common Lisp macro for the following transformation. Before code:\n;; Anonymous recursion requires explicit labels every time\n(labels ((self (n)\n (if (> n 0)\n (cons n (self (- n 1)))\n nil)))\n #'self)\n\n;; labels boilerplate obscures the recursive logic\n\nWhat needs to change: Need anonymous recursion without writing labels boilerplate \u2014 want implicit 'self' for recursion", "input": ";; Anonymous recursion requires explicit labels every time\n(labels ((self (n)\n (if (> n 0)\n (cons n (self (- n 1)))\n nil)))\n #'self)\n\n;; labels boilerplate obscures the recursive logic", "output": "(defmacro alambda (parms &body body)\n `(labels ((self ,parms ,@body))\n #'self))", "category": "anaphoric", "technique": "anaphor", "complexity": "basic", "quality_score": 0.9125}
data/splits/val.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"instruction": "Create a Common Lisp macro for the following transformation. Before code:\n;; Common pattern: read a value, apply a function, store back\n(let ((old (car place)))\n (setf (car place) (1+ old)))\n\n;; Or with a more complex place:\n(let ((old (getf plist :count)))\n (setf (getf plist :count) (* old 2)))\n\n;; Every operation needs manual read-modify-write boilerplate\n\nWhat needs to change: Read-modify-write on places requires manual let + access + setf \u2014 want a single operator that applies any function to a place", "input": ";; Common pattern: read a value, apply a function, store back\n(let ((old (car place)))\n (setf (car place) (1+ old)))\n\n;; Or with a more complex place:\n(let ((old (getf plist :count)))\n (setf (getf plist :count) (* old 2)))\n\n;; Every operation needs manual read-modify-write boilerplate", "output": "(defmacro _f (op place &rest args)\n (multiple-value-bind (vars forms var set access)\n (get-setf-method place)\n `(let* (,@(mapcar #'list vars forms)\n (,(car var) (,op ,access ,@args)))\n ,set)))", "category": "dsl", "technique": "defsetf", "complexity": "intermediate", "quality_score": 0.95}
2
+ {"instruction": "Generate a defmacro form that transforms code like this:\n;; Manual gensym creation is boilerplate-heavy\n(defmacro with-temp (var &body body)\n (let ((g (gensym)))\n `(let ((,g ,var))\n ,@body)))\n;; Every macro needs this pattern - repetitive and error-prone\n\nPattern to recognize: Repeated gensym boilerplate in macro definitions \u2014 want automatic gensym creation from naming convention", "input": ";; Manual gensym creation is boilerplate-heavy\n(defmacro with-temp (var &body body)\n (let ((g (gensym)))\n `(let ((,g ,var))\n ,@body)))\n;; Every macro needs this pattern - repetitive and error-prone", "output": "(defun g!-symbol-p (s)\n (and (symbolp s)\n (> (length (symbol-name s)) 2)\n (string= (symbol-name s)\n \"G!\"\n :start1 0\n :end1 2)))\n\n(defmacro defmacro/g! (name args &rest body)\n (let ((syms (remove-duplicates\n (remove-if-not #'g!-symbol-p\n (flatten body)))))\n (multiple-value-bind (body declarations docstring)\n (parse-body body :documentation t)\n `(defmacro ,name ,args\n ,@(when docstring\n (list docstring))\n ,@declarations\n (let ,(mapcar\n (lambda (s)\n `(,s (gensym ,(subseq\n (symbol-name s)\n 2))))\n syms)\n ,@body)))))", "category": "capture-management", "technique": "gensym,code-walking", "complexity": "advanced", "quality_score": 0.9874999999999999}
3
+ {"instruction": "Create a Common Lisp macro for the following transformation. Before code:\n;; Accessing pandoric variables requires verbose funcall\n(let ((sum (funcall box :pandoric-get 'sum))\n (count (funcall box :pandoric-get 'count)))\n (format t \"sum=~a count=~a~%\" sum count))\n\n;; Setting is even worse:\n(progn\n (funcall box :pandoric-set 'sum new-value)\n new-value)\n\n;; Want: regular variable syntax for pandoric access\n\nWhat needs to change: Pandoric get/set requires verbose keyword funcalls \u2014 want transparent variable syntax using symbol-macrolet", "input": ";; Accessing pandoric variables requires verbose funcall\n(let ((sum (funcall box :pandoric-get 'sum))\n (count (funcall box :pandoric-get 'count)))\n (format t \"sum=~a count=~a~%\" sum count))\n\n;; Setting is even worse:\n(progn\n (funcall box :pandoric-set 'sum new-value)\n new-value)\n\n;; Want: regular variable syntax for pandoric access", "output": "(defun get-pandoric (box sym)\n (funcall box :pandoric-get sym))\n\n(defsetf get-pandoric (box sym) (val)\n `(progn\n (funcall ,box :pandoric-set ,sym ,val)\n ,val))\n\n(defmacro with-pandoric (syms box &rest body)\n (let ((g!box (gensym \"box\")))\n `(let ((,g!box ,box))\n (declare (ignorable ,g!box))\n (symbol-macrolet\n (,@(mapcar #`(,a1 (get-pandoric ,g!box ',a1))\n syms))\n ,@body))))", "category": "dispatch", "technique": "symbol-macrolet,defsetf,gensym", "complexity": "advanced", "quality_score": 0.9824999999999999}
4
+ {"instruction": "Write a Common Lisp macro to handle this code pattern:\n;; Common pattern: test a value, use result in then-branch\n(let ((result (compute-something)))\n (if result\n (format t \"Got: ~a~%\" result)\n :nothing))\n\n;; Want: implicit 'it' bound to test result\n\nProblem: Repeated pattern of binding test result then branching on it \u2014 want implicit 'it' anaphor", "input": ";; Common pattern: test a value, use result in then-branch\n(let ((result (compute-something)))\n (if result\n (format t \"Got: ~a~%\" result)\n :nothing))\n\n;; Want: implicit 'it' bound to test result", "output": "(defmacro aif (test then &optional else)\n `(let ((it ,test))\n (if it ,then ,else)))", "category": "anaphoric", "technique": "anaphor", "complexity": "basic", "quality_score": 0.9125}
5
+ {"instruction": "Define a macro that eliminates this repetitive pattern:\n;; Explicit continuation-passing is verbose\n(defun fact-cps (n cont)\n (if (zerop n)\n (funcall cont 1)\n (fact-cps (- n 1)\n (lambda (x) (funcall cont (* n x))))))\n\n;; Every function must take an explicit continuation parameter.\n;; Every call must thread the continuation through.\n;; Simple sequential code becomes deeply nested.\n\nIssue: Manual continuation-passing style requires explicit continuation parameters and nested lambdas \u2014 want syntactic sugar for CPS", "input": ";; Explicit continuation-passing is verbose\n(defun fact-cps (n cont)\n (if (zerop n)\n (funcall cont 1)\n (fact-cps (- n 1)\n (lambda (x) (funcall cont (* n x))))))\n\n;; Every function must take an explicit continuation parameter.\n;; Every call must thread the continuation through.\n;; Simple sequential code becomes deeply nested.", "output": "(defmacro =lambda (parms &body body)\n `#'(lambda (*cont* ,@parms) ,@body))\n\n(defmacro =defun (name parms &body body)\n (let ((f (intern (concatenate 'string\n \"=\" (symbol-name name)))))\n `(progn\n (defmacro ,name ,parms\n `(,',f *cont* ,,@parms))\n (defun ,f (*cont* ,@parms) ,@body))))\n\n(defmacro =bind (parms expr &body body)\n `(let ((*cont* #'(lambda ,parms ,@body))) ,expr))\n\n(defmacro =values (&rest retvals)\n `(funcall *cont* ,@retvals))", "category": "control-flow", "technique": "nested-backquote", "complexity": "advanced", "quality_score": 0.9375}
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "cl-macros"
7
+ version = "0.1.0"
8
+ description = "Fine-tuning dataset for Common Lisp macro generation"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "datasets>=2.14.0",
12
+ "pyarrow>=12.0",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ dev = [
17
+ "pytest>=7.0",
18
+ "ruff>=0.1.0",
19
+ ]
20
+
21
+ [tool.setuptools.packages.find]
22
+ where = ["src"]
uv.lock ADDED
The diff for this file is too large to render. See raw diff