# cl-ds / src/cl_macros/classifier.py
# j14i's picture
# 977 CL macro transformation examples: CL-native pipeline with SBCL verification
# d69fc90 verified
"""Classifier: auto-detect metadata + score quality of macro examples.
Phase 3: Automatic detection of technique, category, complexity, capture risk.
"""
from __future__ import annotations
import re
from collections import Counter
from cl_macros.schema import (
Complexity,
MacroCategory,
MacroTechnique,
Source,
TransformationExample,
)
def detect_techniques(macro_def: str) -> list[MacroTechnique]:
    """Return the macro techniques whose textual signature occurs in *macro_def*.

    Detection is purely lexical (substring and regex probes plus three
    private helpers); nothing is parsed or evaluated.  Techniques are
    reported in the fixed order of the probe table below, and a technique
    appears at most once.
    """
    # (signature present?, technique) pairs, in reporting order.
    probes = (
        ("define-compiler-macro" in macro_def, MacroTechnique.COMPILER_MACRO),
        (re.search(r"\(symbol-macrolet\b", macro_def), MacroTechnique.SYMBOL_MACROLET),
        (re.search(r"\(macrolet\b", macro_def), MacroTechnique.MACROLET),
        ("tagbody" in macro_def and "go " in macro_def, MacroTechnique.TAGBODY),
        (re.search(r"get-setf-method|\bdefsetf\b", macro_def), MacroTechnique.DEFSETF),
        ("once-only" in macro_def or "o!-" in macro_def, MacroTechnique.ONCE_ONLY),
        # "gensym" is matched case-insensitively; the "g!-" prefix is not.
        ("gensym" in macro_def.lower() or "g!-" in macro_def, MacroTechnique.GENSYM),
        (re.search(r"\(case\s+\(car\b", macro_def), MacroTechnique.DLAMBDA),
        (re.search(r"set-(?:dispatch-)?macro-character", macro_def), MacroTechnique.READER),
        (_has_nested_backquote(macro_def), MacroTechnique.NESTED_BACKQUOTE),
        (
            "anaphor" in macro_def.lower() or _has_anaphor_injection(macro_def),
            MacroTechnique.ANAPHOR,
        ),
        (
            re.search(r"\bflatten\b|\bremove-if-not.*body\b", macro_def),
            MacroTechnique.CODE_WALKING,
        ),
        (_detect_recursive_expansion(macro_def), MacroTechnique.RECURSIVE_EXPANSION),
    )
    return [technique for present, technique in probes if present]
def detect_category(macro_def: str, techniques: list[MacroTechnique]) -> MacroCategory:
    """Choose one category for a macro definition.

    The rules below are tried in order and the first match wins, so earlier
    rules take priority over later ones.  When nothing matches the result
    is ``MacroCategory.CONTROL_FLOW``.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for *macro_def*.

    Returns:
        The category selected by the first matching rule.
    """
    code = macro_def

    # Compiler macros only count as such when they show a typical payload.
    if MacroTechnique.COMPILER_MACRO in techniques and (
        "format" in code or "constantp" in code
    ):
        return MacroCategory.COMPILER_MACRO

    if MacroTechnique.ANAPHOR in techniques:
        if "dlambda" in code or "pandoric" in code:
            return MacroCategory.DISPATCH
        return MacroCategory.ANAPHORIC

    if MacroTechnique.DLAMBDA in techniques:
        return MacroCategory.DISPATCH

    if "gensym" in code.lower() and any(
        kw in code
        for kw in ["defmacro/g!", "defmacro!", "with-gensyms", "with-unique-names"]
    ):
        return MacroCategory.CAPTURE_MANAGEMENT

    # ANAPHOR and DLAMBDA have both returned above, so no extra exclusion
    # of those techniques is needed here.
    if MacroTechnique.GENSYM in techniques and (
        "defmacro/g!" in code or "defmacro!" in code
    ):
        return MacroCategory.CAPTURE_MANAGEMENT

    if re.search(r"\bpush\b.*\*paths\*|\bfail\b|\bchoose\b|backtrack", code):
        return MacroCategory.CONTROL_FLOW

    if MacroTechnique.TAGBODY in techniques or (
        MacroTechnique.MACROLET in techniques and "go " in code
    ):
        return MacroCategory.CONTROL_FLOW

    if MacroTechnique.SYMBOL_MACROLET in techniques:
        return MacroCategory.DISPATCH

    if MacroTechnique.READER in techniques:
        return MacroCategory.READ_MACRO

    if re.search(r"batcher|sorting|network|comparator|cons-pool|tlist", code):
        return MacroCategory.EFFICIENCY

    if "eval" in code and "pandoric" in code:
        return MacroCategory.SCOPE

    if re.search(r"\bwith-\b", code) or re.search(r"def.*unit|def.*sql|define-", code):
        return MacroCategory.DSL

    # Fallback category for everything that matched no rule above.
    return MacroCategory.CONTROL_FLOW
def assess_complexity(macro_def: str, techniques: list[MacroTechnique]) -> Complexity:
    """Grade a macro definition as basic, intermediate or advanced.

    Four point contributions are added up:

    * length: 1, 2 or 3 points at 4, 8 or 15 newline characters;
    * techniques: 2 points for each distinct "advanced" technique present;
    * gensym usage: 1 point above one mention of ``gensym``/``g!-``,
      2 points above three (case-sensitive count);
    * backquote depth: 1 point above depth 1, 2 points above depth 2.

    A total of 5 or more is ADVANCED, 2 or more is INTERMEDIATE, anything
    lower is BASIC.
    """
    newline_count = macro_def.count("\n")
    if newline_count >= 15:
        length_points = 3
    elif newline_count >= 8:
        length_points = 2
    elif newline_count >= 4:
        length_points = 1
    else:
        length_points = 0

    heavy_techniques = {
        MacroTechnique.CODE_WALKING,
        MacroTechnique.RECURSIVE_EXPANSION,
        MacroTechnique.COMPILER_MACRO,
        MacroTechnique.DLAMBDA,
        MacroTechnique.TAGBODY,
        MacroTechnique.NESTED_BACKQUOTE,
    }
    technique_points = 2 * len(heavy_techniques.intersection(techniques))

    gensym_mentions = macro_def.count("gensym") + macro_def.count("g!-")
    if gensym_mentions > 3:
        gensym_points = 2
    elif gensym_mentions > 1:
        gensym_points = 1
    else:
        gensym_points = 0

    depth = _estimate_backquote_depth(macro_def)
    if depth > 2:
        backquote_points = 2
    elif depth > 1:
        backquote_points = 1
    else:
        backquote_points = 0

    total = length_points + technique_points + gensym_points + backquote_points
    if total >= 5:
        return Complexity.ADVANCED
    if total >= 2:
        return Complexity.INTERMEDIATE
    return Complexity.BASIC
def assess_capture_risk(
    macro_def: str, techniques: list[MacroTechnique]
) -> tuple[bool, bool]:
    """Return ``(has_capture_risk, requires_gensyms)`` for a macro definition.

    ``requires_gensyms`` is true when the text mentions ``gensym`` (any
    case) or the ``g!-`` prefix.  ``has_capture_risk`` is true for every
    anaphoric macro, and otherwise only for a macro that takes a body
    (``&body`` or ``&rest body``) without mentioning gensyms.
    """
    takes_body = "&body" in macro_def or "&rest body" in macro_def
    uses_gensyms = "gensym" in macro_def.lower() or "g!-" in macro_def

    if MacroTechnique.ANAPHOR in techniques:
        return (True, uses_gensyms)

    # A body spliced in without any gensym protection is the risky case;
    # in that case the second element is necessarily False.
    unprotected_body = takes_body and not uses_gensyms
    return (unprotected_body, uses_gensyms)
def score_example(example: TransformationExample) -> float:
    """Combine the four sub-scores of *example* into one weighted score.

    Weights: correctness 0.35, hygiene 0.25, transformation 0.25,
    clarity 0.15.
    """
    weighted_parts = (
        (_score_correctness(example), 0.35),
        (_score_hygiene(example), 0.25),
        (_score_transformation(example), 0.25),
        (_score_clarity(example), 0.15),
    )
    total = 0
    for part, weight in weighted_parts:
        total += part * weight
    return total
def classify_all(examples):
    """Fill in missing metadata on every example and score it, in place.

    For each example:

    * ``technique`` is detected when it is empty;
    * ``macro_category`` is detected when it is ``None``;
    * ``complexity`` is re-assessed when it is BASIC and the definition
      spans more than three lines;
    * ``has_capture_risk`` / ``requires_gensyms`` are assessed when both
      are falsy;
    * ``quality_score`` is always recomputed.

    Returns the same *examples* object that was passed in.
    """
    for ex in examples:
        if not ex.technique:
            ex.technique = detect_techniques(ex.macro_definition)

        if ex.macro_category is None:
            ex.macro_category = detect_category(ex.macro_definition, ex.technique)

        # "More than three lines" is the same as at least three newlines.
        if ex.complexity == Complexity.BASIC and ex.macro_definition.count("\n") >= 3:
            ex.complexity = assess_complexity(ex.macro_definition, ex.technique)

        if not (ex.has_capture_risk or ex.requires_gensyms):
            ex.has_capture_risk, ex.requires_gensyms = assess_capture_risk(
                ex.macro_definition, ex.technique
            )

        ex.quality_score = score_example(ex)
    return examples
def quality_report(examples):
    """Summarise quality scores and metadata distributions of *examples*.

    Returns ``{"error": ...}`` when there are no examples or none of them
    has a score.  Otherwise the score statistics cover only the examples
    whose ``quality_score`` is not ``None``, while ``total_examples`` and
    the three distributions cover every example.
    """
    if not examples:
        return {"error": "No examples"}

    scores = []
    for ex in examples:
        if ex.quality_score is not None:
            scores.append(ex.quality_score)
    if not scores:
        return {"error": "No scored examples"}

    def tally(field):
        # Frequency table of the ``.value`` of one enum-like attribute.
        return dict(Counter(getattr(ex, field).value for ex in examples))

    return {
        "total_examples": len(examples),
        "mean_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "max_score": max(scores),
        "below_threshold": len([s for s in scores if s < 0.5]),
        "category_distribution": tally("macro_category"),
        "complexity_distribution": tally("complexity"),
        "source_distribution": tally("source"),
    }
def filter_quality(examples, min_score=0.5):
    """Return the examples whose quality score is at least *min_score*.

    Examples with a ``quality_score`` of ``None`` are dropped.  Input
    order is preserved and a new list is returned.
    """
    kept = []
    for ex in examples:
        score = ex.quality_score
        if score is not None and score >= min_score:
            kept.append(ex)
    return kept
# --- Private helpers ---
def _has_nested_backquote(code: str) -> bool:
    """Check if a macro definition contains nested backquotes.

    A backquote is "nested" when it occurs inside a template that is
    already backquoted and has not been unquoted again, i.e. when the
    quasi-quotation level would reach 2.

    The scan follows list structure rather than keeping one flat counter:
    a backquote or comma only affects the single form that follows it,
    and closing a list restores the level of the enclosing list.  Sibling
    templates such as ``(if x `(a) `(b))`` are therefore not reported,
    while a second backquote after several unquotes in the same template
    still is.

    String literals, ``;`` line comments and ``#\\x`` character literals
    are skipped so that backquotes or commas inside them are ignored.

    Args:
        code: Lisp source text of the macro definition.

    Returns:
        True if any backquote appears at quasi-quotation level 2 or more.
    """
    enclosing_levels = []  # level to restore when each open list closes
    level = 0  # quasi-quotation level of the list currently being scanned
    pending = 0  # net level change queued by ` and , for the next form
    i = 0
    end = len(code)
    while i < end:
        ch = code[i]
        if ch == '"':
            # Skip the whole string literal, honouring backslash escapes.
            i += 1
            while i < end and code[i] != '"':
                if code[i] == "\\":
                    i += 1
                i += 1
            i += 1
            pending = 0  # the string was the form the prefixes applied to
            continue
        if ch == ";":
            # Line comment: ignore everything up to the end of the line.
            while i < end and code[i] != "\n":
                i += 1
            continue
        if ch == "#" and i + 1 < end and code[i + 1] == "\\":
            # Character literal such as #\` or #\( : skip "#", "\" and the
            # character itself so it is never read as syntax.
            i += 3
            pending = 0
            continue
        if ch == "`":
            pending += 1
            if level + pending > 1:
                return True
        elif ch == ",":
            pending -= 1
            # ",@" and ",." are single unquote tokens.
            if i + 1 < end and code[i + 1] in "@.":
                i += 1
        elif ch == "(":
            enclosing_levels.append(level)
            # Clamp at zero: stray commas outside any template must not
            # hide a later, genuinely nested backquote.
            level = max(0, level + pending)
            pending = 0
        elif ch == ")":
            if enclosing_levels:
                level = enclosing_levels.pop()
            pending = 0
        elif ch in "'#":
            # Quote and dispatch prefixes: the queued change still applies
            # to the form that follows them.
            pass
        elif not ch.isspace():
            # An atom consumes the queued prefixes; skip to its end.
            i += 1
            while i < end and not code[i].isspace() and code[i] not in "()\"`,;'":
                i += 1
            pending = 0
            continue
        i += 1
    return False
def _has_anaphor_injection(code: str) -> bool:
    """Return True if *code* mentions ``it``, ``self`` or ``this`` as a symbol.

    The name must directly follow a quote character or a space and must be
    a complete symbol, i.e. be followed by whitespace, a parenthesis,
    another Lisp delimiter or the end of the text.  Symbols that merely
    start with one of the names (``items``, ``iterate``, ``this-thing``)
    do not count.

    Args:
        code: Lisp source text of the macro definition.

    Returns:
        True if one of the three conventional anaphor names occurs.
    """
    # The negative lookahead rejects any following character that would
    # make the match part of a longer symbol.
    pattern = r"[' ](?:it|self|this)(?![^\s()'\"`,;])"
    return re.search(pattern, code) is not None
def _detect_recursive_expansion(code: str) -> bool:
    """Detect if a macro uses recursive expansion (calls itself during expansion).

    Every name introduced by a ``(defmacro... NAME`` form is collected,
    including the ``defmacro!`` and ``defmacro/g!`` variants.  The macro
    counts as recursive when that exact name appears in operator position,
    i.e. as ``(NAME`` followed by whitespace, a parenthesis or the end of
    the text.  Longer symbols that merely start with the name, such as
    ``(NAME-helper``, are not calls to the macro and are ignored.

    Args:
        code: Lisp source text containing one or more macro definitions.

    Returns:
        True if any defined macro name is also used as an operator.
    """
    # [^\s()]* lets the definer carry suffixes such as "!" or "/g!".
    names = re.findall(r"\(defmacro[^\s()]*\s+([^\s()]+)", code)
    return any(
        # The definition itself reads "(defmacro NAME", where NAME follows
        # a space, so it can never satisfy this "(NAME" pattern.
        re.search(r"\(" + re.escape(name) + r"(?![^\s()])", code)
        for name in names
    )
def _estimate_backquote_depth(code: str) -> int:
    """Return a rough maximum backquote depth for *code*.

    A single counter goes up on every backquote and down (never below
    zero) on every comma or closing parenthesis; the highest value it
    reaches is returned.  Strings and comments are not treated specially.
    """
    level = 0
    deepest = 0
    for ch in code:
        if ch == "`":
            level += 1
            deepest = max(deepest, level)
        elif ch in ",)" and level > 0:
            level -= 1
    return deepest
def _score_correctness(ex):
    """Score how plausible *ex* is as a real macro example (0.0 to 1.0).

    Starts at 1.0 and subtracts a penalty for each failed check:

    * 0.4: no ``defmacro``-style or ``define-compiler-macro`` form;
    * 0.2: the definition has fewer than two newlines;
    * 0.3: categorised as ``read-macro`` but never installs a macro
      character;
    * 0.3: the expansion is identical to the input code (after stripping);
    * 0.2: the stripped expansion is shorter than 10 characters.

    The result is floored at 0.0.
    """
    definition = ex.macro_definition
    category = ex.macro_category
    expansion = ex.after_expansion.strip()

    claims_reader = category is not None and category.value == "read-macro"
    has_definer = re.search(
        r"\((?:defmacro|define-compiler-macro|defmacro!|defmacro/g!)", definition
    )

    # (check failed?, penalty) pairs; applied in this order so the float
    # arithmetic matches a straight-line sequence of subtractions.
    checks = (
        (not has_definer, 0.4),
        (definition.count("\n") < 2, 0.2),
        (
            claims_reader
            and not re.search(r"set-(?:dispatch-)?macro-character", definition),
            0.3,
        ),
        (expansion == ex.before_code.strip(), 0.3),
        (len(expansion) < 10, 0.2),
    )

    score = 1.0
    for failed, penalty in checks:
        if failed:
            score -= penalty
    return max(0.0, score)
def _score_hygiene(ex):
    """Score variable-capture hygiene of *ex* (0.0 to 1.0).

    Starts at 1.0.  Subtracts 0.4 when the example is flagged as both
    capture-prone and gensym-requiring yet its definition mentions neither
    ``gensym`` (any case) nor ``g!``.  Subtracts 0.25 when an anaphoric
    macro is flagged as requiring gensyms.  The result is floored at 0.0.
    """
    definition = ex.macro_definition
    score = 1.0

    flagged = ex.has_capture_risk and ex.requires_gensyms
    if flagged and "gensym" not in definition.lower() and "g!" not in definition:
        score -= 0.4

    if MacroTechnique.ANAPHOR in ex.technique and ex.requires_gensyms:
        score -= 0.25

    return max(0.0, score)
def _score_transformation(ex):
    """Score how substantial the transformation in *ex* is (0.0 to 1.0).

    Starts at 0.5 and adds:

    * 0.15 when the input code is longer than 50 characters;
    * 0.1 when the definition has at least three newlines;
    * a per-technique bonus for each entry in ``ex.technique`` that is
      listed in the bonus table (other techniques add nothing);
    * 0.0 / 0.05 / 0.1 for BASIC / INTERMEDIATE / ADVANCED complexity.

    The result is capped at 1.0.
    """
    technique_bonus = {
        MacroTechnique.RECURSIVE_EXPANSION: 0.1,
        MacroTechnique.CODE_WALKING: 0.1,
        MacroTechnique.COMPILER_MACRO: 0.08,
        MacroTechnique.SYMBOL_MACROLET: 0.08,
        MacroTechnique.MACROLET: 0.05,
        MacroTechnique.NESTED_BACKQUOTE: 0.05,
    }
    complexity_bonus = {
        Complexity.BASIC: 0.0,
        Complexity.INTERMEDIATE: 0.05,
        Complexity.ADVANCED: 0.1,
    }

    total = 0.5
    if len(ex.before_code) > 50:
        total += 0.15
    if ex.macro_definition.count("\n") >= 3:
        total += 0.1
    for technique in ex.technique:
        total += technique_bonus.get(technique, 0.0)
    total += complexity_bonus[ex.complexity]
    return min(1.0, total)
def _score_clarity(ex):
    """Score how well *ex* is explained (0.0 to 1.0).

    Starts at 0.5 and adds 0.15 for a problem pattern longer than 20
    characters, 0.2 for a commentary longer than 30 characters, 0.1 for
    input code longer than 30 characters and 0.05 when a source chapter
    is given.  The result is capped at 1.0.
    """
    commentary = ex.commentary
    # (criterion met?, bonus) pairs, applied in this order.
    criteria = (
        (len(ex.problem_pattern) > 20, 0.15),
        (bool(commentary) and len(commentary) > 30, 0.2),
        (len(ex.before_code) > 30, 0.1),
        (bool(ex.source_chapter), 0.05),
    )
    total = 0.5
    for met, bonus in criteria:
        if met:
            total += bonus
    return min(1.0, total)