977 CL macro transformation examples: CL-native pipeline with SBCL verification

d69fc90 verified 8 days ago

10.8 kB

	"""Classifier: auto-detect metadata + score quality of macro examples.

	Phase 3: Automatic detection of technique, category, complexity, capture risk.
	"""

	from __future__ import annotations

	import re
	from collections import Counter

	from cl_macros.schema import (
	Complexity,
	MacroCategory,
	MacroTechnique,
	Source,
	TransformationExample,
	)


	def detect_techniques(macro_def: str) -> list[MacroTechnique]:
	    """Detect which macro-writing techniques appear in a macro definition.

	    Every technique is recognised by an independent textual heuristic
	    (substring test, regex, or helper) applied to the raw source, so a single
	    definition may yield several techniques.

	    Args:
	        macro_def: Source text of the macro definition.

	    Returns:
	        The detected techniques, in the fixed order the checks run below.
	    """
	    code = macro_def
	    techniques: list[MacroTechnique] = []

	    if "define-compiler-macro" in code:
	        techniques.append(MacroTechnique.COMPILER_MACRO)
	    if re.search(r"\(symbol-macrolet\b", code):
	        techniques.append(MacroTechnique.SYMBOL_MACROLET)
	    if re.search(r"\(macrolet\b", code):
	        techniques.append(MacroTechnique.MACROLET)
	    if "tagbody" in code and "go " in code:
	        techniques.append(MacroTechnique.TAGBODY)
	    # Alternation must be a bare `|`: in Python's `re`, `\|` matches a
	    # literal pipe character, which made this rule unmatchable.
	    if re.search(r"get-setf-method|\bdefsetf\b", code):
	        techniques.append(MacroTechnique.DEFSETF)
	    if "once-only" in code or "o!-" in code:
	        techniques.append(MacroTechnique.ONCE_ONLY)
	    if "gensym" in code.lower() or "g!-" in code:
	        techniques.append(MacroTechnique.GENSYM)
	    if re.search(r"\(case\s+\(car\b", code):
	        techniques.append(MacroTechnique.DLAMBDA)
	    if re.search(r"set-(?:dispatch-)?macro-character", code):
	        techniques.append(MacroTechnique.READER)
	    if _has_nested_backquote(code):
	        techniques.append(MacroTechnique.NESTED_BACKQUOTE)
	    if "anaphor" in code.lower() or _has_anaphor_injection(code):
	        techniques.append(MacroTechnique.ANAPHOR)
	    # Same literal-pipe fix as the DEFSETF rule above.
	    if re.search(r"\bflatten\b|\bremove-if-not.*body\b", code):
	        techniques.append(MacroTechnique.CODE_WALKING)
	    if _detect_recursive_expansion(code):
	        techniques.append(MacroTechnique.RECURSIVE_EXPANSION)

	    return techniques


	def detect_category(macro_def: str, techniques: list[MacroTechnique]) -> MacroCategory:
	    """Pick a single category for a macro from its source text and techniques.

	    The rules are evaluated top to bottom and the first match wins, so their
	    order is significant.

	    Args:
	        macro_def: Source text of the macro definition.
	        techniques: Techniques already detected for ``macro_def``.

	    Returns:
	        The category of the first matching rule, or
	        ``MacroCategory.CONTROL_FLOW`` when no rule matches.
	    """
	    code = macro_def

	    if MacroTechnique.COMPILER_MACRO in techniques and (
	        "format" in code or "constantp" in code
	    ):
	        return MacroCategory.COMPILER_MACRO

	    if MacroTechnique.ANAPHOR in techniques:
	        if "dlambda" in code or "pandoric" in code:
	            return MacroCategory.DISPATCH
	        return MacroCategory.ANAPHORIC

	    if MacroTechnique.DLAMBDA in techniques:
	        return MacroCategory.DISPATCH

	    gensym_helpers = ("defmacro/g!", "defmacro!", "with-gensyms", "with-unique-names")
	    if "gensym" in code.lower() and any(kw in code for kw in gensym_helpers):
	        return MacroCategory.CAPTURE_MANAGEMENT

	    # ANAPHOR and DLAMBDA have both returned above, so no extra guard against
	    # them is needed here.
	    if MacroTechnique.GENSYM in techniques and (
	        "defmacro/g!" in code or "defmacro!" in code
	    ):
	        return MacroCategory.CAPTURE_MANAGEMENT

	    # The previous pattern contained `\p` (rejected by `re` as a bad escape,
	    # so this rule raised re.error) and `\|` (a literal pipe, not alternation).
	    # NOTE(review): `.*\*paths\*` assumes the intent was "push ... *paths*";
	    # confirm against the upstream file.
	    if re.search(r"\bpush\b.*\*paths\*|\bfail\b|\bchoose\b|backtrack", code):
	        return MacroCategory.CONTROL_FLOW

	    if MacroTechnique.TAGBODY in techniques or (
	        MacroTechnique.MACROLET in techniques and "go " in code
	    ):
	        return MacroCategory.CONTROL_FLOW

	    if MacroTechnique.SYMBOL_MACROLET in techniques:
	        return MacroCategory.DISPATCH

	    if MacroTechnique.READER in techniques:
	        return MacroCategory.READ_MACRO

	    if re.search(r"batcher|sorting|network|comparator|cons-pool|tlist", code):
	        return MacroCategory.EFFICIENCY

	    if "eval" in code and "pandoric" in code:
	        return MacroCategory.SCOPE

	    if re.search(r"\bwith-\b", code) or re.search(r"def.unit|def.sql|define-", code):
	        return MacroCategory.DSL

	    # Default bucket. The former trailing "if ... do / cond ... let" rule
	    # returned this same value, so it had no effect and was removed.
	    return MacroCategory.CONTROL_FLOW


	def assess_complexity(
	    macro_def: str, techniques: list[MacroTechnique]
	) -> Complexity:
	    """Rate a macro as basic, intermediate or advanced with a point score.

	    Points come from four sources: size (newline count), the number of
	    "advanced" techniques present, how often gensym machinery is mentioned,
	    and the estimated backquote depth.

	    Args:
	        macro_def: Source text of the macro definition.
	        techniques: Techniques already detected for ``macro_def``.

	    Returns:
	        ``Complexity.ADVANCED`` for a score of 5 or more,
	        ``Complexity.INTERMEDIATE`` for 2 to 4, otherwise ``Complexity.BASIC``.
	    """
	    score = 0

	    # Size is measured in newline characters, i.e. one less than line count.
	    newline_count = macro_def.count("\n")
	    if newline_count >= 15:
	        score += 3
	    elif newline_count >= 8:
	        score += 2
	    elif newline_count >= 4:
	        score += 1

	    advanced = {
	        MacroTechnique.CODE_WALKING,
	        MacroTechnique.RECURSIVE_EXPANSION,
	        MacroTechnique.COMPILER_MACRO,
	        MacroTechnique.DLAMBDA,
	        MacroTechnique.TAGBODY,
	        MacroTechnique.NESTED_BACKQUOTE,
	    }
	    score += len(advanced & set(techniques)) * 2

	    # Match "gensym" case-insensitively, as the other detectors in this
	    # module do, so upper-case Lisp source is counted too.
	    gensym_count = macro_def.lower().count("gensym") + macro_def.count("g!-")
	    if gensym_count > 3:
	        score += 2
	    elif gensym_count > 1:
	        score += 1

	    bq_depth = _estimate_backquote_depth(macro_def)
	    if bq_depth > 2:
	        score += 2
	    elif bq_depth > 1:
	        score += 1

	    if score >= 5:
	        return Complexity.ADVANCED
	    if score >= 2:
	        return Complexity.INTERMEDIATE
	    return Complexity.BASIC


	def assess_capture_risk(
	    macro_def: str, techniques: list[MacroTechnique]
	) -> tuple[bool, bool]:
	    """Estimate variable-capture risk and gensym usage for a macro.

	    Args:
	        macro_def: Source text of the macro definition.
	        techniques: Techniques already detected for ``macro_def``.

	    Returns:
	        A ``(has_capture_risk, requires_gensyms)`` pair. Anaphoric macros are
	        always flagged as risky; otherwise a macro is risky only when it takes
	        a body and shows no gensym usage.
	    """
	    uses_gensyms = "gensym" in macro_def.lower() or "g!-" in macro_def

	    if MacroTechnique.ANAPHOR in techniques:
	        return (True, uses_gensyms)

	    if uses_gensyms:
	        return (False, True)

	    takes_body = "&body" in macro_def or "&rest body" in macro_def
	    return (takes_body, False)


	def score_example(example: TransformationExample) -> float:
	    """Combine the four per-aspect scores into one weighted quality score.

	    Weights: correctness 0.35, hygiene 0.25, transformation 0.25,
	    clarity 0.15.
	    """
	    weighted_scorers = (
	        (_score_correctness, 0.35),
	        (_score_hygiene, 0.25),
	        (_score_transformation, 0.25),
	        (_score_clarity, 0.15),
	    )
	    return sum(scorer(example) * weight for scorer, weight in weighted_scorers)


	def classify_all(examples):
	    """Fill in missing metadata on every example in place, then score it.

	    Fields that already carry a value are left alone: techniques when
	    non-empty, category when not ``None``, complexity unless it is ``BASIC``
	    on a definition of more than three lines, and the capture flags when
	    either one is already set. The quality score is always recomputed.

	    Returns:
	        The same ``examples`` object that was passed in.
	    """
	    for item in examples:
	        source_text = item.macro_definition

	        if not item.technique:
	            item.technique = detect_techniques(source_text)

	        if item.macro_category is None:
	            item.macro_category = detect_category(source_text, item.technique)

	        # Three or more newlines means more than three lines.
	        if item.complexity == Complexity.BASIC and source_text.count("\n") >= 3:
	            item.complexity = assess_complexity(source_text, item.technique)

	        if not (item.has_capture_risk or item.requires_gensyms):
	            risk_flag, gensym_flag = assess_capture_risk(source_text, item.technique)
	            item.has_capture_risk = risk_flag
	            item.requires_gensyms = gensym_flag

	        item.quality_score = score_example(item)

	    return examples


	def quality_report(examples):
	    """Summarise quality scores and metadata distributions for examples.

	    Score statistics are computed over examples whose ``quality_score`` is
	    not ``None``; ``total_examples`` and the distributions cover every
	    example passed in.

	    Args:
	        examples: Sized iterable of examples exposing ``quality_score``,
	            ``macro_category``, ``complexity`` and ``source``.

	    Returns:
	        A dict of summary statistics, or ``{"error": ...}`` when there are no
	        examples or none of them has been scored.
	    """
	    if not examples:
	        return {"error": "No examples"}
	    scores = [ex.quality_score for ex in examples if ex.quality_score is not None]
	    if not scores:
	        return {"error": "No scored examples"}

	    # Unscored examples are tolerated above, and such examples may not have
	    # a category yet; bucket them instead of failing on ``None.value``.
	    category_labels = (
	        ex.macro_category.value if ex.macro_category is not None else "unclassified"
	        for ex in examples
	    )
	    return {
	        "total_examples": len(examples),
	        "mean_score": sum(scores) / len(scores),
	        "min_score": min(scores),
	        "max_score": max(scores),
	        "below_threshold": sum(1 for s in scores if s < 0.5),
	        "category_distribution": dict(Counter(category_labels)),
	        "complexity_distribution": dict(Counter(ex.complexity.value for ex in examples)),
	        "source_distribution": dict(Counter(ex.source.value for ex in examples)),
	    }


	def filter_quality(examples, min_score=0.5):
	    """Return the examples whose quality score is at least ``min_score``.

	    Examples that have not been scored (``quality_score`` is ``None``) are
	    dropped. Input order is preserved.
	    """
	    kept = []
	    for candidate in examples:
	        if candidate.quality_score is None:
	            continue
	        if candidate.quality_score >= min_score:
	            kept.append(candidate)
	    return kept


	# --- Private helpers ---

	def _has_nested_backquote(code: str) -> bool:
	"""Check if a macro definition contains nested backquotes."""
	depth = 0
	max_depth = 0
	i = 0
	while i < len(code):
	ch = code[i]
	if ch == '"':
	i += 1
	while i < len(code) and code[i] != '"':
	if code[i] == "\\":
	i += 1
	i += 1
	i += 1
	continue
	if ch == ";" and (i == 0 or code[i - 1] != "#"):
	while i < len(code) and code[i] != "\n":
	i += 1
	continue
	if ch == "`":
	depth += 1
	max_depth = max(max_depth, depth)
	elif ch == "," and i + 1 < len(code) and code[i + 1] == "@":
	depth -= 1
	elif ch == ",":
	depth -= 1
	i += 1
	return max_depth > 1


	def _has_anaphor_injection(code: str) -> bool:
	anaphors = ["'it", "'self", "'this", " it", " self", " this"]
	return any(a in code for a in anaphors)


	def _detect_recursive_expansion(code: str) -> bool:
	"""Detect if a macro uses recursive expansion (calls itself during expansion)."""
	names = re.findall(r"\(defmacro\w*\s+([^\s()]+)", code)
	for name in names:
	if f"({name}" in code and f"(defmacro" not in (
	code[code.find(f"({name}") - 10 : code.find(f"({name}")]
	):
	return True
	return False


	def _estimate_backquote_depth(code: str) -> int:
	depth = 0
	max_depth = 0
	for ch in code:
	if ch == "`":
	depth += 1
	if depth > max_depth:
	max_depth = depth
	elif ch in ",)":
	if depth > 0:
	depth -= 1
	return max_depth


	def _score_correctness(ex):
	score = 1.0
	macro = ex.macro_definition
	if not re.search(r"\((?:defmacro\|define-compiler-macro\|defmacro!\|defmacro/g!)", macro):
	score -= 0.4
	if macro.count("\n") < 2:
	score -= 0.2
	if ex.macro_category is not None and ex.macro_category.value == "read-macro":
	if not re.search(r"set-(?:dispatch-)?macro-character", macro):
	score -= 0.3
	if ex.after_expansion.strip() == ex.before_code.strip():
	score -= 0.3
	if len(ex.after_expansion.strip()) < 10:
	score -= 0.2
	return max(0.0, score)


	def _score_hygiene(ex):
	    """Score variable-capture hygiene of an example in the range 0.0 to 1.0.

	    Starts at 1.0. Loses 0.4 when the example is flagged both as a capture
	    risk and as requiring gensyms, yet its definition mentions neither
	    "gensym" (any case) nor "g!". Loses 0.25 when an anaphoric macro is
	    flagged as requiring gensyms.
	    """
	    definition = ex.macro_definition
	    result = 1.0

	    if ex.has_capture_risk and ex.requires_gensyms:
	        mentions_gensym = "gensym" in definition.lower() or "g!" in definition
	        if not mentions_gensym:
	            result -= 0.4

	    if MacroTechnique.ANAPHOR in ex.technique and ex.requires_gensyms:
	        result -= 0.25

	    return max(0.0, result)


	def _score_transformation(ex):
	    """Score how substantial the transformation is, in the range 0.5 to 1.0.

	    Starts at 0.5 and adds bonuses for longer input code (more than 50
	    characters), a definition with at least three newlines, each technique
	    listed in the bonus table, and the example's complexity level.
	    """
	    technique_bonus = {
	        MacroTechnique.RECURSIVE_EXPANSION: 0.1,
	        MacroTechnique.CODE_WALKING: 0.1,
	        MacroTechnique.COMPILER_MACRO: 0.08,
	        MacroTechnique.SYMBOL_MACROLET: 0.08,
	        MacroTechnique.MACROLET: 0.05,
	        MacroTechnique.NESTED_BACKQUOTE: 0.05,
	    }
	    complexity_bonus = {
	        Complexity.BASIC: 0.0,
	        Complexity.INTERMEDIATE: 0.05,
	        Complexity.ADVANCED: 0.1,
	    }

	    total = 0.5
	    if len(ex.before_code) > 50:
	        total += 0.15
	    if ex.macro_definition.count("\n") >= 3:
	        total += 0.1
	    # Techniques missing from the table contribute nothing.
	    for technique in ex.technique:
	        total += technique_bonus.get(technique, 0.0)
	    total += complexity_bonus[ex.complexity]
	    return min(1.0, total)


	def _score_clarity(ex):
	score = 0.5
	if len(ex.problem_pattern) > 20:
	score += 0.15
	if ex.commentary and len(ex.commentary) > 30:
	score += 0.2
	if len(ex.before_code) > 30:
	score += 0.1
	if ex.source_chapter:
	score += 0.05
	return min(1.0, score)