"""Classifier: auto-detect metadata + score quality of macro examples. Phase 3: Automatic detection of technique, category, complexity, capture risk. """ from __future__ import annotations import re from collections import Counter from cl_macros.schema import ( Complexity, MacroCategory, MacroTechnique, Source, TransformationExample, ) def detect_techniques(macro_def: str) -> list[MacroTechnique]: code = macro_def techniques: list[MacroTechnique] = [] if "define-compiler-macro" in code: techniques.append(MacroTechnique.COMPILER_MACRO) if re.search(r"\(symbol-macrolet\b", code): techniques.append(MacroTechnique.SYMBOL_MACROLET) if re.search(r"\(macrolet\b", code): techniques.append(MacroTechnique.MACROLET) if "tagbody" in code and "go " in code: techniques.append(MacroTechnique.TAGBODY) if re.search(r"get-setf-method|\bdefsetf\b", code): techniques.append(MacroTechnique.DEFSETF) if "once-only" in code or "o!-" in code: techniques.append(MacroTechnique.ONCE_ONLY) if "gensym" in code.lower() or "g!-" in code: techniques.append(MacroTechnique.GENSYM) if re.search(r"\(case\s+\(car\b", code): techniques.append(MacroTechnique.DLAMBDA) if re.search(r"set-(?:dispatch-)?macro-character", code): techniques.append(MacroTechnique.READER) if _has_nested_backquote(code): techniques.append(MacroTechnique.NESTED_BACKQUOTE) if "anaphor" in code.lower() or _has_anaphor_injection(code): techniques.append(MacroTechnique.ANAPHOR) if re.search(r"\bflatten\b|\bremove-if-not.*body\b", code): techniques.append(MacroTechnique.CODE_WALKING) if _detect_recursive_expansion(code): techniques.append(MacroTechnique.RECURSIVE_EXPANSION) return techniques def detect_category(macro_def: str, techniques: list[MacroTechnique]) -> MacroCategory: code = macro_def if MacroTechnique.COMPILER_MACRO in techniques: if "format" in code or "constantp" in code: return MacroCategory.COMPILER_MACRO if MacroTechnique.ANAPHOR in techniques: if "dlambda" in code or "pandoric" in code: return MacroCategory.DISPATCH return MacroCategory.ANAPHORIC if MacroTechnique.DLAMBDA in techniques: return MacroCategory.DISPATCH if "gensym" in code.lower() and any( kw in code for kw in ["defmacro/g!", "defmacro!", "with-gensyms", "with-unique-names"] ): return MacroCategory.CAPTURE_MANAGEMENT if MacroTechnique.GENSYM in techniques and not any( t in techniques for t in [MacroTechnique.ANAPHOR, MacroTechnique.DLAMBDA] ): if "defmacro/g!" in code or "defmacro!" in code: return MacroCategory.CAPTURE_MANAGEMENT if re.search(r"\bpush\b.*\*paths\*|\bfail\b|\bchoose\b|backtrack", code): return MacroCategory.CONTROL_FLOW if MacroTechnique.TAGBODY in techniques or ( MacroTechnique.MACROLET in techniques and "go " in code ): return MacroCategory.CONTROL_FLOW if MacroTechnique.SYMBOL_MACROLET in techniques: return MacroCategory.DISPATCH if MacroTechnique.READER in techniques: return MacroCategory.READ_MACRO if re.search(r"batcher|sorting|network|comparator|cons-pool|cons-pool|tlist", code): return MacroCategory.EFFICIENCY if "eval" in code and "pandoric" in code: return MacroCategory.SCOPE if re.search(r"\bwith-\b", code) or re.search(r"def.*unit|def.*sql|define-", code): return MacroCategory.DSL if re.search(r"\bif\b.*\bdo\b|\bcond\b.*\blet\b", code): return MacroCategory.CONTROL_FLOW return MacroCategory.CONTROL_FLOW def assess_complexity( macro_def: str, techniques: list[MacroTechnique] ) -> Complexity: score = 0 lines = macro_def.count("\n") if lines >= 15: score += 3 elif lines >= 8: score += 2 elif lines >= 4: score += 1 advanced = {MacroTechnique.CODE_WALKING, MacroTechnique.RECURSIVE_EXPANSION, MacroTechnique.COMPILER_MACRO, MacroTechnique.DLAMBDA, MacroTechnique.TAGBODY, MacroTechnique.NESTED_BACKQUOTE} score += len(advanced & set(techniques)) * 2 gensym_count = macro_def.count("gensym") + macro_def.count("g!-") if gensym_count > 3: score += 2 elif gensym_count > 1: score += 1 bq_depth = _estimate_backquote_depth(macro_def) if bq_depth > 2: score += 2 elif bq_depth > 1: score += 1 if score >= 5: return Complexity.ADVANCED if score >= 2: return Complexity.INTERMEDIATE return Complexity.BASIC def assess_capture_risk( macro_def: str, techniques: list[MacroTechnique] ) -> tuple[bool, bool]: has_body = "&body" in macro_def or "&rest body" in macro_def needs_gensyms = "gensym" in macro_def.lower() or "g!-" in macro_def is_anaphor = MacroTechnique.ANAPHOR in techniques if is_anaphor: return (True, needs_gensyms) if has_body and not needs_gensyms: return (True, False) return (False, needs_gensyms) def score_example(example: TransformationExample) -> float: scores = [ _score_correctness(example), _score_hygiene(example), _score_transformation(example), _score_clarity(example), ] weights = [0.35, 0.25, 0.25, 0.15] return sum(s * w for s, w in zip(scores, weights)) def classify_all(examples): for ex in examples: if not ex.technique: ex.technique = detect_techniques(ex.macro_definition) if ex.macro_category is None: ex.macro_category = detect_category(ex.macro_definition, ex.technique) if ex.complexity == Complexity.BASIC and len(ex.macro_definition.split("\n")) > 3: ex.complexity = assess_complexity(ex.macro_definition, ex.technique) if not ex.has_capture_risk and not ex.requires_gensyms: has_risk, needs_gensyms = assess_capture_risk(ex.macro_definition, ex.technique) ex.has_capture_risk = has_risk ex.requires_gensyms = needs_gensyms ex.quality_score = score_example(ex) return examples def quality_report(examples): if not examples: return {"error": "No examples"} scores = [ex.quality_score for ex in examples if ex.quality_score is not None] if not scores: return {"error": "No scored examples"} return { "total_examples": len(examples), "mean_score": sum(scores) / len(scores), "min_score": min(scores), "max_score": max(scores), "below_threshold": sum(1 for s in scores if s < 0.5), "category_distribution": dict(Counter(ex.macro_category.value for ex in examples)), "complexity_distribution": dict(Counter(ex.complexity.value for ex in examples)), "source_distribution": dict(Counter(ex.source.value for ex in examples)), } def filter_quality(examples, min_score=0.5): return [ex for ex in examples if ex.quality_score is not None and ex.quality_score >= min_score] # --- Private helpers --- def _has_nested_backquote(code: str) -> bool: """Check if a macro definition contains nested backquotes.""" depth = 0 max_depth = 0 i = 0 while i < len(code): ch = code[i] if ch == '"': i += 1 while i < len(code) and code[i] != '"': if code[i] == "\\": i += 1 i += 1 i += 1 continue if ch == ";" and (i == 0 or code[i - 1] != "#"): while i < len(code) and code[i] != "\n": i += 1 continue if ch == "`": depth += 1 max_depth = max(max_depth, depth) elif ch == "," and i + 1 < len(code) and code[i + 1] == "@": depth -= 1 elif ch == ",": depth -= 1 i += 1 return max_depth > 1 def _has_anaphor_injection(code: str) -> bool: anaphors = ["'it", "'self", "'this", " it", " self", " this"] return any(a in code for a in anaphors) def _detect_recursive_expansion(code: str) -> bool: """Detect if a macro uses recursive expansion (calls itself during expansion).""" names = re.findall(r"\(defmacro\w*\s+([^\s()]+)", code) for name in names: if f"({name}" in code and f"(defmacro" not in ( code[code.find(f"({name}") - 10 : code.find(f"({name}")] ): return True return False def _estimate_backquote_depth(code: str) -> int: depth = 0 max_depth = 0 for ch in code: if ch == "`": depth += 1 if depth > max_depth: max_depth = depth elif ch in ",)": if depth > 0: depth -= 1 return max_depth def _score_correctness(ex): score = 1.0 macro = ex.macro_definition if not re.search(r"\((?:defmacro|define-compiler-macro|defmacro!|defmacro/g!)", macro): score -= 0.4 if macro.count("\n") < 2: score -= 0.2 if ex.macro_category is not None and ex.macro_category.value == "read-macro": if not re.search(r"set-(?:dispatch-)?macro-character", macro): score -= 0.3 if ex.after_expansion.strip() == ex.before_code.strip(): score -= 0.3 if len(ex.after_expansion.strip()) < 10: score -= 0.2 return max(0.0, score) def _score_hygiene(ex): score = 1.0 if ex.has_capture_risk and ex.requires_gensyms: if "gensym" not in ex.macro_definition.lower() and "g!" not in ex.macro_definition: score -= 0.4 if MacroTechnique.ANAPHOR in ex.technique and ex.requires_gensyms: score -= 0.25 return max(0.0, score) def _score_transformation(ex): score = 0.5 if len(ex.before_code) > 50: score += 0.15 if ex.macro_definition.count("\n") >= 3: score += 0.1 bonus = { MacroTechnique.RECURSIVE_EXPANSION: 0.1, MacroTechnique.CODE_WALKING: 0.1, MacroTechnique.COMPILER_MACRO: 0.08, MacroTechnique.SYMBOL_MACROLET: 0.08, MacroTechnique.MACROLET: 0.05, MacroTechnique.NESTED_BACKQUOTE: 0.05, } for tech in ex.technique: score += bonus.get(tech, 0.0) cb = {Complexity.BASIC: 0.0, Complexity.INTERMEDIATE: 0.05, Complexity.ADVANCED: 0.1} score += cb[ex.complexity] return min(1.0, score) def _score_clarity(ex): score = 0.5 if len(ex.problem_pattern) > 20: score += 0.15 if ex.commentary and len(ex.commentary) > 30: score += 0.2 if len(ex.before_code) > 30: score += 0.1 if ex.source_chapter: score += 0.05 return min(1.0, score)