| """Classifier: auto-detect metadata + score quality of macro examples. |
| |
| Phase 3: Automatic detection of technique, category, complexity, capture risk. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
| from collections import Counter |
|
|
| from cl_macros.schema import ( |
| Complexity, |
| MacroCategory, |
| MacroTechnique, |
| Source, |
| TransformationExample, |
| ) |
|
|
|
|
def detect_techniques(macro_def: str) -> list[MacroTechnique]:
    """Infer which macro-writing techniques a macro definition appears to use.

    Every technique is recognised by a purely textual heuristic: a substring
    test, a regular-expression search, or one of the private scanner helpers.
    The checks are independent of each other, and matching techniques are
    reported in the fixed order in which the checks are written below.

    Args:
        macro_def: Source text of the macro definition.

    Returns:
        The detected techniques; empty when no heuristic matches.
    """
    source = macro_def
    lowered = source.lower()
    found: list[MacroTechnique] = []

    def matches(pattern: str) -> bool:
        # Regex search against the (case-preserved) macro source.
        return re.search(pattern, source) is not None

    if "define-compiler-macro" in source:
        found.append(MacroTechnique.COMPILER_MACRO)
    if matches(r"\(symbol-macrolet\b"):
        found.append(MacroTechnique.SYMBOL_MACROLET)
    if matches(r"\(macrolet\b"):
        found.append(MacroTechnique.MACROLET)
    # A tagbody only counts when some "go " transfer also appears.
    if "tagbody" in source and "go " in source:
        found.append(MacroTechnique.TAGBODY)
    if matches(r"get-setf-method|\bdefsetf\b"):
        found.append(MacroTechnique.DEFSETF)
    if "once-only" in source or "o!-" in source:
        found.append(MacroTechnique.ONCE_ONLY)
    # "gensym" is matched case-insensitively; the g!- prefix is not.
    if "gensym" in lowered or "g!-" in source:
        found.append(MacroTechnique.GENSYM)
    if matches(r"\(case\s+\(car\b"):
        found.append(MacroTechnique.DLAMBDA)
    if matches(r"set-(?:dispatch-)?macro-character"):
        found.append(MacroTechnique.READER)
    if _has_nested_backquote(source):
        found.append(MacroTechnique.NESTED_BACKQUOTE)
    if "anaphor" in lowered or _has_anaphor_injection(source):
        found.append(MacroTechnique.ANAPHOR)
    if matches(r"\bflatten\b|\bremove-if-not.*body\b"):
        found.append(MacroTechnique.CODE_WALKING)
    if _detect_recursive_expansion(source):
        found.append(MacroTechnique.RECURSIVE_EXPANSION)

    return found
|
|
|
|
def detect_category(macro_def: str, techniques: list[MacroTechnique]) -> MacroCategory:
    """Choose a single category for a macro from its text and techniques.

    The rules are tried top to bottom and the first one that applies wins;
    when nothing applies the macro is treated as control flow.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for this macro.

    Returns:
        The first matching category, or ``MacroCategory.CONTROL_FLOW`` as
        the fallback.
    """
    code = macro_def

    # A compiler macro is only categorised as such when it also mentions
    # "format" or "constantp"; otherwise the later rules get a chance.
    if MacroTechnique.COMPILER_MACRO in techniques and (
        "format" in code or "constantp" in code
    ):
        return MacroCategory.COMPILER_MACRO

    if MacroTechnique.ANAPHOR in techniques:
        if "dlambda" in code or "pandoric" in code:
            return MacroCategory.DISPATCH
        return MacroCategory.ANAPHORIC

    if MacroTechnique.DLAMBDA in techniques:
        return MacroCategory.DISPATCH

    gensym_wrappers = ("defmacro/g!", "defmacro!", "with-gensyms", "with-unique-names")
    if "gensym" in code.lower() and any(kw in code for kw in gensym_wrappers):
        return MacroCategory.CAPTURE_MANAGEMENT

    # Covers definitions whose GENSYM technique was supplied without the
    # literal word "gensym" appearing in the text.  ANAPHOR and DLAMBDA have
    # already returned above, so they need no re-check here.
    if MacroTechnique.GENSYM in techniques and (
        "defmacro/g!" in code or "defmacro!" in code
    ):
        return MacroCategory.CAPTURE_MANAGEMENT

    if re.search(r"\bpush\b.*\*paths\*|\bfail\b|\bchoose\b|backtrack", code):
        return MacroCategory.CONTROL_FLOW

    if MacroTechnique.TAGBODY in techniques or (
        MacroTechnique.MACROLET in techniques and "go " in code
    ):
        return MacroCategory.CONTROL_FLOW

    if MacroTechnique.SYMBOL_MACROLET in techniques:
        return MacroCategory.DISPATCH

    if MacroTechnique.READER in techniques:
        return MacroCategory.READ_MACRO

    if re.search(r"batcher|sorting|network|comparator|cons-pool|tlist", code):
        return MacroCategory.EFFICIENCY

    if "eval" in code and "pandoric" in code:
        return MacroCategory.SCOPE

    if re.search(r"\bwith-\b", code) or re.search(r"def.*unit|def.*sql|define-", code):
        return MacroCategory.DSL

    # Default: anything not recognised above is reported as control flow.
    return MacroCategory.CONTROL_FLOW
|
|
|
|
def assess_complexity(
    macro_def: str, techniques: list[MacroTechnique]
) -> Complexity:
    """Rate a macro as basic, intermediate or advanced using a point score.

    Points are awarded for the number of newlines in the definition, for each
    "advanced" technique present, for the number of gensym mentions, and for
    the estimated backquote depth.  A total of 5 or more is ADVANCED, 2 or
    more is INTERMEDIATE, anything lower is BASIC.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for this macro.

    Returns:
        The complexity tier matching the accumulated score.
    """
    points = 0

    # Size: only the first (largest) threshold reached contributes.
    newline_count = macro_def.count("\n")
    for threshold, award in ((15, 3), (8, 2), (4, 1)):
        if newline_count >= threshold:
            points += award
            break

    # Two points per distinct advanced technique.
    heavyweight = {
        MacroTechnique.CODE_WALKING,
        MacroTechnique.RECURSIVE_EXPANSION,
        MacroTechnique.COMPILER_MACRO,
        MacroTechnique.DLAMBDA,
        MacroTechnique.TAGBODY,
        MacroTechnique.NESTED_BACKQUOTE,
    }
    points += 2 * len(heavyweight.intersection(techniques))

    # Gensym usage (case-sensitive substring counts).
    gensym_mentions = macro_def.count("gensym") + macro_def.count("g!-")
    if gensym_mentions > 3:
        points += 2
    elif gensym_mentions > 1:
        points += 1

    # Backquote nesting as estimated by the helper scanner.
    nesting = _estimate_backquote_depth(macro_def)
    if nesting > 2:
        points += 2
    elif nesting > 1:
        points += 1

    if points >= 5:
        return Complexity.ADVANCED
    return Complexity.INTERMEDIATE if points >= 2 else Complexity.BASIC
|
|
|
|
def assess_capture_risk(
    macro_def: str, techniques: list[MacroTechnique]
) -> tuple[bool, bool]:
    """Estimate variable-capture risk and gensym usage for a macro.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for this macro.

    Returns:
        A ``(has_capture_risk, requires_gensyms)`` pair.  Anaphoric macros
        are always flagged as risky; otherwise a macro is risky when it takes
        a body (``&body`` / ``&rest body``) and mentions no gensyms.  The
        second element reports whether "gensym" (any case) or ``g!-`` occurs
        in the definition.
    """
    uses_gensyms = "gensym" in macro_def.lower() or "g!-" in macro_def

    if MacroTechnique.ANAPHOR in techniques:
        return True, uses_gensyms

    takes_body = "&body" in macro_def or "&rest body" in macro_def
    return (takes_body and not uses_gensyms), uses_gensyms
|
|
|
|
def score_example(example: TransformationExample) -> float:
    """Combine the four per-aspect scores into one weighted quality score.

    Weights: correctness 0.35, hygiene 0.25, transformation 0.25,
    clarity 0.15.

    Args:
        example: The example to score.

    Returns:
        The weighted sum of the four aspect scores.
    """
    aspect_scores = (
        _score_correctness(example),
        _score_hygiene(example),
        _score_transformation(example),
        _score_clarity(example),
    )
    aspect_weights = (0.35, 0.25, 0.25, 0.15)

    total = 0
    for value, weight in zip(aspect_scores, aspect_weights):
        total = total + value * weight
    return total
|
|
|
|
def classify_all(examples):
    """Fill in missing metadata on each example in place and score it.

    For every example: techniques are detected when none are set, the
    category when it is None, the complexity is re-assessed when it is BASIC
    and the definition contains at least three newlines, and the capture-risk
    flags are computed when both are falsy.  The quality score is always
    recomputed.

    Args:
        examples: Iterable of example objects; they are mutated in place.

    Returns:
        The same ``examples`` object that was passed in.
    """
    for item in examples:
        if not item.technique:
            item.technique = detect_techniques(item.macro_definition)

        if item.macro_category is None:
            item.macro_category = detect_category(
                item.macro_definition, item.technique
            )

        # Three or more newlines means more than three lines of text.
        if (
            item.complexity == Complexity.BASIC
            and item.macro_definition.count("\n") >= 3
        ):
            item.complexity = assess_complexity(
                item.macro_definition, item.technique
            )

        if not (item.has_capture_risk or item.requires_gensyms):
            risky, gensyms = assess_capture_risk(
                item.macro_definition, item.technique
            )
            item.has_capture_risk = risky
            item.requires_gensyms = gensyms

        item.quality_score = score_example(item)

    return examples
|
|
|
|
def quality_report(examples):
    """Summarise quality scores and metadata distributions for examples.

    Score statistics (mean, min, max, below-threshold count) are computed
    only over examples whose ``quality_score`` is not None, whereas
    ``total_examples`` and the distributions cover every example.

    Args:
        examples: Sequence of example objects exposing ``quality_score``,
            ``macro_category``, ``complexity`` and ``source``.

    Returns:
        A dict of summary statistics, or ``{"error": ...}`` when there are
        no examples or none of them has a score.
    """
    if not examples:
        return {"error": "No examples"}

    scores = [ex.quality_score for ex in examples if ex.quality_score is not None]
    if not scores:
        return {"error": "No scored examples"}

    # An example may not have been categorised yet (macro_category is None);
    # count it under a placeholder instead of failing on ``None.value``.
    category_counts = Counter(
        ex.macro_category.value if ex.macro_category is not None else "unclassified"
        for ex in examples
    )

    return {
        "total_examples": len(examples),
        "mean_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "max_score": max(scores),
        "below_threshold": sum(1 for s in scores if s < 0.5),
        "category_distribution": dict(category_counts),
        "complexity_distribution": dict(Counter(ex.complexity.value for ex in examples)),
        "source_distribution": dict(Counter(ex.source.value for ex in examples)),
    }
|
|
|
|
def filter_quality(examples, min_score=0.5):
    """Keep only the examples whose quality score reaches ``min_score``.

    Args:
        examples: Iterable of objects exposing ``quality_score``.
        min_score: Inclusive lower bound on the score (default 0.5).

    Returns:
        A new list, in the original order, of the examples that have a
        non-None score greater than or equal to ``min_score``.
    """
    kept = []
    for candidate in examples:
        rating = candidate.quality_score
        if rating is not None and rating >= min_score:
            kept.append(candidate)
    return kept
|
|
|
|
| |
|
|
| def _has_nested_backquote(code: str) -> bool: |
| """Check if a macro definition contains nested backquotes.""" |
| depth = 0 |
| max_depth = 0 |
| i = 0 |
| while i < len(code): |
| ch = code[i] |
| if ch == '"': |
| i += 1 |
| while i < len(code) and code[i] != '"': |
| if code[i] == "\\": |
| i += 1 |
| i += 1 |
| i += 1 |
| continue |
| if ch == ";" and (i == 0 or code[i - 1] != "#"): |
| while i < len(code) and code[i] != "\n": |
| i += 1 |
| continue |
| if ch == "`": |
| depth += 1 |
| max_depth = max(max_depth, depth) |
| elif ch == "," and i + 1 < len(code) and code[i + 1] == "@": |
| depth -= 1 |
| elif ch == ",": |
| depth -= 1 |
| i += 1 |
| return max_depth > 1 |
|
|
|
|
| def _has_anaphor_injection(code: str) -> bool: |
| anaphors = ["'it", "'self", "'this", " it", " self", " this"] |
| return any(a in code for a in anaphors) |
|
|
|
|
| def _detect_recursive_expansion(code: str) -> bool: |
| """Detect if a macro uses recursive expansion (calls itself during expansion).""" |
| names = re.findall(r"\(defmacro\w*\s+([^\s()]+)", code) |
| for name in names: |
| if f"({name}" in code and f"(defmacro" not in ( |
| code[code.find(f"({name}") - 10 : code.find(f"({name}")] |
| ): |
| return True |
| return False |
|
|
|
|
| def _estimate_backquote_depth(code: str) -> int: |
| depth = 0 |
| max_depth = 0 |
| for ch in code: |
| if ch == "`": |
| depth += 1 |
| if depth > max_depth: |
| max_depth = depth |
| elif ch in ",)": |
| if depth > 0: |
| depth -= 1 |
| return max_depth |
|
|
|
|
| def _score_correctness(ex): |
| score = 1.0 |
| macro = ex.macro_definition |
| if not re.search(r"\((?:defmacro|define-compiler-macro|defmacro!|defmacro/g!)", macro): |
| score -= 0.4 |
| if macro.count("\n") < 2: |
| score -= 0.2 |
| if ex.macro_category is not None and ex.macro_category.value == "read-macro": |
| if not re.search(r"set-(?:dispatch-)?macro-character", macro): |
| score -= 0.3 |
| if ex.after_expansion.strip() == ex.before_code.strip(): |
| score -= 0.3 |
| if len(ex.after_expansion.strip()) < 10: |
| score -= 0.2 |
| return max(0.0, score) |
|
|
|
|
def _score_hygiene(ex):
    """Score an example's variable-capture hygiene (0.0-1.0).

    Starting from 1.0: 0.4 is deducted when the example is flagged as having
    capture risk and requiring gensyms, yet its definition mentions neither
    "gensym" (any case) nor "g!"; 0.25 is deducted when the example uses the
    ANAPHOR technique and is also flagged as requiring gensyms.

    Returns:
        The remaining score, clamped at 0.0.
    """
    rating = 1.0

    gensyms_missing = (
        ex.has_capture_risk
        and ex.requires_gensyms
        and "gensym" not in ex.macro_definition.lower()
        and "g!" not in ex.macro_definition
    )
    if gensyms_missing:
        rating -= 0.4

    if MacroTechnique.ANAPHOR in ex.technique and ex.requires_gensyms:
        rating -= 0.25

    return max(0.0, rating)
|
|
|
|
def _score_transformation(ex):
    """Score how substantial the demonstrated transformation is (max 1.0).

    The base is 0.5.  Bonuses: 0.15 when the input code is longer than 50
    characters, 0.1 when the definition has at least three newlines, a
    per-technique bonus for each listed technique, and a bonus for the
    complexity tier.

    Returns:
        The accumulated score, capped at 1.0.
    """
    total = 0.5

    if len(ex.before_code) > 50:
        total += 0.15
    if ex.macro_definition.count("\n") >= 3:
        total += 0.1

    technique_bonus = {
        MacroTechnique.RECURSIVE_EXPANSION: 0.1,
        MacroTechnique.CODE_WALKING: 0.1,
        MacroTechnique.COMPILER_MACRO: 0.08,
        MacroTechnique.SYMBOL_MACROLET: 0.08,
        MacroTechnique.MACROLET: 0.05,
        MacroTechnique.NESTED_BACKQUOTE: 0.05,
    }
    # Techniques without an entry contribute nothing.
    for technique in ex.technique:
        total += technique_bonus.get(technique, 0.0)

    complexity_bonus = {
        Complexity.BASIC: 0.0,
        Complexity.INTERMEDIATE: 0.05,
        Complexity.ADVANCED: 0.1,
    }
    total += complexity_bonus[ex.complexity]

    return min(1.0, total)
|
|
|
|
| def _score_clarity(ex): |
| score = 0.5 |
| if len(ex.problem_pattern) > 20: |
| score += 0.15 |
| if ex.commentary and len(ex.commentary) > 30: |
| score += 0.2 |
| if len(ex.before_code) > 30: |
| score += 0.1 |
| if ex.source_chapter: |
| score += 0.05 |
| return min(1.0, score) |
|
|