# File size: 10,800 bytes

"""Classifier: auto-detect metadata + score quality of macro examples.

Phase 3: Automatic detection of technique, category, complexity, capture risk.
"""

from __future__ import annotations

import re
from collections import Counter

from cl_macros.schema import (
    Complexity,
    MacroCategory,
    MacroTechnique,
    Source,
    TransformationExample,
)


def detect_techniques(macro_def: str) -> list[MacroTechnique]:
    """Return the techniques whose textual markers occur in ``macro_def``.

    Every check is independent, so one definition can yield several
    techniques. The result follows the fixed order of the checks below,
    not the order in which the markers appear in the source text.
    """
    source = macro_def
    lowered = source.lower()

    # (matched?, technique) pairs; their order is the order of the result.
    checks = (
        ("define-compiler-macro" in source, MacroTechnique.COMPILER_MACRO),
        (
            re.search(r"\(symbol-macrolet\b", source),
            MacroTechnique.SYMBOL_MACROLET,
        ),
        (re.search(r"\(macrolet\b", source), MacroTechnique.MACROLET),
        ("tagbody" in source and "go " in source, MacroTechnique.TAGBODY),
        (
            re.search(r"get-setf-method|\bdefsetf\b", source),
            MacroTechnique.DEFSETF,
        ),
        ("once-only" in source or "o!-" in source, MacroTechnique.ONCE_ONLY),
        # Only the "gensym" test is case-insensitive; "g!-" is matched as-is.
        ("gensym" in lowered or "g!-" in source, MacroTechnique.GENSYM),
        (re.search(r"\(case\s+\(car\b", source), MacroTechnique.DLAMBDA),
        (
            re.search(r"set-(?:dispatch-)?macro-character", source),
            MacroTechnique.READER,
        ),
        (_has_nested_backquote(source), MacroTechnique.NESTED_BACKQUOTE),
        (
            "anaphor" in lowered or _has_anaphor_injection(source),
            MacroTechnique.ANAPHOR,
        ),
        (
            re.search(r"\bflatten\b|\bremove-if-not.*body\b", source),
            MacroTechnique.CODE_WALKING,
        ),
        (
            _detect_recursive_expansion(source),
            MacroTechnique.RECURSIVE_EXPANSION,
        ),
    )
    return [technique for matched, technique in checks if matched]


def detect_category(macro_def: str, techniques: list[MacroTechnique]) -> MacroCategory:
    """Pick a single category for a macro from its text and techniques.

    The rules are tried top to bottom and the first one that applies wins,
    so their order is the precedence. ``MacroCategory.CONTROL_FLOW`` is
    returned when no rule applies.
    """
    text = macro_def

    if MacroTechnique.COMPILER_MACRO in techniques and (
        "format" in text or "constantp" in text
    ):
        return MacroCategory.COMPILER_MACRO

    if MacroTechnique.ANAPHOR in techniques:
        mentions_closure_dispatch = "dlambda" in text or "pandoric" in text
        if mentions_closure_dispatch:
            return MacroCategory.DISPATCH
        return MacroCategory.ANAPHORIC

    if MacroTechnique.DLAMBDA in techniques:
        return MacroCategory.DISPATCH

    gensym_wrappers = ("defmacro/g!", "defmacro!", "with-gensyms", "with-unique-names")
    if "gensym" in text.lower() and any(wrapper in text for wrapper in gensym_wrappers):
        return MacroCategory.CAPTURE_MANAGEMENT

    # Second route to CAPTURE_MANAGEMENT: driven by the technique list rather
    # than by the literal word "gensym" appearing in the text.
    lacks_anaphor_and_dlambda = (
        MacroTechnique.ANAPHOR not in techniques
        and MacroTechnique.DLAMBDA not in techniques
    )
    if (
        MacroTechnique.GENSYM in techniques
        and lacks_anaphor_and_dlambda
        and ("defmacro/g!" in text or "defmacro!" in text)
    ):
        return MacroCategory.CAPTURE_MANAGEMENT

    if re.search(r"\bpush\b.*\*paths\*|\bfail\b|\bchoose\b|backtrack", text):
        return MacroCategory.CONTROL_FLOW

    macrolet_with_go = MacroTechnique.MACROLET in techniques and "go " in text
    if MacroTechnique.TAGBODY in techniques or macrolet_with_go:
        return MacroCategory.CONTROL_FLOW

    if MacroTechnique.SYMBOL_MACROLET in techniques:
        return MacroCategory.DISPATCH

    if MacroTechnique.READER in techniques:
        return MacroCategory.READ_MACRO

    if re.search(r"batcher|sorting|network|comparator|cons-pool|cons-pool|tlist", text):
        return MacroCategory.EFFICIENCY

    if "eval" in text and "pandoric" in text:
        return MacroCategory.SCOPE

    looks_like_dsl = re.search(r"\bwith-\b", text) or re.search(
        r"def.*unit|def.*sql|define-", text
    )
    if looks_like_dsl:
        return MacroCategory.DSL

    # NOTE: this rule and the fallback below return the same category.
    if re.search(r"\bif\b.*\bdo\b|\bcond\b.*\blet\b", text):
        return MacroCategory.CONTROL_FLOW

    return MacroCategory.CONTROL_FLOW


def assess_complexity(
    macro_def: str, techniques: list[MacroTechnique]
) -> Complexity:
    """Rate a macro as BASIC, INTERMEDIATE or ADVANCED from a point total.

    Points come from four sources: the number of newline characters in the
    definition, how many "advanced" techniques are present (2 points each),
    how often "gensym"/"g!-" occur, and the estimated backquote depth.
    A total of 5 or more is ADVANCED, 2 or more is INTERMEDIATE.
    """
    # Size: counted in newline characters, not in lines.
    newline_count = macro_def.count("\n")
    if newline_count >= 15:
        points = 3
    elif newline_count >= 8:
        points = 2
    elif newline_count >= 4:
        points = 1
    else:
        points = 0

    # Techniques that weigh two points apiece.
    heavy_techniques = {
        MacroTechnique.CODE_WALKING,
        MacroTechnique.RECURSIVE_EXPANSION,
        MacroTechnique.COMPILER_MACRO,
        MacroTechnique.DLAMBDA,
        MacroTechnique.TAGBODY,
        MacroTechnique.NESTED_BACKQUOTE,
    }
    points += 2 * len(set(techniques) & heavy_techniques)

    # Gensym usage (case-sensitive substring counts).
    gensym_mentions = macro_def.count("gensym") + macro_def.count("g!-")
    if gensym_mentions > 3:
        points += 2
    elif gensym_mentions > 1:
        points += 1

    # Backquote nesting, as reported by the estimator helper.
    backquote_depth = _estimate_backquote_depth(macro_def)
    if backquote_depth > 2:
        points += 2
    elif backquote_depth > 1:
        points += 1

    if points >= 5:
        return Complexity.ADVANCED
    return Complexity.INTERMEDIATE if points >= 2 else Complexity.BASIC


def assess_capture_risk(
    macro_def: str, techniques: list[MacroTechnique]
) -> tuple[bool, bool]:
    """Return ``(has_capture_risk, requires_gensyms)`` for a macro.

    ``requires_gensyms`` is true when the text contains "gensym" (any case)
    or "g!-". A macro using the ANAPHOR technique is always reported as
    having capture risk; otherwise the risk flag is set only when the macro
    takes a body ("&body" / "&rest body") and shows no gensym usage.
    """
    uses_gensyms = "gensym" in macro_def.lower() or "g!-" in macro_def

    if MacroTechnique.ANAPHOR in techniques:
        return (True, uses_gensyms)

    takes_body = "&body" in macro_def or "&rest body" in macro_def
    # When the risk flag is True, uses_gensyms is necessarily False, so one
    # tuple covers both remaining outcomes.
    return (takes_body and not uses_gensyms, uses_gensyms)


def score_example(example: TransformationExample) -> float:
    """Combine the four partial scores of ``example`` into a weighted sum.

    Weights: correctness 0.35, hygiene 0.25, transformation 0.25,
    clarity 0.15.
    """
    weighted_scorers = (
        (_score_correctness, 0.35),
        (_score_hygiene, 0.25),
        (_score_transformation, 0.25),
        (_score_clarity, 0.15),
    )
    return sum(scorer(example) * weight for scorer, weight in weighted_scorers)


def classify_all(examples):
    """Fill in missing metadata on each example in place, then score it.

    Fields that already carry a value are left alone: techniques are
    detected only when ``technique`` is empty, the category only when it is
    ``None``, complexity only when it is still BASIC and the definition has
    more than three lines, and the capture flags only when both are falsy.
    ``quality_score`` is always recomputed. Returns the ``examples`` object
    it was given.
    """
    for item in examples:
        definition = item.macro_definition

        if not item.technique:
            item.technique = detect_techniques(definition)

        if item.macro_category is None:
            item.macro_category = detect_category(definition, item.technique)

        # "More than three lines" is the same as "at least three newlines".
        if item.complexity == Complexity.BASIC and definition.count("\n") >= 3:
            item.complexity = assess_complexity(definition, item.technique)

        if not (item.has_capture_risk or item.requires_gensyms):
            item.has_capture_risk, item.requires_gensyms = assess_capture_risk(
                definition, item.technique
            )

        item.quality_score = score_example(item)
    return examples


def quality_report(examples):
    """Summarise scores and metadata distributions for ``examples``.

    Returns ``{"error": ...}`` when there are no examples or none of them
    has a score. Score statistics cover only examples whose
    ``quality_score`` is not ``None``; ``total_examples`` and the three
    distributions cover every example.
    """
    if not examples:
        return {"error": "No examples"}

    scored = [ex.quality_score for ex in examples if ex.quality_score is not None]
    if not scored:
        return {"error": "No scored examples"}

    categories = Counter(ex.macro_category.value for ex in examples)
    complexities = Counter(ex.complexity.value for ex in examples)
    sources = Counter(ex.source.value for ex in examples)
    low_scores = [s for s in scored if s < 0.5]

    report = {}
    report["total_examples"] = len(examples)
    report["mean_score"] = sum(scored) / len(scored)
    report["min_score"] = min(scored)
    report["max_score"] = max(scored)
    report["below_threshold"] = len(low_scores)
    report["category_distribution"] = dict(categories)
    report["complexity_distribution"] = dict(complexities)
    report["source_distribution"] = dict(sources)
    return report


def filter_quality(examples, min_score=0.5):
    """Return the examples whose ``quality_score`` is at least ``min_score``.

    Examples without a score (``None``) are dropped. Input order is kept
    and a new list is returned.
    """
    kept = []
    for candidate in examples:
        score = candidate.quality_score
        if score is not None and score >= min_score:
            kept.append(candidate)
    return kept


# --- Private helpers ---

def _has_nested_backquote(code: str) -> bool:
    """Check if a macro definition contains nested backquotes.

    A backquote is "nested" when it occurs while another backquote is still
    in effect, i.e. inside a backquoted form and not inside an unquote that
    cancels it. For example the inner template in
    ```(let ((,g ,x)) `(foo ,,g))`` is nested, while the two templates in
    ``(if x `(a) `(b))`` are siblings and are not.

    The scan follows list structure instead of keeping one global counter:
    a backquote raises, and a comma lowers, the quasi-quotation level of
    the *next form only*, and the level in effect before a list is restored
    when that list closes. String literals, ``;`` line comments and
    ``#\\x`` character literals are skipped, so backquotes inside them are
    not counted.

    Args:
        code: Lisp source text of the macro definition.

    Returns:
        True if some backquote is read at quasi-quotation level 2 or more.
    """
    terminators = "()\"';`,"  # characters that end an atom (plus whitespace)
    level = 0  # quasi-quotation level inside the list being read
    pending = 0  # net effect of ` and , prefixes waiting for their form
    outer_levels: list[int] = []  # level to restore at each closing paren

    i = 0
    n = len(code)
    while i < n:
        ch = code[i]

        if ch == '"':
            # String literal: skip to the closing quote, honouring escapes.
            i += 1
            while i < n and code[i] != '"':
                if code[i] == "\\":
                    i += 1
                i += 1
            i += 1
            pending = 0  # the string was the form the prefixes applied to
            continue

        if ch == ";" and (i == 0 or code[i - 1] != "#"):
            # Line comment: ignore everything up to the newline.
            while i < n and code[i] != "\n":
                i += 1
            continue

        if ch == "#" and i + 1 < n:
            following = code[i + 1]
            if following == "\\":
                # Character literal such as #\` or #\( or #\Space.
                i += 3
                while i < n and not (code[i].isspace() or code[i] in terminators):
                    i += 1
                pending = 0
                continue
            if following in "'(":
                # #'fn and #(...) : '#' is only a prefix of the next form.
                i += 1
                continue

        if ch == "`":
            pending += 1
            if level + pending > 1:
                return True
        elif ch == ",":
            # Covers ,x as well as ,@x and ,.x: the '@' or '.' is simply
            # read as the first character of the following atom.
            pending -= 1
        elif ch == "(":
            outer_levels.append(level)
            level += pending
            pending = 0
        elif ch == ")":
            # Unbalanced closers fall back to the top level.
            level = outer_levels.pop() if outer_levels else 0
            pending = 0
        elif ch == "'" or ch.isspace():
            pass  # quote prefix / whitespace: keep waiting for the form
        else:
            # Atom: it consumes any pending prefixes and cannot nest.
            i += 1
            while i < n and not (code[i].isspace() or code[i] in terminators):
                i += 1
            pending = 0
            continue
        i += 1
    return False


def _has_anaphor_injection(code: str) -> bool:
    """Report whether ``code`` mentions ``it``, ``self`` or ``this`` as a symbol.

    The name must be a complete symbol: it has to be followed by whitespace,
    a parenthesis, a quote character, a backquote, a comma, a semicolon or
    the end of the text, so longer symbols such as ``items``, ``itself`` or
    ``this-one`` do not count. It may be preceded by whitespace, a quote
    (``'it``), an opening parenthesis (as in the binding ``((it ,test))``)
    or the start of the text.

    The match is case-sensitive and purely textual: occurrences inside
    strings or comments are matched as well.
    """
    whole_symbol = r"(?:^|[\s('])(?:it|self|this)(?=$|[\s()'\"`,;])"
    return re.search(whole_symbol, code) is not None


def _detect_recursive_expansion(code: str) -> bool:
    """Detect if a macro uses recursive expansion (calls itself during expansion).

    Every name introduced by a ``(defmacro...`` form in ``code`` (including
    the ``defmacro!`` and ``defmacro/g!`` variants) is looked up as the head
    of a form, ``(name ...)``. The name must be the complete operator, so
    ``(foo-helper ...)`` is not treated as a call to ``foo``. An occurrence
    is ignored when ``(defmacro`` appears in the ten characters before it.

    Args:
        code: Lisp source text of the macro definition.

    Returns:
        True if any defined macro name is used as a form head in ``code``.
    """
    defined_names = re.findall(r"\(defmacro[\w/!]*\s+([^\s()]+)", code)
    for name in defined_names:
        # "(name" followed by a delimiter or the end of the text, so that a
        # longer symbol sharing the same prefix does not match.
        call_site = re.compile(r"\(" + re.escape(name) + r"(?=$|[\s()'\"`,;])")
        for match in call_site.finditer(code):
            # Clamp at 0: a negative start would slice from the end instead.
            preceding = code[max(0, match.start() - 10) : match.start()]
            if "(defmacro" not in preceding:
                return True
    return False


def _estimate_backquote_depth(code: str) -> int:
    """Return a rough estimate of the deepest backquote nesting in ``code``.

    Character-level heuristic: each backquote raises a running counter, and
    each comma or closing parenthesis lowers it again (never below zero).
    The highest value the counter reaches is returned. Strings and comments
    are not treated specially.
    """
    current = 0
    peak = 0
    for char in code:
        if char == "`":
            current += 1
            peak = max(peak, current)
        elif char in ",)" and current > 0:
            current -= 1
    return peak


def _score_correctness(ex):
    """Score structural plausibility of an example, from 0.0 to 1.0.

    Starts at 1.0 and subtracts a penalty for each problem found:
    0.4 if no macro-defining form is present, 0.2 if the definition has
    fewer than two newlines, 0.3 if a "read-macro" example never installs a
    macro character, 0.3 if the expansion equals the input (ignoring
    surrounding whitespace) and 0.2 if the expansion is shorter than ten
    characters. The result is clamped at 0.0.
    """
    definition = ex.macro_definition
    penalties = []

    defines_macro = re.search(
        r"\((?:defmacro|define-compiler-macro|defmacro!|defmacro/g!)", definition
    )
    if defines_macro is None:
        penalties.append(0.4)

    if definition.count("\n") < 2:
        penalties.append(0.2)

    category = ex.macro_category
    if category is not None and category.value == "read-macro":
        installs_reader = re.search(r"set-(?:dispatch-)?macro-character", definition)
        if installs_reader is None:
            penalties.append(0.3)

    expansion = ex.after_expansion.strip()
    if expansion == ex.before_code.strip():
        penalties.append(0.3)
    if len(expansion) < 10:
        penalties.append(0.2)

    # Subtract one at a time, in the order the checks ran.
    remaining = 1.0
    for penalty in penalties:
        remaining -= penalty
    return max(0.0, remaining)


def _score_hygiene(ex):
    """Score variable-capture hygiene of an example, from 0.0 to 1.0.

    Starts at 1.0. Loses 0.4 when the example is flagged as both having
    capture risk and requiring gensyms, yet its definition contains neither
    "gensym" (any case) nor "g!". Loses 0.25 when it uses the ANAPHOR
    technique and is flagged as requiring gensyms.
    """
    remaining = 1.0

    if ex.has_capture_risk and ex.requires_gensyms:
        definition = ex.macro_definition
        mentions_gensym = "gensym" in definition.lower() or "g!" in definition
        if not mentions_gensym:
            remaining -= 0.4

    if MacroTechnique.ANAPHOR in ex.technique and ex.requires_gensyms:
        remaining -= 0.25

    return max(0.0, remaining)


def _score_transformation(ex):
    """Score how substantial the demonstrated transformation is (max 1.0).

    Starts at 0.5 and adds: 0.15 when the input code is longer than 50
    characters, 0.1 when the definition has at least three newlines, a
    per-technique bonus for selected techniques, and a bonus based on the
    example's complexity. The total is capped at 1.0.
    """
    technique_bonus = {
        MacroTechnique.RECURSIVE_EXPANSION: 0.1,
        MacroTechnique.CODE_WALKING: 0.1,
        MacroTechnique.COMPILER_MACRO: 0.08,
        MacroTechnique.SYMBOL_MACROLET: 0.08,
        MacroTechnique.MACROLET: 0.05,
        MacroTechnique.NESTED_BACKQUOTE: 0.05,
    }
    complexity_bonus = {
        Complexity.BASIC: 0.0,
        Complexity.INTERMEDIATE: 0.05,
        Complexity.ADVANCED: 0.1,
    }

    total = 0.5
    if len(ex.before_code) > 50:
        total += 0.15
    if ex.macro_definition.count("\n") >= 3:
        total += 0.1

    # Added one by one, in the order the techniques are listed on the example.
    for technique in ex.technique:
        if technique in technique_bonus:
            total += technique_bonus[technique]

    total += complexity_bonus[ex.complexity]
    return min(1.0, total)


def _score_clarity(ex):
    """Score how well an example is described, from 0.5 up to 1.0.

    Starts at 0.5 and adds 0.15 for a problem pattern longer than 20
    characters, 0.2 for a commentary longer than 30 characters, 0.1 for
    input code longer than 30 characters and 0.05 when a source chapter is
    set. The total is capped at 1.0.
    """
    commentary = ex.commentary
    criteria = (
        (len(ex.problem_pattern) > 20, 0.15),
        (bool(commentary) and len(commentary) > 30, 0.2),
        (len(ex.before_code) > 30, 0.1),
        (bool(ex.source_chapter), 0.05),
    )

    total = 0.5
    for satisfied, bonus in criteria:
        if satisfied:
            total += bonus
    return min(1.0, total)