File size: 10,800 Bytes
d69fc90
a383597
d69fc90
a383597
 
 
 
 
 
 
 
 
 
 
d69fc90
a383597
 
 
 
d69fc90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a383597
 
 
 
 
 
 
 
 
 
 
d69fc90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a383597
 
 
 
 
 
d69fc90
a383597
 
 
 
 
 
 
 
 
d69fc90
a383597
 
 
 
d69fc90
 
a383597
 
 
d69fc90
 
 
a383597
d69fc90
 
 
a383597
 
 
 
 
 
 
 
d69fc90
 
 
a383597
 
 
d69fc90
a383597
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
"""Classifier: auto-detect metadata + score quality of macro examples.

Phase 3: Automatic detection of technique, category, complexity, capture risk.
"""

from __future__ import annotations

import re
from collections import Counter

from cl_macros.schema import (
    Complexity,
    MacroCategory,
    MacroTechnique,
    Source,
    TransformationExample,
)


def detect_techniques(macro_def: str) -> list[MacroTechnique]:
    """Infer which macro-writing techniques a definition appears to use.

    Every technique is recognised by an independent textual heuristic
    (substring test, regular expression, or one of the private helper
    checks), so a single definition may yield several techniques.

    Args:
        macro_def: Source text of the macro definition.

    Returns:
        The detected techniques, in the fixed order in which the checks run.
    """
    lowered = macro_def.lower()
    found: list[MacroTechnique] = []

    def seen(pattern: str) -> bool:
        # Regex probe against the original (case-preserved) text.
        return re.search(pattern, macro_def) is not None

    if "define-compiler-macro" in macro_def:
        found.append(MacroTechnique.COMPILER_MACRO)
    if seen(r"\(symbol-macrolet\b"):
        found.append(MacroTechnique.SYMBOL_MACROLET)
    if seen(r"\(macrolet\b"):
        found.append(MacroTechnique.MACROLET)
    if "tagbody" in macro_def and "go " in macro_def:
        found.append(MacroTechnique.TAGBODY)
    if seen(r"get-setf-method|\bdefsetf\b"):
        found.append(MacroTechnique.DEFSETF)
    if "once-only" in macro_def or "o!-" in macro_def:
        found.append(MacroTechnique.ONCE_ONLY)
    # "gensym" is matched case-insensitively; the "g!-" prefix is not.
    if "gensym" in lowered or "g!-" in macro_def:
        found.append(MacroTechnique.GENSYM)
    if seen(r"\(case\s+\(car\b"):
        found.append(MacroTechnique.DLAMBDA)
    if seen(r"set-(?:dispatch-)?macro-character"):
        found.append(MacroTechnique.READER)
    if _has_nested_backquote(macro_def):
        found.append(MacroTechnique.NESTED_BACKQUOTE)
    if "anaphor" in lowered or _has_anaphor_injection(macro_def):
        found.append(MacroTechnique.ANAPHOR)
    if seen(r"\bflatten\b|\bremove-if-not.*body\b"):
        found.append(MacroTechnique.CODE_WALKING)
    if _detect_recursive_expansion(macro_def):
        found.append(MacroTechnique.RECURSIVE_EXPANSION)

    return found


def detect_category(macro_def: str, techniques: list[MacroTechnique]) -> MacroCategory:
    """Choose a single category for a macro from its text and techniques.

    The rules run top to bottom and the first one that matches wins, so
    their order is significant. ``MacroCategory.CONTROL_FLOW`` is returned
    when no rule matches.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for this definition.

    Returns:
        The category selected by the first matching rule.
    """
    text = macro_def

    # A compiler macro is only categorised as such when it also mentions
    # "format" or "constantp"; otherwise the later rules decide.
    if MacroTechnique.COMPILER_MACRO in techniques and (
        "format" in text or "constantp" in text
    ):
        return MacroCategory.COMPILER_MACRO

    if MacroTechnique.ANAPHOR in techniques:
        dispatching = "dlambda" in text or "pandoric" in text
        return MacroCategory.DISPATCH if dispatching else MacroCategory.ANAPHORIC

    if MacroTechnique.DLAMBDA in techniques:
        return MacroCategory.DISPATCH

    gensym_wrappers = ("defmacro/g!", "defmacro!", "with-gensyms", "with-unique-names")
    if "gensym" in text.lower() and any(wrapper in text for wrapper in gensym_wrappers):
        return MacroCategory.CAPTURE_MANAGEMENT

    # ANAPHOR and DLAMBDA have both returned above, so this exclusion always
    # holds when reached; it is kept to mirror the rule as written.
    without_anaphor_or_dlambda = (
        MacroTechnique.ANAPHOR not in techniques
        and MacroTechnique.DLAMBDA not in techniques
    )
    if (
        MacroTechnique.GENSYM in techniques
        and without_anaphor_or_dlambda
        and ("defmacro/g!" in text or "defmacro!" in text)
    ):
        return MacroCategory.CAPTURE_MANAGEMENT

    if re.search(r"\bpush\b.*\*paths\*|\bfail\b|\bchoose\b|backtrack", text):
        return MacroCategory.CONTROL_FLOW

    uses_jumps = MacroTechnique.TAGBODY in techniques or (
        MacroTechnique.MACROLET in techniques and "go " in text
    )
    if uses_jumps:
        return MacroCategory.CONTROL_FLOW

    if MacroTechnique.SYMBOL_MACROLET in techniques:
        return MacroCategory.DISPATCH

    if MacroTechnique.READER in techniques:
        return MacroCategory.READ_MACRO

    if re.search(r"batcher|sorting|network|comparator|cons-pool|cons-pool|tlist", text):
        return MacroCategory.EFFICIENCY

    if "eval" in text and "pandoric" in text:
        return MacroCategory.SCOPE

    if re.search(r"\bwith-\b", text) or re.search(r"def.*unit|def.*sql|define-", text):
        return MacroCategory.DSL

    # This rule yields the same category as the fallback below.
    if re.search(r"\bif\b.*\bdo\b|\bcond\b.*\blet\b", text):
        return MacroCategory.CONTROL_FLOW

    return MacroCategory.CONTROL_FLOW


def assess_complexity(
    macro_def: str, techniques: list[MacroTechnique]
) -> Complexity:
    """Rate a macro definition as basic, intermediate or advanced.

    Points are collected from four independent signals: the number of
    newlines, how many "advanced" techniques are present (two points each),
    how often gensym markers occur, and the estimated backquote depth.
    A total of 5 or more is advanced, 2 to 4 is intermediate, less is basic.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for this definition.

    Returns:
        The complexity tier for the accumulated point total.
    """
    newline_count = macro_def.count("\n")
    if newline_count >= 15:
        length_points = 3
    elif newline_count >= 8:
        length_points = 2
    elif newline_count >= 4:
        length_points = 1
    else:
        length_points = 0

    heavy_techniques = {
        MacroTechnique.CODE_WALKING,
        MacroTechnique.RECURSIVE_EXPANSION,
        MacroTechnique.COMPILER_MACRO,
        MacroTechnique.DLAMBDA,
        MacroTechnique.TAGBODY,
        MacroTechnique.NESTED_BACKQUOTE,
    }
    technique_points = 2 * len(heavy_techniques.intersection(techniques))

    gensym_mentions = macro_def.count("gensym") + macro_def.count("g!-")
    if gensym_mentions > 3:
        gensym_points = 2
    elif gensym_mentions > 1:
        gensym_points = 1
    else:
        gensym_points = 0

    nesting = _estimate_backquote_depth(macro_def)
    if nesting > 2:
        nesting_points = 2
    elif nesting > 1:
        nesting_points = 1
    else:
        nesting_points = 0

    total = length_points + technique_points + gensym_points + nesting_points
    if total < 2:
        return Complexity.BASIC
    if total < 5:
        return Complexity.INTERMEDIATE
    return Complexity.ADVANCED


def assess_capture_risk(
    macro_def: str, techniques: list[MacroTechnique]
) -> tuple[bool, bool]:
    """Estimate variable-capture risk and gensym usage for a macro.

    Args:
        macro_def: Source text of the macro definition.
        techniques: Techniques already detected for this definition.

    Returns:
        A ``(has_capture_risk, requires_gensyms)`` pair. Anaphoric macros
        are always flagged as risky. Otherwise a macro is risky when it
        takes a body (``&body`` / ``&rest body``) and shows no gensym marker.
    """
    uses_gensyms = "g!-" in macro_def or "gensym" in macro_def.lower()

    if MacroTechnique.ANAPHOR in techniques:
        return (True, uses_gensyms)

    takes_body = "&body" in macro_def or "&rest body" in macro_def
    risky = takes_body and not uses_gensyms
    # When ``risky`` is true ``uses_gensyms`` is necessarily false, so one
    # tuple covers both the risky and the non-risky outcome.
    return (risky, uses_gensyms)


def score_example(example: TransformationExample) -> float:
    """Combine the four sub-scores of an example into one weighted score.

    Weights: correctness 0.35, hygiene 0.25, transformation 0.25,
    clarity 0.15.

    Args:
        example: The example to score.

    Returns:
        The weighted sum of the four sub-scores.
    """
    weighted_scorers = (
        (_score_correctness, 0.35),
        (_score_hygiene, 0.25),
        (_score_transformation, 0.25),
        (_score_clarity, 0.15),
    )
    return sum(scorer(example) * weight for scorer, weight in weighted_scorers)


def classify_all(examples):
    """Fill in missing metadata on each example and score it, in place.

    For every example: techniques are detected when none are set, the
    category is detected when it is ``None``, complexity is re-assessed when
    it is ``BASIC`` and the definition has more than three lines, and capture
    risk is assessed when both risk flags are falsy. The quality score is
    always recomputed.

    Args:
        examples: Iterable of example objects; each is mutated.

    Returns:
        The same ``examples`` object that was passed in.
    """
    for item in examples:
        definition = item.macro_definition

        if not item.technique:
            item.technique = detect_techniques(definition)

        if item.macro_category is None:
            item.macro_category = detect_category(definition, item.technique)

        # Three or more newlines means more than three lines of text.
        still_default = item.complexity == Complexity.BASIC
        if still_default and definition.count("\n") >= 3:
            item.complexity = assess_complexity(definition, item.technique)

        if not (item.has_capture_risk or item.requires_gensyms):
            item.has_capture_risk, item.requires_gensyms = assess_capture_risk(
                definition, item.technique
            )

        item.quality_score = score_example(item)

    return examples


def quality_report(examples):
    """Summarise quality scores and metadata distributions of examples.

    Args:
        examples: Sequence of example objects exposing ``quality_score``,
            ``macro_category``, ``complexity`` and ``source``.

    Returns:
        ``{"error": ...}`` when there are no examples or none of them has a
        score. Otherwise a dict with the example count, mean/min/max score,
        the number of scores below 0.5, and value counts for category,
        complexity and source. Score statistics only cover examples whose
        ``quality_score`` is not ``None``; ``total_examples`` and the
        distributions cover all examples.
    """
    if not examples:
        return {"error": "No examples"}
    scores = [ex.quality_score for ex in examples if ex.quality_score is not None]
    if not scores:
        return {"error": "No scored examples"}

    # ``macro_category`` may legitimately be None (not yet classified);
    # count those under an explicit label instead of failing on ``.value``.
    categories = Counter(
        "unclassified" if ex.macro_category is None else ex.macro_category.value
        for ex in examples
    )
    return {
        "total_examples": len(examples),
        "mean_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "max_score": max(scores),
        "below_threshold": sum(1 for s in scores if s < 0.5),
        "category_distribution": dict(categories),
        "complexity_distribution": dict(Counter(ex.complexity.value for ex in examples)),
        "source_distribution": dict(Counter(ex.source.value for ex in examples)),
    }


def filter_quality(examples, min_score=0.5):
    """Return the examples whose quality score is at least ``min_score``.

    Examples whose ``quality_score`` is ``None`` are dropped. Input order is
    preserved and the input is not modified.
    """
    kept = []
    for candidate in examples:
        score = candidate.quality_score
        if score is not None and score >= min_score:
            kept.append(candidate)
    return kept


# --- Private helpers ---

def _has_nested_backquote(code: str) -> bool:
    """Check if a macro definition contains nested backquotes."""
    depth = 0
    max_depth = 0
    i = 0
    while i < len(code):
        ch = code[i]
        if ch == '"':
            i += 1
            while i < len(code) and code[i] != '"':
                if code[i] == "\\":
                    i += 1
                i += 1
            i += 1
            continue
        if ch == ";" and (i == 0 or code[i - 1] != "#"):
            while i < len(code) and code[i] != "\n":
                i += 1
            continue
        if ch == "`":
            depth += 1
            max_depth = max(max_depth, depth)
        elif ch == "," and i + 1 < len(code) and code[i + 1] == "@":
            depth -= 1
        elif ch == ",":
            depth -= 1
        i += 1
    return max_depth > 1


def _has_anaphor_injection(code: str) -> bool:
    anaphors = ["'it", "'self", "'this", " it", " self", " this"]
    return any(a in code for a in anaphors)


def _detect_recursive_expansion(code: str) -> bool:
    """Detect if a macro uses recursive expansion (calls itself during expansion)."""
    names = re.findall(r"\(defmacro\w*\s+([^\s()]+)", code)
    for name in names:
        if f"({name}" in code and f"(defmacro" not in (
            code[code.find(f"({name}") - 10 : code.find(f"({name}")]
        ):
            return True
    return False


def _estimate_backquote_depth(code: str) -> int:
    depth = 0
    max_depth = 0
    for ch in code:
        if ch == "`":
            depth += 1
            if depth > max_depth:
                max_depth = depth
        elif ch in ",)":
            if depth > 0:
                depth -= 1
    return max_depth


def _score_correctness(ex):
    score = 1.0
    macro = ex.macro_definition
    if not re.search(r"\((?:defmacro|define-compiler-macro|defmacro!|defmacro/g!)", macro):
        score -= 0.4
    if macro.count("\n") < 2:
        score -= 0.2
    if ex.macro_category is not None and ex.macro_category.value == "read-macro":
        if not re.search(r"set-(?:dispatch-)?macro-character", macro):
            score -= 0.3
    if ex.after_expansion.strip() == ex.before_code.strip():
        score -= 0.3
    if len(ex.after_expansion.strip()) < 10:
        score -= 0.2
    return max(0.0, score)


def _score_hygiene(ex):
    """Score variable-capture hygiene of an example, from 0.0 to 1.0.

    Starts at 1.0. Subtracts 0.4 when the example is flagged as both risky
    and gensym-requiring yet its definition shows no gensym marker, and 0.25
    when an anaphoric macro is flagged as requiring gensyms.
    """
    result = 1.0
    wants_gensyms = ex.requires_gensyms

    if ex.has_capture_risk and wants_gensyms:
        definition = ex.macro_definition
        mentions_gensym = "gensym" in definition.lower() or "g!" in definition
        if not mentions_gensym:
            result -= 0.4

    if MacroTechnique.ANAPHOR in ex.technique and wants_gensyms:
        result -= 0.25

    return max(0.0, result)


def _score_transformation(ex):
    """Score how substantial the demonstrated transformation is, capped at 1.0.

    Starts at 0.5 and adds: 0.15 when the input code exceeds 50 characters,
    0.1 when the definition has at least three newlines, a per-technique
    bonus for selected techniques, and a bonus for the complexity tier.
    """
    technique_bonus = {
        MacroTechnique.RECURSIVE_EXPANSION: 0.1,
        MacroTechnique.CODE_WALKING: 0.1,
        MacroTechnique.COMPILER_MACRO: 0.08,
        MacroTechnique.SYMBOL_MACROLET: 0.08,
        MacroTechnique.MACROLET: 0.05,
        MacroTechnique.NESTED_BACKQUOTE: 0.05,
    }
    complexity_bonus = {
        Complexity.BASIC: 0.0,
        Complexity.INTERMEDIATE: 0.05,
        Complexity.ADVANCED: 0.1,
    }

    total = 0.5
    if len(ex.before_code) > 50:
        total += 0.15
    if ex.macro_definition.count("\n") >= 3:
        total += 0.1
    # Techniques without an entry contribute nothing.
    for used in ex.technique:
        total += technique_bonus.get(used, 0.0)
    total += complexity_bonus[ex.complexity]
    return min(1.0, total)


def _score_clarity(ex):
    score = 0.5
    if len(ex.problem_pattern) > 20:
        score += 0.15
    if ex.commentary and len(ex.commentary) > 30:
        score += 0.2
    if len(ex.before_code) > 30:
        score += 0.1
    if ex.source_chapter:
        score += 0.05
    return min(1.0, score)