File size: 5,569 Bytes
5520f8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""Canonical Lean corpus counter for szl-holdings/lutar-lean.

Trust Tier 1 reproducibility script (TRUST.md TODO). Clones (or reads) the
lutar-lean repo at its current `main` HEAD and emits a single canonical JSON of
declaration / axiom / sorry counts so that public surfaces stop drifting.

The methods here are *fixed and documented* so any agent or reviewer gets the
same numbers from the same SHA. This file is the source of truth for the
"Live numbers" line, replacing hand-maintained figures.

Usage:
  python lean_numbers.py --repo-path /path/to/lutar-lean [--out lean_numbers.json]
  python lean_numbers.py --clone --out .github/data/lean_numbers.json

Counting methods (documented, stable):
  - declarations: lines in Lutar/ and Main.lean whose initial token is one of
    {theorem, lemma, def, abbrev, instance, structure, inductive, class},
    optionally preceded by the `noncomputable` and/or `private` modifier.
  - axioms_raw: lines whose initial token is `axiom` (after optional modifiers).
  - axioms_unique: distinct axiom names among those.
  - sorries_raw: total `\\bsorry\\b` token occurrences across all .lean files.
  - sorries_noncomment: occurrences excluding lines that are pure line-comments
    (leading `--`) — i.e. `sorry` that is live proof text.
  - sorries_putnam / sorries_baseline: split by whether the file is under Putnam/.
"""
from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from datetime import datetime, timezone

LUTAR_LEAN_URL = "https://github.com/szl-holdings/lutar-lean.git"

DECL_RE = re.compile(
    r"^(?:private\s+)?(?:noncomputable\s+)?(?:private\s+)?"
    r"(theorem|lemma|def|abbrev|instance|structure|inductive|class)\b"
)
AXIOM_RE = re.compile(r"^(?:private\s+)?axiom\s+([A-Za-z_][A-Za-z0-9_']*)")
SORRY_RE = re.compile(r"\bsorry\b")
COMMENT_LINE_RE = re.compile(r"^\s*--")


def iter_lean_files(root: str):
    base = os.path.join(root, "Lutar")
    for dirpath, _dirs, files in os.walk(base):
        for fn in files:
            if fn.endswith(".lean"):
                yield os.path.join(dirpath, fn)
    main = os.path.join(root, "Main.lean")
    if os.path.exists(main):
        yield main


def count(root: str) -> dict:
    declarations = 0
    axioms_raw = 0
    axiom_names: set[str] = set()
    sorries_raw = 0
    sorries_noncomment = 0
    sorries_putnam = 0
    sorries_baseline = 0

    for path in iter_lean_files(root):
        is_putnam = f"{os.sep}Putnam{os.sep}" in path
        with open(path, "r", encoding="utf-8", errors="replace") as fh:
            for line in fh:
                if DECL_RE.match(line):
                    declarations += 1
                m = AXIOM_RE.match(line)
                if m:
                    axioms_raw += 1
                    axiom_names.add(m.group(1))
                hits = len(SORRY_RE.findall(line))
                if hits:
                    sorries_raw += hits
                    if not COMMENT_LINE_RE.match(line):
                        sorries_noncomment += hits
                    if is_putnam:
                        sorries_putnam += hits
                    else:
                        sorries_baseline += hits

    return {
        "declarations": declarations,
        "axioms_raw": axioms_raw,
        "axioms_unique": len(axiom_names),
        "axiom_names": sorted(axiom_names),
        "sorries_raw": sorries_raw,
        "sorries_noncomment": sorries_noncomment,
        "sorries_putnam": sorries_putnam,
        "sorries_baseline": sorries_baseline,
    }


def git_head_sha(root: str) -> str:
    try:
        out = subprocess.check_output(
            ["git", "-C", root, "rev-parse", "HEAD"], text=True
        ).strip()
        return out
    except Exception:
        return "unknown"


def main() -> int:
    ap = argparse.ArgumentParser(description="Canonical lutar-lean corpus counter.")
    ap.add_argument("--repo-path", help="Path to an existing lutar-lean checkout.")
    ap.add_argument("--clone", action="store_true", help="Shallow-clone main first.")
    ap.add_argument("--ref", default="main", help="Branch/SHA to count (with --clone).")
    ap.add_argument("--out", help="Write JSON here (default: stdout).")
    args = ap.parse_args()

    tmp = None
    repo_path = args.repo_path
    if args.clone or not repo_path:
        tmp = tempfile.mkdtemp(prefix="lutar-lean-")
        subprocess.check_call(
            ["git", "clone", "--depth", "1", "--branch", args.ref, LUTAR_LEAN_URL, tmp]
        )
        repo_path = tmp

    if not os.path.isdir(os.path.join(repo_path, "Lutar")):
        print(f"error: {repo_path} has no Lutar/ directory", file=sys.stderr)
        return 2

    sha = git_head_sha(repo_path)
    numbers = count(repo_path)
    payload = {
        "schema": "szl.lean_numbers/v1",
        "repo": "szl-holdings/lutar-lean",
        "ref": args.ref,
        "sha": sha,
        "measured_at_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "method": "see .github/scripts/lean_numbers.py docstring (fixed grep-equivalent regexes)",
        "numbers": numbers,
    }

    out_json = json.dumps(payload, indent=2) + "\n"
    if args.out:
        os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as fh:
            fh.write(out_json)
        print(f"wrote {args.out} @ {sha}")
    else:
        sys.stdout.write(out_json)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())