Spaces:
Running
Running
File size: 5,569 Bytes
5520f8e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | #!/usr/bin/env python3
"""Canonical Lean corpus counter for szl-holdings/lutar-lean.
Trust Tier 1 reproducibility script (TRUST.md TODO). Clones (or reads) the
lutar-lean repo at its current `main` HEAD and emits a single canonical JSON of
declaration / axiom / sorry counts so that public surfaces stop drifting.
The methods here are *fixed and documented* so any agent or reviewer gets the
same numbers from the same SHA. This file is the source of truth for the
"Live numbers" line, replacing hand-maintained figures.
Usage:
python lean_numbers.py --repo-path /path/to/lutar-lean [--out lean_numbers.json]
python lean_numbers.py --clone --out .github/data/lean_numbers.json
Counting methods (documented, stable):
- declarations: lines in Lutar/ and Main.lean whose initial token is one of
{theorem, lemma, def, abbrev, instance, structure, inductive, class},
optionally preceded by the `noncomputable` and/or `private` modifier.
- axioms_raw: lines whose initial token is `axiom` (after optional modifiers).
- axioms_unique: distinct axiom names among those.
- sorries_raw: total `\\bsorry\\b` token occurrences across all .lean files.
- sorries_noncomment: occurrences excluding lines that are pure line-comments
(leading `--`) — i.e. `sorry` that is live proof text.
- sorries_putnam / sorries_baseline: split by whether the file is under Putnam/.
"""
from __future__ import annotations
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from datetime import datetime, timezone
LUTAR_LEAN_URL = "https://github.com/szl-holdings/lutar-lean.git"
DECL_RE = re.compile(
r"^(?:private\s+)?(?:noncomputable\s+)?(?:private\s+)?"
r"(theorem|lemma|def|abbrev|instance|structure|inductive|class)\b"
)
AXIOM_RE = re.compile(r"^(?:private\s+)?axiom\s+([A-Za-z_][A-Za-z0-9_']*)")
SORRY_RE = re.compile(r"\bsorry\b")
COMMENT_LINE_RE = re.compile(r"^\s*--")
def iter_lean_files(root: str):
base = os.path.join(root, "Lutar")
for dirpath, _dirs, files in os.walk(base):
for fn in files:
if fn.endswith(".lean"):
yield os.path.join(dirpath, fn)
main = os.path.join(root, "Main.lean")
if os.path.exists(main):
yield main
def count(root: str) -> dict:
declarations = 0
axioms_raw = 0
axiom_names: set[str] = set()
sorries_raw = 0
sorries_noncomment = 0
sorries_putnam = 0
sorries_baseline = 0
for path in iter_lean_files(root):
is_putnam = f"{os.sep}Putnam{os.sep}" in path
with open(path, "r", encoding="utf-8", errors="replace") as fh:
for line in fh:
if DECL_RE.match(line):
declarations += 1
m = AXIOM_RE.match(line)
if m:
axioms_raw += 1
axiom_names.add(m.group(1))
hits = len(SORRY_RE.findall(line))
if hits:
sorries_raw += hits
if not COMMENT_LINE_RE.match(line):
sorries_noncomment += hits
if is_putnam:
sorries_putnam += hits
else:
sorries_baseline += hits
return {
"declarations": declarations,
"axioms_raw": axioms_raw,
"axioms_unique": len(axiom_names),
"axiom_names": sorted(axiom_names),
"sorries_raw": sorries_raw,
"sorries_noncomment": sorries_noncomment,
"sorries_putnam": sorries_putnam,
"sorries_baseline": sorries_baseline,
}
def git_head_sha(root: str) -> str:
try:
out = subprocess.check_output(
["git", "-C", root, "rev-parse", "HEAD"], text=True
).strip()
return out
except Exception:
return "unknown"
def main() -> int:
ap = argparse.ArgumentParser(description="Canonical lutar-lean corpus counter.")
ap.add_argument("--repo-path", help="Path to an existing lutar-lean checkout.")
ap.add_argument("--clone", action="store_true", help="Shallow-clone main first.")
ap.add_argument("--ref", default="main", help="Branch/SHA to count (with --clone).")
ap.add_argument("--out", help="Write JSON here (default: stdout).")
args = ap.parse_args()
tmp = None
repo_path = args.repo_path
if args.clone or not repo_path:
tmp = tempfile.mkdtemp(prefix="lutar-lean-")
subprocess.check_call(
["git", "clone", "--depth", "1", "--branch", args.ref, LUTAR_LEAN_URL, tmp]
)
repo_path = tmp
if not os.path.isdir(os.path.join(repo_path, "Lutar")):
print(f"error: {repo_path} has no Lutar/ directory", file=sys.stderr)
return 2
sha = git_head_sha(repo_path)
numbers = count(repo_path)
payload = {
"schema": "szl.lean_numbers/v1",
"repo": "szl-holdings/lutar-lean",
"ref": args.ref,
"sha": sha,
"measured_at_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
"method": "see .github/scripts/lean_numbers.py docstring (fixed grep-equivalent regexes)",
"numbers": numbers,
}
out_json = json.dumps(payload, indent=2) + "\n"
if args.out:
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
with open(args.out, "w", encoding="utf-8") as fh:
fh.write(out_json)
print(f"wrote {args.out} @ {sha}")
else:
sys.stdout.write(out_json)
return 0
if __name__ == "__main__":
raise SystemExit(main())
|