Lean4-helper / src /mathlib_corpus.py
p4r5kpftnp-cmd
Add CI/CD: GitHub Actions workflow + ruff config
6cf7e7c
Raw
History Blame Contribute Delete
4.3 kB
import glob
import os
import re
from pathlib import Path
from typing import List, Optional
from langchain_core.documents import Document
# Regex to capture optional docstring + declaration line
_DECL_PATTERN = re.compile(
r'(?:/--\s*(.*?)\s*-/)?\s*' # optional /-- docstring -/
r'(?:@\[.*?\]\s*)*' # optional attributes
r'(theorem|lemma|def|noncomputable def)\s+'
r'(\S+)\s*' # declaration name
r'(.*?)\s*:=', # everything up to :=
re.DOTALL,
)
def _find_mathlib_root() -> Optional[str]:
"""
Returns the path to the Mathlib4 source directory, searching common locations.
"""
# 1. .lake/packages next to the current project file
here = Path(__file__).resolve().parent.parent
lake_pkg = here / ".lake" / "packages" / "mathlib"
if (lake_pkg / "Mathlib").exists():
return str(lake_pkg)
# 2. lean-interact's TempRequireProject cache
try:
from lean_interact.config import DEFAULT_CACHE_DIR
tmp_root = DEFAULT_CACHE_DIR / "tmp_projects"
for lean_ver_dir in sorted(tmp_root.iterdir(), reverse=True):
for project_dir in lean_ver_dir.iterdir():
candidate = project_dir / ".lake" / "packages" / "mathlib"
if (candidate / "Mathlib").exists():
return str(candidate)
except Exception:
pass
# 3. Broad filesystem search in common Lean locations
candidates = [
os.path.expanduser("~/.elan/toolchains"),
"/usr/local/lib/lean",
"/opt/homebrew/lib/lean",
]
for root in candidates:
if not os.path.isdir(root):
continue
for dirpath, dirnames, _ in os.walk(root):
depth = dirpath.replace(root, "").count(os.sep)
if depth > 6:
dirnames.clear()
continue
if "Mathlib" in dirnames:
return dirpath
return None
def _parse_lean_file(path: str) -> List[Document]:
"""
Extracts theorem/lemma/def declarations from a single .lean file.
Returns a list of Documents with page_content = "<name> : <signature>".
"""
try:
text = Path(path).read_text(encoding="utf-8", errors="ignore")
except OSError:
return []
docs = []
for match in _DECL_PATTERN.finditer(text):
docstring = (match.group(1) or "").strip()
kind = match.group(2)
name = match.group(3)
signature = re.sub(r'\s+', ' ', match.group(4)).strip()
content = f"{name} : {signature}"
if docstring:
content = f"{docstring}\n{content}"
line = text[: match.start()].count("\n") + 1
docs.append(Document(
page_content=content,
metadata={"kind": kind, "name": name, "file": path, "line": line},
))
return docs
class MathLibCorpus:
"""
Extracts LangChain Documents from Mathlib4 source files on disk.
"""
def __init__(self, mathlib_root: Optional[str] = None):
self.mathlib_root = mathlib_root or _find_mathlib_root()
def extract(self, max_files: Optional[int] = None) -> List[Document]:
"""
Walks Mathlib source files and extracts declaration Documents.
Args:
max_files: If set, stop after processing this many .lean files
(useful for quick tests).
Returns:
List of LangChain Documents, one per declaration found.
"""
if not self.mathlib_root:
raise RuntimeError(
"Could not locate Mathlib4 source. "
"Pass mathlib_root explicitly or run `lake exe cache get` first."
)
# Prefer Mathlib/ subdirectory if it exists (avoids index-only files at root)
mathlib_src = os.path.join(self.mathlib_root, "Mathlib")
search_root = mathlib_src if os.path.isdir(mathlib_src) else self.mathlib_root
pattern = os.path.join(search_root, "**", "*.lean")
files = glob.glob(pattern, recursive=True)
if max_files:
files = files[:max_files]
docs: List[Document] = []
for path in files:
docs.extend(_parse_lean_file(path))
return docs