Spaces:
Running
Running
File size: 4,301 Bytes
6cf7e7c 3ac681e 975f30b 3ac681e 975f30b 3ac681e 975f30b 3ac681e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | import glob
import os
import re
from pathlib import Path
from typing import List, Optional
from langchain_core.documents import Document
# Regex to capture optional docstring + declaration line
_DECL_PATTERN = re.compile(
r'(?:/--\s*(.*?)\s*-/)?\s*' # optional /-- docstring -/
r'(?:@\[.*?\]\s*)*' # optional attributes
r'(theorem|lemma|def|noncomputable def)\s+'
r'(\S+)\s*' # declaration name
r'(.*?)\s*:=', # everything up to :=
re.DOTALL,
)
def _find_mathlib_root() -> Optional[str]:
"""
Returns the path to the Mathlib4 source directory, searching common locations.
"""
# 1. .lake/packages next to the current project file
here = Path(__file__).resolve().parent.parent
lake_pkg = here / ".lake" / "packages" / "mathlib"
if (lake_pkg / "Mathlib").exists():
return str(lake_pkg)
# 2. lean-interact's TempRequireProject cache
try:
from lean_interact.config import DEFAULT_CACHE_DIR
tmp_root = DEFAULT_CACHE_DIR / "tmp_projects"
for lean_ver_dir in sorted(tmp_root.iterdir(), reverse=True):
for project_dir in lean_ver_dir.iterdir():
candidate = project_dir / ".lake" / "packages" / "mathlib"
if (candidate / "Mathlib").exists():
return str(candidate)
except Exception:
pass
# 3. Broad filesystem search in common Lean locations
candidates = [
os.path.expanduser("~/.elan/toolchains"),
"/usr/local/lib/lean",
"/opt/homebrew/lib/lean",
]
for root in candidates:
if not os.path.isdir(root):
continue
for dirpath, dirnames, _ in os.walk(root):
depth = dirpath.replace(root, "").count(os.sep)
if depth > 6:
dirnames.clear()
continue
if "Mathlib" in dirnames:
return dirpath
return None
def _parse_lean_file(path: str) -> List[Document]:
"""
Extracts theorem/lemma/def declarations from a single .lean file.
Returns a list of Documents with page_content = "<name> : <signature>".
"""
try:
text = Path(path).read_text(encoding="utf-8", errors="ignore")
except OSError:
return []
docs = []
for match in _DECL_PATTERN.finditer(text):
docstring = (match.group(1) or "").strip()
kind = match.group(2)
name = match.group(3)
signature = re.sub(r'\s+', ' ', match.group(4)).strip()
content = f"{name} : {signature}"
if docstring:
content = f"{docstring}\n{content}"
line = text[: match.start()].count("\n") + 1
docs.append(Document(
page_content=content,
metadata={"kind": kind, "name": name, "file": path, "line": line},
))
return docs
class MathLibCorpus:
"""
Extracts LangChain Documents from Mathlib4 source files on disk.
"""
def __init__(self, mathlib_root: Optional[str] = None):
self.mathlib_root = mathlib_root or _find_mathlib_root()
def extract(self, max_files: Optional[int] = None) -> List[Document]:
"""
Walks Mathlib source files and extracts declaration Documents.
Args:
max_files: If set, stop after processing this many .lean files
(useful for quick tests).
Returns:
List of LangChain Documents, one per declaration found.
"""
if not self.mathlib_root:
raise RuntimeError(
"Could not locate Mathlib4 source. "
"Pass mathlib_root explicitly or run `lake exe cache get` first."
)
# Prefer Mathlib/ subdirectory if it exists (avoids index-only files at root)
mathlib_src = os.path.join(self.mathlib_root, "Mathlib")
search_root = mathlib_src if os.path.isdir(mathlib_src) else self.mathlib_root
pattern = os.path.join(search_root, "**", "*.lean")
files = glob.glob(pattern, recursive=True)
if max_files:
files = files[:max_files]
docs: List[Document] = []
for path in files:
docs.extend(_parse_lean_file(path))
return docs
|