File size: 4,301 Bytes
6cf7e7c
3ac681e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975f30b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ac681e
 
 
 
 
 
 
 
 
 
975f30b
3ac681e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975f30b
 
 
 
3ac681e
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import glob
import os
import re
from pathlib import Path
from typing import List, Optional

from langchain_core.documents import Document

# Regex to capture optional docstring + declaration line
_DECL_PATTERN = re.compile(
    r'(?:/--\s*(.*?)\s*-/)?\s*'               # optional /-- docstring -/
    r'(?:@\[.*?\]\s*)*'                        # optional attributes
    r'(theorem|lemma|def|noncomputable def)\s+'
    r'(\S+)\s*'                                # declaration name
    r'(.*?)\s*:=',                             # everything up to :=
    re.DOTALL,
)


def _find_mathlib_root() -> Optional[str]:
    """
    Returns the path to the Mathlib4 source directory, searching common locations.
    """
    # 1. .lake/packages next to the current project file
    here = Path(__file__).resolve().parent.parent
    lake_pkg = here / ".lake" / "packages" / "mathlib"
    if (lake_pkg / "Mathlib").exists():
        return str(lake_pkg)

    # 2. lean-interact's TempRequireProject cache
    try:
        from lean_interact.config import DEFAULT_CACHE_DIR
        tmp_root = DEFAULT_CACHE_DIR / "tmp_projects"
        for lean_ver_dir in sorted(tmp_root.iterdir(), reverse=True):
            for project_dir in lean_ver_dir.iterdir():
                candidate = project_dir / ".lake" / "packages" / "mathlib"
                if (candidate / "Mathlib").exists():
                    return str(candidate)
    except Exception:
        pass

    # 3. Broad filesystem search in common Lean locations
    candidates = [
        os.path.expanduser("~/.elan/toolchains"),
        "/usr/local/lib/lean",
        "/opt/homebrew/lib/lean",
    ]
    for root in candidates:
        if not os.path.isdir(root):
            continue
        for dirpath, dirnames, _ in os.walk(root):
            depth = dirpath.replace(root, "").count(os.sep)
            if depth > 6:
                dirnames.clear()
                continue
            if "Mathlib" in dirnames:
                return dirpath
    return None


def _parse_lean_file(path: str) -> List[Document]:
    """
    Extracts theorem/lemma/def declarations from a single .lean file.
    Returns a list of Documents with page_content = "<name> : <signature>".
    """
    try:
        text = Path(path).read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return []

    docs = []
    for match in _DECL_PATTERN.finditer(text):
        docstring = (match.group(1) or "").strip()
        kind = match.group(2)
        name = match.group(3)
        signature = re.sub(r'\s+', ' ', match.group(4)).strip()

        content = f"{name} : {signature}"
        if docstring:
            content = f"{docstring}\n{content}"

        line = text[: match.start()].count("\n") + 1
        docs.append(Document(
            page_content=content,
            metadata={"kind": kind, "name": name, "file": path, "line": line},
        ))
    return docs


class MathLibCorpus:
    """
    Extracts LangChain Documents from Mathlib4 source files on disk.
    """

    def __init__(self, mathlib_root: Optional[str] = None):
        self.mathlib_root = mathlib_root or _find_mathlib_root()

    def extract(self, max_files: Optional[int] = None) -> List[Document]:
        """
        Walks Mathlib source files and extracts declaration Documents.

        Args:
            max_files: If set, stop after processing this many .lean files
                       (useful for quick tests).

        Returns:
            List of LangChain Documents, one per declaration found.
        """
        if not self.mathlib_root:
            raise RuntimeError(
                "Could not locate Mathlib4 source. "
                "Pass mathlib_root explicitly or run `lake exe cache get` first."
            )

        # Prefer Mathlib/ subdirectory if it exists (avoids index-only files at root)
        mathlib_src = os.path.join(self.mathlib_root, "Mathlib")
        search_root = mathlib_src if os.path.isdir(mathlib_src) else self.mathlib_root
        pattern = os.path.join(search_root, "**", "*.lean")
        files = glob.glob(pattern, recursive=True)
        if max_files:
            files = files[:max_files]

        docs: List[Document] = []
        for path in files:
            docs.extend(_parse_lean_file(path))

        return docs