import re


LEAN_OPERATORS = [
    ":=", "!=", "&&", "-.", "->", "←", "..", "...",
    "::", ":>", "<;>", ";;", "==", "||", "=>", "<=",
    ">=", "⁻¹", "?_",
]


MODIFIERS = ("private", "protected", "noncomputable", "nonrec",
             "unsafe", "partial", "scoped", "local")


def _parse_single_attribute(text: str, start: int) -> int | None:
    """Return the index just after the matching ']' for `@[...]` starting at `start`."""
    n = len(text)
    assert text[start] == "@" and start + 1 < n and text[start + 1] == "["
    i = start + 2
    depth = 1
    while i < n:
        c = text[i]
        if c == "[":
            depth += 1
        elif c == "]":
            depth -= 1
            if depth == 0:
                return i + 1
        i += 1
    return None


def extract_and_remove_attributes(text: str):
    """Strip a leading block of `@[...]` attributes, returning (block, rest)."""
    n = len(text)
    if n == 0:
        return "", text
    pos = 0
    while pos < n and text[pos].isspace():
        pos += 1
    if not (pos + 1 < n and text[pos] == "@" and text[pos + 1] == "["):
        return "", text

    attr_block_end = 0
    cur = pos
    first_attr_start = pos
    while cur < n and text[cur] == "@" and cur + 1 < n and text[cur + 1] == "[":
        attr_end = _parse_single_attribute(text, cur)
        if attr_end is None:
            return "", text
        attr_block_end = attr_end
        while attr_block_end < n and text[attr_block_end].isspace():
            attr_block_end += 1
        cur = attr_block_end
        if not (cur + 1 < n and text[cur] == "@" and text[cur + 1] == "["):
            break

    if attr_block_end <= first_attr_start:
        return "", text
    return text[:attr_block_end], text[attr_block_end:]


def remove_comments(text: str) -> str:
    """Strip `/- ... -/` block and `--` line comments; drop empty lines."""
    text = re.sub(r"/-.*?-/", "", text, flags=re.DOTALL)
    cleaned_lines = []
    for line in text.split("\n"):
        cleaned = line.split("--", 1)[0]
        if cleaned.strip() == "":
            continue
        cleaned_lines.append(cleaned)
    return "\n".join(cleaned_lines).strip()


def return_theorem_to_prove_mathlib_style(text: str):
    """Find the span of a Mathlib-style theorem/lemma signature ending at top-level `:=`."""
    mods_pattern = "|".join(MODIFIERS)
    start_pattern = (
        r"\s*"
        r"(?:(?:" + mods_pattern + r")\s+)*"
        r"\s*"
        r"(?:theorem|lemma)\b"
    )
    start_match = re.search(start_pattern, text, re.DOTALL)
    if not start_match:
        prefix = (
            r"\s*"
            r"(?:(?:" + mods_pattern + r")\s+)*"
            r"\s*"
            r"(?:theorem|lemma)"
            r".*?"
        )
        pattern_match = r"(" + prefix + r"\s*\|)"
        match = re.search(pattern_match, text, re.DOTALL)
        if match:
            return match.span()
        return None

    start_index = start_match.start()
    current_index = start_match.end()
    bracket_stack: list[str] = []
    brackets_map = {")": "(", "]": "[", "}": "{"}
    open_brackets = set(brackets_map.values())
    close_brackets = set(brackets_map.keys())
    text_len = len(text)

    while current_index < text_len:
        char = text[current_index]
        if not bracket_stack and current_index + 1 < text_len:
            if text[current_index:current_index + 2] == ":=":
                return (start_index, current_index + 2)
        if char in open_brackets:
            bracket_stack.append(char)
        elif char in close_brackets:
            if bracket_stack and bracket_stack[-1] == brackets_map[char]:
                bracket_stack.pop()
        current_index += 1

    return None


def _lex(snippet: str) -> str:
    """Tokenize a Lean snippet, then re-collapse multi-char operators that
    the per-char split would have separated."""
    spaced = [" ".join(op) for op in LEAN_OPERATORS]
    op_dict = dict(zip(spaced, LEAN_OPERATORS, strict=False))
    out_lines = []
    for line in snippet.splitlines():
        tokens = []
        token = ""
        for ch in line:
            if ch == " ":
                if token:
                    tokens.append(token)
                    token = ""
            elif ch.isalnum() or ch in "._'":
                token += ch
            else:
                if token:
                    tokens.append(token)
                    token = ""
                tokens.append(ch)
        if token:
            tokens.append(token)
        line_out = " ".join(tokens)
        for conn, original in op_dict.items():
            if conn in line_out:
                line_out = line_out.replace(conn, original)
        out_lines.append(line_out)
    return "\n".join(out_lines)


def extract_signature(statement_and_proof: str) -> str | None:
    """Return the theorem signature (everything up to but not including `:=`).

    Strips comments and attributes first. Returns None if the text doesn't
    look like a `theorem`/`lemma` declaration.
    """
    try:
        text = remove_comments(statement_and_proof)
        _, text = extract_and_remove_attributes(text)
        span = return_theorem_to_prove_mathlib_style(text)
        if span is None:
            return None
        sig = text[span[0]:span[1]]
        if sig.rstrip().endswith(":="):
            sig = sig.rstrip()[:-2]
        return sig.strip()
    except Exception:
        return None


def normalize_signature(sig: str) -> str:
    """Whitespace-insensitive normalization for signature comparison."""
    return re.sub(r"\s+", " ", sig or "").strip()


def proof_length(statement_and_proof: str) -> int:
    """Token count of the proof body (everything after the signature's `:=`).

    Returns 10**9 on any parse failure so it's clearly distinguishable from a
    legitimate short proof.
    """
    try:
        text = remove_comments(statement_and_proof)
        _, text = extract_and_remove_attributes(text)
        span = return_theorem_to_prove_mathlib_style(text)
        if span is None:
            return 10**9
        proof = text[span[1]:]
        tokenized = _lex(proof)
        return sum(len(line.split(" ")) for line in tokenized.splitlines())
    except Exception:
        return 10**9