import re
from bs4 import BeautifulSoup, Tag

# Lowercase keyword fragments that mark a chunk as math-related.
# is_math_relevant() lowercases the chunk and tests every entry with a plain
# substring check, which has three consequences:
#   - every entry here must itself be lowercase, or it can never match;
#   - truncated stems ("simplif", "estimat") match several word forms;
#   - short entries ("mean", "normal", "prime") also match inside longer,
#     unrelated words (e.g. "meaning"), so the filter favours recall.
_MATH_KEYWORDS = frozenset({
    # core math
    "equation", "formula", "theorem", "proof", "solve", "function",
    "derivative", "integral", "probability", "matrix", "angle", "triangle",
    "polynomial", "prime", "modulo", "combination", "permutation",
    # gap-fill topics
    "radical", "inequality", "absolute value", "simplif", "parabola",
    "linear function", "factoring", "remainder theorem", "nonlinear",
    # statistics
    "distribution", "variance", "deviation", "regression", "hypothesis",
    "correlation", "confidence", "sample", "statistic", "population",
    "mean", "median", "interval", "p-value", "chi-square", "t-test",
    "anova", "normal", "binomial", "poisson", "inference", "estimat",
    # Vietnamese
    "phương trình", "định lý", "xác suất",
})

# Tags that html_to_chunks() removes together with their whole subtree
# (via decompose()) before any text is extracted.
# <table> is deliberately not listed: distribution tables, ANOVA tables and
# frequency tables carry real content.
_STRIP_TAGS = ["script", "style", "nav", "header", "footer", "aside"]

# AoPS-specific navigation boilerplate — only applied when source == "aops".
# html_to_chunks() uses search(), so a paragraph that contains any of these
# phrases anywhere (case-insensitively) is dropped as a whole.
_AOPS_BOILERPLATE_RE = re.compile(
    r"(Retrieved from|This article|AoPS Wiki|Art of Problem Solving"
    r"|Category\s*:|Navigation menu|Contents\s*\[|Jump to\s*(navigation|search)"
    r"|See also|External links|References\s*\[)",
    re.IGNORECASE,
)

# Generic web navigation boilerplate — applied to all sources.
# html_to_chunks() uses search(), so a paragraph that contains any of these
# phrases anywhere (case-insensitively) is dropped as a whole.
# NOTE: the [Cc] character class is redundant under re.IGNORECASE.
_GENERIC_BOILERPLATE_RE = re.compile(
    r"(Skip to (main )?content|Cookie (policy|notice)|Privacy policy"
    r"|Terms of (use|service)|All rights reserved|Breadcrumb"
    r"|Table of [Cc]ontents|Back to top|Share this page)",
    re.IGNORECASE,
)

# Asymptote diagram code blocks embedded in AoPS pages.
# Non-greedy so each [asy]...[/asy] pair is matched on its own; DOTALL lets a
# block span several lines. html_to_chunks() applies this to the extracted
# text only when source == "aops".
_ASYMPTOTE_RE = re.compile(r"\[asy\].*?\[/asy\]", re.DOTALL | re.IGNORECASE)


def _replace_latex_imgs(soup: BeautifulSoup) -> None:
    """Swap LaTeX-rendered images for their LaTeX source, in place.

    Every ``<img>`` that has a ``src`` attribute containing the substring
    ``"latex"`` is treated as a rendered formula. Its ``alt`` text is taken
    as the LaTeX source and inserted in the image's place, padded with one
    space on each side. Such an image with a missing or blank ``alt`` is
    removed from the tree. All other images are left untouched.
    """
    for image in soup.find_all("img", src=True):
        if "latex" not in image.get("src", ""):
            continue

        latex_source = image.get("alt", "").strip()
        if not latex_source:
            # Nothing to recover; drop the image entirely.
            image.decompose()
            continue

        # Pad with spaces so the formula does not fuse with adjacent words.
        image.replace_with(" " + latex_source + " ")


def is_math_relevant(chunk: str) -> bool:
    """Return True when *chunk* contains at least one math keyword.

    The chunk is lowercased once and each entry of ``_MATH_KEYWORDS`` is
    looked up as a plain substring, so keyword stems match any word that
    contains them.
    """
    haystack = chunk.lower()
    for keyword in _MATH_KEYWORDS:
        if keyword in haystack:
            return True
    return False


def html_to_chunks(
    html: str,
    chunk_size: int = 3000,
    source: str = "generic",
    *,
    min_paragraph_len: int = 80,
) -> list[str]:
    """Strip HTML, recover LaTeX from img alt text, filter boilerplate, return quality chunks.

    Pipeline:
      1. Parse *html* and replace LaTeX-rendered ``<img>`` tags with their
         alt text.
      2. Remove every element named in ``_STRIP_TAGS`` with its subtree.
      3. Extract the text; for ``source == "aops"`` also delete
         ``[asy]...[/asy]`` blocks.
      4. Split on blank lines and discard paragraphs that are shorter than
         *min_paragraph_len* (after stripping) or that match a boilerplate
         pattern.
      5. Greedily pack the surviving paragraphs, joined by a blank line,
         into chunks of at most *chunk_size* characters.
      6. Keep only the chunks for which ``is_math_relevant`` is true.

    Args:
        html: Raw HTML markup.
        chunk_size: Maximum length, in characters, of a packed chunk. A
            single paragraph longer than this is never split; it is emitted
            as its own oversized chunk.
        source: Origin of the page. ``"aops"`` enables the AoPS-specific
            boilerplate filter and Asymptote stripping; any other value
            gets only the generic filter.
        min_paragraph_len: Paragraphs shorter than this many characters
            after stripping are dropped.

    Returns:
        The math-relevant chunks, in document order.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Must happen before text extraction: get_text() yields nothing for an
    # <img>, so the LaTeX held in its alt attribute would otherwise be lost.
    _replace_latex_imgs(soup)

    for tag_name in _STRIP_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()

    text = soup.get_text(separator="\n")
    if source == "aops":
        text = _ASYMPTOTE_RE.sub("", text)

    paragraphs: list[str] = []
    for raw in text.split("\n\n"):
        para = raw.strip()
        # Cheap length test first so the regexes only run on real candidates.
        if len(para) < min_paragraph_len:
            continue
        if _GENERIC_BOILERPLATE_RE.search(para):
            continue
        if source == "aops" and _AOPS_BOILERPLATE_RE.search(para):
            continue
        paragraphs.append(para)

    separator = "\n\n"
    chunks: list[str] = []
    parts: list[str] = []
    joined_len = 0  # exact value of len(separator.join(parts))

    for para in paragraphs:
        if not parts:
            # A chunk always takes its first paragraph, even an oversized one.
            parts.append(para)
            joined_len = len(para)
            continue

        # The separator is counted only between paragraphs, never after the
        # last one, so a chunk may fill chunk_size exactly.
        projected = joined_len + len(separator) + len(para)
        if projected > chunk_size:
            chunks.append(separator.join(parts))
            parts = [para]
            joined_len = len(para)
        else:
            parts.append(para)
            joined_len = projected

    if parts:
        chunks.append(separator.join(parts))

    return [chunk for chunk in chunks if is_math_relevant(chunk)]