"""Scrape the Constitution of Kazakhstan (RU + KZ) from adilet.zan.kz and
emit aligned (article, paragraph) text pairs as JSONL.

Alignment is positional: paragraph *i* of article *N* in the Russian text is
paired with paragraph *i* of article *N* in the Kazakh text; only pairs
present in both languages are written.
"""

import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

URL_RU = "https://adilet.zan.kz/rus/docs/K950001000_"
URL_KZ = "https://adilet.zan.kz/kaz/docs/K950001000_"
OUT_JSONL = "data/clauses_constitution_ru_kz.jsonl"


def http_get(url, retries=3, timeout=25):
    """Fetch *url* with browser-like headers, retrying with linear backoff.

    Returns the response body as text. After *retries* failed attempts the
    last exception is re-raised.
    """
    last = None
    for attempt in range(retries):
        try:
            r = requests.get(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "ru,en;q=0.9,kk;q=0.8",
                    "Connection": "keep-alive",
                },
                timeout=timeout,
                # NOTE(review): TLS verification is disabled — presumably a
                # workaround for this host's certificate chain. Confirm; this
                # is unsafe on untrusted networks.
                verify=False,
            )
            r.raise_for_status()
            return r.text
        except Exception as e:
            last = e
            # Don't waste a sleep after the final failed attempt.
            if attempt < retries - 1:
                time.sleep(1.5 * (attempt + 1))
    raise last


def clean_text(el: Tag) -> str:
    """Collapse the visible text of *el* into one whitespace-normalized line.

    Text inside <script>/<style> subtrees is dropped.  Fixes two defects of
    the previous version: (a) script/style text leaked in via their string
    descendants, and (b) anchor text was appended a second time on top of
    the strings already collected from `descendants`.
    """
    parts = []
    for node in el.descendants:
        if isinstance(node, NavigableString):
            # Skip strings that live inside script/style subtrees.
            if node.find_parent(["script", "style"]) is not None:
                continue
            parts.append(str(node))
    # Replace non-breaking spaces, then collapse all runs of whitespace.
    return re.sub(r"\s+", " ", "".join(parts).replace("\xa0", " ")).strip()


def find_main_container(soup: BeautifulSoup):
    """Return the most specific element likely to hold the document body.

    Tries a list of selectors from most to least specific; falls back to
    <body> (or the whole soup) when none matches.
    """
    selectors = [
        ".container_gamma.text.text_upd article",
        "article",
        "main",
        "#content",
        ".content",
        ".document",
        ".doc",
    ]
    for sel in selectors:
        c = soup.select_one(sel)
        if c:
            return c
    return soup.body or soup


def norm_art_num(s: str) -> str:
    """Normalize an article number: unify dashes, strip all whitespace."""
    s = s.strip()
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    return re.sub(r"\s+", "", s)


def make_article_regex(lang: str):
    """Compile the article-heading pattern for *lang* ("ru" or anything else = kz).

    RU headings look like "Статья 5" / "Статья 5-1"; KZ headings look like
    "5-бап" or "Бап 5".  Numbers may be Arabic or Roman numerals.
    """
    if lang == "ru":
        return re.compile(
            r"(?:^|\b)Статья\s+([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\b", re.I
        )
    return re.compile(
        r"(?:^|\b)([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\s*-\s*бап\b"
        r"|(?:^|\b)Бап\s+([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\b",
        re.I,
    )


def extract_by_articles(container: Tag, lang: str):
    """Walk *container* and group paragraph texts under article headings.

    Any h1–h6/p/div/li whose text matches the article regex opens a new
    article; subsequent <p> elements (that are not themselves headings) are
    collected as its paragraphs.  Returns a list of dicts:
    {"article_number", "article_title", "paragraphs"}.

    NOTE(review): a large <div> wrapping the whole document would also match
    the heading regex and become a bogus "title" — verify against the live
    page structure.
    """
    art_re = make_article_regex(lang)
    articles = []
    current = None

    def push():
        # Flush the article being built, keeping only non-empty ones.
        nonlocal current
        if current and current["paragraphs"]:
            articles.append(current)
        current = None

    scan_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li")
    for node in container.descendants:
        if not isinstance(node, Tag):
            continue
        if node.name in scan_tags:
            title = clean_text(node)
            if title:
                m = art_re.search(title)
                if m:
                    # The KZ pattern has two alternative capture groups; the
                    # RU pattern has one.  Take the first non-empty group
                    # instead of indexing group(2), which does not exist for
                    # the RU regex.
                    num = next((g for g in m.groups() if g), "")
                    push()
                    current = {
                        "article_number": norm_art_num(num),
                        "article_title": title,
                        "paragraphs": [],
                    }
                    continue
        if node.name == "p" and current is not None:
            txt = clean_text(node)
            # A <p> that itself looks like a heading is not a paragraph.
            if txt and not art_re.search(txt):
                current["paragraphs"].append(txt)
    push()
    return articles


def build_index(articles):
    """Index paragraphs by (article_number, 1-based paragraph position).

    Returns (index, total_paragraph_count); each index value carries the
    paragraph text and its article title.
    """
    idx = {}
    total = 0
    for a in articles:
        art = a["article_number"]
        for i, txt in enumerate(a["paragraphs"], start=1):
            idx[(art, i)] = {"text": txt, "article_title": a.get("article_title")}
            total += 1
    return idx, total


def main():
    """Fetch both language versions, align paragraphs, write JSONL output."""
    soup_ru = BeautifulSoup(http_get(URL_RU), "lxml")
    soup_kz = BeautifulSoup(http_get(URL_KZ), "lxml")

    arts_ru = extract_by_articles(find_main_container(soup_ru), "ru")
    arts_kz = extract_by_articles(find_main_container(soup_kz), "kz")

    idx_ru, total_ru = build_index(arts_ru)
    idx_kz, total_kz = build_index(arts_kz)

    # Only (article, paragraph) positions present in both languages align.
    common = sorted(set(idx_ru.keys()) & set(idx_kz.keys()))

    out_dir = os.path.dirname(OUT_JSONL)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for art, par in common:
            node_id = f"KZ.CONST.1995:ART{art}:PAR{par}"
            obj = {
                "id": f"{node_id}:cl1",
                "text": idx_ru[(art, par)]["text"],
                "text_kz": idx_kz[(art, par)]["text"],
                "meta": {
                    "doc_id": "KZ.CONST.1995",
                    "article_number": art,
                    "paragraph_number": par,
                    "article_title_ru": idx_ru[(art, par)].get("article_title"),
                    "article_title_kz": idx_kz[(art, par)].get("article_title"),
                    "source_ru": URL_RU,
                    "source_kz": URL_KZ,
                },
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
            written += 1

    print("[diag] ru_articles:", len(arts_ru), "ru_paragraphs:", total_ru)
    print("[diag] kz_articles:", len(arts_kz), "kz_paragraphs:", total_kz)
    print("[diag] common_pairs:", len(common))
    print("[ok] written:", OUT_JSONL, "rows:", written)


if __name__ == "__main__":
    main()