"""Scrape the Constitution of Kazakhstan (RU + KZ) from adilet.zan.kz and
emit aligned (article, paragraph) text pairs as JSONL.

Alignment is positional: paragraph *i* of article *N* in the Russian text is
paired with paragraph *i* of article *N* in the Kazakh text; only pairs
present in both languages are written.
"""

import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

URL_RU = "https://adilet.zan.kz/rus/docs/K950001000_"
URL_KZ = "https://adilet.zan.kz/kaz/docs/K950001000_"
OUT_JSONL = "data/clauses_constitution_ru_kz.jsonl"


def http_get(url, retries=3, timeout=25):
    """Fetch *url* with browser-like headers, retrying with linear backoff.

    Returns the response body as text. After *retries* failed attempts the
    last exception is re-raised.
    """
    last = None
    for attempt in range(retries):
        try:
            r = requests.get(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "ru,en;q=0.9,kk;q=0.8",
                    "Connection": "keep-alive",
                },
                timeout=timeout,
                # NOTE(review): TLS verification is disabled — presumably a
                # workaround for this host's certificate chain. Confirm; this
                # is unsafe on untrusted networks.
                verify=False,
            )
            r.raise_for_status()
            return r.text
        except Exception as e:
            last = e
            # Don't waste a sleep after the final failed attempt.
            if attempt < retries - 1:
                time.sleep(1.5 * (attempt + 1))
    raise last


def clean_text(el: Tag) -> str:
    """Collapse the visible text of *el* into one whitespace-normalized line.

    Text inside <script>/<style> subtrees is dropped.  Fixes two defects of
    the previous version: (a) script/style text leaked in via their string
    descendants, and (b) anchor text was appended a second time on top of
    the strings already collected from `descendants`.
    """
    parts = []
    for node in el.descendants:
        if isinstance(node, NavigableString):
            # Skip strings that live inside script/style subtrees.
            if node.find_parent(["script", "style"]) is not None:
                continue
            parts.append(str(node))
    # Replace non-breaking spaces, then collapse all runs of whitespace.
    return re.sub(r"\s+", " ", "".join(parts).replace("\xa0", " ")).strip()


def find_main_container(soup: BeautifulSoup):
    """Return the most specific element likely to hold the document body.

    Tries a list of selectors from most to least specific; falls back to
    <body> (or the whole soup) when none matches.
    """
    selectors = [
        ".container_gamma.text.text_upd article",
        "article",
        "main",
        "#content",
        ".content",
        ".document",
        ".doc",
    ]
    for sel in selectors:
        c = soup.select_one(sel)
        if c:
            return c
    return soup.body or soup


def norm_art_num(s: str) -> str:
    """Normalize an article number: unify dashes, strip all whitespace."""
    s = s.strip()
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    return re.sub(r"\s+", "", s)


def make_article_regex(lang: str):
    """Compile the article-heading pattern for *lang* ("ru" or anything else = kz).

    RU headings look like "Статья 5" / "Статья 5-1"; KZ headings look like
    "5-бап" or "Бап 5".  Numbers may be Arabic or Roman numerals.
    """
    if lang == "ru":
        return re.compile(
            r"(?:^|\b)Статья\s+([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\b", re.I
        )
    return re.compile(
        r"(?:^|\b)([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\s*-\s*бап\b"
        r"|(?:^|\b)Бап\s+([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\b",
        re.I,
    )


def extract_by_articles(container: Tag, lang: str):
    """Walk *container* and group paragraph texts under article headings.

    Any h1–h6/p/div/li whose text matches the article regex opens a new
    article; subsequent <p> elements (that are not themselves headings) are
    collected as its paragraphs.  Returns a list of dicts:
    {"article_number", "article_title", "paragraphs"}.

    NOTE(review): a large <div> wrapping the whole document would also match
    the heading regex and become a bogus "title" — verify against the live
    page structure.
    """
    art_re = make_article_regex(lang)
    articles = []
    current = None

    def push():
        # Flush the article being built, keeping only non-empty ones.
        nonlocal current
        if current and current["paragraphs"]:
            articles.append(current)
        current = None

    scan_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li")
    for node in container.descendants:
        if not isinstance(node, Tag):
            continue
        if node.name in scan_tags:
            title = clean_text(node)
            if title:
                m = art_re.search(title)
                if m:
                    # The KZ pattern has two alternative capture groups; the
                    # RU pattern has one.  Take the first non-empty group
                    # instead of indexing group(2), which does not exist for
                    # the RU regex.
                    num = next((g for g in m.groups() if g), "")
                    push()
                    current = {
                        "article_number": norm_art_num(num),
                        "article_title": title,
                        "paragraphs": [],
                    }
                    continue
        if node.name == "p" and current is not None:
            txt = clean_text(node)
            # A <p> that itself looks like a heading is not a paragraph.
            if txt and not art_re.search(txt):
                current["paragraphs"].append(txt)
    push()
    return articles


def build_index(articles):
    """Index paragraphs by (article_number, 1-based paragraph position).

    Returns (index, total_paragraph_count); each index value carries the
    paragraph text and its article title.
    """
    idx = {}
    total = 0
    for a in articles:
        art = a["article_number"]
        for i, txt in enumerate(a["paragraphs"], start=1):
            idx[(art, i)] = {"text": txt, "article_title": a.get("article_title")}
            total += 1
    return idx, total


def main():
    """Fetch both language versions, align paragraphs, write JSONL output."""
    soup_ru = BeautifulSoup(http_get(URL_RU), "lxml")
    soup_kz = BeautifulSoup(http_get(URL_KZ), "lxml")

    arts_ru = extract_by_articles(find_main_container(soup_ru), "ru")
    arts_kz = extract_by_articles(find_main_container(soup_kz), "kz")

    idx_ru, total_ru = build_index(arts_ru)
    idx_kz, total_kz = build_index(arts_kz)

    # Only (article, paragraph) positions present in both languages align.
    common = sorted(set(idx_ru.keys()) & set(idx_kz.keys()))

    out_dir = os.path.dirname(OUT_JSONL)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for art, par in common:
            node_id = f"KZ.CONST.1995:ART{art}:PAR{par}"
            obj = {
                "id": f"{node_id}:cl1",
                "text": idx_ru[(art, par)]["text"],
                "text_kz": idx_kz[(art, par)]["text"],
                "meta": {
                    "doc_id": "KZ.CONST.1995",
                    "article_number": art,
                    "paragraph_number": par,
                    "article_title_ru": idx_ru[(art, par)].get("article_title"),
                    "article_title_kz": idx_kz[(art, par)].get("article_title"),
                    "source_ru": URL_RU,
                    "source_kz": URL_KZ,
                },
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
            written += 1

    print("[diag] ru_articles:", len(arts_ru), "ru_paragraphs:", total_ru)
    print("[diag] kz_articles:", len(arts_kz), "kz_paragraphs:", total_kz)
    print("[diag] common_pairs:", len(common))
    print("[ok] written:", OUT_JSONL, "rows:", written)


if __name__ == "__main__":
    main()