| import os, re, time, json | |
| import requests | |
| from bs4 import BeautifulSoup, NavigableString, Tag | |
# Constitution of the Republic of Kazakhstan on adilet.zan.kz — Russian and Kazakh versions.
URL_RU = "https://adilet.zan.kz/rus/docs/K950001000_"
URL_KZ = "https://adilet.zan.kz/kaz/docs/K950001000_"
# Output: one JSON object per (article, paragraph) pair present in both languages.
OUT_JSONL = "data/clauses_constitution_ru_kz.jsonl"
def http_get(url, retries=3, timeout=25):
    """Fetch *url* and return the response body as text.

    Retries up to *retries* times with a linearly growing back-off
    (1.5s, 3.0s, ...) between attempts and re-raises the last error when
    every attempt has failed.

    Fixes over the previous version: no pointless sleep after the final
    failed attempt, and only network/HTTP errors are retried (a broad
    ``except Exception`` also swallowed programming errors).
    """
    last_err = None
    for attempt in range(retries):
        try:
            resp = requests.get(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "ru,en;q=0.9,kk;q=0.8",
                    "Connection": "keep-alive",
                },
                timeout=timeout,
                # NOTE(review): TLS verification disabled — presumably the site's
                # cert chain fails in some environments. Prefer shipping the CA
                # bundle via ``verify=<path>`` instead of disabling checks.
                verify=False,
            )
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as err:
            last_err = err
            if attempt < retries - 1:  # don't sleep after the last failure
                time.sleep(1.5 * (attempt + 1))
    raise last_err
def clean_text(el: Tag) -> str:
    """Return the visible text of *el* with whitespace collapsed to single spaces.

    Fixes two defects of the previous descendant walk:

    - ``continue`` on a <script>/<style> Tag did NOT skip that tag's string
      children — ``descendants`` still yielded them, so script/CSS text
      leaked into the output.  Strings nested under script/style are now
      excluded explicitly.
    - <a> text was emitted twice: once via ``get_text`` on the Tag node and
      again when its NavigableString children were visited.  Link text now
      flows through exactly once, as plain strings.
    """
    parts = []
    for node in el.descendants:
        if not isinstance(node, NavigableString):
            continue
        # Drop strings located anywhere inside a script or style element.
        if node.find_parent(["script", "style"]) is not None:
            continue
        parts.append(str(node))
    # NBSP -> space, then collapse all runs of whitespace.
    return re.sub(r"\s+", " ", "".join(parts).replace("\xa0", " ")).strip()
def find_main_container(soup: BeautifulSoup):
    """Locate the element most likely holding the document body.

    Selectors are tried from most to least specific; falls back to <body>
    (or the soup itself) when nothing matches.
    """
    candidates = (
        ".container_gamma.text.text_upd article",
        "article",
        "main",
        "#content",
        ".content",
        ".document",
        ".doc",
    )
    for selector in candidates:
        hit = soup.select_one(selector)
        if hit:  # truthy check: an empty Tag (len 0) is skipped, as before
            return hit
    return soup.body or soup
def norm_art_num(s: str) -> str:
    """Canonicalize an article number: en/em dashes become '-', whitespace is dropped."""
    unified = s.strip().replace("–", "-").replace("—", "-")
    return re.sub(r"\s+", "", unified)
def make_article_regex(lang: str):
    """Compile the article-heading regex for *lang* ('ru'; anything else → Kazakh).

    Russian headings look like "Статья <num>"; Kazakh ones like "<num>-бап"
    or "Бап <num>".  <num> is arabic or roman digits, optionally a dashed
    range such as "91-1".

    Generalization: the dash between the number and "бап" now accepts en/em
    dashes too ([-–—]), not just the ASCII hyphen — the rest of this file
    (norm_art_num) already folds those dashes together, and pages are not
    consistent about which one they use.  Strict superset of the old matches.
    """
    if lang == "ru":
        return re.compile(r"(?:^|\b)Статья\s+([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\b", re.I)
    return re.compile(
        r"(?:^|\b)([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\s*[-–—]\s*бап\b"
        r"|(?:^|\b)Бап\s+([0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?)\b",
        re.I,
    )
def extract_by_articles(container: Tag, lang: str):
    """Walk *container* in document order and group <p> text under article headings.

    A heading is any h1-h6/p/div/li whose cleaned text matches the
    language's article regex ("Статья N" / "N-бап").  Returns a list of
    dicts: ``{"article_number", "article_title", "paragraphs"}``; articles
    that collected no paragraphs are dropped.
    """
    art_re = make_article_regex(lang)
    articles = []
    current = None  # article being accumulated, or None before the first heading
    def push():
        # Flush the accumulated article; silently drop it if it has no paragraphs.
        nonlocal current
        if current and current["paragraphs"]:
            articles.append(current)
        current = None
    scan_tags = ("h1","h2","h3","h4","h5","h6","p","div","li")
    for node in container.descendants:
        if not isinstance(node, Tag):
            continue
        if node.name in scan_tags:
            title = clean_text(node)
            if title:
                m = art_re.search(title)
                if m:
                    # group(2) only exists for the second alternation of the
                    # Kazakh pattern; ``or`` short-circuits on the ru pattern.
                    num = m.group(1) or m.group(2)
                    num = norm_art_num(num)
                    push()  # close the previous article before opening a new one
                    current = {"article_number": num, "article_title": title, "paragraphs": []}
                    continue
        # Paragraph text between headings; re-check against the article regex
        # so a heading rendered as <p> is not also recorded as a paragraph.
        if node.name == "p" and current is not None:
            txt = clean_text(node)
            if txt and not art_re.search(txt):
                current["paragraphs"].append(txt)
    push()  # flush the trailing article
    return articles
def build_index(articles):
    """Flatten article dicts into ``{(article_number, paragraph_no): entry}``.

    Paragraph numbers are 1-based within each article.  Also returns the
    total number of paragraphs across all articles.
    """
    index = {}
    count = 0
    for article in articles:
        number = article["article_number"]
        title = article.get("article_title")
        for position, paragraph in enumerate(article["paragraphs"], start=1):
            index[(number, position)] = {"text": paragraph, "article_title": title}
            count += 1
    return index, count
def main():
    """Scrape both language versions, align paragraphs, and write one JSONL row per pair."""
    soup_ru = BeautifulSoup(http_get(URL_RU), "lxml")
    soup_kz = BeautifulSoup(http_get(URL_KZ), "lxml")
    arts_ru = extract_by_articles(find_main_container(soup_ru), "ru")
    arts_kz = extract_by_articles(find_main_container(soup_kz), "kz")
    idx_ru, total_ru = build_index(arts_ru)
    idx_kz, total_kz = build_index(arts_kz)
    # Keep only (article, paragraph) keys present in both languages.
    common = sorted(idx_ru.keys() & idx_kz.keys())
    out_dir = os.path.dirname(OUT_JSONL)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as fh:
        for art, par in common:
            ru_entry = idx_ru[(art, par)]
            kz_entry = idx_kz[(art, par)]
            node_id = f"KZ.CONST.1995:ART{art}:PAR{par}"
            row = {
                "id": f"{node_id}:cl1",
                "text": ru_entry["text"],
                "text_kz": kz_entry["text"],
                "meta": {
                    "doc_id": "KZ.CONST.1995",
                    "article_number": art,
                    "paragraph_number": par,
                    "article_title_ru": ru_entry.get("article_title"),
                    "article_title_kz": kz_entry.get("article_title"),
                    "source_ru": URL_RU,
                    "source_kz": URL_KZ
                }
            }
            fh.write(json.dumps(row, ensure_ascii=False) + "\n")
            written += 1
    print("[diag] ru_articles:", len(arts_ru), "ru_paragraphs:", total_ru)
    print("[diag] kz_articles:", len(arts_kz), "kz_paragraphs:", total_kz)
    print("[diag] common_pairs:", len(common))
    print("[ok] written:", OUT_JSONL, "rows:", written)
# Script entry point: only run the scrape when executed directly, not on import.
if __name__ == "__main__":
    main()