# lexir/data_parser/adilet_zan_parser.py
# Uploaded by irinaqqq via huggingface_hub (commit 6a02b16, verified).
import os, re, time, json
import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Russian and Kazakh versions of the same document (id K950001000_ — the
# 1995 Constitution of Kazakhstan) on the official "Adilet" legal portal.
URL_RU = "https://adilet.zan.kz/rus/docs/K950001000_"
URL_KZ = "https://adilet.zan.kz/kaz/docs/K950001000_"
# Output: one JSON object per paragraph with parallel ru/kz text.
OUT_JSONL = "data/clauses_constitution_ru_kz.jsonl"
def http_get(url, retries=3, timeout=25):
    """Fetch *url* and return the response body as text.

    Makes up to *retries* attempts with a linearly growing back-off
    (1.5 s, 3.0 s, ...) between failures.

    Args:
        url: Absolute URL to fetch.
        retries: Number of attempts before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        The decoded response body as ``str``.

    Raises:
        RuntimeError: If ``retries < 1``, so no attempt was ever made.
            (The original code did ``raise last`` with ``last = None``
            here, which crashed with an unrelated TypeError.)
        Exception: The last error seen when every attempt failed.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "ru,en;q=0.9,kk;q=0.8",
                    "Connection": "keep-alive",
                },
                timeout=timeout,
                # NOTE(review): TLS verification is disabled — presumably the
                # portal's certificate chain fails locally, but confirm; this
                # is unsafe on untrusted networks.
                verify=False,
            )
            response.raise_for_status()
            return response.text
        except Exception as exc:  # any transport/HTTP error triggers a retry
            last_error = exc
            time.sleep(1.5 * (attempt + 1))
    if last_error is None:
        raise RuntimeError(f"http_get called with retries={retries}; no attempt made")
    raise last_error
def clean_text(el: Tag) -> str:
    """Collapse the visible text of *el* into one whitespace-normalized line.

    Walks every text node under *el*, skips text that lives inside
    ``<script>``/``<style>`` tags, replaces non-breaking spaces with plain
    spaces, and squeezes whitespace runs down to single spaces.

    Fixes two defects in the original: anchor text was emitted twice
    (``<a>`` tags were appended via ``get_text()`` even though their inner
    text nodes are also descendants), and ``continue`` on a script/style
    Tag skipped only the tag object itself, not its text content.
    """
    parts = []
    for node in el.descendants:
        if isinstance(node, NavigableString):
            # Drop text belonging to non-visible containers.
            if node.find_parent(["script", "style"]) is not None:
                continue
            parts.append(str(node))
    text = "".join(parts).replace("\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()
def find_main_container(soup: BeautifulSoup):
    """Return the node most likely to hold the document body.

    Tries CSS selectors from most to least specific and falls back to
    ``<body>`` (or the whole soup) when none of them match a non-empty tag.
    """
    selectors = (
        ".container_gamma.text.text_upd article",
        "article",
        "main",
        "#content",
        ".content",
        ".document",
        ".doc",
    )
    for selector in selectors:
        candidate = soup.select_one(selector)
        # Truthiness matters: an empty Tag is falsy and is skipped.
        if candidate:
            return candidate
    return soup.body or soup
def norm_art_num(s: str) -> str:
    """Canonicalize an article number: unify en/em dashes to "-", drop all whitespace."""
    unified = s.translate(str.maketrans("\u2013\u2014", "--"))
    return "".join(unified.split())
def make_article_regex(lang: str):
    """Compile the article-heading pattern for the given language.

    Russian headings look like "Статья 24"; Kazakh headings are either
    "24-бап" or "Бап 24".  Numbers may be arabic or roman and may carry a
    dash-joined suffix (e.g. "24-1").  Matching is case-insensitive.
    """
    num = r"[0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?"
    if lang == "ru":
        pattern = rf"(?:^|\b)Статья\s+({num})\b"
    else:
        pattern = rf"(?:^|\b)({num})\s*-\s*бап\b|(?:^|\b)Бап\s+({num})\b"
    return re.compile(pattern, re.I)
def extract_by_articles(container: Tag, lang: str):
    """Scan *container* and group paragraph text under article headings.

    Walks every tag under *container* in document order.  A tag whose text
    matches the language's article-heading pattern starts a new article;
    subsequent <p> tags are collected as that article's paragraphs until
    the next heading.  Articles that collected no paragraphs are dropped.

    Args:
        container: Root tag of the document body.
        lang: "ru" selects the Russian heading regex; anything else Kazakh.

    Returns:
        A list of dicts: {"article_number", "article_title", "paragraphs"}.
    """
    art_re = make_article_regex(lang)
    articles = []
    current = None  # article currently being filled, or None

    def push():
        # Commit the in-progress article, but only if it collected text.
        nonlocal current
        if current and current["paragraphs"]:
            articles.append(current)
        current = None

    # Tags whose own text may contain an article heading.
    scan_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li")
    for node in container.descendants:
        if not isinstance(node, Tag):
            continue
        if node.name in scan_tags:
            title = clean_text(node)
            if title:
                m = art_re.search(title)
                if m:
                    # ru pattern captures in group 1; the kz pattern has two
                    # alternatives, capturing in group 1 or group 2.
                    num = m.group(1) or m.group(2)
                    num = norm_art_num(num)
                    push()
                    current = {"article_number": num, "article_title": title, "paragraphs": []}
                    continue  # heading consumed; don't also treat it as a paragraph
        if node.name == "p" and current is not None:
            txt = clean_text(node)
            # Skip <p> tags that are themselves headings.
            if txt and not art_re.search(txt):
                current["paragraphs"].append(txt)
    push()  # flush the trailing article
    return articles
def build_index(articles):
    """Index paragraphs by (article_number, paragraph_number).

    Paragraph numbers are 1-based within each article.  Returns the index
    dict together with the total number of paragraphs seen (later entries
    overwrite earlier ones on key collision, but the total still counts
    every paragraph).
    """
    index = {}
    for article in articles:
        number = article["article_number"]
        title = article.get("article_title")
        for position, text in enumerate(article["paragraphs"], start=1):
            index[(number, position)] = {"text": text, "article_title": title}
    paragraph_count = sum(len(a["paragraphs"]) for a in articles)
    return index, paragraph_count
def main():
    """Download both language versions, align paragraphs, and write JSONL."""
    # Fetch and parse the Russian and Kazakh versions of the document.
    soup_ru = BeautifulSoup(http_get(URL_RU), "lxml")
    soup_kz = BeautifulSoup(http_get(URL_KZ), "lxml")
    cont_ru = find_main_container(soup_ru)
    cont_kz = find_main_container(soup_kz)
    # Split each version into articles and number their paragraphs.
    arts_ru = extract_by_articles(cont_ru, "ru")
    arts_kz = extract_by_articles(cont_kz, "kz")
    idx_ru, total_ru = build_index(arts_ru)
    idx_kz, total_kz = build_index(arts_kz)
    # Keep only (article, paragraph) keys present in BOTH languages so
    # every output row is a parallel ru/kz pair.
    common = sorted(set(idx_ru.keys()) & set(idx_kz.keys()))
    out_dir = os.path.dirname(OUT_JSONL)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for (art, par) in common:
            # Stable node id, e.g. "KZ.CONST.1995:ART1:PAR2".
            node_id = f"KZ.CONST.1995:ART{art}:PAR{par}"
            obj = {
                "id": f"{node_id}:cl1",
                "text": idx_ru[(art, par)]["text"],
                "text_kz": idx_kz[(art, par)]["text"],
                "meta": {
                    "doc_id": "KZ.CONST.1995",
                    "article_number": art,
                    "paragraph_number": par,
                    "article_title_ru": idx_ru[(art, par)].get("article_title"),
                    "article_title_kz": idx_kz[(art, par)].get("article_title"),
                    "source_ru": URL_RU,
                    "source_kz": URL_KZ
                }
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
            written += 1
    # Diagnostics: article/paragraph counts per language and aligned pairs.
    print("[diag] ru_articles:", len(arts_ru), "ru_paragraphs:", total_ru)
    print("[diag] kz_articles:", len(arts_kz), "kz_paragraphs:", total_kz)
    print("[diag] common_pairs:", len(common))
    print("[ok] written:", OUT_JSONL, "rows:", written)
# Script entry point.
if __name__ == "__main__":
    main()