# lexir/data_parser/adilet_zan_parser.py
# Uploaded by irinaqqq via huggingface_hub (commit 6a02b16, verified).
import os, re, time, json
import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Russian and Kazakh versions of the same document (id K950001000_ — the
# 1995 Constitution of Kazakhstan) on the official "Adilet" legal portal.
URL_RU = "https://adilet.zan.kz/rus/docs/K950001000_"
URL_KZ = "https://adilet.zan.kz/kaz/docs/K950001000_"
# Output: one JSON object per paragraph with parallel ru/kz text.
OUT_JSONL = "data/clauses_constitution_ru_kz.jsonl"
def http_get(url, retries=3, timeout=25):
    """Fetch *url* and return the response body as text.

    Makes up to *retries* attempts with a linearly growing back-off
    (1.5 s, 3.0 s, ...) between failures.

    Args:
        url: Absolute URL to fetch.
        retries: Number of attempts before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        The decoded response body as ``str``.

    Raises:
        RuntimeError: If ``retries < 1``, so no attempt was ever made.
            (The original code did ``raise last`` with ``last = None``
            here, which crashed with an unrelated TypeError.)
        Exception: The last error seen when every attempt failed.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "ru,en;q=0.9,kk;q=0.8",
                    "Connection": "keep-alive",
                },
                timeout=timeout,
                # NOTE(review): TLS verification is disabled — presumably the
                # portal's certificate chain fails locally, but confirm; this
                # is unsafe on untrusted networks.
                verify=False,
            )
            response.raise_for_status()
            return response.text
        except Exception as exc:  # any transport/HTTP error triggers a retry
            last_error = exc
            time.sleep(1.5 * (attempt + 1))
    if last_error is None:
        raise RuntimeError(f"http_get called with retries={retries}; no attempt made")
    raise last_error
def clean_text(el: Tag) -> str:
    """Collapse the visible text of *el* into one whitespace-normalized line.

    Walks every text node under *el*, skips text that lives inside
    ``<script>``/``<style>`` tags, replaces non-breaking spaces with plain
    spaces, and squeezes whitespace runs down to single spaces.

    Fixes two defects in the original: anchor text was emitted twice
    (``<a>`` tags were appended via ``get_text()`` even though their inner
    text nodes are also descendants), and ``continue`` on a script/style
    Tag skipped only the tag object itself, not its text content.
    """
    parts = []
    for node in el.descendants:
        if isinstance(node, NavigableString):
            # Drop text belonging to non-visible containers.
            if node.find_parent(["script", "style"]) is not None:
                continue
            parts.append(str(node))
    text = "".join(parts).replace("\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()
def find_main_container(soup: BeautifulSoup):
    """Return the node most likely to hold the document body.

    Tries CSS selectors from most to least specific and falls back to
    ``<body>`` (or the whole soup) when none of them match a non-empty tag.
    """
    selectors = (
        ".container_gamma.text.text_upd article",
        "article",
        "main",
        "#content",
        ".content",
        ".document",
        ".doc",
    )
    for selector in selectors:
        candidate = soup.select_one(selector)
        # Truthiness matters: an empty Tag is falsy and is skipped.
        if candidate:
            return candidate
    return soup.body or soup
def norm_art_num(s: str) -> str:
    """Canonicalize an article number: unify en/em dashes to "-", drop all whitespace."""
    unified = s.translate(str.maketrans("\u2013\u2014", "--"))
    return "".join(unified.split())
def make_article_regex(lang: str):
    """Compile the article-heading pattern for the given language.

    Russian headings look like "Статья 24"; Kazakh headings are either
    "24-бап" or "Бап 24".  Numbers may be arabic or roman and may carry a
    dash-joined suffix (e.g. "24-1").  Matching is case-insensitive.
    """
    num = r"[0-9IVXLC]+(?:[-–—][0-9IVXLC]+)?"
    if lang == "ru":
        pattern = rf"(?:^|\b)Статья\s+({num})\b"
    else:
        pattern = rf"(?:^|\b)({num})\s*-\s*бап\b|(?:^|\b)Бап\s+({num})\b"
    return re.compile(pattern, re.I)
def extract_by_articles(container: Tag, lang: str):
    """Scan *container* and group paragraph text under article headings.

    Walks every tag under *container* in document order.  A tag whose text
    matches the language's article-heading pattern starts a new article;
    subsequent <p> tags are collected as that article's paragraphs until
    the next heading.  Articles that collected no paragraphs are dropped.

    Args:
        container: Root tag of the document body.
        lang: "ru" selects the Russian heading regex; anything else Kazakh.

    Returns:
        A list of dicts: {"article_number", "article_title", "paragraphs"}.
    """
    art_re = make_article_regex(lang)
    articles = []
    current = None  # article currently being filled, or None

    def push():
        # Commit the in-progress article, but only if it collected text.
        nonlocal current
        if current and current["paragraphs"]:
            articles.append(current)
        current = None

    # Tags whose own text may contain an article heading.
    scan_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "li")
    for node in container.descendants:
        if not isinstance(node, Tag):
            continue
        if node.name in scan_tags:
            title = clean_text(node)
            if title:
                m = art_re.search(title)
                if m:
                    # ru pattern captures in group 1; the kz pattern has two
                    # alternatives, capturing in group 1 or group 2.
                    num = m.group(1) or m.group(2)
                    num = norm_art_num(num)
                    push()
                    current = {"article_number": num, "article_title": title, "paragraphs": []}
                    continue  # heading consumed; don't also treat it as a paragraph
        if node.name == "p" and current is not None:
            txt = clean_text(node)
            # Skip <p> tags that are themselves headings.
            if txt and not art_re.search(txt):
                current["paragraphs"].append(txt)
    push()  # flush the trailing article
    return articles
def build_index(articles):
    """Index paragraphs by (article_number, paragraph_number).

    Paragraph numbers are 1-based within each article.  Returns the index
    dict together with the total number of paragraphs seen (later entries
    overwrite earlier ones on key collision, but the total still counts
    every paragraph).
    """
    index = {}
    for article in articles:
        number = article["article_number"]
        title = article.get("article_title")
        for position, text in enumerate(article["paragraphs"], start=1):
            index[(number, position)] = {"text": text, "article_title": title}
    paragraph_count = sum(len(a["paragraphs"]) for a in articles)
    return index, paragraph_count
def main():
    """Download both language versions, align paragraphs, and write JSONL."""
    # Fetch and parse the Russian and Kazakh versions of the document.
    soup_ru = BeautifulSoup(http_get(URL_RU), "lxml")
    soup_kz = BeautifulSoup(http_get(URL_KZ), "lxml")
    cont_ru = find_main_container(soup_ru)
    cont_kz = find_main_container(soup_kz)
    # Split each version into articles and number their paragraphs.
    arts_ru = extract_by_articles(cont_ru, "ru")
    arts_kz = extract_by_articles(cont_kz, "kz")
    idx_ru, total_ru = build_index(arts_ru)
    idx_kz, total_kz = build_index(arts_kz)
    # Keep only (article, paragraph) keys present in BOTH languages so
    # every output row is a parallel ru/kz pair.
    common = sorted(set(idx_ru.keys()) & set(idx_kz.keys()))
    out_dir = os.path.dirname(OUT_JSONL)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for (art, par) in common:
            # Stable node id, e.g. "KZ.CONST.1995:ART1:PAR2".
            node_id = f"KZ.CONST.1995:ART{art}:PAR{par}"
            obj = {
                "id": f"{node_id}:cl1",
                "text": idx_ru[(art, par)]["text"],
                "text_kz": idx_kz[(art, par)]["text"],
                "meta": {
                    "doc_id": "KZ.CONST.1995",
                    "article_number": art,
                    "paragraph_number": par,
                    "article_title_ru": idx_ru[(art, par)].get("article_title"),
                    "article_title_kz": idx_kz[(art, par)].get("article_title"),
                    "source_ru": URL_RU,
                    "source_kz": URL_KZ
                }
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
            written += 1
    # Diagnostics: article/paragraph counts per language and aligned pairs.
    print("[diag] ru_articles:", len(arts_ru), "ru_paragraphs:", total_ru)
    print("[diag] kz_articles:", len(arts_kz), "kz_paragraphs:", total_kz)
    print("[diag] common_pairs:", len(common))
    print("[ok] written:", OUT_JSONL, "rows:", written)
# Script entry point.
if __name__ == "__main__":
    main()