Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

extract_html_full / parser /assembler.py

Mazenbs

Update parser/assembler.py

cd3c116 verified 27 days ago

raw

history blame contribute delete

4.96 kB

	# parser/assembler.py
	from typing import List, Dict, Tuple
	from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
	from helpers.cleaner import extract_law_number_and_year, merge_colon_lines

	def extract_title_and_preamble(
	    blocks: List[Dict[str, str]],
	) -> Tuple[str, str, List[Dict[str, str]]]:
	    """Split classified blocks into a title, a preamble and the body blocks.

	    Each block is a mapping with a ``"type"`` key; blocks whose type is
	    ``"title"`` or ``"preamble"`` must also carry a ``"text"`` key.  Blocks
	    of any other type than ``"title"``, ``"preamble"`` or ``"body"`` are
	    ignored.

	    The input is traversed exactly once, so a one-shot iterable (iterator or
	    generator) is handled the same way as a list.

	    Args:
	        blocks: Classified text blocks in document order.

	    Returns:
	        A ``(title, preamble, body_blocks)`` tuple where ``title`` and
	        ``preamble`` are the newline-joined texts of the matching blocks with
	        surrounding whitespace stripped, and ``body_blocks`` is the list of
	        the original ``"body"`` block objects in their original order.

	    Raises:
	        KeyError: If a block has no ``"type"`` key, or a title/preamble block
	            has no ``"text"`` key.
	    """
	    title_lines = []
	    preamble_lines = []
	    body_blocks = []

	    # Single pass: route every block to its bucket by type.
	    for block in blocks:
	        kind = block["type"]
	        if kind == "title":
	            title_lines.append(block["text"])
	        elif kind == "preamble":
	            preamble_lines.append(block["text"])
	        elif kind == "body":
	            # Body blocks are passed through untouched (not just their text).
	            body_blocks.append(block)

	    title = "\n".join(title_lines).strip()
	    preamble = "\n".join(preamble_lines).strip()

	    return title, preamble, body_blocks


	def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
	    """Group consecutive blocks into articles.

	    Every block's ``"text"`` is stripped.  A block recognised by
	    ``is_article`` starts a new article whose number comes from
	    ``extract_article_number``; every following non-article block is appended
	    to that article as a new line.  Blocks that appear before the first
	    article header are collected into a leading entry whose ``"number"`` is
	    ``None``.

	    Args:
	        blocks: Blocks in document order, each with a ``"text"`` key.

	    Returns:
	        A list of ``{"number": ..., "text": ...}`` dicts in document order.
	        Each ``"text"`` is the newline-joined, stripped lines of the group
	        after being passed through ``merge_colon_lines``.
	    """
	    # Each group is (article number or None, list of stripped lines).
	    # Lines are collected in a list and joined once at the end instead of
	    # growing a string with repeated concatenation.
	    groups = []

	    for block in blocks:
	        text = block["text"].strip()
	        if is_article(text):
	            groups.append((extract_article_number(text), [text]))
	        elif groups:
	            groups[-1][1].append(text)
	        else:
	            # Text preceding the first article header: keep it, unnumbered.
	            groups.append((None, [text]))

	    return [
	        {
	            "number": number,
	            "text": merge_colon_lines("\n".join(lines).strip()),
	        }
	        for number, lines in groups
	    ]


	def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
	    """Partition blocks into sections delimited by section headings.

	    A block whose stripped, digit-normalised text satisfies ``is_section``
	    opens a new section named after that normalised text.  All other blocks
	    are attached, as the original block objects, to the section currently
	    open.  Blocks seen before the first heading land in a section whose name
	    is the empty string.  A section with neither a name nor any blocks is
	    never emitted.

	    Returns:
	        A list of ``{"name": str, "texts": list}`` dicts in document order.
	    """
	    grouped = []
	    heading = ""
	    members = []

	    for entry in blocks:
	        normalized = normalize_digits(entry["text"].strip())

	        if not is_section(normalized):
	            members.append(entry)
	            continue

	        # A new heading closes whatever section was being collected.
	        if members or heading:
	            grouped.append({"name": heading, "texts": members})
	        heading = normalized
	        members = []

	    # Emit the trailing section, if it holds anything.
	    if members or heading:
	        grouped.append({"name": heading, "texts": members})

	    return grouped

	def _resolve_law_number_and_year(title: str, preamble: str) -> Tuple:
	    """Return ``(law_number, law_year)``, preferring the title over the preamble."""
	    # NOTE(review): extract_law_number_and_year is assumed to return either a
	    # falsy value or a mapping with "law_number" and "year" keys -- inferred
	    # from how its result is used here; confirm against helpers.cleaner.
	    from_title = extract_law_number_and_year(title)
	    # The preamble is only a fallback source.
	    from_preamble = extract_law_number_and_year(preamble)

	    law_number = None
	    law_year = None

	    if from_title:
	        law_number = from_title.get("law_number")
	        law_year = from_title.get("year")

	    # The title has priority; anything it lacks is filled from the preamble.
	    if (not law_number or not law_year) and from_preamble:
	        law_number = law_number or from_preamble.get("law_number")
	        law_year = law_year or from_preamble.get("year")

	    return law_number, law_year


	def _build_section(section: Dict) -> Dict:
	    """Turn one raw section (``"name"`` + ``"texts"``) into its output dict."""
	    raw_blocks = section["texts"]

	    # Section content: every block that is not an article header.  The text
	    # is stripped before the check so the decision matches the one made in
	    # extract_articles_from_blocks, which also tests the stripped text.
	    content = "\n".join(
	        block["text"]
	        for block in raw_blocks
	        if not is_article(block["text"].strip())
	    ).strip()
	    content = merge_colon_lines(content)

	    # Articles of the section, reshaped for the response: entries without an
	    # article number are emitted as {"tag": text}, numbered ones as
	    # {"number": ..., "text": ...}.
	    articles_cleaned = []
	    for article in extract_articles_from_blocks(raw_blocks):
	        if article["number"] is None:
	            articles_cleaned.append({"tag": article["text"]})
	        else:
	            articles_cleaned.append({
	                "number": article["number"],
	                "text": article["text"],
	            })

	    return {
	        "title": section["name"],
	        "content": content,
	        "articles": articles_cleaned,
	    }


	def parse_law_from_texts(text_blocks: List[Dict[str, str]], url: str = None) -> Dict:
	    """Assemble the structured representation of a law from classified blocks.

	    The blocks are split into title, preamble and body; the body is divided
	    into sections, and each section into content text and articles.  The law
	    number and year are taken from the title, falling back to the preamble
	    for whichever of the two the title does not provide.

	    Args:
	        text_blocks: Classified blocks, each with ``"type"`` and ``"text"``.
	        url: Optional source URL, echoed back unchanged (may be ``None``).

	    Returns:
	        A dict with ``"message"`` (always ``"success"``), ``"url"``,
	        ``"count"`` (number of input blocks) and ``"law"``, the latter holding
	        ``"title"``, ``"preamble"``, ``"number"``, ``"year"`` and
	        ``"sections"``.
	    """
	    title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
	    sections_raw = extract_sections(remaining_blocks)

	    law_number, law_year = _resolve_law_number_and_year(title, preamble)

	    sections = [_build_section(sec) for sec in sections_raw]

	    # Final response payload.
	    return {
	        "message": "success",
	        "url": url,
	        "count": len(text_blocks),
	        "law": {
	            "title": title,
	            "preamble": preamble,
	            "number": law_number,
	            "year": law_year,
	            "sections": sections,
	        },
	    }