Spaces:

Mazenbs
/

extract_html_full

Sleeping

Mazenbs committed on Dec 2, 2025

Commit

0f10668

verified ·

1 Parent(s): 26aeab0

Create parser/assembler.py

Files changed (1) hide show

parser/assembler.py ADDED Viewed

+# parser/assembler.py
+from bs4 import BeautifulSoup
+from helpers.text_blocks import extract_all_text_blocks
+from parser.section_extractor import extract_sections_from_text_blocks
+from parser.article_extractor import extract_articles_from_section_lines
+from parser.table_extractor import link_tables_to_sections_and_articles
+def parse_law_from_html(html: str):
+    soup = BeautifulSoup(html, "html.parser")
+    title_tag = soup.find("title")
+    title = title_tag.text.strip() if title_tag else "عنوان غير معروف"
+    text_blocks = extract_all_text_blocks(soup)
+    sections_raw, preamble = extract_sections_from_text_blocks(text_blocks)
+    if not sections_raw:
+        sections_raw = [{"title": "", "lines": text_blocks}]
+    parsed_sections = []
+    for sec in sections_raw:
+        parsed = extract_articles_from_section_lines(sec["lines"])
+        parsed_sections.append({
+            "title": sec["title"],
+            "content": parsed["content"],
+            "articles": parsed["articles"],
+            "tables": []
+        })
+    parsed_sections = link_tables_to_sections_and_articles(soup, parsed_sections)
+    return {
+        "title": title,
+        "preamble": "\n".join(preamble).strip(),
+        "sections": parsed_sections
+    }