Mazenbs committed on
Commit
0f10668
·
verified ·
1 Parent(s): 26aeab0

Create parser/assembler.py

Browse files
Files changed (1) hide show
  1. parser/assembler.py +37 -0
parser/assembler.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parser/assembler.py
2
+ from bs4 import BeautifulSoup
3
+ from helpers.text_blocks import extract_all_text_blocks
4
+ from parser.section_extractor import extract_sections_from_text_blocks
5
+ from parser.article_extractor import extract_articles_from_section_lines
6
+ from parser.table_extractor import link_tables_to_sections_and_articles
7
+
8
def parse_law_from_html(html: str):
    """Assemble a structured representation of a law from its HTML.

    The HTML is parsed with BeautifulSoup, its text blocks are extracted,
    and the blocks are split into sections and a preamble by the
    project's extractor helpers. Each section is then broken down into
    free content and articles, and finally tables are attached by
    ``link_tables_to_sections_and_articles``.

    Args:
        html: Raw HTML markup of the law page.

    Returns:
        A dict with three keys:
          * ``"title"``: stripped text of the ``<title>`` tag, or an
            Arabic placeholder string when the tag is absent.
          * ``"preamble"``: the preamble lines joined with newlines and
            stripped of surrounding whitespace.
          * ``"sections"``: whatever ``link_tables_to_sections_and_articles``
            returns for the assembled list of section dicts (each built
            with ``"title"``, ``"content"``, ``"articles"`` and an empty
            ``"tables"`` list).
    """
    document = BeautifulSoup(html, "html.parser")

    # Use the page's <title> when present, otherwise a placeholder.
    title_node = document.find("title")
    if title_node:
        law_title = title_node.text.strip()
    else:
        law_title = "عنوان غير معروف"

    blocks = extract_all_text_blocks(document)
    raw_sections, preamble_lines = extract_sections_from_text_blocks(blocks)

    # When no sections were detected, treat every text block as the body
    # of a single untitled section so article extraction still runs.
    if not raw_sections:
        raw_sections = [{"title": "", "lines": blocks}]

    def _assemble_section(raw):
        """Build one output section dict from a raw section."""
        extracted = extract_articles_from_section_lines(raw["lines"])
        return {
            "title": raw["title"],
            "content": extracted["content"],
            "articles": extracted["articles"],
            # Starts empty; populated (if at all) by the table-linking step.
            "tables": [],
        }

    assembled = [_assemble_section(raw) for raw in raw_sections]
    assembled = link_tables_to_sections_and_articles(document, assembled)

    preamble_text = "\n".join(preamble_lines).strip()
    return {
        "title": law_title,
        "preamble": preamble_text,
        "sections": assembled,
    }