Mazenbs committed on
Commit
d4584e2
·
verified ·
1 Parent(s): ebc846d

Update parser/assembler.py

Browse files
Files changed (1) hide show
  1. parser/assembler.py +82 -0
parser/assembler.py CHANGED
@@ -61,6 +61,88 @@ def extract_sections(blocks: List[Dict[str, str]]) -> List[Dict]:
61
  return sections
62
 
63
  def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
65
  sections_raw = extract_sections(remaining_blocks)
66
 
 
61
  return sections
62
 
63
  def parse_law_from_texts(text_blocks: List[Dict[str, str]]) -> Dict:
64
+ # استخراج العنوان + المقدمة + باقي النص
65
+ title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
66
+ sections_raw = extract_sections(remaining_blocks)
67
+
68
+ # -------------------------------------------------------
69
+ # استخراج رقم القانون والسنة من العنوان
70
+ # -------------------------------------------------------
71
+ law_info_title = extract_law_number_and_year(title)
72
+
73
+ # استخراج رقم القانون والسنة من المقدمة (كباك أب)
74
+ law_info_preamble = extract_law_number_and_year(preamble)
75
+
76
+ # -------------------------------------------------------
77
+ # اختيار الأفضل:
78
+ # العنوان أولوية، وإذا ناقص → نكمل من المقدمة
79
+ # -------------------------------------------------------
80
+ law_number = None
81
+ law_year = None
82
+
83
+ if law_info_title:
84
+ law_number = law_info_title.get("number")
85
+ law_year = law_info_title.get("year")
86
+
87
+ if (not law_number or not law_year) and law_info_preamble:
88
+ law_number = law_number or law_info_preamble.get("number")
89
+ law_year = law_year or law_info_preamble.get("year")
90
+
91
+ # -------------------------------------------------------
92
+ # معالجة الأقسام
93
+ # -------------------------------------------------------
94
+ sections = []
95
+ for sec in sections_raw:
96
+ raw_blocks = sec["texts"]
97
+
98
+ # دمج نصوص القسم مع merge_colon_lines
99
+ # واستبعاد المواد من نص المحتوى
100
+ content = "\n".join([
101
+ b["text"] for b in raw_blocks
102
+ if not is_article(b["text"])
103
+ ]).strip()
104
+
105
+ content = merge_colon_lines(content)
106
+
107
+ # استخراج المواد من القسم
108
+ articles = extract_articles_from_blocks(raw_blocks)
109
+
110
+ # تجهيز المواد بالشكل المطلوب
111
+ articles_cleaned = []
112
+ for a in articles:
113
+ if a["number"] is None:
114
+ articles_cleaned.append({"tag": a["text"]})
115
+ else:
116
+ articles_cleaned.append({
117
+ "number": a["number"],
118
+ "text": a["text"]
119
+ })
120
+
121
+ # إضافة القسم النهائي
122
+ sections.append({
123
+ "title": sec["name"],
124
+ "content": content,
125
+ "articles": articles_cleaned
126
+ })
127
+
128
+ # -------------------------------------------------------
129
+ # الاستجابة النهائية
130
+ # -------------------------------------------------------
131
+ return {
132
+ "message": "success",
133
+ "blocks": {
134
+ "count": len(text_blocks),
135
+ },
136
+ "law": {
137
+ "title": title,
138
+ "preamble": preamble,
139
+ "number": law_number,
140
+ "year": law_year,
141
+ "sections": sections
142
+ }
143
+ }
144
+
145
+ def parse_law_from_textsx(text_blocks: List[Dict[str, str]]) -> Dict:
146
  title, preamble, remaining_blocks = extract_title_and_preamble(text_blocks)
147
  sections_raw = extract_sections(remaining_blocks)
148