Mazenbs committed on
Commit
3c2ec1f
·
verified ·
1 Parent(s): 735d3d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -0
app.py CHANGED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from fastapi import FastAPI, Form
4
+ from fastapi.responses import JSONResponse
5
+ from bs4 import BeautifulSoup
6
+ from typing import Optional
7
+
8
+ from supabase_utils import save_law_to_supabase
9
+
10
+ app = FastAPI(title="Law Parser API", version="1.0")
11
+
12
+ # ---------------------------
13
+ # وظائف مساعدة
14
+ # ---------------------------
15
def clean_text(text: str) -> str:
    """Normalize raw extracted text.

    Removes the Arabic tatweel (kashida) character, drops blank lines,
    collapses runs of whitespace, and starts a new line after every
    full stop. Returns "" for empty/None input.
    """
    if not text:
        return ""
    # Strip the tatweel stretching character used for justification.
    without_tatweel = text.replace("ـ", "")
    # Keep only non-blank lines, collapsing internal whitespace runs.
    compact_lines = []
    for raw_line in without_tatweel.splitlines():
        if raw_line.strip():
            compact_lines.append(re.sub(r"\s+", " ", raw_line))
    normalized = "\n".join(compact_lines)
    # Break the text so each sentence (ending in '.') sits on its own line.
    normalized = re.sub(r"\.(\s*)", r".\n", normalized)
    return normalized.strip()
25
+
26
def extract_all_text_blocks(soup):
    """Collect the cleaned text of every text-bearing tag in *soup*.

    Scans paragraph, list-item, span, div and heading tags in document
    order and returns a list of non-empty cleaned strings.
    """
    text_tags = ["p", "li", "span", "div", "h1", "h2", "h3", "h4", "h5"]
    cleaned = (
        clean_text(tag.get_text(separator=" ", strip=True))
        for tag in soup.find_all(text_tags)
    )
    return [block for block in cleaned if block]
35
+
36
def is_section(line):
    """Return True if *line* opens a structural division heading
    (book "الكتاب", part "الباب" or chapter "الفصل")."""
    heading = re.match(r"^(الكتاب|الباب|الفصل)\s*[\d\w-]*", line)
    return heading is not None
38
+
39
def is_article(line):
    """Return True if *line* opens a numbered article,
    e.g. "مادة (12)" or "مادة 12"."""
    return re.match(r"^مادة\s*\(?(\d+)\)?", line) is not None
41
+
42
def get_article_number(line):
    """Extract the integer article number from an article heading.

    Returns None when *line* is not an article heading.
    """
    match = re.match(r"^مادة\s*\(?(\d+)\)?", line)
    if match is None:
        return None
    return int(match.group(1))
45
+
46
def extract_preamble(text_blocks):
    """Collect the introductory text preceding the first section/article heading.

    Args:
        text_blocks: cleaned text blocks in document order.

    Returns:
        The preamble as a single newline-joined, stripped string
        (empty when the document opens directly with a heading).
    """
    section_keywords = ["الكتاب", "الباب", "الفصل"]
    article_keywords = ["مادة"]
    # Break a block after a full stop, or just before a heading keyword,
    # so headings embedded mid-block are still detected.
    splitter = re.compile(
        r"(?<=\.)|(?=\b(?:" + "|".join(section_keywords + article_keywords) + r")\b)"
    )

    preamble_lines = []
    for block in text_blocks:
        block = block.strip()
        if not block:
            continue
        for line in splitter.split(block):
            line = line.strip()
            if not line:
                continue
            # Drop punctuation (keeping word chars and Arabic letters) so
            # e.g. "مادة (1)" still matches the heading patterns.
            line_clean = re.sub(r"[^\w\s\d\u0600-\u06FF]", "", line)
            is_section_line = any(
                re.match(rf"^{kw}\s+", line_clean) for kw in section_keywords
            )
            is_article_line = any(
                re.match(rf"^{kw}\s*\(?\d+\)?", line_clean) for kw in article_keywords
            )
            if is_section_line or is_article_line:
                # First heading reached — the preamble is complete, so stop
                # scanning (the original kept looping over every remaining
                # block while appending nothing).
                return "\n".join(preamble_lines).strip()
            preamble_lines.append(line)
    return "\n".join(preamble_lines).strip()
73
+
74
+ # ---------------------------
75
+ # تحليل النصوص إلى أقسام ومواد
76
+ # ---------------------------
77
def parse_law(lines, end_at_article: Optional[int] = None):
    """Parse cleaned text lines into a (preamble, sections) pair.

    Args:
        lines: cleaned text blocks/lines in document order.
        end_at_article: when given, parsing stops as soon as an article
            whose number exceeds this value is encountered.

    Returns:
        (preamble, sections): preamble is a newline-joined string; sections
        is a list of {"content": str, "articles": [...]} dicts where each
        article is {"number": int, "text": str}.
    """
    sections = []
    preamble_lines = []
    current_section = None
    current_article = None
    collecting_preamble = True
    # Dedup map: "number|first 30 chars of heading" -> article dict, so a
    # repeated article heading resumes the existing article instead of
    # creating a duplicate entry.
    article_map = {}

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Everything before the first heading is preamble; the heading line
        # itself falls through and is processed below.
        if collecting_preamble:
            if is_section(line) or is_article(line):
                collecting_preamble = False
            else:
                preamble_lines.append(line)
                continue

        # A section heading starts a fresh section.
        if is_section(line):
            if current_section:
                sections.append(current_section)
            current_section = {"content": line, "articles": []}
            current_article = None
            continue

        # An article heading starts (or resumes) an article.
        if is_article(line):
            number = get_article_number(line)
            if end_at_article is not None and number > end_at_article:
                # Past the requested cut-off: nothing later is needed, so
                # stop immediately (the original set a stop_reading flag and
                # uselessly kept iterating over every remaining line).
                break

            current_article = {"number": number, "text": line}
            if current_section is None:
                # Article appeared before any section heading — synthesize
                # an anonymous section to hold it.
                current_section = {"content": "", "articles": []}

            key = f"{number}|{line[:30]}"
            if key not in article_map:
                article_map[key] = current_article
                current_section["articles"].append(current_article)
            else:
                # Duplicate heading: keep appending to the first occurrence.
                current_article = article_map[key]
            continue

        # Body text before any article belongs to the section's content.
        if current_section and not current_section["articles"]:
            existing_content_lines = current_section["content"].split("\n")
            if line not in existing_content_lines:
                current_section["content"] += ("\n" if current_section["content"] else "") + line
            continue

        # Body text after an article heading belongs to that article
        # (skipping lines it already contains).
        if current_article:
            existing_text_lines = current_article["text"].split("\n")
            for new_line in line.split("\n"):
                new_line = new_line.strip()
                if new_line and new_line not in existing_text_lines:
                    current_article["text"] += ("\n" if current_article["text"] else "") + new_line
                    existing_text_lines.append(new_line)

    if current_section:
        sections.append(current_section)

    preamble = "\n".join(preamble_lines).strip()
    return preamble, sections
147
+
148
+ # ---------------------------
149
+ # إلحاق الجداول
150
+ # ---------------------------
151
def attach_tables_to_sections(soup, sections):
    """Extract every <table> in *soup* and attach it to a section in place.

    The first row supplies the headers; every following row is padded or
    truncated to the header width. Table i is attached to section i, or to
    the last section when there are more tables than sections. Returns the
    (mutated) *sections* list.
    """
    for position, table in enumerate(soup.find_all("table")):
        table_rows = table.find_all("tr")
        if not table_rows:
            continue
        header_cells = table_rows[0].find_all(["td", "th"])
        headers = [clean_text(" ".join(cell.stripped_strings)) for cell in header_cells]
        width = len(headers)
        if width == 0:
            continue
        body_rows = []
        for tr in table_rows[1:]:
            cells = [clean_text(" ".join(td.stripped_strings)) for td in tr.find_all(["td", "th"])]
            # Normalize the row to exactly `width` columns: pad with "" then cut.
            body_rows.append((cells + [""] * width)[:width])
        table_data = {"position": position, "headers": headers, "rows": body_rows}

        # Attach the table to the nearest section by index.
        if sections:
            target = min(position, len(sections) - 1)
            sections[target].setdefault("tables", []).append(table_data)
    return sections
176
+
177
+ # ---------------------------
178
+ # تحليل القانون من HTML
179
+ # ---------------------------
180
def parse_law_from_html(html, end_at_article=None, save_to_supabase=False):
    """Parse a law document from raw HTML into a structured dict.

    Args:
        html: full HTML of the law page.
        end_at_article: optional article number at which parsing stops.
        save_to_supabase: when True, the result is persisted best-effort.

    Returns:
        {"title": str, "preamble": str, "sections": [...]} where each
        section carries "content", "articles" and "tables" keys.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.text.strip() if title_tag else "عنوان غير معروف"

    text_blocks = extract_all_text_blocks(soup)
    # The preamble exposed to callers comes from extract_preamble; the one
    # parse_law also computes is redundant here and explicitly discarded
    # (the original bound it to an unused local).
    preamble = extract_preamble(text_blocks)
    _, sections = parse_law(text_blocks, end_at_article=end_at_article)
    sections = attach_tables_to_sections(soup, sections)

    organized_sections = [
        {
            "content": sec.get("content", "").strip(),
            "articles": sec.get("articles", []),
            "tables": sec.get("tables", []),
        }
        for sec in sections
    ]

    result = {
        "title": title,
        "preamble": preamble,
        "sections": organized_sections
    }

    if save_to_supabase:
        # Persistence is best-effort: the parsed result is still returned
        # even when the save fails.
        try:
            save_law_to_supabase(result)
        except Exception as e:
            print("❌ خطأ أثناء الحفظ في Supabase:", e)

    return result
211
+
212
+ # ---------------------------
213
+ # نقطة النهاية API
214
+ # ---------------------------
215
@app.post("/parse")
async def parse_law_endpoint(
    url: str = Form(...),
    save_to_supabase: bool = Form(False),
    end_at_article: Optional[int] = Form(None)
):
    """Fetch a law page by URL, parse it, and return the structured JSON.

    Form fields:
        url: page to download. NOTE(review): the URL is fetched server-side
            with no host validation — consider an allow-list to avoid SSRF
            if this endpoint is exposed publicly.
        save_to_supabase: persist the parsed law when True.
        end_at_article: stop parsing once this article number is exceeded.

    Returns HTTP 200 with the parsed law, or HTTP 500 with {"error": ...}.
    """
    try:
        # Timeout added so a stalled remote server cannot hang this request
        # forever (the original requests.get had none). NOTE(review):
        # requests is blocking inside an async endpoint; consider
        # fastapi.concurrency.run_in_threadpool or httpx for concurrency.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        result = parse_law_from_html(
            resp.text,
            end_at_article=end_at_article,
            save_to_supabase=save_to_supabase,
        )
        return JSONResponse(content=result)

    except Exception as e:
        # Any failure (network, HTTP status, parsing) is reported uniformly.
        return JSONResponse(status_code=500, content={"error": str(e)})