Mazenbs commited on
Commit
735d3d7
·
verified ·
1 Parent(s): 47207da

Update app.py

Browse files

import re
import requests
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from bs4 import BeautifulSoup
from typing import Optional

# Project-local helper that persists a parsed law document to Supabase.
from supabase_utils import save_law_to_supabase

# FastAPI application exposing the law-parsing endpoint below.
app = FastAPI(title="Law Parser API", version="1.0")

# ---------------------------
# وظائف مساعدة
# ---------------------------
def clean_text(text: str) -> str:
    """Normalize raw extracted text.

    Removes the Arabic tatweel (kashida) character, drops blank lines,
    collapses whitespace runs inside each line, and starts a new line
    after every period. Returns "" for falsy input.

    NOTE(review): the period split also breaks decimals like "3.5" —
    presumably acceptable for this legal-text corpus; confirm if numbers
    with fractions ever appear.
    """
    if not text:
        return ""
    # Strip the tatweel stretching character used for justification.
    without_tatweel = text.replace("ـ", "")
    # Keep non-blank lines only, squeezing internal whitespace to single spaces.
    compacted = "\n".join(
        re.sub(r"\s+", " ", ln)
        for ln in without_tatweel.splitlines()
        if ln.strip()
    )
    # Break the text into one sentence per line after each period.
    broken = re.sub(r"\.(\s*)", r".\n", compacted)
    return broken.strip()

def extract_all_text_blocks(soup):
    """Collect cleaned text from every content-bearing tag, in document order.

    Scans paragraph, list-item, span, div and heading tags; each tag's text
    is flattened with single-space separators, passed through clean_text,
    and kept only if non-empty.
    """
    allowed_tags = ["p", "li", "span", "div", "h1", "h2", "h3", "h4", "h5"]
    cleaned = (
        clean_text(tag.get_text(separator=" ", strip=True))
        for tag in soup.find_all(allowed_tags)
    )
    return [block for block in cleaned if block]

def is_section(line):
    """Return True if the line opens a structural division (book/part/chapter)."""
    # The trailing \s*[\d\w-]* is fully optional, so effectively this tests
    # whether the line starts with one of the three division keywords.
    return re.match(r"^(الكتاب|الباب|الفصل)\s*[\d\w-]*", line) is not None

def is_article(line):
    """Return True if the line starts a numbered article: "مادة (n)" or "مادة n"."""
    return re.match(r"^مادة\s*\(?(\d+)\)?", line) is not None

def get_article_number(line):
    """Extract the article number from a "مادة ..." heading, or None if absent."""
    match = re.match(r"^مادة\s*\(?(\d+)\)?", line)
    if match is None:
        return None
    return int(match.group(1))

def extract_preamble(text_blocks):
    """Return the preamble: all text preceding the first section/article heading.

    Each block is split into fragments (after every period, or right before a
    structural keyword). Collection stops at the first fragment that looks
    like a section heading ("الكتاب"/"الباب"/"الفصل" + space) or an article
    heading ("مادة" + number).

    Fix: the original kept scanning every remaining block after the first
    heading was found, appending nothing — replaced the boolean flag with an
    early return (identical result, no dead iteration). The splitter regex is
    also compiled once instead of being rebuilt per block.

    Parameters
    ----------
    text_blocks : iterable of str
        Cleaned text blocks in document order.

    Returns
    -------
    str
        Newline-joined, stripped preamble lines; "" if there are none.
    """
    section_keywords = ["الكتاب", "الباب", "الفصل"]
    article_keywords = ["مادة"]
    # Split after each period, or immediately before a structural keyword.
    splitter = re.compile(
        r"(?<=\.)|(?=\b(?:" + "|".join(section_keywords + article_keywords) + r")\b)"
    )

    preamble_lines = []
    for block in text_blocks:
        block = block.strip()
        if not block:
            continue
        for line in splitter.split(block):
            line = line.strip()
            if not line:
                continue
            # Drop punctuation (e.g. parentheses) so heading detection
            # sees "مادة 5" rather than "مادة (5)".
            line_clean = re.sub(r"[^\w\s\d\u0600-\u06FF]", "", line)
            is_section_line = any(
                re.match(rf"^{kw}\s+", line_clean) for kw in section_keywords
            )
            is_article_line = any(
                re.match(rf"^{kw}\s*\(?\d+\)?", line_clean) for kw in article_keywords
            )
            if is_section_line or is_article_line:
                # First heading reached — everything gathered so far is the preamble.
                return "\n".join(preamble_lines).strip()
            preamble_lines.append(line)
    return "\n".join(preamble_lines).strip()

# ---------------------------
# تحليل النصوص إلى أقسام ومواد
# ---------------------------
def parse_law(lines, end_at_article: Optional[int] = None):
    """Split cleaned text lines into a preamble string and structured sections.

    Parameters
    ----------
    lines : iterable of str
        Cleaned text lines in document order.
    end_at_article : Optional[int]
        When set, parsing stops permanently as soon as an article numbered
        above this value is encountered.

    Returns
    -------
    tuple
        (preamble, sections) where each section is a dict
        {"content": str, "articles": [{"number": int, "text": str}, ...]}.
    """
    sections = []
    preamble_lines = []
    current_section = None
    current_article = None
    collecting_preamble = True
    # Maps "number|first-30-chars" -> article dict, so a repeated article
    # heading keeps accumulating into the first instance instead of duplicating.
    article_map = {}
    stop_reading = False

    for line in lines:
        line = line.strip()
        if not line or stop_reading:
            continue

        # Everything before the first section/article heading is preamble.
        # NOTE(review): when the first heading arrives, the flag flips and the
        # line deliberately falls through to be processed as a heading below.
        if collecting_preamble:
            if is_section(line) or is_article(line):
                collecting_preamble = False
            else:
                preamble_lines.append(line)
                continue

        # Start of a new section: flush the previous one.
        if is_section(line):
            if current_section:
                sections.append(current_section)
            current_section = {"content": line, "articles": []}
            current_article = None
            continue

        # Start of a new article.
        if is_article(line):
            number = get_article_number(line)
            # Stop reading entirely once the requested last article is exceeded.
            if end_at_article is not None and number > end_at_article:
                stop_reading = True
                continue

            current_article = {"number": number, "text": line}
            if current_section is None:
                # Article appeared before any section heading: synthesize one.
                current_section = {"content": "", "articles": []}

            key = f"{number}|{line[:30]}"
            if key not in article_map:
                article_map[key] = current_article
                current_section["articles"].append(current_article)
            else:
                # Duplicate heading: redirect appends to the first instance.
                current_article = article_map[key]
            continue

        # Text before any article belongs to the section's own content
        # (deduplicated line-by-line).
        if current_section and not current_section["articles"]:
            existing_content_lines = current_section["content"].split("\n")
            if line not in existing_content_lines:
                current_section["content"] += ("\n" if current_section["content"] else "") + line
            continue

        # Text after an article heading is appended to that article
        # (deduplicated line-by-line).
        if current_article:
            new_lines = line.split("\n")
            existing_text_lines = current_article["text"].split("\n")
            for new_line in new_lines:
                new_line = new_line.strip()
                if new_line and new_line not in existing_text_lines:
                    current_article["text"] += ("\n" if current_article["text"] else "") + new_line
                    existing_text_lines.append(new_line)

    # Flush the final open section.
    if current_section:
        sections.append(current_section)

    preamble = "\n".join(preamble_lines).strip()
    return preamble, sections

# ---------------------------
# إلحاق الجداول
# ---------------------------
def attach_tables_to_sections(soup, sections):
    """Attach every HTML <table> to a section, matched by position.

    The first <tr> supplies the headers; each following row is padded with
    empty strings or truncated so it is exactly as wide as the header row.
    Table i goes to section i, clamped to the last section. When there are
    no sections at all, tables are silently dropped (original behavior).
    """
    for position, table in enumerate(soup.find_all("table")):
        trs = table.find_all("tr")
        if not trs:
            continue

        headers = [
            clean_text(" ".join(cell.stripped_strings))
            for cell in trs[0].find_all(["td", "th"])
        ]
        width = len(headers)
        if width == 0:
            continue

        rows = []
        for tr in trs[1:]:
            cells = [
                clean_text(" ".join(cell.stripped_strings))
                for cell in tr.find_all(["td", "th"])
            ]
            # Normalize the row to exactly `width` columns (pad then truncate).
            rows.append((cells + [""] * width)[:width])

        if sections:
            target = sections[min(position, len(sections) - 1)]
            target.setdefault("tables", []).append(
                {"position": position, "headers": headers, "rows": rows}
            )
    return sections

# ---------------------------
# تحليل القانون من HTML
# ---------------------------
def parse_law_from_html(html, end_at_article=None, save_to_supabase=False):
    """Parse a law document from raw HTML into a structured dict.

    Pipeline: extract text blocks, pull the preamble, parse sections and
    articles (optionally stopping after `end_at_article`), then attach any
    HTML tables to their nearest section. The preamble returned by
    parse_law is discarded; the one from extract_preamble is used.

    When save_to_supabase is True, the result is best-effort persisted —
    a failure is printed and the result is still returned.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = "عنوان غير معروف"

    blocks = extract_all_text_blocks(soup)
    preamble_text = extract_preamble(blocks)
    _, raw_sections = parse_law(blocks, end_at_article=end_at_article)
    raw_sections = attach_tables_to_sections(soup, raw_sections)

    result = {
        "title": title,
        "preamble": preamble_text,
        "sections": [
            {
                "content": sec.get("content", "").strip(),
                "articles": sec.get("articles", []),
                "tables": sec.get("tables", []),
            }
            for sec in raw_sections
        ],
    }

    if save_to_supabase:
        try:
            save_law_to_supabase(result)
        except Exception as e:
            print("❌ خطأ أثناء الحفظ في Supabase:", e)

    return result

# ---------------------------
# نقطة النهاية API
# ---------------------------


@app.post("/parse")
async def parse_law_endpoint(
    url: str = Form(...),
    save_to_supabase: bool = Form(False),
    end_at_article: Optional[int] = Form(None)
):
    """Fetch a law page by URL and return its parsed structure as JSON.

    Form parameters
    ---------------
    url : str
        Page to download and parse.
    save_to_supabase : bool
        Also persist the parsed result (best-effort).
    end_at_article : Optional[int]
        Stop parsing after this article number.

    Any fetch or parse failure is returned as a 500 with {"error": message}.

    Fixes: the decorator was split across two lines ("@app" then
    ".post(...)"), which is a SyntaxError — a decorator must be a single
    logical line; and the HTTP fetch had no timeout, so a dead host could
    hang the worker indefinitely.
    """
    try:
        # Bound the fetch so an unresponsive host cannot hang the request.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        result = parse_law_from_html(
            resp.text,
            end_at_article=end_at_article,
            save_to_supabase=save_to_supabase,
        )
        return JSONResponse(content=result)

    except Exception as e:
        # Surface the failure to the client rather than a bare 500 page.
        return JSONResponse(status_code=500, content={"error": str(e)})

Files changed (1) hide show
  1. app.py +0 -226
app.py CHANGED
@@ -1,226 +0,0 @@
1
- import re
2
- import requests
3
- from fastapi import FastAPI, Form
4
- from fastapi.responses import JSONResponse
5
- from bs4 import BeautifulSoup
6
- from typing import Optional
7
-
8
- from supabase_utils import save_law_to_supabase
9
-
10
- app = FastAPI(title="Law Parser API", version="1.0")
11
-
12
- # ---------------------------
13
- # وظائف مساعدة
14
- # ---------------------------
15
- def clean_text(text: str) -> str:
16
- if not text:
17
- return ""
18
- text = re.sub(r"ـ", "", text)
19
- lines = [line for line in text.splitlines() if line.strip()]
20
- text = "\n".join(lines)
21
- text = "\n".join(re.sub(r"\s+", " ", line) for line in text.splitlines())
22
- text = re.sub(r"\.(\s*)", r".\n", text)
23
- return text.strip()
24
-
25
- def extract_all_text_blocks(soup):
26
- blocks = []
27
- allowed_tags = ["p", "li", "span", "div", "h1", "h2", "h3", "h4", "h5"]
28
- for tag in soup.find_all(allowed_tags):
29
- raw = tag.get_text(separator=" ", strip=True)
30
- raw = clean_text(raw)
31
- if raw:
32
- blocks.append(raw)
33
- return blocks
34
-
35
- def is_section(line):
36
- return bool(re.match(r"^(الكتاب|الباب|الفصل)\s+", line))
37
-
38
- def is_article(line):
39
- return bool(re.match(r"^مادة\s*\(?(\d+)\)?", line))
40
-
41
- def get_article_number(line):
42
- m = re.match(r"^مادة\s*\(?(\d+)\)?", line)
43
- return int(m.group(1)) if m else None
44
-
45
- def extract_preamble(text_blocks):
46
- preamble_lines = []
47
- collecting_preamble = True
48
- section_keywords = ["الكتاب", "الباب", "الفصل"]
49
- article_keywords = ["مادة"]
50
-
51
- for block in text_blocks:
52
- block = block.strip()
53
- if not block:
54
- continue
55
- lines = re.split(
56
- r"(?<=\.)|(?=\b(?:" + "|".join(section_keywords + article_keywords) + r")\b)",
57
- block,
58
- )
59
- for line in lines:
60
- line = line.strip()
61
- if not line:
62
- continue
63
- line_clean = re.sub(r"[^\w\s\d\u0600-\u06FF]", "", line)
64
- if collecting_preamble:
65
- is_section_line = any(re.match(rf"^{kw}\s+", line_clean) for kw in section_keywords)
66
- is_article_line = any(re.match(rf"^{kw}\s*\(?\d+\)?", line_clean) for kw in article_keywords)
67
- if is_section_line or is_article_line:
68
- collecting_preamble = False
69
- else:
70
- preamble_lines.append(line)
71
- return "\n".join(preamble_lines).strip()
72
-
73
- # ----------------------------------------------------
74
- # أهم تعديل هنا: إضافة end_at_article داخل parse_law
75
- # ----------------------------------------------------
76
- def parse_law(lines, end_at_article: Optional[int] = None):
77
- sections = []
78
- preamble_lines = []
79
- current_section = None
80
- current_article = None
81
- collecting_preamble = True
82
- article_map = {}
83
- stop_reading = False
84
-
85
- for line in lines:
86
- line = line.strip()
87
- if not line or stop_reading:
88
- continue
89
-
90
- if collecting_preamble:
91
- if is_section(line) or is_article(line):
92
- collecting_preamble = False
93
- else:
94
- preamble_lines.append(line)
95
- continue
96
-
97
- if is_section(line):
98
- if current_section:
99
- sections.append(current_section)
100
- current_section = {"content": line, "articles": []}
101
- current_article = None
102
- continue
103
-
104
- if is_article(line):
105
- number = get_article_number(line)
106
-
107
- # 🚨 وقف إذا تجاوز المادة المحددة
108
- if end_at_article is not None and number > end_at_article:
109
- stop_reading = True
110
- continue
111
-
112
- current_article = {"number": number, "text": line}
113
- if current_section is None:
114
- current_section = {"content": "", "articles": []}
115
-
116
- key = f"{number}|{line[:30]}"
117
- if key not in article_map:
118
- article_map[key] = current_article
119
- current_section["articles"].append(current_article)
120
- else:
121
- current_article = article_map[key]
122
- continue
123
-
124
- # إضافة نص للقسم
125
- if current_section and not current_section["articles"]:
126
- existing_content_lines = current_section["content"].split("\n")
127
- if line not in existing_content_lines:
128
- current_section["content"] += ("\n" if current_section["content"] else "") + line
129
- continue
130
-
131
- # إضافة نص للمادة
132
- if current_article:
133
- new_lines = line.split("\n")
134
- existing_text_lines = current_article["text"].split("\n")
135
- for new_line in new_lines:
136
- new_line = new_line.strip()
137
- if new_line and new_line not in existing_text_lines:
138
- current_article["text"] += ("\n" if current_article["text"] else "") + new_line
139
- existing_text_lines.append(new_line)
140
-
141
- if current_section:
142
- sections.append(current_section)
143
-
144
- preamble = "\n".join(preamble_lines).strip()
145
- return preamble, sections
146
-
147
- # ---------------------------
148
- # إلحاق الجداول
149
- # ---------------------------
150
- def extract_tables_and_link(soup, sections):
151
- tables = soup.find_all("table")
152
- for idx, table in enumerate(tables):
153
- all_trs = table.find_all("tr")
154
- if not all_trs:
155
- continue
156
- headers = [clean_text(" ".join(td.stripped_strings)) for td in all_trs[0].find_all(["td", "th"])]
157
- num_columns = len(headers)
158
- if num_columns == 0:
159
- continue
160
- rows = []
161
- for tr in all_trs[1:]:
162
- row = [clean_text(" ".join(td.stripped_strings)) for td in tr.find_all(["td", "th"])]
163
- if len(row) < num_columns:
164
- row += [""] * (num_columns - len(row))
165
- elif len(row) > num_columns:
166
- row = row[:num_columns]
167
- rows.append(row)
168
- table_data = {"position": idx, "headers": headers, "rows": rows}
169
- sections[0].setdefault("tables", []).append(table_data)
170
- return sections
171
-
172
- # ---------------------------
173
- # تحليل القانون من HTML
174
- # ---------------------------
175
- def parse_law_from_html(html, end_at_article=None, save_to_supabase=False):
176
- soup = BeautifulSoup(html, "html.parser")
177
- title_tag = soup.find("title")
178
- title = title_tag.text.strip() if title_tag else "عنوان غير معروف"
179
-
180
- text_blocks = extract_all_text_blocks(soup)
181
- preamble_tag = extract_preamble(text_blocks)
182
- preamble, sections = parse_law(text_blocks, end_at_article=end_at_article)
183
- sections = extract_tables_and_link(soup, sections)
184
-
185
- organized_sections = []
186
- for sec in sections:
187
- organized_sections.append({
188
- "content": sec.get("content", "").strip(),
189
- "articles": sec.get("articles", []),
190
- "tables": sec.get("tables", []),
191
- })
192
-
193
- result = {"title": title, "preamble": preamble_tag, "sections": organized_sections}
194
-
195
- if save_to_supabase:
196
- try:
197
- save_law_to_supabase(result)
198
- except Exception as e:
199
- print("❌ خطأ أثناء الحفظ في Supabase:", e)
200
-
201
- return result
202
-
203
- # ---------------------------
204
- # نقطة النهاية API
205
- # ---------------------------
206
- @app.post("/parse")
207
- async def parse_law_endpoint(
208
- url: str = Form(...),
209
- save_to_supabase: bool = Form(False),
210
- end_at_article: Optional[int] = Form(None)
211
- ):
212
- try:
213
- resp = requests.get(url)
214
- resp.raise_for_status()
215
- html_content = resp.text
216
-
217
- result = parse_law_from_html(
218
- html_content,
219
- end_at_article=end_at_article,
220
- save_to_supabase=save_to_supabase
221
- )
222
-
223
- return JSONResponse(content=result)
224
-
225
- except Exception as e:
226
- return JSONResponse(status_code=500, content={"error": str(e)})