Update app.py
import re
import requests
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from bs4 import BeautifulSoup
from typing import Optional
from supabase_utils import save_law_to_supabase
app = FastAPI(title="Law Parser API", version="1.0")
# ---------------------------
# Helper functions
# ---------------------------
def clean_text(text: str) -> str:
    if not text:
        return ""
    # Remove tatweel (kashida) characters and collapse extra whitespace
    text = re.sub(r"ـ", "", text)
    lines = [line for line in text.splitlines() if line.strip()]
    text = "\n".join(lines)
    text = "\n".join(re.sub(r"\s+", " ", line) for line in text.splitlines())
    # Start a new line after each sentence-ending period
    text = re.sub(r"\.(\s*)", r".\n", text)
    return text.strip()
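# Illustrative example (hypothetical input) of the normalization above:
#   clean_text("المادة  الأولى.  نص")  ->  "المادة الأولى.\nنص"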
def extract_all_text_blocks(soup):
    blocks = []
    allowed_tags = ["p", "li", "span", "div", "h1", "h2", "h3", "h4", "h5"]
    # Note: nested tags (e.g. a <p> inside a <div>) yield overlapping blocks;
    # the deduplication in parse_law absorbs the repeats.
    for tag in soup.find_all(allowed_tags):
        raw = tag.get_text(separator=" ", strip=True)
        raw = clean_text(raw)
        if raw:
            blocks.append(raw)
    return blocks
def is_section(line):
    # Matches section headings: الكتاب (Book), الباب (Part), الفصل (Chapter)
    return bool(re.match(r"^(الكتاب|الباب|الفصل)\s*[\d\w-]*", line))

def is_article(line):
    # Matches article headings such as "مادة (5)" or "مادة 5"
    return bool(re.match(r"^مادة\s*\(?(\d+)\)?", line))

def get_article_number(line):
    m = re.match(r"^مادة\s*\(?(\d+)\)?", line)
    return int(m.group(1)) if m else None
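# Quick illustrative checks for the matchers above (hypothetical inputs):
#   is_section("الباب الأول")        -> True
#   is_article("مادة (12) نص")       -> True
#   get_article_number("مادة (12)")  -> 12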
def extract_preamble(text_blocks):
    preamble_lines = []
    collecting_preamble = True
    section_keywords = ["الكتاب", "الباب", "الفصل"]
    article_keywords = ["مادة"]
    for block in text_blocks:
        block = block.strip()
        if not block:
            continue
        # Split on sentence boundaries or right before a section/article keyword
        lines = re.split(
            r"(?<=\.)|(?=\b(?:" + "|".join(section_keywords + article_keywords) + r")\b)",
            block,
        )
        for line in lines:
            line = line.strip()
            if not line:
                continue
            # Strip punctuation, keeping word characters, whitespace, and Arabic letters
            line_clean = re.sub(r"[^\w\s\d\u0600-\u06FF]", "", line)
            if collecting_preamble:
                is_section_line = any(re.match(rf"^{kw}\s+", line_clean) for kw in section_keywords)
                is_article_line = any(re.match(rf"^{kw}\s*\(?\d+\)?", line_clean) for kw in article_keywords)
                if is_section_line or is_article_line:
                    # Once the first heading appears, nothing further is collected
                    collecting_preamble = False
                else:
                    preamble_lines.append(line)
    return "\n".join(preamble_lines).strip()
# ---------------------------
# Parse text into sections and articles
# ---------------------------
def parse_law(lines, end_at_article: Optional[int] = None):
    sections = []
    preamble_lines = []
    current_section = None
    current_article = None
    collecting_preamble = True
    article_map = {}
    stop_reading = False
    for line in lines:
        line = line.strip()
        if not line or stop_reading:
            continue
        if collecting_preamble:
            if is_section(line) or is_article(line):
                collecting_preamble = False
            else:
                preamble_lines.append(line)
                continue
        # Start a new section
        if is_section(line):
            if current_section:
                sections.append(current_section)
            current_section = {"content": line, "articles": []}
            current_article = None
            continue
        # Start a new article
        if is_article(line):
            number = get_article_number(line)
            # Stop once the requested last article has been passed
            if end_at_article is not None and number is not None and number > end_at_article:
                stop_reading = True
                continue
            current_article = {"number": number, "text": line}
            if current_section is None:
                current_section = {"content": "", "articles": []}
            # Deduplicate repeated article blocks (nested tags yield the same text twice)
            key = f"{number}|{line[:30]}"
            if key not in article_map:
                article_map[key] = current_article
                current_section["articles"].append(current_article)
            else:
                current_article = article_map[key]
            continue
        # Text appearing before any article is appended to the section
        if current_section and not current_section["articles"]:
            existing_content_lines = current_section["content"].split("\n")
            if line not in existing_content_lines:
                current_section["content"] += ("\n" if current_section["content"] else "") + line
            continue
        # Text following an article is appended to that article
        if current_article:
            new_lines = line.split("\n")
            existing_text_lines = current_article["text"].split("\n")
            for new_line in new_lines:
                new_line = new_line.strip()
                if new_line and new_line not in existing_text_lines:
                    current_article["text"] += ("\n" if current_article["text"] else "") + new_line
                    existing_text_lines.append(new_line)
    if current_section:
        sections.append(current_section)
    preamble = "\n".join(preamble_lines).strip()
    return preamble, sections
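# Shape of the return value (illustrative, with hypothetical text):
#   preamble -> "صدر هذا القانون ..."
#   sections -> [{"content": "الباب الأول ...",
#                 "articles": [{"number": 1, "text": "مادة (1) ..."}]}]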
# ---------------------------
# Attach tables
# ---------------------------
def attach_tables_to_sections(soup, sections):
    tables = soup.find_all("table")
    for idx, table in enumerate(tables):
        all_trs = table.find_all("tr")
        if not all_trs:
            continue
        # The first row supplies the column headers
        headers = [clean_text(" ".join(td.stripped_strings)) for td in all_trs[0].find_all(["td", "th"])]
        num_columns = len(headers)
        if num_columns == 0:
            continue
        rows = []
        for tr in all_trs[1:]:
            row = [clean_text(" ".join(td.stripped_strings)) for td in tr.find_all(["td", "th"])]
            # Pad or trim each row to match the header width
            if len(row) < num_columns:
                row += [""] * (num_columns - len(row))
            elif len(row) > num_columns:
                row = row[:num_columns]
            rows.append(row)
        table_data = {"position": idx, "headers": headers, "rows": rows}
        # Link each table to the section with the same index (clamped to the last section)
        if sections:
            section_idx = min(idx, len(sections) - 1)
            sections[section_idx].setdefault("tables", []).append(table_data)
    return sections
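# Shape of each attached table (illustrative values):
#   {"position": 0, "headers": ["البند", "الرسم"], "rows": [["1", "100"]]}
# Design note: linking table N to section N is a positional heuristic, not a
# containment check; tables beyond the last section all land on the final one.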
# ---------------------------
# Parse a law from HTML
# ---------------------------
def parse_law_from_html(html, end_at_article=None, save_to_supabase=False):
    soup = BeautifulSoup(html, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.text.strip() if title_tag else "عنوان غير معروف"  # "Unknown title"
    text_blocks = extract_all_text_blocks(soup)
    # extract_preamble works at sentence level; its result is preferred over
    # the coarser block-level preamble that parse_law also returns (discarded here)
    preamble_text = extract_preamble(text_blocks)
    _, sections = parse_law(text_blocks, end_at_article=end_at_article)
    sections = attach_tables_to_sections(soup, sections)
    organized_sections = []
    for sec in sections:
        organized_sections.append({
            "content": sec.get("content", "").strip(),
            "articles": sec.get("articles", []),
            "tables": sec.get("tables", []),
        })
    result = {
        "title": title,
        "preamble": preamble_text,
        "sections": organized_sections,
    }
    if save_to_supabase:
        try:
            save_law_to_supabase(result)
        except Exception as e:
            print("❌ Error while saving to Supabase:", e)
    return result
# ---------------------------
# API endpoint
# ---------------------------
@app.post("/parse")
async def parse_law_endpoint(
    url: str = Form(...),
    save_to_supabase: bool = Form(False),
    end_at_article: Optional[int] = Form(None),
):
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        html_content = resp.text
        result = parse_law_from_html(
            html_content,
            end_at_article=end_at_article,
            save_to_supabase=save_to_supabase,
        )
        return JSONResponse(content=result)
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
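# ---------------------------
# Local usage sketch (not part of the deployed Space): exercises /parse
# in-process via FastAPI's TestClient. The URL below is a placeholder.
# ---------------------------
if __name__ == "__main__":
    from fastapi.testclient import TestClient

    client = TestClient(app)
    response = client.post(
        "/parse",
        data={
            "url": "https://example.com/law.html",  # hypothetical source page
            "save_to_supabase": "false",            # skip the Supabase write
            "end_at_article": "10",                 # parse articles 1..10 only
        },
    )
    print(response.json())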