Mazenbs committed on
Commit
3c2ec1f
·
verified ·
1 Parent(s): 735d3d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -0
app.py CHANGED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from fastapi import FastAPI, Form
4
+ from fastapi.responses import JSONResponse
5
+ from bs4 import BeautifulSoup
6
+ from typing import Optional
7
+
8
+ from supabase_utils import save_law_to_supabase
9
+
10
+ app = FastAPI(title="Law Parser API", version="1.0")
11
+
12
+ # ---------------------------
13
+ # وظائف مساعدة
14
+ # ---------------------------
15
def clean_text(text: str) -> str:
    """Normalize raw extracted text.

    Removes the Arabic tatweel (kashida) character, drops blank lines,
    collapses runs of whitespace, and starts a new line after every
    full stop. Returns "" for empty/None input.
    """
    if not text:
        return ""
    # Strip the tatweel stretching character used for justification.
    without_tatweel = text.replace("ـ", "")
    # Keep only non-blank lines, collapsing internal whitespace runs.
    compact_lines = []
    for raw_line in without_tatweel.splitlines():
        if raw_line.strip():
            compact_lines.append(re.sub(r"\s+", " ", raw_line))
    normalized = "\n".join(compact_lines)
    # Break the text so each sentence (ending in '.') sits on its own line.
    normalized = re.sub(r"\.(\s*)", r".\n", normalized)
    return normalized.strip()
25
+
26
def extract_all_text_blocks(soup):
    """Collect the cleaned text of every text-bearing tag in *soup*.

    Scans paragraph, list-item, span, div and heading tags in document
    order and returns a list of non-empty cleaned strings.
    """
    text_tags = ["p", "li", "span", "div", "h1", "h2", "h3", "h4", "h5"]
    cleaned = (
        clean_text(tag.get_text(separator=" ", strip=True))
        for tag in soup.find_all(text_tags)
    )
    return [block for block in cleaned if block]
35
+
36
def is_section(line):
    """Return True if *line* opens a structural division heading
    (book "الكتاب", part "الباب" or chapter "الفصل")."""
    heading = re.match(r"^(الكتاب|الباب|الفصل)\s*[\d\w-]*", line)
    return heading is not None
38
+
39
def is_article(line):
    """Return True if *line* opens a numbered article,
    e.g. "مادة (12)" or "مادة 12"."""
    return re.match(r"^مادة\s*\(?(\d+)\)?", line) is not None
41
+
42
def get_article_number(line):
    """Extract the integer article number from an article heading.

    Returns None when *line* is not an article heading.
    """
    match = re.match(r"^مادة\s*\(?(\d+)\)?", line)
    if match is None:
        return None
    return int(match.group(1))
45
+
46
def extract_preamble(text_blocks):
    """Collect the introductory text preceding the first section/article heading.

    Args:
        text_blocks: cleaned text blocks in document order.

    Returns:
        The preamble as a single newline-joined, stripped string
        (empty when the document opens directly with a heading).
    """
    section_keywords = ["الكتاب", "الباب", "الفصل"]
    article_keywords = ["مادة"]
    # Break a block after a full stop, or just before a heading keyword,
    # so headings embedded mid-block are still detected.
    splitter = re.compile(
        r"(?<=\.)|(?=\b(?:" + "|".join(section_keywords + article_keywords) + r")\b)"
    )

    preamble_lines = []
    for block in text_blocks:
        block = block.strip()
        if not block:
            continue
        for line in splitter.split(block):
            line = line.strip()
            if not line:
                continue
            # Drop punctuation (keeping word chars and Arabic letters) so
            # e.g. "مادة (1)" still matches the heading patterns.
            line_clean = re.sub(r"[^\w\s\d\u0600-\u06FF]", "", line)
            is_section_line = any(
                re.match(rf"^{kw}\s+", line_clean) for kw in section_keywords
            )
            is_article_line = any(
                re.match(rf"^{kw}\s*\(?\d+\)?", line_clean) for kw in article_keywords
            )
            if is_section_line or is_article_line:
                # First heading reached — the preamble is complete, so stop
                # scanning (the original kept looping over every remaining
                # block while appending nothing).
                return "\n".join(preamble_lines).strip()
            preamble_lines.append(line)
    return "\n".join(preamble_lines).strip()
73
+
74
+ # ---------------------------
75
+ # تحليل النصوص إلى أقسام ومواد
76
+ # ---------------------------
77
def parse_law(lines, end_at_article: Optional[int] = None):
    """Parse cleaned text lines into a (preamble, sections) pair.

    Args:
        lines: cleaned text blocks/lines in document order.
        end_at_article: when given, parsing stops as soon as an article
            whose number exceeds this value is encountered.

    Returns:
        (preamble, sections): preamble is a newline-joined string; sections
        is a list of {"content": str, "articles": [...]} dicts where each
        article is {"number": int, "text": str}.
    """
    sections = []
    preamble_lines = []
    current_section = None
    current_article = None
    collecting_preamble = True
    # Dedup map: "number|first 30 chars of heading" -> article dict, so a
    # repeated article heading resumes the existing article instead of
    # creating a duplicate entry.
    article_map = {}

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Everything before the first heading is preamble; the heading line
        # itself falls through and is processed below.
        if collecting_preamble:
            if is_section(line) or is_article(line):
                collecting_preamble = False
            else:
                preamble_lines.append(line)
                continue

        # A section heading starts a fresh section.
        if is_section(line):
            if current_section:
                sections.append(current_section)
            current_section = {"content": line, "articles": []}
            current_article = None
            continue

        # An article heading starts (or resumes) an article.
        if is_article(line):
            number = get_article_number(line)
            if end_at_article is not None and number > end_at_article:
                # Past the requested cut-off: nothing later is needed, so
                # stop immediately (the original set a stop_reading flag and
                # uselessly kept iterating over every remaining line).
                break

            current_article = {"number": number, "text": line}
            if current_section is None:
                # Article appeared before any section heading — synthesize
                # an anonymous section to hold it.
                current_section = {"content": "", "articles": []}

            key = f"{number}|{line[:30]}"
            if key not in article_map:
                article_map[key] = current_article
                current_section["articles"].append(current_article)
            else:
                # Duplicate heading: keep appending to the first occurrence.
                current_article = article_map[key]
            continue

        # Body text before any article belongs to the section's content.
        if current_section and not current_section["articles"]:
            existing_content_lines = current_section["content"].split("\n")
            if line not in existing_content_lines:
                current_section["content"] += ("\n" if current_section["content"] else "") + line
            continue

        # Body text after an article heading belongs to that article
        # (skipping lines it already contains).
        if current_article:
            existing_text_lines = current_article["text"].split("\n")
            for new_line in line.split("\n"):
                new_line = new_line.strip()
                if new_line and new_line not in existing_text_lines:
                    current_article["text"] += ("\n" if current_article["text"] else "") + new_line
                    existing_text_lines.append(new_line)

    if current_section:
        sections.append(current_section)

    preamble = "\n".join(preamble_lines).strip()
    return preamble, sections
147
+
148
+ # ---------------------------
149
+ # إلحاق الجداول
150
+ # ---------------------------
151
def attach_tables_to_sections(soup, sections):
    """Extract every <table> in *soup* and attach it to a section in place.

    The first row supplies the headers; every following row is padded or
    truncated to the header width. Table i is attached to section i, or to
    the last section when there are more tables than sections. Returns the
    (mutated) *sections* list.
    """
    for position, table in enumerate(soup.find_all("table")):
        table_rows = table.find_all("tr")
        if not table_rows:
            continue
        header_cells = table_rows[0].find_all(["td", "th"])
        headers = [clean_text(" ".join(cell.stripped_strings)) for cell in header_cells]
        width = len(headers)
        if width == 0:
            continue
        body_rows = []
        for tr in table_rows[1:]:
            cells = [clean_text(" ".join(td.stripped_strings)) for td in tr.find_all(["td", "th"])]
            # Normalize the row to exactly `width` columns: pad with "" then cut.
            body_rows.append((cells + [""] * width)[:width])
        table_data = {"position": position, "headers": headers, "rows": body_rows}

        # Attach the table to the nearest section by index.
        if sections:
            target = min(position, len(sections) - 1)
            sections[target].setdefault("tables", []).append(table_data)
    return sections
176
+
177
+ # ---------------------------
178
+ # تحليل القانون من HTML
179
+ # ---------------------------
180
def parse_law_from_html(html, end_at_article=None, save_to_supabase=False):
    """Parse a law document from raw HTML into a structured dict.

    Args:
        html: full HTML of the law page.
        end_at_article: optional article number at which parsing stops.
        save_to_supabase: when True, the result is persisted best-effort.

    Returns:
        {"title": str, "preamble": str, "sections": [...]} where each
        section carries "content", "articles" and "tables" keys.
    """
    soup = BeautifulSoup(html, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.text.strip() if title_tag else "عنوان غير معروف"

    text_blocks = extract_all_text_blocks(soup)
    # The preamble exposed to callers comes from extract_preamble; the one
    # parse_law also computes is redundant here and explicitly discarded
    # (the original bound it to an unused local).
    preamble = extract_preamble(text_blocks)
    _, sections = parse_law(text_blocks, end_at_article=end_at_article)
    sections = attach_tables_to_sections(soup, sections)

    organized_sections = [
        {
            "content": sec.get("content", "").strip(),
            "articles": sec.get("articles", []),
            "tables": sec.get("tables", []),
        }
        for sec in sections
    ]

    result = {
        "title": title,
        "preamble": preamble,
        "sections": organized_sections
    }

    if save_to_supabase:
        # Persistence is best-effort: the parsed result is still returned
        # even when the save fails.
        try:
            save_law_to_supabase(result)
        except Exception as e:
            print("❌ خطأ أثناء الحفظ في Supabase:", e)

    return result
211
+
212
+ # ---------------------------
213
+ # نقطة النهاية API
214
+ # ---------------------------
215
@app.post("/parse")
async def parse_law_endpoint(
    url: str = Form(...),
    save_to_supabase: bool = Form(False),
    end_at_article: Optional[int] = Form(None)
):
    """Fetch a law page by URL, parse it, and return the structured JSON.

    Form fields:
        url: page to download. NOTE(review): the URL is fetched server-side
            with no host validation — consider an allow-list to avoid SSRF
            if this endpoint is exposed publicly.
        save_to_supabase: persist the parsed law when True.
        end_at_article: stop parsing once this article number is exceeded.

    Returns HTTP 200 with the parsed law, or HTTP 500 with {"error": ...}.
    """
    try:
        # Timeout added so a stalled remote server cannot hang this request
        # forever (the original requests.get had none). NOTE(review):
        # requests is blocking inside an async endpoint; consider
        # fastapi.concurrency.run_in_threadpool or httpx for concurrency.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        result = parse_law_from_html(
            resp.text,
            end_at_article=end_at_article,
            save_to_supabase=save_to_supabase,
        )
        return JSONResponse(content=result)

    except Exception as e:
        # Any failure (network, HTTP status, parsing) is reported uniformly.
        return JSONResponse(status_code=500, content={"error": str(e)})