Spaces:
Running
Running
| # parser/table_extractorgo.py | |
| from bs4 import BeautifulSoup, Tag | |
| from typing import List, Dict, Any | |
| from helpers.cleaner import clean_text | |
| from helpers.utils import is_section, extract_article_number, is_article | |
def tables_from_soup(soup: BeautifulSoup) -> List[Tag]:
    """Return every ``<table>`` element found in the parsed HTML document.

    The search is delegated to ``soup.find_all``, so the tables come back in
    whatever order that call yields them.
    """
    table_tags = soup.find_all("table")
    return table_tags
def table_to_struct(table: Tag) -> Dict[str, Any]:
    """Convert an HTML table into a dict with ``headers`` and ``rows``.

    The first ``<tr>`` supplies the headers; every later ``<tr>`` becomes one
    entry of ``rows``. Rows with fewer cells than the header row are padded on
    the right with empty strings; rows with more cells are kept unchanged.
    A table without any ``<tr>`` yields empty ``headers`` and ``rows`` lists.
    """

    def cell_texts(row: Tag, tag_names: List[str]) -> List[str]:
        # Join each cell's stripped text fragments with single spaces, then
        # pass the result through the project's clean_text helper.
        return [
            clean_text(" ".join(cell.stripped_strings))
            for cell in row.find_all(tag_names)
        ]

    all_rows = table.find_all("tr")
    if len(all_rows) == 0:
        return {"headers": [], "rows": []}

    header_row, body_rows = all_rows[0], all_rows[1:]
    headers = cell_texts(header_row, ["th", "td"])
    width = len(headers)

    rows = []
    for body_row in body_rows:
        cells = cell_texts(body_row, ["td", "th"])
        shortfall = width - len(cells)
        if shortfall > 0:
            # Short rows are padded so they line up with the header columns.
            cells.extend([""] * shortfall)
        rows.append(cells)

    return {"headers": headers, "rows": rows}
def _find_article_by_number(sections: List[Dict[str, Any]], number: Any):
    """Return the last article whose ``number`` equals *number*, or ``None``.

    Sections and their articles are scanned back to front, so when several
    articles share a number the one latest in ``sections`` wins.
    """
    for sec in reversed(sections):
        for art in reversed(sec.get("articles", [])):
            if art.get("number") == number:
                return art
    return None


def _find_section_by_title(sections: List[Dict[str, Any]], title: str):
    """Return the last section whose ``title`` equals *title*, or ``None``."""
    for sec in reversed(sections):
        # .get() so a section without a title is skipped instead of raising.
        if sec.get("title") == title:
            return sec
    return None


def _find_table_owner(table: Tag, sections: List[Dict[str, Any]]):
    """Walk backwards from *table* to the nearest known article or section.

    Each text node preceding the table is stripped and tested with the
    ``is_article`` / ``is_section`` helpers. The first node that resolves to
    an entry of *sections* decides the owner. Returns ``None`` when the start
    of the document is reached without a match.
    """
    node = table.find_previous(string=True)
    # Compare against None explicitly: an empty text node is falsy and must
    # not end the walk early.
    while node is not None:
        text = node.strip()

        if is_article(text):
            number = extract_article_number(text)
            # A missing number must not match articles that lack a number
            # themselves (None == None), so only look up real values.
            if number is not None:
                article = _find_article_by_number(sections, number)
                if article is not None:
                    return article

        if is_section(text):
            section = _find_section_by_title(sections, text)
            if section is not None:
                return section

        node = node.find_previous(string=True)
    return None


def link_tables_to_sections_and_articles(
    soup: BeautifulSoup, sections: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Attach every table in *soup* to its nearest article or section.

    For each ``<table>`` the text nodes preceding it are scanned backwards
    until one identifies an article or a section present in *sections*. The
    table structure (``headers``, ``rows`` and its ``position`` index among
    the document's tables) is appended to that entry's ``"tables"`` list.
    Tables with no identifiable owner are attached to the first section.

    Args:
        soup: Parsed HTML document to take the tables from.
        sections: Section dicts, each optionally holding an ``"articles"``
            list. The dicts are modified in place.

    Returns:
        The same *sections* list that was passed in.
    """
    if not sections:
        # Nothing to attach tables to.
        return sections

    for idx, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = idx

        owner = _find_table_owner(table, sections)
        if owner is None:
            # Fall back to the first section when no owner was found.
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections