| import re |
| from bs4 import BeautifulSoup |
|
|
|
|
def parse_hub_page(html):
    """Extract the title and the wiki-page links from a hub page.

    Args:
        html: Markup of the hub page, as accepted by ``BeautifulSoup``.

    Returns:
        A dict with three keys:

        * ``"name"`` -- text of the ``<h2>`` inside ``div#page-header``,
          or ``""`` when that element is absent.
        * ``"series_ranges"`` -- one ``{"page_name", "display_name"}`` dict
          for each listing-table row whose third ``<td>`` holds a link
          with ``/d/`` in its ``href``.
        * ``"sub_series_links"`` -- the same shape, for the remaining
          ``/d/`` links found inside ``div#page-body``.

        ``page_name`` is whatever follows the last ``/d/`` in the href.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {"name": "", "series_ranges": [], "sub_series_links": []}

    title = soup.select_one("div#page-header h2")
    if title:
        result["name"] = title.get_text(strip=True)

    # Listing tables are recognised by their header text.
    for table in soup.select("table:has(th)"):
        header_text = " ".join(
            th.get_text(strip=True) for th in table.select("th")
        )
        # "作品一覧" contains "一覧", so this single test covers both
        # spellings the original checked for.
        if "一覧" not in header_text:
            continue

        for row in table.select("tbody tr"):
            td_cells = row.find_all("td")
            if len(td_cells) < 3:
                continue
            # The link is taken from the third data cell of the row.
            link = td_cells[2].find("a")
            if not link:
                continue
            href = link.get("href", "")
            if "/d/" not in href:
                continue
            result["series_ranges"].append({
                "page_name": href.split("/d/")[-1],
                "display_name": link.get_text(strip=True),
            })

    body = soup.select_one("div#page-body")
    if body:
        # Pages already captured from the listing tables. Entries only
        # store "page_name" (there is no "href" key), so de-duplication
        # has to compare page names; build the set once for O(1) lookups.
        known_pages = {
            entry["page_name"] for entry in result["series_ranges"]
        }

        for link in body.select("a[href*='/d/']"):
            href = link.get("href", "")
            page_name = href.split("/d/")[-1]

            # Skip list items whose nearest preceding h3/h4/h5 mentions
            # "シリーズ". The test runs against the heading's full markup
            # (str(tag)), not just its visible text.
            list_item = link.find_parent("li")
            if list_item and "シリーズ" in str(
                list_item.find_previous(["h4", "h5", "h3"])
            ):
                continue

            if page_name in known_pages:
                continue

            result["sub_series_links"].append({
                "page_name": page_name,
                "display_name": link.get_text(strip=True),
            })

    return result
|
|