import re

from bs4 import BeautifulSoup


def parse_hub_page(html):
    """Extract the page title and series links from a hub page.

    Args:
        html: HTML markup of the hub page.

    Returns:
        A dict with three keys:

        * ``"name"`` -- stripped text of the ``div#page-header h2`` element,
          or ``""`` when that element is absent.
        * ``"series_ranges"`` -- one ``{"page_name", "display_name"}`` dict
          per row of a table whose header text contains "一覧", taken from
          the link in the row's third ``<td>`` when its href contains
          ``/d/``.
        * ``"sub_series_links"`` -- one ``{"page_name", "display_name"}``
          dict per ``/d/`` link inside ``div#page-body`` that is neither
          listed under a "シリーズ" heading nor already present in
          ``"series_ranges"``.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {"name": "", "series_ranges": [], "sub_series_links": []}

    title = soup.select_one("div#page-header h2")
    if title is not None:
        result["name"] = title.get_text(strip=True)

    for table in soup.select("table:has(th)"):
        header_text = " ".join(
            th.get_text(strip=True) for th in table.select("th")
        )
        # "一覧" is a substring of "作品一覧", so one test covers both labels.
        if "一覧" not in header_text:
            continue

        for row in table.select("tbody tr"):
            td_cells = row.find_all("td")
            if len(td_cells) < 3:
                continue
            # The series link is read from the third data cell of the row.
            link = td_cells[2].find("a")
            if link is None:
                continue
            href = link.get("href", "")
            if "/d/" not in href:
                continue
            result["series_ranges"].append({
                "page_name": href.split("/d/")[-1],
                "display_name": link.get_text(strip=True),
            })

    body = soup.select_one("div#page-body")
    if body is None:
        return result

    # Entries in series_ranges carry "page_name" (not "href"), so duplicates
    # must be detected by page name. Build the lookup set once.
    known_pages = {entry["page_name"] for entry in result["series_ranges"]}

    for link in body.select("a[href*='/d/']"):
        href = link.get("href", "")
        page_name = href.split("/d/")[-1]

        list_item = link.find_parent("li")
        if list_item is not None:
            # str() serializes the preceding heading tag (or yields "None"
            # when there is none), so the match is against its markup.
            heading = list_item.find_previous(["h4", "h5", "h3"])
            if "シリーズ" in str(heading):
                continue

        if page_name in known_pages:
            continue

        result["sub_series_links"].append({
            "page_name": page_name,
            "display_name": link.get_text(strip=True),
        })

    return result