File size: 1,829 Bytes
2685f20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | import re
from bs4 import BeautifulSoup
def parse_hub_page(html):
    """Extract the title, series-range pages and other page links from a hub page.

    Args:
        html: HTML markup of the hub page.

    Returns:
        A dict with three keys:

        * ``"name"``: stripped text of the ``h2`` inside ``div#page-header``,
          or ``""`` when that element is absent.
        * ``"series_ranges"``: one ``{"page_name", "display_name"}`` dict per
          row of every listing table whose third ``<td>`` holds a ``/d/`` link.
        * ``"sub_series_links"``: one ``{"page_name", "display_name"}`` dict
          per ``/d/`` link inside ``div#page-body`` that is neither filtered
          out by the heading check below nor already in ``"series_ranges"``.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {"name": "", "series_ranges": [], "sub_series_links": []}

    h2 = soup.select_one("div#page-header h2")
    if h2:
        result["name"] = h2.get_text(strip=True)

    for table in soup.select("table:has(th)"):
        header_text = " ".join(
            th.get_text(strip=True) for th in table.select("th")
        )
        # "一覧" is a substring of "作品一覧", so this single test covers both
        # header spellings.
        if "一覧" not in header_text:
            continue
        for row in table.select("tbody tr"):
            td_cells = row.find_all("td")
            # The link is read from the third data cell; shorter rows have none.
            if len(td_cells) < 3:
                continue
            a = td_cells[2].find("a")
            if not a:
                continue
            href = a.get("href", "")
            if "/d/" not in href:
                continue
            result["series_ranges"].append({
                "page_name": href.split("/d/")[-1],
                "display_name": a.get_text(strip=True),
            })

    body = soup.select_one("div#page-body")
    if body:
        # series_ranges entries store only the page name (not the href), so
        # de-duplication has to compare page names. Built once for O(1) lookups.
        known_pages = {entry["page_name"] for entry in result["series_ranges"]}
        for a in body.select("a[href*='/d/']"):
            href = a.get("href", "")
            page_name = href.split("/d/")[-1]
            parent = a.find_parent("li")
            # Skip list items whose nearest preceding h3/h4/h5 (in document
            # order) has "シリーズ" anywhere in its markup.
            if parent and "シリーズ" in str(
                parent.find_previous(["h4", "h5", "h3"])
            ):
                continue
            if page_name in known_pages:
                continue
            result["sub_series_links"].append({
                "page_name": page_name,
                "display_name": a.get_text(strip=True),
            })
    return result
|