Spaces:

henry99a
/

avactress

Paused

File size: 1,829 Bytes

2685f20

import re
from bs4 import BeautifulSoup


def parse_hub_page(html):
    """Extract the title, series-range links and sub-series links of a hub page.

    Args:
        html: Markup of the hub page; it is handed to BeautifulSoup with the
            ``lxml`` parser.

    Returns:
        A dict with three keys:

        * ``"name"`` -- stripped text of ``div#page-header h2``, or ``""``
          when that element is missing.
        * ``"series_ranges"`` -- one ``{"page_name", "display_name"}`` dict
          per row of every table whose header cells mention "一覧", taken
          from the first link in the row's third ``<td>`` when its href
          contains ``/d/``.
        * ``"sub_series_links"`` -- one ``{"page_name", "display_name"}``
          dict per ``/d/`` link inside ``div#page-body``, excluding links
          listed under a "シリーズ" heading and links already reported in
          ``"series_ranges"``.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {"name": "", "series_ranges": [], "sub_series_links": []}

    h2 = soup.select_one("div#page-header h2")
    if h2:
        result["name"] = h2.get_text(strip=True)

    # Page names already emitted as series ranges. The entries in
    # result["series_ranges"] carry no "href" key, so de-duplication of the
    # body links below has to be done on the derived page name.
    series_page_names = set()

    for table in soup.select("table:has(th)"):
        header_text = " ".join(
            th.get_text(strip=True) for th in table.select("th")
        )
        # "一覧" is a substring of "作品一覧", so this one test covers both.
        if "一覧" not in header_text:
            continue

        # NOTE(review): rows are only found beneath an explicit <tbody>; the
        # lxml parser does not appear to synthesize one -- confirm that the
        # site's markup always includes it.
        for row in table.select("tbody tr"):
            td_cells = row.find_all("td")
            if len(td_cells) < 3:
                continue
            a = td_cells[2].find("a")
            if not a:
                continue
            href = a.get("href", "")
            if "/d/" not in href:
                continue
            page_name = href.split("/d/")[-1]
            series_page_names.add(page_name)
            result["series_ranges"].append({
                "page_name": page_name,
                "display_name": a.get_text(strip=True),
            })

    body = soup.select_one("div#page-body")
    if body:
        for a in body.select("a[href*='/d/']"):
            page_name = a.get("href", "").split("/d/")[-1]

            # Skip list items whose nearest preceding heading (markup
            # included) mentions "シリーズ".
            parent = a.find_parent("li")
            if parent and "シリーズ" in str(
                parent.find_previous(["h4", "h5", "h3"])
            ):
                continue

            # Already reported as a series range; do not list it twice.
            if page_name in series_page_names:
                continue

            result["sub_series_links"].append({
                "page_name": page_name,
                "display_name": a.get_text(strip=True),
            })

    return result