Spaces:

henry99a
/

avactress

Paused

Gemini CLI

Initial commit for Hugging Face Spaces deployment

2685f20 16 days ago

1.83 kB

	import re
	from bs4 import BeautifulSoup


	def parse_hub_page(html):
	    """Extract the name, series-range pages and sub-series links from a hub page.

	    Args:
	        html: HTML markup of the hub page.

	    Returns:
	        A dict with three keys:
	          * "name": stripped text of the ``<h2>`` inside ``div#page-header``,
	            or "" when that element is absent.
	          * "series_ranges": one ``{"page_name", "display_name"}`` dict per
	            row of every listing table (a table whose header text contains
	            "一覧") whose third ``<td>`` holds a link with "/d/" in its href.
	          * "sub_series_links": one ``{"page_name", "display_name"}`` dict
	            per "/d/" link inside ``div#page-body`` that is neither listed
	            under a "シリーズ" heading nor already present in
	            "series_ranges".

	        "page_name" is whatever follows the last "/d/" in the link's href;
	        "display_name" is the link's stripped text.
	    """
	    soup = BeautifulSoup(html, "lxml")
	    result = {"name": "", "series_ranges": [], "sub_series_links": []}

	    h2 = soup.select_one("div#page-header h2")
	    if h2:
	        result["name"] = h2.get_text(strip=True)

	    for table in soup.select("table:has(th)"):
	        header_text = " ".join(
	            th.get_text(strip=True) for th in table.select("th")
	        )
	        # "一覧" is a substring of "作品一覧", so this single test covers both.
	        if "一覧" not in header_text:
	            continue

	        for row in table.select("tbody tr"):
	            td_cells = row.find_all("td")
	            if len(td_cells) < 3:
	                continue
	            # The link to the series page is read from the third data cell.
	            a = td_cells[2].find("a")
	            if not a:
	                continue
	            href = a.get("href", "")
	            if "/d/" not in href:
	                continue
	            result["series_ranges"].append({
	                "page_name": href.split("/d/")[-1],
	                "display_name": a.get_text(strip=True),
	            })

	    body = soup.select_one("div#page-body")
	    if body:
	        # series_ranges entries store "page_name" (there is no "href" key), so
	        # duplicates must be detected by page name. Build the set once.
	        known_pages = {s["page_name"] for s in result["series_ranges"]}

	        for a in body.select("a[href*='/d/']"):
	            href = a.get("href", "")
	            page_name = href.split("/d/")[-1]

	            # Skip list items whose nearest preceding h3/h4/h5 heading markup
	            # mentions "シリーズ".
	            parent = a.find_parent("li")
	            if parent and "シリーズ" in str(
	                parent.find_previous(["h4", "h5", "h3"])
	            ):
	                continue
	            if page_name in known_pages:
	                continue

	            result["sub_series_links"].append({
	                "page_name": page_name,
	                "display_name": a.get_text(strip=True),
	            })

	    return result