Spaces:

henry99a
/

avactress

Paused

Gemini CLI

Initial commit for Hugging Face Spaces deployment

2685f20 16 days ago

4.17 kB

	import re
	from bs4 import BeautifulSoup
	from .name_util import clean_actress_name


	# Upper-cased header text -> logical column key.  "note", "subtitle" and
	# "director" are recognised but not currently emitted in the result.
	_HEADER_ALIASES = {
	    "NO": "no",
	    "NUMBER": "no",
	    "番号": "no",
	    "TITLE": "title",
	    "タイトル": "title",
	    "ACTRESS": "actress",
	    "女優": "actress",
	    "出演": "actress",
	    "RELEASE": "release",
	    "発売日": "release",
	    "発売": "release",
	    "NOTE": "note",
	    "備考": "note",
	    "备注": "note",
	    "PHOTO": "photo",
	    "画像": "photo",
	    "封面": "photo",
	    "SUBTITLE": "subtitle",
	    "サブタイトル": "subtitle",
	    "DIRECTOR": "director",
	    "監督": "director",
	}

	# First parenthesised group (ASCII or full-width parentheses) in a text node.
	_ROLE_RE = re.compile(r"[（(]([^）)]+)[）)]")
	# ISO-style date at the start of the (slash-normalised) release cell text.
	_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


	def _map_columns(table):
	    """Return ``{logical key: column index}`` built from ``thead th`` cells."""
	    columns = {}
	    for index, th in enumerate(table.select("thead th")):
	        key = _HEADER_ALIASES.get(th.get_text(strip=True).upper())
	        if key is not None:
	            # A later header with the same meaning overrides an earlier one.
	            columns[key] = index
	    return columns


	def _cell(cells, columns, key):
	    """Return the row cell for *key*, or None if unmapped or the row is short."""
	    index = columns.get(key)
	    if index is None or index >= len(cells):
	        return None
	    return cells[index]


	def _parse_actresses(cell):
	    """Return parallel ``(names, roles)`` lists from an actress cell.

	    Only direct children of *cell* are inspected: each ``<a>`` contributes a
	    name (after ``clean_actress_name``), and a following text node containing
	    a parenthesised group supplies that actress's role.
	    """
	    names = []
	    roles = []
	    # True only while the most recent <a> was accepted; prevents a role that
	    # follows a skipped link from overwriting the previous actress's role.
	    last_link_kept = False
	    for child in cell.children:
	        tag_name = getattr(child, "name", None)
	        if tag_name == "a":
	            name = clean_actress_name(child.get_text(strip=True))
	            last_link_kept = bool(name)
	            if name:
	                names.append(name)
	                roles.append("")
	        elif tag_name == "br":
	            continue
	        elif isinstance(child, str) and last_link_kept:
	            role_match = _ROLE_RE.search(child.strip())
	            if role_match:
	                # Keep only the part before the first list separator.
	                roles[-1] = re.sub(r"[、,].*$", "", role_match.group(1)).strip()
	    return names, roles


	def parse_table_videos(table):
	    """Extract video records from one HTML table.

	    Columns are located by matching the ``thead th`` texts against the known
	    header aliases; rows come from ``tbody tr``.  Rows with fewer than three
	    ``td`` cells, or without a non-empty id in the number column, are skipped.

	    Args:
	        table: a BeautifulSoup element for the ``<table>``.

	    Returns:
	        A list of dicts with the keys ``dvd_id``, ``prefix``, ``title``,
	        ``release_date``, ``cover_url``, ``dmm_url``, ``actress_names`` and
	        ``actress_roles`` (the last two are parallel lists).
	    """
	    columns = _map_columns(table)
	    videos = []

	    for row in table.select("tbody tr"):
	        cells = row.find_all("td")
	        if len(cells) < 3:
	            continue

	        no_cell = _cell(cells, columns, "no")
	        if no_cell is None:
	            continue
	        # Prefer the link text as the id; the same link supplies the URL.
	        link = no_cell.find("a")
	        id_source = link if link is not None else no_cell
	        dvd_id = id_source.get_text(strip=True)
	        if not dvd_id:
	            continue
	        dmm_url = link.get("href", "") if link is not None else ""

	        title_cell = _cell(cells, columns, "title")
	        title = title_cell.get_text(" ", strip=True) if title_cell is not None else ""

	        actress_cell = _cell(cells, columns, "actress")
	        if actress_cell is not None:
	            actress_names, actress_roles = _parse_actresses(actress_cell)
	        else:
	            actress_names, actress_roles = [], []

	        release_date = ""
	        release_cell = _cell(cells, columns, "release")
	        if release_cell is not None:
	            # Normalise YYYY/MM/DD to YYYY-MM-DD, then keep only the date
	            # itself so trailing cell text never leaks into the field.
	            date_text = release_cell.get_text(strip=True).replace("/", "-")
	            date_match = _DATE_RE.match(date_text)
	            if date_match:
	                release_date = date_match.group(0)

	        cover_url = ""
	        photo_cell = _cell(cells, columns, "photo")
	        if photo_cell is not None:
	            img = photo_cell.find("img")
	            if img is not None:
	                # NOTE(review): presumably swaps a small cover for the large
	                # variant of the same image -- confirm against the site.
	                cover_url = img.get("src", "").replace("ps.jpg", "pl.jpg")

	        head, dash, _ = dvd_id.partition("-")
	        videos.append({
	            "dvd_id": dvd_id,
	            "prefix": head if dash else "",
	            "title": title,
	            "release_date": release_date,
	            "cover_url": cover_url,
	            "dmm_url": dmm_url,
	            "actress_names": actress_names,
	            "actress_roles": actress_roles,
	        })

	    return videos


	def parse_label_page(html):
	    """Parse a label page and collect its name and all listed videos.

	    The name is the text of the first ``h3`` inside ``div#page-body`` (empty
	    string when absent).  Every ``table.edit`` element is passed to
	    ``parse_table_videos`` and the resulting records are concatenated in
	    document order.

	    Args:
	        html: the page markup, parsed with the ``lxml`` parser.

	    Returns:
	        A dict ``{"name": str, "videos": list}``.
	    """
	    document = BeautifulSoup(html, "lxml")

	    heading = document.select_one("div#page-body h3")
	    label_name = heading.get_text(strip=True) if heading else ""

	    collected = []
	    for edit_table in document.select("table.edit"):
	        collected.extend(parse_table_videos(edit_table))

	    return {"name": label_name, "videos": collected}