# avactress/crawler/label_parser.py
# Gemini CLI
# Initial commit for Hugging Face Spaces deployment
# 2685f20
import re
from bs4 import BeautifulSoup
from .name_util import clean_actress_name
# Upper-cased header captions mapped to the logical column key they identify.
# Several captions may point at the same key.
_HEADER_ALIASES = {
    "NO": "no",
    "NUMBER": "no",
    "番号": "no",
    "TITLE": "title",
    "タイトル": "title",
    "ACTRESS": "actress",
    "女優": "actress",
    "出演": "actress",
    "RELEASE": "release",
    "発売日": "release",
    "発売": "release",
    "NOTE": "note",
    "備考": "note",
    "备注": "note",
    "PHOTO": "photo",
    "画像": "photo",
    "封面": "photo",
    "SUBTITLE": "subtitle",
    "サブタイトル": "subtitle",
    "DIRECTOR": "director",
    "監督": "director",
}

# A parenthesised role following an actress link.  Both ASCII and full-width
# parentheses (U+FF08 / U+FF09) are accepted; the escapes keep the pattern
# intact even if the file's text is width-normalised.
_ROLE_RE = re.compile(r"[(\uff08]([^)\uff09]+)[)\uff09]")
# Everything from the first list separator onwards is dropped from a role.
_ROLE_TAIL_RE = re.compile(r"[、,].*$")
# ISO-style date at the start of the (already "/" -> "-" normalised) cell text.
_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


def _map_columns(table):
    """Return ``{column_key: index}`` built from the table's ``thead th`` cells."""
    col_map = {}
    for index, th in enumerate(table.select("thead th")):
        key = _HEADER_ALIASES.get(th.get_text(strip=True).upper())
        if key is not None:
            # A later header with the same meaning overrides an earlier one.
            col_map[key] = index
    return col_map


def _cell_at(cells, col_map, key):
    """Return the cell for column *key*, or None if unmapped or out of range."""
    index = col_map.get(key)
    if index is None or index >= len(cells):
        return None
    return cells[index]


def _parse_actresses(cell):
    """Return parallel ``(names, roles)`` lists read from an actress cell."""
    names = []
    roles = []
    for child in cell.children:
        tag_name = getattr(child, "name", None)
        if tag_name == "a":
            name = clean_actress_name(child.get_text(strip=True))
            if name:
                names.append(name)
                roles.append("")
        elif tag_name == "br":
            continue
        elif isinstance(child, str) and names:
            # Free text after a link may carry that actress's role in parentheses.
            role_match = _ROLE_RE.search(child.strip())
            if role_match:
                roles[-1] = _ROLE_TAIL_RE.sub("", role_match.group(1)).strip()
    return names, roles


def _parse_release_date(text):
    """Return the leading ``YYYY-MM-DD`` date of *text* ("/" allowed), or ""."""
    match = _DATE_RE.match(text.replace("/", "-"))
    # Keep only the matched date so trailing cell text never leaks into it.
    return match.group(0) if match else ""


def parse_table_videos(table):
    """Extract one record per video row from a listing table.

    Columns are located through the captions in ``thead th``.  Rows in
    ``tbody tr`` with fewer than three ``td`` cells, or without a
    non-empty value in the number column, are skipped.

    Args:
        table: A parsed table element offering ``select`` (BeautifulSoup
            ``Tag``-like).

    Returns:
        A list of dicts with the keys ``dvd_id``, ``prefix``, ``title``,
        ``release_date``, ``cover_url``, ``dmm_url``, ``actress_names``
        and ``actress_roles``.  Missing values are empty strings/lists.
    """
    col_map = _map_columns(table)
    videos = []
    for row in table.select("tbody tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue

        no_cell = _cell_at(cells, col_map, "no")
        if no_cell is None:
            continue
        link = no_cell.find("a")
        # Prefer the link text; fall back to the whole cell's text.
        id_source = link if link is not None else no_cell
        dvd_id = id_source.get_text(strip=True)
        if not dvd_id:
            continue
        dmm_url = link.get("href", "") if link is not None else ""

        title_cell = _cell_at(cells, col_map, "title")
        title = title_cell.get_text(" ", strip=True) if title_cell is not None else ""

        actress_names = []
        actress_roles = []
        actress_cell = _cell_at(cells, col_map, "actress")
        if actress_cell is not None:
            actress_names, actress_roles = _parse_actresses(actress_cell)

        release_date = ""
        release_cell = _cell_at(cells, col_map, "release")
        if release_cell is not None:
            release_date = _parse_release_date(release_cell.get_text(strip=True))

        cover_url = ""
        photo_cell = _cell_at(cells, col_map, "photo")
        if photo_cell is not None:
            img = photo_cell.find("img")
            if img is not None:
                # Swap the "ps.jpg" suffix for "pl.jpg" in the image URL.
                cover_url = img.get("src", "").replace("ps.jpg", "pl.jpg")

        videos.append({
            "dvd_id": dvd_id,
            "prefix": dvd_id.partition("-")[0] if "-" in dvd_id else "",
            "title": title,
            "release_date": release_date,
            "cover_url": cover_url,
            "dmm_url": dmm_url,
            "actress_names": actress_names,
            "actress_roles": actress_roles,
        })
    return videos
def parse_label_page(html):
    """Parse a label page into its name and the videos it lists.

    The name is the text of the first ``h3`` inside ``div#page-body``
    (empty string when absent).  Videos are gathered, in document order,
    from every ``table.edit`` via ``parse_table_videos``.

    Args:
        html: Markup of the label page.

    Returns:
        ``{"name": str, "videos": list}``.
    """
    document = BeautifulSoup(html, "lxml")

    heading = document.select_one("div#page-body h3")
    label_name = heading.get_text(strip=True) if heading else ""

    collected = []
    for edit_table in document.select("table.edit"):
        collected += parse_table_videos(edit_table)

    return {"name": label_name, "videos": collected}