"""Parse label pages whose videos are listed in ``table.edit`` HTML tables."""

import re

from bs4 import BeautifulSoup

from .name_util import clean_actress_name

# Upper-cased header text -> logical column name.
# "note", "subtitle" and "director" are recognised but are not currently
# copied into the returned video dicts.
_HEADER_FIELDS = {
    "NO": "no",
    "NUMBER": "no",
    "番号": "no",
    "TITLE": "title",
    "タイトル": "title",
    "ACTRESS": "actress",
    "女優": "actress",
    "出演": "actress",
    "RELEASE": "release",
    "発売日": "release",
    "発売": "release",
    "NOTE": "note",
    "備考": "note",
    "备注": "note",
    "PHOTO": "photo",
    "画像": "photo",
    "封面": "photo",
    "SUBTITLE": "subtitle",
    "サブタイトル": "subtitle",
    "DIRECTOR": "director",
    "監督": "director",
}

# Parenthesised text following an actress link.
# NOTE(review): each character class lists the ASCII parenthesis twice;
# possibly a full-width variant was intended -- confirm against real pages.
_ROLE_RE = re.compile(r"[((]([^))]+)[))]")
# Everything from the first comma (Japanese or ASCII) onwards is dropped.
_ROLE_TAIL_RE = re.compile(r"[、,].*$")
# Used with ``match``: anchored at the start only, so trailing text after the
# date is tolerated (and kept in the returned value).
_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


def _build_column_map(table):
    """Map logical column names to their index in the table's ``thead``."""
    col_map = {}
    for index, th in enumerate(table.select("thead th")):
        field = _HEADER_FIELDS.get(th.get_text(strip=True).upper())
        if field is not None:
            # A later header with the same meaning overrides an earlier one.
            col_map[field] = index
    return col_map


def _get_cell(cells, col_map, field):
    """Return the cell for *field*, or ``None`` if unmapped or out of range."""
    index = col_map.get(field)
    if index is None or index >= len(cells):
        return None
    return cells[index]


def _parse_actresses(cell):
    """Return parallel ``(names, roles)`` lists from an actress cell.

    Only direct children of *cell* are inspected. Each ``<a>`` whose cleaned
    text is non-empty contributes a name with an empty role; a following text
    node containing a parenthesised string sets the role of the most recently
    added name.
    """
    names = []
    roles = []
    for child in cell.children:
        tag_name = getattr(child, "name", None)
        if tag_name == "a":
            name = clean_actress_name(child.get_text(strip=True))
            if name:
                names.append(name)
                roles.append("")
        elif tag_name == "br":
            continue
        elif isinstance(child, str) and names:
            role_match = _ROLE_RE.search(child.strip())
            if role_match:
                roles[-1] = _ROLE_TAIL_RE.sub("", role_match.group(1)).strip()
    return names, roles


def _parse_release_date(cell):
    """Return the cell text with ``/`` turned into ``-`` if it starts with a date.

    Returns ``""`` when the normalised text does not begin with
    ``YYYY-MM-DD``.
    """
    date_text = cell.get_text(strip=True).replace("/", "-")
    return date_text if _DATE_RE.match(date_text) else ""


def parse_table_videos(table):
    """Extract one dict per video row from a parsed HTML table.

    Columns are located through the text of the ``thead th`` cells. Rows with
    fewer than three ``td`` cells, or without a non-empty id in the "no"
    column, are skipped.

    Args:
        table: A BeautifulSoup element for the ``<table>``.

    Returns:
        A list of dicts with keys ``dvd_id``, ``prefix``, ``title``,
        ``release_date``, ``cover_url``, ``dmm_url``, ``actress_names`` and
        ``actress_roles``. Missing values are empty strings / empty lists.
    """
    col_map = _build_column_map(table)
    videos = []

    for row in table.select("tbody tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue

        no_cell = _get_cell(cells, col_map, "no")
        if no_cell is None:
            continue
        # Prefer the link text as the id; the same link supplies the URL.
        link = no_cell.find("a")
        id_source = link if link is not None else no_cell
        dvd_id = id_source.get_text(strip=True)
        if not dvd_id:
            continue
        dmm_url = link.get("href", "") if link is not None else ""

        title_cell = _get_cell(cells, col_map, "title")
        title = "" if title_cell is None else title_cell.get_text(" ", strip=True)

        actress_cell = _get_cell(cells, col_map, "actress")
        if actress_cell is None:
            actress_names, actress_roles = [], []
        else:
            actress_names, actress_roles = _parse_actresses(actress_cell)

        release_cell = _get_cell(cells, col_map, "release")
        release_date = "" if release_cell is None else _parse_release_date(release_cell)

        cover_url = ""
        photo_cell = _get_cell(cells, col_map, "photo")
        if photo_cell is not None:
            img = photo_cell.find("img")
            if img is not None:
                # Rewrite the "ps.jpg" image name to its "pl.jpg" counterpart.
                cover_url = img.get("src", "").replace("ps.jpg", "pl.jpg")

        # The prefix is the part before the first "-", or "" if there is none.
        prefix, dash, _ = dvd_id.partition("-")

        videos.append({
            "dvd_id": dvd_id,
            "prefix": prefix if dash else "",
            "title": title,
            "release_date": release_date,
            "cover_url": cover_url,
            "dmm_url": dmm_url,
            "actress_names": actress_names,
            "actress_roles": actress_roles,
        })

    return videos


def parse_label_page(html):
    """Parse a label page into its name and the videos it lists.

    Args:
        html: The page markup, parsed with the ``lxml`` parser.

    Returns:
        A dict with ``name`` (text of the first ``div#page-body h3``, or
        ``""`` if absent) and ``videos`` (the rows of every ``table.edit``
        in document order, as produced by ``parse_table_videos``).
    """
    soup = BeautifulSoup(html, "lxml")
    heading = soup.select_one("div#page-body h3")

    videos = []
    for table in soup.select("table.edit"):
        videos.extend(parse_table_videos(table))

    return {
        "name": heading.get_text(strip=True) if heading is not None else "",
        "videos": videos,
    }