| import re |
| from bs4 import BeautifulSoup |
| from .name_util import clean_actress_name |
|
|
|
|
# Upper-cased header label -> logical column key. Several languages are
# accepted for each column. "note", "subtitle" and "director" are recognized
# but not currently copied into the output records.
_HEADER_ALIASES = {
    "NO": "no",
    "NUMBER": "no",
    "番号": "no",
    "TITLE": "title",
    "タイトル": "title",
    "ACTRESS": "actress",
    "女優": "actress",
    "出演": "actress",
    "RELEASE": "release",
    "発売日": "release",
    "発売": "release",
    "NOTE": "note",
    "備考": "note",
    "备注": "note",
    "PHOTO": "photo",
    "画像": "photo",
    "封面": "photo",
    "SUBTITLE": "subtitle",
    "サブタイトル": "subtitle",
    "DIRECTOR": "director",
    "監督": "director",
}

# Text in parentheses following an actress link. Both ASCII "(" ")" and
# full-width U+FF08 / U+FF09 parentheses are accepted; the full-width forms
# are written as escapes so they survive re-encoding of this file.
_ROLE_RE = re.compile(r"[\uFF08(]([^\uFF09)]+)[\uFF09)]")
# Everything from the first comma (ideographic or ASCII) onward is dropped.
_ROLE_TAIL_RE = re.compile(r"[、,].*$")
# Leading YYYY-MM-DD after "/" has been normalized to "-".
_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


def _build_column_map(table):
    """Map logical column keys to their index in the table's header row."""
    col_map = {}
    for index, th in enumerate(table.select("thead th")):
        key = _HEADER_ALIASES.get(th.get_text(strip=True).upper())
        if key is not None:
            # A later header with the same meaning overrides an earlier one.
            col_map[key] = index
    return col_map


def _cell_for(cells, col_map, key):
    """Return the cell for logical column *key*, or None if absent."""
    index = col_map.get(key)
    if index is None or index >= len(cells):
        return None
    return cells[index]


def _parse_actresses(cell):
    """Return parallel lists (names, roles) read from an actress cell.

    Each ``<a>`` child contributes one name (after ``clean_actress_name``);
    a parenthesized text node that follows sets the role of the most
    recently added name. Roles default to the empty string.
    """
    names = []
    roles = []
    for child in cell.children:
        tag_name = getattr(child, "name", None)
        if tag_name == "a":
            name = clean_actress_name(child.get_text(strip=True))
            if name:
                names.append(name)
                roles.append("")
        elif tag_name == "br":
            continue
        elif isinstance(child, str) and names:
            role_match = _ROLE_RE.search(child.strip())
            if role_match:
                roles[-1] = _ROLE_TAIL_RE.sub("", role_match.group(1)).strip()
    return names, roles


def _parse_release_date(cell):
    """Return the leading date of *cell* as ``YYYY-MM-DD``, or ``""``."""
    normalized = cell.get_text(strip=True).replace("/", "-")
    date_match = _DATE_RE.match(normalized)
    # Keep only the matched date so trailing annotations in the cell do not
    # leak into the stored value.
    return date_match.group(0) if date_match else ""


def parse_table_videos(table):
    """Extract one record per video row from a listing table.

    Columns are located by the text of the ``thead th`` cells (see
    ``_HEADER_ALIASES``). Rows under ``tbody`` with fewer than three ``td``
    cells, or without a non-empty id in the "no" column, are skipped.

    Args:
        table: A parsed table element exposing ``select``; its rows and
            cells are expected to behave like BeautifulSoup tags.

    Returns:
        A list of dicts with the keys ``dvd_id``, ``prefix``, ``title``,
        ``release_date``, ``cover_url``, ``dmm_url``, ``actress_names`` and
        ``actress_roles``. Missing values are empty strings or empty lists.
    """
    col_map = _build_column_map(table)

    videos = []
    for row in table.select("tbody tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue

        no_cell = _cell_for(cells, col_map, "no")
        if no_cell is None:
            continue
        # Prefer the link text as the id; the same link supplies the URL.
        link = no_cell.find("a")
        id_source = link if link is not None else no_cell
        dvd_id = id_source.get_text(strip=True)
        if not dvd_id:
            continue
        dmm_url = link.get("href", "") if link is not None else ""

        title_cell = _cell_for(cells, col_map, "title")
        title = title_cell.get_text(" ", strip=True) if title_cell is not None else ""

        actress_cell = _cell_for(cells, col_map, "actress")
        if actress_cell is not None:
            actress_names, actress_roles = _parse_actresses(actress_cell)
        else:
            actress_names, actress_roles = [], []

        release_cell = _cell_for(cells, col_map, "release")
        release_date = _parse_release_date(release_cell) if release_cell is not None else ""

        cover_url = ""
        photo_cell = _cell_for(cells, col_map, "photo")
        if photo_cell is not None:
            img = photo_cell.find("img")
            if img is not None:
                # Rewrite the "ps.jpg" suffix to "pl.jpg" in the image URL.
                cover_url = img.get("src", "").replace("ps.jpg", "pl.jpg")

        head, dash, _ = dvd_id.partition("-")
        videos.append({
            "dvd_id": dvd_id,
            "prefix": head if dash else "",
            "title": title,
            "release_date": release_date,
            "cover_url": cover_url,
            "dmm_url": dmm_url,
            "actress_names": actress_names,
            "actress_roles": actress_roles,
        })

    return videos
|
|
|
|
def parse_label_page(html):
    """Parse a label page into its name and the videos listed on it.

    Args:
        html: Markup of the page, handed to BeautifulSoup with the "lxml"
            parser.

    Returns:
        A dict with ``"name"`` (stripped text of the first ``h3`` under
        ``div#page-body``, or ``""`` when there is none) and ``"videos"``
        (the records from every ``table.edit``, in document order).
    """
    soup = BeautifulSoup(html, "lxml")

    heading = soup.select_one("div#page-body h3")
    label_name = heading.get_text(strip=True) if heading else ""

    # Flatten the per-table record lists into one list.
    collected = [
        video
        for table in soup.select("table.edit")
        for video in parse_table_videos(table)
    ]

    return {"name": label_name, "videos": collected}
|
|