| """ |
| 明星数据爬虫模块 |
| 从 https://www.houyuanhui.com/star/aindex/ 爬取明星数据 |
| """ |
| import asyncio |
| import base64 |
| import os |
| import random |
| import re |
| import time |
| from datetime import datetime |
| from typing import Any, Dict, List, Optional |
|
|
| import requests |
| from bs4 import BeautifulSoup |
|
|
| from config import logger |
| from database import execute, fetch_all, get_connection |
|
|
|
|
| |
# Lookup table used by translate_nationality(): Chinese country/region name
# as scraped from the site -> English name stored in the database.
NATIONALITY_MAP: Dict[str, str] = {
    "中国": "China",
    "美国": "United States",
    "英国": "United Kingdom",
    "日本": "Japan",
    "韩国": "South Korea",
    "法国": "France",
    "德国": "Germany",
    "意大利": "Italy",
    "加拿大": "Canada",
    "澳大利亚": "Australia",
    "俄罗斯": "Russia",
    "印度": "India",
    "巴西": "Brazil",
    "墨西哥": "Mexico",
    "西班牙": "Spain",
    "泰国": "Thailand",
    "新加坡": "Singapore",
    "马来西亚": "Malaysia",
    "越南": "Vietnam",
    "菲律宾": "Philippines",
    "印度尼西亚": "Indonesia",
    "瑞士": "Switzerland",
    "荷兰": "Netherlands",
    "瑞典": "Sweden",
    "挪威": "Norway",
    "丹麦": "Denmark",
    "芬兰": "Finland",
    "奥地利": "Austria",
    "比利时": "Belgium",
    "葡萄牙": "Portugal",
    "希腊": "Greece",
    "土耳其": "Turkey",
    "以色列": "Israel",
    "南非": "South Africa",
    "新西兰": "New Zealand",
    "阿根廷": "Argentina",
    "智利": "Chile",
    "哥伦比亚": "Colombia",
    "古巴": "Cuba",
    "波兰": "Poland",
    "乌克兰": "Ukraine",
    "捷克": "Czech Republic",
    "匈牙利": "Hungary",
    "罗马尼亚": "Romania",
    "爱尔兰": "Ireland",
    "冰岛": "Iceland",
    "蒙古": "Mongolia",
    "朝鲜": "North Korea",
    "巴基斯坦": "Pakistan",
    "孟加拉国": "Bangladesh",
    "斯里兰卡": "Sri Lanka",
    "尼泊尔": "Nepal",
    "缅甸": "Myanmar",
    "柬埔寨": "Cambodia",
    "老挝": "Laos",
    "文莱": "Brunei",
    "马尔代夫": "Maldives",
    "哈萨克斯坦": "Kazakhstan",
    "乌兹别克斯坦": "Uzbekistan",
    "阿联酋": "United Arab Emirates",
    "沙特阿拉伯": "Saudi Arabia",
    "埃及": "Egypt",
    "尼日利亚": "Nigeria",
    "肯尼亚": "Kenya",
    "中国香港": "Hong Kong",
    "中国台湾": "Taiwan",
    "中国澳门": "Macau",
}
|
|
|
|
def normalize_birthday(raw: str) -> Optional[str]:
    """Convert a birthday string to ``YYYY-MM-DD``.

    Recognised inputs are ``2000年11月28日``, ``2000-11-28`` and
    ``2000/11/28``; month and day may be one or two digits.

    Args:
        raw: Birthday text as scraped; may be empty or ``None``.

    Returns:
        The ISO-style date string, or ``None`` when the input is empty,
        matches no known layout, or names an impossible calendar date.
    """
    if not raw:
        return None

    # Remove ASCII-parenthesised notes before searching for the date.
    cleaned = re.sub(r'\s*\(.*?\)', '', raw).strip()

    date_layouts = (
        r'(\d{4})年(\d{1,2})月(\d{1,2})日',
        r'(\d{4})-(\d{1,2})-(\d{1,2})',
        r'(\d{4})/(\d{1,2})/(\d{1,2})',
    )
    # Layouts are tried in order; only the first hit is considered.
    candidates = (re.search(layout, cleaned) for layout in date_layouts)
    found = next((hit for hit in candidates if hit), None)
    if found is None:
        logger.warning(f"无法解析日期格式: {cleaned}")
        return None

    try:
        year, month, day = (int(part) for part in found.groups())
        return datetime(year, month, day).strftime('%Y-%m-%d')
    except ValueError:
        # The digits matched but do not form a real date (e.g. month 13).
        logger.warning(f"无效的日期格式: {cleaned}")
        return None
|
|
|
|
def normalize_blood_type(raw: str) -> Optional[str]:
    """Reduce a blood-type label to its bare, upper-cased form.

    Examples: ``"A型" -> "A"``, ``"ab型" -> "AB"``, ``"O型" -> "O"``.

    Args:
        raw: Blood-type text as scraped; may be empty or ``None``.

    Returns:
        The text with every ``型`` removed, trimmed and upper-cased. Values
        outside A/B/AB/O are returned in that same cleaned form rather than
        rejected. ``None`` when nothing is left after cleaning.
    """
    if not raw:
        return None

    cleaned = raw.replace('型', '').strip().upper()
    return cleaned or None
|
|
|
|
def translate_nationality(raw: str) -> Optional[str]:
    """Translate a Chinese nationality name into English.

    Args:
        raw: Nationality text as scraped; may be empty or ``None``.

    Returns:
        * ``None`` for empty or whitespace-only input.
        * The trimmed input itself when it already consists only of Latin
          letters, whitespace and ``. , ' -``.
        * Otherwise the ``NATIONALITY_MAP`` entry for the trimmed input, or
          ``None`` when the name is not in the table.
    """
    if not raw:
        return None

    # Trim first so that padded values such as " 中国 " still hit the table
    # and whitespace-only input yields None instead of an empty string.
    text = raw.strip()
    if not text:
        return None

    if re.match(r'^[A-Za-z\s.,\'-]+$', text):
        return text

    return NATIONALITY_MAP.get(text)
|
|
|
|
| |
# Local directory that downloaded avatar files are written to; can be
# overridden through the CELEBRITY_LOCAL_DIR environment variable.
CELEBRITY_LOCAL_DIR = os.environ.get("CELEBRITY_LOCAL_DIR", "/opt/data/celebritys")
# Prefix of the object name used when an avatar is uploaded.
CELEBRITY_OSS_PREFIX = "images/celebritys/"
# Prefix of the avatar path stored in the database (no leading "images/").
CELEBRITY_DB_PREFIX = "celebritys/"
|
|
|
|
def normalize_avatar_url(raw: str) -> Optional[str]:
    """Turn a scraped avatar reference into an absolute URL.

    Args:
        raw: Avatar URL as scraped; may be empty or ``None``.

    Returns:
        ``None`` for empty/blank input. URLs that already start with
        ``http://`` or ``https://`` are returned trimmed but otherwise
        untouched; protocol-relative URLs (``//host/...``) get ``https:``
        prepended; anything else gets ``https://`` prepended.
    """
    candidate = raw.strip() if raw else ''
    if not candidate:
        return None

    if candidate.startswith(('http://', 'https://')):
        return candidate

    scheme = 'https:' if candidate.startswith('//') else 'https://'
    return scheme + candidate
|
|
|
|
def build_oss_object_name(star_id: int, name: str) -> str:
    """Build the object name used when uploading an avatar.

    Layout: ``images/celebritys/{id}_{base64_name}.jpg`` where
    ``base64_name`` is the URL-safe base64 encoding of the UTF-8 name.

    Args:
        star_id: Identifier of the celebrity record.
        name: Celebrity name to embed in the object name.

    Returns:
        The object name, prefixed with ``CELEBRITY_OSS_PREFIX``.
    """
    encoded_name = base64.urlsafe_b64encode(name.encode('utf-8'))
    file_name = f"{star_id}_{encoded_name.decode('ascii')}.jpg"
    return CELEBRITY_OSS_PREFIX + file_name
|
|
|
|
def build_db_avatar_path(star_id: int, name: str) -> str:
    """Build the avatar path stored in the ``avatar_oss`` database column.

    Layout: ``celebritys/{id}_{base64_name}.jpg`` (no ``images/`` prefix),
    where ``base64_name`` is the URL-safe base64 encoding of the UTF-8 name.

    Args:
        star_id: Identifier of the celebrity record.
        name: Celebrity name to embed in the path.

    Returns:
        The path, prefixed with ``CELEBRITY_DB_PREFIX``.
    """
    encoded_name = base64.urlsafe_b64encode(name.encode('utf-8'))
    file_name = f"{star_id}_{encoded_name.decode('ascii')}.jpg"
    return CELEBRITY_DB_PREFIX + file_name
|
|
|
|
def download_avatar(url: str, local_path: str) -> bool:
    """Download an avatar image and save it to ``local_path``.

    This is a blocking call (``requests`` + file I/O).

    Args:
        url: Absolute URL of the image.
        local_path: Destination file; its parent directory is created when
            missing.

    Returns:
        ``True`` when the file was written, ``False`` on any failure
        (HTTP error, empty body, filesystem error). Failures are logged and
        never raised.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        payload = response.content
        if not payload:
            # A 0-byte file would otherwise be stored as a valid avatar.
            logger.error(f"下载头像失败: {url}, 错误: 响应内容为空")
            return False

        # os.makedirs('') raises, so only create the directory when the
        # path actually has a directory component.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(local_path, 'wb') as f:
            f.write(payload)

        return True
    except Exception as e:
        logger.error(f"下载头像失败: {url}, 错误: {e}")
        return False
|
|
|
|
def upload_avatar_to_bos(file_path: str, object_name: str) -> bool:
    """Upload a local avatar file to BOS using the project's BOS client.

    Args:
        file_path: Path of the local file to upload.
        object_name: Key to store the object under in ``BOS_BUCKET_NAME``.

    Returns:
        ``True`` when ``put_object`` completed; ``False`` when uploading is
        disabled, the client is unavailable, the file is missing, or any
        exception occurred. Failures are logged and never raised.
    """
    # Imported lazily inside the function, as in the rest of this module's
    # BOS handling.
    from config import BOS_UPLOAD_ENABLED, BOS_BUCKET_NAME

    if not BOS_UPLOAD_ENABLED:
        logger.warning("BOS 上传未启用")
        return False

    try:
        from utils import _get_bos_client

        client = _get_bos_client()
        if client is None:
            logger.warning("BOS 客户端未初始化")
            return False

        if not os.path.isfile(file_path):
            logger.error(f"文件不存在: {file_path}")
            return False

        # The whole file is read into memory and sent as the request body.
        with open(file_path, 'rb') as source:
            payload = source.read()

        client.put_object(Bucket=BOS_BUCKET_NAME, Key=object_name, Body=payload)

        logger.info(f"BOS 上传成功: {object_name}")
        return True
    except Exception as exc:
        logger.error(f"BOS 上传失败: {object_name}, 错误: {exc}")
        return False
|
|
|
|
class StarScraper:
    """Crawl celebrity records from houyuanhui.com into the database.

    The paged list endpoint (``base_url``) yields entries with ``id``,
    ``name``, ``avatar`` and ``info``. For each entry the scraper mirrors the
    avatar (download + BOS upload), reads profile fields from the detail page
    (falling back to the list entry's ``info`` text) and upserts the result
    into ``tpl_app_star_data``. Progress counters live on the instance so
    ``get_status`` can report them while a run is in flight.
    """

    # Site label -> database column, for the profile fields shared by the
    # list endpoint's ``info`` text and the detail page.
    _INFO_FIELDS = {
        '别名': 'alias',
        '国籍': 'nationality',
        '星座': 'constellation',
        '体重': 'weight',
        '出生地': 'birthplace',
        '经纪公司': 'agency',
        '毕业院校': 'graduated_school',
        '外文名': 'foreign_name',
        '民族': 'ethnicity',
        '血型': 'blood_type',
        '身高': 'height',
        '出生日期': 'birthday',
        '职业': 'profession',
    }

    # The detail page additionally carries the Chinese name.
    _DETAIL_FIELDS = {'中文名': 'name', **_INFO_FIELDS}

    # Column order of the INSERT statement in ``_insert_star``.
    _COLUMNS = (
        'id', 'name', 'avatar', 'avatar_oss', 'alias', 'nationality',
        'constellation', 'weight', 'birthplace', 'agency',
        'graduated_school', 'foreign_name', 'ethnicity', 'blood_type',
        'height', 'birthday', 'profession',
    )

    def __init__(self):
        self.base_url = "https://www.houyuanhui.com/star/aindex/"
        self.detail_url_template = "https://www.houyuanhui.com/mingxing/{id}.html"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.houyuanhui.com/mingxing/',
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

        # Run state, reported by get_status().
        self.is_running = False
        self.current_page = 0
        self.total_inserted = 0
        self.total_skipped = 0
        self.error_message = None
        self.start_time = None
        self.end_time = None

    @classmethod
    def _empty_profile(cls) -> Dict[str, Optional[str]]:
        """Return a profile dict with every known field set to ``None``."""
        return dict.fromkeys(cls._INFO_FIELDS.values())

    @staticmethod
    def _split_label(text: str):
        """Split ``"label:value"`` at the first ASCII or full-width colon.

        Returns ``(label, value)`` with both parts trimmed, or ``None`` when
        the text contains no colon.
        """
        parts = re.split('[:\uFF1A]', text, maxsplit=1)
        if len(parts) != 2:
            return None
        return parts[0].strip(), parts[1].strip()

    @staticmethod
    def _normalize_field(field_name: str, value: str) -> Optional[str]:
        """Apply the field-specific normaliser; other fields pass through."""
        if field_name == 'birthday':
            return normalize_birthday(value)
        if field_name == 'blood_type':
            return normalize_blood_type(value)
        if field_name == 'nationality':
            return translate_nationality(value)
        return value

    def _parse_star_detail_from_html(self, html_content: str) -> Dict[str, Optional[str]]:
        """Extract profile fields from a celebrity detail page.

        Reads the ``<li>`` items inside ``<div class="modules archives">``.
        ASCII-parenthesised notes are removed from every value, and birthday,
        blood type and nationality are normalised.

        Args:
            html_content: HTML of the detail page.

        Returns:
            A dict with every profile field (``None`` when absent) plus a
            ``name`` key when the page lists ``中文名``. Parsing errors are
            logged and whatever was collected so far is returned.
        """
        result = self._empty_profile()

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            archives_div = soup.find('div', class_='modules archives')
            if not archives_div:
                logger.warning("未找到 archives 模块")
                return result

            for li in archives_div.find_all('li'):
                pair = self._split_label(li.get_text(strip=True))
                if pair is None:
                    continue
                key, value = pair

                # Drop bracketed annotations before any normalisation.
                value = re.sub(r'\s*\(.*?\)', '', value).strip()

                field_name = self._DETAIL_FIELDS.get(key)
                if field_name is None:
                    continue

                value = self._normalize_field(field_name, value)
                result[field_name] = value if value else None

        except Exception as e:
            logger.error(f"解析明星详情页失败: {e}")

        return result

    def _fetch_detail_page(self, star_id: int) -> str:
        """Fetch a celebrity detail page (blocking).

        Returns:
            The page HTML, or ``""`` when the request failed (the error is
            logged, not raised).
        """
        url = self.detail_url_template.format(id=star_id)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.error(f"请求明星详情页 {url} 失败: {e}")
            return ""

    def _parse_star_info(self, info_text: str) -> Dict[str, Optional[str]]:
        """Parse the list endpoint's ``info`` text into profile fields.

        The text holds one ``label:value`` pair per line, e.g.
        ``"别名:丽颖,颖宝,小赵总\\n国籍:中国\\n..."``. Birthday, blood type and
        nationality go through the same normalisers as the detail-page
        parser so both sources store values in the same format.

        Args:
            info_text: Raw ``info`` string; may be empty or ``None``.

        Returns:
            A dict with every profile field, ``None`` where absent or empty.
        """
        result = self._empty_profile()

        if not info_text:
            return result

        for line in info_text.strip().split('\n'):
            pair = self._split_label(line.strip())
            if pair is None:
                continue
            key, value = pair

            field_name = self._INFO_FIELDS.get(key)
            if field_name is None:
                continue

            value = self._normalize_field(field_name, value)
            result[field_name] = value if value else None

        return result

    def _fetch_page(self, page: int) -> Dict[str, Any]:
        """Fetch one page of the list endpoint (blocking).

        Returns:
            The decoded JSON body.

        Raises:
            requests.RequestException: On network/HTTP failure.
            ValueError: When the body is not valid JSON.
            Both are logged and re-raised.
        """
        params = {
            'id': '',
            'page': page,
        }

        try:
            response = self.session.get(self.base_url, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            logger.error(f"请求第 {page} 页失败: {e}")
            raise

    async def _insert_star(self, star_data: Dict[str, Any]) -> bool:
        """Insert a celebrity row, or update it when the id already exists.

        Args:
            star_data: Mapping keyed by column name; missing keys are stored
                as NULL.

        Returns:
            ``True`` when the statement executed, ``False`` on any error
            (logged, not raised).
        """
        query = """
            INSERT INTO tpl_app_star_data (
                id, name, avatar, avatar_oss, alias, nationality, constellation,
                weight, birthplace, agency, graduated_school, foreign_name,
                ethnicity, blood_type, height, birthday, profession
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                name = VALUES(name),
                avatar = VALUES(avatar),
                avatar_oss = VALUES(avatar_oss),
                alias = VALUES(alias),
                nationality = VALUES(nationality),
                constellation = VALUES(constellation),
                weight = VALUES(weight),
                birthplace = VALUES(birthplace),
                agency = VALUES(agency),
                graduated_school = VALUES(graduated_school),
                foreign_name = VALUES(foreign_name),
                ethnicity = VALUES(ethnicity),
                blood_type = VALUES(blood_type),
                height = VALUES(height),
                birthday = VALUES(birthday),
                profession = VALUES(profession)
        """

        params = tuple(star_data.get(column) for column in self._COLUMNS)

        try:
            # NOTE(review): no explicit commit here; presumably
            # get_connection() commits or runs in autocommit mode - confirm.
            async with get_connection() as conn:
                async with conn.cursor() as cursor:
                    await cursor.execute(query, params)
            return True
        except Exception as e:
            logger.error(f"插入明星数据失败 (id={star_data.get('id')}): {e}")
            return False

    async def _should_skip_star(self, star_id: int) -> bool:
        """Tell whether a celebrity is already stored with a non-empty avatar_oss.

        Lookup errors are logged and treated as "do not skip".
        """
        query = "SELECT COUNT(*) as cnt FROM tpl_app_star_data WHERE id = %s AND avatar_oss IS NOT NULL AND avatar_oss != ''"
        try:
            rows = await fetch_all(query, (star_id,))
            return bool(rows and rows[0].get('cnt', 0) > 0)
        except Exception as e:
            logger.warning(f"检查明星 {star_id} 是否跳过失败: {e}")
            return False

    async def _mirror_avatar(self, star_id: int, name: str, avatar_url: Optional[str]) -> Optional[str]:
        """Download the avatar and upload it to BOS.

        The blocking download/upload helpers run in the default executor so
        the event loop stays responsive.

        Returns:
            The path to store in ``avatar_oss``, or ``None`` when there is
            nothing to mirror or either step failed.
        """
        if not (avatar_url and star_id and name):
            return None

        oss_object_name = build_oss_object_name(star_id, name)
        db_avatar_path = build_db_avatar_path(star_id, name)
        local_path = os.path.join(CELEBRITY_LOCAL_DIR, f"{star_id}_{base64.urlsafe_b64encode(name.encode('utf-8')).decode('ascii')}.jpg")
        loop = asyncio.get_event_loop()

        logger.info(f"[{star_id}] {name} - 正在下载头像: {avatar_url}")
        if not await loop.run_in_executor(None, download_avatar, avatar_url, local_path):
            logger.warning(f"[{star_id}] {name} - 头像下载失败")
            return None
        logger.info(f"[{star_id}] {name} - 头像已保存到本地: {local_path}")

        logger.info(f"[{star_id}] {name} - 正在上传头像到 OSS: {oss_object_name}")
        if not await loop.run_in_executor(None, upload_avatar_to_bos, local_path, oss_object_name):
            logger.warning(f"[{star_id}] {name} - 头像上传到 OSS 失败")
            return None

        logger.info(f"[{star_id}] {name} - 头像已上传 OSS,入库路径: {db_avatar_path}")
        return db_avatar_path

    async def _load_detail_info(self, star_id: int, name: str) -> Dict[str, Optional[str]]:
        """Fetch and parse the detail page; return ``{}`` when unavailable."""
        detail_info: Dict[str, Optional[str]] = {}
        if not star_id:
            return detail_info

        try:
            logger.info(f"[{star_id}] 正在访问详情页: {self.detail_url_template.format(id=star_id)}")
            detail_html = await asyncio.get_event_loop().run_in_executor(
                None, self._fetch_detail_page, star_id
            )
            if not detail_html:
                logger.warning(f"[{star_id}] {name} - 详情页为空")
                return detail_info

            detail_info = self._parse_star_detail_from_html(detail_html)
            logger.info(f"[{star_id}] 详情页解析完成: {name}")

            for label, field in (('国籍', 'nationality'), ('生日', 'birthday'), ('血型', 'blood_type')):
                value = detail_info.get(field)
                if value:
                    logger.info(f"[{star_id}] {name} - {label}: {value}")

            # Pause between detail-page requests.
            await asyncio.sleep(random.uniform(1, 2))
        except Exception as e:
            logger.warning(f"[{star_id}] 获取明星 {name} 详情页失败: {e}")

        return detail_info

    async def _process_star(self, star_item: Dict[str, Any]) -> None:
        """Handle one list entry: skip check, avatar, details, upsert."""
        star_id = star_item.get('id')
        name = star_item.get('name', '')
        avatar = star_item.get('avatar', '')
        info_text = star_item.get('info', '')

        if await self._should_skip_star(star_id):
            self.total_skipped += 1
            logger.info(f"[{star_id}] {name} - 已存在且头像已上传,跳过")
            return

        logger.info(f"[{star_id}] 正在处理明星: {name}")

        avatar_url = normalize_avatar_url(avatar)
        avatar_oss = await self._mirror_avatar(star_id, name, avatar_url)

        # Pause between the avatar step and the detail request.
        await asyncio.sleep(random.uniform(1, 2))

        detail_info = await self._load_detail_info(star_id, name)

        # A detail page without a Chinese name is treated as unusable.
        if not detail_info or not detail_info.get('name'):
            logger.info(f"[{star_id}] {name} - 详情页数据不足,使用列表接口 info 字段")
            detail_info = self._parse_star_info(info_text)

        star_data = {
            'id': star_id,
            'name': name,
            'avatar': avatar_url,
            'avatar_oss': avatar_oss,
            # May override 'name' with the detail page's 中文名.
            **detail_info
        }

        if await self._insert_star(star_data):
            self.total_inserted += 1
            logger.info(f"[{star_id}] {name} - 数据已入库 (国籍: {detail_info.get('nationality') or '未知'})")
        else:
            # Failed inserts are counted together with skipped entries.
            self.total_skipped += 1
            logger.error(f"[{star_id}] {name} - 数据入库失败")

    async def start_scraping(self, start_page: int = 1) -> Dict[str, Any]:
        """Run the crawl until an empty page is reached or a stop is requested.

        Args:
            start_page: First page to fetch; values below 1 are treated as 1.

        Returns:
            A result dict. On success: ``success``, ``message``,
            ``total_pages``, ``total_inserted``, ``total_skipped``,
            ``duration_seconds``. On failure: ``success``, ``message``,
            ``current_page``, ``total_inserted``, ``total_skipped``. When a
            run is already in progress only ``success``/``message`` are set.
        """
        if self.is_running:
            return {
                'success': False,
                'message': '爬虫任务正在运行中',
            }

        if start_page < 1:
            start_page = 1

        self.is_running = True
        self.current_page = start_page
        self.total_inserted = 0
        self.total_skipped = 0
        self.error_message = None
        self.start_time = time.time()
        self.end_time = None

        logger.info(f"开始明星数据爬取任务,起始页: {start_page}...")

        page = start_page
        try:
            while True:
                if not self.is_running:
                    logger.info("爬虫任务被手动停止")
                    break

                self.current_page = page
                logger.info(f"正在爬取第 {page} 页...")

                response_data = await asyncio.get_event_loop().run_in_executor(
                    None, self._fetch_page, page
                )

                data_list = response_data.get('data', [])
                if not data_list:
                    logger.info(f"第 {page} 页 data 为空,爬取完成")
                    break

                logger.info(f"第 {page} 页获取到 {len(data_list)} 条记录")

                for star_item in data_list:
                    # Honour a stop request between entries, not only
                    # between pages.
                    if not self.is_running:
                        break
                    await self._process_star(star_item)

                if not self.is_running:
                    # Loop head logs the stop and exits.
                    continue

                page += 1

                sleep_time = random.uniform(1, 2)
                logger.info(f"等待 {sleep_time:.2f} 秒后继续下一页...")
                await asyncio.sleep(sleep_time)

            self.end_time = time.time()
            duration = self.end_time - self.start_time

            result = {
                'success': True,
                'message': '爬取完成',
                'total_pages': page - 1,
                'total_inserted': self.total_inserted,
                'total_skipped': self.total_skipped,
                'duration_seconds': round(duration, 2),
            }

            logger.info("=" * 50)
            logger.info("明星数据爬取任务完成!")
            logger.info(f"总页数: {page - 1}")
            logger.info(f"成功入库: {self.total_inserted} 条")
            logger.info(f"跳过/失败: {self.total_skipped} 条")
            logger.info(f"总耗时: {duration:.2f} 秒")
            logger.info("=" * 50)
            return result

        except Exception as e:
            self.end_time = time.time()
            self.error_message = str(e)

            result = {
                'success': False,
                'message': f'爬取失败: {str(e)}',
                'current_page': self.current_page,
                'total_inserted': self.total_inserted,
                'total_skipped': self.total_skipped,
            }

            logger.error(f"爬取任务异常: {e}")
            return result

        finally:
            # Also runs on task cancellation (CancelledError is not an
            # Exception), so the scraper can always be started again.
            self.is_running = False
            if self.end_time is None:
                self.end_time = time.time()

    def stop_scraping(self) -> Dict[str, Any]:
        """Ask a running crawl to stop.

        Only clears ``is_running``; the crawl loop notices the flag and
        exits on its own.

        Returns:
            ``success``/``message``; ``success`` is ``False`` when no crawl
            is running.
        """
        if not self.is_running:
            return {
                'success': False,
                'message': '爬虫任务未在运行',
            }

        self.is_running = False
        return {
            'success': True,
            'message': '已发送停止信号',
        }

    def get_status(self) -> Dict[str, Any]:
        """Return a snapshot of the run state and counters.

        ``start_time``, ``end_time`` and ``duration_seconds`` are included
        only once a run has started; for a run still in flight the duration
        is measured up to now.
        """
        status = {
            'is_running': self.is_running,
            'current_page': self.current_page,
            'total_inserted': self.total_inserted,
            'total_skipped': self.total_skipped,
            'error_message': self.error_message,
        }

        if self.start_time:
            status['start_time'] = self.start_time
            if self.end_time:
                status['end_time'] = self.end_time
                status['duration_seconds'] = round(self.end_time - self.start_time, 2)
            else:
                status['duration_seconds'] = round(time.time() - self.start_time, 2)

        return status
|
|
|
|
| |
# Module-wide StarScraper instance, created on first call to get_scraper().
_scraper: Optional[StarScraper] = None


def get_scraper() -> StarScraper:
    """Return the shared ``StarScraper``, creating it on first use."""
    global _scraper
    if _scraper is not None:
        return _scraper

    _scraper = StarScraper()
    return _scraper
|
|
|
|
def chinese_name_to_pinyin(chinese_name: str) -> str:
    """Convert a Chinese name to space-separated, capitalised pinyin.

    Example: ``杨幂 -> Yang Mi``.

    Args:
        chinese_name: Name to convert.

    Returns:
        The pinyin form. The input is returned unchanged when ``pypinyin``
        is not installed or the conversion fails (both are logged).
    """
    try:
        from pypinyin import pinyin, Style
    except ImportError:
        logger.error("pypinyin 库未安装,请执行: pip install pypinyin")
        return chinese_name

    try:
        py_list = pinyin(chinese_name, style=Style.NORMAL)

        # Upper-case only the first character of each segment.
        # str.capitalize() would also lower-case the remainder, mangling
        # non-Chinese segments that pypinyin passes through unchanged
        # (e.g. "TFBOYS" -> "Tfboys").
        return ' '.join(item[0][:1].upper() + item[0][1:] for item in py_list)
    except Exception as e:
        logger.warning(f"转换拼音失败: {chinese_name}, 错误: {e}")
        return chinese_name
|
|
|
|
async def update_empty_foreign_names() -> Dict[str, Any]:
    """Fill empty ``foreign_name`` columns with the pinyin of ``name``.

    Rows whose ``name`` is NULL/blank, or whose conversion yields nothing,
    are skipped: writing an empty value back would leave the row unchanged
    while still counting it as updated.

    Returns:
        ``success``, ``message`` and ``total_updated``. On failure
        ``total_updated`` is the number of rows updated before the error.
    """
    logger.info("开始更新空 foreign_name 记录...")

    total_updated = 0
    try:
        query = "SELECT id, name FROM tpl_app_star_data WHERE foreign_name IS NULL OR foreign_name = ''"
        rows = await fetch_all(query)

        if not rows:
            return {
                'success': True,
                'message': '没有需要更新的记录',
                'total_updated': 0,
            }

        update_query = "UPDATE tpl_app_star_data SET foreign_name = %s WHERE id = %s"
        for row in rows:
            star_id = row.get('id')
            # row.get('name', '') would still return None for a NULL column.
            name = row.get('name') or ''
            if not name.strip():
                logger.warning(f"[{star_id}] name 为空,跳过")
                continue

            pinyin_name = chinese_name_to_pinyin(name)
            if not pinyin_name:
                logger.warning(f"[{star_id}] {name} - 拼音转换结果为空,跳过")
                continue

            await execute(update_query, (pinyin_name, star_id))

            total_updated += 1
            logger.info(f"[{star_id}] {name} -> foreign_name 更新为: {pinyin_name}")

        return {
            'success': True,
            'message': '更新完成',
            'total_updated': total_updated,
        }

    except Exception as e:
        logger.error(f"更新 foreign_name 失败: {e}")
        return {
            'success': False,
            'message': f'更新失败: {str(e)}',
            # Rows updated before the failure stay updated; report them.
            'total_updated': total_updated,
        }
|
|
|
|
async def fix_empty_avatar_oss() -> Dict[str, Any]:
    """Repair rows whose ``avatar_oss`` is empty.

    For each such row: download the image from ``avatar``, upload it to
    BOS, then store the resulting path in ``avatar_oss``. The blocking
    download/upload helpers run in the default executor so the event loop
    is not stalled.

    Returns:
        ``success``, ``message``, ``total_fixed`` and ``total_failed``. On
        failure the counters hold the values reached before the error.
    """
    logger.info("开始修复空 avatar_oss 记录...")

    total_fixed = 0
    total_failed = 0
    try:
        query = "SELECT id, name, avatar FROM tpl_app_star_data WHERE avatar_oss IS NULL OR avatar_oss = ''"
        rows = await fetch_all(query)

        if not rows:
            return {
                'success': True,
                'message': '没有需要修复的记录',
                'total_fixed': 0,
                'total_failed': 0,
            }

        loop = asyncio.get_event_loop()
        update_query = "UPDATE tpl_app_star_data SET avatar_oss = %s WHERE id = %s"

        for row in rows:
            star_id = row.get('id')
            # A NULL name column comes back as None; name.encode() would
            # then raise and abort the whole batch.
            name = row.get('name') or ''
            avatar = row.get('avatar', '')

            avatar_url = normalize_avatar_url(avatar)
            if not avatar_url:
                logger.warning(f"[{star_id}] {name} - avatar URL 为空,跳过")
                total_failed += 1
                continue

            oss_object_name = build_oss_object_name(star_id, name)
            db_avatar_path = build_db_avatar_path(star_id, name)
            local_path = os.path.join(CELEBRITY_LOCAL_DIR, f"{star_id}_{base64.urlsafe_b64encode(name.encode('utf-8')).decode('ascii')}.jpg")

            logger.info(f"[{star_id}] {name} - 开始修复头像")

            if not await loop.run_in_executor(None, download_avatar, avatar_url, local_path):
                logger.warning(f"[{star_id}] {name} - 头像下载失败")
                total_failed += 1
                continue

            logger.info(f"[{star_id}] {name} - 头像已保存到本地: {local_path}")

            if await loop.run_in_executor(None, upload_avatar_to_bos, local_path, oss_object_name):
                await execute(update_query, (db_avatar_path, star_id))

                total_fixed += 1
                logger.info(f"[{star_id}] {name} - 头像修复成功,avatar_oss: {db_avatar_path}")
            else:
                total_failed += 1
                logger.warning(f"[{star_id}] {name} - 头像上传到 OSS 失败")

            # Pause between rows that reached the upload step.
            await asyncio.sleep(random.uniform(0.5, 1.5))

        return {
            'success': True,
            'message': '修复完成',
            'total_fixed': total_fixed,
            'total_failed': total_failed,
        }

    except Exception as e:
        logger.error(f"修复 avatar_oss 失败: {e}")
        return {
            'success': False,
            'message': f'修复失败: {str(e)}',
            # Rows fixed before the failure stay fixed; report them.
            'total_fixed': total_fixed,
            'total_failed': total_failed,
        }
|
|