import os
import urllib.parse

from loguru import logger

from apis.xhs_pc_apis import XHS_Apis
from xhs_utils.data_util import handle_note_info, download_note, save_to_xlsx
from xhs_utils.response_guard import get_dict, get_list


class Data_Spider():
    """Crawl notes through ``XHS_Apis`` and optionally persist media / Excel output."""

    def __init__(self):
        """Create the API client used by every crawl method."""
        self.xhs_apis = XHS_Apis()

    def spider_note(self, note_url: str, cookies_str: str, proxies=None):
        """Fetch and normalise a single note.

        Args:
            note_url: URL of the note to fetch.
            cookies_str: Cookie string forwarded to the API client.
            proxies: Optional proxies value forwarded to the API client.

        Returns:
            A ``(success, msg, note_info)`` tuple. On success ``note_info`` is
            the result of ``handle_note_info``. When anything raises,
            ``success`` is ``False``, ``msg`` is the exception object and
            ``note_info`` holds whatever was assigned last (possibly ``None``
            or the raw API response), so callers must check ``success`` first.
        """
        note_info = None
        try:
            success, msg, note_info = self.xhs_apis.get_note_info(note_url, cookies_str, proxies)
            if success:
                data = get_dict(note_info, "data", context="get_note_info")
                items = get_list(data, "items", context="get_note_info")
                if not items:
                    raise ValueError("empty_items")
                note_info = items[0]
                note_info['url'] = note_url
                note_info = handle_note_info(note_info)
        except Exception as e:
            # Best-effort: any failure is reported through the return tuple.
            success = False
            msg = e
        logger.info(f'爬取笔记信息 {note_url}: {success}, msg: {msg}')
        return success, msg, note_info

    def _note_id_from_url(self, note_url: str):
        """Return the identifier used to de-duplicate ``note_url``.

        The identifier is the last non-empty segment of the URL path. Trailing
        slashes are stripped first; otherwise every URL ending in ``/`` would
        map to the same empty identifier and be treated as one note. When no
        segment can be extracted (empty path or unparsable input) the original
        ``note_url`` is returned so that distinct URLs stay distinct.
        """
        try:
            path = urllib.parse.urlparse(note_url).path
            note_id = path.rstrip("/").split("/")[-1]
        except Exception:
            return note_url
        return note_id or note_url

    def spider_some_note(self, notes: list, cookies_str: str, base_path: dict, save_choice: str,
                         excel_name: str = '', proxies=None, state_store=None,
                         state_key: str = "done_note_ids"):
        """Crawl a list of note URLs, then download media and/or write an Excel file.

        Args:
            notes: Note URLs to crawl.
            cookies_str: Cookie string forwarded to ``spider_note``.
            base_path: Mapping read at keys ``'media'`` and ``'excel'`` for the
                output directories.
            save_choice: ``'all'``, ``'excel'``, or a value containing
                ``'media'``; selects which outputs are produced.
            excel_name: Base name (without extension) of the Excel file.
            proxies: Optional proxies value forwarded to ``spider_note``.
            state_store: Optional object exposing ``get_set(key)`` and
                ``add_to_set(key, value)``; used to skip notes crawled earlier.
            state_key: Key under which finished note ids are kept in
                ``state_store``.

        Returns:
            A dict with the keys ``total``, ``skipped``, ``ok``, ``failed`` and
            ``fail_reasons`` (failure message -> occurrence count).

        Raises:
            ValueError: If an Excel file is requested but ``excel_name`` is empty.
        """
        if (save_choice == 'all' or save_choice == 'excel') and excel_name == '':
            raise ValueError('excel_name 不能为空')

        note_list = []
        done = set()
        if state_store is not None:
            done = state_store.get_set(state_key)

        total = 0
        skipped = 0
        ok = 0
        failed = 0
        fail_reasons: dict[str, int] = {}

        for note_url in notes:
            total += 1
            note_id = self._note_id_from_url(note_url)
            if note_id in done:
                skipped += 1
                continue
            success, msg, note_info = self.spider_note(note_url, cookies_str, proxies)
            if note_info is not None and success:
                note_list.append(note_info)
                ok += 1
                # NOTE: the id is recorded as done right after the fetch,
                # i.e. before the media download / Excel export below run.
                if state_store is not None:
                    state_store.add_to_set(state_key, note_id)
            else:
                failed += 1
                reason = str(msg)
                fail_reasons[reason] = fail_reasons.get(reason, 0) + 1

        for note_info in note_list:
            if save_choice == 'all' or 'media' in save_choice:
                download_note(note_info, base_path['media'], save_choice)

        if save_choice == 'all' or save_choice == 'excel':
            file_path = os.path.abspath(os.path.join(base_path['excel'], f'{excel_name}.xlsx'))
            save_to_xlsx(note_list, file_path)

        return {
            "total": total,
            "skipped": skipped,
            "ok": ok,
            "failed": failed,
            "fail_reasons": fail_reasons,
        }

    def spider_user_all_note(self, user_url: str, cookies_str: str, base_path: dict, save_choice: str,
                             excel_name: str = '', proxies=None, state_store=None):
        """Crawl every note listed for a user profile URL.

        The note URLs are built from the ``note_id`` and ``xsec_token`` fields
        of each entry returned by ``get_user_all_notes`` and then handed to
        ``spider_some_note``. When an Excel file is requested, ``excel_name``
        is replaced by the last path segment of ``user_url`` (query removed).

        Returns:
            The summary dict of ``spider_some_note`` on the normal path. If
            anything raises, ``(note_list, False, exception)`` is returned.
            NOTE(review): the two paths return different shapes; callers have
            to handle both.
        """
        note_list = []
        try:
            success, msg, all_note_info = self.xhs_apis.get_user_all_notes(user_url, cookies_str, proxies)
            if success:
                logger.info(f'用户 {user_url} 作品数量: {len(all_note_info)}')
                for simple_note_info in all_note_info:
                    note_url = f"https://www.xiaohongshu.com/explore/{simple_note_info['note_id']}?xsec_token={simple_note_info['xsec_token']}"
                    note_list.append(note_url)
            if save_choice == 'all' or save_choice == 'excel':
                excel_name = user_url.split('/')[-1].split('?')[0]
            # Resume state is kept per user so separate crawls do not share it.
            state_key = f"done_note_ids:user:{user_url.split('/')[-1].split('?')[0]}"
            return self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name,
                                         proxies, state_store=state_store, state_key=state_key)
        except Exception as e:
            success = False
            msg = e
        logger.info(f'爬取用户所有视频 {user_url}: {success}, msg: {msg}')
        return note_list, success, msg

    def spider_some_search_note(self, query: str, require_num: int, cookies_str: str, base_path: dict,
                                save_choice: str, sort_type_choice=0, note_type=0, note_time=0,
                                note_range=0, pos_distance=0, geo: dict = None, excel_name: str = '',
                                proxies=None, state_store=None):
        """Search notes by keyword and crawl the results.

        Only search results whose ``model_type`` equals ``"note"`` are kept.
        Their URLs are built from the ``id`` and ``xsec_token`` fields and
        handed to ``spider_some_note``. When an Excel file is requested,
        ``excel_name`` is replaced by ``query``.

        Args:
            query: Search keyword.
            require_num: Forwarded to ``search_some_note``.
            sort_type_choice, note_type, note_time, note_range, pos_distance,
            geo: Search filters forwarded unchanged to ``search_some_note``.
            (Remaining arguments are forwarded to ``spider_some_note``.)

        Returns:
            The summary dict of ``spider_some_note`` on the normal path. If
            anything raises, ``(note_list, False, exception)`` is returned.
            NOTE(review): the two paths return different shapes; callers have
            to handle both.
        """
        note_list = []
        try:
            success, msg, notes = self.xhs_apis.search_some_note(query, require_num, cookies_str,
                                                                 sort_type_choice, note_type, note_time,
                                                                 note_range, pos_distance, geo, proxies)
            if success:
                notes = list(filter(lambda x: x['model_type'] == "note", notes))
                logger.info(f'搜索关键词 {query} 笔记数量: {len(notes)}')
                for note in notes:
                    note_url = f"https://www.xiaohongshu.com/explore/{note['id']}?xsec_token={note['xsec_token']}"
                    note_list.append(note_url)
            if save_choice == 'all' or save_choice == 'excel':
                excel_name = query
            # Resume state is kept per query and filter combination (geo is not part of the key).
            state_key = f"done_note_ids:search:{query}:{sort_type_choice}:{note_type}:{note_time}:{note_range}:{pos_distance}"
            return self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name,
                                         proxies, state_store=state_store, state_key=state_key)
        except Exception as e:
            success = False
            msg = e
        logger.info(f'搜索关键词 {query} 笔记: {success}, msg: {msg}')
        return note_list, success, msg