| import os |
| import urllib.parse |
|
|
| from loguru import logger |
|
|
| from apis.xhs_pc_apis import XHS_Apis |
| from xhs_utils.data_util import handle_note_info, download_note, save_to_xlsx |
| from xhs_utils.response_guard import get_dict, get_list |
|
|
|
|
class Data_Spider():
    """Coordinate note crawling, optional media download and Excel export.

    All remote calls go through an ``XHS_Apis`` instance; normalisation and
    persistence are delegated to ``handle_note_info``, ``download_note`` and
    ``save_to_xlsx``.
    """

    def __init__(self):
        self.xhs_apis = XHS_Apis()

    def spider_note(self, note_url: str, cookies_str: str, proxies=None):
        """Fetch one note and normalise it with ``handle_note_info``.

        Args:
            note_url: URL of the note; also stored on the note under ``'url'``.
            cookies_str: Cookie string forwarded to the API client.
            proxies: Optional proxy settings forwarded to the API client.

        Returns:
            A ``(success, msg, note_info)`` tuple. ``note_info`` is the
            processed note only when ``success`` is true. On failure it is
            whatever was bound at the point of failure (``None``, the raw API
            payload, or an unprocessed item) and ``msg`` may be the caught
            exception object rather than a string.
        """
        note_info = None
        try:
            success, msg, note_info = self.xhs_apis.get_note_info(note_url, cookies_str, proxies)
            if success:
                data = get_dict(note_info, "data", context="get_note_info")
                items = get_list(data, "items", context="get_note_info")
                if not items:
                    raise ValueError("empty_items")
                note_info = items[0]
                note_info['url'] = note_url
                note_info = handle_note_info(note_info)
        except Exception as e:
            # Best-effort boundary: any failure is reported through the tuple.
            success = False
            msg = e
        logger.info(f'爬取笔记信息 {note_url}: {success}, msg: {msg}')
        return success, msg, note_info

    def _note_id_from_url(self, note_url: str):
        """Return the last non-empty path segment of ``note_url``.

        The result is used as the key for "already done" bookkeeping. When no
        segment can be extracted (parsing fails, or the path is empty) the
        original ``note_url`` is returned so distinct URLs never collapse onto
        the same empty key.
        """
        try:
            path = urllib.parse.urlparse(note_url).path
            # Strip trailing slashes so ".../explore/<id>/" still yields <id>.
            note_id = path.rstrip("/").split("/")[-1]
        except Exception:
            return note_url
        return note_id or note_url

    def spider_some_note(self, notes: list, cookies_str: str, base_path: dict, save_choice: str,
                         excel_name: str = '', proxies=None, state_store=None,
                         state_key: str = "done_note_ids"):
        """Crawl a list of note URLs and persist the successfully fetched ones.

        Args:
            notes: Note URLs to crawl.
            cookies_str: Cookie string forwarded to ``spider_note``.
            base_path: Output directories; ``base_path['media']`` and
                ``base_path['excel']`` are read depending on ``save_choice``.
            save_choice: ``'all'`` or ``'excel'`` write a workbook; ``'all'`` or
                any value containing ``'media'`` triggers ``download_note``.
            excel_name: Workbook name without extension; required when a
                workbook is written.
            proxies: Optional proxy settings forwarded to ``spider_note``.
            state_store: Optional object providing ``get_set(key)`` and
                ``add_to_set(key, value)``; notes whose id is in the stored set
                are skipped and newly fetched ids are recorded.
            state_key: Key under which done ids are stored in ``state_store``.

        Returns:
            A dict with ``total``, ``skipped``, ``ok``, ``failed`` counters and
            ``fail_reasons`` mapping ``str(msg)`` to its occurrence count.

        Raises:
            ValueError: If a workbook is requested but ``excel_name`` is empty.
        """
        wants_excel = save_choice in ('all', 'excel')
        if wants_excel and excel_name == '':
            raise ValueError('excel_name 不能为空')
        # Substring test is intentional: any choice containing 'media' downloads.
        wants_media = save_choice == 'all' or 'media' in save_choice

        done = set()
        if state_store is not None:
            # Assumes get_set returns a set-like container; a falsy result is
            # treated as "nothing done yet" instead of breaking the `in` test.
            done = state_store.get_set(state_key) or set()

        note_list = []
        total = 0
        skipped = 0
        ok = 0
        failed = 0
        fail_reasons: dict[str, int] = {}
        for note_url in notes:
            total += 1
            note_id = self._note_id_from_url(note_url)
            if note_id in done:
                skipped += 1
                continue
            success, msg, note_info = self.spider_note(note_url, cookies_str, proxies)
            if note_info is not None and success:
                note_list.append(note_info)
                ok += 1
                if state_store is not None:
                    # Recorded as soon as the fetch succeeds, i.e. before any
                    # media download or workbook write below.
                    state_store.add_to_set(state_key, note_id)
            else:
                failed += 1
                reason = str(msg)
                fail_reasons[reason] = fail_reasons.get(reason, 0) + 1

        if wants_media:
            for note_info in note_list:
                download_note(note_info, base_path['media'], save_choice)
        if wants_excel:
            file_path = os.path.abspath(os.path.join(base_path['excel'], f'{excel_name}.xlsx'))
            save_to_xlsx(note_list, file_path)
        return {
            "total": total,
            "skipped": skipped,
            "ok": ok,
            "failed": failed,
            "fail_reasons": fail_reasons,
        }

    def spider_user_all_note(self, user_url: str, cookies_str: str, base_path: dict, save_choice: str,
                             excel_name: str = '', proxies=None, state_store=None):
        """Crawl every note listed for a user profile.

        Args:
            user_url: Profile URL; its last path segment (query stripped) is
                used for the state key and as the default workbook name.
            cookies_str: Cookie string forwarded to the API client.
            base_path: Output directories, forwarded to ``spider_some_note``.
            save_choice: Persistence mode, forwarded to ``spider_some_note``.
            excel_name: Workbook name; when empty and a workbook is requested,
                the user segment of ``user_url`` is used.
            proxies: Optional proxy settings.
            state_store: Optional done-id store, forwarded to
                ``spider_some_note``.

        Returns:
            The statistics dict from ``spider_some_note`` on the normal path.
            If an exception is caught, a ``(note_list, success, msg)`` tuple is
            returned instead. NOTE(review): the two shapes differ; callers must
            handle both.
        """
        note_list = []
        try:
            success, msg, all_note_info = self.xhs_apis.get_user_all_notes(user_url, cookies_str, proxies)
            if success:
                logger.info(f'用户 {user_url} 作品数量: {len(all_note_info)}')
                for simple_note_info in all_note_info:
                    note_url = f"https://www.xiaohongshu.com/explore/{simple_note_info['note_id']}?xsec_token={simple_note_info['xsec_token']}"
                    note_list.append(note_url)
            else:
                # Surface the API-level failure; otherwise msg is lost because
                # the empty statistics dict is still returned below.
                logger.info(f'爬取用户所有视频 {user_url}: {success}, msg: {msg}')
            user_id = user_url.split('/')[-1].split('?')[0]
            # Only derive a name when the caller did not supply one.
            if save_choice in ('all', 'excel') and not excel_name:
                excel_name = user_id
            state_key = f"done_note_ids:user:{user_id}"
            return self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies,
                                         state_store=state_store, state_key=state_key)
        except Exception as e:
            success = False
            msg = e
        logger.info(f'爬取用户所有视频 {user_url}: {success}, msg: {msg}')
        return note_list, success, msg

    def spider_some_search_note(self, query: str, require_num: int, cookies_str: str, base_path: dict,
                                save_choice: str, sort_type_choice=0, note_type=0, note_time=0,
                                note_range=0, pos_distance=0, geo: dict = None, excel_name: str = '',
                                proxies=None, state_store=None):
        """Search notes by keyword and crawl the results.

        Args:
            query: Search keyword; also the default workbook name.
            require_num: Number of results requested from the search API.
            cookies_str: Cookie string forwarded to the API client.
            base_path: Output directories, forwarded to ``spider_some_note``.
            save_choice: Persistence mode, forwarded to ``spider_some_note``.
            sort_type_choice, note_type, note_time, note_range, pos_distance,
            geo: Search filters passed through unchanged to the search API; all
                but ``geo`` are also part of the state key.
            excel_name: Workbook name; when empty and a workbook is requested,
                ``query`` is used.
            proxies: Optional proxy settings.
            state_store: Optional done-id store, forwarded to
                ``spider_some_note``.

        Returns:
            The statistics dict from ``spider_some_note`` on the normal path.
            If an exception is caught, a ``(note_list, success, msg)`` tuple is
            returned instead. NOTE(review): the two shapes differ; callers must
            handle both.
        """
        note_list = []
        try:
            success, msg, notes = self.xhs_apis.search_some_note(query, require_num, cookies_str, sort_type_choice, note_type, note_time, note_range, pos_distance, geo, proxies)
            if success:
                # Keep only entries the API tags as notes.
                notes = [entry for entry in notes if entry['model_type'] == "note"]
                logger.info(f'搜索关键词 {query} 笔记数量: {len(notes)}')
                for note in notes:
                    note_url = f"https://www.xiaohongshu.com/explore/{note['id']}?xsec_token={note['xsec_token']}"
                    note_list.append(note_url)
            else:
                # Surface the API-level failure; otherwise msg is lost because
                # the empty statistics dict is still returned below.
                logger.info(f'搜索关键词 {query} 笔记: {success}, msg: {msg}')
            # Only derive a name when the caller did not supply one.
            if save_choice in ('all', 'excel') and not excel_name:
                excel_name = query
            state_key = f"done_note_ids:search:{query}:{sort_type_choice}:{note_type}:{note_time}:{note_range}:{pos_distance}"
            return self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies,
                                         state_store=state_store, state_key=state_key)
        except Exception as e:
            success = False
            msg = e
        logger.info(f'搜索关键词 {query} 笔记: {success}, msg: {msg}')
        return note_list, success, msg
|
|