# XHS / xhs_utils / spider.py
# Trae Bot
# Upload Spider_XHS project
# c481f8a
import os
import urllib.parse
from loguru import logger
from apis.xhs_pc_apis import XHS_Apis
from xhs_utils.data_util import handle_note_info, download_note, save_to_xlsx
from xhs_utils.response_guard import get_dict, get_list
class Data_Spider():
    """Crawl notes through ``XHS_Apis`` and optionally save media / Excel output.

    The three public entry points (`spider_some_note`, `spider_user_all_note`,
    `spider_some_search_note`) accept an optional ``state_store``. Only two
    methods are used on it: ``get_set(key)`` and ``add_to_set(key, value)``.
    It lets a later run skip notes that were already crawled successfully.
    """

    def __init__(self):
        self.xhs_apis = XHS_Apis()

    @staticmethod
    def _build_note_url(note_id, xsec_token):
        """Return the explore-page URL for a note id and its xsec token."""
        return f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}"

    def spider_note(self, note_url: str, cookies_str: str, proxies=None):
        """Fetch one note and run it through ``handle_note_info``.

        Args:
            note_url: URL of the note to fetch.
            cookies_str: Cookie string forwarded to the API client.
            proxies: Optional proxy configuration forwarded to the API client.

        Returns:
            A ``(success, msg, note_info)`` tuple. Any exception raised while
            fetching or processing is caught: ``success`` becomes ``False`` and
            ``msg`` is the exception object. On failure ``note_info`` is
            whatever was last assigned (``None`` or the raw API response), so
            callers must check ``success`` before using it.
        """
        note_info = None
        try:
            success, msg, note_info = self.xhs_apis.get_note_info(note_url, cookies_str, proxies)
            if success:
                data = get_dict(note_info, "data", context="get_note_info")
                items = get_list(data, "items", context="get_note_info")
                if not items:
                    raise ValueError("empty_items")
                note_info = items[0]
                note_info['url'] = note_url
                note_info = handle_note_info(note_info)
        except Exception as e:
            # Boundary of the per-note crawl: report the failure to the caller
            # instead of aborting a whole batch.
            success = False
            msg = e
        logger.info(f'爬取笔记信息 {note_url}: {success}, msg: {msg}')
        return success, msg, note_info

    def _note_id_from_url(self, note_url: str):
        """Return the last path segment of ``note_url`` as the note id.

        Trailing slashes are ignored. When the URL cannot be parsed, or it has
        no usable path segment, ``note_url`` itself is returned so that
        distinct URLs never collapse onto the same (empty) id.
        """
        try:
            path = urllib.parse.urlparse(note_url).path
            note_id = path.rstrip("/").rsplit("/", 1)[-1]
        except (ValueError, TypeError, AttributeError):
            return note_url
        return note_id or note_url

    def spider_some_note(self, notes: list, cookies_str: str, base_path: dict, save_choice: str, excel_name: str = '', proxies=None, state_store=None, state_key: str = "done_note_ids"):
        """Crawl a batch of note URLs and save the results per ``save_choice``.

        Args:
            notes: Note URLs to crawl.
            cookies_str: Cookie string forwarded to the API client.
            base_path: Output directories; ``base_path['media']`` and
                ``base_path['excel']`` are read when the matching output is
                requested.
            save_choice: ``'all'``, ``'excel'``, or any value containing
                ``'media'`` selects the corresponding output.
            excel_name: Workbook name without extension; required when Excel
                output is requested.
            proxies: Optional proxy configuration forwarded to the API client.
            state_store: Optional store of already-crawled note ids.
            state_key: Key under which ids are kept in ``state_store``.

        Returns:
            A dict with ``total``, ``skipped``, ``ok``, ``failed`` counters and
            ``fail_reasons`` (failure message -> count). Notes already recorded
            in the store, and repeated ids within ``notes``, count as skipped.

        Raises:
            ValueError: Excel output is requested but ``excel_name`` is empty.
        """
        if (save_choice == 'all' or save_choice == 'excel') and excel_name == '':
            raise ValueError('excel_name 不能为空')

        # Work on a local copy so ids finished during this call also suppress
        # duplicates later in the same batch, with or without a state store.
        done = set()
        if state_store is not None:
            done = set(state_store.get_set(state_key) or ())

        note_list = []
        total = 0
        skipped = 0
        ok = 0
        failed = 0
        fail_reasons: dict[str, int] = {}
        for note_url in notes:
            total += 1
            note_id = self._note_id_from_url(note_url)
            if note_id in done:
                skipped += 1
                continue
            success, msg, note_info = self.spider_note(note_url, cookies_str, proxies)
            if success and note_info is not None:
                note_list.append(note_info)
                ok += 1
                done.add(note_id)
                if state_store is not None:
                    state_store.add_to_set(state_key, note_id)
            else:
                failed += 1
                reason = str(msg)
                fail_reasons[reason] = fail_reasons.get(reason, 0) + 1

        if note_list and (save_choice == 'all' or 'media' in save_choice):
            for note_info in note_list:
                download_note(note_info, base_path['media'], save_choice)

        # Nothing new was crawled (e.g. a fully resumed run): do not hand an
        # empty list to save_to_xlsx. NOTE(review): presumably save_to_xlsx
        # replaces an existing file, so this also avoids clobbering a
        # previous export with an empty workbook — confirm.
        if note_list and (save_choice == 'all' or save_choice == 'excel'):
            file_path = os.path.abspath(os.path.join(base_path['excel'], f'{excel_name}.xlsx'))
            save_to_xlsx(note_list, file_path)

        return {
            "total": total,
            "skipped": skipped,
            "ok": ok,
            "failed": failed,
            "fail_reasons": fail_reasons,
        }

    def spider_user_all_note(self, user_url: str, cookies_str: str, base_path: dict, save_choice: str, excel_name: str = '', proxies=None, state_store=None):
        """Crawl every note listed for a user.

        When Excel output is requested, ``excel_name`` is replaced by the last
        path segment of ``user_url`` (query string removed). The same segment
        scopes the resume key passed to `spider_some_note`.

        Returns:
            The statistics dict from `spider_some_note` when no exception is
            raised. If anything raises, the tuple
            ``(note_list, False, exception)`` is returned instead, where
            ``note_list`` holds the note URLs built so far.
        """
        note_list = []
        try:
            success, msg, all_note_info = self.xhs_apis.get_user_all_notes(user_url, cookies_str, proxies)
            if success:
                logger.info(f'用户 {user_url} 作品数量: {len(all_note_info)}')
                for simple_note_info in all_note_info:
                    note_list.append(self._build_note_url(simple_note_info['note_id'], simple_note_info['xsec_token']))
            user_id = user_url.split('/')[-1].split('?')[0]
            if save_choice == 'all' or save_choice == 'excel':
                excel_name = user_id
            state_key = f"done_note_ids:user:{user_id}"
            stats = self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies, state_store=state_store, state_key=state_key)
        except Exception as e:
            success = False
            msg = e
        else:
            # Log the listing outcome on the normal path as well, not only
            # when an exception was raised.
            logger.info(f'爬取用户所有视频 {user_url}: {success}, msg: {msg}')
            return stats
        logger.info(f'爬取用户所有视频 {user_url}: {success}, msg: {msg}')
        return note_list, success, msg

    def spider_some_search_note(self, query: str, require_num: int, cookies_str: str, base_path: dict, save_choice: str, sort_type_choice=0, note_type=0, note_time=0, note_range=0, pos_distance=0, geo: dict = None, excel_name: str = '', proxies=None, state_store=None):
        """Search for notes by keyword and crawl the matching notes.

        Only search results whose ``model_type`` equals ``"note"`` are crawled.
        When Excel output is requested, ``excel_name`` is replaced by
        ``query``. The resume key is built from the query and the
        sort/type/time/range/distance filters.

        Returns:
            The statistics dict from `spider_some_note` when no exception is
            raised. If anything raises, the tuple
            ``(note_list, False, exception)`` is returned instead, where
            ``note_list`` holds the note URLs built so far.
        """
        note_list = []
        try:
            success, msg, notes = self.xhs_apis.search_some_note(query, require_num, cookies_str, sort_type_choice, note_type, note_time, note_range, pos_distance, geo, proxies)
            if success:
                notes = [item for item in notes if item['model_type'] == "note"]
                logger.info(f'搜索关键词 {query} 笔记数量: {len(notes)}')
                for note in notes:
                    note_list.append(self._build_note_url(note['id'], note['xsec_token']))
            if save_choice == 'all' or save_choice == 'excel':
                excel_name = query
            state_key = f"done_note_ids:search:{query}:{sort_type_choice}:{note_type}:{note_time}:{note_range}:{pos_distance}"
            stats = self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies, state_store=state_store, state_key=state_key)
        except Exception as e:
            success = False
            msg = e
        else:
            # Log the search outcome on the normal path as well, not only
            # when an exception was raised.
            logger.info(f'搜索关键词 {query} 笔记: {success}, msg: {msg}')
            return stats
        logger.info(f'搜索关键词 {query} 笔记: {success}, msg: {msg}')
        return note_list, success, msg