Spaces:

luoleyuan
/

XHS

Sleeping

Trae Bot

Upload Spider_XHS project

c481f8a about 1 month ago

5.78 kB

	import os
	import urllib.parse

	from loguru import logger

	from apis.xhs_pc_apis import XHS_Apis
	from xhs_utils.data_util import handle_note_info, download_note, save_to_xlsx
	from xhs_utils.response_guard import get_dict, get_list


	class Data_Spider():
	    """Crawl Xiaohongshu notes via ``XHS_Apis`` and save them as media and/or Excel.

	    The methods build on each other: ``spider_note`` fetches one note,
	    ``spider_some_note`` fetches a list of note URLs and persists the
	    results, and the ``spider_user_all_note`` / ``spider_some_search_note``
	    entry points first collect note URLs and then delegate to
	    ``spider_some_note``.
	    """

	    def __init__(self):
	        self.xhs_apis = XHS_Apis()

	    def spider_note(self, note_url: str, cookies_str: str, proxies=None):
	        """Fetch a single note and post-process it with ``handle_note_info``.

	        Args:
	            note_url: URL of the note; also stored under the ``'url'`` key
	                of the note before post-processing.
	            cookies_str: Cookie string forwarded to the API client.
	            proxies: Optional proxy configuration forwarded to the API client.

	        Returns:
	            A ``(success, msg, note_info)`` tuple. When any step raises,
	            ``success`` is ``False`` and ``msg`` is the exception object;
	            ``note_info`` is then whatever had been assigned before the
	            failure (``None`` if the API call itself raised).
	        """
	        note_info = None
	        try:
	            success, msg, note_info = self.xhs_apis.get_note_info(note_url, cookies_str, proxies)
	            if success:
	                data = get_dict(note_info, "data", context="get_note_info")
	                items = get_list(data, "items", context="get_note_info")
	                if not items:
	                    # Raised inside the try so it is reported through the
	                    # same (success=False, msg=exception) path as other errors.
	                    raise ValueError("empty_items")
	                note_info = items[0]
	                note_info['url'] = note_url
	                note_info = handle_note_info(note_info)
	        except Exception as e:
	            success = False
	            msg = e
	        logger.info(f'爬取笔记信息 {note_url}: {success}, msg: {msg}')
	        return success, msg, note_info

	    def _note_id_from_url(self, note_url: str):
	        """Return the key used to remember that ``note_url`` was already crawled.

	        The key is the last non-empty path segment of the URL (the query
	        string is ignored). When no segment can be extracted -- the URL has
	        no path, or parsing fails -- the URL itself is returned, so that
	        distinct URLs never collapse onto one shared empty key.
	        """
	        try:
	            path = urllib.parse.urlparse(note_url).path
	            # Strip trailing slashes first: ".../explore/<id>/" must map to
	            # "<id>", not to the empty string after the final "/".
	            note_id = path.rstrip("/").rsplit("/", 1)[-1]
	        except Exception:
	            # Best effort: any unparsable input is used as its own key.
	            return note_url
	        return note_id or note_url

	    def spider_some_note(
	        self,
	        notes: list,
	        cookies_str: str,
	        base_path: dict,
	        save_choice: str,
	        excel_name: str = '',
	        proxies=None,
	        state_store=None,
	        state_key: str = "done_note_ids",
	    ):
	        """Crawl every URL in ``notes`` and persist the successful ones.

	        Args:
	            notes: Note URLs to crawl.
	            cookies_str: Cookie string forwarded to ``spider_note``.
	            base_path: Output directories; ``base_path['media']`` is read when
	                media is downloaded and ``base_path['excel']`` when a
	                workbook is written.
	            save_choice: ``'all'`` saves media and Excel, ``'excel'`` saves
	                only the workbook, and any value containing ``'media'``
	                downloads media.
	            excel_name: Workbook name without extension; required when
	                ``save_choice`` is ``'all'`` or ``'excel'``.
	            proxies: Optional proxy configuration forwarded to ``spider_note``.
	            state_store: Optional object providing ``get_set(key)`` and
	                ``add_to_set(key, value)``; used to skip notes crawled in a
	                previous run.
	            state_key: Key under which finished note ids are kept in
	                ``state_store``.

	        Returns:
	            A dict with the counters ``total``, ``skipped``, ``ok``,
	            ``failed`` and ``fail_reasons`` (failure message -> count).

	        Raises:
	            ValueError: If a workbook is requested but ``excel_name`` is empty.
	        """
	        wants_excel = save_choice == 'all' or save_choice == 'excel'
	        if wants_excel and excel_name == '':
	            raise ValueError('excel_name 不能为空')

	        note_list = []
	        done = set()
	        if state_store is not None:
	            done = state_store.get_set(state_key)

	        total = 0
	        skipped = 0
	        ok = 0
	        failed = 0
	        fail_reasons: dict[str, int] = {}
	        for note_url in notes:
	            total += 1
	            note_id = self._note_id_from_url(note_url)
	            if note_id in done:
	                skipped += 1
	                continue
	            success, msg, note_info = self.spider_note(note_url, cookies_str, proxies)
	            if note_info is not None and success:
	                note_list.append(note_info)
	                ok += 1
	                # NOTE(review): the id is recorded as done before the media /
	                # Excel output below is written -- confirm this is intended.
	                if state_store is not None:
	                    state_store.add_to_set(state_key, note_id)
	            else:
	                failed += 1
	                reason = str(msg)
	                fail_reasons[reason] = fail_reasons.get(reason, 0) + 1

	        if save_choice == 'all' or 'media' in save_choice:
	            for note_info in note_list:
	                download_note(note_info, base_path['media'], save_choice)
	        if wants_excel:
	            file_path = os.path.abspath(os.path.join(base_path['excel'], f'{excel_name}.xlsx'))
	            save_to_xlsx(note_list, file_path)

	        return {
	            "total": total,
	            "skipped": skipped,
	            "ok": ok,
	            "failed": failed,
	            "fail_reasons": fail_reasons,
	        }

	    def spider_user_all_note(
	        self,
	        user_url: str,
	        cookies_str: str,
	        base_path: dict,
	        save_choice: str,
	        excel_name: str = '',
	        proxies=None,
	        state_store=None,
	    ):
	        """Crawl all notes of the user at ``user_url``.

	        The note URLs are collected through the API client and handed to
	        ``spider_some_note``. When a workbook is requested, ``excel_name`` is
	        replaced by the last path segment of ``user_url`` (query removed);
	        the same segment is part of the resume key.

	        Returns:
	            The statistics dict of ``spider_some_note`` on the normal path.
	            If an exception is raised along the way, the tuple
	            ``(note_list, success, msg)`` with ``success`` set to ``False``
	            and ``msg`` set to the exception. Callers must handle both shapes.
	        """
	        note_list = []
	        try:
	            success, msg, all_note_info = self.xhs_apis.get_user_all_notes(user_url, cookies_str, proxies)
	            if success:
	                logger.info(f'用户 {user_url} 作品数量: {len(all_note_info)}')
	                for simple_note_info in all_note_info:
	                    note_url = f"https://www.xiaohongshu.com/explore/{simple_note_info['note_id']}?xsec_token={simple_note_info['xsec_token']}"
	                    note_list.append(note_url)
	            # NOTE(review): nesting was reconstructed; the steps below are
	            # assumed to run even when the listing call reports failure
	            # (note_list is then empty) -- confirm against the original file.
	            user_id = user_url.split('/')[-1].split('?')[0]
	            if save_choice == 'all' or save_choice == 'excel':
	                excel_name = user_id
	            state_key = f"done_note_ids:user:{user_id}"
	            return self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies, state_store=state_store, state_key=state_key)
	        except Exception as e:
	            success = False
	            msg = e
	        logger.info(f'爬取用户所有视频 {user_url}: {success}, msg: {msg}')
	        return note_list, success, msg

	    def spider_some_search_note(
	        self,
	        query: str,
	        require_num: int,
	        cookies_str: str,
	        base_path: dict,
	        save_choice: str,
	        sort_type_choice=0,
	        note_type=0,
	        note_time=0,
	        note_range=0,
	        pos_distance=0,
	        geo: dict = None,
	        excel_name: str = '',
	        proxies=None,
	        state_store=None,
	    ):
	        """Search for ``query`` and crawl the notes among the results.

	        Only results whose ``model_type`` equals ``"note"`` are kept. When a
	        workbook is requested, ``excel_name`` is replaced by ``query``. The
	        resume key combines the query with the sort/filter arguments, so
	        each distinct combination tracks its finished notes separately.

	        Returns:
	            The statistics dict of ``spider_some_note`` on the normal path.
	            If an exception is raised along the way, the tuple
	            ``(note_list, success, msg)`` with ``success`` set to ``False``
	            and ``msg`` set to the exception. Callers must handle both shapes.
	        """
	        note_list = []
	        try:
	            success, msg, notes = self.xhs_apis.search_some_note(query, require_num, cookies_str, sort_type_choice, note_type, note_time, note_range, pos_distance, geo, proxies)
	            if success:
	                notes = [item for item in notes if item['model_type'] == "note"]
	                logger.info(f'搜索关键词 {query} 笔记数量: {len(notes)}')
	                for note in notes:
	                    note_url = f"https://www.xiaohongshu.com/explore/{note['id']}?xsec_token={note['xsec_token']}"
	                    note_list.append(note_url)
	            # NOTE(review): nesting was reconstructed; the steps below are
	            # assumed to run even when the search call reports failure
	            # (note_list is then empty) -- confirm against the original file.
	            if save_choice == 'all' or save_choice == 'excel':
	                excel_name = query
	            state_key = f"done_note_ids:search:{query}:{sort_type_choice}:{note_type}:{note_time}:{note_range}:{pos_distance}"
	            return self.spider_some_note(note_list, cookies_str, base_path, save_choice, excel_name, proxies, state_store=state_store, state_key=state_key)
	        except Exception as e:
	            success = False
	            msg = e
	        logger.info(f'搜索关键词 {query} 笔记: {success}, msg: {msg}')
	        return note_list, success, msg