| """Zep Graph 分页读取工具。 |
| |
| Zep 的 node/edge 列表接口使用 UUID cursor 分页, |
| 本模块封装自动翻页逻辑(含单页重试),对调用方透明地返回完整列表。 |
| """ |
|
|
| from __future__ import annotations |
|
|
| import time |
| from collections.abc import Callable |
| from typing import Any |
|
|
| from zep_cloud import InternalServerError |
| from zep_cloud.client import Zep |
|
|
| from .logger import get_logger |
|
|
| logger = get_logger('mirofish.zep_paging') |
|
|
| _DEFAULT_PAGE_SIZE = 100 |
| _MAX_NODES = 2000 |
| _DEFAULT_MAX_RETRIES = 3 |
| _DEFAULT_RETRY_DELAY = 2.0 |
|
|
|
|
| def _fetch_page_with_retry( |
| api_call: Callable[..., list[Any]], |
| *args: Any, |
| max_retries: int = _DEFAULT_MAX_RETRIES, |
| retry_delay: float = _DEFAULT_RETRY_DELAY, |
| page_description: str = "page", |
| **kwargs: Any, |
| ) -> list[Any]: |
| """单页请求,失败时指数退避重试。仅重试网络/IO类瞬态错误。""" |
| if max_retries < 1: |
| raise ValueError("max_retries must be >= 1") |
|
|
| last_exception: Exception | None = None |
| delay = retry_delay |
|
|
| for attempt in range(max_retries): |
| try: |
| return api_call(*args, **kwargs) |
| except (ConnectionError, TimeoutError, OSError, InternalServerError) as e: |
| last_exception = e |
| if attempt < max_retries - 1: |
| logger.warning( |
| f"Zep {page_description} attempt {attempt + 1} failed: {str(e)[:100]}, retrying in {delay:.1f}s..." |
| ) |
| time.sleep(delay) |
| delay *= 2 |
| else: |
| logger.error(f"Zep {page_description} failed after {max_retries} attempts: {str(e)}") |
|
|
| assert last_exception is not None |
| raise last_exception |
|
|
|
|
def fetch_all_nodes(
    client: Zep,
    graph_id: str,
    page_size: int = _DEFAULT_PAGE_SIZE,
    max_items: int = _MAX_NODES,
    max_retries: int = _DEFAULT_MAX_RETRIES,
    retry_delay: float = _DEFAULT_RETRY_DELAY,
) -> list[Any]:
    """Page through a graph's nodes, returning at most ``max_items`` (default 2000).

    Pagination follows the Zep UUID-cursor scheme: each request passes the
    ``uuid`` of the previous page's last node.  Every page request carries its
    own retry logic via ``_fetch_page_with_retry``.
    """
    collected: list[Any] = []
    next_cursor: str | None = None
    page_index = 0

    while True:
        page_index += 1
        request_kwargs: dict[str, Any] = {"limit": page_size}
        if next_cursor is not None:
            request_kwargs["uuid_cursor"] = next_cursor

        page = _fetch_page_with_retry(
            client.graph.node.get_by_graph_id,
            graph_id,
            max_retries=max_retries,
            retry_delay=retry_delay,
            page_description=f"fetch nodes page {page_index} (graph={graph_id})",
            **request_kwargs,
        )
        # Empty page: server has nothing more to give.
        if not page:
            return collected

        collected += page
        if len(collected) >= max_items:
            # Trim any overshoot from the final page before stopping.
            del collected[max_items:]
            logger.warning(f"Node count reached limit ({max_items}), stopping pagination for graph {graph_id}")
            return collected
        # A short page means this was the last one.
        if len(page) < page_size:
            return collected

        tail = page[-1]
        # SDK models expose the id as `uuid_`; fall back to `uuid` for safety.
        next_cursor = getattr(tail, "uuid_", None) or getattr(tail, "uuid", None)
        if next_cursor is None:
            logger.warning(f"Node missing uuid field, stopping pagination at {len(collected)} nodes")
            return collected
|
|
|
|
def fetch_all_edges(
    client: Zep,
    graph_id: str,
    page_size: int = _DEFAULT_PAGE_SIZE,
    max_retries: int = _DEFAULT_MAX_RETRIES,
    retry_delay: float = _DEFAULT_RETRY_DELAY,
    max_items: int | None = None,
) -> list[Any]:
    """Page through all edges of a graph and return the complete list.

    Each page request carries its own retry logic via ``_fetch_page_with_retry``.
    Unlike ``fetch_all_nodes`` the result is unbounded by default; pass
    ``max_items`` to cap it (new, backward-compatible parameter added for
    consistency with the node fetcher).

    Args:
        client: Zep SDK client.
        graph_id: The graph whose edges are listed.
        page_size: Number of edges requested per page.
        max_retries: Per-page retry attempts for transient errors.
        retry_delay: Initial per-page retry delay; doubles after each failure.
        max_items: Optional upper bound on the number of edges returned;
            ``None`` (default) means no limit — identical to prior behavior.

    Returns:
        The graph's edges in API pagination order.
    """
    all_edges: list[Any] = []
    cursor: str | None = None
    page_num = 0

    while True:
        kwargs: dict[str, Any] = {"limit": page_size}
        if cursor is not None:
            kwargs["uuid_cursor"] = cursor

        page_num += 1
        batch = _fetch_page_with_retry(
            client.graph.edge.get_by_graph_id,
            graph_id,
            max_retries=max_retries,
            retry_delay=retry_delay,
            page_description=f"fetch edges page {page_num} (graph={graph_id})",
            **kwargs,
        )
        if not batch:
            break

        all_edges.extend(batch)
        # Optional cap, mirroring fetch_all_nodes: trim overshoot and stop.
        if max_items is not None and len(all_edges) >= max_items:
            all_edges = all_edges[:max_items]
            logger.warning(f"Edge count reached limit ({max_items}), stopping pagination for graph {graph_id}")
            break
        # A short page means this was the last one.
        if len(batch) < page_size:
            break

        # SDK models expose the id as `uuid_`; fall back to `uuid` for safety.
        cursor = getattr(batch[-1], "uuid_", None) or getattr(batch[-1], "uuid", None)
        if cursor is None:
            logger.warning(f"Edge missing uuid field, stopping pagination at {len(all_edges)} edges")
            break

    return all_edges
|
|