# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause
"""NVIDIA Retrieval-Augmented Generation (RAG) service implementation.
Integrates with NVIDIA's Retrieval-Augmented Generation service to enhance responses
by incorporating knowledge from external documents. Features include:
- Document collection management
- Real-time retrieval and citation
- OpenAI-compatible LLM interface
- Configurable retrieval parameters
"""
import json
import httpx
from loguru import logger
from openai.types.chat import ChatCompletionMessageParam
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMMessagesFrame,
StartInterruptionFrame,
TextFrame,
VisionImageRawFrame,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.openai.llm import OpenAILLMService
from nvidia_pipecat.frames.nvidia_rag import NvidiaRAGCitation, NvidiaRAGCitationsFrame, NvidiaRAGSettingsFrame
class NvidiaRAGService(OpenAILLMService):
"""This is the base class for all services that use NVIDIA RAG/GenerativeAIExamples.
Requires deployed NVIDIA RAG server. For deployment instructions see:
https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/quickstart.md
Attributes:
collection_name: Document collection identifier.
rag_server_url: RAG API endpoint URL.
stop_words: Words that stop LLM generation.
temperature: Controls response randomness (0-1).
top_p: Token probability threshold (0-1).
max_tokens: Maximum response length.
use_knowledge_base: Whether to use RAG retrieval.
vdb_top_k: Number of chunks to retrieve.
reranker_top_k: Number of chunks to rerank.
enable_citations: Whether to return citations.
suffix_prompt: Text appended to last user message.
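
    Example:
        A minimal construction sketch (collection name and server URL are
        illustrative; adapt them to your deployment):

            rag = NvidiaRAGService(
                collection_name="my_docs",
                rag_server_url="http://localhost:8081",
            )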
"""
_shared_session: httpx.AsyncClient | None = None
def __init__(
self,
collection_name: str,
rag_server_url: str = "http://localhost:8081",
        stop_words: list[str] | None = None,
temperature: float = 0.2,
top_p: float = 0.7,
max_tokens: int = 1000,
use_knowledge_base: bool = True,
vdb_top_k: int = 20,
reranker_top_k: int = 4,
enable_citations: bool = True,
suffix_prompt: str | None = None,
session: httpx.AsyncClient | None = None,
**kwargs,
):
"""Initialize the NVIDIA RAG service.
Args:
collection_name: Document collection identifier.
rag_server_url: RAG API endpoint URL.
stop_words: Words that stop LLM generation.
temperature: Controls response randomness (0-1).
top_p: Token probability threshold (0-1).
max_tokens: Maximum response length.
use_knowledge_base: Whether to use RAG retrieval.
vdb_top_k: Number of chunks to retrieve.
reranker_top_k: Number of chunks to rerank.
enable_citations: Whether to return citations.
suffix_prompt: Text appended to last user message.
session: Optional httpx.AsyncClient. Creates new if None.
**kwargs: Additional arguments passed to OpenAILLMService.
"""
super().__init__(api_key="", **kwargs)
self.collection_name = collection_name
self.rag_server_url = rag_server_url
if stop_words is None:
stop_words = []
self.stop_words = stop_words
self.temperature = temperature
self.top_p = top_p
self.max_tokens = max_tokens
self.use_knowledge_base = use_knowledge_base
self.vdb_top_k = vdb_top_k
self.reranker_top_k = reranker_top_k
self.enable_citations = enable_citations
self.suffix_prompt = suffix_prompt
        self._external_client_session = session
        self._current_task = None
@property
def shared_session(self) -> httpx.AsyncClient:
"""Get the shared HTTP client session.
Returns:
httpx.AsyncClient: The shared session for making HTTP requests.
Creates a new session if none exists and no external session was provided.
"""
if self._external_client_session is not None:
return self._external_client_session
if NvidiaRAGService._shared_session is None:
NvidiaRAGService._shared_session = httpx.AsyncClient()
return NvidiaRAGService._shared_session
@shared_session.setter
def shared_session(self, shared_session: httpx.AsyncClient):
"""Set the shared HTTP client session.
Args:
shared_session: The httpx.AsyncClient to use for all instances.
"""
NvidiaRAGService._shared_session = shared_session
async def stop(self, frame: EndFrame):
"""Stop the NVIDIA RAG service and cleanup resources.
Args:
frame: The EndFrame that triggered the stop.
"""
await super().stop(frame)
if self._current_task:
await self.cancel_task(self._current_task)
async def cancel(self, frame: CancelFrame):
"""Cancel the NVIDIA RAG service and cleanup resources.
Args:
frame: The CancelFrame that triggered the cancellation.
"""
await super().cancel(frame)
if self._current_task:
await self.cancel_task(self._current_task)
async def cleanup(self):
"""Clean up resources used by the RAG service.
Closes the shared HTTP client session if it exists and performs parent cleanup.
"""
await super().cleanup()
await self._close_client_session()
async def _close_client_session(self):
"""Close the Client Session if it exists."""
if NvidiaRAGService._shared_session:
await NvidiaRAGService._shared_session.aclose()
NvidiaRAGService._shared_session = None
    async def _get_rag_response(self, request_json: dict) -> httpx.Response:
        """POST the generate request and return a streaming response."""
        # Stream with send(..., stream=True); a plain post() would buffer the
        # whole body and defeat per-chunk iteration and TTFB metrics.
        request = self.shared_session.build_request("POST", f"{self.rag_server_url}/generate", json=request_json)
        return await self.shared_session.send(request, stream=True)
async def _process_context(self, context: OpenAILLMContext):
"""Processes LLM context through RAG pipeline.
Args:
context: Contains conversation history and settings.
Raises:
            Exception: If an invalid message role is found, or if the query or collection name is empty.
"""
try:
messages: list[ChatCompletionMessageParam] = context.get_messages()
chat_details = []
for msg in messages:
if msg["role"] != "system" and msg["role"] != "user" and msg["role"] != "assistant":
raise Exception(f"Unexpected role {msg['role']} found!")
chat_details.append({"role": msg["role"], "content": msg["content"]})
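            # Append the configured suffix to the most recent user message only.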
if self.suffix_prompt:
for i in range(len(chat_details) - 1, -1, -1):
if chat_details[i]["role"] == "user":
chat_details[i]["content"] += f" {self.suffix_prompt}"
break
logger.debug(f"Chat details: {chat_details}")
            if not chat_details or all(msg["content"] == "" for msg in chat_details) or not self.collection_name:
                raise Exception("No query or collection name provided.")
"""
Call the RAG chain server and return the streaming response.
"""
request_json = {
"messages": chat_details,
"use_knowledge_base": self.use_knowledge_base,
"temperature": self.temperature,
"top_p": self.top_p,
"max_tokens": self.max_tokens,
"vdb_top_k": self.vdb_top_k,
"reranker_top_k": self.reranker_top_k,
"collection_name": self.collection_name,
"stop": self.stop_words,
"enable_citations": self.enable_citations,
}
await self.start_ttfb_metrics()
full_response = ""
resp = await self._get_rag_response(request_json)
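            # The server streams OpenAI-style SSE lines; an illustrative payload
            # (field names per the NVIDIA RAG blueprint) looks like:
            #   data: {"choices": [{"message": {"content": "..."}}], "citations": {"results": [...]}}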
try:
async for chunk in resp.aiter_lines():
await self.stop_ttfb_metrics()
citations = []
try:
chunk = chunk.strip("\n")
try:
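                            # SSE payload lines carry a 6-character "data: " prefix;
                            # strip it before JSON-decoding the chunk.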
if len(chunk) > 6:
parsed = json.loads(chunk[6:])
message = parsed["choices"][0]["message"]["content"]
if "citations" in parsed:
for citation in parsed["citations"]["results"]:
citations.append(
NvidiaRAGCitation(
document_type=str(citation["document_type"]),
document_id=str(citation["document_id"]),
document_name=str(citation["document_name"]),
content=str(citation["content"]).encode(),
metadata=str(citation["metadata"]),
score=float(citation["score"]),
)
)
else:
logger.warning(f"Received empty RAG response chunk '{chunk}'.")
message = ""
except Exception as e:
logger.debug(f"Parsing RAG response chunk failed. Error: {e}")
message = ""
if not message and not citations:
continue
full_response += message
if citations:
scores = [citation.score for citation in citations]
types = [citation.document_type for citation in citations]
logger.debug(f"Received total {len(citations)} RAG citations")
logger.debug(f"Received RAG citation types: {types}")
logger.debug(f"Received RAG citation scores: {scores}")
await self.push_frame(NvidiaRAGCitationsFrame(citations=citations))
if message:
await self.push_frame(TextFrame(message))
except Exception as e:
await self.push_error(ErrorFrame("Internal error in RAG stream: " + str(e)))
finally:
await resp.aclose()
logger.debug(f"Full RAG response: {full_response}")
        except Exception as e:
            logger.error(f"HTTP request to the RAG endpoint failed: {e}")
            await self.push_error(ErrorFrame(f"HTTP request to the RAG endpoint failed: {e}"))
async def _update_settings(self, settings):
"""Updates service settings.
Args:
settings: Dictionary of setting name-value pairs.
"""
for setting, value in settings.items():
logger.debug(f"Updating {setting} to {value} via NvidiaRAGSettingsFrame")
match setting:
case "collection_name":
self.collection_name = value
case "rag_server_url":
self.rag_server_url = value
case "stop_words":
self.stop_words = value
case "temperature":
self.temperature = value
case "top_p":
self.top_p = value
case "max_tokens":
self.max_tokens = value
case "use_knowledge_base":
self.use_knowledge_base = value
case "vdb_top_k":
self.vdb_top_k = value
case "reranker_top_k":
self.reranker_top_k = value
case "enable_citations":
self.enable_citations = value
case _:
logger.warning(f"Unknown setting for NvidiaRAG service: {setting}")
async def _process_context_and_frames(self, context: OpenAILLMContext):
"""Process context and handle start/end frames with metrics."""
await self.push_frame(LLMFullResponseStartFrame())
await self.start_processing_metrics()
await self._process_context(context)
await self.stop_processing_metrics()
await self.push_frame(LLMFullResponseEndFrame())
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Processes pipeline frames.
Handles settings updates and parent frame processing.
Args:
frame: Input frame to process.
direction: Frame processing direction.
"""
context = None
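        # Settings frames retune retrieval on the fly, e.g. (illustrative values):
        #   NvidiaRAGSettingsFrame(settings={"temperature": 0.5, "vdb_top_k": 10})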
if isinstance(frame, NvidiaRAGSettingsFrame):
await self._update_settings(frame.settings)
if isinstance(frame, OpenAILLMContextFrame):
context: OpenAILLMContext = frame.context
elif isinstance(frame, LLMMessagesFrame):
context = OpenAILLMContext.from_messages(frame.messages)
elif isinstance(frame, VisionImageRawFrame):
context = OpenAILLMContext()
context.add_image_frame_message(format=frame.format, size=frame.size, image=frame.image, text=frame.text)
elif isinstance(frame, StartInterruptionFrame):
if self._current_task is not None:
await self.cancel_task(self._current_task)
await self._start_interruption()
await self.stop_all_metrics()
await self.push_frame(frame)
else:
await super().process_frame(frame, direction)
        if context:
            # Cancel any in-flight generation before starting a new one so responses never interleave.
            if self._current_task is not None:
                await self.cancel_task(self._current_task)
            new_task = self.create_task(self._process_context_and_frames(context))
            self._current_task = new_task
            # Clear the reference only if it still points at this task; a stale callback
            # from a cancelled task must not clobber a newer one.
            new_task.add_done_callback(lambda t: setattr(self, "_current_task", None) if self._current_task is t else None)