Spaces:

rrizwan98
/

honcho-api

Runtime error

rrizwan98

Honcho self-hosted deployment for HF Spaces

66227af about 2 months ago

19.4 kB

	"""
	Core Dialectic Agent implementation.

	This agent uses tools to gather context from the memory system
	and synthesize responses to queries about a peer.
	"""

	import logging
	import time
	import uuid
	from collections.abc import AsyncIterator, Callable
	from typing import Any, cast

	from src import crud
	from src.config import ConfiguredModelSettings, ReasoningLevel, settings
	from src.dependencies import tracked_db
	from src.dialectic import prompts
	from src.embedding_client import embedding_client
	from src.llm import (
	HonchoLLMCallResponse,
	StreamingResponseWithMetadata,
	honcho_llm_call,
	)
	from src.telemetry import prometheus_metrics
	from src.telemetry.events import DialecticCompletedEvent, emit
	from src.telemetry.logging import (
	accumulate_metric,
	log_performance_metrics,
	log_token_usage_metrics,
	)
	from src.telemetry.prometheus.metrics import DialecticComponents, TokenTypes
	from src.utils.agent_tools import (
	DIALECTIC_TOOLS,
	DIALECTIC_TOOLS_MINIMAL,
	create_tool_executor,
	search_memory,
	)
	from src.utils.formatting import format_new_turn_with_timestamp

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)


	def _get_dialectic_level_model_config(
	    reasoning_level: ReasoningLevel,
	) -> ConfiguredModelSettings:
	    """
	    Look up the model configuration for a dialectic reasoning level.

	    Args:
	        reasoning_level: Key into ``settings.DIALECTIC.LEVELS``

	    Returns:
	        The ``MODEL_CONFIG`` attached to that level's settings entry
	    """
	    level_settings = settings.DIALECTIC.LEVELS[reasoning_level]
	    return level_settings.MODEL_CONFIG


	class DialecticAgent:
	    """
	    An agentic dialectic that iteratively gathers context to answer queries.

	    Unlike the standard dialectic which pre-gathers all context before a single
	    LLM call, this agent uses tools to strategically gather only the context
	    needed to answer the specific query.
	    """

	    def __init__(
	        self,
	        workspace_name: str,
	        session_name: str | None,
	        observer: str,
	        observed: str,
	        observer_peer_card: list[str] | None = None,
	        observed_peer_card: list[str] | None = None,
	        metric_key: str | None = None,
	        reasoning_level: ReasoningLevel = "low",
	    ):
	        """
	        Initialize the dialectic agent.

	        Args:
	            workspace_name: Workspace identifier
	            session_name: Session identifier (may be None for global queries)
	            observer: The peer making the query
	            observed: The peer being queried about
	            observer_peer_card: Biographical information about the observer
	            observed_peer_card: Biographical information about the observed peer
	            metric_key: Optional key for logging metrics (if provided, agent won't log separately)
	            reasoning_level: Level of reasoning to apply
	        """
	        self.workspace_name: str = workspace_name
	        self.session_name: str | None = session_name
	        self.observer: str = observer
	        self.observed: str = observed
	        self.observer_peer_card: list[str] | None = observer_peer_card
	        self.observed_peer_card: list[str] | None = observed_peer_card
	        self.metric_key: str | None = metric_key
	        self.reasoning_level: ReasoningLevel = reasoning_level

	        # Conversation history starts with the system prompt; later steps
	        # append session history to it and add the user message after it.
	        self.messages: list[dict[str, str]] = [
	            {
	                "role": "system",
	                "content": prompts.agent_system_prompt(
	                    observer, observed, observer_peer_card, observed_peer_card
	                ),
	            }
	        ]
	        self._session_history_initialized: bool = False
	        self._prefetched_conclusion_count: int = 0
	        # Always generated (independent of metric_key) for event correlation:
	        # passed to the tool executor and to the completion telemetry event.
	        self._run_id: str = str(uuid.uuid4())[:8]

	    async def _initialize_session_history(self) -> None:
	        """Fetch and inject session history into the system prompt if configured."""
	        if self._session_history_initialized:
	            return
	        # The flag is set before any work happens, so the fetch is attempted
	        # at most once per agent even when it is skipped or raises.
	        self._session_history_initialized = True

	        max_tokens = settings.DIALECTIC.SESSION_HISTORY_MAX_TOKENS
	        if max_tokens == 0 or not self.session_name:
	            return

	        # Fetch recent messages up to the token limit
	        stmt = await crud.get_messages(
	            workspace_name=self.workspace_name,
	            session_name=self.session_name,
	            token_limit=max_tokens,
	            reverse=False,  # chronological order
	        )
	        async with tracked_db("dialectic.session_history") as db:
	            result = await db.execute(stmt)
	            messages = result.scalars().all()

	            if not messages:
	                return

	            # Format messages for injection (must access ORM attrs before session closes)
	            formatted_messages: list[str] = [
	                format_new_turn_with_timestamp(
	                    msg.content, msg.created_at, msg.peer_name
	                )
	                for msg in messages
	            ]

	        history_body = "\n".join(formatted_messages)
	        session_history_section = (
	            "\n\n## SESSION HISTORY\n\n"
	            "The following is the recent conversation history from this session. "
	            "Use this as immediate context when answering the query.\n\n"
	            "<session_history>\n"
	            f"{history_body}\n"
	            "</session_history>"
	        )

	        # Append session history to the system prompt
	        self.messages[0]["content"] += session_history_section

	    async def _prefetch_relevant_observations(self, query: str) -> str | None:
	        """
	        Prefetch semantically relevant observations for the query.

	        This provides immediate context to the agent without requiring
	        tool calls, improving response quality and speed.

	        Performs two separate searches to prevent retrieval dilution:
	        - Explicit observations (produced by deriver)
	        - Higher-level observations (produced in dreaming/background/chat)

	        The number of observations fetched depends on reasoning level:
	        - minimal: 10 of each type (reduced context for cost savings)
	        - all others: 25 of each type

	        Any exception raised while embedding or searching is logged as a
	        warning and treated as "no observations" (best-effort prefetch).

	        Args:
	            query: The user's query

	        Returns:
	            Formatted observations string or None if no observations found
	        """
	        # Use reduced prefetch for minimal reasoning to save tokens
	        prefetch_limit = 10 if self.reasoning_level == "minimal" else 25

	        try:
	            # Pre-compute embedding once for both searches (no DB needed)
	            query_embedding = await embedding_client.embed(query)

	            # search_memory manages its own short-lived DB sessions so no
	            # connection is held during external vector-store calls.
	            explicit_repr = await search_memory(
	                workspace_name=self.workspace_name,
	                observer=self.observer,
	                observed=self.observed,
	                query=query,
	                limit=prefetch_limit,
	                levels=["explicit"],
	                embedding=query_embedding,
	            )

	            derived_repr = await search_memory(
	                workspace_name=self.workspace_name,
	                observer=self.observer,
	                observed=self.observed,
	                query=query,
	                limit=prefetch_limit,
	                levels=["deductive", "inductive", "contradiction"],
	                embedding=query_embedding,
	            )

	            if explicit_repr.is_empty() and derived_repr.is_empty():
	                return None

	            # Count prefetched conclusions for telemetry
	            # NOTE(review): only .explicit and .deductive are counted, although
	            # the second search also requests "inductive" and "contradiction"
	            # levels - confirm whether those are exposed under .deductive or
	            # are missing from this count.
	            explicit_count = len(explicit_repr.explicit) + len(explicit_repr.deductive)
	            derived_count = len(derived_repr.explicit) + len(derived_repr.deductive)
	            self._prefetched_conclusion_count = explicit_count + derived_count

	            # Format as two separate sections
	            parts: list[str] = []

	            if not explicit_repr.is_empty():
	                parts.append(explicit_repr.format_as_markdown(include_ids=False))

	            if not derived_repr.is_empty():
	                # Include IDs for derived so agent can use get_reasoning_chain
	                parts.append(derived_repr.format_as_markdown(include_ids=True))

	            return "\n".join(parts)

	        except Exception as e:
	            # Deliberate best-effort: the agent can still answer using tools.
	            logger.warning("Failed to prefetch observations: %s", e)
	            return None

	    async def _prepare_query(
	        self, query: str
	    ) -> tuple[Callable[[str, dict[str, Any]], Any], str, str | None, float]:
	        """
	        Prepare common state for answering a query.

	        Handles session history initialization, metrics setup, observation prefetching,
	        user message construction, and tool executor creation.

	        Args:
	            query: The question to answer about the peer

	        Returns:
	            A tuple of (tool_executor, task_name, run_id, start_time).
	            ``run_id`` is None when a caller-provided ``metric_key`` is used
	            as the task name; otherwise it is a fresh 8-character id that is
	            separate from ``self._run_id``.
	        """
	        await self._initialize_session_history()

	        run_id: str | None = None
	        if self.metric_key:
	            task_name = self.metric_key
	        else:
	            run_id = str(uuid.uuid4())[:8]
	            task_name = f"dialectic_chat_{run_id}"
	        start_time = time.perf_counter()

	        accumulate_metric(
	            task_name,
	            "context",
	            (
	                f"workspace: {self.workspace_name}\n"
	                f"session: {self.session_name or '(global)'}\n"
	                f"observer: {self.observer}\n"
	                f"observed: {self.observed}\n"
	                f"reasoning_level: {self.reasoning_level}"
	            ),
	            "blob",
	        )
	        accumulate_metric(task_name, "query", query, "blob")

	        prefetched_observations = await self._prefetch_relevant_observations(query)

	        if prefetched_observations:
	            user_content = (
	                f"Query: {query}\n\n"
	                f"## Relevant Observations (prefetched)\n"
	                f"The following observations were found to be semantically relevant to your query. "
	                f"Use these as primary context. You may still use tools to find additional information if needed.\n\n"
	                f"{prefetched_observations}"
	            )
	            accumulate_metric(
	                task_name, "prefetched_observations", prefetched_observations, "blob"
	            )
	        else:
	            user_content = f"Query: {query}"

	        self.messages.append({"role": "user", "content": user_content})

	        tool_executor: Callable[
	            [str, dict[str, Any]], Any
	        ] = await create_tool_executor(
	            workspace_name=self.workspace_name,
	            session_name=self.session_name,
	            observer=self.observer,
	            observed=self.observed,
	            history_token_limit=settings.DIALECTIC.HISTORY_TOKEN_LIMIT,
	            run_id=self._run_id,
	            agent_type="dialectic",
	            parent_category="dialectic",
	        )

	        return tool_executor, task_name, run_id, start_time

	    def _resolve_level_settings(self) -> tuple[Any, Any, Any]:
	        """
	        Resolve the per-reasoning-level inputs shared by both answer paths.

	        Returns:
	            A tuple of (level_settings, tools, max_tokens) where
	            ``level_settings`` is the ``settings.DIALECTIC.LEVELS`` entry for
	            this agent's reasoning level, ``tools`` is the tool list to offer
	            the model, and ``max_tokens`` is the output-token cap.
	        """
	        level_settings = settings.DIALECTIC.LEVELS[self.reasoning_level]

	        # Use minimal tools for minimal reasoning to reduce cost
	        tools = (
	            DIALECTIC_TOOLS_MINIMAL
	            if self.reasoning_level == "minimal"
	            else DIALECTIC_TOOLS
	        )
	        # Use level-specific max_output_tokens if set, otherwise global default
	        max_tokens = (
	            level_settings.MAX_OUTPUT_TOKENS
	            if level_settings.MAX_OUTPUT_TOKENS is not None
	            else settings.DIALECTIC.MAX_OUTPUT_TOKENS
	        )
	        return level_settings, tools, max_tokens

	    def _log_response_metrics(
	        self,
	        task_name: str,
	        run_id: str | None,
	        start_time: float,
	        response_content: str,
	        input_tokens: int,
	        output_tokens: int,
	        cache_read_input_tokens: int | None,
	        cache_creation_input_tokens: int | None,
	        tool_calls_count: int,
	        thinking_content: str | None,
	        iterations: int,
	    ) -> None:
	        """
	        Log metrics common to both streaming and non-streaming responses.

	        Args:
	            task_name: Metrics task identifier
	            run_id: Run identifier (None if using caller-provided metric_key)
	            start_time: Start time from time.perf_counter()
	            response_content: The full response text
	            input_tokens: Input token count (actual from API)
	            output_tokens: Output token count (actual from API)
	            cache_read_input_tokens: Cache read tokens (if any)
	            cache_creation_input_tokens: Cache creation tokens (if any)
	            tool_calls_count: Number of tool calls made
	            thinking_content: Thinking trace content (if any)
	            iterations: Number of iterations in the tool execution loop
	        """
	        # Missing cache counters are reported as zero everywhere below.
	        cache_read_tokens = cache_read_input_tokens or 0
	        cache_creation_tokens = cache_creation_input_tokens or 0

	        accumulate_metric(task_name, "tool_calls", tool_calls_count, "count")

	        if thinking_content:
	            accumulate_metric(task_name, "thinking", thinking_content, "blob")

	        log_token_usage_metrics(
	            task_name,
	            input_tokens,
	            output_tokens,
	            cache_read_tokens,
	            cache_creation_tokens,
	        )
	        accumulate_metric(task_name, "response", response_content, "blob")

	        elapsed_ms = (time.perf_counter() - start_time) * 1000
	        accumulate_metric(task_name, "total_duration", elapsed_ms, "ms")

	        # Only emit the performance log here when this agent owns the task
	        # name; with a caller-provided metric_key this step is skipped.
	        if not self.metric_key and run_id is not None:
	            log_performance_metrics("dialectic_chat", run_id)

	        # Prometheus metrics
	        if settings.METRICS.ENABLED:
	            prometheus_metrics.record_dialectic_tokens(
	                count=input_tokens,
	                token_type=TokenTypes.INPUT.value,
	                component=DialecticComponents.TOTAL.value,
	                reasoning_level=self.reasoning_level,
	            )
	            prometheus_metrics.record_dialectic_tokens(
	                count=output_tokens,
	                token_type=TokenTypes.OUTPUT.value,
	                component=DialecticComponents.TOTAL.value,
	                reasoning_level=self.reasoning_level,
	            )

	        # Emit telemetry event
	        emit(
	            DialecticCompletedEvent(
	                run_id=self._run_id,
	                workspace_name=self.workspace_name,
	                peer_name=self.observed,
	                session_name=self.session_name,
	                reasoning_level=self.reasoning_level,
	                total_iterations=iterations,
	                prefetched_conclusion_count=self._prefetched_conclusion_count,
	                tool_calls_count=tool_calls_count,
	                total_duration_ms=elapsed_ms,
	                input_tokens=input_tokens,
	                output_tokens=output_tokens,
	                cache_read_tokens=cache_read_tokens,
	                cache_creation_tokens=cache_creation_tokens,
	            )
	        )

	    async def answer(self, query: str) -> str:
	        """
	        Answer a query about the peer using agentic tool calling.

	        The agent will:
	        1. Receive the query
	        2. Use tools to gather relevant context
	        3. Synthesize a response grounded in the gathered context

	        Args:
	            query: The question to answer about the peer

	        Returns:
	            The synthesized answer string
	        """
	        tool_executor, task_name, run_id, start_time = await self._prepare_query(query)
	        level_settings, tools, max_tokens = self._resolve_level_settings()

	        response: HonchoLLMCallResponse[str] = await honcho_llm_call(
	            model_config=_get_dialectic_level_model_config(self.reasoning_level),
	            prompt="",  # Ignored since we pass messages
	            max_tokens=max_tokens,
	            tools=tools,
	            tool_choice=level_settings.TOOL_CHOICE,
	            tool_executor=tool_executor,
	            max_tool_iterations=level_settings.MAX_TOOL_ITERATIONS,
	            messages=self.messages,
	            track_name="Dialectic Agent",
	            max_input_tokens=settings.DIALECTIC.MAX_INPUT_TOKENS,
	            trace_name="dialectic_chat",
	        )

	        self._log_response_metrics(
	            task_name=task_name,
	            run_id=run_id,
	            start_time=start_time,
	            response_content=response.content,
	            input_tokens=response.input_tokens,
	            output_tokens=response.output_tokens,
	            cache_read_input_tokens=response.cache_read_input_tokens,
	            cache_creation_input_tokens=response.cache_creation_input_tokens,
	            tool_calls_count=len(response.tool_calls_made),
	            thinking_content=response.thinking_content,
	            iterations=response.iterations,
	        )

	        return response.content

	    async def answer_stream(self, query: str) -> AsyncIterator[str]:
	        """
	        Answer a query about the peer using agentic tool calling, streaming the response.

	        The agent will:
	        1. Receive the query
	        2. Use tools to gather relevant context (non-streaming)
	        3. Stream the synthesized response

	        Metrics are logged after the stream has been fully consumed, using
	        the concatenation of all yielded chunks as the response content.

	        Args:
	            query: The question to answer about the peer

	        Yields:
	            Chunks of the response text as they are generated
	        """
	        tool_executor, task_name, run_id, start_time = await self._prepare_query(query)
	        level_settings, tools, max_tokens = self._resolve_level_settings()

	        response = cast(
	            StreamingResponseWithMetadata,
	            await honcho_llm_call(
	                model_config=_get_dialectic_level_model_config(self.reasoning_level),
	                prompt="",  # Ignored since we pass messages
	                max_tokens=max_tokens,
	                stream=True,
	                stream_final_only=True,
	                tools=tools,
	                tool_choice=level_settings.TOOL_CHOICE,
	                tool_executor=tool_executor,
	                max_tool_iterations=level_settings.MAX_TOOL_ITERATIONS,
	                messages=self.messages,
	                track_name="Dialectic Agent Stream",
	                max_input_tokens=settings.DIALECTIC.MAX_INPUT_TOKENS,
	                trace_name="dialectic_chat",
	            ),
	        )

	        # Chunks are kept so the full text can be recorded in metrics.
	        accumulated_content: list[str] = []
	        async for chunk in response:
	            if chunk.content:
	                accumulated_content.append(chunk.content)
	                yield chunk.content

	        self._log_response_metrics(
	            task_name=task_name,
	            run_id=run_id,
	            start_time=start_time,
	            response_content="".join(accumulated_content),
	            input_tokens=response.input_tokens,
	            output_tokens=response.output_tokens,
	            cache_read_input_tokens=response.cache_read_input_tokens,
	            cache_creation_input_tokens=response.cache_creation_input_tokens,
	            tool_calls_count=len(response.tool_calls_made),
	            thinking_content=response.thinking_content,
	            iterations=response.iterations,
	        )