Spaces:

rrizwan98
/

honcho-api

Runtime error

rrizwan98

Honcho self-hosted deployment for HF Spaces

66227af about 2 months ago

7.67 kB

	"""Runtime config planning and retry/fallback selection.

	Owns:
	- Resolution of ConfiguredModelSettings → ModelConfig.
	- Per-attempt planning (AttemptPlan) including primary/fallback selection and
	reasoning-effort/thinking-budget resolution.
	- Per-call effective config construction (applying caller kwarg overrides onto
	the selected ModelConfig).
	- Retry attempt tracking via a ContextVar, plus the temperature-bump heuristic.
	"""

	from __future__ import annotations

	import logging
	from contextvars import ContextVar
	from dataclasses import dataclass
	from typing import Any

	from src.config import (
	ConfiguredModelSettings,
	ModelConfig,
	ModelTransport,
	resolve_model_config,
	settings,
	)

	from .registry import backend_for_provider, client_for_model_config
	from .types import ProviderClient, ReasoningEffortType

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# ContextVar tracking the current retry attempt for provider switching.
	# Defaults to 0 when nothing has been set in the current context; it is read
	# by effective_temperature() to decide whether to bump the temperature.
	current_attempt: ContextVar[int] = ContextVar("current_attempt", default=0)


	def update_current_langfuse_observation(
	    provider: ModelTransport,
	    model: str,
	    *,
	    name: str | None = None,
	) -> None:
	    """Attach LLM routing metadata to the current Langfuse span, best-effort.

	    Does nothing when ``settings.LANGFUSE_PUBLIC_KEY`` is falsy. Any exception
	    raised while importing Langfuse or updating the span is logged at DEBUG
	    level and swallowed, so a telemetry failure never propagates to the caller.

	    Args:
	        provider: Transport recorded under the ``provider`` metadata key.
	        model: Model identifier recorded under the ``model`` metadata key.
	        name: Optional span name; forwarded only when it is not ``None``.
	    """
	    if not settings.LANGFUSE_PUBLIC_KEY:
	        return

	    try:
	        # Imported inside the try so that an import failure is swallowed too.
	        from langfuse import get_client

	        routing_metadata = {
	            "namespace": settings.NAMESPACE,
	            "provider": provider,
	            "model": model,
	        }
	        span_fields: dict[str, Any] = {"metadata": routing_metadata}
	        if name is not None:
	            span_fields["name"] = name
	        get_client().update_current_span(**span_fields)
	    except Exception as exc:  # pragma: no cover - best-effort telemetry
	        logger.debug("Failed to update Langfuse span metadata: %s", exc)


	@dataclass(frozen=True)
	class AttemptPlan:
	    """Immutable description of a single backend attempt.

	    Produced by `plan_attempt`. It supersedes the earlier loose six-element
	    tuple (`ProviderSelection`) and bundles everything the executor / tool
	    loop needs for one backend call, so configuration does not have to be
	    re-resolved mid-call.
	    """

	    # Transport of the config selected for this attempt.
	    provider: ModelTransport
	    # Model name taken from the selected config.
	    model: str
	    # Client obtained for the selected transport/config.
	    client: ProviderClient
	    # Thinking budget resolved for this attempt (None when unset).
	    thinking_budget_tokens: int | None
	    # Reasoning effort resolved for this attempt.
	    reasoning_effort: ReasoningEffortType
	    # The ModelConfig (primary or fallback-derived) chosen for this attempt.
	    selected_config: ModelConfig


	def resolve_runtime_model_config(
	    model_config: ModelConfig | ConfiguredModelSettings,
	) -> ModelConfig:
	    """Normalize the input to a runtime ``ModelConfig``.

	    A value that is already a ``ModelConfig`` is returned unchanged (same
	    object); anything else is passed through ``resolve_model_config``.
	    """
	    already_resolved = isinstance(model_config, ModelConfig)
	    return model_config if already_resolved else resolve_model_config(model_config)


	def select_model_config_for_attempt(
	    model_config: ModelConfig,
	    *,
	    attempt: int,
	    retry_attempts: int,
	) -> ModelConfig:
	    """Choose the config to use for the given attempt.

	    The primary ``model_config`` is returned unchanged for every attempt whose
	    number differs from ``retry_attempts``, and also on that final attempt
	    when no fallback is configured. Otherwise a new ``ModelConfig`` is built
	    from the fallback's fields, with its own ``fallback`` set to ``None``.

	    Args:
	        model_config: The primary runtime config.
	        attempt: Number of the attempt being planned.
	        retry_attempts: Attempt number at which the fallback takes over.

	    Returns:
	        Either ``model_config`` itself or a fresh fallback-derived config.
	    """
	    # Not the final attempt: stay on the primary without touching the fallback.
	    if attempt != retry_attempts:
	        return model_config

	    fallback_cfg = model_config.fallback
	    if fallback_cfg is None:
	        return model_config

	    return ModelConfig(
	        model=fallback_cfg.model,
	        transport=fallback_cfg.transport,
	        fallback=None,
	        api_key=fallback_cfg.api_key,
	        base_url=fallback_cfg.base_url,
	        temperature=fallback_cfg.temperature,
	        top_p=fallback_cfg.top_p,
	        top_k=fallback_cfg.top_k,
	        frequency_penalty=fallback_cfg.frequency_penalty,
	        presence_penalty=fallback_cfg.presence_penalty,
	        seed=fallback_cfg.seed,
	        thinking_effort=fallback_cfg.thinking_effort,
	        thinking_budget_tokens=fallback_cfg.thinking_budget_tokens,
	        provider_params=fallback_cfg.provider_params,
	        max_output_tokens=fallback_cfg.max_output_tokens,
	        stop_sequences=fallback_cfg.stop_sequences,
	        cache_policy=fallback_cfg.cache_policy,
	    )


	def plan_attempt(
	    *,
	    runtime_model_config: ModelConfig,
	    attempt: int,
	    retry_attempts: int,
	    call_thinking_budget_tokens: int | None,
	    call_reasoning_effort: ReasoningEffortType,
	) -> AttemptPlan:
	    """Assemble the ``AttemptPlan`` for one attempt.

	    The config is picked by ``select_model_config_for_attempt`` and a client is
	    obtained for it. While the selected config is still the primary one, the
	    caller-supplied reasoning params are used; once the fallback has been
	    selected, the fallback config's own values are used instead, so a
	    cross-transport fallback gets provider-appropriate params.

	    Args:
	        runtime_model_config: The primary runtime config.
	        attempt: Number of the attempt being planned.
	        retry_attempts: Attempt number at which the fallback takes over.
	        call_thinking_budget_tokens: Caller's thinking budget (primary only).
	        call_reasoning_effort: Caller's reasoning effort (primary only).

	    Returns:
	        The plan describing provider, model, client, and reasoning params.
	    """
	    chosen = select_model_config_for_attempt(
	        runtime_model_config,
	        attempt=attempt,
	        retry_attempts=retry_attempts,
	    )
	    transport = chosen.transport
	    provider_client = client_for_model_config(transport, chosen)

	    budget: int | None
	    effort: ReasoningEffortType
	    if chosen is runtime_model_config:
	        # Identity check: the selector hands back the same object for primary.
	        budget = call_thinking_budget_tokens
	        effort = call_reasoning_effort
	    else:
	        budget = chosen.thinking_budget_tokens
	        effort = chosen.thinking_effort

	    if attempt == retry_attempts and runtime_model_config.fallback is not None:
	        logger.warning(
	            f"Final retry attempt {attempt}/{retry_attempts}: switching from "
	            + f"{runtime_model_config.transport}/{runtime_model_config.model} to "
	            + f"backup {transport}/{chosen.model}"
	        )

	    return AttemptPlan(
	        provider=transport,
	        model=chosen.model,
	        client=provider_client,
	        thinking_budget_tokens=budget,
	        reasoning_effort=effort,
	        selected_config=chosen,
	    )


	def effective_config_for_call(
	    *,
	    selected_config: ModelConfig | None,
	    provider: ModelTransport,
	    model: str,
	    temperature: float | None,
	    stop_seqs: list[str] | None,
	    thinking_budget_tokens: int | None,
	    reasoning_effort: ReasoningEffortType,
	) -> ModelConfig:
	    """Produce the ModelConfig handed to the executor / request_builder.

	    Per-call kwargs (temperature, stop_seqs, thinking_*) take precedence
	    whenever they are not ``None``; otherwise the values already on
	    ``selected_config`` are kept. If ``selected_config`` is ``None``
	    (test-only callers that pass provider+model directly), a minimal
	    ModelConfig is synthesized from the arguments.

	    On the copied config ``max_output_tokens`` is always reset to ``None`` so
	    the per-call max_tokens kwarg is authoritative — matching historical
	    honcho_llm_call_inner behavior.
	    """
	    if selected_config is None:
	        return ModelConfig(
	            model=model,
	            transport=provider,
	            temperature=temperature,
	            stop_sequences=stop_seqs,
	            thinking_budget_tokens=thinking_budget_tokens,
	            thinking_effort=reasoning_effort,
	        )

	    # Candidate overrides keyed by the ModelConfig field each one targets.
	    per_call_overrides: dict[str, Any] = {
	        "temperature": temperature,
	        "stop_sequences": stop_seqs,
	        "thinking_budget_tokens": thinking_budget_tokens,
	        "thinking_effort": reasoning_effort,
	    }
	    updates: dict[str, Any] = {"max_output_tokens": None}
	    # Only explicit (non-None) overrides win; falsy values such as 0.0 count.
	    updates.update(
	        {
	            field: value
	            for field, value in per_call_overrides.items()
	            if value is not None
	        }
	    )
	    return selected_config.model_copy(update=updates)


	def effective_temperature(temperature: float | None) -> float | None:
	    """Return the temperature to use, nudging 0.0 up to 0.2 on retries.

	    The bump applies only when ``temperature`` equals 0.0 and the
	    ``current_attempt`` context variable holds a value greater than 1; in
	    every other case the input is returned unchanged (including ``None``).
	    """
	    needs_bump = temperature == 0.0 and current_attempt.get() > 1
	    if not needs_bump:
	        return temperature

	    logger.debug("Bumping temperature from 0.0 to 0.2 on retry")
	    return 0.2


	def resolve_backend_for_plan(plan: AttemptPlan) -> Any:
	    """Turn an ``AttemptPlan`` into a backend via ``backend_for_provider``.

	    Convenience wrapper that forwards the plan's ``provider`` and ``client``.
	    """
	    transport, provider_client = plan.provider, plan.client
	    return backend_for_provider(transport, provider_client)


	# Public API of this module: the names exported by `from <module> import *`.
	__all__ = [
	"AttemptPlan",
	"current_attempt",
	"effective_config_for_call",
	"effective_temperature",
	"plan_attempt",
	"resolve_backend_for_plan",
	"resolve_runtime_model_config",
	"select_model_config_for_attempt",
	"update_current_langfuse_observation",
	]