Spaces:

build-small-hackathon
/

multi-agent-lab

Running on Zero

multi-agent-lab / tests /test_instructor.py

agharsallah

feat: Implement audience-only secret badge for Twenty Sprouts game

f6566bb 16 days ago

12.8 kB

	"""Validated structured-output tests — fully offline, instructor + litellm faked.

	No network and no real credentials. Three layers are covered:

	* ``build_output_model`` is pure Pydantic: it constrains ``kind`` to the
	allowed grant and requires ``text`` (+ extra fields).
	* ``LiteLLMProvider.complete_structured`` wraps a faked
	``instructor.from_litellm`` client and reads tokens + cost from the raw
	completion, mirroring ``complete``.
	* ``ManifestAgent`` takes the structured path when the provider offers
	``complete_structured`` (validated payload, no ``_raw_fallback``) and the
	tolerant-parser path with the deterministic stub.
	"""

	from __future__ import annotations

	import sys
	import types
	from dataclasses import dataclass

	import pytest
	from pydantic import BaseModel, ValidationError

	from src.agents.base import ManifestAgent
	from src.core.manifest import AgentManifest
	from src.core.projections import StageProjection
	from src.core.structured import AgentOutputError, build_output_model
	from src.models.litellm_provider import LiteLLMProvider
	from src.models.router import ModelRouter


	# ── build_output_model (pure Pydantic) ─────────────────────────────────────────


	class TestBuildOutputModel:
	def test_accepts_valid_kind(self):
	model = build_output_model(["agent.spoke", "judge.verdict"])
	out = model(kind="agent.spoke", text="I collect echoes.")
	assert out.kind == "agent.spoke"
	assert out.text == "I collect echoes."

	def test_rejects_kind_not_in_allowed(self):
	model = build_output_model(["agent.spoke"])
	with pytest.raises(ValidationError):
	model(kind="not.real", text="oops")

	def test_single_kind_still_constrains(self):
	model = build_output_model(["world.observed"])
	assert model(kind="world.observed", text="A booth opens.").kind == "world.observed"
	with pytest.raises(ValidationError):
	model(kind="judge.verdict", text="x")

	def test_extra_fields_required(self):
	model = build_output_model(["agent.spoke"], ["emotion"])
	out = model(kind="agent.spoke", text="hi", emotion="puzzled")
	assert out.emotion == "puzzled"
	with pytest.raises(ValidationError):
	model(kind="agent.spoke", text="hi") # emotion missing

	def test_text_required(self):
	model = build_output_model(["agent.spoke"])
	with pytest.raises(ValidationError):
	model(kind="agent.spoke")

	def test_empty_allowed_kinds_raises(self):
	with pytest.raises(AgentOutputError):
	build_output_model([])

	def test_is_subclass_of_basemodel(self):
	model = build_output_model(["agent.spoke"])
	assert issubclass(model, BaseModel)


	# ── fake instructor + litellm for the provider ──────────────────────────────────


	@dataclass
	class _FakeUsage:
	prompt_tokens: int = 11
	completion_tokens: int = 7
	total_tokens: int = 18


	class _FakeRawCompletion:
	"""Raw completion Instructor returns alongside the parsed model."""

	def __init__(self, *, hidden_cost: float \| None = None) -> None:
	self.usage = _FakeUsage()
	self._hidden_params = {} if hidden_cost is None else {"response_cost": hidden_cost}


	class _FakeInstructorClient:
	def __init__(self, *, hidden_cost=None, raise_exc=None, record=None) -> None:
	self._hidden_cost = hidden_cost
	self._raise = raise_exc
	self._record = record

	def create_with_completion(self, , response_model, *kwargs):
	if self._record is not None:
	self._record.update(kwargs)
	self._record["response_model"] = response_model
	if self._raise is not None:
	raise self._raise
	# Instructor returns a validated instance of the requested model.
	result = response_model(kind=response_model.model_fields["kind"].annotation.__args__[0], text="a mossy booth")
	return result, _FakeRawCompletion(hidden_cost=self._hidden_cost)


	def _install_fakes(monkeypatch, *, client, from_litellm_kw: dict \| None = None) -> None:
	"""Inject fake ``instructor`` (from_litellm -> client) and ``litellm`` modules.

	from_litellm_kw, when given, records the kwargs ``complete_structured`` passes to
	``instructor.from_litellm`` (e.g. the chosen ``mode``) for assertion.
	"""
	fake_litellm = types.ModuleType("litellm")
	fake_litellm.completion = lambda **kw: None

	def _completion_cost(completion_response=None, **_kw):
	return 0.0

	fake_litellm.completion_cost = _completion_cost

	def _from_litellm(completion, **kw):
	if from_litellm_kw is not None:
	from_litellm_kw.update(kw)
	return client

	fake_instructor = types.ModuleType("instructor")
	fake_instructor.from_litellm = _from_litellm
	# Mode is an enum on the real package; a name->value stand-in is enough for the
	# provider's ``getattr(instructor.Mode, structured_mode.upper())`` resolution.
	fake_instructor.Mode = types.SimpleNamespace(JSON_SCHEMA="json_schema", JSON="json", TOOLS="tools")

	monkeypatch.setitem(sys.modules, "litellm", fake_litellm)
	monkeypatch.setitem(sys.modules, "instructor", fake_instructor)


	# ── provider.complete_structured ────────────────────────────────────────────────


	class TestCompleteStructured:
	def test_returns_validated_model_and_captures_usage(self, monkeypatch):
	_install_fakes(monkeypatch, client=_FakeInstructorClient())
	provider = LiteLLMProvider(model="openai/m", api_base="https://x/v1")
	model = build_output_model(["world.observed"])
	out = provider.complete_structured("scene-whisperer", "grow the wood", model)
	assert isinstance(out, model)
	assert out.kind == "world.observed"
	assert provider.last_usage["prompt_tokens"] == 11
	assert provider.last_usage["completion_tokens"] == 7
	assert provider.last_usage["total_tokens"] == 18

	def test_captures_cost_from_hidden_params(self, monkeypatch):
	_install_fakes(monkeypatch, client=_FakeInstructorClient(hidden_cost=0.05))
	provider = LiteLLMProvider(model="openai/m", api_base="https://x/v1")
	provider.complete_structured("echo", "drop a pebble", build_output_model(["agent.spoke"]))
	assert provider.last_usage["cost_usd"] == pytest.approx(0.05)
	assert provider.last_cost == pytest.approx(0.05)

	def test_passes_response_model_and_retries(self, monkeypatch):
	record: dict = {}
	_install_fakes(monkeypatch, client=_FakeInstructorClient(record=record))
	provider = LiteLLMProvider(model="openai/m", api_base="https://x/v1", max_retries=4)
	model = build_output_model(["world.observed"])
	provider.complete_structured("seedkeeper", "observe", model)
	assert record["response_model"] is model
	assert record["max_retries"] == 4
	assert record["model"] == "openai/m"
	roles = [m["role"] for m in record["messages"]]
	assert roles == ["system", "user"]

	def test_defaults_to_guided_json_schema_mode(self, monkeypatch):
	# Guided decoding, not tool calling: a model with no tool-call parser (e.g. MiniCPM)
	# still validates instead of 400ing. The mode rides on from_litellm, not the call.
	kw: dict = {}
	_install_fakes(monkeypatch, client=_FakeInstructorClient(), from_litellm_kw=kw)
	provider = LiteLLMProvider(model="openai/m", api_base="https://x/v1")
	provider.complete_structured("echo", "x", build_output_model(["agent.spoke"]))
	assert kw["mode"] == "json_schema"

	def test_structured_mode_override_is_honored(self, monkeypatch):
	kw: dict = {}
	_install_fakes(monkeypatch, client=_FakeInstructorClient(), from_litellm_kw=kw)
	provider = LiteLLMProvider(model="openai/m", api_base="https://x/v1", structured_mode="tools")
	provider.complete_structured("echo", "x", build_output_model(["agent.spoke"]))
	assert kw["mode"] == "tools"

	def test_error_zeroes_usage_and_reraises(self, monkeypatch):
	_install_fakes(monkeypatch, client=_FakeInstructorClient(raise_exc=RuntimeError("boom")))
	provider = LiteLLMProvider(model="openai/m", api_base="https://x/v1")
	with pytest.raises(RuntimeError):
	provider.complete_structured("echo", "x", build_output_model(["agent.spoke"]))
	assert provider.last_usage["total_tokens"] == 0
	assert provider.last_usage["cost_usd"] == 0.0


	# ── ManifestAgent path selection ────────────────────────────────────────────────


	class _Agent(ManifestAgent):
	manifest = AgentManifest(
	name="scene-whisperer",
	persona="You grow the wood in one strange sentence.",
	may_emit=["world.observed"],
	model_profile="tiny",
	)


	@dataclass
	class _StructuredProvider:
	"""Stand-in live provider exposing complete_structured."""

	last_usage: dict = None # type: ignore[assignment]
	seen_model: object = None

	def __post_init__(self):
	self.last_usage = {
	"prompt_tokens": 5,
	"completion_tokens": 3,
	"total_tokens": 8,
	"cost_usd": 0.002,
	}

	def complete_structured(self, role, prompt, response_model):
	self.seen_model = response_model
	return response_model(kind="world.observed", text="A booth opens in a root.")

	def complete(self, role, prompt): # pragma: no cover - must not be reached
	raise AssertionError("structured path must not call complete()")


	@dataclass
	class _FixedRouter:
	provider: object

	def for_profile(self, profile):
	return self.provider


	class TestManifestAgentStructuredPath:
	def test_uses_structured_path_when_available(self):
	provider = _StructuredProvider()
	agent = _Agent(_FixedRouter(provider))
	ev = agent.act("r", 1, StageProjection(seed="moss"), ())
	assert ev.kind == "world.observed"
	assert ev.payload["text"] == "A booth opens in a root."
	# The validated path never wraps prose, so no fallback marker is present.
	assert "_raw_fallback" not in ev.payload
	# Cost/tokens flowed through from the provider for the Governor.
	assert agent.last_usage["cost_usd"] == pytest.approx(0.002)
	assert agent.last_usage["total_tokens"] == 8
	# The constructed model was constrained to the manifest's may_emit.
	assert provider.seen_model.model_fields["kind"].annotation.__args__ == ("world.observed",)

	def test_deterministic_stub_uses_parser_path(self):
	# Offline router yields the stub, which has no complete_structured: the
	# tolerant parser runs and (for prose) marks the fallback.
	agent = _Agent(ModelRouter(offline=True))
	provider = agent.router.for_profile("tiny")
	assert not hasattr(provider, "complete_structured")
	ev = agent.act("r", 1, StageProjection(seed="moss"), ())
	assert ev.kind == "world.observed" # coerced to the only allowed kind
	assert ev.payload.get("_raw_fallback") is True
	assert agent.last_usage["total_tokens"] > 0

	def test_structured_failure_falls_back_to_parser(self):
	# If the live structured call raises, the agent still produces an event
	# via the parser path rather than dropping the turn.
	class _FailingProvider:
	def __init__(self):
	self.last_usage = {"total_tokens": 0, "cost_usd": 0.0}
	self.calls = []

	def complete_structured(self, role, prompt, response_model):
	raise RuntimeError("validation exhausted")

	def complete(self, role, prompt):
	self.calls.append(prompt)
	self.last_usage = {
	"prompt_tokens": 4,
	"completion_tokens": 2,
	"total_tokens": 6,
	"cost_usd": 0.0,
	}
	return '{"kind": "world.observed", "text": "fallback line"}'

	provider = _FailingProvider()
	agent = _Agent(_FixedRouter(provider))
	ev = agent.act("r", 1, StageProjection(), ())
	assert ev.kind == "world.observed"
	assert ev.payload["text"] == "fallback line"
	assert provider.calls, "fallback should call complete()"
	assert agent.last_usage["total_tokens"] == 6