Spaces:

zequn-fireworks
/

fireaction-a2a

Paused

Commit

d5f3acc

1 Parent(s): 8a068d0

Add structured execution trace to A2A task responses

Introduces opt-in execution trace data (contracts_called, planner_steps,
cost_metrics) returned as a DataPart artifact. Controlled per-request via
metadata.include_trace or server-wide via FIREACTION_TRACE_ENABLED env var.
Supports eval scoring and production debugging.

Made-with: Cursor

Files changed (8) hide show

src/fireaction_a2a/agent.py +34 -3
src/fireaction_a2a/planner.py +134 -6
src/fireaction_a2a/runner.py +73 -12
src/fireaction_a2a/server.py +9 -1
tests/test_agent.py +148 -0
tests/test_planner_retry.py +166 -1
tests/test_runner.py +153 -3
tests/test_stripe_flow.py +4 -4

src/fireaction_a2a/agent.py CHANGED Viewed

@@ -12,7 +12,7 @@ from typing import Any
 from a2a.server.agent_execution import AgentExecutor, RequestContext
 from a2a.server.events import EventQueue
 from a2a.server.tasks import TaskUpdater
-from a2a.types import Part, TaskState, TextPart
 from a2a.utils import new_agent_text_message, new_task
 from fireaction_a2a.planner import Planner
@@ -23,10 +23,28 @@ logger = logging.getLogger(__name__)
 class ProviderAgentExecutor(AgentExecutor):
     """A2A executor that delegates to a :class:`Planner`."""
-    def __init__(self, planner: Planner) -> None:
         self.planner = planner
         self.conversations: dict[str, list[dict[str, Any]]] = {}
     async def execute(
         self,
         context: RequestContext,
@@ -41,12 +59,15 @@ class ProviderAgentExecutor(AgentExecutor):
         updater = TaskUpdater(event_queue, task.id, task.context_id)
         history = self.conversations.get(task.context_id, [])
         await updater.start_work(
             new_agent_text_message("Planning...", task.context_id, task.id)
         )
-        async for step in self.planner.run(user_input, history):
             if step.type == "status":
                 await updater.update_status(
                     TaskState.working,
@@ -65,10 +86,20 @@ class ProviderAgentExecutor(AgentExecutor):
                     [Part(root=TextPart(text=step.message))],
                     name="result",
                 )
                 await updater.complete()
                 break
             elif step.type == "error":
                 await updater.failed(
                     new_agent_text_message(step.message, task.context_id, task.id),
                 )

 from a2a.server.agent_execution import AgentExecutor, RequestContext
 from a2a.server.events import EventQueue
 from a2a.server.tasks import TaskUpdater
+from a2a.types import DataPart, Part, TaskState, TextPart
 from a2a.utils import new_agent_text_message, new_task
 from fireaction_a2a.planner import Planner
 class ProviderAgentExecutor(AgentExecutor):
     """A2A executor that delegates to a :class:`Planner`."""
+    def __init__(
+        self,
+        planner: Planner,
+        *,
+        trace_enabled_default: bool = False,
+    ) -> None:
         self.planner = planner
+        self.trace_enabled_default = trace_enabled_default
         self.conversations: dict[str, list[dict[str, Any]]] = {}
+    def _resolve_trace_flag(self, context: RequestContext) -> bool:
+        """Per-request metadata overrides the server-wide default."""
+        meta: dict[str, Any] = {}
+        if context.current_task and getattr(context.current_task, "metadata", None):
+            meta = context.current_task.metadata  # type: ignore[assignment]
+        elif getattr(context.message, "metadata", None):
+            meta = context.message.metadata  # type: ignore[assignment]
+        if "include_trace" in meta:
+            return bool(meta["include_trace"])
+        return self.trace_enabled_default
     async def execute(
         self,
         context: RequestContext,
         updater = TaskUpdater(event_queue, task.id, task.context_id)
         history = self.conversations.get(task.context_id, [])
+        trace_enabled = self._resolve_trace_flag(context)
         await updater.start_work(
             new_agent_text_message("Planning...", task.context_id, task.id)
         )
+        async for step in self.planner.run(
+            user_input, history, trace_enabled=trace_enabled
+        ):
             if step.type == "status":
                 await updater.update_status(
                     TaskState.working,
                     [Part(root=TextPart(text=step.message))],
                     name="result",
                 )
+                if step.trace_data:
+                    await updater.add_artifact(
+                        [Part(root=DataPart(data=step.trace_data))],
+                        name="execution_trace",
+                    )
                 await updater.complete()
                 break
             elif step.type == "error":
+                if step.trace_data:
+                    await updater.add_artifact(
+                        [Part(root=DataPart(data=step.trace_data))],
+                        name="execution_trace",
+                    )
                 await updater.failed(
                     new_agent_text_message(step.message, task.context_id, task.id),
                 )

src/fireaction_a2a/planner.py CHANGED Viewed

@@ -14,6 +14,7 @@ from __future__ import annotations
 import json
 import logging
 import os
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
 from typing import Any, Literal
@@ -145,6 +146,7 @@ TOOLS: list[dict[str, Any]] = [
 class PlanStep:
     type: Literal["status", "completed", "input_required", "error"]
     message: str
 # ---------------------------------------------------------------------------
@@ -161,18 +163,22 @@ class Planner:
         contract_search: ContractSearch,
         llm_model: str,
         system_prompt: str,
     ) -> None:
         self.provider = provider
         self.provider_client = provider_client
         self.search = contract_search
         self.model = llm_model
         self.system_prompt = system_prompt
         self._last_messages: list[dict[str, Any]] = []
     async def run(
         self,
         user_message: str,
         history: list[dict[str, Any]],
     ) -> AsyncGenerator[PlanStep, None]:
         """Execute the planner loop.  Yields PlanStep events."""
@@ -185,6 +191,37 @@ class Planner:
         retry_counts: dict[str, int] = {}
         total_steps = 0
         while total_steps < MAX_TOTAL_STEPS:
             try:
                 response = await litellm.acompletion(
@@ -194,9 +231,27 @@ class Planner:
                 )
             except Exception:
                 logger.exception("LLM call failed")
-                yield PlanStep(type="error", message="LLM call failed")
                 break
             choice = response.choices[0]
             assistant_msg = choice.message
@@ -208,7 +263,11 @@ class Planner:
             if choice.finish_reason == "stop" or not getattr(assistant_msg, "tool_calls", None):
                 content = assistant_msg.content or ""
                 step_type = self._classify_response(content)
-                yield PlanStep(type=step_type, message=content)
                 break
             # Process tool calls
@@ -220,15 +279,41 @@ class Planner:
                 except json.JSONDecodeError:
                     args = {}
-                result = await self._dispatch_tool(name, args, retry_counts)
                 if isinstance(result, PlanStep):
-                    yield result
                     if result.type == "error":
                         self._last_messages = messages
                         return
                     continue
                 # Yield status for successful executions
                 if name == "execute_contract" and not (isinstance(result, dict) and result.get("error")):
                     yield PlanStep(
@@ -246,6 +331,7 @@ class Planner:
             yield PlanStep(
                 type="error",
                 message=f"Exceeded maximum of {MAX_TOTAL_STEPS} tool calls.",
             )
         self._last_messages = messages
@@ -263,6 +349,9 @@ class Planner:
         name: str,
         args: dict[str, Any],
         retry_counts: dict[str, int],
     ) -> dict[str, Any] | list[dict[str, Any]] | PlanStep:
         """Route a tool call to the appropriate handler."""
@@ -286,11 +375,50 @@ class Planner:
             contract = self.provider.contracts[contract_name]
             try:
-                result = await execute_contract(contract, instance_nodes, self.provider_client)
                 retry_counts.pop(contract_name, None)
-                return result
             except ContractError as exc:
                 retry_counts[contract_name] = retry_counts.get(contract_name, 0) + 1
                 if retry_counts[contract_name] > MAX_RETRIES_PER_CONTRACT:
                     return PlanStep(
                         type="error",

 import json
 import logging
 import os
+import time
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
 from typing import Any, Literal
 class PlanStep:
     type: Literal["status", "completed", "input_required", "error"]
     message: str
+    trace_data: dict[str, Any] | None = None
 # ---------------------------------------------------------------------------
         contract_search: ContractSearch,
         llm_model: str,
         system_prompt: str,
+        provider_name: str = "",
     ) -> None:
         self.provider = provider
         self.provider_client = provider_client
         self.search = contract_search
         self.model = llm_model
         self.system_prompt = system_prompt
+        self.provider_name = provider_name
         self._last_messages: list[dict[str, Any]] = []
     async def run(
         self,
         user_message: str,
         history: list[dict[str, Any]],
+        *,
+        trace_enabled: bool = False,
     ) -> AsyncGenerator[PlanStep, None]:
         """Execute the planner loop.  Yields PlanStep events."""
         retry_counts: dict[str, int] = {}
         total_steps = 0
+        # Trace accumulators (only meaningful when trace_enabled)
+        trace_steps: list[dict[str, Any]] = []
+        trace_contracts: list[dict[str, Any]] = []
+        trace_cost: dict[str, int] = {
+            "total_tokens": 0,
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "num_llm_calls": 0,
+        }
+        start_time = time.monotonic()
+        def _build_trace() -> dict[str, Any] | None:
+            if not trace_enabled:
+                return None
+            return {
+                "execution_trace": {
+                    "contracts_called": trace_contracts,
+                    "planner_steps": trace_steps,
+                    "total_api_calls": sum(
+                        1 for c in trace_contracts if c.get("api_call")
+                    ),
+                    "total_planner_steps": len(trace_steps),
+                },
+                "cost_metrics": {
+                    **trace_cost,
+                    "total_duration_ms": int(
+                        (time.monotonic() - start_time) * 1000
+                    ),
+                },
+            }
         while total_steps < MAX_TOTAL_STEPS:
             try:
                 response = await litellm.acompletion(
                 )
             except Exception:
                 logger.exception("LLM call failed")
+                yield PlanStep(
+                    type="error",
+                    message="LLM call failed",
+                    trace_data=_build_trace(),
+                )
                 break
+            if trace_enabled:
+                trace_cost["num_llm_calls"] += 1
+                usage = getattr(response, "usage", None)
+                if usage:
+                    trace_cost["prompt_tokens"] += (
+                        getattr(usage, "prompt_tokens", 0) or 0
+                    )
+                    trace_cost["completion_tokens"] += (
+                        getattr(usage, "completion_tokens", 0) or 0
+                    )
+                    trace_cost["total_tokens"] += (
+                        getattr(usage, "total_tokens", 0) or 0
+                    )
             choice = response.choices[0]
             assistant_msg = choice.message
             if choice.finish_reason == "stop" or not getattr(assistant_msg, "tool_calls", None):
                 content = assistant_msg.content or ""
                 step_type = self._classify_response(content)
+                yield PlanStep(
+                    type=step_type,
+                    message=content,
+                    trace_data=_build_trace(),
+                )
                 break
             # Process tool calls
                 except json.JSONDecodeError:
                     args = {}
+                result = await self._dispatch_tool(
+                    name,
+                    args,
+                    retry_counts,
+                    trace_enabled=trace_enabled,
+                    trace_contracts=trace_contracts,
+                )
                 if isinstance(result, PlanStep):
+                    if trace_enabled:
+                        trace_steps.append({
+                            "step": len(trace_steps) + 1,
+                            "tool": name,
+                            "input": args,
+                            "output": {"error": result.message},
+                        })
                     if result.type == "error":
+                        yield PlanStep(
+                            type="error",
+                            message=result.message,
+                            trace_data=_build_trace(),
+                        )
                         self._last_messages = messages
                         return
+                    yield result
                     continue
+                if trace_enabled:
+                    trace_steps.append({
+                        "step": len(trace_steps) + 1,
+                        "tool": name,
+                        "input": args,
+                        "output": result,
+                    })
                 # Yield status for successful executions
                 if name == "execute_contract" and not (isinstance(result, dict) and result.get("error")):
                     yield PlanStep(
             yield PlanStep(
                 type="error",
                 message=f"Exceeded maximum of {MAX_TOTAL_STEPS} tool calls.",
+                trace_data=_build_trace(),
             )
         self._last_messages = messages
         name: str,
         args: dict[str, Any],
         retry_counts: dict[str, int],
+        *,
+        trace_enabled: bool = False,
+        trace_contracts: list[dict[str, Any]] | None = None,
     ) -> dict[str, Any] | list[dict[str, Any]] | PlanStep:
         """Route a tool call to the appropriate handler."""
             contract = self.provider.contracts[contract_name]
             try:
+                contract_result = await execute_contract(
+                    contract,
+                    instance_nodes,
+                    self.provider_client,
+                    trace_enabled=trace_enabled,
+                )
                 retry_counts.pop(contract_name, None)
+                if trace_enabled and trace_contracts is not None and contract_result.trace:
+                    trace_contracts.append({
+                        "provider": self.provider_name,
+                        "action": contract_name,
+                        "version": contract.metadata.get("version", ""),
+                        "instance_nodes": instance_nodes,
+                        **contract_result.trace,
+                    })
+                return contract_result.response
             except ContractError as exc:
                 retry_counts[contract_name] = retry_counts.get(contract_name, 0) + 1
+                if trace_enabled and trace_contracts is not None:
+                    entry: dict[str, Any] = {
+                        "provider": self.provider_name,
+                        "action": contract_name,
+                        "version": contract.metadata.get("version", ""),
+                        "instance_nodes": instance_nodes,
+                    }
+                    if exc.trace:
+                        entry.update(exc.trace)
+                    else:
+                        entry.update({
+                            "validation": {
+                                "verify_passed": False,
+                                "properties_passed": False,
+                                "properties_failed": [],
+                                "rules_passed": False,
+                                "rules_failed": [],
+                            },
+                            "compiled_payload": None,
+                            "api_call": None,
+                        })
+                    trace_contracts.append(entry)
                 if retry_counts[contract_name] > MAX_RETRIES_PER_CONTRACT:
                     return PlanStep(
                         type="error",

src/fireaction_a2a/runner.py CHANGED Viewed

@@ -6,6 +6,8 @@ then delegates the actual API call to a ProviderClient.
 from __future__ import annotations
 from typing import Any
 from fireaction.contract import EndpointContract
@@ -13,16 +15,32 @@ from fireaction.contract import EndpointContract
 from fireaction_a2a.client import ProviderClient
 class ContractError(Exception):
     """A contract validation step failed.
     Carries structured detail so the planner can feed the errors back to
-    the LLM for retry.
     """
-    def __init__(self, stage: str, details: list[Any]) -> None:
         self.stage = stage
         self.details = details
         super().__init__(f"Contract validation failed at '{stage}': {details}")
     def to_dict(self) -> dict[str, Any]:
@@ -33,31 +51,74 @@ async def execute_contract(
     contract: EndpointContract,
     instance_nodes: list[dict[str, Any]],
     provider_client: ProviderClient,
-) -> dict[str, Any]:
     """Run the full contract lifecycle and make the API call.
     Raises:
-        ContractError: If any validation step fails (instantiate, verify,
-            check_properties, or check_rules).  The error is structured so
-            the planner can serialize it and feed it back to the LLM.
     """
     results = contract.instantiate(instance_nodes)
     failed = [r for r in results if not r["success"]]
     if failed:
-        raise ContractError("instantiate", failed)
     verify_errors = contract.verify()
     if verify_errors:
-        raise ContractError("verify", verify_errors)
     prop_errors = contract.check_properties()
     if prop_errors:
-        raise ContractError("check_properties", prop_errors)
     payload = contract.compile()
     rule_errors = contract.check_rules(payload)
     if rule_errors:
-        raise ContractError("check_rules", rule_errors)
-    return await provider_client.call(contract.endpoint_info, payload)

 from __future__ import annotations
+import time
+from dataclasses import dataclass
 from typing import Any
 from fireaction.contract import EndpointContract
 from fireaction_a2a.client import ProviderClient
+@dataclass
+class ContractResult:
+    """Result of a contract execution with optional trace data."""
+    response: dict[str, Any]
+    trace: dict[str, Any] | None = None
 class ContractError(Exception):
     """A contract validation step failed.
     Carries structured detail so the planner can feed the errors back to
+    the LLM for retry.  When tracing is enabled, ``trace`` contains partial
+    execution data up to the point of failure.
     """
+    def __init__(
+        self,
+        stage: str,
+        details: list[Any],
+        *,
+        trace: dict[str, Any] | None = None,
+    ) -> None:
         self.stage = stage
         self.details = details
+        self.trace = trace
         super().__init__(f"Contract validation failed at '{stage}': {details}")
     def to_dict(self) -> dict[str, Any]:
     contract: EndpointContract,
     instance_nodes: list[dict[str, Any]],
     provider_client: ProviderClient,
+    *,
+    trace_enabled: bool = False,
+) -> ContractResult:
     """Run the full contract lifecycle and make the API call.
+    When *trace_enabled* is ``True``, captures per-stage validation results,
+    the compiled payload, and API call details in ``ContractResult.trace``.
     Raises:
+        ContractError: If any validation step fails.  When tracing, the
+            exception carries partial trace data via its ``trace`` attribute.
     """
+    trace: dict[str, Any] | None = None
+    if trace_enabled:
+        trace = {
+            "validation": {
+                "verify_passed": False,
+                "properties_passed": False,
+                "properties_failed": [],
+                "rules_passed": False,
+                "rules_failed": [],
+            },
+            "compiled_payload": None,
+            "api_call": None,
+        }
     results = contract.instantiate(instance_nodes)
     failed = [r for r in results if not r["success"]]
     if failed:
+        raise ContractError("instantiate", failed, trace=trace)
     verify_errors = contract.verify()
     if verify_errors:
+        raise ContractError("verify", verify_errors, trace=trace)
+    if trace:
+        trace["validation"]["verify_passed"] = True
     prop_errors = contract.check_properties()
     if prop_errors:
+        if trace:
+            trace["validation"]["properties_failed"] = [str(e) for e in prop_errors]
+        raise ContractError("check_properties", prop_errors, trace=trace)
+    if trace:
+        trace["validation"]["properties_passed"] = True
     payload = contract.compile()
+    if trace:
+        trace["compiled_payload"] = payload
     rule_errors = contract.check_rules(payload)
     if rule_errors:
+        if trace:
+            trace["validation"]["rules_failed"] = [str(e) for e in rule_errors]
+        raise ContractError("check_rules", rule_errors, trace=trace)
+    if trace:
+        trace["validation"]["rules_passed"] = True
+    start = time.monotonic()
+    response = await provider_client.call(contract.endpoint_info, payload)
+    duration_ms = int((time.monotonic() - start) * 1000)
+    if trace:
+        trace["api_call"] = {
+            "method": contract.endpoint_info.get("method", ""),
+            "path": contract.endpoint_info.get("path", ""),
+            "status_code": 200,
+            "response_body": response,
+            "duration_ms": duration_ms,
+        }
+    return ContractResult(response=response, trace=trace)

src/fireaction_a2a/server.py CHANGED Viewed

@@ -69,6 +69,7 @@ async def _build_provider_app(
     llm_model: str,
     embedding_model: str,
     path_prefix: str = "",
 ) -> A2AStarletteApplication:
     """Build an A2A app for a single provider."""
     provider = load_provider(provider_name)
@@ -89,8 +90,11 @@ async def _build_provider_app(
         contract_search=contract_search,
         llm_model=llm_model,
         system_prompt=system_prompt,
     )
-    executor = ProviderAgentExecutor(planner)
     card = build_card(provider_name, provider, host, port, path_prefix=path_prefix)
     handler = DefaultRequestHandler(
@@ -113,9 +117,11 @@ async def create_app(
     llm_model = os.environ.get("FIREACTION_LLM_MODEL", "gpt-4o")
     embedding_model = os.environ.get("FIREACTION_EMBEDDING_MODEL", "text-embedding-3-small")
     api_key = os.environ.get("FIREACTION_API_KEY", "")
     a2a_app = await _build_provider_app(
         provider_name, host, port, llm_model, embedding_model,
     )
     app = a2a_app.build()
@@ -142,6 +148,7 @@ async def create_multi_app(
     llm_model = os.environ.get("FIREACTION_LLM_MODEL", "gpt-4o")
     embedding_model = os.environ.get("FIREACTION_EMBEDDING_MODEL", "text-embedding-3-small")
     api_key = os.environ.get("FIREACTION_API_KEY", "")
     mounts: list[Mount] = []
     agent_directory: list[dict] = []
@@ -152,6 +159,7 @@ async def create_multi_app(
             a2a_app = await _build_provider_app(
                 name, host, port, llm_model, embedding_model,
                 path_prefix=prefix,
             )
             sub_app = a2a_app.build()
             mounts.append(Mount(prefix, app=sub_app))

     llm_model: str,
     embedding_model: str,
     path_prefix: str = "",
+    trace_enabled_default: bool = False,
 ) -> A2AStarletteApplication:
     """Build an A2A app for a single provider."""
     provider = load_provider(provider_name)
         contract_search=contract_search,
         llm_model=llm_model,
         system_prompt=system_prompt,
+        provider_name=provider_name,
+    )
+    executor = ProviderAgentExecutor(
+        planner, trace_enabled_default=trace_enabled_default,
     )
     card = build_card(provider_name, provider, host, port, path_prefix=path_prefix)
     handler = DefaultRequestHandler(
     llm_model = os.environ.get("FIREACTION_LLM_MODEL", "gpt-4o")
     embedding_model = os.environ.get("FIREACTION_EMBEDDING_MODEL", "text-embedding-3-small")
     api_key = os.environ.get("FIREACTION_API_KEY", "")
+    trace_enabled = os.environ.get("FIREACTION_TRACE_ENABLED", "").lower() in ("1", "true", "yes")
     a2a_app = await _build_provider_app(
         provider_name, host, port, llm_model, embedding_model,
+        trace_enabled_default=trace_enabled,
     )
     app = a2a_app.build()
     llm_model = os.environ.get("FIREACTION_LLM_MODEL", "gpt-4o")
     embedding_model = os.environ.get("FIREACTION_EMBEDDING_MODEL", "text-embedding-3-small")
     api_key = os.environ.get("FIREACTION_API_KEY", "")
+    trace_enabled = os.environ.get("FIREACTION_TRACE_ENABLED", "").lower() in ("1", "true", "yes")
     mounts: list[Mount] = []
     agent_directory: list[dict] = []
             a2a_app = await _build_provider_app(
                 name, host, port, llm_model, embedding_model,
                 path_prefix=prefix,
+                trace_enabled_default=trace_enabled,
             )
             sub_app = a2a_app.build()
             mounts.append(Mount(prefix, app=sub_app))

tests/test_agent.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""Tests for ProviderAgentExecutor: trace flag resolution and artifact emission."""
+import asyncio
+from unittest.mock import AsyncMock, MagicMock, patch
+from fireaction_a2a.agent import ProviderAgentExecutor
+from fireaction_a2a.planner import PlanStep
+# ---- _resolve_trace_flag tests ----
+def _make_executor(*, trace_default: bool = False) -> ProviderAgentExecutor:
+    mock_planner = MagicMock()
+    return ProviderAgentExecutor(mock_planner, trace_enabled_default=trace_default)
+def _make_context(*, task_metadata=None, message_metadata=None, has_task=True):
+    ctx = MagicMock()
+    if has_task:
+        ctx.current_task = MagicMock()
+        ctx.current_task.metadata = task_metadata
+    else:
+        ctx.current_task = None
+    ctx.message = MagicMock()
+    ctx.message.metadata = message_metadata
+    return ctx
+def test_resolve_trace_flag_default_false():
+    executor = _make_executor(trace_default=False)
+    ctx = _make_context(task_metadata={})
+    assert executor._resolve_trace_flag(ctx) is False
+def test_resolve_trace_flag_default_true():
+    executor = _make_executor(trace_default=True)
+    ctx = _make_context(task_metadata={})
+    assert executor._resolve_trace_flag(ctx) is True
+def test_resolve_trace_flag_task_metadata_overrides_default():
+    executor = _make_executor(trace_default=False)
+    ctx = _make_context(task_metadata={"include_trace": True})
+    assert executor._resolve_trace_flag(ctx) is True
+def test_resolve_trace_flag_task_metadata_disables():
+    executor = _make_executor(trace_default=True)
+    ctx = _make_context(task_metadata={"include_trace": False})
+    assert executor._resolve_trace_flag(ctx) is False
+def test_resolve_trace_flag_message_metadata_fallback():
+    executor = _make_executor(trace_default=False)
+    ctx = _make_context(has_task=False, message_metadata={"include_trace": True})
+    assert executor._resolve_trace_flag(ctx) is True
+def test_resolve_trace_flag_no_metadata():
+    executor = _make_executor(trace_default=False)
+    ctx = _make_context(task_metadata=None)
+    assert executor._resolve_trace_flag(ctx) is False
+# ---- execute() artifact emission tests ----
+def test_execute_emits_trace_artifact_when_trace_data_present():
+    """When planner yields a completed step with trace_data, agent emits DataPart."""
+    mock_planner = MagicMock()
+    trace_payload = {
+        "execution_trace": {"contracts_called": [], "planner_steps": [],
+                            "total_api_calls": 0, "total_planner_steps": 0},
+        "cost_metrics": {"total_tokens": 100, "prompt_tokens": 60,
+                         "completion_tokens": 40, "num_llm_calls": 1,
+                         "total_duration_ms": 500},
+    }
+    async def mock_run(user_msg, history, *, trace_enabled=False):
+        yield PlanStep(type="completed", message="Done!", trace_data=trace_payload)
+    mock_planner.run = mock_run
+    mock_planner.get_messages.return_value = []
+    executor = ProviderAgentExecutor(mock_planner, trace_enabled_default=True)
+    context = MagicMock()
+    context.get_user_input.return_value = "send email"
+    context.current_task = MagicMock()
+    context.current_task.metadata = {}
+    context.current_task.id = "task_1"
+    context.current_task.context_id = "ctx_1"
+    context.message = MagicMock()
+    event_queue = MagicMock()
+    event_queue.enqueue_event = AsyncMock()
+    async def _run():
+        await executor.execute(context, event_queue)
+    asyncio.run(_run())
+    calls = event_queue.enqueue_event.call_args_list
+    artifacts = [
+        c for c in calls
+        if hasattr(c[0][0], "parts") or "artifact" in str(type(c[0][0])).lower()
+    ]
+    assert len(calls) >= 3
+def test_execute_skips_trace_artifact_when_trace_disabled():
+    """When trace_enabled is False, no DataPart artifact is emitted."""
+    mock_planner = MagicMock()
+    async def mock_run(user_msg, history, *, trace_enabled=False):
+        yield PlanStep(type="completed", message="Done!", trace_data=None)
+    mock_planner.run = mock_run
+    mock_planner.get_messages.return_value = []
+    executor = ProviderAgentExecutor(mock_planner, trace_enabled_default=False)
+    context = MagicMock()
+    context.get_user_input.return_value = "send email"
+    context.current_task = MagicMock()
+    context.current_task.metadata = {}
+    context.current_task.id = "task_2"
+    context.current_task.context_id = "ctx_2"
+    context.message = MagicMock()
+    event_queue = MagicMock()
+    event_queue.enqueue_event = AsyncMock()
+    async def _run():
+        await executor.execute(context, event_queue)
+    asyncio.run(_run())
+    all_events = [c[0][0] for c in event_queue.enqueue_event.call_args_list]
+    data_part_events = [
+        e for e in all_events
+        if hasattr(e, "parts") and any(
+            getattr(getattr(p, "root", None), "kind", None) == "data"
+            for p in getattr(e, "parts", [])
+        )
+    ]
+    assert len(data_part_events) == 0

tests/test_planner_retry.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """Tests for planner error recovery and retry limits."""
 import asyncio
-from unittest.mock import AsyncMock, patch
 from fireaction import load_provider
@@ -20,6 +21,7 @@ def _make_planner() -> Planner:
         contract_search=search,
         llm_model="gpt-4o",
         system_prompt="Test",
     )
@@ -101,8 +103,171 @@ def test_dispatch_execute_exceeds_retry_limit():
     assert "failed validation" in result.message.lower() or "failed" in result.message.lower()
 def test_classify_response():
     assert Planner._classify_response("Done! Email sent successfully.") == "completed"
     assert Planner._classify_response("Please specify the recipient.") == "input_required"
     assert Planner._classify_response("Could you provide the subject?") == "input_required"
     assert Planner._classify_response("Error: API returned 500") == "error"

 """Tests for planner error recovery and retry limits."""
 import asyncio
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
 from fireaction import load_provider
         contract_search=search,
         llm_model="gpt-4o",
         system_prompt="Test",
+        provider_name="resend",
     )
     assert "failed validation" in result.message.lower() or "failed" in result.message.lower()
+def test_dispatch_execute_contract_with_trace():
+    """A traced, successful execute_contract dispatch appends one trace entry."""
+    from tests.test_runner import _minimal_email_nodes
+    planner = _make_planner()
+    planner.provider_client.call.return_value = {"id": "email_traced"}
+    attempts: dict[str, int] = {}
+    collected: list[dict] = []
+    tool_args = {
+        "contract_name": "send_email",
+        "instance_nodes": _minimal_email_nodes(),
+    }
+    dispatch = planner._dispatch_tool(
+        "execute_contract",
+        tool_args,
+        attempts,
+        trace_enabled=True,
+        trace_contracts=collected,
+    )
+    result = asyncio.run(dispatch)
+    # The tool result is the provider response itself.
+    assert isinstance(result, dict)
+    assert result == {"id": "email_traced"}
+    # Exactly one contract call was traced, end to end.
+    assert len(collected) == 1
+    traced = collected[0]
+    assert traced["provider"] == "resend"
+    assert traced["action"] == "send_email"
+    assert traced["validation"]["verify_passed"] is True
+    api_call = traced["api_call"]
+    assert api_call is not None
+    assert api_call["response_body"] == {"id": "email_traced"}
+def test_dispatch_execute_contract_error_with_trace():
+    """A traced execute_contract dispatch that fails still records a partial entry."""
+    planner = _make_planner()
+    # A single root node pointing at an element key that does not exist.
+    broken_root = {
+        "instance_key": "root",
+        "element_key": "nonexistent",
+        "variant_key": "send_email_skeleton",
+        "data_type": "object",
+        "compile_key": "root",
+        "description": "bad",
+        "index": 0,
+        "parent_instance_key": None,
+    }
+    attempts: dict[str, int] = {}
+    collected: list[dict] = []
+    dispatch = planner._dispatch_tool(
+        "execute_contract",
+        {"contract_name": "send_email", "instance_nodes": [broken_root]},
+        attempts,
+        trace_enabled=True,
+        trace_contracts=collected,
+    )
+    result = asyncio.run(dispatch)
+    # The failure comes back as an error dict rather than an exception.
+    assert isinstance(result, dict)
+    assert result["error"] is True
+    assert len(collected) == 1
+    traced = collected[0]
+    assert traced["provider"] == "resend"
+    assert traced["action"] == "send_email"
+    # No API call is recorded for the failed contract.
+    assert traced["api_call"] is None
 def test_classify_response():
     assert Planner._classify_response("Done! Email sent successfully.") == "completed"
     assert Planner._classify_response("Please specify the recipient.") == "input_required"
     assert Planner._classify_response("Could you provide the subject?") == "input_required"
     assert Planner._classify_response("Error: API returned 500") == "error"
+# ---- Planner run() integration tests with trace ----
+def _mock_llm_response(*, tool_calls=None, content=None, finish_reason="stop",
+                        prompt_tokens=100, completion_tokens=50):
+    """Build a mock litellm response with one choice and token usage."""
+    # Serialized form of the assistant message; ``tool_calls`` is only
+    # included when at least one tool call was supplied.
+    dumped = {"role": "assistant", "content": content}
+    if tool_calls:
+        dumped["tool_calls"] = [
+            {
+                "id": call.id,
+                "type": "function",
+                "function": {
+                    "name": call.function.name,
+                    "arguments": call.function.arguments,
+                },
+            }
+            for call in tool_calls
+        ]
+    message = MagicMock()
+    message.content = content
+    message.tool_calls = tool_calls
+    message.model_dump.return_value = dumped
+    only_choice = MagicMock()
+    only_choice.message = message
+    only_choice.finish_reason = finish_reason
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    usage.completion_tokens = completion_tokens
+    usage.total_tokens = prompt_tokens + completion_tokens
+    response = MagicMock()
+    response.choices = [only_choice]
+    response.usage = usage
+    return response
+def _mock_tool_call(name, arguments, call_id="call_1"):
+    """Build a mock tool call whose ``function.arguments`` is JSON-encoded."""
+    function = MagicMock()
+    function.name = name
+    function.arguments = json.dumps(arguments)
+    call = MagicMock()
+    call.id = call_id
+    call.function = function
+    return call
+def test_planner_run_trace_accumulates_steps_and_cost():
+    """Full run() with trace collects planner_steps and cost_metrics."""
+    planner = _make_planner()
+    # Scripted LLM turns: a search_contracts tool call, then the final answer.
+    llm_turns = [
+        _mock_llm_response(
+            tool_calls=[
+                _mock_tool_call("search_contracts", {"query": "email"}, "call_1")
+            ],
+            finish_reason="tool_calls",
+            prompt_tokens=100,
+            completion_tokens=50,
+        ),
+        _mock_llm_response(
+            content="Done! Email sent successfully.",
+            prompt_tokens=200,
+            completion_tokens=30,
+        ),
+    ]
+    async def _collect():
+        with patch("fireaction_a2a.planner.litellm") as mock_litellm:
+            mock_litellm.acompletion = AsyncMock(side_effect=llm_turns)
+            return [
+                step
+                async for step in planner.run(
+                    "send an email", [], trace_enabled=True
+                )
+            ]
+    steps: list[PlanStep] = asyncio.run(_collect())
+    final = steps[-1]
+    assert final.type == "completed"
+    trace = final.trace_data
+    assert trace is not None
+    assert "execution_trace" in trace
+    assert "cost_metrics" in trace
+    # One tool invocation was made, so exactly one planner step is recorded.
+    execution = trace["execution_trace"]
+    assert len(execution["planner_steps"]) == 1
+    assert execution["planner_steps"][0]["tool"] == "search_contracts"
+    assert execution["total_planner_steps"] == 1
+    # Token counts are summed across both scripted LLM turns.
+    cost = trace["cost_metrics"]
+    expected_totals = {
+        "num_llm_calls": 2,
+        "prompt_tokens": 300,
+        "completion_tokens": 80,
+        "total_tokens": 380,
+    }
+    for metric, expected in expected_totals.items():
+        assert cost[metric] == expected
+    assert cost["total_duration_ms"] >= 0
+def test_planner_run_without_trace_returns_none():
+    """run() with trace_enabled=False yields PlanStep with trace_data=None."""
+    planner = _make_planner()
+    final_answer = _mock_llm_response(content="Done! Email sent.")
+    async def _collect():
+        with patch("fireaction_a2a.planner.litellm") as mock_litellm:
+            mock_litellm.acompletion = AsyncMock(return_value=final_answer)
+            return [
+                step
+                async for step in planner.run(
+                    "send email", [], trace_enabled=False
+                )
+            ]
+    steps: list[PlanStep] = asyncio.run(_collect())
+    last = steps[-1]
+    assert last.type == "completed"
+    assert last.trace_data is None

tests/test_runner.py CHANGED Viewed

@@ -1,11 +1,11 @@
 """Tests for the contract runner (lifecycle only, no HTTP calls)."""
 import asyncio
-from unittest.mock import AsyncMock
 import fireaction
-from fireaction_a2a.runner import ContractError, execute_contract
 V = "send_email_skeleton"
@@ -69,7 +69,9 @@ def test_execute_contract_success():
     result = asyncio.run(execute_contract(contract, _minimal_email_nodes(), mock_client))
-    assert result == {"id": "email_123"}
     mock_client.call.assert_called_once()
     call_args = mock_client.call.call_args
     payload = call_args[0][1]
@@ -77,6 +79,38 @@ def test_execute_contract_success():
     assert payload["subject"] == "Test"
 def test_execute_contract_instantiate_error():
     contract = fireaction.load(provider="resend", action="send_email")
     mock_client = AsyncMock()
@@ -99,7 +133,123 @@ def test_execute_contract_instantiate_error():
         assert False, "Should have raised ContractError"
     except ContractError as e:
         assert e.stage == "instantiate"
         d = e.to_dict()
         assert d["error"] is True
         assert d["stage"] == "instantiate"
     mock_client.call.assert_not_called()

 """Tests for the contract runner (lifecycle only, no HTTP calls)."""
 import asyncio
+from unittest.mock import AsyncMock, MagicMock
 import fireaction
+from fireaction_a2a.runner import ContractError, ContractResult, execute_contract
 V = "send_email_skeleton"
     result = asyncio.run(execute_contract(contract, _minimal_email_nodes(), mock_client))
+    assert isinstance(result, ContractResult)
+    assert result.response == {"id": "email_123"}
+    assert result.trace is None
     mock_client.call.assert_called_once()
     call_args = mock_client.call.call_args
     payload = call_args[0][1]
     assert payload["subject"] == "Test"
+def test_execute_contract_success_with_trace():
+    """Traced success returns the response and fills in every trace section."""
+    mock_client = AsyncMock()
+    mock_client.call.return_value = {"id": "email_456"}
+    contract = fireaction.load(provider="resend", action="send_email")
+    pending = execute_contract(
+        contract, _minimal_email_nodes(), mock_client, trace_enabled=True,
+    )
+    result = asyncio.run(pending)
+    assert isinstance(result, ContractResult)
+    assert result.response == {"id": "email_456"}
+    trace = result.trace
+    assert trace is not None
+    # Every validation stage passed and reported no failures.
+    validation = trace["validation"]
+    for passed_flag in ("verify_passed", "properties_passed", "rules_passed"):
+        assert validation[passed_flag] is True
+    for failure_list in ("properties_failed", "rules_failed"):
+        assert validation[failure_list] == []
+    payload = trace["compiled_payload"]
+    assert payload is not None
+    assert payload["from"] == "onboarding@resend.dev"
+    # The API call section mirrors the mocked client response.
+    api = trace["api_call"]
+    assert api is not None
+    assert api["status_code"] == 200
+    assert api["response_body"] == {"id": "email_456"}
+    assert isinstance(api["duration_ms"], int)
 def test_execute_contract_instantiate_error():
     contract = fireaction.load(provider="resend", action="send_email")
     mock_client = AsyncMock()
         assert False, "Should have raised ContractError"
     except ContractError as e:
         assert e.stage == "instantiate"
+        assert e.trace is None
         d = e.to_dict()
         assert d["error"] is True
         assert d["stage"] == "instantiate"
     mock_client.call.assert_not_called()
+def test_execute_contract_instantiate_error_with_trace():
+    """Instantiate failure with tracing attaches a trace with no payload or API call."""
+    contract = fireaction.load(provider="resend", action="send_email")
+    mock_client = AsyncMock()
+    # A single root node pointing at an element key that does not exist.
+    bad_nodes = [
+        {
+            "instance_key": "root",
+            "element_key": "nonexistent",
+            "variant_key": V,
+            "data_type": "object",
+            "compile_key": "root",
+            "description": "bad",
+            "index": 0,
+            "parent_instance_key": None,
+        }
+    ]
+    try:
+        asyncio.run(
+            execute_contract(contract, bad_nodes, mock_client, trace_enabled=True)
+        )
+        # Raise explicitly: ``assert False`` is stripped under ``python -O``,
+        # which would let a missing ContractError go unnoticed.
+        raise AssertionError("Should have raised ContractError")
+    except ContractError as e:
+        assert e.stage == "instantiate"
+        assert e.trace is not None
+        assert e.trace["validation"]["verify_passed"] is False
+        assert e.trace["compiled_payload"] is None
+        assert e.trace["api_call"] is None
+    mock_client.call.assert_not_called()
+# ---- Tests using mocked contracts for later-stage failures ----
+def _mock_contract():
+    """Return a MagicMock contract that passes all stages by default."""
+    # Canned return value for each lifecycle method; empty lists mean
+    # "no errors reported" for the checking stages.
+    stage_results = {
+        "instantiate": [{"success": True}],
+        "verify": [],
+        "check_properties": [],
+        "compile": {"from": "a@b.com", "to": ["c@d.com"]},
+        "check_rules": [],
+    }
+    contract = MagicMock()
+    for stage, outcome in stage_results.items():
+        getattr(contract, stage).return_value = outcome
+    contract.endpoint_info = {"method": "POST", "path": "/emails"}
+    contract.metadata = {"version": "v1"}
+    return contract
+def test_verify_failure_with_trace():
+    """Verify failure captures verify_passed=False but no later stages."""
+    contract = _mock_contract()
+    contract.verify.return_value = ["Missing required child: to_array"]
+    mock_client = AsyncMock()
+    try:
+        asyncio.run(
+            execute_contract(contract, [], mock_client, trace_enabled=True)
+        )
+        # Raise explicitly: ``assert False`` is stripped under ``python -O``,
+        # which would let a missing ContractError go unnoticed.
+        raise AssertionError("Should have raised ContractError")
+    except ContractError as e:
+        assert e.stage == "verify"
+        assert e.trace is not None
+        assert e.trace["validation"]["verify_passed"] is False
+        assert e.trace["validation"]["properties_passed"] is False
+        assert e.trace["compiled_payload"] is None
+        assert e.trace["api_call"] is None
+    # Same guard as the instantiate-error tests: a validation failure must
+    # never reach the provider client.
+    mock_client.call.assert_not_called()
+def test_check_properties_failure_with_trace():
+    """Property failure captures verify_passed=True, properties_failed list."""
+    contract = _mock_contract()
+    contract.check_properties.return_value = ["from must be a valid email"]
+    mock_client = AsyncMock()
+    try:
+        asyncio.run(
+            execute_contract(contract, [], mock_client, trace_enabled=True)
+        )
+        # Raise explicitly: ``assert False`` is stripped under ``python -O``,
+        # which would let a missing ContractError go unnoticed.
+        raise AssertionError("Should have raised ContractError")
+    except ContractError as e:
+        assert e.stage == "check_properties"
+        assert e.trace is not None
+        assert e.trace["validation"]["verify_passed"] is True
+        assert e.trace["validation"]["properties_passed"] is False
+        assert e.trace["validation"]["properties_failed"] == [
+            "from must be a valid email"
+        ]
+        assert e.trace["compiled_payload"] is None
+        assert e.trace["api_call"] is None
+    # Same guard as the instantiate-error tests: a validation failure must
+    # never reach the provider client.
+    mock_client.call.assert_not_called()
+def test_check_rules_failure_with_trace():
+    """Rule failure captures verify+properties passed, compiled payload present."""
+    contract = _mock_contract()
+    contract.check_rules.return_value = ["rule_html_not_empty"]
+    mock_client = AsyncMock()
+    try:
+        asyncio.run(
+            execute_contract(contract, [], mock_client, trace_enabled=True)
+        )
+        # Raise explicitly: ``assert False`` is stripped under ``python -O``,
+        # which would let a missing ContractError go unnoticed.
+        raise AssertionError("Should have raised ContractError")
+    except ContractError as e:
+        assert e.stage == "check_rules"
+        assert e.trace is not None
+        assert e.trace["validation"]["verify_passed"] is True
+        assert e.trace["validation"]["properties_passed"] is True
+        assert e.trace["validation"]["rules_passed"] is False
+        assert e.trace["validation"]["rules_failed"] == ["rule_html_not_empty"]
+        assert e.trace["compiled_payload"] == {
+            "from": "a@b.com",
+            "to": ["c@d.com"],
+        }
+        assert e.trace["api_call"] is None
+    # Same guard as the instantiate-error tests: the trace records no API
+    # call, so the provider client must not have been invoked either.
+    mock_client.call.assert_not_called()

tests/test_stripe_flow.py CHANGED Viewed

@@ -34,7 +34,7 @@ def test_stripe_product_to_price_to_payment_link():
          "index": 1, "primitive_contents": "Pro Plan", "parent_instance_key": "root"},
     ]
     result = asyncio.run(execute_contract(contract, nodes, mock_client))
-    product_id = result["id"]
     assert product_id == "prod_test123"
     # Step 2: create_price (references product_id)
@@ -62,7 +62,7 @@ def test_stripe_product_to_price_to_payment_link():
          "index": 3, "primitive_contents": product_id, "parent_instance_key": "root"},
     ]
     result = asyncio.run(execute_contract(contract, nodes, mock_client))
-    price_id = result["id"]
     assert price_id == "price_test456"
     # Step 3: create_payment_link (references price_id)
@@ -92,8 +92,8 @@ def test_stripe_product_to_price_to_payment_link():
          "index": 4, "primitive_contents": 1, "parent_instance_key": "li_1"},
     ]
     result = asyncio.run(execute_contract(contract, nodes, mock_client))
-    assert result["id"] == "plink_test789"
-    assert result["url"] == "https://buy.stripe.com/test_xxx"
     # Verify the mock was called 3 times total
     assert mock_client.call.call_count == 3

          "index": 1, "primitive_contents": "Pro Plan", "parent_instance_key": "root"},
     ]
     result = asyncio.run(execute_contract(contract, nodes, mock_client))
+    product_id = result.response["id"]
     assert product_id == "prod_test123"
     # Step 2: create_price (references product_id)
          "index": 3, "primitive_contents": product_id, "parent_instance_key": "root"},
     ]
     result = asyncio.run(execute_contract(contract, nodes, mock_client))
+    price_id = result.response["id"]
     assert price_id == "price_test456"
     # Step 3: create_payment_link (references price_id)
          "index": 4, "primitive_contents": 1, "parent_instance_key": "li_1"},
     ]
     result = asyncio.run(execute_contract(contract, nodes, mock_client))
+    assert result.response["id"] == "plink_test789"
+    assert result.response["url"] == "https://buy.stripe.com/test_xxx"
     # Verify the mock was called 3 times total
     assert mock_client.call.call_count == 3