Spaces:

smolagents
/

ml-agent

Running

akseljoonas HF Staff Claude Opus 4.6 committed 25 days ago

Commit

960792d

1 Parent(s): 7edb225

refactor: simplify tool call validation and make interrupts cancel tool execution

- Handle finish_reason=length by dropping truncated tool calls before
  they enter context
- Single json.loads per tool call instead of 4 redundant parses
- Parsed args flow through to approval/execution without re-parsing
- Replace recover_malformed_tool_calls (90 lines) with inline validation
  in the agent loop — bad calls get error results, good calls execute
- Make tool execution cancellable: asyncio.wait races gather against
  the cancel event so Ctrl+C/frontend interrupt stops immediately instead
  of waiting for all tools to finish
- Keep _patch_dangling_tool_calls as the only safety net in
  get_messages() for any remaining edge cases

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show

agent/context_manager/manager.py +3 -99
agent/core/agent_loop.py +97 -115

agent/context_manager/manager.py CHANGED Viewed

@@ -133,13 +133,10 @@ class ContextManager:
     def get_messages(self) -> list[Message]:
         """Get all messages for sending to LLM.
-        Automatically recovers malformed tool_call arguments and patches
-        any dangling tool_calls (assistant messages with tool_calls that
-        have no matching tool-result message).  Both can happen after
-        errors or cancellations and would cause the LLM API to reject the
-        request.
         """
-        self.recover_malformed_tool_calls()
         self._patch_dangling_tool_calls()
         return self.items
@@ -163,99 +160,6 @@ class ContextManager:
             tc if not isinstance(tc, dict) else ToolCall(**tc) for tc in tool_calls
         ]
-    def recover_malformed_tool_calls(self) -> set[str]:
-        """Sanitize malformed tool_call arguments and inject error results.
-        Handles two classes of corruption:
-        - **Empty/missing IDs**: Stripped from the assistant message entirely
-          (common when streaming is interrupted mid-tool-call).
-        - **Malformed JSON arguments**: Replaced with ``"{}"`` and an error
-          tool-result is injected asking the agent to retry.
-        This method is idempotent — safe to call from both the agent loop
-        (before tool execution) and from :meth:`get_messages` (safety net).
-        Returns:
-            Set of tool_call IDs that had malformed arguments.
-        """
-        import json
-        malformed_ids: set[str] = set()
-        for msg in self.items:
-            if getattr(msg, "role", None) != "assistant":
-                continue
-            tool_calls = getattr(msg, "tool_calls", None)
-            if not tool_calls:
-                continue
-            self._normalize_tool_calls(msg)
-            # 1. Strip tool_calls with empty/missing IDs (cannot be repaired)
-            valid_tcs = []
-            for tc in msg.tool_calls:
-                if not getattr(tc, "id", None):
-                    logger.warning(
-                        "Stripping tool_call with empty ID (name=%s) — likely interrupted stream",
-                        getattr(tc.function, "name", "?"),
-                    )
-                    continue
-                valid_tcs.append(tc)
-            if len(valid_tcs) != len(msg.tool_calls):
-                msg.tool_calls = valid_tcs or None
-            if not msg.tool_calls:
-                continue
-            # 2. Fix malformed JSON arguments
-            for tc in msg.tool_calls:
-                try:
-                    json.loads(tc.function.arguments)
-                except (json.JSONDecodeError, TypeError, ValueError) as e:
-                    logger.warning(
-                        "Malformed arguments for tool_call %s (%s): %s",
-                        tc.id,
-                        tc.function.name,
-                        e,
-                    )
-                    tc.function.arguments = "{}"
-                    malformed_ids.add(tc.id)
-        if not malformed_ids:
-            return malformed_ids
-        # 3. Inject error results for malformed calls that don't have one yet
-        answered_ids = {
-            getattr(m, "tool_call_id", None)
-            for m in self.items
-            if getattr(m, "role", None) == "tool"
-        }
-        for msg in self.items:
-            if getattr(msg, "role", None) != "assistant":
-                continue
-            tool_calls = getattr(msg, "tool_calls", None)
-            if not tool_calls:
-                continue
-            for tc in msg.tool_calls:
-                if tc.id in malformed_ids and tc.id not in answered_ids:
-                    self.items.append(
-                        Message(
-                            role="tool",
-                            content=(
-                                f"ERROR: Your tool call to '{tc.function.name}' had malformed "
-                                f"JSON arguments and was NOT executed. This usually happens "
-                                f"when the content is too large and gets truncated. "
-                                f"Please retry with smaller content — for 'write', split the "
-                                f"file into multiple smaller writes using 'edit' to build up "
-                                f"the file incrementally."
-                            ),
-                            tool_call_id=tc.id,
-                            name=tc.function.name,
-                        )
-                    )
-                    answered_ids.add(tc.id)
-        return malformed_ids
     def _patch_dangling_tool_calls(self) -> None:
         """Add stub tool results for any tool_calls that lack a matching result.

     def get_messages(self) -> list[Message]:
         """Get all messages for sending to LLM.
+        Patches any dangling tool_calls (assistant messages with tool_calls
+        that have no matching tool-result message) so the LLM API doesn't
+        reject the request.
         """
         self._patch_dangling_tool_calls()
         return self.items
             tc if not isinstance(tc, dict) else ToolCall(**tc) for tc in tool_calls
         ]
     def _patch_dangling_tool_calls(self) -> None:
         """Add stub tool results for any tool_calls that lack a matching result.

agent/core/agent_loop.py CHANGED Viewed

@@ -261,6 +261,7 @@ class Handlers:
                 full_content = ""
                 tool_calls_acc: dict[int, dict] = {}
                 token_count = 0
                 async for chunk in response:
                     # ── Check cancellation during streaming ──
@@ -276,6 +277,8 @@ class Handlers:
                         continue
                     delta = choice.delta
                     # Stream text deltas to the frontend
                     if delta.content:
@@ -316,17 +319,15 @@ class Handlers:
                 # ── Stream finished — reconstruct full message ───────
                 content = full_content or None
-                # Build tool_calls list from accumulated deltas,
-                # dropping any with empty IDs (from interrupted streams)
                 tool_calls: list[ToolCall] = []
                 for idx in sorted(tool_calls_acc.keys()):
                     tc_data = tool_calls_acc[idx]
-                    if not tc_data["id"]:
-                        logger.warning(
-                            "Dropping tool_call with empty ID (name=%s) — likely interrupted stream",
-                            tc_data["function"]["name"],
-                        )
-                        continue
                     tool_calls.append(
                         ToolCall(
                             id=tc_data["id"],
@@ -351,7 +352,23 @@ class Handlers:
                         final_response = content
                     break
-                # Add assistant message with tool calls to history
                 assistant_msg = Message(
                     role="assistant",
                     content=content,
@@ -359,79 +376,49 @@ class Handlers:
                 )
                 session.context_manager.add_message(assistant_msg, token_count)
                 # ── Cancellation check: before tool execution ──
                 if session.is_cancelled:
                     break
-                # Recover any malformed tool calls (sanitize JSON + inject
-                # error results).  Returns IDs to skip during execution.
-                malformed_ids = session.context_manager.recover_malformed_tool_calls()
-                if malformed_ids:
-                    # For each malformed tool_call, emit a synthetic tool_call +
-                    # tool_output-error pair so the frontend has a matching
-                    # dynamic-tool part instead of an orphan error.
-                    for tc in tool_calls:
-                        if tc.id not in malformed_ids:
-                            continue
-                        tool_name = tc.function.name
-                        try:
-                            tool_args = json.loads(tc.function.arguments)
-                        except (json.JSONDecodeError, TypeError, ValueError):
-                            tool_args = {}
-                        await session.send_event(
-                            Event(
-                                event_type="tool_call",
-                                data={
-                                    "tool": tool_name,
-                                    "arguments": tool_args,
-                                    "tool_call_id": tc.id,
-                                },
-                            )
-                        )
-                        await session.send_event(
-                            Event(
-                                event_type="tool_output",
-                                data={
-                                    "tool": tool_name,
-                                    "tool_call_id": tc.id,
-                                    "output": "Malformed tool call — see error in context.",
-                                    "success": False,
-                                },
-                            )
-                        )
-                # Separate tools into those requiring approval and those that don't
-                approval_required_tools = []
-                non_approval_tools = []
-                for tc in tool_calls:
-                    if tc.id in malformed_ids:
-                        continue
-                    tool_name = tc.function.name
-                    try:
-                        tool_args = json.loads(tc.function.arguments)
-                    except (json.JSONDecodeError, TypeError) as e:
-                        logger.warning(f"Malformed tool arguments for {tool_name}: {e}")
-                        tool_args = {}
                     if _needs_approval(tool_name, tool_args, session.config):
-                        approval_required_tools.append(tc)
                     else:
-                        non_approval_tools.append(tc)
                 # Execute non-approval tools (in parallel when possible)
                 if non_approval_tools:
-                    # 1. Parse args and validate upfront
                     parsed_tools: list[
-                        tuple[ChatCompletionMessageToolCall, str, dict, bool, str]
                     ] = []
-                    for tc in non_approval_tools:
-                        tool_name = tc.function.name
-                        try:
-                            tool_args = json.loads(tc.function.arguments)
-                        except (json.JSONDecodeError, TypeError):
-                            tool_args = {}
                         args_valid, error_msg = _validate_tool_args(tool_args)
                         parsed_tools.append(
                             (tc, tool_name, tool_args, args_valid, error_msg)
@@ -451,14 +438,14 @@ class Handlers:
                                 )
                             )
-                    # 3. Execute all valid tools in parallel
                     async def _exec_tool(
-                        tc: ChatCompletionMessageToolCall,
                         name: str,
                         args: dict,
                         valid: bool,
                         err: str,
-                    ) -> tuple[ChatCompletionMessageToolCall, str, dict, str, bool]:
                         if not valid:
                             return (tc, name, args, err, False)
                         out, ok = await session.tool_router.call_tool(
@@ -466,13 +453,30 @@ class Handlers:
                         )
                         return (tc, name, args, out, ok)
-                    results = await asyncio.gather(
                         *[
                             _exec_tool(tc, name, args, valid, err)
                             for tc, name, args, valid, err in parsed_tools
                         ]
                     )
                     # 4. Record results and send outputs (order preserved)
                     for tc, tool_name, tool_args, output, success in results:
                         tool_msg = Message(
@@ -495,56 +499,34 @@ class Handlers:
                             )
                         )
-                # ── Cancellation check: after tool execution ──
-                if session.is_cancelled:
-                    break
                 # If there are tools requiring approval, ask for batch approval
                 if approval_required_tools:
                     # Prepare batch approval data
                     tools_data = []
-                    for tc in approval_required_tools:
-                        tool_name = tc.function.name
-                        try:
-                            tool_args = json.loads(tc.function.arguments)
-                        except (json.JSONDecodeError, TypeError):
-                            tool_args = {}
                         # Resolve sandbox file paths for hf_jobs scripts so the
                         # frontend can display & edit the actual file content.
-                        if tool_name == "hf_jobs" and isinstance(
-                            tool_args.get("script"), str
-                        ):
                             from agent.tools.sandbox_tool import resolve_sandbox_script
                             sandbox = getattr(session, "sandbox", None)
-                            content, _ = await resolve_sandbox_script(
-                                sandbox, tool_args["script"]
-                            )
-                            if content:
-                                tool_args = {**tool_args, "script": content}
-                        tools_data.append(
-                            {
-                                "tool": tool_name,
-                                "arguments": tool_args,
-                                "tool_call_id": tc.id,
-                            }
-                        )
-                    await session.send_event(
-                        Event(
-                            event_type="approval_required",
-                            data={
-                                "tools": tools_data,  # Batch of tools
-                                "count": len(tools_data),
-                            },
-                        )
-                    )
-                    # Store all approval-requiring tools
                     session.pending_approval = {
-                        "tool_calls": approval_required_tools,
                     }
                     # Return early - wait for EXEC_APPROVAL operation

                 full_content = ""
                 tool_calls_acc: dict[int, dict] = {}
                 token_count = 0
+                finish_reason = None
                 async for chunk in response:
                     # ── Check cancellation during streaming ──
                         continue
                     delta = choice.delta
+                    if choice.finish_reason:
+                        finish_reason = choice.finish_reason
                     # Stream text deltas to the frontend
                     if delta.content:
                 # ── Stream finished — reconstruct full message ───────
                 content = full_content or None
+                # If output was truncated, all tool call args are garbage
+                if finish_reason == "length" and tool_calls_acc:
+                    logger.warning("Output truncated (finish_reason=length) — dropping tool calls")
+                    tool_calls_acc.clear()
+                # Build tool_calls list from accumulated deltas
                 tool_calls: list[ToolCall] = []
                 for idx in sorted(tool_calls_acc.keys()):
                     tc_data = tool_calls_acc[idx]
                     tool_calls.append(
                         ToolCall(
                             id=tc_data["id"],
                         final_response = content
                     break
+                # Validate tool call args (one json.loads per call, once)
+                # and split into good vs bad
+                good_tools: list[tuple[ToolCall, str, dict]] = []
+                bad_tools: list[ToolCall] = []
+                for tc in tool_calls:
+                    try:
+                        args = json.loads(tc.function.arguments)
+                        good_tools.append((tc, tc.function.name, args))
+                    except (json.JSONDecodeError, TypeError, ValueError):
+                        logger.warning(
+                            "Malformed arguments for tool_call %s (%s) — skipping",
+                            tc.id, tc.function.name,
+                        )
+                        tc.function.arguments = "{}"
+                        bad_tools.append(tc)
+                # Add assistant message with all tool calls to context
                 assistant_msg = Message(
                     role="assistant",
                     content=content,
                 )
                 session.context_manager.add_message(assistant_msg, token_count)
+                # Add error results for bad tool calls so the LLM
+                # knows what happened and can retry differently
+                for tc in bad_tools:
+                    error_msg = (
+                        f"ERROR: Tool call to '{tc.function.name}' had malformed JSON "
+                        f"arguments and was NOT executed. Retry with smaller content — "
+                        f"for 'write', split into multiple smaller writes using 'edit'."
+                    )
+                    session.context_manager.add_message(Message(
+                        role="tool",
+                        content=error_msg,
+                        tool_call_id=tc.id,
+                        name=tc.function.name,
+                    ))
+                    await session.send_event(Event(
+                        event_type="tool_call",
+                        data={"tool": tc.function.name, "arguments": {}, "tool_call_id": tc.id},
+                    ))
+                    await session.send_event(Event(
+                        event_type="tool_output",
+                        data={"tool": tc.function.name, "tool_call_id": tc.id, "output": error_msg, "success": False},
+                    ))
                 # ── Cancellation check: before tool execution ──
                 if session.is_cancelled:
                     break
+                # Separate good tools into approval-required vs auto-execute
+                approval_required_tools: list[tuple[ToolCall, str, dict]] = []
+                non_approval_tools: list[tuple[ToolCall, str, dict]] = []
+                for tc, tool_name, tool_args in good_tools:
                     if _needs_approval(tool_name, tool_args, session.config):
+                        approval_required_tools.append((tc, tool_name, tool_args))
                     else:
+                        non_approval_tools.append((tc, tool_name, tool_args))
                 # Execute non-approval tools (in parallel when possible)
                 if non_approval_tools:
+                    # 1. Validate args upfront
                     parsed_tools: list[
+                        tuple[ToolCall, str, dict, bool, str]
                     ] = []
+                    for tc, tool_name, tool_args in non_approval_tools:
                         args_valid, error_msg = _validate_tool_args(tool_args)
                         parsed_tools.append(
                             (tc, tool_name, tool_args, args_valid, error_msg)
                                 )
                             )
+                    # 3. Execute all valid tools in parallel, cancellable
                     async def _exec_tool(
+                        tc: ToolCall,
                         name: str,
                         args: dict,
                         valid: bool,
                         err: str,
+                    ) -> tuple[ToolCall, str, dict, str, bool]:
                         if not valid:
                             return (tc, name, args, err, False)
                         out, ok = await session.tool_router.call_tool(
                         )
                         return (tc, name, args, out, ok)
+                    gather_task = asyncio.ensure_future(asyncio.gather(
                         *[
                             _exec_tool(tc, name, args, valid, err)
                             for tc, name, args, valid, err in parsed_tools
                         ]
+                    ))
+                    cancel_task = asyncio.ensure_future(session._cancelled.wait())
+                    done, _ = await asyncio.wait(
+                        [gather_task, cancel_task],
+                        return_when=asyncio.FIRST_COMPLETED,
                     )
+                    if cancel_task in done:
+                        gather_task.cancel()
+                        try:
+                            await gather_task
+                        except asyncio.CancelledError:
+                            pass
+                        break
+                    cancel_task.cancel()
+                    results = gather_task.result()
                     # 4. Record results and send outputs (order preserved)
                     for tc, tool_name, tool_args, output, success in results:
                         tool_msg = Message(
                             )
                         )
                 # If there are tools requiring approval, ask for batch approval
                 if approval_required_tools:
                     # Prepare batch approval data
                     tools_data = []
+                    for tc, tool_name, tool_args in approval_required_tools:
                         # Resolve sandbox file paths for hf_jobs scripts so the
                         # frontend can display & edit the actual file content.
+                        if tool_name == "hf_jobs" and isinstance(tool_args.get("script"), str):
                             from agent.tools.sandbox_tool import resolve_sandbox_script
                             sandbox = getattr(session, "sandbox", None)
+                            resolved, _ = await resolve_sandbox_script(sandbox, tool_args["script"])
+                            if resolved:
+                                tool_args = {**tool_args, "script": resolved}
+                        tools_data.append({
+                            "tool": tool_name,
+                            "arguments": tool_args,
+                            "tool_call_id": tc.id,
+                        })
+                    await session.send_event(Event(
+                        event_type="approval_required",
+                        data={"tools": tools_data, "count": len(tools_data)},
+                    ))
+                    # Store all approval-requiring tools (ToolCall objects for execution)
                     session.pending_approval = {
+                        "tool_calls": [tc for tc, _, _ in approval_required_tools],
                     }
                     # Return early - wait for EXEC_APPROVAL operation