Aksel Joonas Reedi committed on
Commit
d95cff9
·
1 Parent(s): fdddeaa

Merge pull request #31 from huggingface/agent-improvements

Browse files
agent/config.py CHANGED
@@ -23,8 +23,7 @@ class Config(BaseModel):
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
- max_tool_failures_per_turn: int = 3 # Disable a tool after this many failures in one turn
27
- max_requests_per_turn: int = 50 # Hard cap on LLM requests per agent turn
28
 
29
  # Permission control parameters
30
  confirm_cpu_jobs: bool = True
 
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
+ max_iterations: int = 300 # Max LLM calls per agent turn (-1 = unlimited)
 
27
 
28
  # Permission control parameters
29
  confirm_cpu_jobs: bool = True
agent/context_manager/manager.py CHANGED
@@ -243,114 +243,10 @@ class ContextManager:
243
 
244
  return False
245
 
246
- # Tools whose outputs should never be pruned (too valuable to summarise)
247
- _PRUNE_SKIP_TOOLS: set[str] = {"research", "plan_tool"}
248
-
249
- # Tools whose outputs are pruned via a cheap LLM call instead of
250
- # deterministic truncation (the output structure is too complex for
251
- # a fixed head-slice to capture the answer reliably).
252
- _LLM_PRUNE_TOOLS: set[str] = {"hf_jobs"}
253
-
254
- async def prune_old_tool_outputs(self, model_name: str | None = None) -> None:
255
- """Stage 1 compaction: shrink old tool outputs.
256
-
257
- For any tool message older than the last 6 messages whose content
258
- exceeds 500 chars:
259
- - Tools in _LLM_PRUNE_TOOLS get a cheap LLM summarisation (≤600 tokens).
260
- - All other tools get a deterministic one-line summary.
261
- tool_call_id and name are always preserved.
262
- """
263
- if len(self.items) <= 6:
264
- return
265
-
266
- cutoff = len(self.items) - 6
267
-
268
- # Find the preceding assistant tool_call arguments so the LLM
269
- # knows what question the tool output was answering.
270
- def _find_tool_call_args(tool_call_id: str) -> str | None:
271
- for msg in self.items:
272
- if getattr(msg, "role", None) != "assistant":
273
- continue
274
- for tc in getattr(msg, "tool_calls", None) or []:
275
- tc_id = tc.id if hasattr(tc, "id") else tc.get("id")
276
- if tc_id == tool_call_id:
277
- fn = tc.function if hasattr(tc, "function") else tc.get("function", {})
278
- return fn.arguments if hasattr(fn, "arguments") else fn.get("arguments", "")
279
- return None
280
-
281
- for i in range(cutoff - 1, -1, -1):
282
- msg = self.items[i]
283
- if getattr(msg, "role", None) != "tool":
284
- continue
285
- content = getattr(msg, "content", None) or ""
286
- if len(content) <= 500:
287
- continue
288
-
289
- tool_name = getattr(msg, "name", None) or "tool"
290
- if tool_name in self._PRUNE_SKIP_TOOLS:
291
- continue
292
-
293
- # --- LLM-based pruning for complex tool outputs ---
294
- if tool_name in self._LLM_PRUNE_TOOLS and model_name:
295
- call_args = _find_tool_call_args(getattr(msg, "tool_call_id", ""))
296
- context_line = (
297
- f"The tool was called with: {call_args}\n\n" if call_args else ""
298
- )
299
- try:
300
- hf_key = os.environ.get("INFERENCE_TOKEN")
301
- resp = await acompletion(
302
- model=model_name,
303
- messages=[
304
- Message(
305
- role="user",
306
- content=(
307
- f"{context_line}"
308
- f"Below is the raw output of the '{tool_name}' tool.\n"
309
- "Give the answer to the original request unchanged — "
310
- "preserve all job IDs, numbers, status values, error "
311
- "messages, and metrics exactly. Omit filler/boilerplate. "
312
- "Stay under 600 tokens.\n\n"
313
- f"{content}"
314
- ),
315
- )
316
- ],
317
- max_completion_tokens=600,
318
- api_key=hf_key
319
- if hf_key and model_name.startswith("huggingface/")
320
- else None,
321
- )
322
- msg.content = resp.choices[0].message.content
323
- continue
324
- except Exception:
325
- logger.warning(
326
- "LLM prune failed for %s, falling back to deterministic",
327
- tool_name,
328
- )
329
- # fall through to deterministic pruning below
330
-
331
- # --- Deterministic pruning ---
332
- preview = content[:80]
333
- total = len(content)
334
-
335
- if tool_name == "bash":
336
- exit_code_part = ""
337
- if "exit_code" in content[:200]:
338
- for line in content[:200].splitlines():
339
- if "exit_code" in line:
340
- exit_code_part = "exit_code visible if present, "
341
- break
342
- summary = f"[bash: {exit_code_part}{preview}... ({total} chars)]"
343
- else:
344
- summary = f"[{tool_name}: {preview}... ({total} chars)]"
345
-
346
- msg.content = summary
347
-
348
  async def compact(
349
  self, model_name: str, tool_specs: list[dict] | None = None
350
  ) -> None:
351
  """Remove old messages to keep history under target size"""
352
- await self.prune_old_tool_outputs(model_name=model_name)
353
-
354
  if (self.context_length <= self.max_context) or not self.items:
355
  return
356
 
@@ -358,6 +254,15 @@ class ContextManager:
358
  self.items[0] if self.items and self.items[0].role == "system" else None
359
  )
360
 
 
 
 
 
 
 
 
 
 
361
  # Don't summarize a certain number of just-preceding messages
362
  # Walk back to find a user message to make sure we keep an assistant -> user ->
363
  # assistant general conversation structure
@@ -366,7 +271,7 @@ class ContextManager:
366
  idx -= 1
367
 
368
  recent_messages = self.items[idx:]
369
- messages_to_summarize = self.items[1:idx]
370
 
371
  # improbable, messages would have to be very long
372
  if not messages_to_summarize:
@@ -393,11 +298,11 @@ class ContextManager:
393
  role="assistant", content=response.choices[0].message.content
394
  )
395
 
396
- # Reconstruct: system + summary + recent messages (includes tools)
397
- if system_msg:
398
- self.items = [system_msg, summarized_message] + recent_messages
399
- else:
400
- self.items = [summarized_message] + recent_messages
401
 
402
  self.context_length = (
403
  len(self.system_prompt) // 4 + response.usage.completion_tokens
 
243
 
244
  return False
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  async def compact(
247
  self, model_name: str, tool_specs: list[dict] | None = None
248
  ) -> None:
249
  """Remove old messages to keep history under target size"""
 
 
250
  if (self.context_length <= self.max_context) or not self.items:
251
  return
252
 
 
254
  self.items[0] if self.items and self.items[0].role == "system" else None
255
  )
256
 
257
+ # Preserve the first user message (task prompt) — never summarize it
258
+ first_user_msg = None
259
+ first_user_idx = 1
260
+ for i in range(1, len(self.items)):
261
+ if getattr(self.items[i], "role", None) == "user":
262
+ first_user_msg = self.items[i]
263
+ first_user_idx = i
264
+ break
265
+
266
  # Don't summarize a certain number of just-preceding messages
267
  # Walk back to find a user message to make sure we keep an assistant -> user ->
268
  # assistant general conversation structure
 
271
  idx -= 1
272
 
273
  recent_messages = self.items[idx:]
274
+ messages_to_summarize = self.items[first_user_idx + 1:idx]
275
 
276
  # improbable, messages would have to be very long
277
  if not messages_to_summarize:
 
298
  role="assistant", content=response.choices[0].message.content
299
  )
300
 
301
+ # Reconstruct: system + first user msg + summary + recent messages
302
+ head = [system_msg] if system_msg else []
303
+ if first_user_msg:
304
+ head.append(first_user_msg)
305
+ self.items = head + [summarized_message] + recent_messages
306
 
307
  self.context_length = (
308
  len(self.system_prompt) // 4 + response.usage.completion_tokens
agent/core/agent_loop.py CHANGED
@@ -153,35 +153,6 @@ _MAX_LLM_RETRIES = 3
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
156
- def _append_failure_warning(
157
- output: str,
158
- tool_name: str,
159
- tool_error_counts: dict[str, int],
160
- max_failures: int,
161
- ) -> str:
162
- """Track a tool failure and append a warning to the output.
163
-
164
- Returns the output with an appended warning indicating how many
165
- failures have occurred and whether the LLM should switch approach.
166
- """
167
- tool_error_counts[tool_name] = tool_error_counts.get(tool_name, 0) + 1
168
- count = tool_error_counts[tool_name]
169
- if count >= max_failures:
170
- return output + (
171
- f"\n\n⚠ Tool '{tool_name}' has now failed "
172
- f"{count} times this turn. You should try a "
173
- f"different approach instead of calling this "
174
- f"tool again."
175
- )
176
- remaining = max_failures - count
177
- return output + (
178
- f"\n\n⚠ Tool '{tool_name}' has failed "
179
- f"{count}/{max_failures} times this turn. "
180
- f"{remaining} attempt(s) before you should "
181
- f"switch to a different approach."
182
- )
183
-
184
-
185
  def _is_transient_error(error: Exception) -> bool:
186
  """Return True for errors that are likely transient and worth retrying."""
187
  err_str = str(error).lower()
@@ -200,9 +171,6 @@ def _is_transient_error(error: Exception) -> bool:
200
 
201
  async def _compact_and_notify(session: Session) -> None:
202
  """Run compaction and send event if context was reduced."""
203
- await session.context_manager.prune_old_tool_outputs(
204
- model_name=session.config.model_name,
205
- )
206
  old_length = session.context_manager.context_length
207
  max_ctx = session.context_manager.max_context
208
  logger.debug(
@@ -456,7 +424,7 @@ class Handlers:
456
 
457
  @staticmethod
458
  async def run_agent(
459
- session: Session, text: str, max_iterations: int = 300
460
  ) -> str | None:
461
  """
462
  Handle user input (like user_input_or_turn in codex.rs:1291)
@@ -484,10 +452,9 @@ class Handlers:
484
  iteration = 0
485
  final_response = None
486
  errored = False
487
- tool_error_counts: dict[str, int] = {}
488
 
489
- effective_max = min(max_iterations, session.config.max_requests_per_turn)
490
- while iteration < effective_max:
491
  # ── Cancellation check: before LLM call ──
492
  if session.is_cancelled:
493
  break
@@ -603,7 +570,7 @@ class Handlers:
603
  session.context_manager.context_length,
604
  session.context_manager.max_context,
605
  iteration,
606
- effective_max,
607
  (content or "")[:500],
608
  )
609
  await session.send_event(
@@ -615,7 +582,7 @@ class Handlers:
615
  f"Loop exit: no tool calls. "
616
  f"finish_reason={finish_reason}, "
617
  f"tokens={token_count}/{session.context_manager.max_context}, "
618
- f"iter={iteration}/{effective_max}"
619
  ),
620
  },
621
  )
@@ -760,15 +727,7 @@ class Handlers:
760
  results = gather_task.result()
761
 
762
  # 4. Record results and send outputs (order preserved)
763
- max_failures = session.config.max_tool_failures_per_turn
764
  for tc, tool_name, tool_args, output, success in results:
765
- if not success:
766
- output = _append_failure_warning(
767
- output, tool_name, tool_error_counts, max_failures,
768
- )
769
- else:
770
- tool_error_counts.pop(tool_name, None)
771
-
772
  tool_msg = Message(
773
  role="tool",
774
  content=output,
 
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def _is_transient_error(error: Exception) -> bool:
157
  """Return True for errors that are likely transient and worth retrying."""
158
  err_str = str(error).lower()
 
171
 
172
  async def _compact_and_notify(session: Session) -> None:
173
  """Run compaction and send event if context was reduced."""
 
 
 
174
  old_length = session.context_manager.context_length
175
  max_ctx = session.context_manager.max_context
176
  logger.debug(
 
424
 
425
  @staticmethod
426
  async def run_agent(
427
+ session: Session, text: str,
428
  ) -> str | None:
429
  """
430
  Handle user input (like user_input_or_turn in codex.rs:1291)
 
452
  iteration = 0
453
  final_response = None
454
  errored = False
455
+ max_iterations = session.config.max_iterations
456
 
457
+ while max_iterations == -1 or iteration < max_iterations:
 
458
  # ── Cancellation check: before LLM call ──
459
  if session.is_cancelled:
460
  break
 
570
  session.context_manager.context_length,
571
  session.context_manager.max_context,
572
  iteration,
573
+ max_iterations,
574
  (content or "")[:500],
575
  )
576
  await session.send_event(
 
582
  f"Loop exit: no tool calls. "
583
  f"finish_reason={finish_reason}, "
584
  f"tokens={token_count}/{session.context_manager.max_context}, "
585
+ f"iter={iteration}/{max_iterations}"
586
  ),
587
  },
588
  )
 
727
  results = gather_task.result()
728
 
729
  # 4. Record results and send outputs (order preserved)
 
730
  for tc, tool_name, tool_args, output, success in results:
 
 
 
 
 
 
 
731
  tool_msg = Message(
732
  role="tool",
733
  content=output,
agent/core/session.py CHANGED
@@ -12,7 +12,6 @@ from typing import Any, Optional
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
15
- from agent.tools.file_content_cache import FileContentCache
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -110,8 +109,6 @@ class Session:
110
  self.sandbox = None
111
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
112
 
113
- self.file_content_cache = FileContentCache()
114
-
115
  # Session trajectory logging
116
  self.logged_events: list[dict] = []
117
  self.session_start_time = datetime.now().isoformat()
 
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
109
  self.sandbox = None
110
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
111
 
 
 
112
  # Session trajectory logging
113
  self.logged_events: list[dict] = []
114
  self.session_start_time = datetime.now().isoformat()
agent/main.py CHANGED
@@ -858,7 +858,12 @@ async def main():
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
- async def headless_main(prompt: str, model: str | None = None) -> None:
 
 
 
 
 
862
  """Run a single prompt headlessly and exit."""
863
  import logging
864
 
@@ -876,12 +881,13 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
876
  config.yolo_mode = True # Auto-approve everything in headless mode
877
 
878
  if model:
879
- if model not in VALID_MODEL_IDS:
880
- print(f"ERROR: Unknown model '{model}'. Valid: {', '.join(VALID_MODEL_IDS)}", file=sys.stderr)
881
- sys.exit(1)
882
  config.model_name = model
883
 
 
 
 
884
  print(f"Model: {config.model_name}", file=sys.stderr)
 
885
  print(f"Prompt: {prompt}", file=sys.stderr)
886
  print("---", file=sys.stderr)
887
 
@@ -900,7 +906,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
900
  session_holder=session_holder,
901
  hf_token=hf_token,
902
  local_mode=True,
903
- stream=True,
904
  )
905
  )
906
 
@@ -922,6 +928,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
922
  shimmer = _ThinkingShimmer(console)
923
  stream_buf = _StreamBuffer(console)
924
  _hl_last_tool = [None]
 
925
  shimmer.start()
926
 
927
  while True:
@@ -960,6 +967,26 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
960
  log = event.data.get("log", "") if event.data else ""
961
  if log:
962
  print_tool_log(tool, log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  elif event.event_type == "compacted":
964
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
965
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
@@ -1001,11 +1028,18 @@ if __name__ == "__main__":
1001
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1002
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1003
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
 
 
 
 
1004
  args = parser.parse_args()
1005
 
1006
  try:
1007
  if args.prompt:
1008
- asyncio.run(headless_main(args.prompt, model=args.model))
 
 
 
1009
  else:
1010
  asyncio.run(main())
1011
  except KeyboardInterrupt:
 
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
+ async def headless_main(
862
+ prompt: str,
863
+ model: str | None = None,
864
+ max_iterations: int | None = None,
865
+ stream: bool = True,
866
+ ) -> None:
867
  """Run a single prompt headlessly and exit."""
868
  import logging
869
 
 
881
  config.yolo_mode = True # Auto-approve everything in headless mode
882
 
883
  if model:
 
 
 
884
  config.model_name = model
885
 
886
+ if max_iterations is not None:
887
+ config.max_iterations = max_iterations
888
+
889
  print(f"Model: {config.model_name}", file=sys.stderr)
890
+ print(f"Max iterations: {config.max_iterations}", file=sys.stderr)
891
  print(f"Prompt: {prompt}", file=sys.stderr)
892
  print("---", file=sys.stderr)
893
 
 
906
  session_holder=session_holder,
907
  hf_token=hf_token,
908
  local_mode=True,
909
+ stream=stream,
910
  )
911
  )
912
 
 
928
  shimmer = _ThinkingShimmer(console)
929
  stream_buf = _StreamBuffer(console)
930
  _hl_last_tool = [None]
931
+ _hl_sub_id = [1]
932
  shimmer.start()
933
 
934
  while True:
 
967
  log = event.data.get("log", "") if event.data else ""
968
  if log:
969
  print_tool_log(tool, log)
970
+ elif event.event_type == "approval_required":
971
+ # Auto-approve everything in headless mode (safety net if yolo_mode
972
+ # didn't prevent the approval event for some reason)
973
+ tools_data = event.data.get("tools", []) if event.data else []
974
+ approvals = [
975
+ {
976
+ "tool_call_id": t.get("tool_call_id", ""),
977
+ "approved": True,
978
+ "feedback": None,
979
+ }
980
+ for t in tools_data
981
+ ]
982
+ _hl_sub_id[0] += 1
983
+ await submission_queue.put(Submission(
984
+ id=f"hl_approval_{_hl_sub_id[0]}",
985
+ operation=Operation(
986
+ op_type=OpType.EXEC_APPROVAL,
987
+ data={"approvals": approvals},
988
+ ),
989
+ ))
990
  elif event.event_type == "compacted":
991
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
992
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
 
1028
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1029
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1030
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
1031
+ parser.add_argument("--max-iterations", type=int, default=None,
1032
+ help="Max LLM requests per turn (default: 50, use -1 for unlimited)")
1033
+ parser.add_argument("--no-stream", action="store_true",
1034
+ help="Disable token streaming (use non-streaming LLM calls)")
1035
  args = parser.parse_args()
1036
 
1037
  try:
1038
  if args.prompt:
1039
+ max_iter = args.max_iterations
1040
+ if max_iter is not None and max_iter < 0:
1041
+ max_iter = 10_000 # effectively unlimited
1042
+ asyncio.run(headless_main(args.prompt, model=args.model, max_iterations=max_iter, stream=not args.no_stream))
1043
  else:
1044
  asyncio.run(main())
1045
  except KeyboardInterrupt:
agent/prompts/system_prompt_v3.yaml CHANGED
@@ -46,6 +46,8 @@ system_prompt: |
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
 
 
49
  Dataset format requirements by training method:
50
  SFT: "messages", "text", or "prompt"/"completion"
51
  DPO: "prompt", "chosen", "rejected"
@@ -116,21 +118,29 @@ system_prompt: |
116
 
117
  When running autonomously (no human in the loop), you MUST follow these rules:
118
 
119
- NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, check the timer, or verify outputs. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- After training completes:
122
- 1. Check the output for errors or warnings
123
- 2. Copy/save the trained model to the required output location (e.g. final_model/)
124
- 3. Run evaluation to measure performance
125
- 4. If time remains and performance can improve: iterate (adjust hyperparameters, train longer, try different data)
126
- 5. Verify the final output exists and is valid before stopping
127
 
128
  Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
129
 
130
  The task is NOT done until:
131
- - The required output directory exists (e.g. final_model/) with a valid model
132
  - You have evaluated the model and confirmed it works
133
- - You have used all available time productively
134
 
135
  # Communication
136
 
 
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
49
+ Training logging: always set disable_tqdm=True, logging_strategy="steps", and logging_first_step=True in your TrainingArguments/SFTConfig so loss values are printed as plain text lines you can grep, not hidden inside tqdm progress bars.
50
+
51
  Dataset format requirements by training method:
52
  SFT: "messages", "text", or "prompt"/"completion"
53
  DPO: "prompt", "chosen", "rejected"
 
118
 
119
  When running autonomously (no human in the loop), you MUST follow these rules:
120
 
121
+ NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, verify outputs or plan ahead. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
122
+
123
+ NEVER STOP WORKING. Do NOT decide you are "done" while time remains. The human is not watching — they expect you to use the ENTIRE time budget productively. Do NOT ask "should I continue?" or "is this a good stopping point?" — there is nobody to answer. You run until the timer expires or you are manually killed.
124
+
125
+ Your workflow is a loop, not a checklist. Once you have a working result, KEEP ITERATING:
126
+
127
+ LOOP UNTIL TIME RUNS OUT:
128
+ 1. Research the approach (read docs, find examples, check current APIs)
129
+ 2. Implement the solution (write code, set up training)
130
+ 3. Train and evaluate
131
+ 4. Save the model to the required output location / push it to Hugging Face Hub
132
+ 5. Improve: tune hyperparameters, try different data, adjust the training recipe, try a different approach entirely
133
+ 6. Go to step 1
134
+
135
+ HYPERPARAMETER TUNING: Do not tune hyperparameters by hand one-at-a-time. Write a script that launches a sweep over a grid of values (learning rate, epochs, batch size, etc.) and evaluates each run automatically. One well-designed sweep script beats ten manual experiments.
136
 
137
+ If you run out of ideas: research. Use the research tool to find papers on the task or technique — look for recent methods, ablation results, tricks that worked for similar problems. Re-read the task prompt for angles you missed. Re-read the training logs for clues. Try combining approaches from different papers. Try a fundamentally different strategy from the literature. There is always a paper you haven't read yet.
 
 
 
 
 
138
 
139
  Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
140
 
141
  The task is NOT done until:
142
+ - The required output exists (e.g. final model, metrics reached, dataset updated etc)
143
  - You have evaluated the model and confirmed it works
 
144
 
145
  # Communication
146
 
agent/tools/edit_utils.py CHANGED
@@ -181,7 +181,11 @@ def apply_edit(
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
- raise ValueError("old_str not found in file.")
 
 
 
 
185
  old_str = original_match
186
 
187
  count = content.count(old_str)
@@ -189,8 +193,10 @@ def apply_edit(
189
  if mode == "replace":
190
  if count > 1 and not replace_all:
191
  raise ValueError(
192
- f"old_str appears {count} times. Use replace_all=true to replace all, "
193
- "or provide a more specific old_str."
 
 
194
  )
195
  if replace_all:
196
  new_content = content.replace(old_str, new_str)
 
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
+ raise ValueError(
185
+ "old_str was not found in the file. Make sure old_str matches "
186
+ "the file contents exactly, including whitespace and indentation. "
187
+ "Use the read tool to verify the current file contents before retrying."
188
+ )
189
  old_str = original_match
190
 
191
  count = content.count(old_str)
 
193
  if mode == "replace":
194
  if count > 1 and not replace_all:
195
  raise ValueError(
196
+ f"Found {count} matches of old_str in the file, but replace_all is "
197
+ f"false. To replace all occurrences, set replace_all to true. To "
198
+ f"replace only one, provide a larger old_str with more surrounding "
199
+ f"context to uniquely identify the instance."
200
  )
201
  if replace_all:
202
  new_content = content.replace(old_str, new_str)
agent/tools/file_content_cache.py DELETED
@@ -1,40 +0,0 @@
1
- """Cache for detecting unchanged local file re-reads."""
2
-
3
- from __future__ import annotations
4
-
5
- import hashlib
6
-
7
-
8
- def _short_hash(content: str) -> str:
9
- return hashlib.sha256(content.encode()).hexdigest()[:16]
10
-
11
-
12
- def _resolve(path: str) -> str:
13
- try:
14
- from pathlib import Path
15
- return str(Path(path).resolve())
16
- except Exception:
17
- return path
18
-
19
-
20
- class FileContentCache:
21
- """Tracks file content hashes to skip re-reading unchanged files."""
22
-
23
- def __init__(self) -> None:
24
- self._cache: dict[str, tuple[str, int]] = {}
25
-
26
- def record_read(self, path: str, content: str, turn: int) -> None:
27
- key = _resolve(path)
28
- self._cache[key] = (_short_hash(content), turn)
29
-
30
- def check_unchanged(self, path: str, content: str) -> tuple[bool, int | None]:
31
- key = _resolve(path)
32
- cached = self._cache.get(key)
33
- if cached is None:
34
- return False, None
35
- cached_hash, turn = cached
36
- return _short_hash(content) == cached_hash, turn
37
-
38
- def clear_path(self, path: str) -> None:
39
- key = _resolve(path)
40
- self._cache.pop(key, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent/tools/local_tools.py CHANGED
@@ -15,16 +15,25 @@ import tempfile
15
  from pathlib import Path
16
  from typing import Any
17
 
18
- from agent.tools.sandbox_client import Sandbox
19
 
20
  MAX_OUTPUT_CHARS = 25_000
21
- MAX_LINE_LENGTH = 2000
22
  DEFAULT_READ_LINES = 2000
23
  DEFAULT_TIMEOUT = 120
24
- MAX_TIMEOUT = 600
25
 
26
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def _atomic_write(path: Path, content: str) -> None:
30
  """Write file atomically via temp file + os.replace().
@@ -105,7 +114,14 @@ async def _bash_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
105
  output = "(no output)"
106
  return output, result.returncode == 0
107
  except subprocess.TimeoutExpired:
108
- return f"Command timed out after {timeout}s.", False
 
 
 
 
 
 
 
109
  except Exception as e:
110
  return f"bash error: {e}", False
111
 
@@ -124,17 +140,7 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
124
  except Exception as e:
125
  return f"read error: {e}", False
126
 
127
- # Check if file is unchanged since last read
128
- session = _kw.get("session")
129
- if session is not None:
130
- is_unchanged, last_turn = session.file_content_cache.check_unchanged(
131
- file_path, raw_content
132
- )
133
- if is_unchanged:
134
- return (
135
- f"[File unchanged since turn {last_turn}, "
136
- f"content already in context.]"
137
- ), True
138
 
139
  lines = raw_content.splitlines()
140
  offset = max((args.get("offset") or 1), 1)
@@ -147,11 +153,6 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
147
  line = line[:MAX_LINE_LENGTH] + "..."
148
  numbered.append(f"{i:>6}\t{line}")
149
 
150
- if session is not None:
151
- session.file_content_cache.record_read(
152
- file_path, raw_content, session.turn_count
153
- )
154
-
155
  return "\n".join(numbered), True
156
 
157
 
@@ -161,11 +162,14 @@ async def _write_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
161
  if not file_path:
162
  return "No path provided.", False
163
  p = Path(file_path)
 
 
 
 
 
164
  try:
165
  _atomic_write(p, content)
166
- session = _kw.get("session")
167
- if session is not None:
168
- session.file_content_cache.clear_path(file_path)
169
  msg = f"Wrote {len(content)} bytes to {file_path}"
170
  # Syntax validation for Python files
171
  if p.suffix == ".py":
@@ -195,6 +199,11 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
195
  p = Path(file_path)
196
  if not p.exists():
197
  return f"File not found: {file_path}", False
 
 
 
 
 
198
 
199
  try:
200
  text = p.read_text()
@@ -213,10 +222,6 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
213
  except Exception as e:
214
  return f"edit write error: {e}", False
215
 
216
- session = _kw.get("session")
217
- if session is not None:
218
- session.file_content_cache.clear_path(file_path)
219
-
220
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
221
  if fuzzy_note:
222
  msg += f" {fuzzy_note}"
@@ -235,18 +240,22 @@ _LOCAL_TOOL_SPECS = {
235
  "description": (
236
  "Run a shell command on the local machine and return stdout/stderr.\n"
237
  "\n"
238
- "Commands run in a shell at the working directory (default: current directory). "
239
- "Each invocation is independent.\n"
240
- "\n"
241
- "AVOID using bash for operations covered by specialized tools:\n"
242
- "- File reading: use read (not cat/head/tail)\n"
243
- "- File editing: use edit (not sed/awk)\n"
244
- "- File writing: use write (not echo/cat <<EOF)\n"
245
  "\n"
 
246
  "Chain dependent commands with &&. Independent commands should be "
247
  "separate bash calls (they can run in parallel).\n"
248
  "\n"
249
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
250
  ),
251
  "parameters": {
252
  "type": "object",
@@ -267,22 +276,125 @@ _LOCAL_TOOL_SPECS = {
267
  },
268
  "timeout": {
269
  "type": "integer",
270
- "description": "Timeout in seconds (default: 120, max: 600).",
271
  },
272
  },
273
  },
274
  },
275
  "read": {
276
- "description": Sandbox.TOOLS["read"]["description"],
277
- "parameters": Sandbox.TOOLS["read"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  },
279
  "write": {
280
- "description": Sandbox.TOOLS["write"]["description"],
281
- "parameters": Sandbox.TOOLS["write"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  },
283
  "edit": {
284
- "description": Sandbox.TOOLS["edit"]["description"],
285
- "parameters": Sandbox.TOOLS["edit"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  },
287
  }
288
 
 
15
  from pathlib import Path
16
  from typing import Any
17
 
 
18
 
19
  MAX_OUTPUT_CHARS = 25_000
20
+ MAX_LINE_LENGTH = 4000
21
  DEFAULT_READ_LINES = 2000
22
  DEFAULT_TIMEOUT = 120
23
+ MAX_TIMEOUT = 36000 # 10 hours — needed for long training runs (e.g. PostTrainBench)
24
 
25
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
26
 
27
+ # Track files that have been read this session (enforces read-before-write/edit)
28
+ _files_read: set[str] = set()
29
+
30
+
31
+ def _resolve_path(path: str) -> str:
32
+ try:
33
+ return str(Path(path).resolve())
34
+ except Exception:
35
+ return path
36
+
37
 
38
  def _atomic_write(path: Path, content: str) -> None:
39
  """Write file atomically via temp file + os.replace().
 
114
  output = "(no output)"
115
  return output, result.returncode == 0
116
  except subprocess.TimeoutExpired:
117
+ return (
118
+ f"Command timed out after {timeout}s and was killed.\n\n"
119
+ f"For long-running commands, run in the background and poll:\n"
120
+ f" nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
121
+ f"Then check status with:\n"
122
+ f" kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
123
+ f" tail -n 50 /tmp/output.log"
124
+ ), False
125
  except Exception as e:
126
  return f"bash error: {e}", False
127
 
 
140
  except Exception as e:
141
  return f"read error: {e}", False
142
 
143
+ _files_read.add(_resolve_path(file_path))
 
 
 
 
 
 
 
 
 
 
144
 
145
  lines = raw_content.splitlines()
146
  offset = max((args.get("offset") or 1), 1)
 
153
  line = line[:MAX_LINE_LENGTH] + "..."
154
  numbered.append(f"{i:>6}\t{line}")
155
 
 
 
 
 
 
156
  return "\n".join(numbered), True
157
 
158
 
 
162
  if not file_path:
163
  return "No path provided.", False
164
  p = Path(file_path)
165
+ if p.exists() and _resolve_path(file_path) not in _files_read:
166
+ return (
167
+ f"You must read {file_path} before overwriting it. "
168
+ f"Use the read tool first to see current contents."
169
+ ), False
170
  try:
171
  _atomic_write(p, content)
172
+ _files_read.add(_resolve_path(file_path))
 
 
173
  msg = f"Wrote {len(content)} bytes to {file_path}"
174
  # Syntax validation for Python files
175
  if p.suffix == ".py":
 
199
  p = Path(file_path)
200
  if not p.exists():
201
  return f"File not found: {file_path}", False
202
+ if _resolve_path(file_path) not in _files_read:
203
+ return (
204
+ f"You must read {file_path} before editing it. "
205
+ f"Use the read tool first to see current contents."
206
+ ), False
207
 
208
  try:
209
  text = p.read_text()
 
222
  except Exception as e:
223
  return f"edit write error: {e}", False
224
 
 
 
 
 
225
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
226
  if fuzzy_note:
227
  msg += f" {fuzzy_note}"
 
240
  "description": (
241
  "Run a shell command on the local machine and return stdout/stderr.\n"
242
  "\n"
243
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
244
+ "- To read files: use read (not cat/head/tail)\n"
245
+ "- To edit files: use edit (not sed/awk)\n"
246
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
247
  "\n"
248
+ "Commands run in a shell at the working directory. Each invocation is independent.\n"
249
  "Chain dependent commands with &&. Independent commands should be "
250
  "separate bash calls (they can run in parallel).\n"
251
  "\n"
252
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
253
+ " nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
254
+ "Then check status:\n"
255
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
256
+ " tail -n 50 /tmp/output.log\n"
257
+ "\n"
258
+ "Timeout default 120s, max 36000s."
259
  ),
260
  "parameters": {
261
  "type": "object",
 
276
  },
277
  "timeout": {
278
  "type": "integer",
279
+ "description": "Optional timeout in seconds (default: 120, max: 36000).",
280
  },
281
  },
282
  },
283
  },
284
  "read": {
285
+ "description": (
286
+ "Reads a file from the local filesystem. Returns contents with line numbers "
287
+ "(cat -n format).\n"
288
+ "\n"
289
+ "Usage:\n"
290
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
291
+ "- You can optionally specify offset and limit for large files, but prefer "
292
+ "reading the whole file first.\n"
293
+ "- Lines longer than 4000 chars are truncated.\n"
294
+ "- Cannot read directories — use bash with 'ls' instead.\n"
295
+ "- You should read multiple potentially useful files in parallel when possible.\n"
296
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
297
+ "write tools will reject operations on files you haven't read."
298
+ ),
299
+ "parameters": {
300
+ "type": "object",
301
+ "required": ["path"],
302
+ "additionalProperties": False,
303
+ "properties": {
304
+ "path": {
305
+ "type": "string",
306
+ "description": "Absolute path to the file to read.",
307
+ },
308
+ "offset": {
309
+ "type": "integer",
310
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
311
+ },
312
+ "limit": {
313
+ "type": "integer",
314
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
315
+ },
316
+ },
317
+ },
318
  },
319
  "write": {
320
+ "description": (
321
+ "Writes a file to the local filesystem. Overwrites the existing file if one "
322
+ "exists at the path.\n"
323
+ "\n"
324
+ "- If this is an existing file, you MUST use the read tool first. This tool "
325
+ "will fail if you did not read the file first.\n"
326
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
327
+ "with write.\n"
328
+ "- Creates parent directories as needed."
329
+ ),
330
+ "parameters": {
331
+ "type": "object",
332
+ "required": ["path", "content"],
333
+ "additionalProperties": False,
334
+ "properties": {
335
+ "path": {
336
+ "type": "string",
337
+ "description": "Absolute path to the file to write.",
338
+ },
339
+ "content": {
340
+ "type": "string",
341
+ "description": "The complete file content to write.",
342
+ },
343
+ },
344
+ },
345
  },
346
  "edit": {
347
+ "description": (
348
+ "Performs string replacements in files. Supports exact matching with "
349
+ "fuzzy fallback.\n"
350
+ "\n"
351
+ "Usage:\n"
352
+ "- You must read the file at least once before editing. This tool will "
353
+ "error if you attempt an edit without reading the file.\n"
354
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
355
+ "a larger string with more surrounding context to make it unique, or set "
356
+ "replace_all to true.\n"
357
+ "- old_str and new_str must differ.\n"
358
+ "- Preserve indentation exactly as it appears in the file.\n"
359
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
360
+ "- To delete code, set new_str to empty string.\n"
361
+ "- Use replace_all for renaming variables or strings across the file.\n"
362
+ "\n"
363
+ "Modes:\n"
364
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
365
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
366
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
367
+ ),
368
+ "parameters": {
369
+ "type": "object",
370
+ "required": ["path", "old_str", "new_str"],
371
+ "additionalProperties": False,
372
+ "properties": {
373
+ "path": {
374
+ "type": "string",
375
+ "description": "Absolute path to the file to edit.",
376
+ },
377
+ "old_str": {
378
+ "type": "string",
379
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
380
+ },
381
+ "new_str": {
382
+ "type": "string",
383
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
384
+ },
385
+ "replace_all": {
386
+ "type": "boolean",
387
+ "description": "Replace all occurrences of old_str (default: false).",
388
+ "default": False,
389
+ },
390
+ "mode": {
391
+ "type": "string",
392
+ "enum": ["replace", "append_after", "prepend_before"],
393
+ "description": "Edit mode (default: replace).",
394
+ "default": "replace",
395
+ },
396
+ },
397
+ },
398
  },
399
  }
400
 
agent/tools/research_tool.py CHANGED
@@ -14,10 +14,17 @@ from typing import Any
14
 
15
  from litellm import Message, acompletion
16
 
 
17
  from agent.core.session import Event
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
21
  # Tools the research agent can use (read-only subset)
22
  RESEARCH_TOOL_NAMES = {
23
  "read",
@@ -171,7 +178,7 @@ def _resolve_llm_params(model_name: str) -> dict:
171
  def _get_research_model(main_model: str) -> str:
172
  """Pick a cheaper model for research based on the main model."""
173
  if "anthropic/" in main_model:
174
- return "anthropic/claude-haiku-4-5-20251001"
175
  # For non-Anthropic models (HF router etc.), use the same model
176
  return main_model
177
 
@@ -221,12 +228,60 @@ async def research_handler(
221
 
222
  _tool_uses = 0
223
  _total_tokens = 0
 
224
 
225
  await _log("Starting research sub-agent...")
226
 
227
- # Run the research loop (max 20 iterations research should be focused)
228
- max_iterations = 20
229
  for _iteration in range(max_iterations):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  try:
231
  response = await acompletion(
232
  messages=messages,
@@ -242,7 +297,7 @@ async def research_handler(
242
 
243
  # Track tokens
244
  if response.usage:
245
- _total_tokens += response.usage.total_tokens
246
  await _log(f"tokens:{_total_tokens}")
247
 
248
  choice = response.choices[0]
@@ -308,8 +363,31 @@ async def research_handler(
308
  )
309
  )
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  return (
312
- "Research agent hit iteration limit (20). "
313
  "Partial findings may be incomplete — try a more focused task.",
314
  False,
315
  )
 
14
 
15
  from litellm import Message, acompletion
16
 
17
+ from agent.core.doom_loop import check_for_doom_loop
18
  from agent.core.session import Event
19
 
20
  logger = logging.getLogger(__name__)
21
 
22
+ # Context budget for the research subagent (tokens).
23
+ # When usage exceeds WARN threshold, the subagent is told to wrap up.
24
+ # At MAX, the loop is force-stopped and whatever content exists is returned.
25
+ _RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k
26
+ _RESEARCH_CONTEXT_MAX = 190_000
27
+
28
  # Tools the research agent can use (read-only subset)
29
  RESEARCH_TOOL_NAMES = {
30
  "read",
 
178
  def _get_research_model(main_model: str) -> str:
179
  """Pick a cheaper model for research based on the main model."""
180
  if "anthropic/" in main_model:
181
+ return "anthropic/claude-sonnet-4-6"
182
  # For non-Anthropic models (HF router etc.), use the same model
183
  return main_model
184
 
 
228
 
229
  _tool_uses = 0
230
  _total_tokens = 0
231
+ _warned_context = False
232
 
233
  await _log("Starting research sub-agent...")
234
 
235
+ # Run the research loop context budget is the real limiter
236
+ max_iterations = 60
237
  for _iteration in range(max_iterations):
238
+ # ── Doom-loop detection ──
239
+ doom_prompt = check_for_doom_loop(messages)
240
+ if doom_prompt:
241
+ logger.warning("Research sub-agent doom loop detected at iteration %d", _iteration)
242
+ await _log("Doom loop detected — injecting corrective prompt")
243
+ messages.append(Message(role="user", content=doom_prompt))
244
+
245
+ # ── Context budget: warn at 75%, hard-stop at 95% ──
246
+ if _total_tokens >= _RESEARCH_CONTEXT_MAX:
247
+ logger.warning(
248
+ "Research sub-agent hit context max (%d tokens) — forcing summary",
249
+ _total_tokens,
250
+ )
251
+ await _log(f"Context limit reached ({_total_tokens} tokens) — forcing wrap-up")
252
+ # Ask for a final summary with no tools
253
+ messages.append(Message(
254
+ role="user",
255
+ content=(
256
+ "[SYSTEM: CONTEXT LIMIT REACHED] You have used all available context. "
257
+ "Summarize your findings NOW. Do NOT call any more tools."
258
+ ),
259
+ ))
260
+ try:
261
+ response = await acompletion(
262
+ messages=messages,
263
+ tools=None, # no tools — force text response
264
+ stream=False,
265
+ timeout=120,
266
+ **llm_params,
267
+ )
268
+ content = response.choices[0].message.content or ""
269
+ return content or "Research context exhausted — no summary produced.", bool(content)
270
+ except Exception:
271
+ return "Research context exhausted and summary call failed.", False
272
+
273
+ if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN:
274
+ _warned_context = True
275
+ await _log(f"Context at {_total_tokens} tokens — nudging to wrap up")
276
+ messages.append(Message(
277
+ role="user",
278
+ content=(
279
+ "[SYSTEM: You have used 75% of your context budget. "
280
+ "Start wrapping up: finish any critical lookups, then "
281
+ "produce your final summary within the next 1-2 iterations.]"
282
+ ),
283
+ ))
284
+
285
  try:
286
  response = await acompletion(
287
  messages=messages,
 
297
 
298
  # Track tokens
299
  if response.usage:
300
+ _total_tokens = response.usage.total_tokens
301
  await _log(f"tokens:{_total_tokens}")
302
 
303
  choice = response.choices[0]
 
363
  )
364
  )
365
 
366
+ # ── Iteration limit: try to salvage findings ──
367
+ await _log("Iteration limit reached — extracting summary")
368
+ messages.append(Message(
369
+ role="user",
370
+ content=(
371
+ "[SYSTEM: ITERATION LIMIT] You have reached the maximum number of research "
372
+ "iterations. Summarize ALL findings so far. Do NOT call any more tools."
373
+ ),
374
+ ))
375
+ try:
376
+ response = await acompletion(
377
+ messages=messages,
378
+ tools=None,
379
+ stream=False,
380
+ timeout=120,
381
+ **llm_params,
382
+ )
383
+ content = response.choices[0].message.content or ""
384
+ if content:
385
+ return content, True
386
+ except Exception as e:
387
+ logger.error("Research summary call failed: %s", e)
388
+
389
  return (
390
+ "Research agent hit iteration limit (60). "
391
  "Partial findings may be incomplete — try a more focused task.",
392
  False,
393
  )
agent/tools/sandbox_client.py CHANGED
@@ -57,7 +57,7 @@ HARDWARE_OPTIONS = [
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
- LINE_LIMIT = 2000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
@@ -855,22 +855,23 @@ class Sandbox:
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
- "Commands run in a shell at the working directory (default /app). "
859
- "Each invocation is independent use files in /app to persist state.\n"
860
- "\n"
861
- "AVOID using bash for operations covered by specialized tools:\n"
862
- "- File reading: use read (not cat/head/tail)\n"
863
- "- File editing: use edit (not sed/awk)\n"
864
- "- File writing: use write (not echo/cat <<EOF)\n"
865
- "\n"
866
- "For long-running tasks, background them:\n"
867
- " nohup uv run train.py > /app/train.log 2>&1 &\n"
868
- "Then check with read on the log file.\n"
869
  "\n"
 
 
870
  "Chain dependent commands with &&. Independent commands should be "
871
  "separate bash calls (they can run in parallel).\n"
872
  "\n"
873
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
874
  ),
875
  "parameters": {
876
  "type": "object",
@@ -883,7 +884,7 @@ class Sandbox:
883
  },
884
  "description": {
885
  "type": "string",
886
- "description": "Short description (5-10 words, active voice). E.g. 'Install dependencies', 'Run training script'.",
887
  },
888
  "work_dir": {
889
  "type": "string",
@@ -891,20 +892,25 @@ class Sandbox:
891
  },
892
  "timeout": {
893
  "type": "integer",
894
- "description": "Timeout in seconds (default: 240, max: 1200).",
895
  },
896
  },
897
  },
898
  },
899
  "read": {
900
  "description": (
901
- "Read file contents with line numbers (cat -n format).\n"
902
- "\n"
903
- "Returns the first 2000 lines by default. For large files, use offset/limit "
904
- "to read a specific range. Line numbers always match the original file.\n"
905
  "\n"
906
- "Lines longer than 2000 chars are truncated.\n"
907
- "Cannot read directories use bash with 'ls' instead."
 
 
 
 
 
 
 
908
  ),
909
  "parameters": {
910
  "type": "object",
@@ -917,21 +923,25 @@ class Sandbox:
917
  },
918
  "offset": {
919
  "type": "integer",
920
- "description": "Start from this line (1-based). Only if file is too large.",
921
  },
922
  "limit": {
923
  "type": "integer",
924
- "description": "Number of lines to read. Only if file is too large.",
925
  },
926
  },
927
  },
928
  },
929
  "write": {
930
  "description": (
931
- "Create or overwrite a file. Creates parent directories as needed.\n"
 
932
  "\n"
933
- "For existing files, you MUST read the file first (system enforced). "
934
- "Prefer edit for modifications."
 
 
 
935
  ),
936
  "parameters": {
937
  "type": "object",
@@ -944,32 +954,32 @@ class Sandbox:
944
  },
945
  "content": {
946
  "type": "string",
947
- "description": "Complete file content.",
948
  },
949
  },
950
  },
951
  },
952
  "edit": {
953
  "description": (
954
- "Targeted edit via string replacement with fuzzy matching fallback.\n"
 
955
  "\n"
956
- "Modes:\n"
957
- "- replace (default): replace first occurrence of old_str with new_str.\n"
958
- "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
959
- "- prepend_before: insert new_str immediately before old_str (old_str is kept).\n"
960
- "\n"
961
- "Rules:\n"
962
- "- old_str must appear EXACTLY once (unless replace_all is true).\n"
963
- "- Include enough context in old_str for uniqueness.\n"
964
  "- old_str and new_str must differ.\n"
965
- "- Preserve indentation exactly.\n"
 
966
  "- To delete code, set new_str to empty string.\n"
967
- "- File MUST have been read this session (system enforced).\n"
968
- "- Do NOT include line number prefixes in old_str/new_str.\n"
969
  "\n"
970
- "If exact match fails, the tool automatically tries trimmed/normalized matching.\n"
971
- "Use replace_all=true for batch operations like variable renaming.\n"
972
- "Use append_after/prepend_before to insert code without replacing existing code."
 
973
  ),
974
  "parameters": {
975
  "type": "object",
@@ -978,16 +988,19 @@ class Sandbox:
978
  "properties": {
979
  "path": {
980
  "type": "string",
981
- "description": "Absolute path to the file.",
982
  },
983
  "old_str": {
984
  "type": "string",
985
- "description": "Text to find (fuzzy matching used as fallback).",
 
 
 
 
986
  },
987
- "new_str": {"type": "string", "description": "Replacement text (or text to insert for append_after/prepend_before)."},
988
  "replace_all": {
989
  "type": "boolean",
990
- "description": "Replace all occurrences (default: false).",
991
  "default": False,
992
  },
993
  "mode": {
 
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
+ LINE_LIMIT = 4000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
 
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
859
+ "- To read files: use read (not cat/head/tail)\n"
860
+ "- To edit files: use edit (not sed/awk)\n"
861
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
 
 
 
 
862
  "\n"
863
+ "Commands run in a shell at /app. Each invocation is independent — "
864
+ "use files in /app to persist state.\n"
865
  "Chain dependent commands with &&. Independent commands should be "
866
  "separate bash calls (they can run in parallel).\n"
867
  "\n"
868
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
869
+ " nohup <command> > /app/output.log 2>&1 & echo $!\n"
870
+ "Then check status:\n"
871
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
872
+ " tail -n 50 /app/output.log\n"
873
+ "\n"
874
+ "Timeout default 240s, max 1200s."
875
  ),
876
  "parameters": {
877
  "type": "object",
 
884
  },
885
  "description": {
886
  "type": "string",
887
+ "description": "Short description (5-10 words, active voice).",
888
  },
889
  "work_dir": {
890
  "type": "string",
 
892
  },
893
  "timeout": {
894
  "type": "integer",
895
+ "description": "Optional timeout in seconds (default: 240, max: 1200).",
896
  },
897
  },
898
  },
899
  },
900
  "read": {
901
  "description": (
902
+ "Reads a file from the sandbox filesystem. Returns contents with line "
903
+ "numbers (cat -n format).\n"
 
 
904
  "\n"
905
+ "Usage:\n"
906
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
907
+ "- You can optionally specify offset and limit for large files, but prefer "
908
+ "reading the whole file first.\n"
909
+ "- Lines longer than 4000 chars are truncated.\n"
910
+ "- Cannot read directories — use bash with 'ls' instead.\n"
911
+ "- You should read multiple potentially useful files in parallel when possible.\n"
912
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
913
+ "write tools will reject operations on files you haven't read."
914
  ),
915
  "parameters": {
916
  "type": "object",
 
923
  },
924
  "offset": {
925
  "type": "integer",
926
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
927
  },
928
  "limit": {
929
  "type": "integer",
930
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
931
  },
932
  },
933
  },
934
  },
935
  "write": {
936
  "description": (
937
+ "Writes a file to the sandbox filesystem. Overwrites the existing file if "
938
+ "one exists at the path.\n"
939
  "\n"
940
+ "- If this is an existing file, you MUST use the read tool first. This tool "
941
+ "will fail if you did not read the file first.\n"
942
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
943
+ "with write.\n"
944
+ "- Creates parent directories as needed."
945
  ),
946
  "parameters": {
947
  "type": "object",
 
954
  },
955
  "content": {
956
  "type": "string",
957
+ "description": "The complete file content to write.",
958
  },
959
  },
960
  },
961
  },
962
  "edit": {
963
  "description": (
964
+ "Performs string replacements in files. Supports exact matching with "
965
+ "fuzzy fallback.\n"
966
  "\n"
967
+ "Usage:\n"
968
+ "- You must read the file at least once before editing. This tool will "
969
+ "error if you attempt an edit without reading the file.\n"
970
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
971
+ "a larger string with more surrounding context to make it unique, or set "
972
+ "replace_all to true.\n"
 
 
973
  "- old_str and new_str must differ.\n"
974
+ "- Preserve indentation exactly as it appears in the file.\n"
975
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
976
  "- To delete code, set new_str to empty string.\n"
977
+ "- Use replace_all for renaming variables or strings across the file.\n"
 
978
  "\n"
979
+ "Modes:\n"
980
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
981
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
982
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
983
  ),
984
  "parameters": {
985
  "type": "object",
 
988
  "properties": {
989
  "path": {
990
  "type": "string",
991
+ "description": "Absolute path to the file to edit.",
992
  },
993
  "old_str": {
994
  "type": "string",
995
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
996
+ },
997
+ "new_str": {
998
+ "type": "string",
999
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
1000
  },
 
1001
  "replace_all": {
1002
  "type": "boolean",
1003
+ "description": "Replace all occurrences of old_str (default: false).",
1004
  "default": False,
1005
  },
1006
  "mode": {
agent/tools/sandbox_tool.py CHANGED
@@ -245,25 +245,6 @@ def _make_tool_handler(sandbox_tool_name: str):
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
248
- cache = getattr(session, "file_content_cache", None)
249
- file_path = args.get("path", "")
250
-
251
- if sandbox_tool_name == "read" and cache and file_path:
252
- is_unchanged, last_turn = cache.check_unchanged(
253
- f"sandbox:{file_path}", output
254
- )
255
- if is_unchanged:
256
- return (
257
- f"[File unchanged since turn {last_turn}, "
258
- f"content already in context.]"
259
- ), True
260
- cache.record_read(
261
- f"sandbox:{file_path}", output, session.turn_count
262
- )
263
-
264
- if sandbox_tool_name in ("write", "edit") and cache and file_path:
265
- cache.clear_path(f"sandbox:{file_path}")
266
-
267
  return output, True
268
  else:
269
  error_msg = result.error or "Unknown error"
 
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  return output, True
249
  else:
250
  error_msg = result.error or "Unknown error"
pyproject.toml CHANGED
@@ -3,7 +3,7 @@ name = "hf-agent"
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
- requires-python = ">=3.12"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
@@ -49,3 +49,13 @@ dev = [
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
 
 
 
 
 
 
 
 
 
 
 
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
+ requires-python = ">=3.11"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
 
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
52
+
53
+ [build-system]
54
+ requires = ["setuptools>=64"]
55
+ build-backend = "setuptools.build_meta"
56
+
57
+ [tool.setuptools.packages.find]
58
+ include = ["agent*"]
59
+
60
+ [tool.uv]
61
+ package = true
uv.lock CHANGED
@@ -871,7 +871,7 @@ wheels = [
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
- source = { virtual = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
@@ -890,6 +890,7 @@ agent = [
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
 
893
  { name = "thefuzz" },
894
  { name = "uvicorn", extra = ["standard"] },
895
  { name = "websockets" },
@@ -909,6 +910,7 @@ all = [
909
  { name = "prompt-toolkit" },
910
  { name = "pytest" },
911
  { name = "requests" },
 
912
  { name = "tenacity" },
913
  { name = "thefuzz" },
914
  { name = "uvicorn", extra = ["standard"] },
@@ -945,6 +947,7 @@ requires-dist = [
945
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
946
  { name = "python-dotenv", specifier = ">=1.2.1" },
947
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
 
948
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
949
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
950
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },
 
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
+ source = { editable = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
 
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
893
+ { name = "rich" },
894
  { name = "thefuzz" },
895
  { name = "uvicorn", extra = ["standard"] },
896
  { name = "websockets" },
 
910
  { name = "prompt-toolkit" },
911
  { name = "pytest" },
912
  { name = "requests" },
913
+ { name = "rich" },
914
  { name = "tenacity" },
915
  { name = "thefuzz" },
916
  { name = "uvicorn", extra = ["standard"] },
 
947
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
948
  { name = "python-dotenv", specifier = ">=1.2.1" },
949
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
950
+ { name = "rich", marker = "extra == 'agent'", specifier = ">=13.0.0" },
951
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
952
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
953
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },