Aksel Joonas Reedi committed on
Commit
d95cff9
·
1 Parent(s): fdddeaa

Merge pull request #31 from huggingface/agent-improvements

Browse files
agent/config.py CHANGED
@@ -23,8 +23,7 @@ class Config(BaseModel):
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
- max_tool_failures_per_turn: int = 3 # Disable a tool after this many failures in one turn
27
- max_requests_per_turn: int = 50 # Hard cap on LLM requests per agent turn
28
 
29
  # Permission control parameters
30
  confirm_cpu_jobs: bool = True
 
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
+ max_iterations: int = 300 # Max LLM calls per agent turn (-1 = unlimited)
 
27
 
28
  # Permission control parameters
29
  confirm_cpu_jobs: bool = True
agent/context_manager/manager.py CHANGED
@@ -243,114 +243,10 @@ class ContextManager:
243
 
244
  return False
245
 
246
- # Tools whose outputs should never be pruned (too valuable to summarise)
247
- _PRUNE_SKIP_TOOLS: set[str] = {"research", "plan_tool"}
248
-
249
- # Tools whose outputs are pruned via a cheap LLM call instead of
250
- # deterministic truncation (the output structure is too complex for
251
- # a fixed head-slice to capture the answer reliably).
252
- _LLM_PRUNE_TOOLS: set[str] = {"hf_jobs"}
253
-
254
- async def prune_old_tool_outputs(self, model_name: str | None = None) -> None:
255
- """Stage 1 compaction: shrink old tool outputs.
256
-
257
- For any tool message older than the last 6 messages whose content
258
- exceeds 500 chars:
259
- - Tools in _LLM_PRUNE_TOOLS get a cheap LLM summarisation (≤600 tokens).
260
- - All other tools get a deterministic one-line summary.
261
- tool_call_id and name are always preserved.
262
- """
263
- if len(self.items) <= 6:
264
- return
265
-
266
- cutoff = len(self.items) - 6
267
-
268
- # Find the preceding assistant tool_call arguments so the LLM
269
- # knows what question the tool output was answering.
270
- def _find_tool_call_args(tool_call_id: str) -> str | None:
271
- for msg in self.items:
272
- if getattr(msg, "role", None) != "assistant":
273
- continue
274
- for tc in getattr(msg, "tool_calls", None) or []:
275
- tc_id = tc.id if hasattr(tc, "id") else tc.get("id")
276
- if tc_id == tool_call_id:
277
- fn = tc.function if hasattr(tc, "function") else tc.get("function", {})
278
- return fn.arguments if hasattr(fn, "arguments") else fn.get("arguments", "")
279
- return None
280
-
281
- for i in range(cutoff - 1, -1, -1):
282
- msg = self.items[i]
283
- if getattr(msg, "role", None) != "tool":
284
- continue
285
- content = getattr(msg, "content", None) or ""
286
- if len(content) <= 500:
287
- continue
288
-
289
- tool_name = getattr(msg, "name", None) or "tool"
290
- if tool_name in self._PRUNE_SKIP_TOOLS:
291
- continue
292
-
293
- # --- LLM-based pruning for complex tool outputs ---
294
- if tool_name in self._LLM_PRUNE_TOOLS and model_name:
295
- call_args = _find_tool_call_args(getattr(msg, "tool_call_id", ""))
296
- context_line = (
297
- f"The tool was called with: {call_args}\n\n" if call_args else ""
298
- )
299
- try:
300
- hf_key = os.environ.get("INFERENCE_TOKEN")
301
- resp = await acompletion(
302
- model=model_name,
303
- messages=[
304
- Message(
305
- role="user",
306
- content=(
307
- f"{context_line}"
308
- f"Below is the raw output of the '{tool_name}' tool.\n"
309
- "Give the answer to the original request unchanged — "
310
- "preserve all job IDs, numbers, status values, error "
311
- "messages, and metrics exactly. Omit filler/boilerplate. "
312
- "Stay under 600 tokens.\n\n"
313
- f"{content}"
314
- ),
315
- )
316
- ],
317
- max_completion_tokens=600,
318
- api_key=hf_key
319
- if hf_key and model_name.startswith("huggingface/")
320
- else None,
321
- )
322
- msg.content = resp.choices[0].message.content
323
- continue
324
- except Exception:
325
- logger.warning(
326
- "LLM prune failed for %s, falling back to deterministic",
327
- tool_name,
328
- )
329
- # fall through to deterministic pruning below
330
-
331
- # --- Deterministic pruning ---
332
- preview = content[:80]
333
- total = len(content)
334
-
335
- if tool_name == "bash":
336
- exit_code_part = ""
337
- if "exit_code" in content[:200]:
338
- for line in content[:200].splitlines():
339
- if "exit_code" in line:
340
- exit_code_part = "exit_code visible if present, "
341
- break
342
- summary = f"[bash: {exit_code_part}{preview}... ({total} chars)]"
343
- else:
344
- summary = f"[{tool_name}: {preview}... ({total} chars)]"
345
-
346
- msg.content = summary
347
-
348
  async def compact(
349
  self, model_name: str, tool_specs: list[dict] | None = None
350
  ) -> None:
351
  """Remove old messages to keep history under target size"""
352
- await self.prune_old_tool_outputs(model_name=model_name)
353
-
354
  if (self.context_length <= self.max_context) or not self.items:
355
  return
356
 
@@ -358,6 +254,15 @@ class ContextManager:
358
  self.items[0] if self.items and self.items[0].role == "system" else None
359
  )
360
 
 
 
 
 
 
 
 
 
 
361
  # Don't summarize a certain number of just-preceding messages
362
  # Walk back to find a user message to make sure we keep an assistant -> user ->
363
  # assistant general conversation structure
@@ -366,7 +271,7 @@ class ContextManager:
366
  idx -= 1
367
 
368
  recent_messages = self.items[idx:]
369
- messages_to_summarize = self.items[1:idx]
370
 
371
  # improbable, messages would have to be very long
372
  if not messages_to_summarize:
@@ -393,11 +298,11 @@ class ContextManager:
393
  role="assistant", content=response.choices[0].message.content
394
  )
395
 
396
- # Reconstruct: system + summary + recent messages (includes tools)
397
- if system_msg:
398
- self.items = [system_msg, summarized_message] + recent_messages
399
- else:
400
- self.items = [summarized_message] + recent_messages
401
 
402
  self.context_length = (
403
  len(self.system_prompt) // 4 + response.usage.completion_tokens
 
243
 
244
  return False
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  async def compact(
247
  self, model_name: str, tool_specs: list[dict] | None = None
248
  ) -> None:
249
  """Remove old messages to keep history under target size"""
 
 
250
  if (self.context_length <= self.max_context) or not self.items:
251
  return
252
 
 
254
  self.items[0] if self.items and self.items[0].role == "system" else None
255
  )
256
 
257
+ # Preserve the first user message (task prompt) — never summarize it
258
+ first_user_msg = None
259
+ first_user_idx = 1
260
+ for i in range(1, len(self.items)):
261
+ if getattr(self.items[i], "role", None) == "user":
262
+ first_user_msg = self.items[i]
263
+ first_user_idx = i
264
+ break
265
+
266
  # Don't summarize a certain number of just-preceding messages
267
  # Walk back to find a user message to make sure we keep an assistant -> user ->
268
  # assistant general conversation structure
 
271
  idx -= 1
272
 
273
  recent_messages = self.items[idx:]
274
+ messages_to_summarize = self.items[first_user_idx + 1:idx]
275
 
276
  # improbable, messages would have to be very long
277
  if not messages_to_summarize:
 
298
  role="assistant", content=response.choices[0].message.content
299
  )
300
 
301
+ # Reconstruct: system + first user msg + summary + recent messages
302
+ head = [system_msg] if system_msg else []
303
+ if first_user_msg:
304
+ head.append(first_user_msg)
305
+ self.items = head + [summarized_message] + recent_messages
306
 
307
  self.context_length = (
308
  len(self.system_prompt) // 4 + response.usage.completion_tokens
agent/core/agent_loop.py CHANGED
@@ -153,35 +153,6 @@ _MAX_LLM_RETRIES = 3
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
156
- def _append_failure_warning(
157
- output: str,
158
- tool_name: str,
159
- tool_error_counts: dict[str, int],
160
- max_failures: int,
161
- ) -> str:
162
- """Track a tool failure and append a warning to the output.
163
-
164
- Returns the output with an appended warning indicating how many
165
- failures have occurred and whether the LLM should switch approach.
166
- """
167
- tool_error_counts[tool_name] = tool_error_counts.get(tool_name, 0) + 1
168
- count = tool_error_counts[tool_name]
169
- if count >= max_failures:
170
- return output + (
171
- f"\n\n⚠ Tool '{tool_name}' has now failed "
172
- f"{count} times this turn. You should try a "
173
- f"different approach instead of calling this "
174
- f"tool again."
175
- )
176
- remaining = max_failures - count
177
- return output + (
178
- f"\n\n⚠ Tool '{tool_name}' has failed "
179
- f"{count}/{max_failures} times this turn. "
180
- f"{remaining} attempt(s) before you should "
181
- f"switch to a different approach."
182
- )
183
-
184
-
185
  def _is_transient_error(error: Exception) -> bool:
186
  """Return True for errors that are likely transient and worth retrying."""
187
  err_str = str(error).lower()
@@ -200,9 +171,6 @@ def _is_transient_error(error: Exception) -> bool:
200
 
201
  async def _compact_and_notify(session: Session) -> None:
202
  """Run compaction and send event if context was reduced."""
203
- await session.context_manager.prune_old_tool_outputs(
204
- model_name=session.config.model_name,
205
- )
206
  old_length = session.context_manager.context_length
207
  max_ctx = session.context_manager.max_context
208
  logger.debug(
@@ -456,7 +424,7 @@ class Handlers:
456
 
457
  @staticmethod
458
  async def run_agent(
459
- session: Session, text: str, max_iterations: int = 300
460
  ) -> str | None:
461
  """
462
  Handle user input (like user_input_or_turn in codex.rs:1291)
@@ -484,10 +452,9 @@ class Handlers:
484
  iteration = 0
485
  final_response = None
486
  errored = False
487
- tool_error_counts: dict[str, int] = {}
488
 
489
- effective_max = min(max_iterations, session.config.max_requests_per_turn)
490
- while iteration < effective_max:
491
  # ── Cancellation check: before LLM call ──
492
  if session.is_cancelled:
493
  break
@@ -603,7 +570,7 @@ class Handlers:
603
  session.context_manager.context_length,
604
  session.context_manager.max_context,
605
  iteration,
606
- effective_max,
607
  (content or "")[:500],
608
  )
609
  await session.send_event(
@@ -615,7 +582,7 @@ class Handlers:
615
  f"Loop exit: no tool calls. "
616
  f"finish_reason={finish_reason}, "
617
  f"tokens={token_count}/{session.context_manager.max_context}, "
618
- f"iter={iteration}/{effective_max}"
619
  ),
620
  },
621
  )
@@ -760,15 +727,7 @@ class Handlers:
760
  results = gather_task.result()
761
 
762
  # 4. Record results and send outputs (order preserved)
763
- max_failures = session.config.max_tool_failures_per_turn
764
  for tc, tool_name, tool_args, output, success in results:
765
- if not success:
766
- output = _append_failure_warning(
767
- output, tool_name, tool_error_counts, max_failures,
768
- )
769
- else:
770
- tool_error_counts.pop(tool_name, None)
771
-
772
  tool_msg = Message(
773
  role="tool",
774
  content=output,
 
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def _is_transient_error(error: Exception) -> bool:
157
  """Return True for errors that are likely transient and worth retrying."""
158
  err_str = str(error).lower()
 
171
 
172
  async def _compact_and_notify(session: Session) -> None:
173
  """Run compaction and send event if context was reduced."""
 
 
 
174
  old_length = session.context_manager.context_length
175
  max_ctx = session.context_manager.max_context
176
  logger.debug(
 
424
 
425
  @staticmethod
426
  async def run_agent(
427
+ session: Session, text: str,
428
  ) -> str | None:
429
  """
430
  Handle user input (like user_input_or_turn in codex.rs:1291)
 
452
  iteration = 0
453
  final_response = None
454
  errored = False
455
+ max_iterations = session.config.max_iterations
456
 
457
+ while max_iterations == -1 or iteration < max_iterations:
 
458
  # ── Cancellation check: before LLM call ──
459
  if session.is_cancelled:
460
  break
 
570
  session.context_manager.context_length,
571
  session.context_manager.max_context,
572
  iteration,
573
+ max_iterations,
574
  (content or "")[:500],
575
  )
576
  await session.send_event(
 
582
  f"Loop exit: no tool calls. "
583
  f"finish_reason={finish_reason}, "
584
  f"tokens={token_count}/{session.context_manager.max_context}, "
585
+ f"iter={iteration}/{max_iterations}"
586
  ),
587
  },
588
  )
 
727
  results = gather_task.result()
728
 
729
  # 4. Record results and send outputs (order preserved)
 
730
  for tc, tool_name, tool_args, output, success in results:
 
 
 
 
 
 
 
731
  tool_msg = Message(
732
  role="tool",
733
  content=output,
agent/core/session.py CHANGED
@@ -12,7 +12,6 @@ from typing import Any, Optional
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
15
- from agent.tools.file_content_cache import FileContentCache
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -110,8 +109,6 @@ class Session:
110
  self.sandbox = None
111
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
112
 
113
- self.file_content_cache = FileContentCache()
114
-
115
  # Session trajectory logging
116
  self.logged_events: list[dict] = []
117
  self.session_start_time = datetime.now().isoformat()
 
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
109
  self.sandbox = None
110
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
111
 
 
 
112
  # Session trajectory logging
113
  self.logged_events: list[dict] = []
114
  self.session_start_time = datetime.now().isoformat()
agent/main.py CHANGED
@@ -858,7 +858,12 @@ async def main():
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
- async def headless_main(prompt: str, model: str | None = None) -> None:
 
 
 
 
 
862
  """Run a single prompt headlessly and exit."""
863
  import logging
864
 
@@ -876,12 +881,13 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
876
  config.yolo_mode = True # Auto-approve everything in headless mode
877
 
878
  if model:
879
- if model not in VALID_MODEL_IDS:
880
- print(f"ERROR: Unknown model '{model}'. Valid: {', '.join(VALID_MODEL_IDS)}", file=sys.stderr)
881
- sys.exit(1)
882
  config.model_name = model
883
 
 
 
 
884
  print(f"Model: {config.model_name}", file=sys.stderr)
 
885
  print(f"Prompt: {prompt}", file=sys.stderr)
886
  print("---", file=sys.stderr)
887
 
@@ -900,7 +906,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
900
  session_holder=session_holder,
901
  hf_token=hf_token,
902
  local_mode=True,
903
- stream=True,
904
  )
905
  )
906
 
@@ -922,6 +928,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
922
  shimmer = _ThinkingShimmer(console)
923
  stream_buf = _StreamBuffer(console)
924
  _hl_last_tool = [None]
 
925
  shimmer.start()
926
 
927
  while True:
@@ -960,6 +967,26 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
960
  log = event.data.get("log", "") if event.data else ""
961
  if log:
962
  print_tool_log(tool, log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  elif event.event_type == "compacted":
964
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
965
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
@@ -1001,11 +1028,18 @@ if __name__ == "__main__":
1001
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1002
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1003
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
 
 
 
 
1004
  args = parser.parse_args()
1005
 
1006
  try:
1007
  if args.prompt:
1008
- asyncio.run(headless_main(args.prompt, model=args.model))
 
 
 
1009
  else:
1010
  asyncio.run(main())
1011
  except KeyboardInterrupt:
 
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
+ async def headless_main(
862
+ prompt: str,
863
+ model: str | None = None,
864
+ max_iterations: int | None = None,
865
+ stream: bool = True,
866
+ ) -> None:
867
  """Run a single prompt headlessly and exit."""
868
  import logging
869
 
 
881
  config.yolo_mode = True # Auto-approve everything in headless mode
882
 
883
  if model:
 
 
 
884
  config.model_name = model
885
 
886
+ if max_iterations is not None:
887
+ config.max_iterations = max_iterations
888
+
889
  print(f"Model: {config.model_name}", file=sys.stderr)
890
+ print(f"Max iterations: {config.max_iterations}", file=sys.stderr)
891
  print(f"Prompt: {prompt}", file=sys.stderr)
892
  print("---", file=sys.stderr)
893
 
 
906
  session_holder=session_holder,
907
  hf_token=hf_token,
908
  local_mode=True,
909
+ stream=stream,
910
  )
911
  )
912
 
 
928
  shimmer = _ThinkingShimmer(console)
929
  stream_buf = _StreamBuffer(console)
930
  _hl_last_tool = [None]
931
+ _hl_sub_id = [1]
932
  shimmer.start()
933
 
934
  while True:
 
967
  log = event.data.get("log", "") if event.data else ""
968
  if log:
969
  print_tool_log(tool, log)
970
+ elif event.event_type == "approval_required":
971
+ # Auto-approve everything in headless mode (safety net if yolo_mode
972
+ # didn't prevent the approval event for some reason)
973
+ tools_data = event.data.get("tools", []) if event.data else []
974
+ approvals = [
975
+ {
976
+ "tool_call_id": t.get("tool_call_id", ""),
977
+ "approved": True,
978
+ "feedback": None,
979
+ }
980
+ for t in tools_data
981
+ ]
982
+ _hl_sub_id[0] += 1
983
+ await submission_queue.put(Submission(
984
+ id=f"hl_approval_{_hl_sub_id[0]}",
985
+ operation=Operation(
986
+ op_type=OpType.EXEC_APPROVAL,
987
+ data={"approvals": approvals},
988
+ ),
989
+ ))
990
  elif event.event_type == "compacted":
991
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
992
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
 
1028
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1029
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1030
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
1031
+ parser.add_argument("--max-iterations", type=int, default=None,
1032
+ help="Max LLM requests per turn (default: 50, use -1 for unlimited)")
1033
+ parser.add_argument("--no-stream", action="store_true",
1034
+ help="Disable token streaming (use non-streaming LLM calls)")
1035
  args = parser.parse_args()
1036
 
1037
  try:
1038
  if args.prompt:
1039
+ max_iter = args.max_iterations
1040
+ if max_iter is not None and max_iter < 0:
1041
+ max_iter = 10_000 # effectively unlimited
1042
+ asyncio.run(headless_main(args.prompt, model=args.model, max_iterations=max_iter, stream=not args.no_stream))
1043
  else:
1044
  asyncio.run(main())
1045
  except KeyboardInterrupt:
agent/prompts/system_prompt_v3.yaml CHANGED
@@ -46,6 +46,8 @@ system_prompt: |
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
 
 
49
  Dataset format requirements by training method:
50
  SFT: "messages", "text", or "prompt"/"completion"
51
  DPO: "prompt", "chosen", "rejected"
@@ -116,21 +118,29 @@ system_prompt: |
116
 
117
  When running autonomously (no human in the loop), you MUST follow these rules:
118
 
119
- NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, check the timer, or verify outputs. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- After training completes:
122
- 1. Check the output for errors or warnings
123
- 2. Copy/save the trained model to the required output location (e.g. final_model/)
124
- 3. Run evaluation to measure performance
125
- 4. If time remains and performance can improve: iterate (adjust hyperparameters, train longer, try different data)
126
- 5. Verify the final output exists and is valid before stopping
127
 
128
  Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
129
 
130
  The task is NOT done until:
131
- - The required output directory exists (e.g. final_model/) with a valid model
132
  - You have evaluated the model and confirmed it works
133
- - You have used all available time productively
134
 
135
  # Communication
136
 
 
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
49
+ Training logging: always set disable_tqdm=True, logging_strategy="steps", and logging_first_step=True in your TrainingArguments/SFTConfig so loss values are printed as plain text lines you can grep, not hidden inside tqdm progress bars.
50
+
51
  Dataset format requirements by training method:
52
  SFT: "messages", "text", or "prompt"/"completion"
53
  DPO: "prompt", "chosen", "rejected"
 
118
 
119
  When running autonomously (no human in the loop), you MUST follow these rules:
120
 
121
+ NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, verify outputs or plan ahead. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
122
+
123
+ NEVER STOP WORKING. Do NOT decide you are "done" while time remains. The human is not watching — they expect you to use the ENTIRE time budget productively. Do NOT ask "should I continue?" or "is this a good stopping point?" — there is nobody to answer. You run until the timer expires or you are manually killed.
124
+
125
+ Your workflow is a loop, not a checklist. Once you have a working result, KEEP ITERATING:
126
+
127
+ LOOP UNTIL TIME RUNS OUT:
128
+ 1. Research the approach (read docs, find examples, check current APIs)
129
+ 2. Implement the solution (write code, set up training)
130
+ 3. Train and evaluate
131
+ 4. Save the model to the required output location / push it to Hugging Face Hub
132
+ 5. Improve: tune hyperparameters, try different data, adjust the training recipe, try a different approach entirely
133
+ 6. Go to step 1
134
+
135
+ HYPERPARAMETER TUNING: Do not tune hyperparameters by hand one-at-a-time. Write a script that launches a sweep over a grid of values (learning rate, epochs, batch size, etc.) and evaluates each run automatically. One well-designed sweep script beats ten manual experiments.
136
 
137
+ If you run out of ideas: research. Use the research tool to find papers on the task or technique — look for recent methods, ablation results, tricks that worked for similar problems. Re-read the task prompt for angles you missed. Re-read the training logs for clues. Try combining approaches from different papers. Try a fundamentally different strategy from the literature. There is always a paper you haven't read yet.
 
 
 
 
 
138
 
139
  Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
140
 
141
  The task is NOT done until:
142
+ - The required output exists (e.g. final model, metrics reached, dataset updated etc)
143
  - You have evaluated the model and confirmed it works
 
144
 
145
  # Communication
146
 
agent/tools/edit_utils.py CHANGED
@@ -181,7 +181,11 @@ def apply_edit(
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
- raise ValueError("old_str not found in file.")
 
 
 
 
185
  old_str = original_match
186
 
187
  count = content.count(old_str)
@@ -189,8 +193,10 @@ def apply_edit(
189
  if mode == "replace":
190
  if count > 1 and not replace_all:
191
  raise ValueError(
192
- f"old_str appears {count} times. Use replace_all=true to replace all, "
193
- "or provide a more specific old_str."
 
 
194
  )
195
  if replace_all:
196
  new_content = content.replace(old_str, new_str)
 
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
+ raise ValueError(
185
+ "old_str was not found in the file. Make sure old_str matches "
186
+ "the file contents exactly, including whitespace and indentation. "
187
+ "Use the read tool to verify the current file contents before retrying."
188
+ )
189
  old_str = original_match
190
 
191
  count = content.count(old_str)
 
193
  if mode == "replace":
194
  if count > 1 and not replace_all:
195
  raise ValueError(
196
+ f"Found {count} matches of old_str in the file, but replace_all is "
197
+ f"false. To replace all occurrences, set replace_all to true. To "
198
+ f"replace only one, provide a larger old_str with more surrounding "
199
+ f"context to uniquely identify the instance."
200
  )
201
  if replace_all:
202
  new_content = content.replace(old_str, new_str)
agent/tools/file_content_cache.py DELETED
@@ -1,40 +0,0 @@
1
- """Cache for detecting unchanged local file re-reads."""
2
-
3
- from __future__ import annotations
4
-
5
- import hashlib
6
-
7
-
8
- def _short_hash(content: str) -> str:
9
- return hashlib.sha256(content.encode()).hexdigest()[:16]
10
-
11
-
12
- def _resolve(path: str) -> str:
13
- try:
14
- from pathlib import Path
15
- return str(Path(path).resolve())
16
- except Exception:
17
- return path
18
-
19
-
20
- class FileContentCache:
21
- """Tracks file content hashes to skip re-reading unchanged files."""
22
-
23
- def __init__(self) -> None:
24
- self._cache: dict[str, tuple[str, int]] = {}
25
-
26
- def record_read(self, path: str, content: str, turn: int) -> None:
27
- key = _resolve(path)
28
- self._cache[key] = (_short_hash(content), turn)
29
-
30
- def check_unchanged(self, path: str, content: str) -> tuple[bool, int | None]:
31
- key = _resolve(path)
32
- cached = self._cache.get(key)
33
- if cached is None:
34
- return False, None
35
- cached_hash, turn = cached
36
- return _short_hash(content) == cached_hash, turn
37
-
38
- def clear_path(self, path: str) -> None:
39
- key = _resolve(path)
40
- self._cache.pop(key, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent/tools/local_tools.py CHANGED
@@ -15,16 +15,25 @@ import tempfile
15
  from pathlib import Path
16
  from typing import Any
17
 
18
- from agent.tools.sandbox_client import Sandbox
19
 
20
  MAX_OUTPUT_CHARS = 25_000
21
- MAX_LINE_LENGTH = 2000
22
  DEFAULT_READ_LINES = 2000
23
  DEFAULT_TIMEOUT = 120
24
- MAX_TIMEOUT = 600
25
 
26
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def _atomic_write(path: Path, content: str) -> None:
30
  """Write file atomically via temp file + os.replace().
@@ -105,7 +114,14 @@ async def _bash_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
105
  output = "(no output)"
106
  return output, result.returncode == 0
107
  except subprocess.TimeoutExpired:
108
- return f"Command timed out after {timeout}s.", False
 
 
 
 
 
 
 
109
  except Exception as e:
110
  return f"bash error: {e}", False
111
 
@@ -124,17 +140,7 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
124
  except Exception as e:
125
  return f"read error: {e}", False
126
 
127
- # Check if file is unchanged since last read
128
- session = _kw.get("session")
129
- if session is not None:
130
- is_unchanged, last_turn = session.file_content_cache.check_unchanged(
131
- file_path, raw_content
132
- )
133
- if is_unchanged:
134
- return (
135
- f"[File unchanged since turn {last_turn}, "
136
- f"content already in context.]"
137
- ), True
138
 
139
  lines = raw_content.splitlines()
140
  offset = max((args.get("offset") or 1), 1)
@@ -147,11 +153,6 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
147
  line = line[:MAX_LINE_LENGTH] + "..."
148
  numbered.append(f"{i:>6}\t{line}")
149
 
150
- if session is not None:
151
- session.file_content_cache.record_read(
152
- file_path, raw_content, session.turn_count
153
- )
154
-
155
  return "\n".join(numbered), True
156
 
157
 
@@ -161,11 +162,14 @@ async def _write_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
161
  if not file_path:
162
  return "No path provided.", False
163
  p = Path(file_path)
 
 
 
 
 
164
  try:
165
  _atomic_write(p, content)
166
- session = _kw.get("session")
167
- if session is not None:
168
- session.file_content_cache.clear_path(file_path)
169
  msg = f"Wrote {len(content)} bytes to {file_path}"
170
  # Syntax validation for Python files
171
  if p.suffix == ".py":
@@ -195,6 +199,11 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
195
  p = Path(file_path)
196
  if not p.exists():
197
  return f"File not found: {file_path}", False
 
 
 
 
 
198
 
199
  try:
200
  text = p.read_text()
@@ -213,10 +222,6 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
213
  except Exception as e:
214
  return f"edit write error: {e}", False
215
 
216
- session = _kw.get("session")
217
- if session is not None:
218
- session.file_content_cache.clear_path(file_path)
219
-
220
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
221
  if fuzzy_note:
222
  msg += f" {fuzzy_note}"
@@ -235,18 +240,22 @@ _LOCAL_TOOL_SPECS = {
235
  "description": (
236
  "Run a shell command on the local machine and return stdout/stderr.\n"
237
  "\n"
238
- "Commands run in a shell at the working directory (default: current directory). "
239
- "Each invocation is independent.\n"
240
- "\n"
241
- "AVOID using bash for operations covered by specialized tools:\n"
242
- "- File reading: use read (not cat/head/tail)\n"
243
- "- File editing: use edit (not sed/awk)\n"
244
- "- File writing: use write (not echo/cat <<EOF)\n"
245
  "\n"
 
246
  "Chain dependent commands with &&. Independent commands should be "
247
  "separate bash calls (they can run in parallel).\n"
248
  "\n"
249
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
250
  ),
251
  "parameters": {
252
  "type": "object",
@@ -267,22 +276,125 @@ _LOCAL_TOOL_SPECS = {
267
  },
268
  "timeout": {
269
  "type": "integer",
270
- "description": "Timeout in seconds (default: 120, max: 600).",
271
  },
272
  },
273
  },
274
  },
275
  "read": {
276
- "description": Sandbox.TOOLS["read"]["description"],
277
- "parameters": Sandbox.TOOLS["read"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  },
279
  "write": {
280
- "description": Sandbox.TOOLS["write"]["description"],
281
- "parameters": Sandbox.TOOLS["write"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  },
283
  "edit": {
284
- "description": Sandbox.TOOLS["edit"]["description"],
285
- "parameters": Sandbox.TOOLS["edit"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  },
287
  }
288
 
 
15
  from pathlib import Path
16
  from typing import Any
17
 
 
18
 
19
  MAX_OUTPUT_CHARS = 25_000
20
+ MAX_LINE_LENGTH = 4000
21
  DEFAULT_READ_LINES = 2000
22
  DEFAULT_TIMEOUT = 120
23
+ MAX_TIMEOUT = 36000 # 10 hours — needed for long training runs (e.g. PostTrainBench)
24
 
25
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
26
 
27
+ # Track files that have been read this session (enforces read-before-write/edit)
28
+ _files_read: set[str] = set()
29
+
30
+
31
+ def _resolve_path(path: str) -> str:
32
+ try:
33
+ return str(Path(path).resolve())
34
+ except Exception:
35
+ return path
36
+
37
 
38
  def _atomic_write(path: Path, content: str) -> None:
39
  """Write file atomically via temp file + os.replace().
 
114
  output = "(no output)"
115
  return output, result.returncode == 0
116
  except subprocess.TimeoutExpired:
117
+ return (
118
+ f"Command timed out after {timeout}s and was killed.\n\n"
119
+ f"For long-running commands, run in the background and poll:\n"
120
+ f" nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
121
+ f"Then check status with:\n"
122
+ f" kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
123
+ f" tail -n 50 /tmp/output.log"
124
+ ), False
125
  except Exception as e:
126
  return f"bash error: {e}", False
127
 
 
140
  except Exception as e:
141
  return f"read error: {e}", False
142
 
143
+ _files_read.add(_resolve_path(file_path))
 
 
 
 
 
 
 
 
 
 
144
 
145
  lines = raw_content.splitlines()
146
  offset = max((args.get("offset") or 1), 1)
 
153
  line = line[:MAX_LINE_LENGTH] + "..."
154
  numbered.append(f"{i:>6}\t{line}")
155
 
 
 
 
 
 
156
  return "\n".join(numbered), True
157
 
158
 
 
162
  if not file_path:
163
  return "No path provided.", False
164
  p = Path(file_path)
165
+ if p.exists() and _resolve_path(file_path) not in _files_read:
166
+ return (
167
+ f"You must read {file_path} before overwriting it. "
168
+ f"Use the read tool first to see current contents."
169
+ ), False
170
  try:
171
  _atomic_write(p, content)
172
+ _files_read.add(_resolve_path(file_path))
 
 
173
  msg = f"Wrote {len(content)} bytes to {file_path}"
174
  # Syntax validation for Python files
175
  if p.suffix == ".py":
 
199
  p = Path(file_path)
200
  if not p.exists():
201
  return f"File not found: {file_path}", False
202
+ if _resolve_path(file_path) not in _files_read:
203
+ return (
204
+ f"You must read {file_path} before editing it. "
205
+ f"Use the read tool first to see current contents."
206
+ ), False
207
 
208
  try:
209
  text = p.read_text()
 
222
  except Exception as e:
223
  return f"edit write error: {e}", False
224
 
 
 
 
 
225
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
226
  if fuzzy_note:
227
  msg += f" {fuzzy_note}"
 
240
  "description": (
241
  "Run a shell command on the local machine and return stdout/stderr.\n"
242
  "\n"
243
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
244
+ "- To read files: use read (not cat/head/tail)\n"
245
+ "- To edit files: use edit (not sed/awk)\n"
246
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
247
  "\n"
248
+ "Commands run in a shell at the working directory. Each invocation is independent.\n"
249
  "Chain dependent commands with &&. Independent commands should be "
250
  "separate bash calls (they can run in parallel).\n"
251
  "\n"
252
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
253
+ " nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
254
+ "Then check status:\n"
255
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
256
+ " tail -n 50 /tmp/output.log\n"
257
+ "\n"
258
+ "Timeout default 120s, max 36000s."
259
  ),
260
  "parameters": {
261
  "type": "object",
 
276
  },
277
  "timeout": {
278
  "type": "integer",
279
+ "description": "Optional timeout in seconds (default: 120, max: 36000).",
280
  },
281
  },
282
  },
283
  },
284
  "read": {
285
+ "description": (
286
+ "Reads a file from the local filesystem. Returns contents with line numbers "
287
+ "(cat -n format).\n"
288
+ "\n"
289
+ "Usage:\n"
290
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
291
+ "- You can optionally specify offset and limit for large files, but prefer "
292
+ "reading the whole file first.\n"
293
+ "- Lines longer than 4000 chars are truncated.\n"
294
+ "- Cannot read directories — use bash with 'ls' instead.\n"
295
+ "- You should read multiple potentially useful files in parallel when possible.\n"
296
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
297
+ "write tools will reject operations on files you haven't read."
298
+ ),
299
+ "parameters": {
300
+ "type": "object",
301
+ "required": ["path"],
302
+ "additionalProperties": False,
303
+ "properties": {
304
+ "path": {
305
+ "type": "string",
306
+ "description": "Absolute path to the file to read.",
307
+ },
308
+ "offset": {
309
+ "type": "integer",
310
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
311
+ },
312
+ "limit": {
313
+ "type": "integer",
314
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
315
+ },
316
+ },
317
+ },
318
  },
319
  "write": {
320
+ "description": (
321
+ "Writes a file to the local filesystem. Overwrites the existing file if one "
322
+ "exists at the path.\n"
323
+ "\n"
324
+ "- If this is an existing file, you MUST use the read tool first. This tool "
325
+ "will fail if you did not read the file first.\n"
326
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
327
+ "with write.\n"
328
+ "- Creates parent directories as needed."
329
+ ),
330
+ "parameters": {
331
+ "type": "object",
332
+ "required": ["path", "content"],
333
+ "additionalProperties": False,
334
+ "properties": {
335
+ "path": {
336
+ "type": "string",
337
+ "description": "Absolute path to the file to write.",
338
+ },
339
+ "content": {
340
+ "type": "string",
341
+ "description": "The complete file content to write.",
342
+ },
343
+ },
344
+ },
345
  },
346
  "edit": {
347
+ "description": (
348
+ "Performs string replacements in files. Supports exact matching with "
349
+ "fuzzy fallback.\n"
350
+ "\n"
351
+ "Usage:\n"
352
+ "- You must read the file at least once before editing. This tool will "
353
+ "error if you attempt an edit without reading the file.\n"
354
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
355
+ "a larger string with more surrounding context to make it unique, or set "
356
+ "replace_all to true.\n"
357
+ "- old_str and new_str must differ.\n"
358
+ "- Preserve indentation exactly as it appears in the file.\n"
359
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
360
+ "- To delete code, set new_str to empty string.\n"
361
+ "- Use replace_all for renaming variables or strings across the file.\n"
362
+ "\n"
363
+ "Modes:\n"
364
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
365
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
366
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
367
+ ),
368
+ "parameters": {
369
+ "type": "object",
370
+ "required": ["path", "old_str", "new_str"],
371
+ "additionalProperties": False,
372
+ "properties": {
373
+ "path": {
374
+ "type": "string",
375
+ "description": "Absolute path to the file to edit.",
376
+ },
377
+ "old_str": {
378
+ "type": "string",
379
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
380
+ },
381
+ "new_str": {
382
+ "type": "string",
383
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
384
+ },
385
+ "replace_all": {
386
+ "type": "boolean",
387
+ "description": "Replace all occurrences of old_str (default: false).",
388
+ "default": False,
389
+ },
390
+ "mode": {
391
+ "type": "string",
392
+ "enum": ["replace", "append_after", "prepend_before"],
393
+ "description": "Edit mode (default: replace).",
394
+ "default": "replace",
395
+ },
396
+ },
397
+ },
398
  },
399
  }
400
 
agent/tools/research_tool.py CHANGED
@@ -14,10 +14,17 @@ from typing import Any
14
 
15
  from litellm import Message, acompletion
16
 
 
17
  from agent.core.session import Event
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
21
  # Tools the research agent can use (read-only subset)
22
  RESEARCH_TOOL_NAMES = {
23
  "read",
@@ -171,7 +178,7 @@ def _resolve_llm_params(model_name: str) -> dict:
171
  def _get_research_model(main_model: str) -> str:
172
  """Pick a cheaper model for research based on the main model."""
173
  if "anthropic/" in main_model:
174
- return "anthropic/claude-haiku-4-5-20251001"
175
  # For non-Anthropic models (HF router etc.), use the same model
176
  return main_model
177
 
@@ -221,12 +228,60 @@ async def research_handler(
221
 
222
  _tool_uses = 0
223
  _total_tokens = 0
 
224
 
225
  await _log("Starting research sub-agent...")
226
 
227
- # Run the research loop (max 20 iterations research should be focused)
228
- max_iterations = 20
229
  for _iteration in range(max_iterations):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  try:
231
  response = await acompletion(
232
  messages=messages,
@@ -242,7 +297,7 @@ async def research_handler(
242
 
243
  # Track tokens
244
  if response.usage:
245
- _total_tokens += response.usage.total_tokens
246
  await _log(f"tokens:{_total_tokens}")
247
 
248
  choice = response.choices[0]
@@ -308,8 +363,31 @@ async def research_handler(
308
  )
309
  )
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  return (
312
- "Research agent hit iteration limit (20). "
313
  "Partial findings may be incomplete — try a more focused task.",
314
  False,
315
  )
 
14
 
15
  from litellm import Message, acompletion
16
 
17
+ from agent.core.doom_loop import check_for_doom_loop
18
  from agent.core.session import Event
19
 
20
  logger = logging.getLogger(__name__)
21
 
22
+ # Context budget for the research subagent (tokens).
23
+ # When usage exceeds WARN threshold, the subagent is told to wrap up.
24
+ # At MAX, the loop is force-stopped and whatever content exists is returned.
25
+ _RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k
26
+ _RESEARCH_CONTEXT_MAX = 190_000
27
+
28
  # Tools the research agent can use (read-only subset)
29
  RESEARCH_TOOL_NAMES = {
30
  "read",
 
178
  def _get_research_model(main_model: str) -> str:
179
  """Pick a cheaper model for research based on the main model."""
180
  if "anthropic/" in main_model:
181
+ return "anthropic/claude-sonnet-4-6"
182
  # For non-Anthropic models (HF router etc.), use the same model
183
  return main_model
184
 
 
228
 
229
  _tool_uses = 0
230
  _total_tokens = 0
231
+ _warned_context = False
232
 
233
  await _log("Starting research sub-agent...")
234
 
235
+ # Run the research loop context budget is the real limiter
236
+ max_iterations = 60
237
  for _iteration in range(max_iterations):
238
+ # ── Doom-loop detection ──
239
+ doom_prompt = check_for_doom_loop(messages)
240
+ if doom_prompt:
241
+ logger.warning("Research sub-agent doom loop detected at iteration %d", _iteration)
242
+ await _log("Doom loop detected — injecting corrective prompt")
243
+ messages.append(Message(role="user", content=doom_prompt))
244
+
245
+ # ── Context budget: warn at 75%, hard-stop at 95% ──
246
+ if _total_tokens >= _RESEARCH_CONTEXT_MAX:
247
+ logger.warning(
248
+ "Research sub-agent hit context max (%d tokens) — forcing summary",
249
+ _total_tokens,
250
+ )
251
+ await _log(f"Context limit reached ({_total_tokens} tokens) — forcing wrap-up")
252
+ # Ask for a final summary with no tools
253
+ messages.append(Message(
254
+ role="user",
255
+ content=(
256
+ "[SYSTEM: CONTEXT LIMIT REACHED] You have used all available context. "
257
+ "Summarize your findings NOW. Do NOT call any more tools."
258
+ ),
259
+ ))
260
+ try:
261
+ response = await acompletion(
262
+ messages=messages,
263
+ tools=None, # no tools — force text response
264
+ stream=False,
265
+ timeout=120,
266
+ **llm_params,
267
+ )
268
+ content = response.choices[0].message.content or ""
269
+ return content or "Research context exhausted — no summary produced.", bool(content)
270
+ except Exception:
271
+ return "Research context exhausted and summary call failed.", False
272
+
273
+ if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN:
274
+ _warned_context = True
275
+ await _log(f"Context at {_total_tokens} tokens — nudging to wrap up")
276
+ messages.append(Message(
277
+ role="user",
278
+ content=(
279
+ "[SYSTEM: You have used 75% of your context budget. "
280
+ "Start wrapping up: finish any critical lookups, then "
281
+ "produce your final summary within the next 1-2 iterations.]"
282
+ ),
283
+ ))
284
+
285
  try:
286
  response = await acompletion(
287
  messages=messages,
 
297
 
298
  # Track tokens
299
  if response.usage:
300
+ _total_tokens = response.usage.total_tokens
301
  await _log(f"tokens:{_total_tokens}")
302
 
303
  choice = response.choices[0]
 
363
  )
364
  )
365
 
366
+ # ── Iteration limit: try to salvage findings ──
367
+ await _log("Iteration limit reached — extracting summary")
368
+ messages.append(Message(
369
+ role="user",
370
+ content=(
371
+ "[SYSTEM: ITERATION LIMIT] You have reached the maximum number of research "
372
+ "iterations. Summarize ALL findings so far. Do NOT call any more tools."
373
+ ),
374
+ ))
375
+ try:
376
+ response = await acompletion(
377
+ messages=messages,
378
+ tools=None,
379
+ stream=False,
380
+ timeout=120,
381
+ **llm_params,
382
+ )
383
+ content = response.choices[0].message.content or ""
384
+ if content:
385
+ return content, True
386
+ except Exception as e:
387
+ logger.error("Research summary call failed: %s", e)
388
+
389
  return (
390
+ "Research agent hit iteration limit (60). "
391
  "Partial findings may be incomplete — try a more focused task.",
392
  False,
393
  )
agent/tools/sandbox_client.py CHANGED
@@ -57,7 +57,7 @@ HARDWARE_OPTIONS = [
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
- LINE_LIMIT = 2000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
@@ -855,22 +855,23 @@ class Sandbox:
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
- "Commands run in a shell at the working directory (default /app). "
859
- "Each invocation is independent use files in /app to persist state.\n"
860
- "\n"
861
- "AVOID using bash for operations covered by specialized tools:\n"
862
- "- File reading: use read (not cat/head/tail)\n"
863
- "- File editing: use edit (not sed/awk)\n"
864
- "- File writing: use write (not echo/cat <<EOF)\n"
865
- "\n"
866
- "For long-running tasks, background them:\n"
867
- " nohup uv run train.py > /app/train.log 2>&1 &\n"
868
- "Then check with read on the log file.\n"
869
  "\n"
 
 
870
  "Chain dependent commands with &&. Independent commands should be "
871
  "separate bash calls (they can run in parallel).\n"
872
  "\n"
873
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
874
  ),
875
  "parameters": {
876
  "type": "object",
@@ -883,7 +884,7 @@ class Sandbox:
883
  },
884
  "description": {
885
  "type": "string",
886
- "description": "Short description (5-10 words, active voice). E.g. 'Install dependencies', 'Run training script'.",
887
  },
888
  "work_dir": {
889
  "type": "string",
@@ -891,20 +892,25 @@ class Sandbox:
891
  },
892
  "timeout": {
893
  "type": "integer",
894
- "description": "Timeout in seconds (default: 240, max: 1200).",
895
  },
896
  },
897
  },
898
  },
899
  "read": {
900
  "description": (
901
- "Read file contents with line numbers (cat -n format).\n"
902
- "\n"
903
- "Returns the first 2000 lines by default. For large files, use offset/limit "
904
- "to read a specific range. Line numbers always match the original file.\n"
905
  "\n"
906
- "Lines longer than 2000 chars are truncated.\n"
907
- "Cannot read directories use bash with 'ls' instead."
 
 
 
 
 
 
 
908
  ),
909
  "parameters": {
910
  "type": "object",
@@ -917,21 +923,25 @@ class Sandbox:
917
  },
918
  "offset": {
919
  "type": "integer",
920
- "description": "Start from this line (1-based). Only if file is too large.",
921
  },
922
  "limit": {
923
  "type": "integer",
924
- "description": "Number of lines to read. Only if file is too large.",
925
  },
926
  },
927
  },
928
  },
929
  "write": {
930
  "description": (
931
- "Create or overwrite a file. Creates parent directories as needed.\n"
 
932
  "\n"
933
- "For existing files, you MUST read the file first (system enforced). "
934
- "Prefer edit for modifications."
 
 
 
935
  ),
936
  "parameters": {
937
  "type": "object",
@@ -944,32 +954,32 @@ class Sandbox:
944
  },
945
  "content": {
946
  "type": "string",
947
- "description": "Complete file content.",
948
  },
949
  },
950
  },
951
  },
952
  "edit": {
953
  "description": (
954
- "Targeted edit via string replacement with fuzzy matching fallback.\n"
 
955
  "\n"
956
- "Modes:\n"
957
- "- replace (default): replace first occurrence of old_str with new_str.\n"
958
- "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
959
- "- prepend_before: insert new_str immediately before old_str (old_str is kept).\n"
960
- "\n"
961
- "Rules:\n"
962
- "- old_str must appear EXACTLY once (unless replace_all is true).\n"
963
- "- Include enough context in old_str for uniqueness.\n"
964
  "- old_str and new_str must differ.\n"
965
- "- Preserve indentation exactly.\n"
 
966
  "- To delete code, set new_str to empty string.\n"
967
- "- File MUST have been read this session (system enforced).\n"
968
- "- Do NOT include line number prefixes in old_str/new_str.\n"
969
  "\n"
970
- "If exact match fails, the tool automatically tries trimmed/normalized matching.\n"
971
- "Use replace_all=true for batch operations like variable renaming.\n"
972
- "Use append_after/prepend_before to insert code without replacing existing code."
 
973
  ),
974
  "parameters": {
975
  "type": "object",
@@ -978,16 +988,19 @@ class Sandbox:
978
  "properties": {
979
  "path": {
980
  "type": "string",
981
- "description": "Absolute path to the file.",
982
  },
983
  "old_str": {
984
  "type": "string",
985
- "description": "Text to find (fuzzy matching used as fallback).",
 
 
 
 
986
  },
987
- "new_str": {"type": "string", "description": "Replacement text (or text to insert for append_after/prepend_before)."},
988
  "replace_all": {
989
  "type": "boolean",
990
- "description": "Replace all occurrences (default: false).",
991
  "default": False,
992
  },
993
  "mode": {
 
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
+ LINE_LIMIT = 4000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
 
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
859
+ "- To read files: use read (not cat/head/tail)\n"
860
+ "- To edit files: use edit (not sed/awk)\n"
861
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
 
 
 
 
862
  "\n"
863
+ "Commands run in a shell at /app. Each invocation is independent — "
864
+ "use files in /app to persist state.\n"
865
  "Chain dependent commands with &&. Independent commands should be "
866
  "separate bash calls (they can run in parallel).\n"
867
  "\n"
868
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
869
+ " nohup <command> > /app/output.log 2>&1 & echo $!\n"
870
+ "Then check status:\n"
871
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
872
+ " tail -n 50 /app/output.log\n"
873
+ "\n"
874
+ "Timeout default 240s, max 1200s."
875
  ),
876
  "parameters": {
877
  "type": "object",
 
884
  },
885
  "description": {
886
  "type": "string",
887
+ "description": "Short description (5-10 words, active voice).",
888
  },
889
  "work_dir": {
890
  "type": "string",
 
892
  },
893
  "timeout": {
894
  "type": "integer",
895
+ "description": "Optional timeout in seconds (default: 240, max: 1200).",
896
  },
897
  },
898
  },
899
  },
900
  "read": {
901
  "description": (
902
+ "Reads a file from the sandbox filesystem. Returns contents with line "
903
+ "numbers (cat -n format).\n"
 
 
904
  "\n"
905
+ "Usage:\n"
906
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
907
+ "- You can optionally specify offset and limit for large files, but prefer "
908
+ "reading the whole file first.\n"
909
+ "- Lines longer than 4000 chars are truncated.\n"
910
+ "- Cannot read directories — use bash with 'ls' instead.\n"
911
+ "- You should read multiple potentially useful files in parallel when possible.\n"
912
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
913
+ "write tools will reject operations on files you haven't read."
914
  ),
915
  "parameters": {
916
  "type": "object",
 
923
  },
924
  "offset": {
925
  "type": "integer",
926
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
927
  },
928
  "limit": {
929
  "type": "integer",
930
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
931
  },
932
  },
933
  },
934
  },
935
  "write": {
936
  "description": (
937
+ "Writes a file to the sandbox filesystem. Overwrites the existing file if "
938
+ "one exists at the path.\n"
939
  "\n"
940
+ "- If this is an existing file, you MUST use the read tool first. This tool "
941
+ "will fail if you did not read the file first.\n"
942
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
943
+ "with write.\n"
944
+ "- Creates parent directories as needed."
945
  ),
946
  "parameters": {
947
  "type": "object",
 
954
  },
955
  "content": {
956
  "type": "string",
957
+ "description": "The complete file content to write.",
958
  },
959
  },
960
  },
961
  },
962
  "edit": {
963
  "description": (
964
+ "Performs string replacements in files. Supports exact matching with "
965
+ "fuzzy fallback.\n"
966
  "\n"
967
+ "Usage:\n"
968
+ "- You must read the file at least once before editing. This tool will "
969
+ "error if you attempt an edit without reading the file.\n"
970
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
971
+ "a larger string with more surrounding context to make it unique, or set "
972
+ "replace_all to true.\n"
 
 
973
  "- old_str and new_str must differ.\n"
974
+ "- Preserve indentation exactly as it appears in the file.\n"
975
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
976
  "- To delete code, set new_str to empty string.\n"
977
+ "- Use replace_all for renaming variables or strings across the file.\n"
 
978
  "\n"
979
+ "Modes:\n"
980
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
981
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
982
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
983
  ),
984
  "parameters": {
985
  "type": "object",
 
988
  "properties": {
989
  "path": {
990
  "type": "string",
991
+ "description": "Absolute path to the file to edit.",
992
  },
993
  "old_str": {
994
  "type": "string",
995
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
996
+ },
997
+ "new_str": {
998
+ "type": "string",
999
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
1000
  },
 
1001
  "replace_all": {
1002
  "type": "boolean",
1003
+ "description": "Replace all occurrences of old_str (default: false).",
1004
  "default": False,
1005
  },
1006
  "mode": {
agent/tools/sandbox_tool.py CHANGED
@@ -245,25 +245,6 @@ def _make_tool_handler(sandbox_tool_name: str):
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
248
- cache = getattr(session, "file_content_cache", None)
249
- file_path = args.get("path", "")
250
-
251
- if sandbox_tool_name == "read" and cache and file_path:
252
- is_unchanged, last_turn = cache.check_unchanged(
253
- f"sandbox:{file_path}", output
254
- )
255
- if is_unchanged:
256
- return (
257
- f"[File unchanged since turn {last_turn}, "
258
- f"content already in context.]"
259
- ), True
260
- cache.record_read(
261
- f"sandbox:{file_path}", output, session.turn_count
262
- )
263
-
264
- if sandbox_tool_name in ("write", "edit") and cache and file_path:
265
- cache.clear_path(f"sandbox:{file_path}")
266
-
267
  return output, True
268
  else:
269
  error_msg = result.error or "Unknown error"
 
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  return output, True
249
  else:
250
  error_msg = result.error or "Unknown error"
pyproject.toml CHANGED
@@ -3,7 +3,7 @@ name = "hf-agent"
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
- requires-python = ">=3.12"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
@@ -49,3 +49,13 @@ dev = [
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
 
 
 
 
 
 
 
 
 
 
 
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
+ requires-python = ">=3.11"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
 
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
52
+
53
+ [build-system]
54
+ requires = ["setuptools>=64"]
55
+ build-backend = "setuptools.build_meta"
56
+
57
+ [tool.setuptools.packages.find]
58
+ include = ["agent*"]
59
+
60
+ [tool.uv]
61
+ package = true
uv.lock CHANGED
@@ -871,7 +871,7 @@ wheels = [
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
- source = { virtual = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
@@ -890,6 +890,7 @@ agent = [
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
 
893
  { name = "thefuzz" },
894
  { name = "uvicorn", extra = ["standard"] },
895
  { name = "websockets" },
@@ -909,6 +910,7 @@ all = [
909
  { name = "prompt-toolkit" },
910
  { name = "pytest" },
911
  { name = "requests" },
 
912
  { name = "tenacity" },
913
  { name = "thefuzz" },
914
  { name = "uvicorn", extra = ["standard"] },
@@ -945,6 +947,7 @@ requires-dist = [
945
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
946
  { name = "python-dotenv", specifier = ">=1.2.1" },
947
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
 
948
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
949
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
950
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },
 
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
+ source = { editable = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
 
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
893
+ { name = "rich" },
894
  { name = "thefuzz" },
895
  { name = "uvicorn", extra = ["standard"] },
896
  { name = "websockets" },
 
910
  { name = "prompt-toolkit" },
911
  { name = "pytest" },
912
  { name = "requests" },
913
+ { name = "rich" },
914
  { name = "tenacity" },
915
  { name = "thefuzz" },
916
  { name = "uvicorn", extra = ["standard"] },
 
947
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
948
  { name = "python-dotenv", specifier = ">=1.2.1" },
949
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
950
+ { name = "rich", marker = "extra == 'agent'", specifier = ">=13.0.0" },
951
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
952
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
953
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },