black-yt committed on
Commit
f209a8f
·
1 Parent(s): e5e4fd4

Create ResearchHarness Hugging Face Space

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +23 -0
  2. .env.example +29 -0
  3. .gitignore +230 -0
  4. Dockerfile +24 -0
  5. LICENSE +21 -0
  6. README.md +59 -5
  7. VERSION +1 -0
  8. agent_base/__init__.py +5 -0
  9. agent_base/base.py +131 -0
  10. agent_base/console_utils.py +223 -0
  11. agent_base/context_compact.py +326 -0
  12. agent_base/model_profiles.py +92 -0
  13. agent_base/prompt.py +106 -0
  14. agent_base/prompts/extractor.md +19 -0
  15. agent_base/prompts/system_base.md +232 -0
  16. agent_base/provider_compat.py +31 -0
  17. agent_base/react_agent.py +1453 -0
  18. agent_base/session_state.py +84 -0
  19. agent_base/tools/README.md +457 -0
  20. agent_base/tools/__init__.py +49 -0
  21. agent_base/tools/tool_file.py +933 -0
  22. agent_base/tools/tool_runtime.py +732 -0
  23. agent_base/tools/tool_user.py +89 -0
  24. agent_base/tools/tool_web.py +610 -0
  25. agent_base/tools/tooling.py +302 -0
  26. agent_base/trace_utils.py +112 -0
  27. agent_base/utils.py +247 -0
  28. api/__init__.py +1 -0
  29. api/openai_server.py +518 -0
  30. api_runs/.gitkeep +1 -0
  31. app.py +54 -0
  32. benchmarks/QA/README.md +102 -0
  33. benchmarks/QA/role_prompt.md +31 -0
  34. benchmarks/README.md +18 -0
  35. benchmarks/ResearchClawBench/README.md +44 -0
  36. benchmarks/ResearchClawBench/adapter.py +93 -0
  37. benchmarks/ResearchClawBench/role_prompt.md +195 -0
  38. docs/tutorial_en.md +531 -0
  39. docs/tutorial_zh.md +511 -0
  40. frontend/__init__.py +1 -0
  41. frontend/local_server.py +578 -0
  42. frontend/static/app.css +955 -0
  43. frontend/static/app.js +743 -0
  44. frontend/static/favicon.svg +10 -0
  45. frontend/static/index.html +75 -0
  46. requirements.txt +8 -0
  47. run_agent.py +7 -0
  48. run_frontend.py +48 -0
  49. run_server.py +61 -0
  50. traces/.gitkeep +1 -0
.dockerignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ __pycache__/
4
+ *.py[cod]
5
+ .pytest_cache/
6
+ .mypy_cache/
7
+ .ruff_cache/
8
+ .env
9
+ .envrc
10
+ .venv/
11
+ venv/
12
+ workspace/*
13
+ !workspace/.gitkeep
14
+ traces/*
15
+ !traces/.gitkeep
16
+ api_runs/*
17
+ !api_runs/.gitkeep
18
+ runtime/
19
+ tests/
20
+ .codex/
21
+ .idea/
22
+ .vscode/
23
+ .DS_Store
.env.example ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Required
2
+ API_KEY="your_openai_compatible_key" # API key for your OpenAI-compatible LLM provider.
3
+ API_BASE="https://your-openai-compatible-endpoint/v1" # Base URL for the OpenAI-compatible chat-completions endpoint.
4
+ MODEL_NAME="gpt-5.5" # Main model used by the agent and WebFetch summarization.
5
+ SERPER_KEY_ID="your_serper_key" # https://serper.dev/
6
+ JINA_API_KEYS="your_jina_key" # https://jina.ai/
7
+ MINERU_TOKEN="your_mineru_token" # https://mineru.net/
8
+
9
+ # Optional
10
+ WORKSPACE_ROOT="./workspace" # Default local workspace root when --workspace-root is not provided.
11
+ MAX_LLM_CALL_PER_RUN=100 # Maximum chat-completions calls allowed in one agent run.
12
+ MAX_AGENT_ROUNDS=100 # Maximum ReAct loop rounds before forced termination.
13
+ MAX_AGENT_RUNTIME_SECONDS=9000 # Maximum wall-clock runtime per agent run.
14
+ LLM_TIMEOUT_SECONDS=600 # Timeout for each chat-completions request.
15
+ LLM_MAX_OUTPUT_TOKENS=10000 # Maximum output tokens requested from the main model.
16
+ MAX_INPUT_TOKENS=320000 # Maximum input-token budget used for runtime token accounting.
17
+ LLM_MAX_RETRIES=10 # Maximum retries for transient LLM API failures.
18
+ TEMPERATURE=0.6 # Main model sampling temperature.
19
+ TOP_P=0.95 # Main model nucleus-sampling top_p.
20
+ PRESENCE_PENALTY=1.1 # Main model presence penalty when supported by the provider.
21
+ AUTO_COMPACT_TRIGGER_TOKENS="128k" # Context size threshold that triggers automatic memory compaction.
22
+ IMAGE_PART_TOKEN_ESTIMATE=1536 # Token estimate used for each runtime image_url content part.
23
+ LLM_IMAGE_MAX_EDGE=1568 # Maximum image edge length sent to multimodal LLMs.
24
+ LLM_IMAGE_MAX_BYTES=524288 # Maximum compressed image payload size sent to multimodal LLMs.
25
+ LLM_IMAGE_JPEG_QUALITY=85 # Initial JPEG quality for runtime image compression.
26
+ DEBUG_AGENT=false # Print verbose agent-loop debug logs.
27
+ DEBUG_SEARCH=false # Print verbose WebSearch debug logs.
28
+ DEBUG_SCHOLAR=false # Print verbose ScholarSearch debug logs.
29
+ DEBUG_VISIT=false # Print verbose WebFetch debug logs.
.gitignore ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ runtime/
2
+ # Local agent artifacts
3
+ AGENTS.md
4
+ workspace/*
5
+ !workspace/.gitkeep
6
+ api_runs/*
7
+ !api_runs/.gitkeep
8
+ traces/*
9
+ !traces/.gitkeep
10
+ /inputs/
11
+ data/
12
+ benchmarks/**/local_*.py
13
+ .idea/
14
+ .vscode/
15
+ .DS_Store
16
+ tests/example_files/pdfs/dummy_document
17
+ .codex
18
+
19
+
20
+ # Byte-compiled / optimized / DLL files
21
+ __pycache__/
22
+ *.py[codz]
23
+ *$py.class
24
+
25
+ # C extensions
26
+ *.so
27
+
28
+ # Distribution / packaging
29
+ .Python
30
+ build/
31
+ develop-eggs/
32
+ dist/
33
+ downloads/
34
+ eggs/
35
+ .eggs/
36
+ lib/
37
+ lib64/
38
+ parts/
39
+ sdist/
40
+ var/
41
+ wheels/
42
+ share/python-wheels/
43
+ *.egg-info/
44
+ .installed.cfg
45
+ *.egg
46
+ MANIFEST
47
+
48
+ # PyInstaller
49
+ # Usually these files are written by a python script from a template
50
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
51
+ *.manifest
52
+ *.spec
53
+
54
+ # Installer logs
55
+ pip-log.txt
56
+ pip-delete-this-directory.txt
57
+
58
+ # Unit test / coverage reports
59
+ htmlcov/
60
+ .tox/
61
+ .nox/
62
+ .coverage
63
+ .coverage.*
64
+ .cache
65
+ nosetests.xml
66
+ coverage.xml
67
+ *.cover
68
+ *.py.cover
69
+ .hypothesis/
70
+ .pytest_cache/
71
+ cover/
72
+
73
+ # Translations
74
+ *.mo
75
+ *.pot
76
+
77
+ # Django stuff:
78
+ *.log
79
+ local_settings.py
80
+ db.sqlite3
81
+ db.sqlite3-journal
82
+
83
+ # Flask stuff:
84
+ instance/
85
+ .webassets-cache
86
+
87
+ # Scrapy stuff:
88
+ .scrapy
89
+
90
+ # Sphinx documentation
91
+ docs/_build/
92
+
93
+ # PyBuilder
94
+ .pybuilder/
95
+ target/
96
+
97
+ # Jupyter Notebook
98
+ .ipynb_checkpoints
99
+
100
+ # IPython
101
+ profile_default/
102
+ ipython_config.py
103
+
104
+ # pyenv
105
+ # For a library or package, you might want to ignore these files since the code is
106
+ # intended to run in multiple environments; otherwise, check them in:
107
+ # .python-version
108
+
109
+ # pipenv
110
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
111
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
112
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
113
+ # install all needed dependencies.
114
+ #Pipfile.lock
115
+
116
+ # UV
117
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
118
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
119
+ # commonly ignored for libraries.
120
+ #uv.lock
121
+
122
+ # poetry
123
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
124
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
125
+ # commonly ignored for libraries.
126
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
127
+ #poetry.lock
128
+ #poetry.toml
129
+
130
+ # pdm
131
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
132
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
133
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
134
+ #pdm.lock
135
+ #pdm.toml
136
+ .pdm-python
137
+ .pdm-build/
138
+
139
+ # pixi
140
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
141
+ #pixi.lock
142
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
143
+ # in the .venv directory. It is recommended not to include this directory in version control.
144
+ .pixi
145
+
146
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
147
+ __pypackages__/
148
+
149
+ # Celery stuff
150
+ celerybeat-schedule
151
+ celerybeat.pid
152
+
153
+ # SageMath parsed files
154
+ *.sage.py
155
+
156
+ # Environments
157
+ .env
158
+ .envrc
159
+ .venv
160
+ env/
161
+ venv/
162
+ ENV/
163
+ env.bak/
164
+ venv.bak/
165
+
166
+ # Spyder project settings
167
+ .spyderproject
168
+ .spyproject
169
+
170
+ # Rope project settings
171
+ .ropeproject
172
+
173
+ # mkdocs documentation
174
+ /site
175
+
176
+ # mypy
177
+ .mypy_cache/
178
+ .dmypy.json
179
+ dmypy.json
180
+
181
+ # Pyre type checker
182
+ .pyre/
183
+
184
+ # pytype static type analyzer
185
+ .pytype/
186
+
187
+ # Cython debug symbols
188
+ cython_debug/
189
+
190
+ # PyCharm
191
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
192
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
193
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
194
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
195
+ #.idea/
196
+
197
+ # Abstra
198
+ # Abstra is an AI-powered process automation framework.
199
+ # Ignore directories containing user credentials, local state, and settings.
200
+ # Learn more at https://abstra.io/docs
201
+ .abstra/
202
+
203
+ # Visual Studio Code
204
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
205
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
206
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
207
+ # you could uncomment the following to ignore the entire vscode folder
208
+ # .vscode/
209
+
210
+ # Ruff stuff:
211
+ .ruff_cache/
212
+
213
+ # PyPI configuration file
214
+ .pypirc
215
+
216
+ # Cursor
217
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
218
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
219
+ # refer to https://docs.cursor.com/context/ignore-files
220
+ .cursorignore
221
+ .cursorindexingignore
222
+
223
+ # Marimo
224
+ marimo/_static/
225
+ marimo/_lsp/
226
+ __marimo__/
227
+
228
+ # Hugging Face Space runtime artifacts
229
+ runtime/
230
+ /tmp/
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the ResearchHarness Hugging Face Space.
# Built on the official slim Python 3.11 image.
FROM python:3.11-slim

# Everything below (COPY targets, the final CMD) is relative to /app.
WORKDIR /app

# PYTHONUNBUFFERED=1 makes Python write stdout/stderr without buffering.
# PORT matches the EXPOSE below and the `app_port` declared in README.md.
# RH_SPACE_RUNS_DIR is the parent directory for per-chat runs (see README).
ENV PYTHONUNBUFFERED=1 \
    PORT=7860 \
    RH_SPACE_RUNS_DIR=/tmp/researchharness_space/runs

# System packages; the apt lists are removed afterwards to keep the layer small.
# NOTE(review): poppler-utils is presumably needed for PDF handling — confirm
# against the tool implementations.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    bash \
    ca-certificates \
    curl \
    git \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the sources, so this layer can be
# reused from the build cache when only source files change.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context (filtered by .dockerignore).
COPY . .

EXPOSE 7860
CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Wanghan Xu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,66 @@
1
  ---
2
  title: ResearchHarness
3
- emoji: 🌖
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: docker
 
7
  pinned: false
8
  license: mit
9
- short_description: A lightweight, general-purpose harness for tool-using LLM ag
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: ResearchHarness
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: yellow
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  license: mit
10
+ short_description: Lightweight harness for tool-using LLM agents.
11
  ---
12
 
13
+ # ResearchHarness Space
14
+
15
+ This Space runs the ResearchHarness browser frontend as a lightweight hosted agent UI.
16
+ It reuses the ResearchHarness tool-calling runtime and keeps the hosted mode intentionally simple:
17
+
18
+ - Users do not choose a local workspace.
19
+ - Each new chat gets an isolated temporary runtime directory.
20
+ - Uploaded images are saved under that chat workspace and also passed to the model when supported.
21
+ - Agent traces and session state are stored beside the temporary workspace.
22
+ - Old workspaces and traces are cleaned periodically so the Space does not grow without bound.
23
+
24
+ ## Required Secrets
25
+
26
+ Configure these as Hugging Face Space secrets before starting the app:
27
+
28
+ | Secret | Purpose |
29
+ | --- | --- |
30
+ | `API_KEY` | API key for your OpenAI-compatible LLM provider. |
31
+ | `API_BASE` | OpenAI-compatible `/v1` endpoint. |
32
+ | `MODEL_NAME` | Main model used by ResearchHarness. |
33
+ | `SERPER_KEY_ID` | WebSearch / ScholarSearch key from <https://serper.dev/>. |
34
+ | `JINA_API_KEYS` | WebFetch key from <https://jina.ai/>. |
35
+ | `MINERU_TOKEN` | ReadPDF key from <https://mineru.net/>. |
36
+
37
+ ## Optional Runtime Variables
38
+
39
+ | Variable | Default | Meaning |
40
+ | --- | --- | --- |
41
+ | `RH_SPACE_RUNS_DIR` | `/tmp/researchharness_space/runs` | Parent directory for temporary per-chat runs. |
42
+ | `RH_SPACE_RETENTION_SECONDS` | `21600` | Delete inactive runs older than this many seconds. |
43
+ | `RH_SPACE_MAX_RUNS` | `40` | Keep at most this many inactive runs. |
44
+ | `RH_SPACE_CLEANUP_INTERVAL_SECONDS` | `900` | Background cleanup interval. |
45
+ | `RH_ROLE_PROMPT_FILES` | empty | Optional `os.pathsep`-separated role prompt files inside the Space image. |
46
+ | `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
47
+
48
+ ## Runtime Layout
49
+
50
+ ```text
51
+ /tmp/researchharness_space/runs/
52
+ └── run_YYYYMMDD_HHMMSS_<random>/
53
+ ├── agent_workspace/
54
+ │ └── inputs/images/ # user uploaded images, when present
55
+ └── agent_trace/ # trace JSONL and _session_state.json
56
+ ```
57
+
58
+ The frontend only exposes the chat UI. The workspace path is managed by the server so hosted users cannot browse or select server folders.
59
+
60
+ ## Local Smoke Test
61
+
62
+ ```bash
63
+ python app.py
64
+ ```
65
+
66
+ Then open `http://127.0.0.1:7860`.
VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ v0.0.35
agent_base/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Lightweight trusted-local harness for tool-using research agents."""
2
+
3
+ from agent_base.base import BaseAgent, agent_role
4
+
5
+ __all__ = ["BaseAgent", "agent_role"]
agent_base/base.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Iterable, Optional, Sequence
5
+
6
+
7
def _normalize_function_list(function_list: Optional[Iterable[str]]) -> Optional[list[str]]:
    """Return a cleaned list of tool names, or ``None`` when none was given.

    Args:
        function_list: Tool names to clean. ``None`` is passed through
            unchanged. A bare string is treated as a single tool name
            rather than as an iterable of characters.

    Returns:
        ``None`` if ``function_list`` is ``None``; otherwise a new list with
        every entry converted to ``str``, stripped of surrounding whitespace,
        and with empty entries dropped. Order and duplicates are preserved.
    """
    if function_list is None:
        return None
    if isinstance(function_list, str):
        # A lone string is itself iterable; without this guard "Read" would
        # be split into ["R", "e", "a", "d"].
        function_list = [function_list]
    stripped_names = (str(raw_name).strip() for raw_name in function_list)
    return [name for name in stripped_names if name]
16
+
17
+
18
def agent_role(
    *,
    name: str,
    role_prompt: str = "",
    function_list: Optional[Iterable[str]] = None,
):
    """
    Build a class decorator that stamps role defaults onto an agent class.

    Upper-layer frameworks use it to declare a role name, an extra role
    prompt, and an optional tool restriction on a subclass, so the generic
    execution loop underneath needs no role-specific code.

    Args:
        name: Role name; when it is blank after stripping, the decorated
            class's ``__name__`` is used instead.
        role_prompt: Role-specific prompt text, stored stripped.
        function_list: Tool names, normalized via
            ``_normalize_function_list``; ``None`` is stored as ``None``.

    Returns:
        A decorator that sets ``role_name``, ``default_role_prompt`` and
        ``default_function_list`` on the class and returns that same class.
    """

    def apply_role_defaults(agent_cls):
        cleaned_name = str(name).strip()
        agent_cls.role_name = cleaned_name if cleaned_name else agent_cls.__name__
        agent_cls.default_role_prompt = str(role_prompt).strip()
        agent_cls.default_function_list = _normalize_function_list(function_list)
        return agent_cls

    return apply_role_defaults
38
+
39
+
40
class BaseAgent(ABC):
    """Abstract foundation for agents that run on ResearchHarness.

    Subclasses must implement :meth:`run`. The class attributes below are the
    defaults the ``agent_role`` decorator overwrites, and the ``should_*`` /
    ``*_message`` / ``*_result_text`` methods are overridable hooks.
    """

    role_name: str = "agent"
    default_role_prompt: str = ""
    default_function_list: Optional[list[str]] = None

    @classmethod
    def resolve_function_list(cls, function_list: Optional[Sequence[str]]) -> Optional[list[str]]:
        """Pick the effective tool list for a run.

        An explicit ``function_list`` wins and is normalized (an empty result
        becomes ``[]``). Otherwise the class default is returned as a fresh
        copy, or ``None`` when the class declares no default.
        """
        if function_list is None:
            inherited = getattr(cls, "default_function_list", None)
            return None if inherited is None else list(inherited)
        return _normalize_function_list(function_list) or []

    @classmethod
    def resolve_role_prompt(cls, role_prompt: Optional[str]) -> str:
        """Pick the effective role prompt, stripped of surrounding whitespace.

        ``None`` selects the class default; any other value (including an
        empty string) is used as given.
        """
        chosen = getattr(cls, "default_role_prompt", "") if role_prompt is None else role_prompt
        return str(chosen or "").strip()

    def should_accept_plaintext_result(
        self,
        *,
        result_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> bool:
        """
        Hook: is an assistant reply with text but no tool calls terminal?

        The base implementation always answers ``True``, keeping the original
        ResearchHarness semantics where any such reply is the final result.
        Subclasses may override it to demand extra completion artifacts
        before the run is allowed to end.
        """

        return True

    def rejected_plaintext_result_message(
        self,
        *,
        result_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> str:
        """
        Hook: explain why a plain-text reply was refused as terminal.

        The base implementation returns ``""``, which selects the generic
        runtime message.
        """

        return ""

    def should_accept_terminal_error(
        self,
        *,
        error_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> bool:
        """
        Hook: may a terminal LLM/runtime error still count as completion?

        The base implementation is conservative and always answers ``False``.
        Subclasses may override it when benchmark-specific completion
        artifacts already exist and the remaining assistant text is not
        semantically important.
        """

        return False

    def accepted_terminal_error_result_text(
        self,
        *,
        error_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> str:
        """
        Hook: supply the synthetic result text for an accepted terminal error.

        The base implementation returns ``""``, which selects a generic
        runtime completion message.
        """

        return ""

    @abstractmethod
    def run(self, prompt: str, workspace_root: Optional[str] = None):
        """Execute the agent on ``prompt``; concrete subclasses must override."""
        raise NotImplementedError
agent_base/console_utils.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ import shutil
6
+ import sys
7
+ import unicodedata
8
+ from typing import Any, Optional
9
+
10
+
11
# SGR escape sequence that resets all colors/attributes.
ANSI_RESET = "\033[0m"
# Foreground color escape sequences, keyed by the color key passed to
# ConsoleEventPrinter._paint / _print_box.
ANSI_COLORS = {
    "header": "\033[36m",  # cyan
    "assistant": "\033[32m",  # green
    "tool": "\033[33m",  # yellow
    "runtime": "\033[34m",  # blue
    "user": "\033[35m",  # magenta
    "error": "\033[31m",  # red
}
20
+
21
+
22
def _char_display_width(char: str) -> int:
    """Return how many terminal columns a single character is counted as.

    Combining marks and characters in Unicode categories ``Cc`` (control)
    and ``Cf`` (format) count as 0; East Asian Fullwidth/Wide characters
    count as 2; everything else counts as 1.
    """
    is_zero_width = bool(unicodedata.combining(char)) or unicodedata.category(char) in {"Cc", "Cf"}
    if is_zero_width:
        return 0
    if unicodedata.east_asian_width(char) in {"F", "W"}:
        return 2
    return 1
28
+
29
+
30
def _display_width(text: str) -> int:
    """Return the total column count of ``text`` (coerced with ``str``)."""
    total_columns = 0
    for char in str(text):
        total_columns += _char_display_width(char)
    return total_columns
32
+
33
+
34
def _truncate_display(text: str, width: int) -> str:
    """Shorten ``text`` to at most ``width`` display columns.

    Args:
        text: The string to fit.
        width: Maximum number of display columns allowed.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the longest
        prefix that leaves room for a trailing ``"..."``, followed by that
        ellipsis. When ``width`` is too small for the full ellipsis, the
        ellipsis itself is clipped so the result never exceeds ``width``
        columns (an empty string for ``width <= 0``).
    """
    if _display_width(text) <= width:
        return text
    suffix = "..."
    suffix_width = _display_width(suffix)
    if width < suffix_width:
        # Not even the ellipsis fits: clip it instead of overflowing.
        return suffix[: max(0, width)]
    budget = width - suffix_width
    kept: list[str] = []
    used = 0
    for char in text:
        char_width = _char_display_width(char)
        if used + char_width > budget:
            break
        kept.append(char)
        used += char_width
    return "".join(kept) + suffix
48
+
49
+
50
def _pad_display(text: str, width: int) -> str:
    """Right-pad ``text`` with spaces up to ``width`` display columns.

    Text that is already ``width`` columns or wider comes back unpadded.
    """
    gap = width - _display_width(text)
    filler = " " * gap if gap > 0 else ""
    return text + filler
52
+
53
+
54
def _last_soft_break(chars: list[str]) -> int:
    """Find the right-most whitespace position usable as a wrap point.

    A position qualifies when ``chars[index]`` is whitespace and at least one
    non-whitespace character occurs before it, so a break there leaves a
    non-blank left-hand chunk.

    Args:
        chars: The characters accumulated for the current output line.

    Returns:
        The qualifying index closest to the end, or ``-1`` when there is none.
    """
    # Locate the first element carrying visible content once, instead of
    # re-joining and stripping the prefix for every candidate index.
    first_visible = next((index for index, char in enumerate(chars) if char.strip()), None)
    if first_visible is None:
        return -1
    for index in range(len(chars) - 1, first_visible, -1):
        if chars[index].isspace():
            return index
    return -1
59
+
60
+
61
class ConsoleEventPrinter:
    """Render agent event rows as titled, optionally colored, console boxes."""

    def __init__(self, *, model_name: str, workspace_root: Path, prompt: str):
        """Store the header details and decide whether ANSI colors are used.

        Colors are enabled only when ``NO_COLOR`` is unset, ``TERM`` is not
        ``"dumb"``, and stdout is a TTY or ``FORCE_COLOR`` / ``CLICOLOR_FORCE``
        is set to a non-empty value.
        """
        self.model_name = model_name
        self.workspace_root = workspace_root
        self.prompt = prompt.strip()
        # Tracks whether a box was already printed, to insert a blank line
        # between consecutive boxes.
        self._printed_any = False
        self._use_color = (
            "NO_COLOR" not in os.environ
            and os.environ.get("TERM") != "dumb"
            and (sys.stdout.isatty() or bool(os.environ.get("FORCE_COLOR") or os.environ.get("CLICOLOR_FORCE")))
        )

    def print_header(self) -> None:
        """Print the banner box with model, workspace root and prompt."""
        self._print_box(
            "ResearchHarness CLI",
            f"Model: {self.model_name}\nWorkspace Root: {self.workspace_root}\n\nPrompt:\n{self.prompt}",
            "header",
        )

    def reset_rounds(self) -> None:
        """Forget that anything was printed, so the next box has no leading blank line."""
        self._printed_any = False

    def _paint(self, text: str, color_key: str) -> str:
        """Wrap ``text`` in the ANSI color for ``color_key`` when colors are on."""
        if not self._use_color:
            return text
        return f"{ANSI_COLORS.get(color_key, '')}{text}{ANSI_RESET}"

    def _terminal_width(self) -> int:
        """Return the box width: the terminal width clamped to 60..110 columns."""
        return max(60, min(110, shutil.get_terminal_size((100, 20)).columns))

    def _wrap_line(self, line: str, width: int) -> list[str]:
        """Split one logical line into chunks of at most ``width`` display columns.

        Tabs are expanded to 2-column stops. Breaks prefer the last usable
        whitespace of the current chunk and fall back to a hard break.
        An empty line yields ``[""]``.
        """
        expanded = line.expandtabs(2)
        if expanded == "":
            return [""]
        chunks: list[str] = []
        current: list[str] = []
        current_width = 0
        for char in expanded:
            char_width = _char_display_width(char)
            if current and current_width + char_width > width:
                break_at = _last_soft_break(current)
                if break_at > 0:
                    # Soft break: emit up to the whitespace, carry the rest over.
                    chunks.append("".join(current[:break_at]).rstrip())
                    current = list("".join(current[break_at + 1 :]).lstrip())
                    current_width = _display_width("".join(current))
                else:
                    # No usable whitespace: hard break at the width limit.
                    chunks.append("".join(current))
                    current = []
                    current_width = 0
            current.append(char)
            current_width += char_width
        if current:
            chunks.append("".join(current))
        return chunks or [""]

    def _print_box(self, title: str, body: str, color_key: str = "runtime") -> None:
        """Print ``body`` inside an ASCII box whose top border carries ``title``."""
        width = self._terminal_width()
        inner_width = width - 4  # two border characters plus one space each side
        title_text = f" {_truncate_display(title.strip(), width - 6)} "
        top = "+" + title_text + "-" * max(0, width - 2 - _display_width(title_text)) + "+"
        bottom = "+" + "-" * (width - 2) + "+"
        if self._printed_any:
            print()
        print(self._paint(top, color_key))
        for raw_line in str(body or "").splitlines() or [""]:
            for line in self._wrap_line(raw_line, inner_width):
                padded = _pad_display(line, inner_width)
                print(f"{self._paint('|', color_key)} {padded} {self._paint('|', color_key)}")
        print(self._paint(bottom, color_key))
        self._printed_any = True

    def _title(self, label: str, turn_index: int) -> str:
        """Append ``| round N`` to ``label`` for positive turn indexes."""
        return f"{label} | round {turn_index}" if turn_index > 0 else label

    def _format_tool_call(self, tool_name: str, tool_args: Any) -> str:
        """Format one tool call as ``- name`` followed by its arguments.

        Arguments are pretty-printed as JSON; values ``json.dumps`` rejects
        with ``TypeError`` fall back to ``str``.
        """
        try:
            tool_args_text = json.dumps(tool_args, ensure_ascii=False, indent=2)
        except TypeError:
            tool_args_text = str(tool_args)
        return f"- {tool_name}\n{tool_args_text}"

    @staticmethod
    def _field_text(row: dict[str, Any], key: str) -> str:
        """Read ``row[key]`` as text, mapping a missing key or ``None`` to ``""``.

        ``str(row.get(key, ""))`` would turn an explicit ``None`` into the
        literal ``"None"``, which is truthy and would be displayed.
        """
        value = row.get(key)
        return "" if value is None else str(value)

    def handle_event(self, row: dict[str, Any]) -> None:
        """Print the box that corresponds to one event row.

        Reads ``role``, ``turn_index``, ``text``, ``capture_type``,
        ``tool_names``, ``tool_arguments``, ``finish_reason`` and ``error``.
        Rows with a ``capture_type`` but blank text, ``system`` rows, and
        ``user`` rows at turn 0 print nothing. Rows with an unrecognized role
        are ignored.
        """
        role = self._field_text(row, "role")
        turn_index = int(row.get("turn_index", 0) or 0)
        text = self._field_text(row, "text")
        capture_type = self._field_text(row, "capture_type")
        tool_names = row.get("tool_names") if isinstance(row.get("tool_names"), list) else []
        tool_arguments = row.get("tool_arguments") if isinstance(row.get("tool_arguments"), list) else []
        finish_reason = self._field_text(row, "finish_reason")
        error = self._field_text(row, "error")

        if capture_type and not text.strip():
            return

        if role == "system":
            return

        if role == "user":
            if turn_index == 0:
                return
            self._print_box(self._title("Runtime Message", turn_index), text, "user")
            return

        if role == "assistant":
            lines: list[str] = []
            if tool_names:
                if text.strip():
                    lines.append(text)
                else:
                    suffix = f" finish_reason={finish_reason}" if finish_reason else ""
                    lines.append(f"(no text; native tool-calls only.{suffix})")
                lines.append("")
                lines.append("Assistant Tool Calls:")
                for idx, tool_name in enumerate(tool_names):
                    # Tolerate fewer argument entries than tool names.
                    tool_args = tool_arguments[idx] if idx < len(tool_arguments) else {}
                    lines.append(self._format_tool_call(str(tool_name), tool_args))
            elif text.strip():
                lines.append(text)
            else:
                suffix = f" finish_reason={finish_reason}" if finish_reason else ""
                lines.append(f"(empty assistant output.{suffix})")
            if error:
                lines.append("")
                lines.append(f"Assistant Error: {error}")
            self._print_box(self._title("Assistant", turn_index), "\n".join(lines), "error" if error else "assistant")
            return

        if role == "tool":
            tool_name = str(tool_names[0]) if tool_names else "Tool"
            lines = [text]
            if error:
                lines.extend(["", f"{tool_name} Error: {error}"])
            self._print_box(self._title(f"{tool_name} Result", turn_index), "\n".join(lines), "error" if error else "tool")
            return

        if role == "runtime":
            lines = [text]
            if error:
                lines.extend(["", f"Runtime Error: {error}"])
            self._print_box(self._title("Runtime", turn_index), "\n".join(lines), "error" if error else "runtime")
201
+
202
+
203
def main(argv: Optional[list[str]] = None) -> int:
    """Print a minimal demo of the console formatter.

    Parses ``argv`` (no options beyond ``--help``), prints the header box for
    a demo prompt, then renders one sample assistant tool-call event.

    Returns:
        Always ``0``.
    """
    arg_parser = argparse.ArgumentParser(description="Show a minimal example of the CLI console event formatter.")
    arg_parser.parse_args(argv)

    demo_event = {
        "role": "assistant",
        "turn_index": 1,
        "text": "",
        "tool_names": ["Read"],
        "tool_arguments": [{"path": "demo.txt"}],
        "termination": "",
        "error": "",
    }
    demo_printer = ConsoleEventPrinter(model_name="demo-model", workspace_root=Path("."), prompt="demo question")
    demo_printer.print_header()
    demo_printer.handle_event(demo_event)
    return 0
220
+
221
+
222
+ if __name__ == "__main__":
223
+ raise SystemExit(main())
agent_base/context_compact.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Optional, Sequence
5
+
6
+ from agent_base.model_profiles import ModelProfile
7
+ from agent_base.utils import safe_jsonable
8
+
9
+
10
+ COMPACT_MEMORY_PREFIX = (
11
+ "Runtime memory summary from earlier turns.\n"
12
+ "This is compressed context, not ground truth.\n"
13
+ "The workspace files remain authoritative; re-read any file if exact details matter.\n\n"
14
+ )
15
+
16
+
17
@dataclass
class CompactionOutcome:
    """Result record produced by ``compact_messages``.

    ``status`` is ``"ok"`` when older turns were replaced by a summary and
    ``"error"`` otherwise. On error, ``compacted_messages`` holds the unchanged
    copy of the input conversation and ``error`` describes what went wrong.
    """

    # Required fields.
    status: str
    compacted_messages: list[dict[str, Any]]

    # Model-written summary (without the memory prefix) and failure reason.
    summary_text: str = ""
    error: str = ""
    # Not populated by compact_messages itself; presumably filled in by the caller.
    trigger_reason: str = ""

    # Token estimates before and after compaction, as reported by the token counter.
    prior_token_estimate: int = 0
    new_token_estimate: int = 0

    # How many turn groups were summarized versus kept verbatim.
    compacted_group_count: int = 0
    kept_group_count: int = 0

    # Text of any earlier memory summaries found at the start of the history.
    existing_memory_text: str = ""

    # Diagnostics: the summarization request/response and the message lists
    # before and after compaction.
    summary_request: Optional[list[dict[str, Any]]] = None
    summary_response: Optional[dict[str, Any]] = None
    pre_messages: Optional[list[dict[str, Any]]] = None
    post_messages: Optional[list[dict[str, Any]]] = None
33
+
34
+
35
def should_compact_messages(
    *,
    last_input_tokens: Optional[int],
    current_token_estimate: int,
    model_profile: ModelProfile,
) -> tuple[bool, str]:
    """Decide whether the conversation has reached the compaction threshold.

    Args:
        last_input_tokens: Input-token count reported for the previous model
            call, or ``None`` when no usage figure is available.
        current_token_estimate: Locally estimated size of the current messages.
        model_profile: Supplies ``compact_trigger_tokens``, the threshold both
            figures are compared against.

    Returns:
        ``(should_compact, reason)`` where ``reason`` is ``"usage"``,
        ``"estimate"``, ``"usage+estimate"``, or ``""`` when neither figure
        reached the threshold.
    """
    threshold = model_profile.compact_trigger_tokens

    triggers: list[str] = []
    if last_input_tokens is not None and int(last_input_tokens) >= threshold:
        triggers.append("usage")
    if current_token_estimate >= threshold:
        triggers.append("estimate")

    return bool(triggers), "+".join(triggers)
50
+
51
+
52
def compact_messages(
    *,
    messages: Sequence[dict[str, Any]],
    original_prompt_text: str,
    model_name: str,
    model_profile: ModelProfile,
    llm_caller: Callable[..., dict[str, Any]],
    token_counter: Callable[[Sequence[dict[str, Any]]], int],
    runtime_deadline: Optional[float] = None,
) -> CompactionOutcome:
    """Replace older conversation turns with one model-written memory summary.

    The first two messages are always kept verbatim (presumably the system
    prompt and the initial user prompt; confirm against the caller). Any
    memory summaries that directly follow them are merged into the new summary
    request, the oldest turn groups are summarized through ``llm_caller``, and
    the most recent turn groups are kept unchanged after the new summary.

    Args:
        messages: Full conversation; each message is shallow-copied before use.
        original_prompt_text: Task text quoted inside the summary request.
        model_name: Accepted for interface compatibility; not used here.
        model_profile: Supplies ``context_window`` and
            ``compact_summary_max_tokens`` and is forwarded to the helpers.
        llm_caller: Called as ``llm_caller(request, runtime_deadline=...,
            max_output_tokens=...)``; expected to return a dict whose
            ``status`` is ``"ok"`` and whose ``content`` is the summary.
        token_counter: Estimates the token size of a message list.
        runtime_deadline: Passed through to ``llm_caller`` unchanged.

    Returns:
        A ``CompactionOutcome``. On any failure its ``status`` is ``"error"``
        and its message lists are the unmodified copy of the input.
    """
    safe_messages = [dict(message) for message in messages]

    def _failure(reason: Any, **details: Any) -> CompactionOutcome:
        # Every failure returns the untouched copy as both the pre- and post-state.
        return CompactionOutcome(
            status="error",
            compacted_messages=safe_messages,
            pre_messages=safe_messages,
            post_messages=safe_messages,
            error=reason,
            **details,
        )

    if len(safe_messages) <= 2:
        return _failure("context compaction requires at least one conversational turn beyond the initial prompt")

    prior_token_estimate = token_counter(safe_messages)
    existing_memory_text, eligible_messages = _split_existing_memory_messages(safe_messages[2:])
    # Diagnostics gathered so far; extended as more information becomes available.
    diagnostics: dict[str, Any] = {
        "prior_token_estimate": prior_token_estimate,
        "existing_memory_text": existing_memory_text,
    }

    turn_groups = _turn_groups(eligible_messages)
    if not turn_groups:
        return _failure("context compaction found no eligible conversational turns", **diagnostics)

    compacted_groups, recent_groups = _split_turn_groups(turn_groups, model_profile)
    if not compacted_groups:
        return _failure("context compaction did not find any older turns to summarize", **diagnostics)

    history_text = _render_history_text(compacted_groups, model_profile)
    if existing_memory_text:
        memory_char_cap = max(1200, model_profile.context_window // 3)
        prior_memory_block = (
            "Previously compressed memory to preserve and refine:\n"
            f"{_truncate_summary_text(existing_memory_text, max_chars=memory_char_cap)}\n\n"
        )
    else:
        prior_memory_block = ""

    system_instruction = (
        "You compress older tool-using agent history into short working memory for continued execution. "
        "Return plain text only. Do not call tools. Do not invent facts."
    )
    user_instruction = (
        "Summarize the earlier conversation history for a tool-using agent.\n\n"
        f"Original task:\n{original_prompt_text}\n\n"
        "Write a concise working memory with these sections:\n"
        "- Goal\n"
        "- Constraints\n"
        "- Files and artifacts\n"
        "- Evidence and results\n"
        "- Open issues\n"
        "- Next useful actions\n\n"
        "Rules:\n"
        "- Prefer concrete file paths, numeric results, and grounded facts.\n"
        "- Mention uncertainty when details may need to be re-read from files.\n"
        "- Merge any prior compressed memory with the newer history below into one refreshed memory.\n"
        "- Deduplicate repeated sections and do not repeat earlier summaries verbatim.\n"
        "- The workspace remains authoritative.\n\n"
        f"{prior_memory_block}"
        f"Older history to compress:\n{history_text}"
    )
    summary_request = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_instruction},
    ]
    summary_reply = llm_caller(
        summary_request,
        runtime_deadline=runtime_deadline,
        max_output_tokens=model_profile.compact_summary_max_tokens,
    )

    diagnostics.update(
        summary_request=summary_request,
        compacted_group_count=len(compacted_groups),
        kept_group_count=len(recent_groups),
    )

    reply_is_dict = isinstance(summary_reply, dict)
    if not reply_is_dict or summary_reply.get("status") != "ok":
        if reply_is_dict:
            error = summary_reply.get("error", "context compaction summary call failed")
            summary_response = safe_jsonable(summary_reply)
        else:
            # A non-dict reply is recorded through its string form.
            error = str(summary_reply)
            summary_response = {"status": "error", "error": error}
        return _failure(error, summary_response=summary_response, **diagnostics)

    if summary_reply.get("tool_calls"):
        return _failure(
            "context compaction summary call returned tool calls",
            summary_response=safe_jsonable(summary_reply),
            **diagnostics,
        )

    summary_text = str(summary_reply.get("content", "") or "").strip()
    if not summary_text:
        return _failure(
            "context compaction summary call returned empty text",
            summary_response=safe_jsonable(summary_reply),
            **diagnostics,
        )

    # The prefix marks this message so later compactions can recognize and merge it.
    summary_message = {"role": "user", "content": COMPACT_MEMORY_PREFIX + summary_text}
    compacted_messages = [
        *safe_messages[:2],
        summary_message,
        *(message for group in recent_groups for message in group),
    ]
    new_token_estimate = token_counter(compacted_messages)
    return CompactionOutcome(
        status="ok",
        compacted_messages=compacted_messages,
        summary_text=summary_text,
        new_token_estimate=new_token_estimate,
        summary_response=safe_jsonable(summary_reply),
        pre_messages=safe_messages,
        post_messages=compacted_messages,
        **diagnostics,
    )
207
+
208
+
209
+ def _turn_groups(messages: Sequence[dict[str, Any]]) -> list[list[dict[str, Any]]]:
210
+ groups: list[list[dict[str, Any]]] = []
211
+ current_group: list[dict[str, Any]] = []
212
+ for message in messages:
213
+ role = str(message.get("role", ""))
214
+ if role == "assistant" and current_group:
215
+ groups.append(current_group)
216
+ current_group = [message]
217
+ continue
218
+ current_group.append(message)
219
+ if current_group:
220
+ groups.append(current_group)
221
+ return groups
222
+
223
+
224
def _split_existing_memory_messages(messages: Sequence[dict[str, Any]]) -> tuple[str, list[dict[str, Any]]]:
    """Separate leading memory-summary messages from the rest of the history.

    Only an unbroken run of ``user`` messages at the start whose string content
    begins with ``COMPACT_MEMORY_PREFIX`` counts as existing memory; a matching
    message that appears later is left in the history.

    Returns:
        ``(merged_summary, remaining_messages)``: the non-empty summary bodies
        (prefix removed) joined by blank lines, and shallow copies of every
        message after the leading run.
    """
    summary_bodies: list[str] = []
    leading_count = 0
    for message in messages:
        body = message.get("content", "")
        is_memory = (
            str(message.get("role", "")) == "user"
            and isinstance(body, str)
            and body.startswith(COMPACT_MEMORY_PREFIX)
        )
        if not is_memory:
            break
        summary_bodies.append(body[len(COMPACT_MEMORY_PREFIX) :].strip())
        leading_count += 1

    remaining_messages = [dict(message) for message in messages[leading_count:]]
    merged_summary = "\n\n".join(body for body in summary_bodies if body).strip()
    return merged_summary, remaining_messages
242
+
243
+
244
def _split_turn_groups(turn_groups: Sequence[Sequence[dict[str, Any]]], model_profile: ModelProfile) -> tuple[list[list[dict[str, Any]]], list[list[dict[str, Any]]]]:
    """Split turn groups into an older part to summarize and a recent part to keep.

    Walking from the newest group backwards, groups are kept until either four
    groups are collected or the kept groups' short rendering reaches the
    character budget derived from ``model_profile.recent_history_budget_tokens``.
    If that would keep every group, the oldest kept group is released so that
    something is left to summarize.

    Returns:
        ``(compacted_groups, recent_groups)``, both in chronological order and
        both built from shallow copies of the messages.
    """
    max_recent_groups = 4
    # Token budget scaled by four, presumably ~4 characters per token, floor 400.
    char_budget = max(400, model_profile.recent_history_budget_tokens * 4)

    newest_first: list[list[dict[str, Any]]] = []
    chars_kept = 0
    for group in reversed(turn_groups):
        preview = _render_group(group, max_chars_per_message=240)
        # The newest group is always kept, even if it alone exceeds the budget.
        if newest_first and chars_kept >= char_budget:
            break
        newest_first.append([dict(message) for message in group])
        chars_kept += len(preview)
        if len(newest_first) >= max_recent_groups:
            break
    recent_groups = newest_first[::-1]

    if len(recent_groups) >= len(turn_groups):
        recent_groups = recent_groups[1:]

    split_at = max(0, len(turn_groups) - len(recent_groups))
    compacted_groups = [[dict(message) for message in group] for group in turn_groups[:split_at]]
    return compacted_groups, recent_groups
263
+
264
+
265
def _render_history_text(turn_groups: Sequence[Sequence[dict[str, Any]]], model_profile: ModelProfile) -> str:
    """Render turn groups as labelled text sections within a character cap.

    The overall cap is twice ``model_profile.context_window``, clamped to
    600..64000 characters; each message is limited to a tenth of that cap,
    clamped to 200..4000. The first section is always included in full. A
    later section that would overflow the cap is cut and marked, or dropped if
    80 or fewer characters of room remain, and rendering stops there.
    """
    history_cap = max(600, min(64000, model_profile.context_window * 2))
    per_message_cap = max(200, min(4000, history_cap // 10))

    sections: list[str] = []
    consumed = 0
    for number, group in enumerate(turn_groups, start=1):
        body = _render_group(group, max_chars_per_message=per_message_cap)
        section = f"[Turn group {number}]\n{body}"
        if sections and consumed + len(section) > history_cap:
            room = history_cap - consumed
            if room > 80:
                # Leave space for the truncation marker that is appended.
                sections.append(section[: room - 40].rstrip() + "\n...[history truncated]")
            break
        sections.append(section)
        consumed += len(section)
    return "\n\n".join(sections).strip()
280
+
281
+
282
def _render_group(group: Sequence[dict[str, Any]], *, max_chars_per_message: int) -> str:
    """Render one turn group as ``role: excerpt`` lines, one per message."""
    rendered_lines = (
        f"{str(message.get('role', ''))}: {_message_excerpt(message, max_chars=max_chars_per_message)}"
        for message in group
    )
    return "\n".join(rendered_lines).strip()
289
+
290
+
291
+ def _message_excerpt(message: dict[str, Any], *, max_chars: int) -> str:
292
+ content = message.get("content", "")
293
+ text: str
294
+ if isinstance(content, str):
295
+ text = content
296
+ elif isinstance(content, list):
297
+ parts: list[str] = []
298
+ for part in content:
299
+ if isinstance(part, dict) and part.get("type") == "text":
300
+ parts.append(str(part.get("text", "")))
301
+ elif isinstance(part, dict) and part.get("type") == "image_url":
302
+ parts.append("[image_url]")
303
+ else:
304
+ parts.append(str(part))
305
+ text = " ".join(part for part in parts if part)
306
+ else:
307
+ text = str(content)
308
+ tool_calls = message.get("tool_calls")
309
+ if tool_calls:
310
+ tool_names = []
311
+ for tool_call in tool_calls:
312
+ function_block = tool_call.get("function", {}) if isinstance(tool_call, dict) else {}
313
+ tool_names.append(str(function_block.get("name", "")))
314
+ if tool_names:
315
+ text = (text + "\nTool calls: " + ", ".join(name for name in tool_names if name)).strip()
316
+ compacted = " ".join(text.split())
317
+ if len(compacted) <= max_chars:
318
+ return compacted
319
+ return compacted[: max_chars - 16].rstrip() + "...[truncated]"
320
+
321
+
322
+ def _truncate_summary_text(text: str, *, max_chars: int) -> str:
323
+ compacted = " ".join(str(text).split())
324
+ if len(compacted) <= max_chars:
325
+ return compacted
326
+ return compacted[: max_chars - 16].rstrip() + "...[truncated]"
agent_base/model_profiles.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Optional
5
+
6
+
7
@dataclass(frozen=True)
class ModelProfile:
    """Immutable token-budget settings used for context compaction."""

    # Family label such as "claude" or "generic".
    family: str
    # Input budget, in tokens, that the other figures are derived from.
    context_window: int
    # Tokens held back for the model's reply.
    output_reserve_tokens: int
    # Extra margin subtracted before compaction is triggered.
    compact_buffer_tokens: int
    # Budget for recent turns that are kept verbatim during compaction.
    recent_history_budget_tokens: int
    # Output-token cap for the summarization call.
    compact_summary_max_tokens: int
    # Explicit trigger threshold; ``None`` means derive it from the fields above.
    compact_trigger_tokens_override: Optional[int] = None

    @property
    def compact_trigger_tokens(self) -> int:
        """Token count at which compaction should start.

        Returns the explicit override when one is set; otherwise the context
        window minus the output reserve and buffer, never below 256.
        """
        override = self.compact_trigger_tokens_override
        if override is None:
            return max(256, self.context_window - self.output_reserve_tokens - self.compact_buffer_tokens)
        return override
22
+
23
+
24
+ def _model_family(model_name: str) -> str:
25
+ normalized = str(model_name or "").strip().casefold()
26
+ if "gemini" in normalized:
27
+ return "gemini"
28
+ if "claude" in normalized:
29
+ return "claude"
30
+ if "deepseek" in normalized:
31
+ return "deepseek"
32
+ if "qwen" in normalized:
33
+ return "qwen"
34
+ if "glm" in normalized:
35
+ return "glm"
36
+ if "gpt" in normalized or "o1" in normalized or "o3" in normalized or "o4" in normalized:
37
+ return "gpt"
38
+ return "generic"
39
+
40
+
41
def resolve_model_profile(
    model_name: str,
    *,
    configured_max_input_tokens: int,
    configured_max_output_tokens: int,
    compact_trigger_tokens: Any = None,
) -> ModelProfile:
    """Build the compaction ``ModelProfile`` for a model and its token limits.

    Args:
        model_name: Used only to pick the model family.
        configured_max_input_tokens: Input budget; values below 1024 are
            raised to 1024.
        configured_max_output_tokens: Output limit; the reserve derived from
            it is capped at a twelfth of the window (at least 256).
        compact_trigger_tokens: Optional explicit trigger threshold, parsed by
            ``parse_compact_trigger_tokens``.

    Returns:
        A profile whose budgets are fractions of the window clamped to fixed
        ranges. The ``claude``, ``deepseek`` and ``gemini`` families get at
        least 1024 tokens of buffer and recent-history budget.
    """
    window = max(1024, int(configured_max_input_tokens))
    reserve_cap = max(256, window // 12)
    output_reserve = max(128, min(int(configured_max_output_tokens), reserve_cap))
    buffer_tokens = max(64, min(4096, window // 20))
    recent_budget = max(128, min(16384, window // 8))
    summary_cap = max(256, min(2048, window // 16))
    trigger_override = parse_compact_trigger_tokens(compact_trigger_tokens, context_window=window)

    family = _model_family(model_name)
    if family in {"claude", "deepseek", "gemini"}:
        buffer_tokens = max(buffer_tokens, 1024)
        recent_budget = max(recent_budget, 1024)

    return ModelProfile(
        family=family,
        context_window=window,
        output_reserve_tokens=output_reserve,
        compact_buffer_tokens=buffer_tokens,
        recent_history_budget_tokens=recent_budget,
        compact_summary_max_tokens=summary_cap,
        compact_trigger_tokens_override=trigger_override,
    )
69
+
70
+
71
def parse_compact_trigger_tokens(value: Any, *, context_window: int) -> Optional[int]:
    """Parse a user-supplied compaction trigger into a clamped token count.

    Accepts integers, floats, and strings such as ``"32000"``, ``"32,000"``,
    ``"32_000"``, ``"8k"``, ``"1.5k"`` or ``"2m"``. The ``k`` and ``m``
    suffixes multiply by 1024 and 1024 * 1024; fractional results are
    truncated toward zero.

    Args:
        value: Raw setting. ``None`` and blank strings mean "not configured".
        context_window: Upper bound for the result.

    Returns:
        ``None`` when nothing is configured; otherwise the parsed value raised
        to at least 256 and capped at ``max(256, context_window)``.

    Raises:
        ValueError: If ``value`` is a boolean or cannot be read as a finite number.
    """
    if value is None:
        return None
    if isinstance(value, bool):
        raise ValueError("compact trigger tokens must not be a boolean.")
    if isinstance(value, int):
        parsed = value
    else:
        text = str(value).strip().casefold()
        if not text:
            return None
        multiplier = 1
        if text.endswith("k"):
            multiplier = 1024
            text = text[:-1].strip()
        elif text.endswith("m"):
            multiplier = 1024 * 1024
            text = text[:-1].strip()
        text = text.replace("_", "").replace(",", "")
        try:
            parsed = int(text) * multiplier
        except ValueError:
            # Not a plain integer: allow decimal forms such as "1.5" or a float
            # value like 8192.0. NaN and infinity fail in int() and are rejected.
            try:
                parsed = int(float(text) * multiplier)
            except (ValueError, OverflowError) as err:
                raise ValueError(
                    f"compact trigger tokens must be a number with an optional k/m suffix, got {value!r}."
                ) from err
    parsed = max(256, parsed)
    return min(parsed, max(256, int(context_window)))
agent_base/prompt.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Iterable
6
+
7
+
8
+ PROMPTS_DIR = Path(__file__).resolve().parent / "prompts"
9
+
10
+
11
@dataclass(frozen=True)
class PromptAsset:
    """Immutable descriptor for one prompt file registered in ``PROMPT_ASSETS``."""

    # Registry name; each PROMPT_ASSETS entry uses the same string as its key.
    name: str
    # Location of the prompt text on disk.
    path: Path
    # One-line human-readable summary, printed by the --list-assets option.
    description: str
16
+
17
+
18
+ PROMPT_ASSETS = {
19
+ "system_base": PromptAsset(
20
+ name="system_base",
21
+ path=PROMPTS_DIR / "system_base.md",
22
+ description="Base general-purpose system prompt for the harness.",
23
+ ),
24
+ "extractor": PromptAsset(
25
+ name="extractor",
26
+ path=PROMPTS_DIR / "extractor.md",
27
+ description="Goal-directed webpage extraction prompt used by WebFetch.",
28
+ ),
29
+ }
30
+
31
+
32
+ def _read_prompt_asset(asset: PromptAsset) -> str:
33
+ return asset.path.read_text(encoding="utf-8").strip()
34
+
35
+
36
+ SYSTEM_PROMPT = _read_prompt_asset(PROMPT_ASSETS["system_base"])
37
+ EXTRACTOR_PROMPT = _read_prompt_asset(PROMPT_ASSETS["extractor"])
38
+
39
+
40
+ def _normalize_extra_blocks(blocks: Iterable[str] | None) -> list[str]:
41
+ normalized: list[str] = []
42
+ for raw_block in blocks or []:
43
+ block = str(raw_block or "").strip()
44
+ if block:
45
+ normalized.append(block)
46
+ return normalized
47
+
48
+
49
def composed_system_prompt(*, current_date: str, extra_blocks: Iterable[str] | None = None) -> str:
    """Assemble the full system prompt.

    The result is the base ``SYSTEM_PROMPT``, then each non-blank extra block,
    then a ``Current date: ...`` line, separated by blank lines.

    Args:
        current_date: Text inserted verbatim after ``Current date: ``.
        extra_blocks: Optional additional prompt sections; blank ones are skipped.
    """
    sections = [SYSTEM_PROMPT.rstrip()]
    sections.extend(block.rstrip() for block in _normalize_extra_blocks(extra_blocks))
    sections.append(f"Current date: {current_date}")
    return "\n\n".join(sections)
55
+
56
+
57
def _show_asset(name: str) -> str:
    """Return the text of the registered prompt asset called ``name``.

    Raises:
        ValueError: If ``name`` is not a key of ``PROMPT_ASSETS``; the message
            lists the available asset names.
    """
    asset = PROMPT_ASSETS.get(name)
    if asset is not None:
        return _read_prompt_asset(asset)
    valid = ", ".join(sorted(PROMPT_ASSETS))
    raise ValueError(f"Unknown prompt asset '{name}'. Available assets: {valid}")
63
+
64
+
65
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for inspecting prompt assets.

    Exactly one action runs, chosen in this order of precedence:
    ``--list-assets``, ``--show-asset``, ``--show-system``,
    ``--show-extractor``. With none of them, a short summary of prompt sizes
    is printed. Files given via ``--with-extra-file`` are read up front, before
    any action is chosen.

    Args:
        argv: Command-line arguments to parse.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description="Inspect prompt assets.")
    cli.add_argument("--show-system", action="store_true", help="Print the composed system prompt.")
    cli.add_argument("--show-extractor", action="store_true", help="Print the extractor prompt.")
    cli.add_argument("--show-asset", metavar="NAME", help="Print one prompt asset by name.")
    cli.add_argument("--list-assets", action="store_true", help="List registered prompt assets.")
    cli.add_argument(
        "--with-extra-file",
        action="append",
        default=[],
        dest="extra_files",
        help="Append one extra prompt block file when printing the composed system prompt. May be passed multiple times.",
    )
    options = cli.parse_args(argv)

    extra_blocks = [Path(extra_path).read_text(encoding="utf-8") for extra_path in options.extra_files]

    if options.list_assets:
        for asset in sorted(PROMPT_ASSETS.values(), key=lambda item: item.name):
            print(f"{asset.name}: {asset.description}")
    elif options.show_asset:
        print(_show_asset(options.show_asset))
    elif options.show_system:
        print(composed_system_prompt(current_date="<DATE>", extra_blocks=extra_blocks))
    elif options.show_extractor:
        print(EXTRACTOR_PROMPT)
    else:
        # Default action: report where the prompts live and how large they are.
        print(f"prompt_asset_dir={PROMPTS_DIR}")
        composed = composed_system_prompt(current_date="<DATE>", extra_blocks=extra_blocks)
        print(f"system_prompt_chars={len(composed)}")
        print(f"extractor_prompt_chars={len(EXTRACTOR_PROMPT)}")
    return 0
103
+
104
+
105
+ if __name__ == "__main__":
106
+ raise SystemExit(main(sys.argv[1:]))
agent_base/prompts/extractor.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Please process the following webpage content and user goal to extract relevant information.
2
+
3
+ ## **Webpage Content**
4
+ {webpage_content}
5
+
6
+ ## **User Goal**
7
+ {goal}
8
+
9
+ ## **Task Guidelines**
10
+ 1. **Content Scanning for Rationale**: Locate the **specific sections/data** directly related to the user's goal within the webpage content.
11
+ 2. **Key Extraction for Evidence**: Identify and extract the **most relevant information** from the content. Preserve the most useful original context as fully as practical.
12
+ 3. **Summary Output for Summary**: Organize a concise, goal-focused summary with clear logical flow.
13
+
14
+ ## **Output Requirements**
15
+ - Return a single JSON object only.
16
+ - Required keys: `"rational"`, `"evidence"`, `"summary"`.
17
+ - All three fields must always be present.
18
+ - `"evidence"` and `"summary"` must be non-empty strings whenever relevant content exists.
19
+ - If the page is irrelevant or insufficient, still return valid strings explaining that limitation.
agent_base/prompts/system_base.md ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a capable all-purpose AI assistant. You do far more than simple question answering: you handle complex tasks, investigate problems, work through project-level requests, and support serious research work. Work from evidence, not guesses. Use the available tools deliberately, keep control flow simple, and stop as soon as you have enough verified information to complete the task correctly.
2
+
3
+ # Role And Operating Principles
4
+
5
+ ## Mission
6
+
7
+ - Prefer direct evidence over memory or inference.
8
+ - Prefer deterministic local computation over mental arithmetic or paraphrase.
9
+ - Prefer the smallest sufficient tool for the current step.
10
+ - If a tool can verify the exact claim, use it.
11
+
12
+ ## Planning, Memory, And Long-Horizon Work
13
+
14
+ - For substantial, uncertain, or multi-stage tasks, create a local `plan.md` early in the workspace and keep it updated as the work progresses.
15
+ - Use `plan.md` to track the overall goal, phased goals, the current stage, and concrete acceptance checklists for each stage.
16
+ - In `plan.md`, mark work that is not yet complete as `[ ]`, work that is completed and verified as `[Y]`, and work that cannot currently be completed as `[N]` with a short factual reason.
17
+ - Keep `plan.md` aligned with reality. When evidence changes the plan, update the plan instead of continuing with an outdated plan.
18
+ - For long-running tasks, prefer `Write` to create `plan.md` and `Edit` to keep it current.
19
+ - When ongoing work depends on durable facts that may be easy to forget, maintain a local `memory.md`.
20
+ - Use `memory.md` to store important factual state such as resolved paths, URLs, measurements, assumptions, decisions, blockers, and other verified facts that should persist across the task.
21
+ - Keep `memory.md` compact, factual, and easy to update. Record evidence and decisions, not raw hidden reasoning.
22
+ - Small one-step tasks do not need a full `plan.md` or `memory.md` if they would add overhead without helping execution.
23
+
24
+ ## Exploration And Convergence
25
+
26
+ - Explore broadly enough at the beginning of a task to identify the real solution path, the relevant files, the relevant evidence, and the main constraints.
27
+ - Once you have enough evidence, converge and execute cleanly instead of reopening every branch.
28
+ - Follow the same pattern inside each phase: early exploration to understand the phase, then focused execution to finish it.
29
+ - Keep exploration purposeful. Use it to reduce uncertainty, compare plausible paths, or verify assumptions.
30
+ - Let `plan.md`, the current acceptance checklist, and newly gathered evidence determine when to continue exploring, when to revise the plan, and when to move forward.
31
+ - Non-interactive or benchmark-style runs:
32
+ - If `AskUser` is unavailable or forbidden, do not ask follow-up questions.
33
+ - Make the best independent attempt possible from the prompt, workspace, and tools.
34
+ - If the task can be answered by reading local files, searching, fetching a page, inspecting images, or running a small computation, make a bounded attempt before saying the information is unavailable.
35
+ - Keep the attempt proportional to the task; avoid unrelated research, open-ended browsing, or repeated failed tool calls once a short investigation has established the limitation.
36
+ - Interactive runs:
37
+ - Avoid asking the user before doing ordinary investigation.
38
+ - Avoid trying indefinitely when a concise clarification would unblock the task.
39
+ - First make a reasonable bounded attempt using the available workspace and tools.
40
+ - If key information, preference, or approval is still missing after that attempt, ask one concise clarification with `AskUser`, then continue from the user's answer.
41
+
42
+ ## Truthfulness, Evidence, And Claims
43
+
44
+ - Anchor your work to actual tool outputs, explicit user input, and deterministic computation.
45
+ - If evidence is missing, gather it or clearly state the limitation.
46
+ - Treat missing outputs, failed commands, and unknowns honestly.
47
+ - Keep claims proportional to the evidence you actually gathered.
48
+ - Prefer an explicit limitation over a polished but unsupported answer.
49
+ - Do not fabricate tool outputs, file contents, experiment results, citations, numeric values, or completion status.
50
+ - Do not claim that a file, report, plot, experiment, or result exists unless you produced it or verified it directly.
51
+ - If the user or task explicitly names a method, framework, protocol, model family, interpretability technique, metric, comparison axis, or ablation, treat that named item as part of the task contract.
52
+ - Do not quietly replace an explicitly named method or protocol with a looser approximation just because a generic analysis is easier.
53
+ - If an explicitly named method may be blocked by a missing library, missing data, or missing capability, verify that limitation early with tools and then state the limitation plainly before you substitute anything.
54
+
55
+ # Safety And Scope
56
+
57
+ ## Boundaries
58
+
59
+ - Stay inside the current workspace root.
60
+ - Do not attempt to access secrets, credentials, or sensitive files such as `.env`, SSH keys, cloud credentials, `.git-credentials`, or `.netrc`.
61
+ - Do not run destructive or privilege-oriented commands such as `sudo`, `su`, `shutdown`, `reboot`, disk-formatting commands, or obviously destructive deletion commands.
62
+ - Prefer read-only inspection unless the user explicitly asks for a modification or the task clearly requires one.
63
+ - Use the web tools for external information gathering. Do not use `Bash` or `Terminal*` as a substitute for arbitrary network retrieval.
64
+
65
+ # Tool Use And Execution
66
+
67
+ ## Native Tool Calling Contract
68
+
69
+ - Use the API's native tool calling interface when tools are needed. Do not write pseudo-XML, pseudo-tool JSON, or tag-based tool requests in plain text.
70
+ - If a turn includes native tool calls, that turn is a tool-use turn. Any accompanying text is treated as working context, not as the final result.
71
+ - Multiple tool calls in one turn are allowed only when they are independent.
72
+ - If tool B depends on the output of tool A, do not request them in the same turn. Wait for tool A's result first.
73
+ - If the user explicitly names required tools, call those exact tools instead of substituting a different tool.
74
+ - If you are calling tools, that turn is not finished yet. Do not draft, preview, or guess the final result, including candidate field values, partial JSON, or a "likely final result".
75
+ - Keep tool turns structured. Brief text may explain the current tool step, but the tool call itself is the action.
76
+ - When no more tools are needed, return the final result as plain text.
77
+ - If the user requires a strict format such as JSON, output only that payload as the plain final result text.
78
+ - Do not emit legacy protocol tags such as `<tool_call>`, `<tool_response>`, `<think>`, or `<answer>`.
79
+
80
+ ## Tool Selection And Routing
81
+
82
+ - Use this routing order:
83
+ - local file discovery by pathname pattern -> `Glob`
84
+ - local text search across files -> `Grep`
85
+ - local text / code / data files -> `Read`
86
+ - local PDF -> `ReadPDF`
87
+ - local image -> `ReadImage`
88
+ - local deterministic computation / parsing / transformation -> `Bash`
89
+ - discover candidate webpages -> `WebSearch`
90
+ - find paper metadata -> `ScholarSearch`
91
+ - verify actual page content -> `WebFetch`
92
+ - ask the human user for essential missing information -> `AskUser`
93
+ - persistent interactive shell state -> `Terminal*`
94
+ - Search results and scholar results are discovery aids. They are not page-verification evidence by themselves.
95
+ - Prefer `Bash` over `Terminal*` unless persistent interactive shell state is genuinely required.
96
+
97
+ ## Human Clarification Workflow
98
+
99
+ - Only use `AskUser` if it is available in the current tool list. If it is not available, do not simulate a question in plain text; continue independently and report limitations when necessary.
100
+ - Use `AskUser` only when continuing correctly depends on information, preference, or approval that cannot be determined from the workspace, available tools, or the user's existing instructions.
101
+ - Do not use `AskUser` to avoid ordinary investigation, reading files, running commands, or making a reasonable evidence-backed decision.
102
+ - Ask one concise question at a time. Include brief context when it helps the user answer accurately.
103
+ - After receiving an `AskUser` answer, treat it as explicit user input, continue the task, and preserve the answer in the normal tool trace.
104
+
105
+ ## Workspace And Local File Workflow
106
+
107
+ - Treat local files as discoverable resources inside the current workspace.
108
+ - If a workspace root was provided for this run, that workspace is the default starting location for `Bash` and `TerminalStart`.
109
+ - That means a first-turn `Bash` command like `ls` should list the workspace root directly.
110
+ - Both relative paths and absolute paths are valid local path inputs.
111
+ - Relative local paths resolve from the current workspace.
112
+ - If a tool returns an absolute path, prefer reusing that exact path in later tool calls instead of reconstructing it.
113
+ - Prefer `Glob` for file discovery by pattern and `Grep` for text search when those tools are sufficient.
114
+ - `Glob` and `Grep` default to the current workspace root.
115
+ - If the local file layout is unclear, explore it directly with `Bash`, for example `pwd`, `ls`, `find`, or `rg --files`.
116
+ - For file-modification tasks, prefer `Write` for initial creation and `Edit` for targeted follow-up changes before verification.
117
+ - Default pattern for local tasks:
118
+ - explore the workspace only if needed
119
+ - discover with `Glob` / `Grep` when helpful
120
+ - inspect with `Read` / `ReadPDF` / `ReadImage`
121
+ - compute or validate with `Bash`
122
+ - produce the final result from the actual tool output
123
+ - For PDF tasks, prefer `ReadPDF` before `Bash` whenever the PDF content itself matters.
124
+ - `ReadPDF` can expose both extracted text and extracted local image paths from the PDF parser.
125
+ - If the task asks about a figure, caption, chart, diagram, or text visible inside a local PDF figure:
126
+ - start with `ReadPDF`
127
+ - use the extracted text and extracted image paths to identify the relevant figure
128
+ - then call `ReadImage` on the actual extracted local image file
129
+ - use `Bash` only for PDF-specific processing that `ReadPDF` does not already provide
130
+ - Do not put `Read` and a path-dependent `Bash` command in the same turn when the Bash command needs the exact resolved path from `Read`.
131
+ - When moving from file tools to `Bash`, prefer the absolute path shown by `Read` / `ReadPDF` or set `workdir` to the correct directory.
132
+ - Do not assume a referenced local file sits in the current directory. If you have not yet seen the resolved path, either wait for `Read` or explore with `Bash`.
133
+ - If a previous `Bash` command failed because it guessed the wrong working directory or used a relative path incorrectly, immediately retry with the exact absolute path from the file tool output.
134
+ - If the user wants a value derived from a local file, do not guess from inspection alone when local computation is cheap. Compute it.
135
+ - If a trusted local PyTorch `.pt` or `.pth` file fails to load because of
136
+ `weights_only` defaults or missing custom classes, try a compatible recovery
137
+ path such as `weights_only=False` or explicit safe globals after verifying
138
+ the file origin inside the workspace.
139
+
140
+ ## Bash Guidance
141
+
142
+ - Treat `Bash` as the primary local execution tool.
143
+ - Use it for:
144
+ - short `python3` snippets
145
+ - `pwd`, `ls`, `find`, `rg`, `git`
146
+ - parsing CSV / JSON / text
147
+ - ranking, sorting, aggregating, validating, and formatting
148
+ - combining outputs from other tools into a deterministic result
149
+ - For temporary Python, prefer a heredoc:
150
+
151
+ ```bash
152
+ python3 - <<'PY'
153
+ print("hello")
154
+ PY
155
+ ```
156
+
157
+ - In Bash Python snippets, print only the values you need, ideally as valid JSON or short deterministic lines.
158
+ - For output-sensitive tasks, make the Bash command print machine-friendly output first, then base the final result on that exact output.
159
+ - Use explicit `timeout` values for heavier commands.
160
+ - When using `Bash` to run temporary Python, keep the script deterministic and print only the values you need.
161
+ - Do not use `Bash` for basic pathname globbing or simple text search when `Glob` or `Grep` already covers the need.
162
+
163
+ ## Web Research Workflow
164
+
165
+ - If the user asks to visit a page, fetch a page, verify against a page, confirm page content, or explicitly requires `WebFetch`, you must call `WebFetch` before producing the final result.
166
+ - If the user says "search first, then visit the page to verify it" or equivalent, the required pattern is:
167
+ - search first
168
+ - fetch the chosen page with `WebFetch`
169
+ - only then produce the final result
170
+ - Do not treat `WebSearch` or `ScholarSearch` snippets as a substitute for `WebFetch` when page verification is required.
171
+ - The `visited_url` in the final result should be a URL that was actually passed to `WebFetch`.
172
+
173
+ ## Terminal Workflow
174
+
175
+ - In most tasks, do not use `Terminal*`.
176
+ - If the user explicitly requires `Terminal*`, do not substitute `Bash`.
177
+ - Use `Terminal*` only for genuinely stateful shell workflows, such as:
178
+ - starting a long-running process and polling it later
179
+ - interacting with a REPL or debugger
180
+ - keeping shell state across multiple incremental commands
181
+ - sending `Ctrl-C` or terminating a persistent foreground process
182
+ - Do not use `Terminal*` for a single one-shot command, a single Python snippet, a single grep, or a single git command.
183
+ - If you start a terminal session, keep the lifecycle disciplined:
184
+ - `TerminalStart`
185
+ - `TerminalWrite` / `TerminalRead` as needed
186
+ - `TerminalInterrupt` only when necessary
187
+ - `TerminalKill` when done
188
+
189
+ # Recovery And Finalization
190
+
191
+ ## Failure Handling And Recovery
192
+
193
+ - If a tool fails, react to that actual failure. Do not fabricate missing outputs.
194
+ - After any tool call, wait for the returned tool response before deciding the next step.
195
+ - If a value can be checked locally with `Bash`, prefer checking it over paraphrasing from a previous tool output.
196
+ - If required tool calls have not yet been made, your only valid next move is another tool turn, not a partial result.
197
+ - If the current plan is blocked by real evidence, update `plan.md`, revise the phase goal, or change the approach instead of pretending the blocker is resolved.
198
+
199
+ ## Finalization Discipline
200
+
201
+ - The final result must satisfy the user's original request, not a simplified or reformulated version of it.
202
+ - Match the user's stated output requirements exactly when they are explicit, including format, required fields, ordering constraints, style constraints, scope constraints, and any stated completion conditions.
203
+ - If the user asks for a strict format such as JSON, Markdown, a table, bullet points, or a specific schema, the final result must follow that format exactly.
204
+ - If the user asks for specific deliverables, make sure the final result covers those deliverables directly instead of replacing them with a generic summary.
205
+ - If the user did not specify a strict final format, default to a clear, sufficiently detailed summary of what you did, what you found, what you changed or produced, and any important limitations or remaining gaps.
206
+ - Do not end with a minimal or cryptic answer when the user expects an explanation of the completed work.
207
+ - Final answers must be complete and self-contained enough for the user to understand the result directly.
208
+ - You may reference local files you created or inspected, but do not make those files the only carrier of the answer.
209
+ - When local artifacts matter, include the actual answer plus a concise summary of the relevant evidence, changes, or solution steps.
210
+ - If the user explicitly requires specific tools, satisfy that requirement before producing the final result.
211
+ - If the user asks for externally verified facts, gather evidence with the relevant web tools before producing the final result.
212
+ - If page verification is required, do not produce the final result until a `WebFetch` response has been received.
213
+ - When enough evidence has been collected, give the final result immediately.
214
+ - Before emitting the final result text, make sure:
215
+ - the final result addresses the original user request directly
216
+ - all user-required tools have already been called
217
+ - any required page verification has already gone through `WebFetch`
218
+ - any required local computation has already been checked with `Bash`
219
+ - the final payload matches the user-required format exactly
220
+ - if JSON is required, the payload is a single valid JSON object with balanced braces, no trailing commas, and no extra closing characters
221
+ - there is no unfinished tool step still pending
222
+
223
+ ## Common Mistakes To Avoid
224
+
225
+ - Do not produce the final result from search snippets when the task requires page verification.
226
+ - Do not use `ScholarSearch` as a replacement for `WebFetch` on page-verification tasks.
227
+ - Do not use `Terminal*` for one-shot work; prefer `Bash` or file tools.
228
+ - Do not reach for `Bash` first when the task is simply "find matching files" or "search text in files"; use `Glob` or `Grep`.
229
+ - Do not skip `ReadPDF` for local PDF figure tasks when `ReadPDF` can already give you the extracted text and local image paths you need.
230
+ - Do not ignore path and working-directory implications when switching from file tools to `Bash`.
231
+ - Do not output placeholder results such as `{"error":"waiting_for_required_tool_calls"}`, `TBD`, `{}`, or partial final JSON while tool work is still pending.
232
+ - Do not claim a tool was used unless this run actually contains that tool call.
agent_base/provider_compat.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Any
3
+
4
+
5
# Separators that split a provider-qualified model id into segments:
# "/", ":" and any run of whitespace.
_MODEL_NAME_SPLIT_RE = re.compile(r"[/:\s]+")


def model_rejects_sampling_params(model_name: str) -> bool:
    """Return True when any segment of *model_name* starts with ``claude``.

    The name is stripped and case-folded, then split on ``/``, ``:`` and
    whitespace so provider prefixes such as ``anthropic/claude-...`` are
    recognised. Empty or ``None`` names return False.
    """
    cleaned = str(model_name or "").strip().casefold()
    if cleaned == "":
        return False
    for segment in _MODEL_NAME_SPLIT_RE.split(cleaned):
        if segment and segment.startswith("claude"):
            return True
    return False
14
+
15
+
16
def apply_sampling_params(
    request_kwargs: dict[str, Any],
    *,
    model_name: str,
    temperature: Any = None,
    top_p: Any = None,
    presence_penalty: Any = None,
) -> None:
    """Copy the non-``None`` sampling parameters into *request_kwargs* in place.

    Nothing is written when ``model_rejects_sampling_params(model_name)`` is
    true. Keys are added in the order ``temperature``, ``top_p``,
    ``presence_penalty``.
    """
    if model_rejects_sampling_params(model_name):
        return
    candidates = (
        ("temperature", temperature),
        ("top_p", top_p),
        ("presence_penalty", presence_penalty),
    )
    for key, value in candidates:
        if value is not None:
            request_kwargs[key] = value
agent_base/react_agent.py ADDED
@@ -0,0 +1,1453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from contextlib import contextmanager
3
+ import json
4
+ import os
5
+ import re
6
+ import signal
7
+ import sys
8
+ import threading
9
+ from pathlib import Path
10
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Type
11
+
12
+ from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
13
+ import tiktoken
14
+ from agent_base.base import BaseAgent
15
+ from agent_base.console_utils import ConsoleEventPrinter
16
+ from agent_base.context_compact import compact_messages, should_compact_messages
17
+ from agent_base.model_profiles import resolve_model_profile
18
+ from agent_base.provider_compat import apply_sampling_params
19
+ from agent_base.prompt import composed_system_prompt
20
+ from agent_base.session_state import AgentSessionState, CompactionRecord, persist_session_state, resolve_session_state_path
21
+ from agent_base.trace_utils import FlatTraceWriter
22
+ from agent_base.tools.tooling import normalize_workspace_root
23
+ from agent_base.tools.tool_file import Edit, Glob, Grep, Read, ReadImage, ReadPDF, Write
24
+ from agent_base.tools.tool_runtime import Bash, TerminalInterrupt, TerminalKill, TerminalRead, TerminalStart, TerminalWrite
25
+ from agent_base.tools.tool_user import AskUser
26
+ from agent_base.tools.tool_web import ScholarSearch, WebFetch, WebSearch
27
+ from agent_base.utils import (
28
+ PROJECT_ROOT,
29
+ MissingRequiredEnvError,
30
+ append_saved_image_paths_to_prompt,
31
+ env_flag,
32
+ image_input_content_parts,
33
+ load_dotenv,
34
+ read_role_prompt_files,
35
+ require_required_env,
36
+ safe_jsonable,
37
+ stage_image_file_for_input,
38
+ )
39
+
40
+ import datetime
41
+ import random
42
+ import time
43
+
44
# One instance of every tool class imported by this module, in registration order.
AVAILABLE_TOOLS = [
    Glob(),
    Grep(),
    Read(),
    ReadPDF(),
    ReadImage(),
    Write(),
    Edit(),
    Bash(),
    WebSearch(),
    ScholarSearch(),
    WebFetch(),
    AskUser(),
    TerminalStart(),
    TerminalWrite(),
    TerminalRead(),
    TerminalInterrupt(),
    TerminalKill(),
]
# Lookup table from each tool's ``name`` attribute to its instance.
AVAILABLE_TOOL_MAP = {tool.name: tool for tool in AVAILABLE_TOOLS}
# NOTE(review): presumably a per-image token estimate for context budgeting; usage is not visible here.
DEFAULT_IMAGE_TOKEN_ESTIMATE = 1536
# Fallback when the MODEL_NAME environment variable is unset.
DEFAULT_MODEL_NAME = "gpt-5.4"
# Fallback for MAX_LLM_CALL_PER_RUN.
DEFAULT_MAX_LLM_CALLS = 100
# Fallback for MAX_AGENT_ROUNDS.
DEFAULT_MAX_ROUNDS = 100
# Fallback for MAX_AGENT_RUNTIME_SECONDS: 150 minutes expressed in seconds.
DEFAULT_MAX_RUNTIME_SECONDS = 150 * 60
# Fallback for LLM_MAX_OUTPUT_TOKENS.
DEFAULT_MAX_OUTPUT_TOKENS = 10000
# NOTE(review): input-token budget and retry count; consumers are outside this chunk.
DEFAULT_MAX_INPUT_TOKENS = 320000
DEFAULT_MAX_RETRIES = 10
# NOTE(review): default sampling values; presumably forwarded to apply_sampling_params — confirm.
DEFAULT_TEMPERATURE = 0.6
DEFAULT_TOP_P = 0.95
DEFAULT_PRESENCE_PENALTY = 1.1
# Fallback for LLM_TIMEOUT_SECONDS (seconds).
DEFAULT_LLM_TIMEOUT_SECONDS = 600.0
76
+
77
+
78
class LLMHardTimeoutError(TimeoutError):
    """Raised by the SIGALRM handler when an LLM request exceeds its hard timeout."""
80
+
81
+
82
@contextmanager
def llm_hard_timeout(timeout_seconds: float):
    """Bound the wrapped block with a SIGALRM-based wall-clock timeout.

    The guard is a no-op (the block runs unbounded) when *timeout_seconds* is
    not positive, when called off the main thread, or when the platform has
    no ``SIGALRM``; signal handlers can only be installed from the main thread.

    Raises:
        LLMHardTimeoutError: from inside the wrapped block when the timer fires.

    On exit the previous SIGALRM handler and any previously armed
    ``ITIMER_REAL`` are restored. The restored timer is shortened by the time
    spent inside this block so an enclosing deadline is not silently extended.
    """
    if (
        timeout_seconds <= 0
        or threading.current_thread() is not threading.main_thread()
        or not hasattr(signal, "SIGALRM")
    ):
        yield
        return

    def _handle_timeout(signum, frame):
        raise LLMHardTimeoutError(f"LLM request exceeded hard timeout of {timeout_seconds:.1f}s")

    previous_handler = signal.getsignal(signal.SIGALRM)
    previous_delay, previous_interval = signal.getitimer(signal.ITIMER_REAL)
    started_at = time.monotonic()
    signal.signal(signal.SIGALRM, _handle_timeout)
    signal.setitimer(signal.ITIMER_REAL, timeout_seconds)
    try:
        yield
    finally:
        # Disarm our timer before swapping handlers so a late alarm cannot
        # reach the wrong handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # getsignal() returns None for handlers not installed from Python;
        # signal.signal() rejects None, so fall back to the default action.
        signal.signal(
            signal.SIGALRM,
            previous_handler if previous_handler is not None else signal.SIG_DFL,
        )
        if previous_delay > 0:
            elapsed = time.monotonic() - started_at
            # A delay of 0 would disable the timer, so an already-expired outer
            # deadline is re-armed with a tiny positive delay to fire at once.
            remaining = max(previous_delay - elapsed, 1e-6)
            signal.setitimer(signal.ITIMER_REAL, remaining, previous_interval)
106
+
107
+
108
def today_date():
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    current_day = datetime.date.today()
    return current_day.isoformat()
110
+
111
+
112
def max_llm_calls_per_run() -> int:
    """Read ``MAX_LLM_CALL_PER_RUN`` from the environment, defaulting to ``DEFAULT_MAX_LLM_CALLS``."""
    raw_value = os.getenv("MAX_LLM_CALL_PER_RUN")
    if raw_value is None:
        raw_value = str(DEFAULT_MAX_LLM_CALLS)
    return int(raw_value)
114
+
115
+
116
def max_agent_rounds() -> int:
    """Read ``MAX_AGENT_ROUNDS`` from the environment, defaulting to ``DEFAULT_MAX_ROUNDS``."""
    raw_value = os.getenv("MAX_AGENT_ROUNDS")
    if raw_value is None:
        raw_value = str(DEFAULT_MAX_ROUNDS)
    return int(raw_value)
118
+
119
+
120
def max_agent_runtime_seconds() -> int:
    """Read ``MAX_AGENT_RUNTIME_SECONDS`` from the environment, defaulting to ``DEFAULT_MAX_RUNTIME_SECONDS``."""
    raw_value = os.getenv("MAX_AGENT_RUNTIME_SECONDS")
    if raw_value is None:
        raw_value = str(DEFAULT_MAX_RUNTIME_SECONDS)
    return int(raw_value)
122
+
123
+
124
def llm_max_output_tokens() -> int:
    """Read ``LLM_MAX_OUTPUT_TOKENS`` from the environment, defaulting to ``DEFAULT_MAX_OUTPUT_TOKENS``."""
    raw_value = os.getenv("LLM_MAX_OUTPUT_TOKENS")
    if raw_value is None:
        raw_value = str(DEFAULT_MAX_OUTPUT_TOKENS)
    return int(raw_value)
126
+
127
+
128
def remaining_runtime_seconds(runtime_deadline: Optional[float]) -> Optional[float]:
    """Return seconds left until *runtime_deadline* (a ``time.time()`` value).

    ``None`` means "no deadline" and is passed through. The result is
    negative once the deadline has passed.
    """
    return None if runtime_deadline is None else runtime_deadline - time.time()
132
+
133
+
134
def debug_enabled() -> bool:
    """Return the ``DEBUG_AGENT`` environment flag as interpreted by ``env_flag``."""
    flag_name = "DEBUG_AGENT"
    return env_flag(flag_name)
136
+
137
+
138
def assistant_text_content(content: Any) -> str:
    """Flatten assistant message content into a single string.

    ``None`` becomes ``""`` and strings pass through. A list is concatenated
    without separators: ``{"type": "text"}`` parts contribute their ``text``
    value, any other part contributes ``str(part)``. Other values are
    stringified.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)
    return "".join(
        str(piece.get("text", ""))
        if isinstance(piece, dict) and piece.get("type") == "text"
        else str(piece)
        for piece in content
    )
152
+
153
+
154
def message_trace_text(content: Any) -> str:
    """Render message content as trace-friendly text.

    Strings pass through and non-list values are stringified. For lists, each
    part is rendered (text parts as their text, ``image_url`` parts as an
    ``[image_url: ...]`` marker with inline base64 payloads elided, anything
    else via ``str``) and the non-empty renderings are joined with newlines.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    def _render(part: Any) -> str:
        if not isinstance(part, dict):
            return str(part)
        kind = part.get("type")
        if kind == "text":
            return str(part.get("text", ""))
        if kind != "image_url":
            return str(part)
        image_block = part.get("image_url", {})
        raw_url = image_block.get("url", "") if isinstance(image_block, dict) else ""
        shown = str(raw_url)
        if shown.startswith("data:image/"):
            # Keep only the data-URL header so traces never carry image bytes.
            shown = shown.split(",", 1)[0] + ",...(base64 omitted)"
        return f"[image_url: {shown}]"

    rendered = [_render(part) for part in content]
    return "\n".join(piece for piece in rendered if piece)
177
+
178
+
179
+ def _message_has_image_content(message: dict[str, Any]) -> bool:
180
+ content = message.get("content")
181
+ return isinstance(content, list) and any(isinstance(part, dict) and part.get("type") == "image_url" for part in content)
182
+
183
+
184
+ def _last_assistant_message_index(messages: Sequence[dict[str, Any]]) -> int:
185
+ for index in range(len(messages) - 1, -1, -1):
186
+ if isinstance(messages[index], dict) and messages[index].get("role") == "assistant":
187
+ return index
188
+ return -1
189
+
190
+
191
+ def _image_reference_summary(part: dict[str, Any]) -> str:
192
+ image_url = part.get("image_url", {})
193
+ url = image_url.get("url", "") if isinstance(image_url, dict) else ""
194
+ url_text = str(url)
195
+ if url_text.startswith("data:image/"):
196
+ return url_text.split(",", 1)[0] + ",...(base64 omitted)"
197
+ elif len(url_text) > 180:
198
+ return url_text[:180] + "...(truncated)"
199
+ return url_text or "unavailable"
200
+
201
+
202
+ def _image_path_hint_from_text(text: str) -> str:
203
+ patterns = (
204
+ r"\[User-provided image saved at ([^\]\n]+)\]",
205
+ r"Local image path:\s*([^\n]+)",
206
+ )
207
+ for pattern in patterns:
208
+ match = re.search(pattern, text)
209
+ if match:
210
+ return match.group(1).strip()
211
+ return ""
212
+
213
+
214
def _omitted_image_part_text(part: dict[str, Any], *, saved_path_hint: str = "") -> str:
    """Build the placeholder text that stands in for an aged-out image part.

    The text carries the summarised original reference and, when
    *saved_path_hint* is given, the saved local path so the image can be
    re-read with ReadImage.
    """
    reference = _image_reference_summary(part)
    saved_path_sentence = ""
    if saved_path_hint:
        saved_path_sentence = f" Saved local path: {saved_path_hint}."
    fragments = [
        "[Previous image omitted from this model request to avoid repeatedly resending image bytes. ",
        f"Original image reference: {reference}. ",
        f"{saved_path_sentence} ",
        "The nearby conversation text or tool metadata records saved local paths when available; ",
        "use ReadImage on the saved path if visual details are needed again.]",
    ]
    return "".join(fragments)
224
+
225
+
226
def _replace_image_parts_with_text(content: Any, *, message_index: int) -> tuple[Any, list[dict[str, Any]]]:
    """Swap every ``image_url`` part in *content* for a text placeholder.

    Returns ``(new_content, omitted_records)``. Non-list content is returned
    unchanged with no records. Each record notes the message index, the
    image's ordinal within the message, a summary of the original reference,
    and the most recent saved-path hint seen in preceding text parts.
    """
    if not isinstance(content, list):
        return content, []
    rewritten: list[Any] = []
    omitted: list[dict[str, Any]] = []
    images_seen = 0
    latest_hint = ""
    for item in content:
        item_type = item.get("type") if isinstance(item, dict) else None
        if item_type == "text":
            hint = _image_path_hint_from_text(str(item.get("text", "")))
            if hint:
                latest_hint = hint
        if item_type != "image_url":
            rewritten.append(safe_jsonable(item))
            continue
        omitted.append(
            {
                "message_index": message_index,
                "image_index": images_seen,
                "reference_summary": _image_reference_summary(item),
                "saved_path_hint": latest_hint,
            }
        )
        rewritten.append({"type": "text", "text": _omitted_image_part_text(item, saved_path_hint=latest_hint)})
        images_seen += 1
    return rewritten, omitted
253
+
254
+
255
def prepare_messages_for_llm(messages: Sequence[dict[str, Any]]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Build the request message list, ageing out images the model already saw.

    Image parts in any message at or before the last assistant message are
    replaced with text placeholders; images after that point are sent as-is.
    Non-dict entries are coerced into a user message holding their string
    form.

    Returns:
        ``(request_messages, image_aging)`` where ``image_aging`` records the
        count and details of the omitted images.
    """
    cutoff = _last_assistant_message_index(messages)
    outgoing: list[dict[str, Any]] = []
    dropped: list[dict[str, Any]] = []
    for position, original in enumerate(messages):
        current = safe_jsonable(original)
        if not isinstance(current, dict):
            outgoing.append({"role": "user", "content": str(current)})
            continue
        is_old = position <= cutoff
        if is_old and _message_has_image_content(current):
            # Copy before mutating so the jsonable snapshot is not aliased.
            current = dict(current)
            new_content, dropped_here = _replace_image_parts_with_text(
                current.get("content"),
                message_index=position,
            )
            current["content"] = new_content
            dropped.extend(dropped_here)
        outgoing.append(current)
    return outgoing, {
        "omitted_image_count": len(dropped),
        "omitted_images": dropped,
    }
283
+
284
+
285
def assistant_reasoning_content(message: Any) -> Optional[Any]:
    """Extract ``reasoning_content`` from an assistant message object, if present.

    Three lookups are tried in order: the ``model_dump()`` dictionary, the
    ``model_extra`` dictionary, and finally a plain ``reasoning_content``
    attribute. Returns ``None`` when none of them provides a value.
    """
    if hasattr(message, "model_dump"):
        try:
            dumped = safe_jsonable(message.model_dump())
            if isinstance(dumped, dict) and "reasoning_content" in dumped:
                return dumped.get("reasoning_content")
        except Exception:
            # Best effort: a failing dump falls through to the other lookups.
            pass
    model_extra = getattr(message, "model_extra", None)
    if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
        return safe_jsonable(model_extra.get("reasoning_content"))
    raw_reasoning = getattr(message, "reasoning_content", None)
    if raw_reasoning is None:
        return None
    return safe_jsonable(raw_reasoning)
300
+
301
+
302
def assistant_has_meaningful_text(content: Any) -> bool:
    """Return True when the flattened assistant text contains non-whitespace characters."""
    flattened = assistant_text_content(content)
    return flattened.strip() != ""
304
+
305
+
306
def input_tokens_from_usage(usage: Any) -> Optional[int]:
    """Return the input-token count from a usage dict, or ``None``.

    ``prompt_tokens`` is preferred over ``input_tokens``; a key is used only
    when its value is an ``int``.
    """
    if not isinstance(usage, dict):
        return None
    candidates = (usage.get(key) for key in ("prompt_tokens", "input_tokens"))
    return next((count for count in candidates if isinstance(count, int)), None)
314
+
315
+
316
def llm_call_trace_payload(
    *,
    request_messages: Sequence[dict[str, Any]],
    image_aging: Optional[dict[str, Any]] = None,
    response: Any,
    model_name: str,
    native_tools: Sequence[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the JSON-safe trace record for one LLM call.

    ``image_aging`` is included only when it reports at least one omitted
    image.
    """
    tools_list = list(native_tools)
    record: dict[str, Any] = {
        "model_name": model_name,
        "request_messages": safe_jsonable(list(request_messages)),
        "tools_enabled": bool(native_tools),
        "native_tools": safe_jsonable(tools_list),
        "response": safe_jsonable(response),
    }
    omitted_count = int(image_aging.get("omitted_image_count", 0) or 0) if image_aging else 0
    if omitted_count > 0:
        record["image_aging"] = safe_jsonable(image_aging)
    return record
334
+
335
+
336
def compaction_trace_payload(
    *,
    trigger_reason: str,
    outcome: Any,
) -> dict[str, Any]:
    """Assemble the trace record for a context-compaction attempt.

    Every field is read from *outcome* with ``getattr`` and a default, so a
    partially populated outcome object still yields a complete record.
    """

    def _field(attribute: str, fallback: Any) -> Any:
        return getattr(outcome, attribute, fallback)

    return {
        "trigger_reason": trigger_reason,
        "status": _field("status", ""),
        "error": _field("error", ""),
        "prior_token_estimate": _field("prior_token_estimate", 0),
        "new_token_estimate": _field("new_token_estimate", 0),
        "compacted_group_count": _field("compacted_group_count", 0),
        "kept_group_count": _field("kept_group_count", 0),
        "existing_memory_text": _field("existing_memory_text", ""),
        "summary_request": safe_jsonable(_field("summary_request", []) or []),
        "summary_response": safe_jsonable(_field("summary_response", {}) or {}),
        "summary_text": _field("summary_text", ""),
        "pre_messages": safe_jsonable(_field("pre_messages", []) or []),
        "post_messages": safe_jsonable(_field("post_messages", []) or []),
    }
356
+
357
+
358
def legacy_protocol_error(content: str) -> Optional[str]:
    """Return an error message when *content* opens with a deprecated text-protocol tag.

    Leading whitespace is ignored. Returns ``None`` when the text does not
    start with one of the legacy tags.
    """
    head = content.lstrip()
    for tag in ("<tool_call>", "<tool_response>", "<think>", "<answer>"):
        if head.startswith(tag):
            return f"assistant emitted deprecated text {tag} protocol"
    return None
369
+
370
+
371
def tool_schema(tool: Any) -> dict[str, Any]:
    """Wrap a tool's ``name``, ``description`` and ``parameters`` in a function-tool schema dict."""
    function_spec = {
        "name": tool.name,
        "description": tool.description,
        "parameters": tool.parameters,
    }
    return {"type": "function", "function": function_spec}
380
+
381
+
382
def resolved_tool_names(function_list: Optional[Sequence[str]]) -> list[str]:
    """Return the tool names to enable.

    ``None`` selects every registered tool. Otherwise each entry is
    stringified and stripped, and blank entries are dropped.
    """
    if function_list is None:
        return list(AVAILABLE_TOOL_MAP.keys())
    stripped_names = (str(entry).strip() for entry in function_list)
    return [name for name in stripped_names if name]
391
+
392
+
393
def available_tool_schemas(function_list: Optional[Sequence[str]] = None) -> list[dict[str, Any]]:
    """Return function-tool schemas for the selected tools.

    Raises:
        KeyError: if a requested name is not in ``AVAILABLE_TOOL_MAP``.
    """
    schemas: list[dict[str, Any]] = []
    for tool_name in resolved_tool_names(function_list):
        schemas.append(tool_schema(AVAILABLE_TOOL_MAP[tool_name]))
    return schemas
395
+
396
+
397
def normalized_tool_call(tool_call: Any) -> dict[str, Any]:
    """Convert a tool-call object into a plain function-call dict.

    A missing ``id`` attribute becomes ``""``. The ``function.name`` and
    ``function.arguments`` attributes are copied unchanged.
    """
    function_obj = tool_call.function
    call_id = getattr(tool_call, "id", "")
    return {
        "id": call_id,
        "type": "function",
        "function": {
            "name": function_obj.name,
            "arguments": function_obj.arguments,
        },
    }
406
+
407
+
408
def tool_result_message_content(result: Any) -> str:
    """Serialise a tool result into the string body of a tool message.

    Image tool results contribute only their metadata text (with a fixed
    fallback when it is blank). Other dicts and lists are JSON-encoded
    without ASCII escaping, and everything else is stringified.
    """
    is_image_result = isinstance(result, dict) and result.get("kind") == "image_tool_result"
    if is_image_result:
        metadata = str(result.get("text", "")).strip()
        return metadata if metadata else "ReadImage returned no metadata."
    if not isinstance(result, (dict, list)):
        return str(result)
    return json.dumps(safe_jsonable(result), ensure_ascii=False)
414
+
415
+
416
def model_supports_runtime_image_parts(model_name: str) -> bool:
    """Return False for model names containing ``deepseek`` (case-insensitive), True otherwise."""
    lowered = str(model_name or "").strip().casefold()
    return "deepseek" not in lowered
421
+
422
+
423
def image_context_message(result: Any, model_name: str) -> Optional[dict[str, Any]]:
    """Build the follow-up user message that carries a ReadImage result.

    Returns ``None`` when *result* is not an image tool result, or when the
    model accepts image parts but the result has no image URL. For models
    that do not accept runtime image parts, a text-only message with the
    metadata and an explanatory note is returned. Otherwise the message has
    a text part followed by the ``image_url`` part.
    """
    if not isinstance(result, dict) or result.get("kind") != "image_tool_result":
        return None
    accepts_images = model_supports_runtime_image_parts(model_name)
    url = str(result.get("image_url", "")).strip()
    if accepts_images and not url:
        return None
    body = (
        "Runtime image context from ReadImage.\n"
        "Use the attached image as evidence produced by that tool call when deciding the next step or final result.\n"
        "Do not assume that all required tool work is complete merely because an image is attached."
    )
    metadata = str(result.get("text", "")).strip()
    if metadata:
        body += "\n\nReadImage metadata:\n" + metadata
    if accepts_images:
        return {
            "role": "user",
            "content": [
                {"type": "text", "text": body},
                {"type": "image_url", "image_url": {"url": url, "detail": "auto"}},
            ],
        }
    body += (
        "\n\nThis model endpoint does not accept runtime image content parts, so only the "
        "ReadImage metadata is forwarded in conversation history. Do not invent visual details "
        "that are not supported by the metadata."
    )
    return {"role": "user", "content": body}
451
+
452
+
453
def api_tool_message(tool_call_id: str, result: Any) -> dict[str, Any]:
    """Build the ``role="tool"`` message answering the call identified by *tool_call_id*."""
    body = tool_result_message_content(result)
    return {"role": "tool", "tool_call_id": tool_call_id, "content": body}
459
+
460
+
461
def assistant_history_message(
    *,
    content: Any,
    tool_calls: Optional[list[dict[str, Any]]] = None,
    reasoning_content: Optional[Any] = None,
    raw_message: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """Build the assistant message stored in conversation history.

    When *raw_message* is a dict, its JSON-safe copy is used as the base so
    any extra fields it carries are preserved; the explicit arguments only
    fill in gaps, and empty ``tool_calls`` / ``None`` ``reasoning_content``
    entries are removed. Otherwise a minimal message is built from the
    explicit arguments.
    """
    if isinstance(raw_message, dict):
        message = safe_jsonable(raw_message)
        if isinstance(message, dict):
            message["role"] = "assistant"
            # Explicit content wins; a None content only fills a missing key.
            if content is not None or "content" not in message:
                message["content"] = content
            if tool_calls and "tool_calls" not in message:
                message["tool_calls"] = tool_calls
            elif "tool_calls" in message and not message.get("tool_calls"):
                # Drop an empty/None tool_calls entry rather than replaying it.
                message.pop("tool_calls", None)
            if reasoning_content is not None and "reasoning_content" not in message:
                message["reasoning_content"] = reasoning_content
            elif "reasoning_content" in message and message.get("reasoning_content") is None:
                message.pop("reasoning_content", None)
            return message
    # No usable raw message: build the minimal form.
    message: dict[str, Any] = {"role": "assistant", "content": content}
    if tool_calls:
        message["tool_calls"] = tool_calls
    if reasoning_content is not None:
        message["reasoning_content"] = reasoning_content
    return message
489
+
490
+
491
def assistant_retry_history_message(
    *,
    content: Any,
    reasoning_content: Optional[Any] = None,
) -> Optional[dict[str, Any]]:
    """Build a tool-call-free assistant message for retry/correction branches.

    Returns ``None`` when there is neither reasoning content nor meaningful
    text to keep.
    """
    nothing_to_keep = reasoning_content is None and not assistant_has_meaningful_text(content)
    if nothing_to_keep:
        return None
    # Tool calls are deliberately left out: replaying an unfinished tool call
    # would make the history invalid, while the text and provider-specific
    # reasoning state are still worth preserving.
    flattened = assistant_text_content(content)
    return assistant_history_message(content=flattened, reasoning_content=reasoning_content)
505
+
506
+
507
def parse_tool_arguments_list(tool_calls: list[dict[str, Any]]) -> list[Any]:
    """Decode the ``function.arguments`` payload of each tool call.

    String payloads are JSON-decoded. If the decoded value is itself a string
    that looks like a JSON object or array (double-encoded arguments), it is
    decoded once more. Anything that fails to decode is returned as-is, and
    every result is passed through ``safe_jsonable``.
    """

    def _decode(raw_value: Any) -> Any:
        if not isinstance(raw_value, str):
            return raw_value
        try:
            decoded = json.loads(raw_value)
        except (TypeError, ValueError):
            return raw_value
        if not isinstance(decoded, str):
            return decoded
        inner = decoded.strip()
        if not inner.startswith(("{", "[")):
            return decoded
        try:
            return json.loads(inner)
        except (TypeError, ValueError):
            return decoded

    results: list[Any] = []
    for call in tool_calls:
        function_block = call.get("function", {}) if isinstance(call, dict) else {}
        raw_arguments = function_block.get("arguments", {})
        results.append(safe_jsonable(_decode(raw_arguments)))
    return results
531
+
532
+
533
def image_trace_paths(result: Any) -> list[str]:
    """Return the image path carried by an image tool result, for trace records.

    Args:
        result: A tool result of any type.

    Returns:
        A one-element list with the whitespace-stripped ``"path"`` value when
        ``result`` is a dict whose ``"kind"`` is ``"image_tool_result"`` and
        the stripped path is non-empty; otherwise an empty list.
    """
    is_image_result = isinstance(result, dict) and result.get("kind") == "image_tool_result"
    if is_image_result:
        cleaned_path = str(result.get("path", "")).strip()
        if cleaned_path:
            return [cleaned_path]
    return []
538
+
539
+
540
def image_context_trace_text(result: Any) -> str:
    """Render the trace text that accompanies an attached image context message.

    Args:
        result: A tool result of any type.

    Returns:
        An empty string unless ``result`` is a dict whose ``"kind"`` is
        ``"image_tool_result"``. For such a dict, a fixed explanatory note is
        returned, followed by a ``ReadImage metadata`` section when the
        stripped ``"text"`` value is non-empty.
    """
    if not (isinstance(result, dict) and result.get("kind") == "image_tool_result"):
        return ""

    sections = [
        "Runtime image context from ReadImage.\n"
        "Use the attached image as evidence produced by that tool call when deciding the next step or final result.\n"
        "Do not assume that all required tool work is complete merely because an image is attached."
    ]
    metadata = str(result.get("text", "")).strip()
    if metadata:
        sections.append("ReadImage metadata:\n" + metadata)
    # Sections are separated by one blank line.
    return "\n\n".join(sections)
552
+
553
+
554
def default_llm_config() -> dict:
    """Assemble the default LLM configuration from environment variables.

    Every numeric setting is read from its environment variable and falls
    back to the matching ``DEFAULT_*`` module constant when the variable is
    unset. ``API_KEY`` falls back to ``"EMPTY"`` and ``API_BASE`` to ``None``.

    Returns:
        A dict with ``"model"``, ``"api_key"``, ``"api_base"``,
        ``"timeout_seconds"`` and a nested ``"generate_cfg"`` dict.

    Raises:
        ValueError: If a numeric environment variable cannot be parsed by
            ``int``/``float``.
    """
    env = os.environ

    def _numeric(cast, variable, fallback):
        # Fallbacks go through str() so both sources are parsed the same way.
        return cast(env.get(variable, str(fallback)))

    config: dict = {"model": env.get("MODEL_NAME", DEFAULT_MODEL_NAME)}
    config["api_key"] = env.get("API_KEY", "EMPTY")
    config["api_base"] = env.get("API_BASE")
    config["timeout_seconds"] = _numeric(float, "LLM_TIMEOUT_SECONDS", DEFAULT_LLM_TIMEOUT_SECONDS)
    config["generate_cfg"] = {
        "max_input_tokens": _numeric(int, "MAX_INPUT_TOKENS", DEFAULT_MAX_INPUT_TOKENS),
        "max_output_tokens": _numeric(int, "LLM_MAX_OUTPUT_TOKENS", DEFAULT_MAX_OUTPUT_TOKENS),
        "max_retries": _numeric(int, "LLM_MAX_RETRIES", DEFAULT_MAX_RETRIES),
        "temperature": _numeric(float, "TEMPERATURE", DEFAULT_TEMPERATURE),
        "top_p": _numeric(float, "TOP_P", DEFAULT_TOP_P),
        "presence_penalty": _numeric(float, "PRESENCE_PENALTY", DEFAULT_PRESENCE_PENALTY),
    }
    return config
570
+
571
+
572
def execute_tool_by_name(tool_map: dict[str, Any], tool_name: str, tool_args: Any, **kwargs):
    """Look up a tool by name and invoke it.

    Args:
        tool_map: Mapping from tool name to tool object.
        tool_name: Name of the tool to run.
        tool_args: Arguments forwarded as the tool's first positional argument.
        **kwargs: Extra keyword arguments forwarded unchanged to the tool.

    Returns:
        The tool's return value, or the string ``"Error: Tool <name> not found"``
        when ``tool_name`` is not a key of ``tool_map``.
    """
    if tool_name in tool_map:
        selected = tool_map[tool_name]
        # Only the tool named "ReadImage" is routed to ``call_for_llm``, and
        # only when the tool object actually provides that method.
        use_llm_entry = tool_name == "ReadImage" and hasattr(selected, "call_for_llm")
        handler = selected.call_for_llm if use_llm_entry else selected.call
        return handler(tool_args, **kwargs)
    return f"Error: Tool {tool_name} not found"
579
+
580
+
581
+ class MultiTurnReactAgent(BaseAgent):
582
def __init__(
    self,
    function_list: Optional[List[str]] = None,
    llm: Optional[Dict] = None,
    trace_dir: Optional[str] = None,
    role_prompt: Optional[str] = None,
    max_llm_calls: Optional[int] = None,
    max_rounds: Optional[int] = None,
    max_runtime_seconds: Optional[int] = None,
):
    """Validate the configuration and set up tools, limits, token accounting and the LLM client.

    Args:
        function_list: Tool names to enable, passed through
            ``self.resolve_function_list``. When that returns ``None`` every
            tool in ``AVAILABLE_TOOL_MAP`` is enabled.
        llm: Required configuration dict. Must contain a ``"model"`` whose
            string form is not blank and a dict ``"generate_cfg"``; may also
            contain ``"timeout_seconds"``, ``"api_key"`` and ``"api_base"``.
        trace_dir: Optional trace directory, stored as a ``Path``.
        role_prompt: Optional role prompt, passed through
            ``self.resolve_role_prompt``.
        max_llm_calls: LLM call budget; defaults to ``max_llm_calls_per_run()``.
        max_rounds: Round budget; defaults to ``max_agent_rounds()``.
        max_runtime_seconds: Runtime budget in seconds; defaults to
            ``max_agent_runtime_seconds()``.

    Raises:
        ValueError: If ``llm`` is not a dict, an unknown tool is requested,
            ``llm["model"]`` or ``llm["generate_cfg"]`` is invalid, or
            ``max_rounds`` is not positive.
    """
    if not isinstance(llm, dict):
        raise ValueError("llm must be a dict configuration.")
    requested_tools = self.resolve_function_list(function_list)
    if requested_tools is None:
        requested_tools = list(AVAILABLE_TOOL_MAP.keys())
    unknown_tools = [tool for tool in requested_tools if tool not in AVAILABLE_TOOL_MAP]
    if unknown_tools:
        raise ValueError(f"Unknown tools requested: {unknown_tools}")
    if "model" not in llm or not str(llm["model"]).strip():
        raise ValueError('llm["model"] must be a non-empty string.')
    if "generate_cfg" not in llm or not isinstance(llm["generate_cfg"], dict):
        raise ValueError('llm["generate_cfg"] must be a dict.')

    self.tool_map = {tool_name: AVAILABLE_TOOL_MAP[tool_name] for tool_name in requested_tools}
    self.tool_names = list(self.tool_map.keys())
    self.model = str(llm["model"])
    self.llm_generate_cfg = llm["generate_cfg"]
    self.trace_dir = Path(trace_dir) if trace_dir else None
    self.trace_path: Optional[Path] = None
    self.session_state_path: Optional[Path] = None
    self.role_prompt = self.resolve_role_prompt(role_prompt)
    self.max_llm_calls = int(max_llm_calls) if max_llm_calls is not None else max_llm_calls_per_run()
    self.max_rounds = int(max_rounds) if max_rounds is not None else max_agent_rounds()
    self.max_runtime_seconds = (
        int(max_runtime_seconds) if max_runtime_seconds is not None else max_agent_runtime_seconds()
    )
    if self.max_rounds <= 0:
        raise ValueError("max_rounds must be > 0.")

    # The tool schemas are sent with every request, so their token cost is
    # estimated once here and reused by count_tokens().
    self._native_tools = [tool_schema(self.tool_map[tool_name]) for tool_name in self.tool_names]
    self._encoding = tiktoken.get_encoding("cl100k_base")
    self._native_tools_token_estimate = len(
        self._encoding.encode(json.dumps(self._native_tools, ensure_ascii=False))
    )

    # An explicit ``"timeout_seconds": None`` is treated like a missing key,
    # matching the fallback behaviour of api_key/api_base below, instead of
    # failing in float(None).
    configured_timeout = llm.get("timeout_seconds")
    if configured_timeout is None:
        configured_timeout = os.getenv("LLM_TIMEOUT_SECONDS", str(DEFAULT_LLM_TIMEOUT_SECONDS))
    self._llm_timeout_seconds = float(configured_timeout)

    self._llm_api_key = str(llm.get("api_key") or os.environ.get("API_KEY", "EMPTY"))
    api_base = str(llm.get("api_base") or os.environ.get("API_BASE", "")).strip()
    self._llm_api_base = api_base or None
    # Without an API base no client is created; _call_chat_completion then
    # reports the missing API_BASE as an error result.
    self._llm_client = (
        OpenAI(
            api_key=self._llm_api_key,
            base_url=self._llm_api_base,
            timeout=self._llm_timeout_seconds,
        )
        if self._llm_api_base
        else None
    )
640
+
641
def _call_chat_completion(
    self,
    msgs,
    *,
    include_native_tools: bool,
    max_tries=10,
    runtime_deadline: Optional[float] = None,
    max_output_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    presence_penalty: Optional[float] = None,
) -> dict[str, Any]:
    """Send one chat-completion request, retrying with capped exponential backoff.

    Args:
        msgs: Messages forwarded as the request's ``messages``.
        include_native_tools: When true (and tools are configured) the native
            tool schemas are attached with ``tool_choice="auto"`` and
            ``parallel_tool_calls=True``.
        max_tries: Attempt budget used only when ``llm_generate_cfg`` has no
            ``"max_retries"`` entry; a configured value takes precedence.
        runtime_deadline: Optional deadline passed to
            ``remaining_runtime_seconds``; it bounds the per-request timeout
            and the backoff sleeps, and stops retrying once exhausted.
        max_output_tokens: Overrides the configured ``max_output_tokens``.
        temperature: Overrides the configured temperature (default 0.6).
        top_p: Overrides the configured top_p (default 0.95).
        presence_penalty: Overrides the configured presence penalty
            (default 1.1).

    Returns:
        On success ``{"status": "ok", "finish_reason", "content",
        "tool_calls", "reasoning_content", "raw_message", "usage"}``.
        Otherwise ``{"status": "error", "error": "llm api error: ..."}``
        describing the last failure.
    """
    # NOTE: a configured "max_retries" always wins over the max_tries argument.
    max_tries = int(self.llm_generate_cfg.get("max_retries", max_tries))
    if self._llm_client is None or not self._llm_api_base:
        return {"status": "error", "error": "llm api error: API_BASE is not set."}

    base_sleep_time = 1
    last_error = "unknown llm error"
    for attempt in range(max_tries):
        remaining = remaining_runtime_seconds(runtime_deadline)
        if remaining is not None and remaining <= 0:
            last_error = "agent runtime limit reached before llm call could complete"
            break
        try:
            if debug_enabled():
                print(f"--- Attempting to call the service, try {attempt + 1}/{max_tries} ---")
            # Never wait longer than the time left before the runtime deadline.
            request_timeout = (
                min(self._llm_timeout_seconds, max(remaining, 0.001))
                if remaining is not None
                else self._llm_timeout_seconds
            )
            request_client = self._llm_client.with_options(timeout=request_timeout)
            request_kwargs = dict(
                model=self.model,
                messages=msgs,
                max_tokens=int(
                    max_output_tokens
                    if max_output_tokens is not None
                    else self.llm_generate_cfg.get("max_output_tokens", llm_max_output_tokens())
                ),
            )
            # Called for its effect on request_kwargs; the return value is unused.
            apply_sampling_params(
                request_kwargs,
                model_name=self.model,
                temperature=(
                    temperature if temperature is not None else self.llm_generate_cfg.get("temperature", 0.6)
                ),
                top_p=top_p if top_p is not None else self.llm_generate_cfg.get("top_p", 0.95),
                presence_penalty=(
                    presence_penalty
                    if presence_penalty is not None
                    else self.llm_generate_cfg.get("presence_penalty", 1.1)
                ),
            )
            if include_native_tools and self._native_tools:
                request_kwargs["tools"] = self._native_tools
                request_kwargs["tool_choice"] = "auto"
                request_kwargs["parallel_tool_calls"] = True
            with llm_hard_timeout(request_timeout):
                chat_response = request_client.chat.completions.create(**request_kwargs)

            # A response without choices (or a choice without a message) is
            # handled as an empty response and retried. Indexing choices[0]
            # directly would raise IndexError/TypeError, which the except
            # clause below does not catch, so it would escape the retry loop.
            choices = getattr(chat_response, "choices", None) or []
            choice = choices[0] if choices else None
            message = getattr(choice, "message", None)
            if message is not None:
                content = message.content
                tool_calls = [normalized_tool_call(tool_call) for tool_call in (message.tool_calls or [])]
                reasoning_content = assistant_reasoning_content(message)
                raw_message = safe_jsonable(message.model_dump()) if hasattr(message, "model_dump") else None
                usage = (
                    safe_jsonable(chat_response.usage.model_dump())
                    if getattr(chat_response, "usage", None)
                    else None
                )
                if assistant_has_meaningful_text(content) or tool_calls:
                    if debug_enabled():
                        print("--- Service call successful, received a valid response ---")
                    return {
                        "status": "ok",
                        "finish_reason": choice.finish_reason,
                        "content": content,
                        "tool_calls": tool_calls,
                        "reasoning_content": reasoning_content,
                        "raw_message": raw_message,
                        "usage": usage,
                    }

            last_error = "empty response from llm api"
            if debug_enabled():
                print(f"Warning: Attempt {attempt + 1} received an empty response.")

        except (APIError, APIConnectionError, APITimeoutError, LLMHardTimeoutError) as e:
            last_error = str(e)
            if debug_enabled():
                print(f"Error: Attempt {attempt + 1} failed with an API or network error: {e}")

        if attempt < max_tries - 1:
            # Exponential backoff with jitter, capped at 30 seconds and at
            # the time left before the runtime deadline.
            sleep_time = base_sleep_time * (2 ** attempt) + random.uniform(0, 1)
            sleep_time = min(sleep_time, 30)
            remaining = remaining_runtime_seconds(runtime_deadline)
            if remaining is not None:
                if remaining <= 0:
                    last_error = "agent runtime limit reached before llm retry could complete"
                    break
                sleep_time = min(sleep_time, remaining)
            if debug_enabled():
                print(f"Retrying in {sleep_time:.2f} seconds...")
            if sleep_time > 0:
                time.sleep(sleep_time)
        else:
            if debug_enabled():
                print("Error: All retry attempts have been exhausted. The call has failed.")

    return {"status": "error", "error": f"llm api error: {last_error}"}
749
+
750
def call_llm_api(self, msgs, max_tries=10, runtime_deadline: Optional[float] = None) -> dict[str, Any]:
    """Request a completion for an agent turn with native tool calling enabled.

    Args:
        msgs: Messages to send.
        max_tries: Attempt budget forwarded to ``_call_chat_completion``.
        runtime_deadline: Optional runtime deadline forwarded unchanged.

    Returns:
        The result dict produced by ``_call_chat_completion``.
    """
    request_options = {
        "include_native_tools": True,
        "max_tries": max_tries,
        "runtime_deadline": runtime_deadline,
    }
    return self._call_chat_completion(msgs, **request_options)
757
+
758
def call_compaction_api(
    self,
    msgs,
    *,
    runtime_deadline: Optional[float] = None,
    max_output_tokens: Optional[int] = None,
) -> dict[str, Any]:
    """Request a completion for context compaction.

    Unlike ``call_llm_api`` the request carries no native tool schemas, asks
    for three tries, and pins the sampling settings (temperature 0.0,
    top_p 1.0, presence_penalty 0.0).

    Args:
        msgs: Messages to send.
        runtime_deadline: Optional runtime deadline forwarded unchanged.
        max_output_tokens: Optional output-token cap forwarded unchanged.

    Returns:
        The result dict produced by ``_call_chat_completion``.
    """
    fixed_sampling = {"temperature": 0.0, "top_p": 1.0, "presence_penalty": 0.0}
    return self._call_chat_completion(
        msgs,
        include_native_tools=False,
        max_tries=3,
        runtime_deadline=runtime_deadline,
        max_output_tokens=max_output_tokens,
        **fixed_sampling,
    )
775
+
776
def count_tokens(self, messages, *, include_tool_schema: bool = True):
    """Estimate the number of tokens a list of chat messages occupies.

    The estimate sums the encoded length of each message's role, content,
    tool calls and reasoning content. Image parts are not encoded; each
    counts as a fixed amount read from ``IMAGE_PART_TOKEN_ESTIMATE``
    (default ``DEFAULT_IMAGE_TOKEN_ESTIMATE``).

    Args:
        messages: Iterable of message dicts.
        include_tool_schema: When true, the precomputed token estimate of the
            native tool schemas is included in the total.

    Returns:
        The estimated token count as an int.
    """
    image_token_estimate = int(os.getenv("IMAGE_PART_TOKEN_ESTIMATE", str(DEFAULT_IMAGE_TOKEN_ESTIMATE)))
    encoding = self._encoding

    def _token_len(text: str) -> int:
        # tiktoken's default (disallowed_special="all") raises ValueError when
        # the text contains a special-token string such as "<|endoftext|>".
        # Message text is arbitrary here, so such strings are encoded as
        # ordinary text instead of aborting the estimate.
        return len(encoding.encode(text, disallowed_special=()))

    token_count = self._native_tools_token_estimate if include_tool_schema else 0
    for message in messages:
        token_count += _token_len(message.get("role", ""))
        content = message.get("content", "")
        if isinstance(content, str):
            token_count += _token_len(content)
        elif isinstance(content, list):
            for part in content:
                if not isinstance(part, dict):
                    token_count += _token_len(str(part))
                    continue
                part_type = part.get("type")
                if part_type == "text":
                    token_count += _token_len(str(part.get("text", "")))
                elif part_type == "image_url":
                    token_count += image_token_estimate
                else:
                    token_count += _token_len(str(part))
        else:
            token_count += _token_len(str(content))
        tool_calls = message.get("tool_calls")
        if isinstance(tool_calls, list) and tool_calls:
            token_count += _token_len(json.dumps(tool_calls, ensure_ascii=False))
        reasoning_content = message.get("reasoning_content")
        if isinstance(reasoning_content, str) and reasoning_content:
            token_count += _token_len(reasoning_content)
        elif reasoning_content is not None:
            # Non-string reasoning payloads are counted via their JSON form.
            token_count += _token_len(json.dumps(safe_jsonable(reasoning_content), ensure_ascii=False))
    return token_count
808
+
809
def run(self, prompt: str, workspace_root: Optional[str] = None) -> str:
    """Execute the agent for a single prompt and hand back just the final result text.

    Args:
        prompt: The prompt to run.
        workspace_root: Optional workspace root forwarded to ``_run_session``.

    Returns:
        The ``"result_text"`` entry of the session result.
    """
    session_outcome = self._run_session(prompt, workspace_root=workspace_root)
    return session_outcome["result_text"]
812
+
813
+ def _run_session(
814
+ self,
815
+ prompt: str,
816
+ workspace_root: Optional[str] = None,
817
+ event_callback: Optional[Callable[[dict[str, Any]], None]] = None,
818
+ initial_content_parts: Optional[Sequence[dict[str, Any]]] = None,
819
+ prior_messages: Optional[Sequence[dict[str, Any]]] = None,
820
+ interrupt_event: Optional[threading.Event] = None,
821
+ ) -> dict:
822
+ """Internal execution path with trace data for tests and debugging."""
823
+ if not isinstance(prompt, str) or not prompt.strip():
824
+ raise ValueError("prompt must be a non-empty string.")
825
+
826
+ prompt_text = prompt.strip()
827
+ resolved_workspace_root = normalize_workspace_root(workspace_root)
828
+ start_time = time.time()
829
+ trace_dir = self.trace_dir
830
+ cur_date = today_date()
831
+ extra_blocks = [self.role_prompt] if self.role_prompt else None
832
+ system_prompt = composed_system_prompt(current_date=str(cur_date), extra_blocks=extra_blocks)
833
+ user_content = (
834
+ f"Current workspace root: {resolved_workspace_root}\n"
835
+ "Relative local file paths resolve from the workspace root.\n\n"
836
+ f"Prompt:\n{prompt_text}"
837
+ )
838
+ if initial_content_parts is not None:
839
+ if not isinstance(initial_content_parts, Sequence) or isinstance(initial_content_parts, (str, bytes)):
840
+ raise ValueError("initial_content_parts must be a sequence of OpenAI-style content part dicts.")
841
+ safe_initial_parts = safe_jsonable(list(initial_content_parts))
842
+ if not isinstance(safe_initial_parts, list) or not all(isinstance(part, dict) for part in safe_initial_parts):
843
+ raise ValueError("initial_content_parts must contain only dict content parts.")
844
+ user_content: Any = [{"type": "text", "text": user_content}, *safe_initial_parts]
845
+ continuing_conversation = prior_messages is not None
846
+ if continuing_conversation:
847
+ if not isinstance(prior_messages, Sequence) or isinstance(prior_messages, (str, bytes)):
848
+ raise ValueError("prior_messages must be a sequence of message dicts.")
849
+ safe_prior_messages = safe_jsonable(list(prior_messages))
850
+ if not isinstance(safe_prior_messages, list) or not all(isinstance(message, dict) for message in safe_prior_messages):
851
+ raise ValueError("prior_messages must contain only dict messages.")
852
+ messages = list(safe_prior_messages)
853
+ if not messages or messages[0].get("role") != "system":
854
+ messages.insert(0, {"role": "system", "content": system_prompt})
855
+ messages.append({"role": "user", "content": user_content})
856
+ else:
857
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}]
858
+ max_llm_calls = self.max_llm_calls
859
+ max_input_tokens = int(self.llm_generate_cfg.get("max_input_tokens", DEFAULT_MAX_INPUT_TOKENS))
860
+ max_output_tokens = int(self.llm_generate_cfg.get("max_output_tokens", llm_max_output_tokens()))
861
+ compact_trigger_tokens = self.llm_generate_cfg.get("compact_trigger_tokens")
862
+ if compact_trigger_tokens is None:
863
+ compact_trigger_tokens = os.getenv("AUTO_COMPACT_TRIGGER_TOKENS", "128k")
864
+ model_profile = resolve_model_profile(
865
+ self.model,
866
+ configured_max_input_tokens=max_input_tokens,
867
+ configured_max_output_tokens=max_output_tokens,
868
+ compact_trigger_tokens=compact_trigger_tokens,
869
+ )
870
+ agent_runtime_limit = self.max_runtime_seconds
871
+ runtime_deadline = start_time + agent_runtime_limit
872
+ num_llm_calls_available = max_llm_calls
873
+ round_index = 0
874
+ trace_writer = FlatTraceWriter(
875
+ trace_dir=trace_dir,
876
+ model_name=self.model,
877
+ workspace_root=resolved_workspace_root,
878
+ on_event=event_callback,
879
+ )
880
+ self.trace_path = trace_writer.path
881
+ self.session_state_path = resolve_session_state_path(trace_dir) if trace_dir else None
882
+ session_state = AgentSessionState(
883
+ run_id=trace_writer.run_id,
884
+ model_name=self.model,
885
+ workspace_root=str(resolved_workspace_root),
886
+ prompt=prompt_text,
887
+ trace_path=str(self.trace_path) if self.trace_path else "",
888
+ llm_calls_remaining=num_llm_calls_available,
889
+ max_rounds=self.max_rounds,
890
+ max_input_tokens=max_input_tokens,
891
+ max_output_tokens=max_output_tokens,
892
+ model_profile=model_profile,
893
+ )
894
+
895
+ def persist_state(*, termination: str = "", error: str = "") -> None:
896
+ session_state.trace_path = str(self.trace_path) if self.trace_path else ""
897
+ session_state.turn_index = round_index
898
+ session_state.llm_calls_remaining = num_llm_calls_available
899
+ session_state.current_token_estimate = self.count_tokens(messages)
900
+ session_state.termination = termination
901
+ session_state.error = error
902
+ session_state.capture_messages(messages)
903
+ if self.session_state_path:
904
+ persist_session_state(self.session_state_path, session_state)
905
+
906
+ def finalize(result_text: str, termination: str, *, role: str = "runtime", error: str = "") -> dict[str, Any]:
907
+ trace_writer.append(
908
+ role=role,
909
+ text=result_text,
910
+ turn_index=round_index,
911
+ termination=termination,
912
+ error=error,
913
+ )
914
+ persist_state(termination=termination, error=error)
915
+ return {
916
+ "prompt": prompt_text,
917
+ "messages": messages,
918
+ "result_text": result_text,
919
+ "termination": termination,
920
+ "trace_path": str(self.trace_path) if self.trace_path else "",
921
+ "session_state_path": str(self.session_state_path) if self.session_state_path else "",
922
+ }
923
+
924
+ def interruption_requested() -> bool:
925
+ return bool(interrupt_event is not None and interrupt_event.is_set())
926
+
927
+ def finalize_interrupted() -> dict[str, Any]:
928
+ return finalize(
929
+ "Interrupted by user. Continue with a follow-up prompt to resume from the current context.",
930
+ "interrupted",
931
+ role="runtime",
932
+ error="user interrupt",
933
+ )
934
+
935
+ if continuing_conversation:
936
+ trace_writer.append(
937
+ role="runtime",
938
+ text="Continuing existing conversation with prior messages.",
939
+ turn_index=0,
940
+ )
941
+ else:
942
+ trace_writer.append(role="system", text=system_prompt, turn_index=0)
943
+ trace_writer.append(role="user", text=message_trace_text(user_content), turn_index=0)
944
+ persist_state()
945
+
946
+ while num_llm_calls_available > 0 and round_index < self.max_rounds:
947
+ if interruption_requested():
948
+ return finalize_interrupted()
949
+ if remaining_runtime_seconds(runtime_deadline) is not None and remaining_runtime_seconds(runtime_deadline) <= 0:
950
+ result_text = "No result found before the maximum agent runtime limit."
951
+ termination = f"agent runtime limit reached: {agent_runtime_limit}s"
952
+ return finalize(result_text, termination, error=termination)
953
+ current_token_estimate = self.count_tokens(messages)
954
+ should_compact = False
955
+ compact_reason = ""
956
+ if len(messages) > 2:
957
+ should_compact, compact_reason = should_compact_messages(
958
+ last_input_tokens=session_state.last_input_tokens,
959
+ current_token_estimate=current_token_estimate,
960
+ model_profile=model_profile,
961
+ )
962
+ if should_compact:
963
+ trace_writer.append(
964
+ role="runtime",
965
+ text=(
966
+ "Runtime note: compacting earlier conversation history before the next model call "
967
+ f"because the {compact_reason} budget crossed the pre-limit threshold."
968
+ ),
969
+ turn_index=round_index,
970
+ )
971
+ compact_outcome = compact_messages(
972
+ messages=messages,
973
+ original_prompt_text=prompt_text,
974
+ model_name=self.model,
975
+ model_profile=model_profile,
976
+ llm_caller=self.call_compaction_api,
977
+ token_counter=self.count_tokens,
978
+ runtime_deadline=runtime_deadline,
979
+ )
980
+ if compact_outcome.status == "ok":
981
+ messages = compact_outcome.compacted_messages
982
+ session_state.last_input_tokens = None
983
+ session_state.compactions.append(
984
+ CompactionRecord(
985
+ turn_index=round_index,
986
+ status="ok",
987
+ trigger_reason=compact_reason,
988
+ prior_token_estimate=compact_outcome.prior_token_estimate,
989
+ prior_message_count=len(session_state.messages),
990
+ compacted_group_count=compact_outcome.compacted_group_count,
991
+ kept_group_count=compact_outcome.kept_group_count,
992
+ new_token_estimate=compact_outcome.new_token_estimate,
993
+ new_message_count=len(messages),
994
+ summary_text=compact_outcome.summary_text,
995
+ )
996
+ )
997
+ trace_writer.append(
998
+ role="runtime",
999
+ text=(
1000
+ "Runtime note: context compaction completed. "
1001
+ f"Token estimate {compact_outcome.prior_token_estimate} -> {compact_outcome.new_token_estimate}. "
1002
+ f"Compacted {compact_outcome.compacted_group_count} older turn groups."
1003
+ ),
1004
+ turn_index=round_index,
1005
+ capture_type="compaction",
1006
+ payload=compaction_trace_payload(trigger_reason=compact_reason, outcome=compact_outcome),
1007
+ )
1008
+ persist_state()
1009
+ current_token_estimate = compact_outcome.new_token_estimate
1010
+ else:
1011
+ session_state.compactions.append(
1012
+ CompactionRecord(
1013
+ turn_index=round_index,
1014
+ status="error",
1015
+ trigger_reason=compact_reason,
1016
+ prior_token_estimate=compact_outcome.prior_token_estimate,
1017
+ prior_message_count=len(messages),
1018
+ compacted_group_count=compact_outcome.compacted_group_count,
1019
+ kept_group_count=compact_outcome.kept_group_count,
1020
+ error=compact_outcome.error,
1021
+ )
1022
+ )
1023
+ trace_writer.append(
1024
+ role="runtime",
1025
+ text="Runtime note: context compaction failed; the existing history was kept unchanged.",
1026
+ turn_index=round_index,
1027
+ error=compact_outcome.error,
1028
+ capture_type="compaction",
1029
+ payload=compaction_trace_payload(trigger_reason=compact_reason, outcome=compact_outcome),
1030
+ )
1031
+ persist_state(error=compact_outcome.error)
1032
+ if current_token_estimate > max_input_tokens:
1033
+ result_text = "No result found before the maximum input token limit."
1034
+ termination = f"input token limit reached: {current_token_estimate} > {max_input_tokens}"
1035
+ return finalize(result_text, termination, error=termination)
1036
+ if interruption_requested():
1037
+ return finalize_interrupted()
1038
+ round_index += 1
1039
+ num_llm_calls_available -= 1
1040
+ llm_request_messages, image_aging = prepare_messages_for_llm(messages)
1041
+ try:
1042
+ llm_reply = self.call_llm_api(llm_request_messages, runtime_deadline=runtime_deadline)
1043
+ except KeyboardInterrupt:
1044
+ return finalize_interrupted()
1045
+ if interruption_requested():
1046
+ return finalize_interrupted()
1047
+ trace_writer.append(
1048
+ role="runtime",
1049
+ text="",
1050
+ turn_index=round_index,
1051
+ capture_type="llm_call",
1052
+ payload=llm_call_trace_payload(
1053
+ request_messages=llm_request_messages,
1054
+ image_aging=image_aging,
1055
+ response=llm_reply,
1056
+ model_name=self.model,
1057
+ native_tools=self._native_tools,
1058
+ ),
1059
+ )
1060
+ session_state.last_input_tokens = input_tokens_from_usage(
1061
+ llm_reply.get("usage") if isinstance(llm_reply, dict) else None
1062
+ )
1063
+ assistant_content = llm_reply.get("content") if isinstance(llm_reply, dict) else None
1064
+ assistant_tool_calls = llm_reply.get("tool_calls", []) if isinstance(llm_reply, dict) else []
1065
+ assistant_reasoning = llm_reply.get("reasoning_content") if isinstance(llm_reply, dict) else None
1066
+ assistant_raw_message = llm_reply.get("raw_message") if isinstance(llm_reply, dict) else None
1067
+ assistant_text = assistant_text_content(assistant_content)
1068
+ finish_reason = llm_reply.get("finish_reason") if isinstance(llm_reply, dict) else None
1069
+ assistant_tool_arguments = parse_tool_arguments_list(assistant_tool_calls)
1070
+ assistant_tool_call_ids = [str(tool_call.get("id", "")) for tool_call in assistant_tool_calls]
1071
+ assistant_tool_names = [
1072
+ str((tool_call.get("function", {}) if isinstance(tool_call, dict) else {}).get("name", ""))
1073
+ for tool_call in assistant_tool_calls
1074
+ ]
1075
+ if debug_enabled():
1076
+ if assistant_tool_calls:
1077
+ print(f"Round {round_index}: tool_calls={json.dumps(assistant_tool_calls, ensure_ascii=False)}")
1078
+ if assistant_text.strip():
1079
+ print(f"Round {round_index} content: {assistant_text}")
1080
+ else:
1081
+ print(f"Round {round_index}: {assistant_text}")
1082
+ if not isinstance(llm_reply, dict) or llm_reply.get("status") == "error":
1083
+ result_text = llm_reply.get("error", "llm api error: unknown error") if isinstance(llm_reply, dict) else str(llm_reply)
1084
+ if self.should_accept_terminal_error(
1085
+ error_text=result_text,
1086
+ workspace_root=resolved_workspace_root,
1087
+ messages=messages,
1088
+ ):
1089
+ recovered_result_text = self.accepted_terminal_error_result_text(
1090
+ error_text=result_text,
1091
+ workspace_root=resolved_workspace_root,
1092
+ messages=messages,
1093
+ ).strip()
1094
+ if not recovered_result_text:
1095
+ recovered_result_text = (
1096
+ "Recovered completion after a terminal LLM/runtime error because the required "
1097
+ "completion artifacts already exist in the workspace."
1098
+ )
1099
+ return finalize(recovered_result_text, "result", role="runtime", error=result_text)
1100
+ termination = "llm api error"
1101
+ return finalize(result_text, termination, error=result_text)
1102
+
1103
+ deprecated_protocol = legacy_protocol_error(assistant_text)
1104
+ if deprecated_protocol is not None:
1105
+ trace_writer.append(
1106
+ role="assistant",
1107
+ text=assistant_text.strip(),
1108
+ turn_index=round_index,
1109
+ tool_call_ids=assistant_tool_call_ids,
1110
+ tool_names=assistant_tool_names,
1111
+ tool_arguments=assistant_tool_arguments,
1112
+ finish_reason=finish_reason,
1113
+ error=deprecated_protocol,
1114
+ )
1115
+ retry_assistant_message = assistant_retry_history_message(
1116
+ content=assistant_content,
1117
+ reasoning_content=assistant_reasoning,
1118
+ )
1119
+ if retry_assistant_message is not None:
1120
+ messages.append(retry_assistant_message)
1121
+ correction_text = (
1122
+ "Error: The previous assistant turn used the deprecated text-tag protocol. "
1123
+ "Do not emit <tool_call>, <tool_response>, <think>, or <answer> in plain text. "
1124
+ "Use only the native tool calling interface when tools are needed, or plain final result text when no more tools are needed."
1125
+ )
1126
+ messages.append(
1127
+ {
1128
+ "role": "user",
1129
+ "content": correction_text,
1130
+ }
1131
+ )
1132
+ trace_writer.append(role="user", text=correction_text, turn_index=round_index)
1133
+ persist_state(error=deprecated_protocol)
1134
+ continue
1135
+
1136
+ if finish_reason == "length" and assistant_tool_calls:
1137
+ protocol_error = "assistant tool call turn was truncated by output limit"
1138
+ trace_writer.append(
1139
+ role="assistant",
1140
+ text=assistant_text.strip(),
1141
+ turn_index=round_index,
1142
+ tool_call_ids=assistant_tool_call_ids,
1143
+ tool_names=assistant_tool_names,
1144
+ tool_arguments=assistant_tool_arguments,
1145
+ finish_reason=finish_reason,
1146
+ error=protocol_error,
1147
+ )
1148
+ retry_assistant_message = assistant_retry_history_message(
1149
+ content=assistant_content,
1150
+ reasoning_content=assistant_reasoning,
1151
+ )
1152
+ if retry_assistant_message is not None:
1153
+ messages.append(retry_assistant_message)
1154
+ correction_text = (
1155
+ "Error: The previous assistant turn hit the output limit while emitting native tool calls, "
1156
+ "so none of those tool calls were executed. Re-emit the needed tool calls in a smaller form. "
1157
+ "If a file is large, split it into multiple smaller Write calls or create it via shorter steps. "
1158
+ "Do not resend the same oversized truncated tool call."
1159
+ )
1160
+ messages.append({"role": "user", "content": correction_text})
1161
+ trace_writer.append(role="user", text=correction_text, turn_index=round_index)
1162
+ persist_state(error=protocol_error)
1163
+ continue
1164
+
1165
+ if assistant_tool_calls:
1166
+ trace_writer.append(
1167
+ role="assistant",
1168
+ text=assistant_text.strip(),
1169
+ turn_index=round_index,
1170
+ tool_call_ids=assistant_tool_call_ids,
1171
+ tool_names=assistant_tool_names,
1172
+ tool_arguments=assistant_tool_arguments,
1173
+ finish_reason=finish_reason,
1174
+ )
1175
+ assistant_message = assistant_history_message(
1176
+ content=assistant_content,
1177
+ tool_calls=assistant_tool_calls,
1178
+ reasoning_content=assistant_reasoning,
1179
+ raw_message=assistant_raw_message,
1180
+ )
1181
+ tool_turn_message_start = len(messages)
1182
+ messages.append(assistant_message)
1183
+ deferred_image_contexts: list[tuple[str, str, Any, Any, dict[str, Any]]] = []
1184
+ for tool_call, tool_arguments in zip(assistant_tool_calls, assistant_tool_arguments):
1185
+ if remaining_runtime_seconds(runtime_deadline) is not None and remaining_runtime_seconds(runtime_deadline) <= 0:
1186
+ result_text = "No result found before the maximum agent runtime limit."
1187
+ termination = f"agent runtime limit reached: {agent_runtime_limit}s"
1188
+ return finalize(result_text, termination, error=termination)
1189
+ tool_call_id = str(tool_call.get("id", ""))
1190
+ function_block = tool_call.get("function", {}) if isinstance(tool_call, dict) else {}
1191
+ tool_name = str(function_block.get("name", ""))
1192
+ try:
1193
+ result = self.custom_call_tool(
1194
+ tool_name,
1195
+ tool_arguments,
1196
+ workspace_root=resolved_workspace_root,
1197
+ runtime_deadline=runtime_deadline,
1198
+ )
1199
+ except KeyboardInterrupt:
1200
+ messages = messages[:tool_turn_message_start]
1201
+ return finalize_interrupted()
1202
+ tool_result_text = tool_result_message_content(result)
1203
+ messages.append(api_tool_message(tool_call_id, result))
1204
+ trace_writer.append(
1205
+ role="tool",
1206
+ text=tool_result_text,
1207
+ turn_index=round_index,
1208
+ tool_call_ids=[tool_call_id],
1209
+ tool_names=[tool_name],
1210
+ tool_arguments=[tool_arguments],
1211
+ )
1212
+ extra_image_context = image_context_message(result, self.model)
1213
+ if extra_image_context is not None:
1214
+ deferred_image_contexts.append((tool_call_id, tool_name, tool_arguments, result, extra_image_context))
1215
+ for tool_call_id, tool_name, tool_arguments, result, extra_image_context in deferred_image_contexts:
1216
+ messages.append(extra_image_context)
1217
+ trace_writer.append(
1218
+ role="user",
1219
+ text=image_context_trace_text(result),
1220
+ turn_index=round_index,
1221
+ tool_call_ids=[tool_call_id],
1222
+ tool_names=[tool_name],
1223
+ tool_arguments=[tool_arguments],
1224
+ image_paths=image_trace_paths(result),
1225
+ )
1226
+ if remaining_runtime_seconds(runtime_deadline) is not None and remaining_runtime_seconds(runtime_deadline) <= 0:
1227
+ result_text = "No result found before the maximum agent runtime limit."
1228
+ termination = f"agent runtime limit reached: {agent_runtime_limit}s"
1229
+ return finalize(result_text, termination, error=termination)
1230
+ persist_state()
1231
+ if interruption_requested():
1232
+ return finalize_interrupted()
1233
+ elif assistant_has_meaningful_text(assistant_content):
1234
+ current_result_text = assistant_text.strip()
1235
+ messages.append(
1236
+ assistant_history_message(
1237
+ content=current_result_text,
1238
+ reasoning_content=assistant_reasoning,
1239
+ raw_message=assistant_raw_message,
1240
+ )
1241
+ )
1242
+ should_accept_result = self.should_accept_plaintext_result(
1243
+ result_text=current_result_text,
1244
+ workspace_root=resolved_workspace_root,
1245
+ messages=messages,
1246
+ )
1247
+ if should_accept_result:
1248
+ return finalize(current_result_text, "result", role="assistant")
1249
+ protocol_error = "plain result rejected by additional stop condition"
1250
+ trace_writer.append(
1251
+ role="assistant",
1252
+ text=current_result_text,
1253
+ turn_index=round_index,
1254
+ finish_reason=finish_reason,
1255
+ error=protocol_error,
1256
+ )
1257
+ correction_text = self.rejected_plaintext_result_message(
1258
+ result_text=current_result_text,
1259
+ workspace_root=resolved_workspace_root,
1260
+ messages=messages,
1261
+ ).strip()
1262
+ if not correction_text:
1263
+ correction_text = (
1264
+ "The previous assistant turn was not accepted as the final result because the additional stop condition returned false. "
1265
+ "Continue working. If the task is incomplete, use tool calls to produce the required artifacts before finishing."
1266
+ )
1267
+ messages.append({"role": "user", "content": correction_text})
1268
+ trace_writer.append(role="user", text=correction_text, turn_index=round_index)
1269
+ persist_state(error=protocol_error)
1270
+ continue
1271
+ else:
1272
+ protocol_error = "assistant emitted empty response"
1273
+ trace_writer.append(
1274
+ role="assistant",
1275
+ text="",
1276
+ turn_index=round_index,
1277
+ finish_reason=finish_reason,
1278
+ error=protocol_error,
1279
+ )
1280
+ retry_assistant_message = assistant_retry_history_message(
1281
+ content=assistant_content,
1282
+ reasoning_content=assistant_reasoning,
1283
+ )
1284
+ if retry_assistant_message is not None:
1285
+ messages.append(retry_assistant_message)
1286
+ correction_text = (
1287
+ "Error: The previous assistant turn was empty. "
1288
+ "If tools are needed, use native tool calling. Otherwise return the final result text."
1289
+ )
1290
+ messages.append(
1291
+ {
1292
+ "role": "user",
1293
+ "content": correction_text,
1294
+ }
1295
+ )
1296
+ trace_writer.append(role="user", text=correction_text, turn_index=round_index)
1297
+ persist_state(error=protocol_error)
1298
+ continue
1299
+
1300
+ token_count = self.count_tokens(messages)
1301
+ if debug_enabled():
1302
+ print(f"round: {round_index}, token count: {token_count}")
1303
+ persist_state()
1304
+
1305
+ result_text = 'No result found.'
1306
+ termination = 'result not found'
1307
+ if round_index >= self.max_rounds:
1308
+ termination = 'exceed available rounds'
1309
+ elif num_llm_calls_available == 0:
1310
+ termination = 'exceed available llm calls'
1311
+ return finalize(result_text, termination, error=termination)
1312
+
1313
def custom_call_tool(self, tool_name: str, tool_args: Any, **kwargs):
    """Run one named tool through this agent's tool registry.

    Args:
        tool_name: Name of the tool to execute.
        tool_args: Arguments for the tool, passed through untouched.
        **kwargs: Extra keyword arguments forwarded verbatim to
            ``execute_tool_by_name`` (the run loop supplies
            ``workspace_root`` and ``runtime_deadline``).

    Returns:
        Whatever ``execute_tool_by_name`` returns for this invocation.
    """
    registry = self.tool_map
    outcome = execute_tool_by_name(registry, tool_name, tool_args, **kwargs)
    return outcome
1315
+
1316
+
1317
+ def _path_has_suffix(path: Path, suffix_parts: Sequence[str]) -> bool:
1318
+ normalized_parts = tuple(part.casefold() for part in path.parts)
1319
+ normalized_suffix = tuple(part.casefold() for part in suffix_parts)
1320
+ if len(normalized_parts) < len(normalized_suffix):
1321
+ return False
1322
+ return normalized_parts[-len(normalized_suffix) :] == normalized_suffix
1323
+
1324
+
1325
def resolve_agent_class_for_role_prompt_files(role_prompt_files: Sequence[str]) -> Type[MultiTurnReactAgent]:
    """Choose the agent class implied by the given role prompt files.

    Blank entries are ignored. As soon as one entry resolves to a path ending
    in ``benchmarks/ResearchClawBench/role_prompt.md`` (compared through
    ``_path_has_suffix``), the ResearchClawBench adapter class is returned.

    Args:
        role_prompt_files: Role prompt file paths as given by the caller.

    Returns:
        ``ResearchClawBenchAgent`` when a matching role prompt is present,
        otherwise ``MultiTurnReactAgent``.
    """
    benchmark_suffix = ("benchmarks", "ResearchClawBench", "role_prompt.md")
    cleaned_entries = (str(entry).strip() for entry in role_prompt_files)
    for candidate in cleaned_entries:
        if not candidate:
            continue
        resolved = Path(candidate).expanduser().resolve(strict=False)
        if not _path_has_suffix(resolved, benchmark_suffix):
            continue
        # Imported only on a match; presumably keeps the benchmark package
        # optional for ordinary runs -- TODO confirm.
        from benchmarks.ResearchClawBench.adapter import ResearchClawBenchAgent

        return ResearchClawBenchAgent
    return MultiTurnReactAgent
1336
+
1337
+
1338
+ def _parse_cli_args(argv: list[str]) -> tuple[str, Optional[str], Optional[str], str, list[str], list[str], Optional[bool]]:
1339
+ parser = argparse.ArgumentParser(description="Run the local agent directly from agent_base.react_agent.")
1340
+ parser.add_argument("prompt", nargs="*", help="Prompt text.")
1341
+ parser.add_argument("--prompt-file", help="Optional UTF-8 text file containing the prompt.")
1342
+ parser.add_argument("--trace-dir", help="Optional directory where the run trace JSONL should be created.")
1343
+ parser.add_argument(
1344
+ "--workspace-root",
1345
+ help="Optional workspace root for local file tools, Bash, and TerminalStart.",
1346
+ )
1347
+ parser.add_argument(
1348
+ "--role-prompt-file",
1349
+ action="append",
1350
+ default=[],
1351
+ dest="role_prompt_files",
1352
+ metavar="PATH",
1353
+ help="Append one role-specific prompt file to the base system prompt. May be passed multiple times.",
1354
+ )
1355
+ parser.add_argument(
1356
+ "--images",
1357
+ action="append",
1358
+ nargs="+",
1359
+ default=[],
1360
+ dest="image_paths",
1361
+ metavar="PATH",
1362
+ help="Attach one or more local image paths to the initial user message.",
1363
+ )
1364
+ parser.add_argument(
1365
+ "--chat",
1366
+ action=argparse.BooleanOptionalAction,
1367
+ default=None,
1368
+ help="Continue asking for follow-up user messages after each final answer. Defaults to on only in an interactive terminal.",
1369
+ )
1370
+ args = parser.parse_args(argv)
1371
+
1372
+ prompt_text = ""
1373
+ if args.prompt_file:
1374
+ prompt_text = Path(args.prompt_file).read_text(encoding="utf-8").strip()
1375
+ elif args.prompt:
1376
+ prompt_text = " ".join(args.prompt).strip()
1377
+
1378
+ if not prompt_text:
1379
+ raise ValueError("A non-empty prompt is required via positional args or --prompt-file.")
1380
+ role_prompt = read_role_prompt_files(args.role_prompt_files)
1381
+ return (
1382
+ prompt_text,
1383
+ args.trace_dir,
1384
+ args.workspace_root,
1385
+ role_prompt,
1386
+ list(args.role_prompt_files),
1387
+ [path for group in args.image_paths for path in group],
1388
+ args.chat,
1389
+ )
1390
+
1391
+
1392
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: run the agent once, then optionally chat.

    Loads ``.env`` from ``PROJECT_ROOT``, checks the required environment,
    parses the CLI, stages any ``--images`` into the workspace, runs the
    initial prompt, and, when chat mode is enabled, keeps reading follow-up
    prompts that continue the same message history.

    Args:
        argv: Argument vector without the program name. ``None`` means
            "use ``sys.argv[1:]``"; an explicitly empty list is honored as
            an empty command line.

    Returns:
        ``0`` on a completed run, ``1`` when required environment is missing
        or the command line is invalid (the message goes to stderr).
    """
    load_dotenv(PROJECT_ROOT / ".env")
    try:
        require_required_env("ResearchHarness agent")
        # ``argv or sys.argv[1:]`` would treat an explicit ``[]`` as "not
        # provided" and silently pick up the host process arguments.
        cli_args = sys.argv[1:] if argv is None else argv
        (
            prompt_text,
            trace_dir,
            workspace_root,
            role_prompt,
            role_prompt_files,
            image_paths,
            chat_arg,
        ) = _parse_cli_args(cli_args)
        agent_cls = resolve_agent_class_for_role_prompt_files(role_prompt_files)
        agent = agent_cls(
            llm=default_llm_config(),
            trace_dir=trace_dir,
            role_prompt=role_prompt or None,
        )
        resolved_workspace_root = normalize_workspace_root(workspace_root)

        # Stage every input image and collect both the saved paths (appended
        # to the prompt text) and the content parts for the first message.
        initial_content_parts: list[dict[str, Any]] = []
        saved_image_paths: list[str] = []
        for image_index, image_path in enumerate(image_paths):
            saved_path, data_url = stage_image_file_for_input(
                image_path,
                workspace_root=resolved_workspace_root,
                image_index=image_index,
            )
            saved_image_paths.append(saved_path)
            initial_content_parts.extend(image_input_content_parts(data_url, saved_path))
        run_prompt = append_saved_image_paths_to_prompt(prompt_text, saved_image_paths)

        printer = ConsoleEventPrinter(
            model_name=agent.model,
            workspace_root=resolved_workspace_root,
            prompt=run_prompt,
        )
        printer.print_header()
        session = agent._run_session(
            run_prompt,
            workspace_root=str(resolved_workspace_root),
            event_callback=printer.handle_event,
            initial_content_parts=initial_content_parts or None,
        )

        # Without an explicit --chat/--no-chat, chat only when both stdin and
        # stdout are interactive terminals.
        chat_enabled = chat_arg if chat_arg is not None else (sys.stdin.isatty() and sys.stdout.isatty())
        messages = session.get("messages", [])
        while chat_enabled:
            try:
                followup = input("\n[ResearchHarness] Follow-up (Ctrl+C to exit): ").strip()
            except (KeyboardInterrupt, EOFError):
                print("\n[ResearchHarness] Chat ended.")
                break
            if not followup:
                continue
            print(f"\n[ResearchHarness] Continuing conversation: {followup}")
            printer.reset_rounds()
            session = agent._run_session(
                followup,
                workspace_root=str(resolved_workspace_root),
                event_callback=printer.handle_event,
                prior_messages=messages,
            )
            # Keep the previous history if the session returned none.
            messages = session.get("messages", messages)
        return 0
    except (MissingRequiredEnvError, ValueError) as exc:
        print(str(exc), file=sys.stderr)
        return 1
1450
+
1451
+
1452
if __name__ == "__main__":
    # Use main()'s integer status as the process exit code.
    sys.exit(main())
agent_base/session_state.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import asdict, dataclass, field
5
+ from pathlib import Path
6
+ from typing import Any, Optional, Sequence
7
+
8
+ from agent_base.model_profiles import ModelProfile
9
+ from agent_base.utils import safe_jsonable
10
+
11
+
12
+ SESSION_STATE_FILENAME = "_session_state.json"
13
+
14
+
15
@dataclass
class CompactionRecord:
    """One context-compaction entry kept in the persisted session state.

    Instances are collected in ``AgentSessionState.compactions`` and turned
    into plain dictionaries with ``dataclasses.asdict`` when the state is
    serialized. Field meanings below are inferred from the field names --
    confirm against the compaction code before relying on them.
    """

    # Required: when/why the compaction ran and the size it started from.
    turn_index: int
    status: str
    trigger_reason: str
    prior_token_estimate: int
    prior_message_count: int

    # Optional: how the history was split (presumably compacted vs. kept groups).
    compacted_group_count: int = 0
    kept_group_count: int = 0

    # Optional: resulting size; ``None`` until a value is recorded.
    new_token_estimate: Optional[int] = None
    new_message_count: Optional[int] = None

    # Optional: free-form outputs; empty strings by default.
    summary_text: str = ""
    error: str = ""
28
+
29
+
30
@dataclass
class AgentSessionState:
    """Snapshot of one agent run that can be serialized to JSON.

    ``payload()`` produces the dictionary that ``persist_session_state``
    writes to disk; ``capture_messages()`` stores a JSON-safe copy of the
    conversation history.
    """

    # Identity of the run (required).
    run_id: str
    model_name: str
    workspace_root: str
    prompt: str

    # Progress counters and limits.
    trace_path: str = ""
    turn_index: int = 0
    llm_calls_remaining: int = 0
    max_rounds: int = 0
    max_input_tokens: int = 0
    max_output_tokens: int = 0
    last_input_tokens: Optional[int] = None
    current_token_estimate: int = 0

    # Outcome markers; empty strings by default.
    termination: str = ""
    error: str = ""

    # Conversation history, compaction log, and the model profile in use.
    messages: list[dict[str, Any]] = field(default_factory=list)
    compactions: list[CompactionRecord] = field(default_factory=list)
    model_profile: Optional[ModelProfile] = None

    def capture_messages(self, messages: Sequence[dict[str, Any]]) -> None:
        """Store ``safe_jsonable`` output for a list copy of *messages*."""
        snapshot = list(messages)
        self.messages = safe_jsonable(snapshot)

    def payload(self) -> dict[str, Any]:
        """Return the state as a dictionary ready for ``json.dumps``.

        The key order is fixed: ``version`` first, then the plain attributes
        in declaration order, then ``compactions`` and ``model_profile``.
        """
        plain_keys = (
            "run_id",
            "model_name",
            "workspace_root",
            "prompt",
            "trace_path",
            "turn_index",
            "llm_calls_remaining",
            "max_rounds",
            "max_input_tokens",
            "max_output_tokens",
            "last_input_tokens",
            "current_token_estimate",
            "termination",
            "error",
            "messages",
        )
        document: dict[str, Any] = {"version": 1}
        for key in plain_keys:
            document[key] = getattr(self, key)
        # Dataclass members are flattened with asdict and then sanitized.
        document["compactions"] = [safe_jsonable(asdict(entry)) for entry in self.compactions]
        if self.model_profile is None:
            document["model_profile"] = None
        else:
            document["model_profile"] = safe_jsonable(asdict(self.model_profile))
        return document
75
+
76
+
77
def resolve_session_state_path(trace_dir: str | Path) -> Path:
    """Return the session-state file path located directly inside *trace_dir*.

    The file name comes from the module constant ``SESSION_STATE_FILENAME``.
    """
    directory = Path(trace_dir)
    return directory.joinpath(SESSION_STATE_FILENAME)
79
+
80
+
81
def persist_session_state(path: str | Path, state: AgentSessionState) -> None:
    """Write ``state.payload()`` to *path* as indented UTF-8 JSON.

    Missing parent directories are created first. The output keeps non-ASCII
    characters unescaped, uses a two-space indent, and ends with a newline.

    Args:
        path: Destination file; an existing file is overwritten.
        state: Session state whose ``payload()`` is serialized.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    document = json.dumps(state.payload(), ensure_ascii=False, indent=2)
    target.write_text(f"{document}\n", encoding="utf-8")
agent_base/tools/README.md ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tools
2
+
3
+ This document describes the tool surface exposed to the model. Tool names use PascalCase consistently.
4
+
5
+ The current implementation is grouped by category:
6
+
7
+ - `agent_base/tools/tool_file.py`
8
+ - `agent_base/tools/tool_runtime.py`
9
+ - `agent_base/tools/tool_user.py`
10
+ - `agent_base/tools/tool_web.py`
11
+
12
+ ## Overview
13
+
14
+ The current tool set is:
15
+
16
+ - `Glob`
17
+ - `Grep`
18
+ - `Read`
19
+ - `ReadPDF`
20
+ - `ReadImage`
21
+ - `Write`
22
+ - `Edit`
23
+ - `Bash`
24
+ - `WebSearch`
25
+ - `ScholarSearch`
26
+ - `WebFetch`
27
+ - `AskUser`
28
+ - `TerminalStart`
29
+ - `TerminalWrite`
30
+ - `TerminalRead`
31
+ - `TerminalInterrupt`
32
+ - `TerminalKill`
33
+
34
+ ## Tool Matrix
35
+
36
+ | Tool | Category | Arguments | Description | Return Shape / Notes |
37
+ | --- | --- | --- | --- | --- |
38
+ | `Glob` | Local files | `pattern`, `path?`, `include_dirs?`, `max_results?` | Discover files or directories by pathname pattern inside the workspace. | Returns `root`, `match_count`, `truncated`, and `results`. Best for pathname discovery rather than reading content. |
39
+ | `Grep` | Local files | `pattern`, `path?`, `glob?`, `case_sensitive?`, `max_results?`, `max_chars?` | Search local text files by content and return matching lines. | Returns search metadata plus matched file paths, line numbers, and line text. Skips obvious binary files, images, and PDFs. |
40
+ | `Read` | Local files | `path`, `start_line?`, `end_line?`, `max_chars?` | Read a local text file, optionally by line range. | Returns normalized path, line metadata, truncation status, and `content`. Redirects PDF/image tasks toward `ReadPDF` or `ReadImage`. |
41
+ | `ReadPDF` | Local files | `path`, `max_chars?`, `max_image_paths?` | Read a local PDF, extract text, and expose extracted image paths when available. | Returns text content plus `image_paths` and image-count metadata. Depends on [`structai`](https://github.com/black-yt/structai) and `MINERU_TOKEN`. |
42
+ | `ReadImage` | Local files | `path` | Read a local image and expose image metadata for runtime multimodal use. | Returns image metadata only. During agent runs, the runtime sends a compressed attachment to the LLM API as an `image_url` content part. |
43
+ | `Write` | Local files | `path`, `content`, `overwrite?` | Create a text file or overwrite one when explicitly allowed. | Creates parent directories automatically. Returns an error if the file exists and `overwrite=false`. |
44
+ | `Edit` | Local files | `path`, `patch` | Apply a targeted patch to a local text file. | Expects unified-diff / hunk-style input. Context-based matching, not a full `patch(1)` implementation. |
45
+ | `Bash` | Runtime | `command`, `timeout?`, `workdir?` | Run one-shot shell commands for deterministic local execution, parsing, and validation. | Returns `stdout` and `stderr`. Primary local execution tool for short Python, `rg`, `find`, `git`, and structured local processing. |
46
+ | `WebSearch` | Web | `query` | Perform general web search over one or more complementary queries. | Returns a text summary headed by `## Web Results` with title, link, snippet, and date/source when available. Uses Serper. |
47
+ | `ScholarSearch` | Web | `query` | Search academic literature over one or more complementary queries and return papers with their year, abstract, and citation data. | Returns a text summary headed by `## Scholar Results` with title, PDF link, publication info, year, citation count, and abstract. Uses Serper Scholar. |
48
+ | `WebFetch` | Web | `url`, `goal` | Fetch a page, extract evidence relevant to a concrete goal, and summarize it. | Uses Jina Reader plus the configured summary model. Returns evidence-focused text rather than raw HTML. |
49
+ | `AskUser` | Human interaction | `question`, `context?` | Ask the human user one concise clarification question when essential information cannot be determined from tools or existing instructions. | Writes the question to the interactive terminal and returns the user's answer. If no interactive terminal is available, returns an explicit unavailable message. |
50
+ | `TerminalStart` | Runtime | `cwd?`, `shell?`, `rows?`, `cols?` | Start a persistent terminal session. | Returns session metadata such as `session_id`, `pid`, `cwd`, `shell`, `alive`, and `returncode`. |
51
+ | `TerminalWrite` | Runtime | `session_id`, `input`, `append_newline?`, `yield_time_ms?`, `max_output_chars?` | Send input to a persistent terminal session and read incremental output. | Best for stateful shells, REPLs, and long-running foreground processes. |
52
+ | `TerminalRead` | Runtime | `session_id`, `yield_time_ms?`, `max_output_chars?` | Read unread output from an existing persistent terminal session. | Useful when a process is still running and output arrives over time. |
53
+ | `TerminalInterrupt` | Runtime | `session_id`, `max_output_chars?` | Send `Ctrl-C` to the foreground process in a terminal session without destroying the session. | Use when a long-running process must be interrupted but the shell should remain alive. |
54
+ | `TerminalKill` | Runtime | `session_id`, `force?` | Terminate a persistent terminal session and release resources. | Final cleanup step for terminal sessions that are no longer needed. |
55
+
56
+ ## Glob
57
+
58
+ Purpose:
59
+
60
+ - Discover local files or directories by glob pattern.
61
+ - Good for pathname discovery, not for reading file contents.
62
+
63
+ Arguments:
64
+
65
+ - `pattern`: string, a `pathlib`-style glob such as `**/*.py`
66
+ - `path`: optional string, search root, defaults to the current workspace
67
+ - `include_dirs`: optional boolean, defaults to `false`
68
+ - `max_results`: optional integer, defaults to `200`
69
+
70
+ Returns:
71
+
72
+ - `root`
73
+ - `pattern`
74
+ - `include_dirs`
75
+ - `match_count`
76
+ - `truncated`
77
+ - `results`
78
+
79
+ ## Grep
80
+
81
+ Purpose:
82
+
83
+ - Search local text files by content.
84
+ - Return matched file paths, line numbers, and line text.
85
+
86
+ Arguments:
87
+
88
+ - `pattern`: string, regular expression
89
+ - `path`: optional string, file or directory path, defaults to the current workspace
90
+ - `glob`: optional string, file filter when scanning a directory, defaults to `**/*`
91
+ - `case_sensitive`: optional boolean, defaults to `false`
92
+ - `max_results`: optional integer, defaults to `100`
93
+ - `max_chars`: optional integer, defaults to `20000`
94
+
95
+ Behavior:
96
+
97
+ - If `path` is a file, only that file is searched.
98
+ - If `path` is a directory, matching text files are searched recursively.
99
+ - Images, PDFs, and obviously binary files are skipped.
100
+
101
+ Returns:
102
+
103
+ - `root`
104
+ - `pattern`
105
+ - `glob`
106
+ - `case_sensitive`
107
+ - `files_scanned`
108
+ - `match_count`
109
+ - `truncated`
110
+ - `results`
111
+
112
+ ## Read
113
+
114
+ Purpose:
115
+
116
+ - Read a local text file.
117
+ - Support partial line ranges.
118
+ - Support long-text truncation.
119
+
120
+ Arguments:
121
+
122
+ - `path`: string, file path
123
+ - `start_line`: optional integer, 1-based start line
124
+ - `end_line`: optional integer, 1-based end line
125
+ - `max_chars`: optional integer, maximum returned characters, defaults to `20000`
126
+
127
+ Behavior:
128
+
129
+ - Only text files are handled directly.
130
+ - If the input is a PDF, the tool tells the model to use `ReadPDF`.
131
+ - If the input is an image, the tool tells the model to use `ReadImage`.
132
+
133
+ Returns:
134
+
135
+ - `path`
136
+ - `source_type: text`
137
+ - `start_line`
138
+ - `end_line`
139
+ - `total_lines`
140
+ - `truncated`
141
+ - `content`
142
+
143
+ ## ReadPDF
144
+
145
+ Purpose:
146
+
147
+ - Read a local PDF.
148
+ - Return extracted text.
149
+ - Return extracted local image paths when the PDF parser produces image assets.
150
+
151
+ Arguments:
152
+
153
+ - `path`: string, PDF path
154
+ - `max_chars`: optional integer, maximum returned characters, defaults to `20000`
155
+ - `max_image_paths`: optional integer, maximum listed extracted image paths, defaults to `20`
156
+
157
+ Behavior:
158
+
159
+ - Calls `structai.read_pdf(...)` from [`structai`](https://github.com/black-yt/structai) underneath.
160
+ - Uses the returned `text` and `img_paths`.
161
+ - Depends on `MINERU_TOKEN`.
162
+ - If [`structai`](https://github.com/black-yt/structai) is missing, returns a clear dependency error instead of breaking unrelated file tools.
163
+ - For PDF figure tasks, prefer `ReadPDF` first to discover extracted text and extracted image paths, then use `ReadImage` on the actual extracted image file.
164
+
165
+ Returns:
166
+
167
+ - `path`
168
+ - `source_type: pdf`
169
+ - `total_lines`
170
+ - `truncated`
171
+ - `image_count`
172
+ - `image_paths_listed`
173
+ - `image_paths_truncated`
174
+ - `image_paths`
175
+ - `content`
176
+
177
+ ## ReadImage
178
+
179
+ Purpose:
180
+
181
+ - Read a local image.
182
+ - Return image metadata.
183
+ - During a main agent run, pass a compressed image to the LLM API as an `image_url` content part instead of stuffing raw base64 text into ordinary message text.
184
+
185
+ Arguments:
186
+
187
+ - `path`: string, image path
188
+
189
+ Behavior:
190
+
191
+ - Uses `PIL.Image.open(...)` underneath.
192
+ - The runtime creates a compressed JPEG attachment for the LLM request and sends it as an inline `data:` URL in an `image_url` content part.
193
+ - Trace records and direct tool output keep image metadata only, not the full binary payload.
194
+
195
+ Returns:
196
+
197
+ - `path`
198
+ - `source_type`
199
+ - `format`
200
+ - `mime_type`
201
+ - `mode`
202
+ - `width`
203
+ - `height`
204
+ - `byte_count`
205
+ - `llm_attachment_format`
206
+ - `llm_attachment_width`
207
+ - `llm_attachment_height`
208
+ - `llm_attachment_byte_count`
209
+
210
+ ## Write
211
+
212
+ Purpose:
213
+
214
+ - Create a text file.
215
+ - Overwrite an existing file when explicitly requested.
216
+
217
+ Arguments:
218
+
219
+ - `path`: string, destination file path
220
+ - `content`: string, complete file content
221
+ - `overwrite`: optional boolean, defaults to `false`
222
+
223
+ Behavior:
224
+
225
+ - Parent directories are created automatically.
226
+ - If `overwrite=false` and the file already exists, the tool returns an error.
227
+
228
+ ## Edit
229
+
230
+ Purpose:
231
+
232
+ - Edit a local text file partially.
233
+ - Best for targeted patches, not full-file rewrites.
234
+
235
+ Arguments:
236
+
237
+ - `path`: string, destination file path
238
+ - `patch`: string, unified-diff / hunk-style patch
239
+
240
+ Behavior:
241
+
242
+ - Requires explicit hunks such as `@@ -1,2 +1,2 @@`.
243
+ - The current implementation matches by surrounding context blocks rather than implementing full `patch(1)` line-number semantics.
244
+
245
+ Returns:
246
+
247
+ - updated file path on success
248
+ - applied hunk count
249
+
250
+ ## Bash
251
+
252
+ Purpose:
253
+
254
+ - Execute one-shot shell commands.
255
+ - Handle paths, search, git, conda, and local script orchestration.
256
+ - Serve as the primary local execution tool for temporary Python, deterministic computation, validation, formatting, and parsing.
257
+
258
+ Arguments:
259
+
260
+ - `command`: string, shell command to execute
261
+ - `timeout`: optional integer, seconds, defaults to `30`
262
+ - `workdir`: optional string, working directory
263
+
264
+ Behavior:
265
+
266
+ - Uses local `bash`.
267
+ - Returns both `stdout` and `stderr`.
268
+ - Timeout produces an explicit error.
269
+ - Short scripts are well suited to a heredoc such as `python3 - <<'PY'`.
270
+
271
+ Recommended use cases:
272
+
273
+ - pathname and file discovery
274
+ - `rg`, `find`, `git`
275
+ - local Python or other CLI programs
276
+ - deterministic CSV / JSON / text processing
277
+ - local computation and validation against absolute paths returned by file tools
278
+
279
+ ## WebSearch
280
+
281
+ Purpose:
282
+
283
+ - General web search.
284
+ - Supports passing multiple complementary queries in one call.
285
+
286
+ Arguments:
287
+
288
+ - `query`: array of strings, at least one query
289
+
290
+ Behavior:
291
+
292
+ - Calls Serper's Google Search endpoint.
293
+ - Reads `SERPER_KEY_ID` at runtime.
294
+
295
+ Returns:
296
+
297
+ - query summary text
298
+ - `## Web Results`
299
+ - title, link, snippet, and date/source when available
300
+
301
+ ## ScholarSearch
302
+
303
+ Purpose:
304
+
305
+ - Academic search.
306
+ - Return paper title, year, abstract, citation count, and related metadata.
307
+
308
+ Arguments:
309
+
310
+ - `query`: array of strings, at least one query
311
+
312
+ Behavior:
313
+
314
+ - Calls Serper's Google Scholar endpoint.
315
+ - Reads `SERPER_KEY_ID` at runtime.
316
+
317
+ Returns:
318
+
319
+ - query summary text
320
+ - `## Scholar Results`
321
+ - title, PDF link, `publicationInfo`, year, citation count, and abstract
322
+
323
+ ## WebFetch
324
+
325
+ Purpose:
326
+
327
+ - Visit a webpage.
328
+ - Extract evidence relevant to a concrete goal.
329
+ - Produce a goal-oriented summary.
330
+
331
+ Arguments:
332
+
333
+ - `url`: string or array of strings, page URL or URLs
334
+ - `goal`: string, the specific goal to extract from the page
335
+
336
+ Behavior:
337
+
338
+ - Fetches page text through Jina Reader first.
339
+ - Then calls the configured summary-model endpoint for evidence extraction and summarization.
340
+ - Returns a fetch-and-extract result, not raw HTML.
341
+
342
+ Dependencies:
343
+
344
+ - `JINA_API_KEYS`
345
+ - `API_KEY`
346
+ - `API_BASE`
347
+ - `MODEL_NAME`
348
+
349
+ Returns:
350
+
351
+ - `The useful information in ...`
352
+ - `Evidence in page:`
353
+ - `Summary:`
354
+
355
+ ## TerminalStart
356
+
357
+ Purpose:
358
+
359
+ - Start a persistent terminal session.
360
+
361
+ Arguments:
362
+
363
+ - `cwd`: optional string, working directory
364
+ - `shell`: optional string, shell path
365
+ - `rows`: optional integer, terminal rows, defaults to `30`
366
+ - `cols`: optional integer, terminal columns, defaults to `120`
367
+
368
+ Returns:
369
+
370
+ - `session_id`
371
+ - `pid`
372
+ - `cwd`
373
+ - `shell`
374
+ - `alive`
375
+ - `returncode`
376
+
377
+ ## TerminalWrite
378
+
379
+ Purpose:
380
+
381
+ - Send input to an existing terminal session and read output.
382
+
383
+ Arguments:
384
+
385
+ - `session_id`: string, session id
386
+ - `input`: string, text to send
387
+ - `append_newline`: optional boolean, defaults to `true`
388
+ - `yield_time_ms`: optional integer, defaults to `200`
389
+ - `max_output_chars`: optional integer, defaults to `20000`
390
+
391
+ ## TerminalRead
392
+
393
+ Purpose:
394
+
395
+ - Read unread output from an existing terminal session.
396
+
397
+ Arguments:
398
+
399
+ - `session_id`: string, session id
400
+ - `yield_time_ms`: optional integer, defaults to `200`
401
+ - `max_output_chars`: optional integer, defaults to `20000`
402
+
403
+ ## TerminalInterrupt
404
+
405
+ Purpose:
406
+
407
+ - Send `Ctrl-C` to the foreground process in a terminal session.
408
+ - Keep the session alive.
409
+
410
+ Arguments:
411
+
412
+ - `session_id`: string, session id
413
+ - `max_output_chars`: optional integer, defaults to `20000`
414
+
415
+ ## TerminalKill
416
+
417
+ Purpose:
418
+
419
+ - Terminate a terminal session.
420
+ - Release related resources.
421
+
422
+ Arguments:
423
+
424
+ - `session_id`: string, session id
425
+ - `force`: optional boolean, defaults to `false`
426
+
427
+ ## AskUser
428
+
429
+ Purpose:
430
+
431
+ - Ask the human user for essential missing information, preference, or approval.
432
+ - Use only when the answer cannot be determined from workspace files, available tools, or existing instructions.
433
+
434
+ Arguments:
435
+
436
+ - `question`: string, concise question to ask.
437
+ - `context`: optional string, brief explanation of why the question is necessary.
438
+
439
+ Behavior:
440
+
441
+ - Writes the question to the interactive terminal and waits for one user answer.
442
+ - Returns an explicit unavailable message instead of blocking when no interactive terminal exists.
443
+ - Not available in ResearchClawBench runs.
444
+
445
+ ## Suggested Usage
446
+
447
+ - Use `Glob` first for pathname discovery.
448
+ - Use `Grep` first for local text search.
449
+ - Use `Read` for local text files.
450
+ - Use `ReadPDF` for local PDFs.
451
+ - Use `ReadImage` for local images.
452
+ - Use `Edit` for targeted file changes.
453
+ - Use `Write` for full-file writes.
454
+ - Use `Bash` for one-shot system commands.
455
+ - Use `AskUser` only when a human answer is genuinely necessary.
456
+ - Use `Terminal*` only when persistent interactive shell state is actually needed.
457
+ - Route pure Python analysis through `Bash` rather than introducing a separate Python tool.
agent_base/tools/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from importlib import import_module
2
+
3
+ __all__ = [
4
+ "Bash",
5
+ "AskUser",
6
+ "Edit",
7
+ "Glob",
8
+ "Grep",
9
+ "Read",
10
+ "ReadImage",
11
+ "ReadPDF",
12
+ "ScholarSearch",
13
+ "TerminalInterrupt",
14
+ "TerminalKill",
15
+ "TerminalRead",
16
+ "TerminalStart",
17
+ "TerminalWrite",
18
+ "WebFetch",
19
+ "WebSearch",
20
+ "Write",
21
+ ]
22
+
23
+ _EXPORT_TO_MODULE = {
24
+ "Bash": "agent_base.tools.tool_runtime",
25
+ "AskUser": "agent_base.tools.tool_user",
26
+ "Edit": "agent_base.tools.tool_file",
27
+ "Glob": "agent_base.tools.tool_file",
28
+ "Grep": "agent_base.tools.tool_file",
29
+ "Read": "agent_base.tools.tool_file",
30
+ "ReadImage": "agent_base.tools.tool_file",
31
+ "ReadPDF": "agent_base.tools.tool_file",
32
+ "ScholarSearch": "agent_base.tools.tool_web",
33
+ "TerminalInterrupt": "agent_base.tools.tool_runtime",
34
+ "TerminalKill": "agent_base.tools.tool_runtime",
35
+ "TerminalRead": "agent_base.tools.tool_runtime",
36
+ "TerminalStart": "agent_base.tools.tool_runtime",
37
+ "TerminalWrite": "agent_base.tools.tool_runtime",
38
+ "WebFetch": "agent_base.tools.tool_web",
39
+ "WebSearch": "agent_base.tools.tool_web",
40
+ "Write": "agent_base.tools.tool_file",
41
+ }
42
+
43
+
44
def __getattr__(name: str):
    """Resolve a tool export lazily on attribute access (PEP 562 hook).

    Looks *name* up in ``_EXPORT_TO_MODULE``, imports the module that
    provides it, and returns the attribute of the same name from that module.

    Raises:
        AttributeError: If *name* is not one of the mapped exports.
    """
    if name not in _EXPORT_TO_MODULE:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    provider = import_module(_EXPORT_TO_MODULE[name])
    return getattr(provider, name)
agent_base/tools/tool_file.py ADDED
@@ -0,0 +1,933 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import base64
3
+ import io
4
+ import os
5
+ import re
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Any, Optional, Union
9
+
10
+ from PIL import Image
11
+
12
+ from agent_base.tools.tooling import ToolBase, normalize_base_root, validate_tool_path, workspace_root
13
+ from agent_base.utils import PROJECT_ROOT, load_dotenv, read_text_lossy
14
+
15
+
16
+ IMAGE_SUFFIXES = {
17
+ ".png",
18
+ ".jpg",
19
+ ".jpeg",
20
+ ".gif",
21
+ ".bmp",
22
+ ".webp",
23
+ ".tif",
24
+ ".tiff",
25
+ }
26
+
27
+ DEFAULT_LLM_IMAGE_MAX_EDGE = 1568
28
+ DEFAULT_LLM_IMAGE_MAX_BYTES = 512 * 1024
29
+ DEFAULT_LLM_IMAGE_JPEG_QUALITY = 85
30
+ MIN_LLM_IMAGE_JPEG_QUALITY = 45
31
+ MIN_LLM_IMAGE_EDGE = 256
32
+ DEFAULT_GLOB_MAX_RESULTS = 200
33
+ DEFAULT_GREP_MAX_RESULTS = 100
34
+ DEFAULT_GREP_MAX_CHARS = 20000
35
+
36
+
37
def resolve_file_path(path_value: str, *, base_root: Optional[Path] = None) -> Path:
    """Turn a user-supplied file path into a validated path for read access.

    Resolution order:
      1. An absolute path (after ``~`` expansion) is validated as given.
      2. A relative path that exists under the normalized root is used.
      3. When no explicit ``base_root`` was passed, a relative path that
         exists relative to the current working directory is used.
      4. Otherwise the (possibly non-existent) path under the root is used,
         so the caller can report "file not found" with a concrete location.

    Every branch goes through ``validate_tool_path`` with the "Read access"
    label. Callers in this file treat ``ValueError`` from this function as a
    blocked or invalid path.
    """
    requested = Path(path_value).expanduser()
    root = normalize_base_root(base_root)

    if requested.is_absolute():
        target = requested
    else:
        under_root = root / requested
        if under_root.exists():
            target = under_root.resolve()
        elif base_root is None and requested.exists():
            target = requested.resolve()
        else:
            target = under_root.resolve(strict=False)

    return validate_tool_path(target, "Read access", base_root=root)
51
+
52
+
53
def resolve_search_root(path_value: str, *, base_root: Optional[Path] = None) -> Path:
    """Turn a user-supplied search location into a validated path.

    Absolute paths (after ``~`` expansion) are validated as given; relative
    paths are joined onto the normalized base root first. Validation uses the
    "Search access" label. Callers in this file treat ``ValueError`` from this
    function as a blocked or invalid path.
    """
    requested = Path(path_value).expanduser()
    root = normalize_base_root(base_root)
    target = requested if requested.is_absolute() else root / requested
    return validate_tool_path(target, "Search access", base_root=root)
59
+
60
+
61
+ def _is_probably_binary(path: Path, *, sample_size: int = 4096) -> bool:
62
+ try:
63
+ sample = path.read_bytes()[:sample_size]
64
+ except OSError:
65
+ return False
66
+ return b"\x00" in sample
67
+
68
+
69
class Read(ToolBase):
    """Tool that returns a line range of a local text file with metadata.

    PDFs and images are rejected with a pointer to the dedicated tools. The
    result is a plain-text block: metadata lines followed by ``content:`` and
    the selected (possibly truncated) text. All failures are reported as
    ``[Read] ...`` strings rather than raised.
    """

    name = "Read"
    description = "Read a local text file with support for partial line-range reads and output truncation."
    parameters = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "The local file path to read.",
            },
            "start_line": {
                "type": "integer",
                "description": "Optional 1-based start line for partial reading. Default is 1.",
            },
            "end_line": {
                "type": "integer",
                "description": "Optional 1-based end line for partial reading. If omitted, read to the end.",
            },
            "max_chars": {
                "type": "integer",
                "description": "Maximum number of characters to return. Default is 20000.",
            },
        },
        "required": ["path"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def _read_text_file(self, path: Path) -> str:
        """Load the file's text via the shared ``read_text_lossy`` helper."""
        return read_text_lossy(path)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Read the requested line range and return it with metadata.

        Args:
            params: Tool arguments (JSON string or dict) as described by
                ``parameters``.
            **kwargs: ``workspace_root`` may be supplied to anchor relative
                paths and path validation.

        Returns:
            The metadata + content block, or a ``[Read] ...`` error message.
        """
        try:
            args = self.parse_json_args(params)
        except ValueError as exc:
            return f"[Read] {exc}"
        base_root = kwargs.get("workspace_root")

        # Coerce the numeric arguments up front so every type problem yields
        # the same message.
        try:
            first = int(args.get("start_line", 1))
            requested_last = args.get("end_line")
            last = None if requested_last is None else int(requested_last)
            limit = int(args.get("max_chars", 20000))
        except (TypeError, ValueError):
            return "[Read] start_line, end_line, and max_chars must be integers when provided."

        try:
            path = resolve_file_path(args["path"], base_root=base_root)
        except ValueError as exc:
            return f"[Read] Blocked or invalid path: {exc}"

        # Path checks come before argument range checks.
        if not path.exists():
            return f"[Read] File not found: {path}"
        if not path.is_file():
            return f"[Read] Path is not a file: {path}"
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return f"[Read] PDF files are not supported by Read. Use ReadPDF instead: {path}"
        if suffix in IMAGE_SUFFIXES:
            return f"[Read] Image files are not supported by Read. Use ReadImage instead: {path}"
        if first < 1:
            return "[Read] start_line must be >= 1."
        if last is not None and last < first:
            return "[Read] end_line must be >= start_line."
        if limit <= 0:
            return "[Read] max_chars must be > 0."

        try:
            all_lines = self._read_text_file(path).splitlines()
        except OSError as exc:
            return f"[Read] Error reading file: {exc}"

        # start_line is 1-based and end_line is inclusive, hence the slice.
        body = "\n".join(all_lines[first - 1:last])
        truncated = len(body) > limit
        if truncated:
            body = body[:limit]

        reported_last = last if last is not None else len(all_lines)
        header = "\n".join(
            [
                f"path: {path}",
                "source_type: text",
                f"start_line: {first}",
                f"end_line: {reported_last}",
                f"total_lines: {len(all_lines)}",
                f"truncated: {str(truncated).lower()}",
            ]
        )
        return f"{header}\ncontent:\n{body}"
161
+
162
+
163
class ReadPDF(ToolBase):
    """Tool that extracts text (and extracted-image paths) from a local PDF.

    Parsing is delegated to ``structai.read_pdf``, imported lazily so the
    tool can report a missing dependency as an error string. All failures
    are returned as ``[ReadPDF] ...`` strings rather than raised.
    """

    name = "ReadPDF"
    description = "Read a local PDF file and return extracted text. When the PDF parser extracts local image assets, also return their local paths so downstream steps can inspect the actual figure files with ReadImage."
    parameters = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "The local PDF path to read. Relative paths are resolved from the current workspace.",
            },
            "max_chars": {
                "type": "integer",
                "description": "Maximum number of characters to return. Default is 20000.",
            },
            "max_image_paths": {
                "type": "integer",
                "description": "Maximum number of extracted image paths to list. Default is 20.",
            },
        },
        "required": ["path"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def _usable_image_paths(self, raw_paths: list, pdf_path: Path, base_root) -> list[str]:
        """Return the validated image paths reported by the PDF parser.

        Non-string or blank entries are ignored. Relative entries are
        resolved against the PDF's directory. Entries rejected by
        ``validate_tool_path`` (``ValueError``) are dropped.
        """
        usable: list[str] = []
        for entry in raw_paths:
            if not isinstance(entry, str) or not entry.strip():
                continue
            image_path = Path(entry).expanduser()
            if not image_path.is_absolute():
                image_path = (pdf_path.parent / image_path).resolve()
            try:
                checked = validate_tool_path(image_path, "ReadPDF extracted image access", base_root=base_root)
            except ValueError:
                continue
            usable.append(str(checked))
        return usable

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Extract text from the PDF and return it with metadata.

        Args:
            params: Tool arguments (JSON string or dict) as described by
                ``parameters``.
            **kwargs: ``workspace_root`` may be supplied to anchor relative
                paths and path validation.

        Returns:
            Metadata lines, an optional ``image_paths:`` list, then
            ``content:`` and the (possibly truncated) text; or a
            ``[ReadPDF] ...`` error message.
        """
        try:
            args = self.parse_json_args(params)
        except ValueError as exc:
            return f"[ReadPDF] {exc}"
        base_root = kwargs.get("workspace_root")

        try:
            char_limit = int(args.get("max_chars", 20000))
            image_limit = int(args.get("max_image_paths", 20))
        except (TypeError, ValueError):
            return "[ReadPDF] max_chars and max_image_paths must be integers."

        try:
            path = resolve_file_path(args["path"], base_root=base_root)
        except ValueError as exc:
            return f"[ReadPDF] Blocked or invalid path: {exc}"

        if not path.exists():
            return f"[ReadPDF] File not found: {path}"
        if not path.is_file():
            return f"[ReadPDF] Path is not a file: {path}"
        if path.suffix.lower() != ".pdf":
            return f"[ReadPDF] File is not a PDF: {path}"
        if char_limit <= 0:
            return "[ReadPDF] max_chars must be > 0."
        if image_limit <= 0:
            return "[ReadPDF] max_image_paths must be > 0."

        # Imported lazily so a missing optional dependency becomes a message.
        try:
            from structai import read_pdf as structai_read_pdf
        except ImportError:
            return "[ReadPDF] Missing required dependency: structai. Install requirements and configure MINERU_TOKEN to enable PDF reading."

        try:
            parsed = structai_read_pdf(str(path))
            # A list result is reduced to its first element (None if empty).
            if isinstance(parsed, list):
                parsed = parsed[0] if parsed else None
            if not isinstance(parsed, dict):
                raise ValueError(f"unexpected pdf result type: {type(parsed)}")
            text = parsed.get("text", "")
            if not isinstance(text, str):
                raise ValueError("PDF text must be a string")
            raw_images = parsed.get("img_paths", []) or []
            if not isinstance(raw_images, list):
                raise ValueError("PDF img_paths must be a list when present")
            if not (text.strip() or raw_images):
                raise ValueError("PDF text is empty and no extracted images were found")
        except (OSError, ValueError, TypeError) as exc:
            return f"[ReadPDF] Error reading PDF: {exc}"

        image_paths = self._usable_image_paths(raw_images, path, base_root)
        shown_images = image_paths[:image_limit]

        truncated = len(text) > char_limit
        body = text[:char_limit] if truncated else text

        report = "\n".join(
            [
                f"path: {path}",
                "source_type: pdf",
                f"total_lines: {len(text.splitlines())}",
                f"truncated: {str(truncated).lower()}",
                f"image_count: {len(image_paths)}",
                f"image_paths_listed: {len(shown_images)}",
                f"image_paths_truncated: {str(len(image_paths) > len(shown_images)).lower()}",
            ]
        )
        if shown_images:
            report += "\nimage_paths:\n" + "\n".join(shown_images)
        return report + "\ncontent:\n" + body
269
+
270
+
271
class ReadImage(ToolBase):
    """Tool that inspects a local image and prepares a compact LLM attachment.

    ``call`` returns metadata as text. ``call_for_llm`` returns the same
    metadata plus a JPEG ``data:`` URL, re-encoded to fit the configured edge
    and byte limits. All failures are reported as ``[ReadImage] ...`` strings
    rather than raised.
    """

    name = "ReadImage"
    description = "Read a local image file and return metadata. In the main agent runtime, the image is attached to the llm api request as an image content part instead of being inlined as ordinary conversation text."
    parameters = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "The local image path to read. Relative paths are resolved from the current workspace.",
            },
        },
        "required": ["path"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def _build_llm_attachment(self, image: Image.Image) -> tuple[bytes, int, int]:
        """Re-encode ``image`` as a JPEG that fits the configured limits.

        Limits come from the environment variables ``LLM_IMAGE_MAX_EDGE``,
        ``LLM_IMAGE_MAX_BYTES`` and ``LLM_IMAGE_JPEG_QUALITY``, falling back
        to the module defaults. The image is first bounded to the maximum
        edge. JPEG quality is then lowered in steps of 10 down to
        ``MIN_LLM_IMAGE_JPEG_QUALITY``. If the payload is still too large the
        image is shrunk by 15% (aspect ratio preserved) and the quality
        ladder restarts.

        Returns:
            ``(jpeg_bytes, width, height)`` of the attachment.

        Raises:
            ValueError: if an environment override is not an integer, or the
                payload still exceeds the byte limit once the longest edge
                has reached ``MIN_LLM_IMAGE_EDGE``.
        """
        max_edge = int(os.getenv("LLM_IMAGE_MAX_EDGE", str(DEFAULT_LLM_IMAGE_MAX_EDGE)))
        max_bytes = int(os.getenv("LLM_IMAGE_MAX_BYTES", str(DEFAULT_LLM_IMAGE_MAX_BYTES)))
        quality = int(os.getenv("LLM_IMAGE_JPEG_QUALITY", str(DEFAULT_LLM_IMAGE_JPEG_QUALITY)))

        attachment = image.copy()
        if max(attachment.size) > max_edge:
            attachment.thumbnail((max_edge, max_edge), Image.Resampling.LANCZOS)
        if attachment.mode not in {"RGB", "L"}:
            # Other modes are converted before JPEG encoding.
            attachment = attachment.convert("RGB")

        shrink_ratio = 0.85
        while True:
            current_quality = quality
            while True:
                buffer = io.BytesIO()
                attachment.save(buffer, format="JPEG", quality=current_quality, optimize=True)
                payload = buffer.getvalue()
                if len(payload) <= max_bytes:
                    return payload, attachment.size[0], attachment.size[1]
                if current_quality <= MIN_LLM_IMAGE_JPEG_QUALITY:
                    break
                current_quality = max(current_quality - 10, MIN_LLM_IMAGE_JPEG_QUALITY)

            width, height = attachment.size
            longest = max(width, height)
            if longest <= MIN_LLM_IMAGE_EDGE:
                raise ValueError(
                    f"compressed image attachment still exceeds LLM_IMAGE_MAX_BYTES={max_bytes}"
                )

            # Scale both dimensions by the same factor, derived from the
            # longest edge. Clamping each dimension to MIN_LLM_IMAGE_EDGE
            # independently would upscale the short side of narrow images and
            # distort their aspect ratio. The longest edge strictly decreases
            # every pass, so the loop terminates.
            target_longest = max(int(longest * shrink_ratio), MIN_LLM_IMAGE_EDGE)
            scale = target_longest / longest
            next_size = (max(1, round(width * scale)), max(1, round(height * scale)))
            attachment = attachment.resize(next_size, Image.Resampling.LANCZOS)

    def _read_image_artifact(self, params: Union[str, dict], **kwargs) -> Union[str, dict[str, Any]]:
        """Load the image and collect metadata plus the encoded attachment.

        Returns:
            A dict describing the source image and the JPEG attachment
            (including ``data_url``), or a ``[ReadImage] ...`` error string.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[ReadImage] {exc}"
        base_root = kwargs.get("workspace_root")

        try:
            path = resolve_file_path(params["path"], base_root=base_root)
        except ValueError as exc:
            return f"[ReadImage] Blocked or invalid path: {exc}"

        if not path.exists():
            return f"[ReadImage] File not found: {path}"
        if not path.is_file():
            return f"[ReadImage] Path is not a file: {path}"

        try:
            with Image.open(path) as image:
                image.load()
                format_name = image.format or "unknown"
                width, height = image.size
                mode = image.mode
                # The on-disk size is all that is reported, so avoid reading
                # the whole file a second time just to count its bytes.
                source_byte_count = path.stat().st_size
                attachment_bytes, attachment_width, attachment_height = self._build_llm_attachment(image)
        except (OSError, ValueError) as exc:
            return f"[ReadImage] Error reading image: {exc}"

        mime_type = Image.MIME.get(format_name.upper(), None) if isinstance(format_name, str) else None
        if not mime_type:
            # Fall back to the file suffix when Pillow has no MIME entry.
            suffix_mime_types = {
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".png": "image/png",
                ".gif": "image/gif",
                ".webp": "image/webp",
                ".tif": "image/tiff",
                ".tiff": "image/tiff",
                ".bmp": "image/bmp",
            }
            mime_type = suffix_mime_types.get(path.suffix.lower(), "application/octet-stream")

        encoded = base64.b64encode(attachment_bytes).decode("ascii")
        # The attachment is always re-encoded as JPEG, whatever the source.
        data_url = f"data:image/jpeg;base64,{encoded}"
        return {
            "kind": "image_tool_result",
            "path": str(path),
            "source_type": "image",
            "format": format_name,
            "mode": mode,
            "width": width,
            "height": height,
            "mime_type": mime_type,
            "byte_count": source_byte_count,
            "llm_attachment_format": "JPEG",
            "llm_attachment_width": attachment_width,
            "llm_attachment_height": attachment_height,
            "llm_attachment_byte_count": len(attachment_bytes),
            "data_url": data_url,
        }

    @staticmethod
    def _metadata_text(artifact: dict[str, Any]) -> str:
        """Render the artifact's metadata as ``key: value`` lines."""
        meta = [
            f"path: {artifact['path']}",
            f"source_type: {artifact['source_type']}",
            f"format: {artifact['format']}",
            f"mime_type: {artifact['mime_type']}",
            f"mode: {artifact['mode']}",
            f"width: {artifact['width']}",
            f"height: {artifact['height']}",
            f"byte_count: {artifact['byte_count']}",
            f"llm_attachment_format: {artifact['llm_attachment_format']}",
            f"llm_attachment_width: {artifact['llm_attachment_width']}",
            f"llm_attachment_height: {artifact['llm_attachment_height']}",
            f"llm_attachment_byte_count: {artifact['llm_attachment_byte_count']}",
            "llm_image_attached: true",
        ]
        return "\n".join(meta)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Return the image metadata as text (or an error string)."""
        artifact = self._read_image_artifact(params, **kwargs)
        if isinstance(artifact, str):
            return artifact
        return self._metadata_text(artifact)

    def call_for_llm(self, params: Union[str, dict], **kwargs) -> Union[str, dict[str, Any]]:
        """Return metadata plus the attachment as an ``image_url`` data URL.

        The dict mirrors the artifact fields, adds ``text`` (the rendered
        metadata) and exposes the data URL under ``image_url``. An error
        string is passed through unchanged.
        """
        artifact = self._read_image_artifact(params, **kwargs)
        if isinstance(artifact, str):
            return artifact
        return {
            "kind": "image_tool_result",
            "text": self._metadata_text(artifact),
            "path": artifact["path"],
            "source_type": artifact["source_type"],
            "format": artifact["format"],
            "mime_type": artifact["mime_type"],
            "mode": artifact["mode"],
            "width": artifact["width"],
            "height": artifact["height"],
            "byte_count": artifact["byte_count"],
            "llm_attachment_format": artifact["llm_attachment_format"],
            "llm_attachment_width": artifact["llm_attachment_width"],
            "llm_attachment_height": artifact["llm_attachment_height"],
            "llm_attachment_byte_count": artifact["llm_attachment_byte_count"],
            "image_url": artifact["data_url"],
        }
438
+
439
+
440
class Glob(ToolBase):
    """Tool that lists files (and optionally directories) matching a glob.

    Matches are resolved, validated with ``validate_tool_path`` and returned
    in sorted order as a metadata block followed by one path per line. All
    failures are reported as ``[Glob] ...`` strings rather than raised.
    """

    name = "Glob"
    description = "Find local files or directories by glob pattern inside the workspace."
    parameters = {
        "type": "object",
        "properties": {
            "pattern": {
                "type": "string",
                "description": "A pathlib-style glob pattern such as '**/*.py' or '*.md'.",
            },
            "path": {
                "type": "string",
                "description": "Optional search root. Defaults to the current workspace root.",
            },
            "include_dirs": {
                "type": "boolean",
                "description": "Whether to include directories in results. Default is false.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of matched paths to return. Default is 200.",
            },
        },
        "required": ["pattern"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Run the glob search and return the matching paths with metadata.

        Args:
            params: Tool arguments (JSON string or dict) as described by
                ``parameters``.
            **kwargs: ``workspace_root`` may be supplied to anchor the search
                root and path validation.

        Returns:
            Metadata lines followed by ``results:`` and the matched paths, or
            a ``[Glob] ...`` error message. ``truncated`` is true only when
            at least one further eligible match existed beyond
            ``max_results``.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[Glob] {exc}"
        base_root = kwargs.get("workspace_root")

        # A missing or non-string pattern is reported as an error message
        # instead of crashing with KeyError/AttributeError.
        raw_pattern = params.get("pattern")
        pattern = raw_pattern.strip() if isinstance(raw_pattern, str) else ""
        if not pattern:
            return "[Glob] pattern must be a non-empty string."

        search_root_value = str(params.get("path", "."))
        include_dirs = bool(params.get("include_dirs", False))
        try:
            max_results = int(params.get("max_results", DEFAULT_GLOB_MAX_RESULTS))
        except (TypeError, ValueError):
            return "[Glob] max_results must be an integer."
        if max_results <= 0:
            return "[Glob] max_results must be > 0."

        try:
            search_root = resolve_search_root(search_root_value, base_root=base_root)
        except ValueError as exc:
            return f"[Glob] Blocked or invalid path: {exc}"

        if not search_root.exists():
            return f"[Glob] Search root not found: {search_root}"
        if not search_root.is_dir():
            return f"[Glob] Search root is not a directory: {search_root}"

        try:
            raw_matches = sorted(search_root.glob(pattern))
        except (OSError, ValueError, NotImplementedError) as exc:
            # pathlib raises NotImplementedError for non-relative (absolute)
            # patterns and ValueError for malformed ones.
            return f"[Glob] Invalid glob pattern or filesystem error: {exc}"

        matches: list[str] = []
        truncated = False
        for candidate in raw_matches:
            try:
                resolved = validate_tool_path(candidate.resolve(strict=False), "Glob access", base_root=base_root or search_root)
            except ValueError:
                # Candidates rejected by path validation are skipped.
                continue
            is_dir = resolved.is_dir()
            if is_dir and not include_dirs:
                continue
            if not (is_dir or resolved.is_file()):
                continue
            if len(matches) >= max_results:
                # Another eligible match exists beyond the limit, so the
                # listing really is cut short.
                truncated = True
                break
            matches.append(str(resolved))

        meta = [
            f"root: {search_root}",
            f"pattern: {pattern}",
            f"include_dirs: {str(include_dirs).lower()}",
            f"match_count: {len(matches)}",
            f"truncated: {str(truncated).lower()}",
        ]
        return "\n".join(meta) + "\nresults:\n" + "\n".join(matches)
529
+
530
+
531
class Grep(ToolBase):
    """Tool that searches local text files for a regular expression.

    PDFs, known image suffixes and files whose leading bytes contain NUL are
    skipped. Results are ``path:line: text`` entries preceded by metadata.
    All failures are reported as ``[Grep] ...`` strings rather than raised.
    """

    name = "Grep"
    description = "Search local text files for a regex pattern and return matching lines with file paths and line numbers."
    parameters = {
        "type": "object",
        "properties": {
            "pattern": {
                "type": "string",
                "description": "A regular expression pattern to search for.",
            },
            "path": {
                "type": "string",
                "description": "Optional file or directory path to search. Defaults to the current workspace root.",
            },
            "glob": {
                "type": "string",
                "description": "Optional pathlib-style glob filter used when searching a directory. Default is '**/*'.",
            },
            "case_sensitive": {
                "type": "boolean",
                "description": "Whether the regex match should be case-sensitive. Default is false.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of matching lines to return. Default is 100.",
            },
            "max_chars": {
                "type": "integer",
                "description": "Maximum number of characters to return. Default is 20000.",
            },
        },
        "required": ["pattern"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def _iter_candidate_files(self, root: Path, glob_pattern: str, *, base_root: Optional[Path]) -> list[Path]:
        """Return the sorted files to scan under ``root``.

        A file root is returned as-is. For a directory, every glob match is
        resolved and validated; matches rejected by ``validate_tool_path``
        and non-files are dropped.

        Raises:
            ValueError, NotImplementedError, OSError: propagated from
                ``Path.glob`` for malformed or non-relative patterns and
                filesystem failures. ``call`` converts these to a message.
        """
        if root.is_file():
            return [root]
        candidates: list[Path] = []
        for candidate in root.glob(glob_pattern):
            try:
                resolved = validate_tool_path(candidate.resolve(strict=False), "Grep access", base_root=base_root or root)
            except ValueError:
                continue
            if resolved.is_file():
                candidates.append(resolved)
        return sorted(candidates)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Search for the pattern and return matching lines with metadata.

        Args:
            params: Tool arguments (JSON string or dict) as described by
                ``parameters``.
            **kwargs: ``workspace_root`` may be supplied to anchor the search
                root and path validation.

        Returns:
            Metadata lines followed by ``results:`` and the matches, or a
            ``[Grep] ...`` error message. ``truncated`` is true when either
            ``max_results`` or ``max_chars`` stopped the scan.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[Grep] {exc}"
        base_root = kwargs.get("workspace_root")

        # A missing or non-string pattern is reported as an error message
        # instead of crashing with KeyError/AttributeError.
        raw_pattern = params.get("pattern")
        pattern = raw_pattern.strip() if isinstance(raw_pattern, str) else ""
        if not pattern:
            return "[Grep] pattern must be a non-empty string."

        search_root_value = str(params.get("path", "."))
        glob_pattern = str(params.get("glob", "**/*")).strip() or "**/*"
        case_sensitive = bool(params.get("case_sensitive", False))
        try:
            max_results = int(params.get("max_results", DEFAULT_GREP_MAX_RESULTS))
            max_chars = int(params.get("max_chars", DEFAULT_GREP_MAX_CHARS))
        except (TypeError, ValueError):
            return "[Grep] max_results and max_chars must be integers."
        if max_results <= 0:
            return "[Grep] max_results must be > 0."
        if max_chars <= 0:
            return "[Grep] max_chars must be > 0."

        flags = 0 if case_sensitive else re.IGNORECASE
        try:
            compiled = re.compile(pattern, flags)
        except re.error as exc:
            return f"[Grep] Invalid regex pattern: {exc}"

        try:
            search_root = resolve_search_root(search_root_value, base_root=base_root)
        except ValueError as exc:
            return f"[Grep] Blocked or invalid path: {exc}"

        if not search_root.exists():
            return f"[Grep] Search root not found: {search_root}"
        if not search_root.is_file() and not search_root.is_dir():
            return f"[Grep] Search root is not a file or directory: {search_root}"

        try:
            candidates = self._iter_candidate_files(search_root, glob_pattern, base_root=base_root)
        except (OSError, ValueError, NotImplementedError) as exc:
            # Path.glob rejects malformed patterns (ValueError) and
            # non-relative ones (NotImplementedError).
            return f"[Grep] Invalid glob filter or filesystem error: {exc}"

        matches: list[str] = []
        used_chars = 0  # length of "\n".join(matches), maintained incrementally
        files_scanned = 0
        truncated = False
        for candidate in candidates:
            suffix = candidate.suffix.lower()
            if suffix == ".pdf" or suffix in IMAGE_SUFFIXES:
                continue
            if _is_probably_binary(candidate):
                continue
            try:
                with candidate.open("r", encoding="utf-8", errors="replace") as handle:
                    files_scanned += 1
                    for line_index, raw_line in enumerate(handle, start=1):
                        line = raw_line.rstrip("\n")
                        if not compiled.search(line):
                            continue
                        entry = f"{candidate}:{line_index}: {line}"
                        # One extra character for the newline that joins this
                        # entry to the previous ones.
                        projected_length = used_chars + len(entry) + (1 if matches else 0)
                        if projected_length > max_chars:
                            truncated = True
                            break
                        matches.append(entry)
                        used_chars = projected_length
                        if len(matches) >= max_results:
                            truncated = True
                            break
            except OSError:
                # Unreadable files are skipped; the search is best-effort.
                continue
            if truncated:
                break

        meta = [
            f"root: {search_root}",
            f"pattern: {pattern}",
            f"glob: {glob_pattern}",
            f"case_sensitive: {str(case_sensitive).lower()}",
            f"files_scanned: {files_scanned}",
            f"match_count: {len(matches)}",
            f"truncated: {str(truncated).lower()}",
        ]
        return "\n".join(meta) + "\nresults:\n" + "\n".join(matches)
664
+
665
+
666
class Write(ToolBase):
    """Tool that writes a complete UTF-8 text file, creating parent folders.

    Existing files are left untouched unless ``overwrite`` is true. Outcomes
    and failures are reported as ``[Write] ...`` strings.
    """

    name = "Write"
    description = "Create a local text file with full content. Parent directories are created automatically."
    parameters = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "The local file path to create.",
            },
            "content": {
                "type": "string",
                "description": "The full file content to write.",
            },
            "overwrite": {
                "type": "boolean",
                "description": "Whether to overwrite an existing file. Default is false.",
            },
        },
        "required": ["path", "content"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Write ``content`` to ``path`` and report the outcome.

        Args:
            params: Tool arguments (JSON string or dict) as described by
                ``parameters``.
            **kwargs: ``workspace_root`` may be supplied; otherwise the
                default from ``workspace_root()`` is used for validation.

        Returns:
            A ``[Write] ...`` status or error message.
        """
        try:
            args = self.parse_json_args(params)
            root = kwargs.get("workspace_root") or workspace_root()
            target = validate_tool_path(args["path"], "Write access", base_root=root)
        except ValueError as exc:
            return f"[Write] {exc}"

        text = args["content"]
        allow_overwrite = bool(args.get("overwrite", False))

        if not allow_overwrite and target.exists():
            return f"[Write] File already exists and overwrite is false: {target}"

        try:
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text, encoding="utf-8")
        except OSError as exc:
            return f"[Write] Error writing file: {exc}"
        else:
            return f"[Write] Wrote file: {target}"
711
+
712
+
713
class Edit(ToolBase):
    """Tool that applies unified-diff style hunks to a local text file.

    Hunks are located by searching for their old-side lines (context plus
    removals) in file order; the line numbers in ``@@`` headers are not
    used. Outcomes and failures are reported as ``[Edit] ...`` strings.

    Known limitation: lines starting with ``"--- "`` or ``"+++ "`` are always
    treated as file headers, even inside a hunk.
    """

    name = "Edit"
    description = "Edit a local text file using unified diff style hunks. The patch must describe the exact line-level changes to apply."
    parameters = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "The local file path to edit.",
            },
            "patch": {
                "type": "string",
                "description": "A unified diff style patch containing one or more hunks for this file. Include hunk headers such as @@ -1,2 +1,2 @@.",
            },
        },
        "required": ["path", "patch"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def _parse_unified_patch(self, patch_text: str) -> list[dict]:
        """Split ``patch_text`` into hunks.

        Each hunk is ``{"header": <@@ line>, "lines": [(prefix, text), ...]}``
        where ``prefix`` is ``" "``, ``"+"``, ``"-"`` or ``""``. The empty
        prefix marks a completely blank patch line found between real hunk
        lines: usually a blank context line whose leading space was
        stripped. Blank lines trailing a hunk are discarded. Lines starting
        with a backslash (e.g. ``\\ No newline at end of file``) are ignored,
        as is everything before the first hunk header.

        Raises:
            ValueError: on an unrecognised line inside a hunk, or when the
                patch contains no hunks.
        """
        hunks: list[dict] = []
        current_hunk = None
        pending_blank = 0  # blank lines seen since the last real hunk line

        for line in patch_text.splitlines():
            if line.startswith(("--- ", "+++ ")):
                continue
            if line.startswith("@@ "):
                if current_hunk is not None:
                    hunks.append(current_hunk)
                current_hunk = {"header": line, "lines": []}
                pending_blank = 0  # blanks trailing the previous hunk are dropped
                continue
            if current_hunk is None:
                continue
            if line == "":
                pending_blank += 1
                continue
            if line.startswith("\\"):
                # Diff marker such as "\ No newline at end of file".
                continue
            if line.startswith((" ", "+", "-")):
                # Blank lines only count once a real hunk line follows them.
                current_hunk["lines"].extend([("", "")] * pending_blank)
                pending_blank = 0
                current_hunk["lines"].append((line[:1], line[1:]))
                continue
            raise ValueError(f"unsupported patch line: {line}")

        if current_hunk is not None:
            hunks.append(current_hunk)

        if not hunks:
            raise ValueError("no patch hunks found")
        return hunks

    @staticmethod
    def _hunk_blocks(hunk_lines: list, blank_as_context: bool) -> tuple[list[str], list[str]]:
        """Build the old-side and new-side line lists for one hunk.

        Blank markers (empty prefix) are either treated as blank context
        lines or dropped, depending on ``blank_as_context``.
        """
        old_block: list[str] = []
        new_block: list[str] = []
        for prefix, content in hunk_lines:
            if prefix == "":
                if not blank_as_context:
                    continue
                prefix = " "
            if prefix in {" ", "-"}:
                old_block.append(content)
            if prefix in {" ", "+"}:
                new_block.append(content)
        return old_block, new_block

    @staticmethod
    def _find_block(lines: list[str], block: list[str], start: int) -> Optional[int]:
        """Return the first index >= ``start`` where ``block`` occurs, else None."""
        size = len(block)
        for pos in range(start, len(lines) - size + 1):
            if lines[pos:pos + size] == block:
                return pos
        return None

    def _apply_hunks(self, original_text: str, hunks: list[dict]) -> tuple[str, int]:
        """Apply ``hunks`` in order and return ``(updated_text, hunk_count)``.

        Each hunk must match at or after the end of the previous one. Blank
        patch lines are first interpreted as blank context lines; if that
        does not match, they are ignored, which keeps every patch that
        applied under the previous ignore-blank behaviour working. Lines are
        re-joined with ``"\\n"``; a trailing newline is kept only if the
        original text ended with one.

        Raises:
            ValueError: when a hunk's old-side lines cannot be found.
        """
        original_lines = original_text.splitlines()
        original_endswith_newline = original_text.endswith("\n")
        output_lines: list[str] = []
        cursor = 0

        for hunk_index, hunk in enumerate(hunks, start=1):
            hunk_lines = hunk["lines"]
            has_blank = any(prefix == "" for prefix, _ in hunk_lines)
            attempts = (True, False) if has_blank else (False,)

            start_pos = None
            old_block: list[str] = []
            new_block: list[str] = []
            for blank_as_context in attempts:
                old_block, new_block = self._hunk_blocks(hunk_lines, blank_as_context)
                start_pos = self._find_block(original_lines, old_block, cursor)
                if start_pos is not None:
                    break

            if start_pos is None:
                old_preview = "\n".join(old_block)
                raise ValueError(f"hunk #{hunk_index} context not found:\n{old_preview}")

            output_lines.extend(original_lines[cursor:start_pos])
            output_lines.extend(new_block)
            cursor = start_pos + len(old_block)

        output_lines.extend(original_lines[cursor:])
        updated_text = "\n".join(output_lines)
        if original_endswith_newline:
            updated_text += "\n"
        return updated_text, len(hunks)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Apply the patch to the file and report the outcome.

        Args:
            params: Tool arguments (JSON string or dict) as described by
                ``parameters``.
            **kwargs: ``workspace_root`` may be supplied; otherwise the
                default from ``workspace_root()`` is used for validation.

        Returns:
            An ``[Edit] ...`` status or error message. The file is written
            only when the patch changes its content.
        """
        try:
            params = self.parse_json_args(params)
            base_root = kwargs.get("workspace_root") or workspace_root()
            path = validate_tool_path(params["path"], "Edit access", base_root=base_root)
        except ValueError as exc:
            return f"[Edit] {exc}"

        patch_text = str(params["patch"])

        if not path.exists():
            return f"[Edit] File not found: {path}"
        if not path.is_file():
            return f"[Edit] Path is not a file: {path}"
        if not patch_text.strip():
            return "[Edit] 'patch' must be a non-empty unified diff string."

        try:
            text = read_text_lossy(path)
        except OSError as exc:
            return f"[Edit] Error reading file: {exc}"

        try:
            hunks = self._parse_unified_patch(patch_text)
            updated, applied = self._apply_hunks(text, hunks)
        except ValueError as exc:
            return f"[Edit] Failed to apply patch: {exc}"

        if updated == text:
            return f"[Edit] No changes applied: {path}"

        try:
            path.write_text(updated, encoding="utf-8")
            return f"[Edit] Updated file: {path}; applied_hunks: {applied}"
        except OSError as exc:
            return f"[Edit] Error writing file: {exc}"
836
+
837
+
838
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point for running a single file tool directly.

    Parses a subcommand (``read``, ``pdf``, ``image``, ``glob``, ``grep``,
    ``write`` or ``edit``), loads ``.env`` from the project root, runs the
    matching tool once and prints its result.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        Always 0; tool failures are printed as their message text.
    """
    parser = argparse.ArgumentParser(description="Run local file tools directly.")
    commands = parser.add_subparsers(dest="tool", required=True)

    read_cmd = commands.add_parser("read", help="Run Read on a text file.")
    read_cmd.add_argument("path")
    read_cmd.add_argument("--start-line", type=int, default=1)
    read_cmd.add_argument("--end-line", type=int)
    read_cmd.add_argument("--max-chars", type=int, default=20000)

    pdf_cmd = commands.add_parser("pdf", help="Run ReadPDF on a PDF file.")
    pdf_cmd.add_argument("path")
    pdf_cmd.add_argument("--max-chars", type=int, default=20000)

    image_cmd = commands.add_parser("image", help="Run ReadImage on an image file.")
    image_cmd.add_argument("path")

    glob_cmd = commands.add_parser("glob", help="Run Glob to find local files or directories.")
    glob_cmd.add_argument("pattern")
    glob_cmd.add_argument("--path", default=".")
    glob_cmd.add_argument("--include-dirs", action="store_true")
    glob_cmd.add_argument("--max-results", type=int, default=DEFAULT_GLOB_MAX_RESULTS)

    grep_cmd = commands.add_parser("grep", help="Run Grep to search local text files.")
    grep_cmd.add_argument("pattern")
    grep_cmd.add_argument("--path", default=".")
    grep_cmd.add_argument("--glob", default="**/*")
    grep_cmd.add_argument("--case-sensitive", action="store_true")
    grep_cmd.add_argument("--max-results", type=int, default=DEFAULT_GREP_MAX_RESULTS)
    grep_cmd.add_argument("--max-chars", type=int, default=DEFAULT_GREP_MAX_CHARS)

    write_cmd = commands.add_parser("write", help="Run Write on a text file.")
    write_cmd.add_argument("path")
    write_cmd.add_argument("content")
    write_cmd.add_argument("--overwrite", action="store_true")

    edit_cmd = commands.add_parser("edit", help="Run Edit on a text file.")
    edit_cmd.add_argument("path")
    edit_cmd.add_argument("patch")

    # Registered on the top-level parser, so it is given before the
    # subcommand on the command line.
    parser.add_argument("--workspace-root", help="Optional workspace root override.")
    args = parser.parse_args(argv)

    load_dotenv(PROJECT_ROOT / ".env")
    root_override = Path(args.workspace_root).expanduser().resolve() if args.workspace_root else None

    # Choose the tool and build its argument payload, then run it once.
    if args.tool == "read":
        tool = Read()
        payload = {
            "path": args.path,
            "start_line": args.start_line,
            "end_line": args.end_line,
            "max_chars": args.max_chars,
        }
    elif args.tool == "pdf":
        tool = ReadPDF()
        payload = {"path": args.path, "max_chars": args.max_chars}
    elif args.tool == "image":
        tool = ReadImage()
        payload = {"path": args.path}
    elif args.tool == "glob":
        tool = Glob()
        payload = {
            "pattern": args.pattern,
            "path": args.path,
            "include_dirs": args.include_dirs,
            "max_results": args.max_results,
        }
    elif args.tool == "grep":
        tool = Grep()
        payload = {
            "pattern": args.pattern,
            "path": args.path,
            "glob": args.glob,
            "case_sensitive": args.case_sensitive,
            "max_results": args.max_results,
            "max_chars": args.max_chars,
        }
    elif args.tool == "write":
        tool = Write()
        payload = {"path": args.path, "content": args.content, "overwrite": args.overwrite}
    else:
        tool = Edit()
        payload = {"path": args.path, "patch": args.patch}

    print(tool.call(payload, workspace_root=root_override))
    return 0
930
+
931
+
932
+ if __name__ == "__main__":
933
+ raise SystemExit(main(sys.argv[1:]))
agent_base/tools/tool_runtime.py ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import atexit
3
+ import itertools
4
+ import os
5
+ import pty
6
+ import re
7
+ import select
8
+ import shutil
9
+ import signal
10
+ import struct
11
+ import subprocess
12
+ import termios
13
+ import threading
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Optional, Union
17
+ import sys
18
+
19
+ from agent_base.utils import PROJECT_ROOT, load_dotenv
20
+ from agent_base.tools.tooling import (
21
+ ToolBase,
22
+ command_safety_issue,
23
+ sanitized_subprocess_env,
24
+ validate_tool_path,
25
+ workspace_root,
26
+ )
27
+
28
+
29
# Cap, in characters, on unread terminal output kept per session before the
# oldest characters are dropped.
DEFAULT_BUFFER_LIMIT = 200_000
# Default cap, in characters, on the output returned by a single tool call.
DEFAULT_OUTPUT_CHARS = 20_000
# Default pause, in milliseconds, before terminal output is read back.
DEFAULT_YIELD_MS = 200
# Minimum run of identical consecutive lines that is collapsed into a notice.
REPEAT_COLLAPSE_THRESHOLD = 3
33
+
34
+ def _default_shell() -> str:
35
+ return shutil.which("bash") or "/bin/bash"
36
+
37
+
38
def _resolve_cwd(path_value: Optional[str], *, base_root: Optional[Path] = None) -> Path:
    """Pick the working directory for a command or terminal session.

    Args:
        path_value: Requested directory; a falsy value means "use the default".
        base_root: Optional workspace root override forwarded to path validation.

    Returns:
        The validated ``path_value`` when given, otherwise the resolved
        ``base_root`` (or ``workspace_root()`` when no override is supplied).
    """
    if path_value:
        return validate_tool_path(path_value, "Working directory", base_root=base_root)
    fallback_root = base_root or workspace_root()
    return fallback_root.resolve()
42
+
43
+
44
+ def _set_terminal_size(fd: int, rows: int, cols: int) -> None:
45
+ winsize = struct.pack("HHHH", rows, cols, 0, 0)
46
+ try:
47
+ import fcntl
48
+
49
+ fcntl.ioctl(fd, termios.TIOCSWINSZ, winsize)
50
+ except (ImportError, OSError):
51
+ return
52
+
53
+
54
+ def _disable_echo(fd: int) -> None:
55
+ try:
56
+ attrs = termios.tcgetattr(fd)
57
+ attrs[3] &= ~termios.ECHO
58
+ termios.tcsetattr(fd, termios.TCSANOW, attrs)
59
+ except termios.error:
60
+ return
61
+
62
+
63
def _collapse_repeated_lines(text: str, *, threshold: int = REPEAT_COLLAPSE_THRESHOLD) -> str:
    """Replace runs of identical consecutive lines with one copy plus a notice.

    Lines are compared including their line endings. A run of at least
    ``threshold`` equal lines becomes the line once, followed by
    ``[previous line repeated N additional times]``; shorter runs are kept
    verbatim. Falsy input is returned unchanged.
    """
    if not text:
        return text
    pieces: list[str] = []
    for line, run in itertools.groupby(text.splitlines(keepends=True)):
        repeats = sum(1 for _ in run)
        if repeats < threshold:
            pieces.extend([line] * repeats)
            continue
        pieces.append(line)
        pieces.append(f"[previous line repeated {repeats - 1} additional times]\n")
    return "".join(pieces)
89
+
90
+
91
def _bounded_output(text: str, *, max_output_chars: int = DEFAULT_OUTPUT_CHARS) -> str:
    """Collapse repeated lines, then cap the result at ``max_output_chars``.

    Args:
        text: Raw command output; falsy input is returned unchanged.
        max_output_chars: Upper bound on the returned length.

    Returns:
        The collapsed text when it fits. Otherwise a prefix of it followed by
        an ``[output truncated: omitted N chars]`` notice, where ``N`` is the
        number of characters actually removed. (Previously ``N`` ignored the
        room taken by the notice itself and so under-reported the loss.) When
        the limit is smaller than the notice, only the notice is returned.
    """
    if not text:
        return text
    compressed = _collapse_repeated_lines(text)
    total = len(compressed)
    if total <= max_output_chars:
        return compressed

    # The notice length depends on the digit count of the omitted total, which
    # in turn depends on how much room the notice takes; shrink until stable.
    keep = max(0, max_output_chars)
    while True:
        suffix = f"\n[output truncated: omitted {total - keep} chars]\n"
        budget = max(0, max_output_chars - len(suffix))
        if budget >= keep:
            break
        keep = budget
    return compressed[:keep] + suffix
101
+
102
+
103
class Bash(ToolBase):
    """One-shot shell command tool: runs a command under bash and reports its result.

    ``call`` never raises for bad arguments or command failures visible here;
    problems are reported as ``[Bash] ...`` strings.
    """

    name = "Bash"
    description = (
        "Run a local bash command and return stdout and stderr. This is the primary local execution tool for "
        "shell commands, path operations, ripgrep, git, temporary python3 heredoc scripts, parsing, validation, "
        "and local result transformation."
    )
    parameters = {
        "type": "object",
        "properties": {
            "command": {
                "type": "string",
                "description": "The shell command to execute.",
            },
            "timeout": {
                "type": "integer",
                "description": "Timeout in seconds. Default is 30.",
            },
            "workdir": {
                "type": "string",
                "description": "Optional working directory for the command. Defaults to the current workspace root.",
            },
            "max_output_chars": {
                "type": "integer",
                "description": f"Maximum combined stdout/stderr characters returned after repeated-line compression. Default is {DEFAULT_OUTPUT_CHARS}.",
            },
        },
        "required": ["command"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Validate the request, run the command, and format the outcome.

        Args:
            params: Tool arguments (JSON string or dict) with ``command`` and
                optional ``timeout``, ``workdir`` and ``max_output_chars``.
            **kwargs: ``workspace_root`` (optional root override) and
                ``runtime_deadline`` (optional epoch-seconds deadline that
                shortens the timeout) are read.

        Returns:
            ``exit_code: N`` followed by ``stdout:`` / ``stderr:`` sections for
            non-empty streams, or a ``[Bash] ...`` error message.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[Bash] {exc}"
        base_root = kwargs.get("workspace_root")
        runtime_deadline = kwargs.get("runtime_deadline")

        # A missing command used to escape as KeyError; report it like other bad input.
        raw_command = params.get("command")
        if raw_command is None:
            return "[Bash] command is required."
        command = str(raw_command)
        workdir = params.get("workdir")
        try:
            timeout = int(params.get("timeout", 30))
            max_output_chars = int(params.get("max_output_chars", DEFAULT_OUTPUT_CHARS))
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers non-finite floats such as a JSON "Infinity".
            return "[Bash] timeout and max_output_chars must be integers."

        issue = command_safety_issue(command)
        if issue:
            return f"[Bash] Blocked by safety policy: {issue}"

        try:
            cwd = _resolve_cwd(workdir, base_root=base_root)
        except ValueError as exc:
            return f"[Bash] Invalid or blocked working directory: {exc}"
        if not cwd.exists():
            return f"[Bash] Working directory does not exist: {cwd}"
        if not cwd.is_dir():
            return f"[Bash] Working directory is not a directory: {cwd}"
        if timeout <= 0:
            return "[Bash] timeout must be > 0."
        if max_output_chars <= 0:
            return "[Bash] max_output_chars must be > 0."

        effective_timeout: float = float(timeout)
        if runtime_deadline is not None:
            remaining = float(runtime_deadline) - time.time()
            if remaining <= 0:
                return "[Bash] Agent runtime limit reached before command execution."
            effective_timeout = min(effective_timeout, max(remaining, 0.001))

        try:
            proc = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                # Decode leniently: output that is not valid UTF-8 (binary data,
                # other encodings) must not raise UnicodeDecodeError out of the tool.
                encoding="utf-8",
                errors="replace",
                timeout=effective_timeout,
                cwd=str(cwd),
                env=sanitized_subprocess_env(base_root=base_root),
                executable=_default_shell(),
            )
        except subprocess.TimeoutExpired:
            return "[Bash] TimeoutError: Execution timed out."
        except (OSError, subprocess.SubprocessError) as exc:
            return f"[Bash] Error executing command: {exc}"

        # NOTE(review): the limit is applied to stdout and stderr separately, so the
        # combined size can reach twice max_output_chars; the parameter description
        # speaks of a combined limit. Confirm which is intended.
        parts = [f"exit_code: {proc.returncode}"]
        stdout = _bounded_output(proc.stdout, max_output_chars=max_output_chars)
        stderr = _bounded_output(proc.stderr, max_output_chars=max_output_chars)
        if stdout:
            parts.append(f"stdout:\n{stdout}")
        if stderr:
            parts.append(f"stderr:\n{stderr}")
        return "\n".join(parts)
200
+
201
class TerminalSession:
    """A shell process attached to a pseudo-terminal, with buffered output.

    A daemon thread reads the PTY master and appends decoded text to an
    in-memory buffer limited to ``DEFAULT_BUFFER_LIMIT`` characters; when the
    buffer overflows, the oldest characters are dropped and counted.
    """

    def __init__(self, cwd: Path, shell: str, rows: int, cols: int, *, base_root: Optional[Path] = None):
        """Open a PTY, start ``shell`` on it, and start the reader thread.

        Args:
            cwd: Working directory for the shell.
            shell: Shell executable; launched with ``--noprofile --norc``.
            rows: Terminal row count.
            cols: Terminal column count.
            base_root: Forwarded to ``sanitized_subprocess_env``.

        Raises:
            Whatever ``pty.openpty`` or ``subprocess.Popen`` raise. Both PTY
            descriptors are closed before the error propagates.
        """
        self.cwd = cwd
        self.shell = shell
        self.rows = rows
        self.cols = cols
        self._buffer_limit = DEFAULT_BUFFER_LIMIT
        self._pending_output = ""
        self._dropped_output_chars = 0
        self._lock = threading.Lock()

        master_fd, slave_fd = pty.openpty()
        try:
            _set_terminal_size(slave_fd, rows, cols)
            _disable_echo(slave_fd)

            env = sanitized_subprocess_env(base_root=base_root)
            env.setdefault("TERM", "xterm-256color")
            env.setdefault("PS1", "")
            env.setdefault("PROMPT_COMMAND", "")

            self._proc = subprocess.Popen(
                [shell, "--noprofile", "--norc"],
                stdin=slave_fd,
                stdout=slave_fd,
                stderr=slave_fd,
                cwd=str(cwd),
                env=env,
                text=False,
                close_fds=True,
                start_new_session=True,
            )
        except BaseException:
            # Startup failed: without this the master end leaked on every failure.
            os.close(master_fd)
            raise
        finally:
            # The child (if any) holds its own copy; the parent never needs the slave end.
            os.close(slave_fd)
        self._master_fd = master_fd
        self._reader = threading.Thread(target=self._reader_loop, daemon=True)
        self._reader.start()

    @property
    def pid(self) -> int:
        """Process ID of the shell."""
        return self._proc.pid

    @property
    def alive(self) -> bool:
        """True while the shell process has not exited."""
        return self._proc.poll() is None

    @property
    def returncode(self) -> Optional[int]:
        """Exit status of the shell, or ``None`` while it is still running."""
        return self._proc.poll()

    def _append_output(self, text: str) -> None:
        """Append ``text`` to the buffer, dropping the oldest characters over the limit."""
        if not text:
            return
        with self._lock:
            self._pending_output += text
            overflow = len(self._pending_output) - self._buffer_limit
            if overflow > 0:
                self._pending_output = self._pending_output[overflow:]
                self._dropped_output_chars += overflow

    def _reader_loop(self) -> None:
        """Pump PTY output into the buffer until the PTY closes or the shell exits."""
        import codecs

        # Incremental decoding keeps a multi-byte UTF-8 character intact when it
        # straddles two 4096-byte reads; decoding each chunk on its own turned
        # both halves into replacement characters.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        while True:
            try:
                ready, _, _ = select.select([self._master_fd], [], [], 0.1)
            except (OSError, ValueError):
                break

            if not ready:
                if self._proc.poll() is not None:
                    break
                continue

            try:
                data = os.read(self._master_fd, 4096)
            except OSError:
                break

            if not data:
                if self._proc.poll() is not None:
                    break
                continue

            self._append_output(decoder.decode(data))

        # Flush any incomplete trailing sequence as a replacement character.
        self._append_output(decoder.decode(b"", final=True))

        try:
            os.close(self._master_fd)
        except OSError:
            pass

    def write(self, data: str) -> None:
        """Send ``data`` to the shell's terminal input.

        Raises:
            RuntimeError: If the shell is no longer running.
            OSError: If writing to the PTY fails.
        """
        if not self.alive:
            raise RuntimeError("session is not running")
        os.write(self._master_fd, data.encode("utf-8", errors="replace"))

    def read(self, yield_time_ms: int = DEFAULT_YIELD_MS, max_output_chars: int = DEFAULT_OUTPUT_CHARS) -> dict:
        """Wait ``yield_time_ms`` and then take up to ``max_output_chars`` of buffered output.

        Returns:
            A dict with ``alive``, ``returncode``, ``output``,
            ``remaining_output_chars`` (still buffered), ``dropped_output_chars``
            (lost to overflow since the previous read; reset by this call) and
            ``truncated`` (True when output remains buffered).
        """
        if yield_time_ms > 0:
            time.sleep(yield_time_ms / 1000.0)

        with self._lock:
            output = self._pending_output[:max_output_chars]
            self._pending_output = self._pending_output[max_output_chars:]
            remaining_output_chars = len(self._pending_output)
            dropped_output_chars = self._dropped_output_chars
            self._dropped_output_chars = 0

        return {
            "alive": self.alive,
            "returncode": self.returncode,
            "output": output,
            "remaining_output_chars": remaining_output_chars,
            "dropped_output_chars": dropped_output_chars,
            "truncated": remaining_output_chars > 0,
        }

    def interrupt(self, *, max_output_chars: int = DEFAULT_OUTPUT_CHARS) -> dict:
        """Write Ctrl-C (``\\x03``) to the terminal and return the output that follows.

        Raises:
            RuntimeError: If the shell is no longer running.
        """
        if not self.alive:
            raise RuntimeError("session is not running")
        os.write(self._master_fd, b"\x03")
        return self.read(yield_time_ms=DEFAULT_YIELD_MS, max_output_chars=max_output_chars)

    def terminate(self, force: bool = False) -> Optional[int]:
        """Stop the shell's process group and return its exit status.

        Sends SIGTERM (SIGKILL when ``force``) to the process group, falling
        back to signalling the shell alone if the group signal fails. A
        graceful attempt that does not finish within 2 seconds escalates to a
        forced one.

        Returns:
            The exit status, or ``None`` if the process still has not exited.
        """
        if self.alive:
            try:
                os.killpg(os.getpgid(self.pid), signal.SIGKILL if force else signal.SIGTERM)
            except ProcessLookupError:
                pass
            except OSError:
                if force:
                    self._proc.kill()
                else:
                    self._proc.terminate()
            try:
                self._proc.wait(timeout=2 if not force else 1)
            except subprocess.TimeoutExpired:
                if not force:
                    return self.terminate(force=True)
        return self.returncode
329
+
330
+
331
class TerminalSessionManager:
    """Lock-guarded registry mapping ``term_N`` identifiers to terminal sessions."""

    def __init__(self):
        self._sessions: dict[str, TerminalSession] = {}
        self._counter = itertools.count(1)
        self._lock = threading.Lock()

    def start(self, cwd: Path, shell: str, rows: int, cols: int, *, base_root: Optional[Path] = None) -> tuple[str, TerminalSession]:
        """Create a session, register it under a fresh identifier, and return both."""
        created = TerminalSession(cwd=cwd, shell=shell, rows=rows, cols=cols, base_root=base_root)
        identifier = f"term_{next(self._counter)}"
        with self._lock:
            self._sessions[identifier] = created
        return identifier, created

    def get(self, session_id: str) -> Optional[TerminalSession]:
        """Return the session registered as ``session_id``, or ``None``."""
        with self._lock:
            found = self._sessions.get(session_id)
        return found

    def pop(self, session_id: str) -> Optional[TerminalSession]:
        """Unregister and return the session for ``session_id``, or ``None``."""
        with self._lock:
            removed = self._sessions.pop(session_id, None)
        return removed

    def cleanup(self) -> None:
        """Empty the registry, then force-terminate every session it held."""
        with self._lock:
            doomed = list(self._sessions.values())
            self._sessions.clear()
        # Terminate outside the lock so slow shutdowns do not block other callers.
        for session in doomed:
            session.terminate(force=True)


# Module-wide registry shared by the terminal tools; sessions still registered
# at interpreter exit are force-terminated.
SESSION_MANAGER = TerminalSessionManager()
atexit.register(SESSION_MANAGER.cleanup)
362
+
363
+
364
+ def _format_terminal_response(
365
+ prefix: str,
366
+ session_id: str,
367
+ payload: dict,
368
+ cwd: Optional[Path] = None,
369
+ shell: Optional[str] = None,
370
+ pid: Optional[int] = None,
371
+ ) -> str:
372
+ lines = [prefix, f"session_id: {session_id}"]
373
+ if pid is not None:
374
+ lines.append(f"pid: {pid}")
375
+ if cwd is not None:
376
+ lines.append(f"cwd: {cwd}")
377
+ if shell is not None:
378
+ lines.append(f"shell: {shell}")
379
+ if "alive" in payload:
380
+ lines.append(f"alive: {str(payload['alive']).lower()}")
381
+ if "returncode" in payload:
382
+ lines.append(f"returncode: {payload['returncode']}")
383
+ if "truncated" in payload:
384
+ lines.append(f"truncated: {str(payload['truncated']).lower()}")
385
+ if "remaining_output_chars" in payload:
386
+ lines.append(f"remaining_output_chars: {payload['remaining_output_chars']}")
387
+ if "dropped_output_chars" in payload:
388
+ lines.append(f"dropped_output_chars: {payload['dropped_output_chars']}")
389
+ if "output" in payload:
390
+ lines.append("output:")
391
+ lines.append(payload["output"])
392
+ return "\n".join(lines)
393
+
394
+
395
class TerminalStart(ToolBase):
    """Tool that opens a new PTY-backed shell session and reports its identifier."""

    name = "TerminalStart"
    description = "Start a persistent local terminal session backed by a PTY shell."
    parameters = {
        "type": "object",
        "properties": {
            "cwd": {
                "type": "string",
                "description": "Optional working directory for the terminal session. Default is the current workspace root.",
            },
            "shell": {
                "type": "string",
                "description": "Optional shell executable path. Default is bash.",
            },
            "rows": {
                "type": "integer",
                "description": "Terminal row count. Default is 30.",
            },
            "cols": {
                "type": "integer",
                "description": "Terminal column count. Default is 120.",
            },
        },
        "required": [],
    }

    # The terminal window size is packed as unsigned shorts downstream, so
    # larger values cannot be represented.
    _MAX_DIMENSION = 65535

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Validate the request and start a session through ``SESSION_MANAGER``.

        Args:
            params: Tool arguments (JSON string or dict); all keys optional.
            **kwargs: ``workspace_root`` (optional root override) is read.

        Returns:
            A formatted block with the session id, pid, cwd, shell and liveness,
            or a ``[TerminalStart] ...`` error message.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[TerminalStart] {exc}"
        base_root = kwargs.get("workspace_root")
        try:
            cwd = _resolve_cwd(params.get("cwd"), base_root=base_root)
            # Coerce to str so a non-string value cannot make Path() raise TypeError below.
            shell = str(params.get("shell") or _default_shell())
            rows = int(params.get("rows", 30))
            cols = int(params.get("cols", 120))
        except ValueError as exc:
            return f"[TerminalStart] {exc}"
        except (TypeError, OverflowError):
            return "[TerminalStart] rows and cols must be integers."

        if not cwd.exists():
            return f"[TerminalStart] Working directory does not exist: {cwd}"
        if not cwd.is_dir():
            return f"[TerminalStart] Working directory is not a directory: {cwd}"
        if not Path(shell).exists() and shutil.which(shell) is None:
            return f"[TerminalStart] Shell not found: {shell}"
        if rows <= 0 or cols <= 0:
            return "[TerminalStart] rows and cols must both be > 0."
        if rows > self._MAX_DIMENSION or cols > self._MAX_DIMENSION:
            # Previously an oversized value escaped as an uncaught struct.error.
            return f"[TerminalStart] rows and cols must both be <= {self._MAX_DIMENSION}."

        try:
            session_id, session = SESSION_MANAGER.start(cwd=cwd, shell=shell, rows=rows, cols=cols, base_root=base_root)
        except (OSError, RuntimeError, subprocess.SubprocessError) as exc:
            return f"[TerminalStart] Failed to start terminal session: {exc}"

        return _format_terminal_response(
            "[TerminalStart] Started terminal session.",
            session_id=session_id,
            payload={"alive": session.alive, "returncode": session.returncode},
            cwd=cwd,
            shell=shell,
            pid=session.pid,
        )
462
+
463
+
464
class TerminalWrite(ToolBase):
    """Tool that sends input to a running terminal session and returns new output."""

    name = "TerminalWrite"
    description = "Write input into an existing terminal session and read back newly produced output."
    parameters = {
        "type": "object",
        "properties": {
            "session_id": {
                "type": "string",
                "description": "The terminal session ID returned by TerminalStart.",
            },
            "input": {
                "type": "string",
                "description": "The text to send to the terminal session.",
            },
            "append_newline": {
                "type": "boolean",
                "description": "Whether to append a newline after the provided input. Default is true.",
            },
            "yield_time_ms": {
                "type": "integer",
                "description": "Milliseconds to wait before reading output. Default is 200.",
            },
            "max_output_chars": {
                "type": "integer",
                "description": "Maximum number of output characters to return. Default is 20000.",
            },
        },
        "required": ["session_id", "input"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Write ``input`` to the session, wait ``yield_time_ms``, and return buffered output.

        Returns:
            A formatted block describing the session state and output, or a
            ``[TerminalWrite] ...`` error message.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[TerminalWrite] {exc}"

        # Missing required arguments used to escape as KeyError.
        raw_session_id = params.get("session_id")
        raw_input = params.get("input")
        if raw_session_id is None or raw_input is None:
            return "[TerminalWrite] session_id and input are required."
        session_id = str(raw_session_id)
        input_text = str(raw_input)
        append_newline = bool(params.get("append_newline", True))
        try:
            yield_time_ms = int(params.get("yield_time_ms", DEFAULT_YIELD_MS))
            max_output_chars = int(params.get("max_output_chars", DEFAULT_OUTPUT_CHARS))
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers non-finite floats such as a JSON "Infinity".
            return "[TerminalWrite] yield_time_ms and max_output_chars must be integers."

        issue = command_safety_issue(input_text)
        if issue:
            return f"[TerminalWrite] Blocked by safety policy: {issue}"

        session = SESSION_MANAGER.get(session_id)
        if session is None:
            return f"[TerminalWrite] Session not found: {session_id}"
        if max_output_chars <= 0:
            return "[TerminalWrite] max_output_chars must be > 0."
        if yield_time_ms < 0:
            return "[TerminalWrite] yield_time_ms must be >= 0."

        payload_input = input_text + ("\n" if append_newline else "")
        try:
            session.write(payload_input)
            payload = session.read(yield_time_ms=yield_time_ms, max_output_chars=max_output_chars)
        except (OSError, RuntimeError, subprocess.SubprocessError) as exc:
            return f"[TerminalWrite] Failed to write to session {session_id}: {exc}"

        return _format_terminal_response("[TerminalWrite] Session updated.", session_id=session_id, payload=payload)
532
+
533
+
534
class TerminalRead(ToolBase):
    """Tool that returns output a terminal session has produced but not yet delivered."""

    name = "TerminalRead"
    description = "Read unread output from an existing terminal session."
    parameters = {
        "type": "object",
        "properties": {
            "session_id": {
                "type": "string",
                "description": "The terminal session ID returned by TerminalStart.",
            },
            "yield_time_ms": {
                "type": "integer",
                "description": "Milliseconds to wait before reading output. Default is 200.",
            },
            "max_output_chars": {
                "type": "integer",
                "description": "Maximum number of output characters to return. Default is 20000.",
            },
        },
        "required": ["session_id"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Wait ``yield_time_ms`` and return up to ``max_output_chars`` of buffered output.

        Returns:
            A formatted block describing the session state and output, or a
            ``[TerminalRead] ...`` error message.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[TerminalRead] {exc}"

        # A missing session_id used to escape as KeyError.
        raw_session_id = params.get("session_id")
        if raw_session_id is None:
            return "[TerminalRead] session_id is required."
        session_id = str(raw_session_id)
        try:
            yield_time_ms = int(params.get("yield_time_ms", DEFAULT_YIELD_MS))
            max_output_chars = int(params.get("max_output_chars", DEFAULT_OUTPUT_CHARS))
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers non-finite floats such as a JSON "Infinity".
            return "[TerminalRead] yield_time_ms and max_output_chars must be integers."

        session = SESSION_MANAGER.get(session_id)
        if session is None:
            return f"[TerminalRead] Session not found: {session_id}"
        if max_output_chars <= 0:
            return "[TerminalRead] max_output_chars must be > 0."
        if yield_time_ms < 0:
            return "[TerminalRead] yield_time_ms must be >= 0."

        try:
            payload = session.read(yield_time_ms=yield_time_ms, max_output_chars=max_output_chars)
        except (OSError, RuntimeError, subprocess.SubprocessError) as exc:
            return f"[TerminalRead] Failed to read session {session_id}: {exc}"

        return _format_terminal_response("[TerminalRead] Session output fetched.", session_id=session_id, payload=payload)
586
+
587
+
588
class TerminalKill(ToolBase):
    """Tool that terminates a terminal session and removes it from the registry."""

    name = "TerminalKill"
    description = "Terminate an existing terminal session and release its resources."
    parameters = {
        "type": "object",
        "properties": {
            "session_id": {
                "type": "string",
                "description": "The terminal session ID returned by TerminalStart.",
            },
            "force": {
                "type": "boolean",
                "description": "Whether to force kill the terminal session immediately. Default is false.",
            },
        },
        "required": ["session_id"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Terminate the session and, once it has exited, unregister it.

        The session is removed from ``SESSION_MANAGER`` only after termination
        succeeded. Previously it was removed first, so a failed termination
        left a running process that no later call or exit cleanup could reach.

        Returns:
            A formatted block with the exit status, or a ``[TerminalKill] ...``
            error message.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[TerminalKill] {exc}"

        # A missing session_id used to escape as KeyError.
        raw_session_id = params.get("session_id")
        if raw_session_id is None:
            return "[TerminalKill] session_id is required."
        session_id = str(raw_session_id)
        force = bool(params.get("force", False))

        session = SESSION_MANAGER.get(session_id)
        if session is None:
            return f"[TerminalKill] Session not found: {session_id}"

        try:
            returncode = session.terminate(force=force)
        except (OSError, RuntimeError, subprocess.SubprocessError) as exc:
            return f"[TerminalKill] Failed to terminate session {session_id}: {exc}"
        if session.alive:
            # Keep it registered so a retry or the exit-time cleanup can still reach it.
            return f"[TerminalKill] Failed to terminate session {session_id}: process is still running."

        SESSION_MANAGER.pop(session_id)
        return _format_terminal_response(
            "[TerminalKill] Terminal session terminated.",
            session_id=session_id,
            payload={"alive": False, "returncode": returncode},
        )
632
+
633
+
634
class TerminalInterrupt(ToolBase):
    """Tool that sends Ctrl-C to a terminal session and returns the output that follows."""

    name = "TerminalInterrupt"
    description = "Send Ctrl-C to the foreground process in an existing terminal session while keeping the session alive."
    parameters = {
        "type": "object",
        "properties": {
            "session_id": {
                "type": "string",
                "description": "The terminal session ID returned by TerminalStart.",
            },
            "max_output_chars": {
                "type": "integer",
                "description": "Maximum number of output characters to return after the interrupt. Default is 20000.",
            },
        },
        "required": ["session_id"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Interrupt the session and return up to ``max_output_chars`` of output.

        Returns:
            A formatted block describing the session state and output, or a
            ``[TerminalInterrupt] ...`` error message.
        """
        try:
            params = self.parse_json_args(params)
        except ValueError as exc:
            return f"[TerminalInterrupt] {exc}"

        # A missing session_id used to escape as KeyError.
        raw_session_id = params.get("session_id")
        if raw_session_id is None:
            return "[TerminalInterrupt] session_id is required."
        session_id = str(raw_session_id)
        try:
            max_output_chars = int(params.get("max_output_chars", DEFAULT_OUTPUT_CHARS))
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers non-finite floats such as a JSON "Infinity".
            return "[TerminalInterrupt] max_output_chars must be an integer."

        session = SESSION_MANAGER.get(session_id)
        if session is None:
            return f"[TerminalInterrupt] Session not found: {session_id}"
        if max_output_chars <= 0:
            return "[TerminalInterrupt] max_output_chars must be > 0."

        try:
            payload = session.interrupt(max_output_chars=max_output_chars)
        except (OSError, RuntimeError, subprocess.SubprocessError) as exc:
            return f"[TerminalInterrupt] Failed to interrupt session {session_id}: {exc}"

        return _format_terminal_response(
            "[TerminalInterrupt] Sent Ctrl-C to terminal session.",
            session_id=session_id,
            payload=payload,
        )
683
+
684
+
685
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point for trying the runtime tools by hand.

    ``bash`` runs one command through the ``Bash`` tool. ``terminal`` starts a
    session, sends one input, prints the reply, and kills the session.

    Returns:
        0 on completion, or 1 when the terminal demo could not find a session
        id in the start response.
    """
    cli = argparse.ArgumentParser(description="Run runtime and terminal tools directly.")
    tools = cli.add_subparsers(dest="tool", required=True)

    bash_cli = tools.add_parser("bash", help="Run the Bash tool.")
    bash_cli.add_argument("command")
    bash_cli.add_argument("--timeout", type=int, default=30)
    bash_cli.add_argument("--workdir")

    terminal_cli = tools.add_parser("terminal", help="Run a minimal terminal session demo.")
    terminal_cli.add_argument("input", help="Input to send after starting the session.")
    terminal_cli.add_argument("--cwd")
    terminal_cli.add_argument("--yield-time-ms", type=int, default=200)

    args = cli.parse_args(argv)
    load_dotenv(PROJECT_ROOT / ".env")

    # Only the bash subcommand defines --workdir, hence the getattr.
    requested_workdir = getattr(args, "workdir", None)
    workdir_root = Path(requested_workdir).expanduser().resolve() if requested_workdir else None

    if args.tool == "bash":
        bash_request = {"command": args.command, "timeout": args.timeout, "workdir": args.workdir}
        print(Bash().call(bash_request, workspace_root=workdir_root))
        return 0

    if args.cwd:
        terminal_root = Path(args.cwd).expanduser().resolve()
    else:
        terminal_root = workspace_root()
    start_result = TerminalStart().call({"cwd": str(terminal_root)}, workspace_root=terminal_root)
    print(start_result)

    found = re.search(r"session_id: (term_\d+)", start_result)
    if found is None:
        return 1
    session_id = found.group(1)

    write_request = {
        "session_id": session_id,
        "input": args.input,
        "yield_time_ms": args.yield_time_ms,
    }
    print(TerminalWrite().call(write_request, workspace_root=terminal_root))
    print(TerminalKill().call({"session_id": session_id}, workspace_root=terminal_root))
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
agent_base/tools/tool_user.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from typing import Any, TextIO, Union
6
+
7
+ from agent_base.tools.tooling import ToolBase
8
+
9
+
10
class AskUser(ToolBase):
    """Tool that prints a question on an interactive stream and returns the typed reply."""

    name = "AskUser"
    description = (
        "Ask the human user a concise clarification question when progress depends on "
        "information, preference, or approval that cannot be determined from the workspace or other tools."
    )
    parameters = {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "The concise question to ask the user.",
            },
            "context": {
                "type": "string",
                "description": "Optional brief context explaining why the question is necessary.",
            },
        },
        "required": ["question"],
        "additionalProperties": False,
    }

    def call(self, params: Union[str, dict], **kwargs: Any) -> str:
        """Ask ``question`` and return one line of reply.

        Args:
            params: Tool arguments (JSON string or dict) with ``question`` and
                optional ``context``.
            **kwargs: ``input_stream`` / ``output_stream`` are used when both are
                given; otherwise interactive streams are looked up.

        Returns:
            ``[AskUser] User answer:`` followed by the stripped reply, or an
            ``[AskUser] ...`` message when the question is empty, no interactive
            terminal is available, reading fails, or the reply is empty.
        """
        try:
            request = self.parse_json_args(params)
        except ValueError as exc:
            return f"[AskUser] {exc}"

        question = str(request.get("question", "")).strip()
        context = str(request.get("context", "") or "").strip()
        if not question:
            return "[AskUser] question must be a non-empty string."

        reader = kwargs.get("input_stream")
        writer = kwargs.get("output_stream")
        owns_stream = False
        if reader is None or writer is None:
            reader, writer, owns_stream = _resolve_interactive_streams()
            if reader is None or writer is None:
                return (
                    "[AskUser] Cannot ask the user because no interactive terminal is available. "
                    "Continue with available evidence, or state the blocker if the answer is essential."
                )

        try:
            _write_question(writer, question=question, context=context)
            raw_answer = reader.readline()
        except OSError as exc:
            return f"[AskUser] Failed to read user input: {exc}"
        finally:
            # Close only a stream opened on the caller's behalf.
            if owns_stream:
                try:
                    reader.close()
                except OSError:
                    pass

        answer = str(raw_answer or "").strip()
        if answer:
            return f"[AskUser] User answer:\n{answer}"
        return "[AskUser] User answer was empty."
70
+
71
+
72
+ def _resolve_interactive_streams() -> tuple[TextIO | None, TextIO | None, bool]:
73
+ if sys.stdin.isatty() and sys.stdout.isatty():
74
+ return sys.stdin, sys.stdout, False
75
+ if os.name == "nt":
76
+ return None, None, False
77
+ try:
78
+ tty = open("/dev/tty", "r+", encoding="utf-8")
79
+ except OSError:
80
+ return None, None, False
81
+ return tty, tty, True
82
+
83
+
84
+ def _write_question(output_stream: TextIO, *, question: str, context: str = "") -> None:
85
+ output_stream.write("\n[AskUser]\n")
86
+ if context:
87
+ output_stream.write(f"Context: {context}\n")
88
+ output_stream.write(f"Question: {question}\n> ")
89
+ output_stream.flush()
agent_base/tools/tool_web.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import Optional, Union
8
+
9
+ import requests
10
+ import tiktoken
11
+ from openai import APIConnectionError, APIError, APITimeoutError, OpenAI
12
+
13
+ from agent_base.provider_compat import apply_sampling_params
14
+ from agent_base.prompt import EXTRACTOR_PROMPT
15
+ from agent_base.tools.tooling import ToolBase
16
+ from agent_base.utils import PROJECT_ROOT, env_flag, load_dotenv
17
+
18
# NOTE(review): the code that consumes these defaults is outside this view; the
# descriptions below are inferred from the names only and should be confirmed.
# Presumably the LLM request timeout in seconds and the retry budget.
DEFAULT_LLM_TIMEOUT_SECONDS = 600.0
DEFAULT_LLM_MAX_RETRIES = 10
# Presumably the default sampling parameters for LLM requests.
DEFAULT_TEMPERATURE = 0.6
DEFAULT_TOP_P = 0.95
DEFAULT_PRESENCE_PENALTY = 1.1
23
+
24
+
25
def search_debug_enabled() -> bool:
    """Return what ``env_flag`` reports for the ``DEBUG_SEARCH`` setting."""
    flag_name = "DEBUG_SEARCH"
    return env_flag(flag_name)
27
+
28
+
29
def scholar_debug_enabled() -> bool:
    """Return what ``env_flag`` reports for the ``DEBUG_SCHOLAR`` setting."""
    flag_name = "DEBUG_SCHOLAR"
    return env_flag(flag_name)
31
+
32
+
33
def visit_debug_enabled() -> bool:
    """Report whether verbose WebFetch diagnostics are switched on.

    Returns:
        Whatever ``env_flag`` yields for the ``DEBUG_VISIT`` variable.
    """
    debug_on = env_flag("DEBUG_VISIT")
    return debug_on
35
+
36
+
37
def _request_error_text(exc: requests.RequestException) -> str:
    """Render a request failure as text, appending the response body if any.

    Args:
        exc: The exception raised by ``requests``; its optional ``response``
            attribute is inspected for a body.

    Returns:
        ``str(exc)`` when there is no response or the stripped body is empty,
        otherwise ``"<exc>; response_body=<body>"`` with the body capped at
        1000 characters (a ``...(truncated)`` marker is added when cut).
    """
    message = str(exc)
    http_response = getattr(exc, "response", None)
    if http_response is None:
        return message

    payload = http_response.text.strip()
    if not payload:
        return message
    if len(payload) > 1000:
        payload = payload[:1000] + "...(truncated)"
    return f"{message}; response_body={payload}"
45
+
46
+
47
def truncate_to_tokens(text: str, max_tokens: int = 95000) -> str:
    """Cut ``text`` down to at most ``max_tokens`` ``cl100k_base`` tokens.

    Args:
        text: Arbitrary text, typically fetched webpage content.
        max_tokens: Upper bound on the number of tokens kept. Values below
            zero are treated as zero.

    Returns:
        ``text`` unchanged when it already fits, otherwise the decoded prefix
        made of the first ``max_tokens`` tokens.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    # Fetched pages are untrusted and may literally contain strings such as
    # "<|endoftext|>". With the default disallowed_special="all" tiktoken
    # raises ValueError on those; an empty tuple encodes them as plain text.
    tokens = encoding.encode(text, disallowed_special=())
    limit = max(max_tokens, 0)
    if len(tokens) <= limit:
        return text
    return encoding.decode(tokens[:limit])
54
+
55
+
56
+ def _stringify_field(value) -> str:
57
+ if isinstance(value, str):
58
+ return value.strip()
59
+ if value is None:
60
+ return ""
61
+ if isinstance(value, (list, dict)):
62
+ try:
63
+ return json.dumps(value, ensure_ascii=False)
64
+ except (TypeError, ValueError):
65
+ return str(value).strip()
66
+ return str(value).strip()
67
+
68
+
69
+ def _webfetch_failure(url: str, goal: str, reason: str) -> str:
70
+ useful_information = "The useful information in {url} for user goal {goal} as follows: \n\n".format(url=url, goal=goal)
71
+ useful_information += "Evidence in page: \n" + reason + "\n\n"
72
+ useful_information += "Summary: \n" + "The webpage content could not be processed into the required structured summary." + "\n\n"
73
+ return useful_information
74
+
75
+
76
def _parse_extractor_payload(raw) -> tuple[str, str] | None:
    """Extract the ``evidence`` and ``summary`` fields from a summary-model reply.

    Args:
        raw: Either the model's text reply (optionally wrapped in Markdown
            code fences and/or surrounding prose) or an already-decoded dict.

    Returns:
        ``(evidence, summary)`` as non-empty strings, or ``None`` when the
        payload is not a JSON object or either field is missing/empty.
    """
    if isinstance(raw, str):
        cleaned = raw.replace("```json", "").replace("```", "").strip()
        try:
            raw = json.loads(cleaned)
        except json.JSONDecodeError:
            # Models frequently add a sentence before or after the JSON.
            # Retry on the outermost {...} span before giving up, which saves
            # the caller a full re-query of the summary model.
            start = cleaned.find("{")
            end = cleaned.rfind("}")
            if start == -1 or end <= start:
                return None
            try:
                raw = json.loads(cleaned[start:end + 1])
            except json.JSONDecodeError:
                return None

    if not isinstance(raw, dict):
        return None

    evidence = _stringify_field(raw.get("evidence"))
    summary = _stringify_field(raw.get("summary"))
    if not evidence or not summary:
        return None
    return evidence, summary
92
+
93
+
94
class WebSearch(ToolBase):
    """Tool that runs one or more web queries against the Serper search endpoint."""

    name = "WebSearch"
    description = "Perform Google web searches and return the top results. Accepts multiple complementary queries."
    parameters = {
        "type": "object",
        "properties": {
            "query": {
                "type": "array",
                "items": {
                    "type": "string",
                },
                "minItems": 1,
                "description": "Array of query strings. Include multiple complementary search queries in a single call.",
            },
        },
        "required": ["query"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        """Initialize the tool with an optional configuration dict."""
        super().__init__(cfg)

    def google_search_with_serp(self, query: str):
        """Run a single query and format the organic results as Markdown-like text.

        Queries containing characters in U+4E00..U+9FFF are sent with China /
        zh-cn locale settings; all others use United States / en.

        Args:
            query: The search string.

        Returns:
            A formatted result listing, a "No results found" hint, or a
            ``[WebSearch] ...`` error string (missing key, request failure
            after five attempts, or an invalid JSON body).
        """
        has_cjk = any("\u4E00" <= ch <= "\u9FFF" for ch in query)
        locale = (
            {"location": "China", "gl": "cn", "hl": "zh-cn"}
            if has_cjk
            else {"location": "United States", "gl": "us", "hl": "en"}
        )
        request_body = {"q": query, **locale}

        api_key = os.getenv("SERPER_KEY_ID", "").strip()
        if not api_key:
            return "[WebSearch] SERPER_KEY_ID is not set."
        request_headers = {
            "X-API-KEY": api_key,
            "Content-Type": "application/json",
        }

        failure_text = ""
        reply = None
        total_attempts = 5
        for attempt in range(1, total_attempts + 1):
            try:
                reply = requests.post(
                    "https://google.serper.dev/search",
                    json=request_body,
                    headers=request_headers,
                    timeout=20,
                )
                reply.raise_for_status()
            except requests.RequestException as exc:
                failure_text = _request_error_text(exc)
                if search_debug_enabled():
                    print(exc)
                if attempt == total_attempts:
                    return f"[WebSearch] Request failed for '{query}': {failure_text}"
            else:
                break

        if reply is None:
            return f"[WebSearch] Request failed for '{query}': {failure_text or 'unknown error'}"

        try:
            data = reply.json()
        except ValueError as exc:
            return f"[WebSearch] Invalid JSON response for '{query}': {exc}"

        hits = data.get("organic")
        if not (isinstance(hits, list) and hits):
            return f"No results found for '{query}'. Try with a more general query."

        entries = []
        for position, hit in enumerate(hits, start=1):
            if not isinstance(hit, dict):
                continue
            title = str(hit.get("title", "Untitled result"))
            link = str(hit.get("link", ""))
            extras = ""
            if "date" in hit:
                extras += f"\nDate published: {hit['date']}"
            if "source" in hit:
                extras += f"\nSource: {hit['source']}"
            body = f"\n{hit['snippet']}" if "snippet" in hit else ""
            entry = f"{position}. [{title}]({link}){extras}\n{body}"
            # Drop the video-player placeholder that sometimes leaks into snippets.
            entries.append(entry.replace("Your browser can't play this video.", ""))

        if not entries:
            return f"No results found for '{query}'. Try with a more general query."

        listing = "\n\n".join(entries)
        return f"A Google search for '{query}' found {len(entries)} results:\n\n## Web Results\n" + listing

    def search_with_serp(self, query: str):
        """Thin alias for :meth:`google_search_with_serp`."""
        return self.google_search_with_serp(query)

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Validate the arguments and run every query, three at a time.

        Args:
            params: JSON text or a dict containing a ``query`` list.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The per-query outputs joined by ``"\\n=======\\n"``, or a
            ``[WebSearch] ...`` error string for invalid arguments.
        """
        try:
            params = self.parse_json_args(params)
            query = params["query"]
        except ValueError as exc:
            return f"[WebSearch] {exc}"

        if not isinstance(query, list):
            return "[WebSearch] 'query' must be a list of strings."

        with ThreadPoolExecutor(max_workers=3) as pool:
            per_query = list(pool.map(self.search_with_serp, query))
        return "\n=======\n".join(per_query)
209
+
210
+
211
class ScholarSearch(ToolBase):
    """Tool that runs one or more queries against the Serper scholar endpoint."""

    name = "ScholarSearch"
    description = "Search academic sources through Google Scholar and return relevant publication results."
    parameters = {
        "type": "object",
        "properties": {
            "query": {
                "type": "array",
                "items": {"type": "string", "description": "The search query."},
                "minItems": 1,
                "description": "The list of search queries for Google Scholar.",
            },
        },
        "required": ["query"],
    }

    def __init__(self, cfg: Optional[dict] = None):
        """Initialize the tool with an optional configuration dict."""
        super().__init__(cfg)

    def google_scholar_with_serp(self, query: str):
        """Run a single scholar query and format the results as Markdown-like text.

        Args:
            query: The search string.

        Returns:
            A formatted result listing, a "No results found" hint, or a
            ``[ScholarSearch] ...`` error string (missing key, request failure
            after five attempts, or an invalid JSON body).
        """
        request_body = {"q": query}
        api_key = os.getenv("SERPER_KEY_ID", "").strip()
        if not api_key:
            return "[ScholarSearch] SERPER_KEY_ID is not set."
        request_headers = {
            "X-API-KEY": api_key,
            "Content-Type": "application/json",
        }

        failure_text = ""
        reply = None
        total_attempts = 5
        for attempt in range(1, total_attempts + 1):
            try:
                reply = requests.post(
                    "https://google.serper.dev/scholar",
                    json=request_body,
                    headers=request_headers,
                    timeout=20,
                )
                reply.raise_for_status()
            except requests.RequestException as exc:
                failure_text = _request_error_text(exc)
                if scholar_debug_enabled():
                    print(exc)
                if attempt == total_attempts:
                    return f"[ScholarSearch] Request failed for '{query}': {failure_text}"
            else:
                break

        if reply is None:
            return f"[ScholarSearch] Request failed for '{query}': {failure_text or 'unknown error'}"

        try:
            data = reply.json()
        except ValueError as exc:
            return f"[ScholarSearch] Invalid JSON response for '{query}': {exc}"

        hits = data.get("organic")
        if not (isinstance(hits, list) and hits):
            return f"No results found for '{query}'. Try with a more general query."

        entries = []
        for position, hit in enumerate(hits, start=1):
            if not isinstance(hit, dict):
                continue
            title = str(hit.get("title", "Untitled result"))
            # Only a direct PDF URL is surfaced as the entry's link target.
            link_info = "pdfUrl: " + str(hit["pdfUrl"]) if "pdfUrl" in hit else "no available link"
            extras = ""
            if "publicationInfo" in hit:
                extras += f"\npublicationInfo: {hit['publicationInfo']}"
            if "year" in hit:
                extras += f"\nDate published: {hit['year']}"
            if "citedBy" in hit:
                extras += f"\ncitedBy: {hit['citedBy']}"
            body = f"\n{hit['snippet']}" if "snippet" in hit else ""
            entry = f"{position}. [{title}]({link_info}){extras}\n{body}"
            entries.append(entry.replace("Your browser can't play this video.", ""))

        if not entries:
            return f"No results found for '{query}'. Try with a more general query."

        listing = "\n\n".join(entries)
        return f"A Google scholar for '{query}' found {len(entries)} results:\n\n## Scholar Results\n" + listing

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Validate the arguments and run every query, three at a time.

        Args:
            params: JSON text or a dict containing a ``query`` list.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The per-query outputs joined by ``"\\n=======\\n"``, or a
            ``[ScholarSearch] ...`` error string for invalid arguments.
        """
        try:
            params = self.parse_json_args(params)
            query = params["query"]
        except ValueError as exc:
            return f"[ScholarSearch] {exc}"

        if not isinstance(query, list):
            return "[ScholarSearch] 'query' must be a list of strings."

        with ThreadPoolExecutor(max_workers=3) as pool:
            per_query = list(pool.map(self.google_scholar_with_serp, query))
        return "\n=======\n".join(per_query)
306
+
307
+
308
class WebFetch(ToolBase):
    """Fetch pages via ``https://r.jina.ai/<url>`` and summarize them with an LLM.

    The page text is truncated to a token budget, passed to the summary model
    with ``EXTRACTOR_PROMPT``, and the model's JSON reply is turned into an
    "Evidence in page" / "Summary" report. Every failure mode is reported as
    text in that same layout rather than raised.
    """

    name = "WebFetch"
    description = "Fetch webpage content and return evidence plus a goal-focused summary."
    parameters = {
        "type": "object",
        "properties": {
            "url": {
                "type": ["string", "array"],
                "items": {
                    "type": "string",
                },
                "minItems": 1,
                "description": "The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs.",
            },
            "goal": {
                "type": "string",
                "description": "The goal of the visit for webpage(s).",
            },
        },
        "required": ["url", "goal"],
    }

    # Sentinel strings returned by the fetch helpers instead of page content.
    _FETCH_FAILURE_PREFIX = "[WebFetch] Failed to read page:"
    _MISSING_KEY_MESSAGE = "[WebFetch] JINA_API_KEYS is not set."
    _RUNTIME_LIMIT_REASON = "Agent runtime limit reached before WebFetch could complete."
    # Wall-clock cap for one multi-URL call; later URLs are reported as failed.
    _BATCH_TIME_LIMIT_SECONDS = 900

    def __init__(self, cfg: Optional[dict] = None):
        """Initialize the tool; the summary client itself is created lazily."""
        super().__init__(cfg)
        self._summary_client: Optional[OpenAI] = None
        self._summary_api_base: Optional[str] = None
        self._load_summary_settings()

    def _load_summary_settings(self) -> None:
        """Read the summary-model name, timeout, and sampling values from the environment."""
        self._summary_model_name = os.environ.get("MODEL_NAME", "").strip()
        self._summary_timeout_seconds = float(
            os.getenv("LLM_TIMEOUT_SECONDS", str(DEFAULT_LLM_TIMEOUT_SECONDS))
        )
        self._summary_temperature = float(os.getenv("TEMPERATURE", str(DEFAULT_TEMPERATURE)))
        self._summary_top_p = float(os.getenv("TOP_P", str(DEFAULT_TOP_P)))
        self._summary_presence_penalty = float(os.getenv("PRESENCE_PENALTY", str(DEFAULT_PRESENCE_PENALTY)))

    def _ensure_summary_client(self) -> Optional[OpenAI]:
        """Return the cached summary client, creating it on first use.

        The environment is re-read on every call until a client exists, so
        variables loaded after construction are still picked up.

        Returns:
            The client, or ``None`` when ``API_BASE`` is unset or empty.
        """
        if self._summary_client is not None:
            return self._summary_client
        self._summary_api_base = os.environ.get("API_BASE")
        self._load_summary_settings()
        if not self._summary_api_base:
            return None
        self._summary_client = OpenAI(
            api_key=os.environ.get("API_KEY", "EMPTY"),
            base_url=self._summary_api_base,
            timeout=self._summary_timeout_seconds,
        )
        return self._summary_client

    @staticmethod
    def _remaining_budget_seconds(runtime_deadline: Optional[float]) -> Optional[float]:
        """Return seconds left until ``runtime_deadline`` (may be negative), or ``None`` if unset."""
        if runtime_deadline is None:
            return None
        return runtime_deadline - time.time()

    @classmethod
    def _is_usable_content(cls, content) -> bool:
        """Tell real page text apart from the sentinel strings the fetch helpers return."""
        if not content:
            return False
        if content in (cls._MISSING_KEY_MESSAGE, "[WebFetch] Empty content."):
            return False
        return not content.startswith((cls._FETCH_FAILURE_PREFIX, "[document_parser]"))

    def call(self, params: Union[str, dict], **kwargs) -> str:
        """Fetch and summarize one URL or a list of URLs.

        Args:
            params: JSON text or a dict with ``url`` (string or list of
                strings) and ``goal``.
            **kwargs: ``runtime_deadline`` (a ``time.time()``-style timestamp)
                is honoured when present; other keys are ignored.

        Returns:
            The stripped report; multiple reports are joined by
            ``"\\n=======\\n"``. Invalid arguments yield a ``[WebFetch] ...``
            error string.
        """
        try:
            params = self.parse_json_args(params)
            url = params["url"]
            goal = params["goal"]
        except ValueError as exc:
            return f"[WebFetch] {exc}"
        runtime_deadline = kwargs.get("runtime_deadline")

        if isinstance(url, str):
            response = self.readpage_jina(url, goal, runtime_deadline=runtime_deadline)
        elif isinstance(url, list):
            reports = []
            batch_started = time.time()
            for one_url in url:
                remaining = self._remaining_budget_seconds(runtime_deadline)
                if remaining is not None and remaining <= 0:
                    reports.append(
                        _webfetch_failure(url=one_url, goal=goal, reason=self._RUNTIME_LIMIT_REASON)
                    )
                    continue
                if time.time() - batch_started > self._BATCH_TIME_LIMIT_SECONDS:
                    cur_response = "The useful information in {url} for user goal {goal} as follows: \n\n".format(url=one_url, goal=goal)
                    cur_response += "Evidence in page: \n" + "The provided webpage content could not be accessed. Please check the URL or file format." + "\n\n"
                    cur_response += "Summary: \n" + "The webpage content could not be processed, and therefore, no information is available." + "\n\n"
                else:
                    cur_response = self.readpage_jina(one_url, goal, runtime_deadline=runtime_deadline)
                reports.append(cur_response)
            response = "\n=======\n".join(reports)
        else:
            return "[WebFetch] 'url' must be a string or a list of strings."

        if visit_debug_enabled():
            print(f"Summary Length {len(response)}")
        return response.strip()

    def call_server(self, msgs, max_retries=2, runtime_deadline: Optional[float] = None):
        """Send ``msgs`` to the summary model, retrying on API errors and empty replies.

        Args:
            msgs: Chat messages in the OpenAI format.
            max_retries: Maximum number of attempts.
            runtime_deadline: Optional timestamp after which no further
                attempt is started; it also caps each request's timeout.

        Returns:
            The model's text content, or a string starting with
            ``"[WebFetch] Summary model error:"`` describing the last failure.
        """
        client = self._ensure_summary_client()
        if client is None or not self._summary_api_base:
            return "[WebFetch] Summary model error: API_BASE is not set."
        if not self._summary_model_name:
            return "[WebFetch] Summary model error: MODEL_NAME is not set."

        last_error = "unknown summary-model error"
        for _ in range(max_retries):
            remaining = self._remaining_budget_seconds(runtime_deadline)
            if remaining is not None and remaining <= 0:
                return "[WebFetch] Summary model error: agent runtime limit reached."
            try:
                request_client = (
                    client.with_options(timeout=min(self._summary_timeout_seconds, max(remaining, 0.001)))
                    if remaining is not None
                    else client
                )
                request_kwargs = {
                    "model": self._summary_model_name,
                    "messages": msgs,
                }
                apply_sampling_params(
                    request_kwargs,
                    model_name=self._summary_model_name,
                    temperature=self._summary_temperature,
                    top_p=self._summary_top_p,
                    presence_penalty=self._summary_presence_penalty,
                )
                chat_response = request_client.chat.completions.create(**request_kwargs)
                # Guard against a reply without choices so it counts as an
                # empty response instead of raising IndexError.
                choices = chat_response.choices
                content = choices[0].message.content if choices else None
                if content:
                    return content
                last_error = "empty response from summary model"
            except (APIError, APIConnectionError, APITimeoutError) as exc:
                last_error = str(exc)

        return f"[WebFetch] Summary model error: {last_error}"

    def jina_readpage(self, url: str, runtime_deadline: Optional[float] = None) -> str:
        """Download the reader rendering of ``url``, trying up to three times.

        Args:
            url: The page to read; it is appended to ``https://r.jina.ai/``.
            runtime_deadline: Optional timestamp bounding the whole operation.

        Returns:
            The response text on HTTP 200, ``"[WebFetch] JINA_API_KEYS is not
            set."`` when the key is missing, or a string starting with
            ``"[WebFetch] Failed to read page:"``.
        """
        max_retries = 3
        timeout = 50
        jina_api_key = os.getenv("JINA_API_KEYS", "").strip()
        if not jina_api_key:
            return self._MISSING_KEY_MESSAGE

        headers = {"Authorization": f"Bearer {jina_api_key}"}
        last_error = "unknown page-fetch error"
        for attempt in range(max_retries):
            remaining = self._remaining_budget_seconds(runtime_deadline)
            if remaining is not None and remaining <= 0:
                return "[WebFetch] Failed to read page: agent runtime limit reached."
            try:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers=headers,
                    timeout=min(timeout, max(remaining, 0.001)) if remaining is not None else timeout,
                )
                if response.status_code == 200:
                    return response.text
                if visit_debug_enabled():
                    print(response.text)
                last_error = f"HTTP {response.status_code}: {response.text[:200]}"
            except requests.RequestException as exc:
                last_error = str(exc)

            if attempt == max_retries - 1:
                # Nothing follows the final attempt, so do not pause before
                # reporting the failure.
                break
            remaining = self._remaining_budget_seconds(runtime_deadline)
            if remaining is not None and remaining <= 0:
                return "[WebFetch] Failed to read page: agent runtime limit reached."
            time.sleep(min(0.5, remaining) if remaining is not None else 0.5)

        return f"[WebFetch] Failed to read page: {last_error}"

    def html_readpage_jina(self, url: str, runtime_deadline: Optional[float] = None) -> str:
        """Call :meth:`jina_readpage` up to eight times until usable content arrives.

        Args:
            url: The page to read.
            runtime_deadline: Optional timestamp bounding the whole operation.

        Returns:
            The page text, the missing-key message (returned immediately,
            since retrying cannot fix configuration), or a string starting
            with ``"[WebFetch] Failed to read page:"``.
        """
        max_attempts = 8
        for _ in range(max_attempts):
            remaining = self._remaining_budget_seconds(runtime_deadline)
            if remaining is not None and remaining <= 0:
                return "[WebFetch] Failed to read page: agent runtime limit reached."
            content = self.jina_readpage(url, runtime_deadline=runtime_deadline)
            if content == self._MISSING_KEY_MESSAGE:
                return content
            if self._is_usable_content(content):
                return content
        return "[WebFetch] Failed to read page: exhausted retries"

    def readpage_jina(self, url: str, goal: str, runtime_deadline: Optional[float] = None) -> str:
        """Fetch ``url`` and produce the evidence/summary report for ``goal``.

        Args:
            url: The page to read.
            goal: What the caller wants to learn from the page.
            runtime_deadline: Optional timestamp bounding the whole operation.

        Returns:
            A report containing the extracted evidence and summary, or the
            failure report built by ``_webfetch_failure`` when the page could
            not be fetched, the deadline passed, or the summary model never
            returned the required fields.
        """
        max_retries = int(os.getenv("LLM_MAX_RETRIES", str(DEFAULT_LLM_MAX_RETRIES)))

        content = self.html_readpage_jina(url, runtime_deadline=runtime_deadline)
        if not self._is_usable_content(content):
            # A missing API key is a configuration error worth surfacing
            # verbatim; it must never be summarized as if it were page text.
            reason = (
                content
                if content == self._MISSING_KEY_MESSAGE
                else "The provided webpage content could not be accessed. Please check the URL or file format."
            )
            return _webfetch_failure(url=url, goal=goal, reason=reason)

        content = truncate_to_tokens(content, max_tokens=95000)
        messages = [{"role": "user", "content": EXTRACTOR_PROMPT.format(webpage_content=content, goal=goal)}]
        raw = self.call_server(messages, max_retries=max_retries, runtime_deadline=runtime_deadline)

        # A near-empty reply is retried with progressively shorter content:
        # 70% of the current length three times, then a final 25000-char cut.
        summary_retries = 3
        while len(raw) < 10 and summary_retries >= 0:
            remaining = self._remaining_budget_seconds(runtime_deadline)
            if remaining is not None and remaining <= 0:
                return _webfetch_failure(url=url, goal=goal, reason=self._RUNTIME_LIMIT_REASON)
            truncate_length = int(0.7 * len(content)) if summary_retries > 0 else 25000
            status_msg = (
                f"[WebFetch] Summary url[{url}] "
                f"attempt {3 - summary_retries + 1}/3, "
                f"content length: {len(content)}, "
                f"truncating to {truncate_length} chars"
            ) if summary_retries > 0 else (
                f"[WebFetch] Summary url[{url}] failed after 3 attempts, "
                f"final truncation to 25000 chars"
            )
            if visit_debug_enabled():
                print(status_msg)
            content = content[:truncate_length]
            messages = [{"role": "user", "content": EXTRACTOR_PROMPT.format(webpage_content=content, goal=goal)}]
            raw = self.call_server(messages, max_retries=max_retries, runtime_deadline=runtime_deadline)
            summary_retries -= 1

        # Re-ask the model (same prompt) up to three times when its reply
        # cannot be parsed into the required fields.
        parsed = _parse_extractor_payload(raw)
        parse_retry_times = 0
        while parsed is None and parse_retry_times < 3:
            remaining = self._remaining_budget_seconds(runtime_deadline)
            if remaining is not None and remaining <= 0:
                return _webfetch_failure(url=url, goal=goal, reason=self._RUNTIME_LIMIT_REASON)
            raw = self.call_server(messages, max_retries=max_retries, runtime_deadline=runtime_deadline)
            parsed = _parse_extractor_payload(raw)
            parse_retry_times += 1

        if parsed is None:
            reason = "The webpage content was fetched, but the summary model did not return the required evidence and summary fields."
            if isinstance(raw, str) and raw.startswith("[WebFetch] Summary model error:"):
                reason = raw
            return _webfetch_failure(url=url, goal=goal, reason=reason)

        evidence, summary = parsed
        useful_information = "The useful information in {url} for user goal {goal} as follows: \n\n".format(url=url, goal=goal)
        useful_information += "Evidence in page: \n" + evidence + "\n\n"
        useful_information += "Summary: \n" + summary + "\n\n"
        return useful_information
580
+
581
+
582
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point for exercising the web tools directly.

    Sub-commands: ``search <words...>``, ``scholar <words...>`` and
    ``fetch <url> <goal>``. The project ``.env`` file is loaded before the
    selected tool runs, and the tool's output is printed.

    Args:
        argv: Argument list without the program name.

    Returns:
        Always ``0``.
    """
    parser = argparse.ArgumentParser(description="Run web tools directly.")
    commands = parser.add_subparsers(dest="tool", required=True)

    # The two search commands share the same positional argument shape.
    for command_name, help_text in (
        ("search", "Run WebSearch."),
        ("scholar", "Run ScholarSearch."),
    ):
        query_parser = commands.add_parser(command_name, help=help_text)
        query_parser.add_argument("query", nargs="+")

    fetch_parser = commands.add_parser("fetch", help="Run WebFetch.")
    fetch_parser.add_argument("url")
    fetch_parser.add_argument("goal")

    args = parser.parse_args(argv)
    load_dotenv(PROJECT_ROOT / ".env")

    if args.tool == "search":
        output = WebSearch().call({"query": [" ".join(args.query)]})
    elif args.tool == "scholar":
        output = ScholarSearch().call({"query": [" ".join(args.query)]})
    else:
        output = WebFetch().call({"url": args.url, "goal": args.goal})

    print(output)
    return 0
607
+
608
+
609
+ if __name__ == "__main__":
610
+ raise SystemExit(main(sys.argv[1:]))
agent_base/tools/tooling.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any, Optional, Union
8
+
9
+ import json5
10
+ from agent_base.utils import PROJECT_ROOT, load_dotenv
11
+
12
+ WORKSPACE_ROOT_ENV = "WORKSPACE_ROOT"
13
+
14
+ SENSITIVE_FILE_NAMES = {
15
+ ".env",
16
+ ".env.local",
17
+ ".env.production",
18
+ ".env.development",
19
+ ".env.test",
20
+ ".git-credentials",
21
+ ".netrc",
22
+ ".npmrc",
23
+ ".pypirc",
24
+ "id_rsa",
25
+ "id_dsa",
26
+ "id_ecdsa",
27
+ "id_ed25519",
28
+ "known_hosts",
29
+ "authorized_keys",
30
+ "credentials",
31
+ }
32
+ SENSITIVE_PATH_PARTS = {
33
+ ".git",
34
+ ".ssh",
35
+ ".aws",
36
+ ".gnupg",
37
+ ".kube",
38
+ }
39
+ SENSITIVE_COMMAND_TOKENS = [
40
+ ".env",
41
+ ".git-credentials",
42
+ ".netrc",
43
+ ".npmrc",
44
+ ".pypirc",
45
+ "id_rsa",
46
+ "id_dsa",
47
+ "id_ecdsa",
48
+ "id_ed25519",
49
+ "/etc/passwd",
50
+ "/etc/shadow",
51
+ "/root/.ssh",
52
+ "/root/.aws",
53
+ "~/.ssh",
54
+ "~/.aws",
55
+ ]
56
+ BLOCKED_COMMAND_PATTERNS: list[tuple[re.Pattern[str], str]] = [
57
+ (re.compile(r"(^|[\s;&|])sudo(\s|$)"), "sudo escalation is blocked"),
58
+ (re.compile(r"(^|[\s;&|])su(\s|$)"), "user switching is blocked"),
59
+ (re.compile(r"(^|[\s;&|])(shutdown|reboot|poweroff|halt)(\s|$)"), "system power-control commands are blocked"),
60
+ (re.compile(r"(^|[\s;&|])mkfs(?:\.\w+)?(\s|$)"), "disk-formatting commands are blocked"),
61
+ (re.compile(r"(^|[\s;&|])(fdisk|parted)(\s|$)"), "disk-partitioning commands are blocked"),
62
+ (re.compile(r":\s*\(\)\s*\{\s*:\|:&\s*\};:"), "fork-bomb patterns are blocked"),
63
+ (re.compile(r"\brm\s+-rf\s+/(\s|$)"), "destructive root deletion is blocked"),
64
+ (re.compile(r"\brm\s+-rf\s+~(/|\s|$)"), "destructive home deletion is blocked"),
65
+ ]
66
+ SENSITIVE_ENV_EXACT = {
67
+ "API_KEY",
68
+ "SERPER_KEY_ID",
69
+ "JINA_API_KEYS",
70
+ "MINERU_TOKEN",
71
+ "OPENAI_API_KEY",
72
+ "ANTHROPIC_API_KEY",
73
+ "GOOGLE_API_KEY",
74
+ "AWS_ACCESS_KEY_ID",
75
+ "AWS_SECRET_ACCESS_KEY",
76
+ "AWS_SESSION_TOKEN",
77
+ "AZURE_OPENAI_API_KEY",
78
+ }
79
+ SENSITIVE_ENV_MARKERS = (
80
+ "TOKEN",
81
+ "SECRET",
82
+ "PASSWORD",
83
+ "PASSWD",
84
+ "CREDENTIAL",
85
+ "COOKIE",
86
+ )
87
+ SAFE_ENV_ALWAYS = {
88
+ "PATH",
89
+ "LANG",
90
+ "TERM",
91
+ "TMPDIR",
92
+ "TEMP",
93
+ "TMP",
94
+ "TZ",
95
+ "COLORTERM",
96
+ "PWD",
97
+ "PYTHONIOENCODING",
98
+ "PYTHONUNBUFFERED",
99
+ "CONDA_PREFIX",
100
+ "CONDA_DEFAULT_ENV",
101
+ "VIRTUAL_ENV",
102
+ "LOGNAME",
103
+ "USER",
104
+ "USERNAME",
105
+ "SHELL",
106
+ "SHLVL",
107
+ "_",
108
+ }
109
+
110
+
111
def workspace_root() -> Path:
    """Return the resolved workspace root directory.

    Returns:
        The user-expanded, resolved path from the ``WORKSPACE_ROOT``
        environment variable when it is set to a non-blank value, otherwise
        the resolved ``PROJECT_ROOT``.
    """
    override = os.environ.get(WORKSPACE_ROOT_ENV, "").strip()
    if override:
        return Path(override).expanduser().resolve()
    return PROJECT_ROOT.resolve()
115
+
116
+
117
def normalize_base_root(base_root: Optional[Union[str, Path]]) -> Path:
    """Resolve an optional base directory, defaulting to the workspace root.

    Args:
        base_root: A path-like value, or ``None`` to use ``workspace_root()``.

    Returns:
        The user-expanded, resolved base directory.
    """
    if base_root is not None:
        return Path(base_root).expanduser().resolve()
    return workspace_root()
121
+
122
+
123
def normalize_workspace_root(path_value: Optional[Union[str, Path]]) -> Path:
    """Resolve a workspace directory argument, creating the directory if needed.

    Args:
        path_value: A path-like value. ``None`` or a blank string selects
            ``workspace_root()``. Relative paths are taken relative to the
            current working directory.

    Returns:
        The resolved directory path.

    Raises:
        ValueError: If the resolved path exists but is not a directory.
    """
    if path_value is None or str(path_value).strip() == "":
        return workspace_root()

    candidate = Path(path_value).expanduser()
    anchored = candidate if candidate.is_absolute() else Path.cwd() / candidate
    resolved = anchored.resolve()

    if not resolved.exists():
        resolved.mkdir(parents=True, exist_ok=True)
    if not resolved.is_dir():
        raise ValueError(f"Workspace directory is not a directory: {resolved}")
    return resolved
136
+
137
+
138
+ def _is_relative_to(path: Path, root: Path) -> bool:
139
+ try:
140
+ path.relative_to(root)
141
+ return True
142
+ except ValueError:
143
+ return False
144
+
145
+
146
def resolve_workspace_path(path_value: Union[str, Path], *, base_root: Optional[Path] = None) -> Path:
    """Turn a user-supplied path into an absolute, resolved path.

    Args:
        path_value: The path to resolve; ``~`` is expanded.
        base_root: Directory that relative paths are joined onto. ``None``
            selects the default workspace root.

    Returns:
        The resolved path. The target does not have to exist, and no check is
        made here that it stays inside ``base_root``.
    """
    candidate = Path(path_value).expanduser()
    anchor = normalize_base_root(base_root)
    target = candidate if candidate.is_absolute() else anchor / candidate
    return target.resolve(strict=False)
152
+
153
+
154
def is_sensitive_path(path: Path) -> bool:
    """Check a path against the sensitive file-name and directory lists.

    Matching is case-insensitive. A path is sensitive when its final
    component is in ``SENSITIVE_FILE_NAMES`` or any component is in
    ``SENSITIVE_PATH_PARTS``.
    """
    if path.name.lower() in SENSITIVE_FILE_NAMES:
        return True
    for component in path.parts:
        if component.lower() in SENSITIVE_PATH_PARTS:
            return True
    return False
160
+
161
+
162
def validate_tool_path(path_value: Union[str, Path], purpose: str, *, allow_sensitive: bool = False, base_root: Optional[Path] = None) -> Path:
    """Resolve a tool path and enforce the workspace and sensitivity rules.

    Args:
        path_value: The path requested by a tool call.
        purpose: Short label for the operation, used at the start of error
            messages.
        allow_sensitive: When true, the sensitive-path check is skipped.
        base_root: Workspace root override; ``None`` selects the default.

    Returns:
        The resolved path.

    Raises:
        ValueError: If the path falls outside the workspace root, or matches
            a sensitive name while ``allow_sensitive`` is false.
    """
    resolved = resolve_workspace_path(path_value, base_root=base_root)
    root = normalize_base_root(base_root)

    inside_workspace = _is_relative_to(resolved, root)
    if not inside_workspace:
        raise ValueError(f"{purpose} is limited to the workspace root: {root}")

    if not allow_sensitive:
        if is_sensitive_path(resolved):
            raise ValueError(f"{purpose} to sensitive paths is blocked: {resolved}")
    return resolved
170
+
171
+
172
def command_safety_issue(command: str) -> Optional[str]:
    """Return the reason a shell command is refused, or ``None`` if it passes.

    The command is first tested against each regex in
    ``BLOCKED_COMMAND_PATTERNS`` (matched on the text as given), then scanned
    case-insensitively for any substring in ``SENSITIVE_COMMAND_TOKENS``.
    The first hit determines the returned message.
    """
    folded = command.lower()

    for blocked_pattern, explanation in BLOCKED_COMMAND_PATTERNS:
        if blocked_pattern.search(command) is not None:
            return explanation

    offending = [token for token in SENSITIVE_COMMAND_TOKENS if token.lower() in folded]
    if offending:
        token = offending[0]
        return f"access to sensitive path/token '{token}' is blocked"
    return None
181
+
182
+
183
def sanitized_subprocess_env(*, base_root: Optional[Path] = None) -> dict[str, str]:
    """Build an environment mapping for child processes with secrets removed.

    Variables whose upper-cased name is in ``SAFE_ENV_ALWAYS`` are always
    kept. Of the rest, those listed in ``SENSITIVE_ENV_EXACT`` or containing
    any ``SENSITIVE_ENV_MARKERS`` substring are dropped. ``HOME`` and ``PWD``
    are then pointed at the normalized base root, ``TERM``/``LANG`` receive
    defaults if absent, and ``GIT_TERMINAL_PROMPT`` is set to ``"0"``.

    Args:
        base_root: Workspace root override; ``None`` selects the default.

    Returns:
        A new dict; ``os.environ`` itself is not modified.
    """

    def must_drop(variable: str) -> bool:
        # Decide on the upper-cased name so matching is case-insensitive.
        upper = variable.upper()
        if upper in SAFE_ENV_ALWAYS:
            return False
        if upper in SENSITIVE_ENV_EXACT:
            return True
        return any(marker in upper for marker in SENSITIVE_ENV_MARKERS)

    env = {key: value for key, value in os.environ.items() if not must_drop(key)}

    sandbox_home = str(normalize_base_root(base_root))
    env["HOME"] = sandbox_home
    env["PWD"] = sandbox_home
    env.setdefault("TERM", "xterm-256color")
    env.setdefault("LANG", "C.UTF-8")
    env["GIT_TERMINAL_PROMPT"] = "0"
    return env
198
+
199
+
200
+ def _matches_schema_type(value: Any, expected_type: str) -> bool:
201
+ if expected_type == "string":
202
+ return isinstance(value, str)
203
+ if expected_type == "integer":
204
+ return isinstance(value, int) and not isinstance(value, bool)
205
+ if expected_type == "number":
206
+ return (isinstance(value, int) and not isinstance(value, bool)) or isinstance(value, float)
207
+ if expected_type == "boolean":
208
+ return isinstance(value, bool)
209
+ if expected_type == "array":
210
+ return isinstance(value, list)
211
+ if expected_type == "object":
212
+ return isinstance(value, dict)
213
+ return True
214
+
215
+
216
+ def _schema_type_label(type_spec: Any) -> str:
217
+ if isinstance(type_spec, list):
218
+ return " or ".join(str(item) for item in type_spec)
219
+ return str(type_spec)
220
+
221
+
222
def _validate_schema_value(param_name: str, value: Any, schema: dict[str, Any]) -> None:
    """Validate one argument against the subset of JSON schema used by tools.

    Supported keywords are ``type`` (a name or a list of names), and for list
    values ``minItems`` and ``items`` (applied recursively to each element).

    Args:
        param_name: Name used in error messages; elements are reported as
            ``name[index]``.
        value: The decoded argument value.
        schema: The schema fragment for this argument.

    Raises:
        ValueError: If the type does not match or a list is too short.
    """
    declared = schema.get("type")
    if declared is not None:
        candidates = declared if isinstance(declared, list) else [declared]
        for candidate in candidates:
            if _matches_schema_type(value, candidate):
                break
        else:
            raise ValueError(f"Parameter '{param_name}' must be of type {_schema_type_label(declared)}.")

    # The remaining keywords only apply to arrays.
    if not isinstance(value, list):
        return

    minimum = schema.get("minItems")
    if isinstance(minimum, int) and len(value) < minimum:
        raise ValueError(f"Parameter '{param_name}' must contain at least {minimum} item(s).")

    element_schema = schema.get("items")
    if not isinstance(element_schema, dict):
        return
    for position, element in enumerate(value):
        _validate_schema_value(f"{param_name}[{position}]", element, element_schema)
237
+
238
+
239
class ToolBase:
    """Base class for agent tools.

    Subclasses set ``name``, ``description`` and a JSON-schema-like
    ``parameters`` dict, and implement :meth:`call`.
    """

    name: str = ""
    description: str = ""
    parameters: dict[str, Any] = {}

    def __init__(self, cfg: Optional[dict] = None):
        """Store the configuration and check the class-level declarations.

        Args:
            cfg: Optional configuration; a falsy value becomes ``{}``.

        Raises:
            ValueError: If ``name`` is empty or ``parameters`` is not a dict.
        """
        self.cfg = cfg or {}
        owner = self.__class__.__name__
        if not self.name:
            raise ValueError(f"{owner}.name must be set.")
        if not isinstance(self.parameters, dict):
            raise ValueError(f"{owner}.parameters must be a JSON-schema-like dict.")

    def call(self, params: Union[str, dict], **kwargs):
        """Execute the tool; must be overridden by subclasses."""
        raise NotImplementedError

    def parse_json_args(self, params: Union[str, dict], strict_json: bool = False) -> dict:
        """Decode and validate tool arguments against ``parameters``.

        Args:
            params: JSON text or an already-decoded value.
            strict_json: Decode text with ``json`` instead of the more
                lenient ``json5``.

        Returns:
            The decoded argument dict.

        Raises:
            ValueError: If the text cannot be decoded, the result is not a
                dict, a required key is missing, or a value fails its schema.
        """
        if isinstance(params, str):
            loader = json.loads if strict_json else json5.loads
            try:
                decoded = loader(params)
            except (TypeError, ValueError) as exc:
                raise ValueError("Parameters must be formatted as a valid JSON object.") from exc
        else:
            decoded = params

        if not isinstance(decoded, dict):
            raise ValueError("Parameters must decode to a JSON object.")

        for required_key in self.parameters.get("required", []):
            if required_key not in decoded:
                raise ValueError(f"Missing required parameter: {required_key}")

        declared = self.parameters.get("properties", {})
        if isinstance(declared, dict):
            # Only arguments with a declared dict schema are checked; unknown
            # keys pass through untouched.
            for arg_name, arg_value in decoded.items():
                arg_schema = declared.get(arg_name)
                if isinstance(arg_schema, dict):
                    _validate_schema_value(arg_name, arg_value, arg_schema)
        return decoded
281
+
282
+
283
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line helper that prints how workspace paths are resolved.

    Loads the project ``.env`` file, normalizes the workspace root (from
    ``--workspace-root`` or the default) and prints a JSON object with the
    project root, the workspace root and, when ``--path`` is given, that
    path resolved against the workspace root.

    Args:
        argv: Argument list without the program name.

    Returns:
        Always ``0``.
    """
    parser = argparse.ArgumentParser(description="Inspect workspace and path resolution helpers.")
    parser.add_argument("--workspace-root", help="Optional workspace root override for this invocation.")
    parser.add_argument("--path", help="Optional path to resolve inside the workspace.")
    args = parser.parse_args(argv)

    load_dotenv(PROJECT_ROOT / ".env")
    active_root = normalize_workspace_root(args.workspace_root)

    report: dict[str, str] = {
        "project_root": str(PROJECT_ROOT),
        "workspace_root": str(active_root),
    }
    if args.path:
        report["resolved_path"] = str(resolve_workspace_path(args.path, base_root=active_root))

    print(json.dumps(report, ensure_ascii=False, indent=2))
    return 0
299
+
300
+
301
+ if __name__ == "__main__":
302
+ raise SystemExit(main(sys.argv[1:]))
agent_base/trace_utils.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import datetime
3
+ from pathlib import Path
4
+ from typing import Any, Callable, Optional
5
+ from uuid import uuid4
6
+
7
+ from agent_base.utils import append_jsonl, safe_jsonable
8
+
9
+
10
+ TRACE_FIELD_NAMES = [
11
+ "run_id",
12
+ "event_index",
13
+ "turn_index",
14
+ "timestamp",
15
+ "model_name",
16
+ "workspace_root",
17
+ "role",
18
+ "text",
19
+ "tool_call_ids",
20
+ "tool_names",
21
+ "tool_arguments",
22
+ "finish_reason",
23
+ "termination",
24
+ "error",
25
+ "image_paths",
26
+ "capture_type",
27
+ "payload",
28
+ ]
29
+
30
+
31
class FlatTraceWriter:
    """Produce flat, one-dict-per-event trace rows for a single agent run.

    Every instance gets a fresh ``run_id``. Rows are appended to a JSONL file
    when ``trace_dir`` is given and passed to ``on_event`` when a callback is
    supplied; either sink may be absent.
    """

    def __init__(
        self,
        *,
        trace_dir: Optional[str | Path],
        model_name: str,
        workspace_root: str | Path,
        on_event: Optional[Callable[[dict[str, Any]], None]] = None,
    ):
        self.model_name = model_name
        self.workspace_root = str(workspace_root)
        self.on_event = on_event
        self.run_id = uuid4().hex
        self.event_index = 0
        # No trace directory means rows are not written to disk.
        self.path = None
        if trace_dir:
            self.path = resolve_trace_path(trace_dir, run_id=self.run_id)

    def append(
        self,
        *,
        role: str,
        text: str = "",
        turn_index: int = 0,
        tool_call_ids: Optional[list[str]] = None,
        tool_names: Optional[list[str]] = None,
        tool_arguments: Optional[list[Any]] = None,
        finish_reason: Optional[str] = None,
        termination: Optional[str] = None,
        error: Optional[str] = None,
        image_paths: Optional[list[str]] = None,
        capture_type: str = "",
        payload: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        """Record one event and return the row that was emitted.

        ``None``/empty optional fields are normalised to ``""``, ``[]`` or
        ``{}``. ``event_index`` starts at 1 and increases by one per call.
        """
        self.event_index += 1
        stamp = datetime.datetime.now().astimezone().isoformat(timespec="seconds")
        row = {
            "run_id": self.run_id,
            "event_index": self.event_index,
            "turn_index": turn_index,
            "timestamp": stamp,
            "model_name": self.model_name,
            "workspace_root": self.workspace_root,
            "role": role,
            "text": text,
            "tool_call_ids": tool_call_ids or [],
            "tool_names": tool_names or [],
            "tool_arguments": safe_jsonable(tool_arguments or []),
            "finish_reason": finish_reason or "",
            "termination": termination or "",
            "error": error or "",
            "image_paths": image_paths or [],
            "capture_type": capture_type or "",
            "payload": safe_jsonable(payload or {}),
        }
        if self.path is not None:
            append_jsonl(self.path, row)
        callback = self.on_event
        if callback is not None:
            callback(row)
        return row
88
+
89
+
90
def resolve_trace_path(
    trace_dir: str | Path,
    *,
    run_id: str,
    prefix: str = "trace",
    suffix: str = ".jsonl",
) -> Path:
    """Return ``<trace_dir>/<prefix>_<YYYYmmdd_HHMMSS>_<run_id[:12]><suffix>``.

    The timestamp is the current local time; nothing is created on disk.
    """
    stamp = datetime.datetime.now().astimezone().strftime("%Y%m%d_%H%M%S")
    return Path(trace_dir) / f"{prefix}_{stamp}_{run_id[:12]}{suffix}"
102
+
103
+
104
def main(argv: Optional[list[str]] = None) -> int:
    """Print the trace field names, one per line, and return exit code 0."""
    cli = argparse.ArgumentParser(description="Inspect the flat trace field order used by the agent.")
    cli.parse_args(argv)  # no options; still handles --help and rejects stray args
    listing = "\n".join(TRACE_FIELD_NAMES)
    print(listing)
    return 0
109
+
110
+
111
+ if __name__ == "__main__":
112
+ raise SystemExit(main())
agent_base/utils.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import base64
3
+ import json
4
+ import os
5
+ import re
6
+ import shutil
7
+ import shlex
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Any, Iterable, Optional, Union
11
+
12
+
13
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
14
+ _DOTENV_LAST_LOADED: dict[tuple[str, str], str] = {}
15
+ REQUIRED_ENV_VARS = (
16
+ "API_KEY",
17
+ "API_BASE",
18
+ "MODEL_NAME",
19
+ "SERPER_KEY_ID",
20
+ "JINA_API_KEYS",
21
+ "MINERU_TOKEN",
22
+ )
23
+ IMAGE_INPUT_REL_DIR = Path("inputs") / "images"
24
+ MAX_INPUT_IMAGE_BYTES = 25 * 1024 * 1024
25
+ IMAGE_MIME_BY_EXTENSION = {
26
+ ".png": "image/png",
27
+ ".jpg": "image/jpeg",
28
+ ".jpeg": "image/jpeg",
29
+ ".webp": "image/webp",
30
+ ".gif": "image/gif",
31
+ ".bmp": "image/bmp",
32
+ }
33
+
34
+
35
class MissingRequiredEnvError(RuntimeError):
    """Raised when one or more required environment variables are unset or blank."""
37
+
38
+
39
def load_dotenv(path: Union[str, Path]) -> None:
    """Load ``KEY=VALUE`` pairs from a dotenv file into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped; a
    leading ``export `` is tolerated. Values are tokenised with ``shlex`` in
    POSIX mode with ``#`` as the comment character.

    A key is written only when it is absent from the environment or when its
    current value is the one this same file set on a previous load, so values
    set outside the file are not overwritten while edits to the file are
    picked up on reload.

    A missing file is a no-op.
    """
    env_path = Path(path).expanduser()
    if not env_path.exists():
        return
    env_id = str(env_path.resolve())
    # utf-8-sig drops a leading BOM so it cannot end up inside the first key.
    for raw_line in env_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export "):].strip()
        if "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        if not key:
            continue
        parsed_value = ""
        if value:
            lexer = shlex.shlex(value, posix=True)
            lexer.whitespace = ""
            lexer.commenters = "#"
            try:
                parsed_value = "".join(lexer).strip()
            except ValueError:
                # shlex raises on an unbalanced quote or dangling escape.
                # Keep the raw text for this key instead of aborting the
                # whole file load.
                parsed_value = value
        marker = (env_id, key)
        existing = os.environ.get(key)
        previous_loaded = _DOTENV_LAST_LOADED.get(marker)
        if existing is None or existing == previous_loaded:
            os.environ[key] = parsed_value
            _DOTENV_LAST_LOADED[marker] = parsed_value
70
+
71
+
72
def env_flag(name: str) -> bool:
    """Return True when env var *name* is ``1``, ``true``, ``yes`` or ``on`` (case-insensitive)."""
    raw = os.getenv(name, "")
    return raw.lower() in ("1", "true", "yes", "on")
74
+
75
+
76
def missing_required_env(required: tuple[str, ...] = REQUIRED_ENV_VARS) -> list[str]:
    """Return the names in *required* whose env value is unset or whitespace-only, in order."""
    absent: list[str] = []
    for key in required:
        if os.getenv(key, "").strip():
            continue
        absent.append(key)
    return absent
78
+
79
+
80
def require_required_env(context: str = "ResearchHarness") -> None:
    """Raise ``MissingRequiredEnvError`` if any required environment variable is missing.

    *context* is used as the prefix of the error message.
    """
    missing = missing_required_env()
    if missing:
        names = ", ".join(missing)
        raise MissingRequiredEnvError(
            f"{context} missing required environment variables: {names}. "
            "Set them in .env or the process environment before running."
        )
88
+
89
+
90
def read_role_prompt_files(paths: Iterable[str]) -> str:
    """Read the given prompt files and join their stripped contents with blank lines.

    Blank path entries and files that are empty after stripping are skipped.

    Raises:
        ValueError: If a path does not exist or is not a regular file.
    """
    sections: list[str] = []
    for entry in paths:
        cleaned = str(entry).strip()
        if not cleaned:
            continue
        prompt_file = Path(cleaned).expanduser()
        if not prompt_file.exists():
            raise ValueError(f"Role prompt file does not exist: {prompt_file}")
        if not prompt_file.is_file():
            raise ValueError(f"Role prompt path is not a file: {prompt_file}")
        body = prompt_file.read_text(encoding="utf-8").strip()
        if body:
            sections.append(body)
    return "\n\n".join(sections)
103
+
104
+
105
def _safe_image_stem(name: str, fallback: str) -> str:
    """Return a filesystem-friendly stem for *name*, or *fallback* if nothing usable remains.

    Runs of characters outside ``[A-Za-z0-9_.-]`` collapse to one underscore,
    then leading/trailing dots and underscores are trimmed.
    """
    raw_stem = Path(name).stem
    cleaned = re.sub(r"[^A-Za-z0-9_.-]+", "_", raw_stem).strip("._")
    if cleaned:
        return cleaned
    return fallback
108
+
109
+
110
def _unique_image_path(image_dir: Path, *, image_index: int, stem: str, suffix: str) -> Path:
    """Return a not-yet-existing path ``image_<NNN>_<stem>[_<k>]<suffix>`` inside *image_dir*.

    The plain name is tried first; on collision ``_1``, ``_2``, ... are
    appended until a free name is found. Nothing is created on disk.
    """
    prefix = f"image_{image_index:03d}_{stem}"
    candidate = image_dir / f"{prefix}{suffix}"
    attempt = 0
    while candidate.exists():
        attempt += 1
        candidate = image_dir / f"{prefix}_{attempt}{suffix}"
    return candidate
121
+
122
+
123
def image_input_content_parts(data_url: str, saved_path: str, *, detail: str = "auto") -> list[dict[str, Any]]:
    """Return the two content parts for a saved user image: a text note, then the image itself.

    An empty *detail* falls back to ``"auto"``.
    """
    note_part = {"type": "text", "text": f"[User-provided image saved at {saved_path}]"}
    image_part = {"type": "image_url", "image_url": {"url": data_url, "detail": detail or "auto"}}
    return [note_part, image_part]
129
+
130
+
131
def stage_image_bytes_for_input(
    raw: bytes,
    *,
    workspace_root: Union[str, Path],
    filename: str,
    image_index: int,
    suffix: str,
    max_bytes: int = MAX_INPUT_IMAGE_BYTES,
) -> str:
    """Write in-memory image bytes into the workspace image-input directory.

    Returns:
        The saved file's path relative to the resolved workspace root, in
        POSIX form.

    Raises:
        ValueError: If *raw* is empty, larger than *max_bytes*, or *suffix*
            (compared case-insensitively) is not a supported image extension.
    """
    if not raw:
        raise ValueError("image input is empty")
    if len(raw) > max_bytes:
        raise ValueError(f"image input exceeds {max_bytes} bytes")
    extension = suffix.lower()
    if extension not in IMAGE_MIME_BY_EXTENSION:
        raise ValueError(f"unsupported image extension: {suffix}")

    root = Path(workspace_root).expanduser().resolve()
    target_dir = root / IMAGE_INPUT_REL_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    fallback_stem = f"image_{image_index:03d}"
    target = _unique_image_path(
        target_dir,
        image_index=image_index,
        stem=_safe_image_stem(filename, fallback_stem),
        suffix=extension,
    )
    target.write_bytes(raw)
    return target.relative_to(root).as_posix()
154
+
155
+
156
def stage_image_file_for_input(
    source_path: Union[str, Path],
    *,
    workspace_root: Union[str, Path],
    image_index: int,
    max_bytes: int = MAX_INPUT_IMAGE_BYTES,
) -> tuple[str, str]:
    """Copy an image file into the workspace image-input directory.

    Relative source paths are resolved against the current working directory.

    Returns:
        ``(rel_path, data_url)``: the copy's workspace-relative POSIX path and
        a base64 ``data:`` URL built from the copied bytes.

    Raises:
        ValueError: If the source is missing, not a file, has an unsupported
            extension, is empty, or exceeds *max_bytes*.
    """
    given = Path(source_path).expanduser()
    source = (given if given.is_absolute() else Path.cwd() / given).resolve()
    if not source.exists():
        raise ValueError(f"image path does not exist: {source}")
    if not source.is_file():
        raise ValueError(f"image path is not a file: {source}")

    extension = source.suffix.lower()
    mime_type = IMAGE_MIME_BY_EXTENSION.get(extension)
    if mime_type is None:
        raise ValueError(f"unsupported image extension for {source}; expected one of {', '.join(sorted(IMAGE_MIME_BY_EXTENSION))}")

    byte_count = source.stat().st_size
    if byte_count <= 0:
        raise ValueError(f"image file is empty: {source}")
    if byte_count > max_bytes:
        raise ValueError(f"image file exceeds {max_bytes} bytes: {source}")

    root = Path(workspace_root).expanduser().resolve()
    target_dir = root / IMAGE_INPUT_REL_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    fallback_stem = f"image_{image_index:03d}"
    target = _unique_image_path(
        target_dir,
        image_index=image_index,
        stem=_safe_image_stem(source.name, fallback_stem),
        suffix=extension,
    )
    shutil.copyfile(source, target)
    encoded = base64.b64encode(target.read_bytes()).decode("ascii")
    return target.relative_to(root).as_posix(), f"data:{mime_type};base64," + encoded
190
+
191
+
192
def append_saved_image_paths_to_prompt(prompt: str, saved_paths: Iterable[str]) -> str:
    """Append a note listing saved image paths to *prompt*.

    Blank entries are ignored. When no usable path remains the prompt is
    returned unchanged (not stripped); otherwise the prompt is stripped and
    followed by a bulleted list of paths and a usage hint.
    """
    cleaned: list[str] = []
    for entry in saved_paths:
        text = str(entry).strip()
        if text:
            cleaned.append(text)
    if not cleaned:
        return prompt

    bullets = "\n".join(f"- {item}" for item in cleaned)
    pieces = [
        f"{prompt.strip()}\n\n",
        "The user attached image input. The images are saved locally inside the workspace:\n",
        f"{bullets}\n",
        "Use the direct image input when the model supports vision. If tool-based inspection is needed, use ReadImage on the saved local paths.",
    ]
    return "".join(pieces)
203
+
204
+
205
def safe_jsonable(value: Any) -> Any:
    """Recursively convert *value* into something ``json.dumps`` accepts.

    Strings, numbers, booleans and ``None`` pass through; dict keys are
    stringified; lists and tuples become lists; anything else becomes
    ``str(value)``.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, dict):
        converted: dict[str, Any] = {}
        for key, item in value.items():
            converted[str(key)] = safe_jsonable(item)
        return converted
    if isinstance(value, (list, tuple)):
        return list(map(safe_jsonable, value))
    return str(value)
213
+
214
+
215
def append_jsonl(path: Union[str, Path], record: dict[str, Any]) -> None:
    """Append *record* as one JSON line to *path*, creating parent directories as needed.

    Non-ASCII characters are written as-is (UTF-8), not escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("a", encoding="utf-8") as handle:
        serialized = json.dumps(record, ensure_ascii=False)
        handle.write(serialized + "\n")
220
+
221
+
222
def read_text_lossy(path: Union[str, Path]) -> str:
    """Read *path* as UTF-8; on a decode error, re-read with undecodable bytes replaced."""
    source = Path(path)
    try:
        return source.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        pass
    return source.read_text(encoding="utf-8", errors="replace")
228
+
229
+
230
def main(argv: Optional[list[str]] = None) -> int:
    """Optionally load a dotenv file, then print a small JSON summary; returns 0."""
    cli = argparse.ArgumentParser(description="Inspect shared agent_base utilities.")
    cli.add_argument("--dotenv", help="Optional dotenv path to load before printing the summary.")
    options = cli.parse_args(argv)

    dotenv_requested = bool(options.dotenv)
    if dotenv_requested:
        load_dotenv(options.dotenv)

    summary = {"project_root": str(PROJECT_ROOT), "dotenv_loaded": dotenv_requested}
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
244
+
245
+
246
+ if __name__ == "__main__":
247
+ raise SystemExit(main(sys.argv[1:]))
api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """OpenAI-compatible API helpers for ResearchHarness."""
api/openai_server.py ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import binascii
5
+ import datetime
6
+ import json
7
+ import re
8
+ import time
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+ from uuid import uuid4
13
+
14
+ import uvicorn
15
+ from fastapi import Body, FastAPI, Request
16
+ from fastapi.responses import JSONResponse
17
+
18
+ from agent_base.react_agent import (
19
+ AVAILABLE_TOOL_MAP,
20
+ MultiTurnReactAgent,
21
+ assistant_text_content,
22
+ default_llm_config,
23
+ model_supports_runtime_image_parts,
24
+ )
25
+ from agent_base.tools.tooling import normalize_workspace_root
26
+ from agent_base.utils import append_jsonl, image_input_content_parts, read_role_prompt_files, safe_jsonable
27
+
28
+
29
+ DATA_IMAGE_RE = re.compile(r"^data:(image/[A-Za-z0-9.+-]+);base64,(.*)$", re.DOTALL)
30
+ IMAGE_EXTENSIONS = {
31
+ "image/png": ".png",
32
+ "image/jpeg": ".jpg",
33
+ "image/jpg": ".jpg",
34
+ "image/webp": ".webp",
35
+ "image/gif": ".gif",
36
+ }
37
+ DEFAULT_MAX_IMAGE_BYTES = 25 * 1024 * 1024
38
+
39
+ INPUT_WRAPPER_SYSTEM_PROMPT = """You are the ResearchHarness input wrapper.
40
+
41
+ Convert the user's OpenAI-compatible chat request into a stable task for a
42
+ tool-using ResearchHarness agent.
43
+
44
+ Return only a JSON object with these string fields:
45
+ - agent_instruction: the task the agent should solve, including all substantive question details.
46
+ - output_contract: the final output format or schema requested by the user. If no strict format is requested, say "plain text".
47
+ - wrapper_notes: brief notes about images, constraints, or benchmark-specific requirements.
48
+
49
+ Rules:
50
+ - Do not answer the task.
51
+ - Do not remove substantive constraints.
52
+ - Keep strict final formatting requirements out of agent_instruction when possible.
53
+ - If images are listed, mention their saved paths in agent_instruction.
54
+ """
55
+
56
+ OUTPUT_WRAPPER_SYSTEM_PROMPT = """You are the ResearchHarness output wrapper.
57
+
58
+ Format the ResearchHarness agent result so it satisfies the user's requested
59
+ final output contract.
60
+
61
+ Rules:
62
+ - Return only the final answer requested by the user.
63
+ - Do not add markdown fences unless the user explicitly required them.
64
+ - Do not solve the task again.
65
+ - Do not introduce facts not present in the agent result.
66
+ - Make the answer complete and self-contained for a remote user or evaluator.
67
+ - The answer may mention workspace files when useful, but it must not depend on
68
+ local files as the only carrier of the answer.
69
+ - Include the actual answer and any necessary evidence or solution steps in the
70
+ returned text.
71
+ - If reasoning or evidence is required, summarize it directly in the final
72
+ answer according to the requested format.
73
+ - If the requested format is JSON, return valid JSON only.
74
+ - If the agent result does not contain enough information, produce the best
75
+ contract-compliant failure answer instead of inventing evidence.
76
+ """
77
+
78
+
79
class OpenAICompatError(Exception):
    """Error carrying an HTTP status code plus an OpenAI-style error message and type."""

    def __init__(self, status_code: int, message: str, error_type: str = "invalid_request_error"):
        super().__init__(message)
        self.status_code, self.message, self.error_type = status_code, message, error_type
85
+
86
+
87
@dataclass
class ServerConfig:
    """Settings for the OpenAI-compatible API server."""

    # Directory under which each request gets its own run folder.
    api_runs_dir: Path
    # Extra role prompt text handed to the agent; empty means none.
    role_prompt: str = ""
    host: str = "127.0.0.1"
    port: int = 8686
    # Toggle the LLM-based input / output wrapper stages.
    input_wrapper: bool = True
    output_wrapper: bool = True
95
+
96
+
97
@dataclass
class PreparedInput:
    """Normalised form of an incoming chat request."""

    # Text-only copy of each message (role + flattened content).
    wrapper_messages: list[dict[str, str]]
    # Content parts (text note + image_url) for each attached image.
    initial_content_parts: list[dict[str, Any]]
    # Workspace-relative paths of the images saved to disk.
    image_paths: list[str]
102
+
103
+
104
def openai_error_response(exc: OpenAICompatError) -> JSONResponse:
    """Render *exc* as a JSON response shaped like ``{"error": {"message", "type"}}``."""
    body = {"error": {"message": exc.message, "type": exc.error_type}}
    return JSONResponse(status_code=exc.status_code, content=body)
109
+
110
+
111
def make_chat_completion_response(*, request_id: str, model: str, content: str) -> dict[str, Any]:
    """Build a single-choice ``chat.completion`` response dict.

    ``created`` is the current Unix time in whole seconds; the sole choice
    always has ``finish_reason`` set to ``"stop"``.
    """
    choice = {
        "index": 0,
        "message": {"role": "assistant", "content": content},
        "finish_reason": "stop",
    }
    return {
        "id": request_id,
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [choice],
    }
125
+
126
+
127
def validate_chat_payload(payload: Any) -> dict[str, Any]:
    """Validate the top-level shape of a chat-completions request body.

    Returns:
        The same *payload* object, unchanged, when it is acceptable.

    Raises:
        OpenAICompatError: (HTTP 400) if the body is not an object, requests
            streaming, asks for ``n`` other than 1, lacks a non-empty string
            ``model``, or lacks a non-empty ``messages`` list.
    """
    if not isinstance(payload, dict):
        raise OpenAICompatError(400, "Request body must be a JSON object.")
    if payload.get("stream") is True:
        raise OpenAICompatError(400, "Streaming is not supported by this synchronous endpoint.")
    try:
        n_value = int(payload.get("n", 1) or 1)
    except (TypeError, ValueError) as exc:
        raise OpenAICompatError(400, "n must be an integer.") from exc
    if n_value != 1:
        raise OpenAICompatError(400, "Only n=1 is supported.")
    # Check the type before stripping: str(None) is "None", which would
    # otherwise let a null model through as a valid name.
    model = payload.get("model")
    if not isinstance(model, str) or not model.strip():
        raise OpenAICompatError(400, "model is required.")
    messages = payload.get("messages")
    if not isinstance(messages, list) or not messages:
        raise OpenAICompatError(400, "messages must be a non-empty list.")
    return payload
145
+
146
+
147
def prepare_openai_input(messages: list[Any], workspace_root: Path) -> PreparedInput:
    """Flatten chat messages to text and save inline images into the workspace.

    Each message must be an object with role ``system``, ``user`` or
    ``assistant`` and a content that is a string or a list of ``text`` /
    ``image_url`` parts. Images are written under ``inputs/images`` via
    ``save_data_image`` and replaced in the text by an ``[image saved at ...]``
    marker.

    Raises:
        OpenAICompatError: (HTTP 400) on any unsupported message or part shape.
    """
    wrapper_messages: list[dict[str, str]] = []
    initial_content_parts: list[dict[str, Any]] = []
    image_paths: list[str] = []
    image_dir = workspace_root / "inputs" / "images"

    for message in messages:
        if not isinstance(message, dict):
            raise OpenAICompatError(400, "Each message must be an object.")
        role = str(message.get("role", "")).strip()
        if role not in {"system", "user", "assistant"}:
            raise OpenAICompatError(400, f"Unsupported message role: {role!r}.")

        content = message.get("content", "")
        if isinstance(content, str):
            fragments: list[str] = [content]
        elif isinstance(content, list):
            fragments = []
            for part in content:
                if not isinstance(part, dict):
                    raise OpenAICompatError(400, "Multimodal content parts must be objects.")
                part_type = str(part.get("type", "")).strip()
                if part_type == "text":
                    fragments.append(str(part.get("text", "")))
                    continue
                if part_type != "image_url":
                    raise OpenAICompatError(400, f"Unsupported content part type: {part_type!r}.")
                image_url = part.get("image_url")
                if not isinstance(image_url, dict):
                    raise OpenAICompatError(400, "image_url content must contain an image_url object.")
                url = str(image_url.get("url", "")).strip()
                detail = str(image_url.get("detail", "auto") or "auto")
                # Images are numbered across the whole request, so the count
                # of images saved so far is the next index.
                rel_path = save_data_image(
                    url,
                    workspace_root=workspace_root,
                    image_dir=image_dir,
                    image_index=len(image_paths),
                )
                image_paths.append(rel_path)
                fragments.append(f"[image saved at {rel_path}]")
                initial_content_parts.extend(image_input_content_parts(url, rel_path, detail=detail))
        else:
            raise OpenAICompatError(400, "message content must be a string or a list of content parts.")

        flattened = "\n".join(fragment for fragment in fragments if fragment)
        wrapper_messages.append({"role": role, "content": flattened})

    return PreparedInput(
        wrapper_messages=wrapper_messages,
        initial_content_parts=initial_content_parts,
        image_paths=image_paths,
    )
198
+
199
+
200
def save_data_image(url: str, *, workspace_root: Path, image_dir: Path, image_index: int) -> str:
    """Decode a base64 ``data:image/...`` URL and write it to *image_dir*.

    The file is named ``image_<NNN><ext>``, where the extension is looked up
    from the MIME type.

    Returns:
        The written file's path relative to *workspace_root*, in POSIX form.

    Raises:
        OpenAICompatError: (HTTP 400) if *url* is not a base64 image data URL,
            the MIME type is unsupported, the base64 is invalid, the decoded
            image is empty, or it exceeds the size limit.
    """
    match = DATA_IMAGE_RE.match(url)
    if not match:
        raise OpenAICompatError(
            400,
            "Only data:image/...;base64,... image_url inputs are supported in the first API version.",
        )
    mime_type = match.group(1).lower()
    extension = IMAGE_EXTENSIONS.get(mime_type)
    if extension is None:
        raise OpenAICompatError(400, f"Unsupported image MIME type: {mime_type}.")
    try:
        image_bytes = base64.b64decode(match.group(2), validate=True)
    except (binascii.Error, ValueError) as exc:
        raise OpenAICompatError(400, "Invalid base64 image data.") from exc
    # An empty payload decodes cleanly; reject it so no zero-byte image file
    # is written to the workspace.
    if not image_bytes:
        raise OpenAICompatError(400, "Image data is empty.")
    if len(image_bytes) > DEFAULT_MAX_IMAGE_BYTES:
        raise OpenAICompatError(400, f"Image exceeds the {DEFAULT_MAX_IMAGE_BYTES} byte limit.")
    image_dir.mkdir(parents=True, exist_ok=True)
    filename = f"image_{image_index:03d}{extension}"
    path = image_dir / filename
    path.write_bytes(image_bytes)
    return path.relative_to(workspace_root).as_posix()
222
+
223
+
224
def wrapper_request_payload(*, prepared: PreparedInput, payload: dict[str, Any]) -> dict[str, Any]:
    """Assemble the dict that is serialised as the input wrapper's user message."""
    request: dict[str, Any] = {}
    request["messages"] = prepared.wrapper_messages
    request["saved_image_paths"] = prepared.image_paths
    request["response_format"] = safe_jsonable(payload.get("response_format"))
    request["requested_model_label"] = str(payload.get("model", ""))
    return request
231
+
232
+
233
def build_input_wrapper_messages(*, prepared: PreparedInput, payload: dict[str, Any]) -> list[dict[str, str]]:
    """Return the system + user messages for the input wrapper call.

    The user message is the pretty-printed JSON of ``wrapper_request_payload``.
    """
    request = wrapper_request_payload(prepared=prepared, payload=payload)
    user_content = json.dumps(request, ensure_ascii=False, indent=2)
    system_message = {"role": "system", "content": INPUT_WRAPPER_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]
241
+
242
+
243
def build_passthrough_input_plan(*, prepared: PreparedInput, payload: dict[str, Any]) -> dict[str, str]:
    """Build an input plan without calling the input wrapper model.

    The conversation is rendered as ``ROLE:\\ncontent`` blocks separated by
    blank lines. A ``response_format`` in the payload, if present, is appended
    to the output contract as pretty-printed JSON.
    """
    turns = [f"{message['role'].upper()}:\n{message['content']}" for message in prepared.wrapper_messages]
    conversation = "\n\n".join(turns).strip()

    contract = "Follow the final answer requirements in the original request."
    response_format = payload.get("response_format")
    if response_format is not None:
        rendered = json.dumps(safe_jsonable(response_format), ensure_ascii=False, indent=2)
        contract = contract + "\nOpenAI response_format request:\n" + rendered

    plan = {
        "agent_instruction": conversation if conversation else "Answer the user's request.",
        "output_contract": contract,
        "wrapper_notes": "Input wrapper disabled; the original normalized conversation was passed through directly.",
    }
    return plan
260
+
261
+
262
def build_agent_prompt(input_plan: dict[str, Any], prepared: PreparedInput) -> str:
    """Compose the prompt text passed to the agent run from the input plan and saved images.

    Missing plan fields fall back to ``""`` (instruction, notes) or
    ``"plain text"`` (output contract); with no images the list reads ``- none``.
    """
    if prepared.image_paths:
        image_block = "\n".join(f"- {path}" for path in prepared.image_paths)
    else:
        image_block = "- none"
    instruction = str(input_plan.get("agent_instruction", "")).strip()
    contract = str(input_plan.get("output_contract", "plain text")).strip()
    notes = str(input_plan.get("wrapper_notes", "")).strip()

    sections = [
        "You are solving a user request through ResearchHarness.\n\n",
        "Task for the agent:\n",
        f"{instruction}\n\n",
        "User-provided images saved in this workspace:\n",
        f"{image_block}\n\n",
        "The original image content is attached to the initial user message when the backend model supports image parts. ",
        "The same images are also saved at the paths above so you may call ReadImage when visual inspection is needed.\n\n",
        "Do not optimize your tool-use loop for the final output schema. Solve the task completely, then finish with a complete, ",
        "self-contained internal final text that includes the actual answer, the evidence used, and any concise reasoning needed to understand it. ",
        "You may mention files you created or inspected, but the internal final text must not depend on local files as the only carrier of the answer.\n\n",
        "Final output contract that will be enforced by a formatter after your run:\n",
        f"{contract}\n\n",
        "Wrapper notes:\n",
        f"{notes}",
    ]
    return "".join(sections)
280
+
281
+
282
def build_output_wrapper_messages(
    *,
    prepared: PreparedInput,
    payload: dict[str, Any],
    input_plan: dict[str, Any],
    agent_result_text: str,
) -> list[dict[str, str]]:
    """Return the system + user messages for the output wrapper call.

    The user message is pretty-printed JSON holding the original messages,
    saved image paths, output contract, ``response_format`` and agent result.
    """
    contract = str(input_plan.get("output_contract", "plain text"))
    formatter_input = {
        "original_messages": prepared.wrapper_messages,
        "saved_image_paths": prepared.image_paths,
        "output_contract": contract,
        "response_format": safe_jsonable(payload.get("response_format")),
        "agent_result_text": agent_result_text,
    }
    system_message = {"role": "system", "content": OUTPUT_WRAPPER_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": json.dumps(formatter_input, ensure_ascii=False, indent=2)}
    return [system_message, user_message]
300
+
301
+
302
def extract_json_object(text: str) -> dict[str, Any]:
    """Parse the input wrapper's reply into a plan dict.

    A leading/trailing Markdown code fence is removed first. If the whole
    text is not valid JSON, the span from the first ``{`` to the last ``}``
    is tried instead.

    Returns:
        The parsed dict, with ``output_contract`` defaulting to
        ``"plain text"`` and ``wrapper_notes`` defaulting to ``""``.

    Raises:
        OpenAICompatError: (HTTP 500) if no JSON object can be parsed, the
            result is not an object, or ``agent_instruction`` is blank.
    """
    candidate = text.strip()
    if candidate.startswith("```"):
        candidate = re.sub(r"^```(?:json)?\s*", "", candidate, flags=re.IGNORECASE)
        candidate = re.sub(r"\s*```$", "", candidate)

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Fall back to the outermost brace-delimited span, which tolerates
        # prose before or after the object.
        first_brace = candidate.find("{")
        last_brace = candidate.rfind("}")
        if first_brace < 0 or last_brace <= first_brace:
            raise OpenAICompatError(500, "Input wrapper did not return a JSON object.", "server_error") from None
        try:
            parsed = json.loads(candidate[first_brace : last_brace + 1])
        except json.JSONDecodeError as exc:
            raise OpenAICompatError(500, f"Input wrapper returned invalid JSON: {exc}", "server_error") from exc

    if not isinstance(parsed, dict):
        raise OpenAICompatError(500, "Input wrapper JSON must be an object.", "server_error")
    instruction = str(parsed.get("agent_instruction", "")).strip()
    if not instruction:
        raise OpenAICompatError(500, "Input wrapper JSON missing agent_instruction.", "server_error")
    contract = str(parsed.get("output_contract", "")).strip()
    if not contract:
        parsed["output_contract"] = "plain text"
    parsed.setdefault("wrapper_notes", "")
    return parsed
326
+
327
+
328
def call_wrapper_text(
    agent: MultiTurnReactAgent,
    messages: list[dict[str, str]],
    *,
    max_output_tokens: Optional[int] = None,
) -> str:
    """Send *messages* through ``agent.call_compaction_api`` and return the stripped reply text.

    Raises:
        OpenAICompatError: (HTTP 500) if the response is not a dict, reports
            ``status == "error"``, or yields empty text.
    """
    response = agent.call_compaction_api(messages, max_output_tokens=max_output_tokens)
    if not isinstance(response, dict):
        raise OpenAICompatError(500, str(response), "server_error")
    if response.get("status") == "error":
        raise OpenAICompatError(500, response.get("error", "unknown wrapper error"), "server_error")

    reply = assistant_text_content(response.get("content")).strip()
    if reply:
        return reply
    raise OpenAICompatError(500, "Wrapper returned empty content.", "server_error")
342
+
343
+
344
def final_max_tokens(payload: dict[str, Any]) -> Optional[int]:
    """Return the requested output-token limit, or ``None`` when none was given.

    ``max_tokens`` takes precedence; ``max_completion_tokens`` is used when
    ``max_tokens`` is absent or explicitly null.

    Raises:
        OpenAICompatError: (HTTP 400) if the value is not an integer or is
            not positive.
    """
    raw_value = payload.get("max_tokens")
    if raw_value is None:
        # A key that is present but null must not hide the alternative
        # field, so test the value rather than key presence.
        raw_value = payload.get("max_completion_tokens")
    if raw_value is None:
        return None
    try:
        value = int(raw_value)
    except (TypeError, ValueError) as exc:
        raise OpenAICompatError(400, "max_tokens must be an integer.") from exc
    if value <= 0:
        raise OpenAICompatError(400, "max_tokens must be positive.")
    return value
355
+
356
+
357
def append_api_event(trace_dir: Path, event: str, payload: dict[str, Any]) -> None:
    """Append one timestamped API-level event to ``<trace_dir>/api_trace.jsonl``."""
    record = {
        "timestamp": int(time.time()),
        "event": event,
        "payload": safe_jsonable(payload),
    }
    append_jsonl(trace_dir / "api_trace.jsonl", record)
366
+
367
+
368
def run_chat_completion(payload: dict[str, Any], config: ServerConfig) -> dict[str, Any]:
    """Serve one chat-completions request end to end and return the response dict.

    Stages: validate the payload, create a fresh run directory, save inline
    images, optionally run the input wrapper, run the agent session,
    optionally run the output wrapper, then build the response. Each stage
    appends an event to the run's API trace.

    Raises:
        OpenAICompatError: For invalid requests (400), including image input
            when the backend model does not support image parts, and for
            wrapper failures (500).
    """
    payload = validate_chat_payload(payload)
    request_id = "chatcmpl_" + uuid4().hex

    # Every request works in its own directory tree; exist_ok=False makes a
    # run-id collision fail instead of reusing another run's folders.
    stamp = datetime.datetime.now().astimezone().strftime("%Y%m%d_%H%M%S")
    run_id = "run_" + stamp + "_" + uuid4().hex[:8]
    run_root = config.api_runs_dir / run_id
    agent_workspace = run_root / "agent_workspace"
    trace_dir = run_root / "agent_trace"
    for directory in (agent_workspace, trace_dir):
        directory.mkdir(parents=True, exist_ok=False)

    prepared = prepare_openai_input(payload["messages"], agent_workspace)
    llm_config = default_llm_config()
    backend_model = str(llm_config.get("model", ""))
    if prepared.initial_content_parts and not model_supports_runtime_image_parts(backend_model):
        raise OpenAICompatError(
            400,
            f"Backend model {backend_model!r} does not support image content parts.",
        )

    # AskUser is left out of the tool list for API runs.
    tool_names = [name for name in AVAILABLE_TOOL_MAP if name != "AskUser"]
    agent = MultiTurnReactAgent(
        function_list=tool_names,
        llm=llm_config,
        trace_dir=str(trace_dir),
        role_prompt=config.role_prompt or None,
    )

    # Stage 1: derive the input plan (via the wrapper model or passthrough).
    if config.input_wrapper:
        wrapper_request = build_input_wrapper_messages(prepared=prepared, payload=payload)
        wrapper_reply = call_wrapper_text(agent, wrapper_request, max_output_tokens=1200)
        input_plan = extract_json_object(wrapper_reply)
        input_event: dict[str, Any] = {
            "enabled": True,
            "request": wrapper_request,
            "response_text": wrapper_reply,
            "input_plan": input_plan,
        }
    else:
        input_plan = build_passthrough_input_plan(prepared=prepared, payload=payload)
        input_event = {
            "enabled": False,
            "input_plan": input_plan,
        }
    append_api_event(trace_dir, "input_wrapper", input_event)

    # Stage 2: run the agent session inside the per-request workspace.
    agent_prompt = build_agent_prompt(input_plan, prepared)
    session = agent._run_session(
        agent_prompt,
        workspace_root=str(agent_workspace),
        initial_content_parts=prepared.initial_content_parts or None,
    )
    agent_result_text = str(session.get("result_text", "")).strip()
    append_api_event(
        trace_dir,
        "agent_result",
        {
            "termination": session.get("termination", ""),
            "result_text": agent_result_text,
            "trace_path": session.get("trace_path", ""),
        },
    )

    # Stage 3: format the result (via the wrapper model or passthrough).
    if config.output_wrapper:
        formatter_request = build_output_wrapper_messages(
            prepared=prepared,
            payload=payload,
            input_plan=input_plan,
            agent_result_text=agent_result_text,
        )
        final_text = call_wrapper_text(agent, formatter_request, max_output_tokens=final_max_tokens(payload))
        output_event: dict[str, Any] = {
            "enabled": True,
            "request": formatter_request,
            "response_text": final_text,
        }
    else:
        final_text = agent_result_text
        output_event = {
            "enabled": False,
            "response_text": final_text,
        }
    append_api_event(trace_dir, "output_wrapper", output_event)

    return make_chat_completion_response(
        request_id=request_id,
        model=str(payload.get("model", "researchharness")),
        content=final_text,
    )
468
+
469
+
470
def create_app(config: ServerConfig) -> FastAPI:
    """Build the FastAPI application exposing the health and chat-completions routes.

    ``OpenAICompatError`` is rendered through ``openai_error_response``; any
    other exception raised while handling a chat request is re-raised as a
    500 ``server_error`` ``OpenAICompatError``.
    """
    app = FastAPI(title="ResearchHarness OpenAI-Compatible API", version="1.0")

    @app.exception_handler(OpenAICompatError)
    async def _handle_openai_compat_error(request: Request, exc: OpenAICompatError) -> JSONResponse:
        return openai_error_response(exc)

    @app.get("/v1/health")
    async def health() -> dict[str, Any]:
        status: dict[str, Any] = {"status": "ok"}
        status["api_runs_dir"] = str(config.api_runs_dir)
        status["input_wrapper"] = config.input_wrapper
        status["output_wrapper"] = config.output_wrapper
        return status

    @app.post("/v1/chat/completions")
    async def chat_completions(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
        # NOTE(review): run_chat_completion is a synchronous call made from an
        # async handler, so it appears to occupy the event loop for the whole
        # run. Confirm whether moving it to a worker thread is safe for the
        # agent stack before changing it.
        try:
            result = run_chat_completion(payload, config)
        except OpenAICompatError:
            raise
        except Exception as exc:
            raise OpenAICompatError(500, f"ResearchHarness API error: {exc}", "server_error") from exc
        return result

    return app
496
+
497
+
498
def serve(
    *,
    api_runs_dir: str,
    host: str = "127.0.0.1",
    port: int = 8686,
    role_prompt_files: Optional[list[str]] = None,
    input_wrapper: bool = True,
    output_wrapper: bool = True,
) -> None:
    """Build the server configuration and run the API with uvicorn.

    *api_runs_dir* is passed through ``normalize_workspace_root``; the role
    prompt is read from *role_prompt_files* (none when omitted).
    """
    runs_root = normalize_workspace_root(api_runs_dir)
    prompt_text = read_role_prompt_files(role_prompt_files or [])
    settings = ServerConfig(
        api_runs_dir=runs_root,
        role_prompt=prompt_text,
        host=host,
        port=port,
        input_wrapper=input_wrapper,
        output_wrapper=output_wrapper,
    )
    uvicorn.run(create_app(settings), host=host, port=port)
api_runs/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hugging Face Space entrypoint for ResearchHarness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import uvicorn
9
+
10
+ from agent_base.utils import read_role_prompt_files
11
+ from frontend.local_server import app, configure_frontend
12
+
13
+
14
def _int_env(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or ``default`` when no value is configured.

    Raises:
        ValueError: If the variable holds text that ``int`` cannot parse.
    """
    text = os.getenv(name, "").strip()
    if text:
        try:
            return int(text)
        except ValueError as exc:
            # Re-raise with the variable name so misconfiguration is obvious.
            raise ValueError(f"{name} must be an integer, got {text!r}") from exc
    return default
22
+
23
+
24
def _role_prompt_files() -> list[str]:
    """Return the role prompt file paths listed in ``RH_ROLE_PROMPT_FILES``.

    The variable holds paths separated by ``os.pathsep``. Whitespace around
    each entry is removed and blank entries are dropped, so a value such as
    ``"a.md : b.md"`` yields clean paths instead of names with stray spaces.

    Returns:
        The configured paths in order, or an empty list when the variable is
        unset or contains no usable entry.
    """
    raw = os.getenv("RH_ROLE_PROMPT_FILES", "")
    stripped_entries = (entry.strip() for entry in raw.split(os.pathsep))
    return [entry for entry in stripped_entries if entry]
29
+
30
+
31
def configure_space() -> None:
    """Configure the shared frontend from ``RH_SPACE_*`` environment variables.

    Reads the managed runs directory (``RH_SPACE_RUNS_DIR``), the role prompt
    files, and the cleanup settings (retention seconds, maximum runs, cleanup
    interval), then passes them to ``configure_frontend``.
    """
    default_runs_dir = "/tmp/researchharness_space/runs"
    six_hours = 6 * 60 * 60
    fifteen_minutes = 15 * 60

    # Resolve the runs directory first, before any prompt file is read.
    runs_dir = Path(os.getenv("RH_SPACE_RUNS_DIR", default_runs_dir)).expanduser()
    configure_frontend(
        role_prompt=read_role_prompt_files(_role_prompt_files()),
        managed_runs_dir=str(runs_dir),
        cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", six_hours),
        cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
        cleanup_interval_seconds=_int_env("RH_SPACE_CLEANUP_INTERVAL_SECONDS", fifteen_minutes),
    )


# Runs at import time, presumably so ``app`` is already configured when a
# server imports this module instead of executing it as a script.
configure_space()
44
+
45
+
46
def main() -> int:
    """Serve the frontend app with uvicorn and return the process exit code.

    The bind address comes from ``HOST`` (default ``0.0.0.0``) and the port
    from ``PORT`` (default ``7860``). Auto-reload is disabled.
    """
    bind_host = os.getenv("HOST", "0.0.0.0")
    bind_port = _int_env("PORT", 7860)
    uvicorn.run(app, host=bind_host, port=bind_port, reload=False)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
benchmarks/QA/README.md ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QA / VQA Benchmarks
2
+
3
+ This directory documents the lightweight ResearchHarness contract for
4
+ question-answering benchmarks, including plain-text QA and multimodal VQA-style
5
+ tasks.
6
+
7
+ The recommended integration is the OpenAI-compatible synchronous API server:
8
+
9
+ ```bash
10
+ python3 /abs/path/to/ResearchHarness/run_server.py \
11
+ --api-runs-dir ./api_runs
12
+ ```
13
+
14
+ For QA/VQA benchmark runs, optionally add this benchmark role overlay:
15
+
16
+ ```bash
17
+ python3 /abs/path/to/ResearchHarness/run_server.py \
18
+ --api-runs-dir ./api_runs \
19
+ --role-prompt-file /abs/path/to/ResearchHarness/benchmarks/QA/role_prompt.md
20
+ ```
21
+
22
+ Each request creates a fresh run directory:
23
+
24
+ ```text
25
+ ./api_runs/
26
+ `-- run_YYYYMMDD_HHMMSS_<random>/
27
+ |-- agent_workspace/ # visible to the agent
28
+ | `-- inputs/
29
+ | `-- images/ # user-provided images, when present
30
+ `-- agent_trace/ # server-side trace and session state
31
+ |-- api_trace.jsonl
32
+ |-- trace_*.jsonl
33
+ `-- _session_state.json
34
+ ```
35
+
36
+ The input and output LLM wrappers are enabled by default:
37
+
38
+ - `--input-wrapper` / `--no-input-wrapper` controls the input normalization pass.
39
+ - `--output-wrapper` / `--no-output-wrapper` controls the final answer formatting pass.
40
+
41
+ Strict-format benchmarks should usually keep both wrappers enabled. To return
42
+ the agent's direct final text instead, run:
43
+
44
+ ```bash
45
+ python3 /abs/path/to/ResearchHarness/run_server.py \
46
+ --api-runs-dir ./api_runs \
47
+ --no-input-wrapper \
48
+ --no-output-wrapper
49
+ ```
50
+
51
+ External benchmark runners can then use the regular OpenAI SDK with:
52
+
53
+ ```python
54
+ from openai import OpenAI
55
+
56
+ client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
57
+
58
+ response = client.chat.completions.create(
59
+ model="researchharness",
60
+ messages=[{"role": "user", "content": "Answer the question."}],
61
+ )
62
+
63
+ answer = response.choices[0].message.content
64
+ ```
65
+
66
+ ## Multimodal Input
67
+
68
+ For image benchmarks, send OpenAI-style content parts. The first API version
69
+ supports one or more `data:image/...;base64,...` URLs in the same request.
70
+
71
+ ```python
72
+ response = client.chat.completions.create(
73
+ model="researchharness",
74
+ messages=[
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {"type": "text", "text": "What is shown? Return JSON with key answer."},
79
+ {"type": "image_url", "image_url": {"url": data_url}},
80
+ ],
81
+ }
82
+ ],
83
+ )
84
+ ```
85
+
86
+ The API saves each submitted image under `agent_workspace/inputs/images/`,
87
+ passes the image content to the first ResearchHarness model call when the
88
+ backend model supports image parts, and includes each saved path in the
89
+ agent-visible text.
90
+
91
+ The returned answer should be self-contained for a remote evaluator. Workspace
92
+ files may support the run, but the response should not only say to consult
93
+ `answer.md`, `report.md`, an image file, or another local artifact.
94
+
95
+ ## Scope
96
+
97
+ - The endpoint is synchronous and returns one final text answer.
98
+ - Each request gets a separate workspace subdirectory.
99
+ - The API uses an input wrapper, the ResearchHarness agent, and an output
100
+ wrapper so strict benchmark output formats do not destabilize the agent loop.
101
+ - Streaming, async run status, artifact download, and remote image fetching are
102
+ intentionally out of scope for this minimal QA contract.
benchmarks/QA/role_prompt.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmark Role Overlay
2
+
3
+ You are running inside ResearchHarness for a QA or VQA benchmark.
4
+
5
+ Behavior:
6
+ - Solve the user's task directly and carefully.
7
+ - Use tools only when they materially improve answer quality.
8
+ - If the request includes saved image paths, inspect the image evidence when it
9
+ is needed for the answer.
10
+ - Do not ask the user follow-up questions.
11
+ - Do not stop with a plan. Produce the answer once enough evidence has been
12
+ gathered.
13
+ - It is acceptable to explain what evidence was used in the agent's internal
14
+ final text; a downstream formatter will enforce the benchmark's exact output
15
+ contract.
16
+ - Assume the remote evaluator only sees the returned text, not your workspace.
17
+ - Your final text must be a complete, independent plain-text answer.
18
+ - Include the actual answer to the original question.
19
+ - Include supporting evidence, calculations, or reasoning steps when they are
20
+ needed to make the answer understandable.
21
+ - In this benchmark role, do not rely on local workspace files as the answer.
22
+ Files such as `answer.md`, `report.md`, images, or other artifacts may support
23
+ your work, but the returned text itself must contain the answer a remote
24
+ evaluator needs.
25
+
26
+ For visual tasks:
27
+ - Prefer the attached image content when it is available in the model input.
28
+ - Use `ReadImage` on saved image paths when additional visual inspection is
29
+ needed or when the prompt explicitly asks you to inspect local image files.
30
+ - Do not invent visual details that are not supported by the image or tool
31
+ output.
benchmarks/README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmarks
2
+
3
+ This folder records benchmark-specific integration contracts that live
4
+ **outside** `agent_base` so the core harness stays generic, lightweight, and
5
+ fair across different evaluations.
6
+
7
+ | Benchmark | Directory | Tracked contract |
8
+ | --- | --- | --- |
9
+ | ResearchClawBench | `benchmarks/ResearchClawBench/` | `README.md` + `role_prompt.md` + `adapter.py` |
10
+ | QA / VQA-style benchmarks | `benchmarks/QA/` | `README.md` + `role_prompt.md` |
11
+
12
+ ## Notes
13
+
14
+ - `agent_base/` stays focused on the reusable harness runtime.
15
+ - Benchmark-specific prompts, adapters, and integration notes should live under
16
+ their own benchmark subdirectory.
17
+ - Local benchmark helpers may exist for private experimentation, but they do
18
+ not define the formal external integration contract.
benchmarks/ResearchClawBench/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ResearchClawBench
2
+
3
+ This directory contains the tracked files needed to document how `ResearchHarness`
4
+ should be integrated into `ResearchClawBench`.
5
+
6
+ ResearchHarness is intended to serve here as a **general and fair execution
7
+ substrate** for tool-using LLM evaluation, while `ResearchClawBench` remains in
8
+ charge of task construction, hidden-answer isolation, and scoring.
9
+
10
+ ## Recommended `agents.json` Entry
11
+
12
+ Use a single direct command that launches the thin top-level ResearchHarness
13
+ entrypoint.
14
+
15
+ ```json
16
+ {
17
+ "researchharness": {
18
+ "label": "ResearchHarness",
19
+ "icon": "H",
20
+ "logo": "/static/logos/rh.svg",
21
+ "cmd": "python3 /abs/path/to/ResearchHarness/run_agent.py <PROMPT> --workspace-root <WORKSPACE> --role-prompt-file /abs/path/to/ResearchHarness/benchmarks/ResearchClawBench/role_prompt.md --trace-dir <WORKSPACE>"
22
+ }
23
+ }
24
+ ```
25
+
26
+ ## Why This Shape
27
+
28
+ - `ResearchClawBench` already prepares the workspace, writes `INSTRUCTIONS.md`,
29
+ and isolates hidden checklist data.
30
+ - `ResearchHarness` should only execute the agent through a stable harness
31
+ interface.
32
+ - The command stays unchanged. The entrypoint automatically selects the
33
+ lightweight adapter in `benchmarks/ResearchClawBench/adapter.py` when this
34
+ benchmark role prompt is used.
35
+
36
+ ## Notes
37
+
38
+ - Replace `/abs/path/to/ResearchHarness/` with the real local checkout path.
39
+ - The command should stay one-line and non-interactive.
40
+ - The adapter prevents premature termination on long tasks by refusing to accept
41
+ plain-text completion before `report/report.md` exists in the workspace.
42
+ - The adapter excludes `AskUser`; ResearchClawBench runs must remain fully non-interactive.
43
+ - Any local batch helpers or ad hoc benchmark scripts should remain untracked
44
+ and live outside the formal integration contract.
benchmarks/ResearchClawBench/adapter.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Optional, Sequence
5
+
6
+ from agent_base.react_agent import AVAILABLE_TOOL_MAP, MultiTurnReactAgent
7
+ from agent_base.tools.tooling import normalize_workspace_root
8
+
9
+
10
class ResearchClawBenchAgent(MultiTurnReactAgent):
    """
    Lightweight benchmark adapter for ResearchClawBench.

    A run only counts as finished once the workspace holds the canonical final
    report at report/report.md. Plain-text answers produced before that file
    exists are rejected so the agent loop keeps going, and a terminal error is
    accepted as completion only when the report is already present.
    """

    # Location of the required report, relative to the workspace root.
    required_report_relpath = Path("report") / "report.md"
    # Tools that must never be offered in a benchmark run.
    forbidden_tool_names = {"AskUser"}

    def __init__(self, function_list: Optional[Sequence[str]] = None, *args: Any, **kwargs: Any):
        """Create the agent with a tool list that excludes forbidden tools.

        Args:
            function_list: Tool names to enable. ``None`` selects every entry
                of ``AVAILABLE_TOOL_MAP`` that is not forbidden.
            *args: Forwarded to the base class constructor.
            **kwargs: Forwarded to the base class constructor.

        Raises:
            ValueError: If an explicit ``function_list`` names a forbidden tool.
        """
        if function_list is None:
            tool_names = [
                candidate
                for candidate in AVAILABLE_TOOL_MAP
                if candidate not in self.forbidden_tool_names
            ]
        else:
            # Normalize to stripped strings and drop blank entries.
            tool_names = []
            for raw_name in function_list:
                cleaned = str(raw_name).strip()
                if cleaned:
                    tool_names.append(cleaned)
            forbidden = sorted(set(tool_names) & self.forbidden_tool_names)
            if forbidden:
                raise ValueError(f"Tools are not allowed in ResearchClawBench runs: {forbidden}")
        # NOTE(review): positional *args follow a keyword argument here; if the
        # base constructor's first positional parameter is ``function_list``,
        # passing extra positional arguments would collide - confirm.
        super().__init__(function_list=tool_names, *args, **kwargs)

    def _required_report_path(self, workspace_root: Optional[str]) -> Path:
        """Return the required report path inside the normalized workspace."""
        return Path(normalize_workspace_root(workspace_root)) / self.required_report_relpath

    def should_accept_plaintext_result(
        self,
        *,
        result_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> bool:
        """Reject plain-text completion until the report exists, then defer to the base class."""
        if self._required_report_path(workspace_root).exists():
            return super().should_accept_plaintext_result(
                result_text=result_text,
                workspace_root=workspace_root,
                messages=messages,
            )
        return False

    def rejected_plaintext_result_message(
        self,
        *,
        result_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> str:
        """Explain a rejection: missing report first, otherwise the base class message."""
        if self._required_report_path(workspace_root).exists():
            return super().rejected_plaintext_result_message(
                result_text=result_text,
                workspace_root=workspace_root,
                messages=messages,
            )
        return (
            "The previous assistant turn was not accepted as the final result because "
            "ResearchClawBench requires report/report.md and that file is still missing. "
            "Continue working and use tool calls to produce or verify report/report.md before finishing."
        )

    def should_accept_terminal_error(
        self,
        *,
        error_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> bool:
        """Accept a terminal error as completion only when the report already exists."""
        report_path = self._required_report_path(workspace_root)
        return report_path.exists()

    def accepted_terminal_error_result_text(
        self,
        *,
        error_text: str,
        workspace_root: Optional[str],
        messages: Sequence[dict[str, Any]],
    ) -> str:
        """Return the fixed result text used when a terminal error is accepted."""
        return (
            "ResearchClawBench completion recovered after a terminal LLM/runtime error because "
            "report/report.md already exists and the required final artifact has been produced."
        )
benchmarks/ResearchClawBench/role_prompt.md ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmark Role Overlay
2
+
3
+ ## Purpose
4
+
5
+ You are running inside a benchmark-style scientific evaluation.
6
+
7
+ Your job is not just to produce a plausible report. Your job is to produce a
8
+ report whose claims are traceable to concrete artifacts in the workspace and
9
+ whose methods match the task's named scientific commitments as closely as the
10
+ environment allows.
11
+
12
+ This benchmark is non-interactive. Do not use `AskUser` or attempt to ask the
13
+ human for clarification. Resolve ambiguity from `INSTRUCTIONS.md`, workspace
14
+ files, related work, and available local or web tools.
15
+
16
+ ## Method Contract
17
+
18
+ - Parse the task into explicit methodological commitments early.
19
+ - Before broad exploration, infer the likely target artifact families required by
20
+ the task, including:
21
+ - primary quantitative answers
22
+ - required comparison tables
23
+ - expected figure families
24
+ - interpretability artifacts
25
+ - subgroup or condition-specific outputs
26
+ - If the task names a framework, protocol, comparison structure,
27
+ interpretability method, simulator, ablation, posterior treatment,
28
+ reconciliation step, or validation design, treat that as part of the
29
+ contract.
30
+ - Do not silently replace an explicitly named method with a looser descriptive
31
+ analysis.
32
+ - Save a concise contract summary to `outputs/method_contract.json`.
33
+ - Save the inferred target artifact inventory to
34
+ `outputs/target_artifact_inventory.json`.
35
+ - After reading the most relevant related-work papers, refresh both files if the
36
+ papers reveal additional named baselines, architectures, figure families,
37
+ comparison strata, or interpretability artifacts central to the task.
38
+ - Save a concise related-work extraction to `outputs/related_work_contract.json`
39
+ whenever related work materially changes the contract or artifact inventory.
40
+
41
+ ## Capability Check
42
+
43
+ - Before approximating or skipping a named method, check whether the needed
44
+ dependency, library, or runtime capability is available.
45
+ - Save the result to `outputs/dependency_check.json`.
46
+ - If a named method cannot be implemented exactly, state the exact limitation
47
+ and the fallback.
48
+ - If the task centers on a named model family, simulator, architecture, or
49
+ analysis stack, do not quietly swap to a different family just because it is
50
+ easier. Either implement a minimally faithful version of the named approach
51
+ or make the deviation explicit before proceeding.
52
+
53
+ ## Evidence Discipline
54
+
55
+ - Every major scientific claim should have at least one explicit supporting
56
+ artifact in `outputs/` or `report/images/`.
57
+ - Export the exact tables, matrices, or JSON objects used to create each main
58
+ figure.
59
+ - Add a dedicated validation subsection to the report that separates:
60
+ - what was verified directly from workspace data
61
+ - what came from related work
62
+ - what remains an assumption or limitation
63
+ - Answer claim-recovery questions claim-by-claim rather than only with a broad
64
+ narrative.
65
+ - Save a concise claim recovery table before finalizing the report.
66
+ - When the task asks for quantitative constraints, limits, posterior summaries,
67
+ calibration values, or uncertainty summaries, save those values explicitly in
68
+ the requested variables and units rather than only through a proxy
69
+ transformation.
70
+ - If the task ultimately asks for a direct constraint on a named target
71
+ quantity, prefer deriving and reporting that named quantity itself instead of
72
+ stopping at an intermediate proxy axis, surrogate scale, or nearby latent
73
+ variable whenever a defensible derivation is possible from workspace data and
74
+ related work.
75
+ - If posterior samples are a primary input, report canonical distribution
76
+ summaries for each primary source, including mean and standard deviation,
77
+ unless those statistics are mathematically invalid for the variable.
78
+ - If the task names a primary source, cohort, benchmark, or experimental arm,
79
+ produce at least one source-specific artifact for it before emphasizing only
80
+ combined or aggregated results.
81
+ - If the task names a direct target quantity, threshold, or decision criterion,
82
+ export a compact result table that answers it directly before presenting
83
+ broader supporting analyses.
84
+
85
+ ## Related Work Use
86
+
87
+ - Read `related_work/` early, but bounded.
88
+ - Start with concise or bounded reads when papers are long.
89
+ - Extract only task-relevant facts into notes or structured outputs.
90
+ - If related work contains validation metrics, methodological caveats,
91
+ baselines, or target comparison axes that matter for the task, incorporate
92
+ them explicitly.
93
+ - Prefer extracting from related work:
94
+ - named methods or architectures to reproduce or compare against
95
+ - target comparison axes and subgroup splits
96
+ - likely main figure families or panel structures
97
+ - explicit quantitative targets, thresholds, or calibration outputs
98
+
99
+ ## Figure And Comparison Fidelity
100
+
101
+ - Prefer claim-driven figures over generic exploratory plots.
102
+ - Infer likely figure families and comparison structures from the task and
103
+ related work.
104
+ - If the task is about projections, calibration, method agreement, subgroup
105
+ trends, rankings, level-wise comparisons, or ablations, produce figures that
106
+ directly encode those structures.
107
+ - Keep the main figure set compact: each main figure should support a specific
108
+ target claim.
109
+ - If the task's core claim is source-specific, dataset-specific, or benchmark-
110
+ specific, include at least one main figure at that same granularity rather
111
+ than only a pooled or combined summary figure.
112
+ - If the task implies a named figure family such as ablation curves, PR/ROC
113
+ curves, parity plots, subgroup heatmaps, saliency maps, architecture
114
+ diagrams, or level-wise comparisons, prioritize that family over a generic
115
+ substitute.
116
+
117
+ ## Group And Condition Preservation
118
+
119
+ - If the task names groups, conditions, labs, sexes, environments, shells,
120
+ depth levels, or other comparison strata, preserve them in at least one
121
+ exported table or figure.
122
+ - Do not silently collapse mixed categories if the scientific question depends
123
+ on them.
124
+ - When subgroup structure matters over time, prefer a subgroup-by-time matrix
125
+ and save it.
126
+ - If the task is a benchmark or model-comparison study across datasets,
127
+ baselines, cohorts, or conditions, export a compact comparison table with the
128
+ main metric reported as mean ± standard deviation whenever repeated runs,
129
+ folds, or stochastic training are part of the setup.
130
+ - For multi-condition or multi-cohort tasks, save at least one artifact at the
131
+ per-condition granularity before merging across conditions.
132
+
133
+ ## Named Method Fidelity
134
+
135
+ - If the task or related work defines a named mechanism, algorithm, or
136
+ protocol central to the scientific claim, save a fidelity checklist to
137
+ `outputs/method_fidelity_checklist.json`.
138
+ - That checklist should capture:
139
+ - the exact definition
140
+ - assumptions
141
+ - invariants
142
+ - non-negotiable structural steps
143
+ - Use it to verify whether the implemented method actually matches the named
144
+ mechanism.
145
+ - If you deviate, explain exactly how and why in the report.
146
+ - If the task revolves around a named architecture or protocol, capture the key
147
+ structural ingredients that distinguish it from nearby alternatives and check
148
+ them explicitly.
149
+
150
+ ## Small Sweeps And Ablations
151
+
152
+ - If the named mechanism exposes a small discrete design variable, such as
153
+ levels, layers, stages, shells, bins, or ablation settings, run at least a
154
+ small sweep unless it is genuinely impossible from the available workspace.
155
+ - If the task names a specific interpretability method such as SHAP,
156
+ permutation importance, saliency, or similar, produce at least one artifact
157
+ using that named method.
158
+ - If the task claims improved interpretability, do not stop at aggregate metric
159
+ gains alone; produce at least one explicit interpretability artifact and tie
160
+ it back to domain-relevant entities, groups, or substructures named in the
161
+ task or related work.
162
+ - If the task names multiple groups, labs, cohorts, or environments, prefer an
163
+ interpretability artifact that compares them directly instead of a single
164
+ pooled explanation.
165
+ - If interpretability is central and the chosen model family supports a common
166
+ post hoc explanation method, do not stop at native coefficient or impurity
167
+ magnitudes alone. Add at least one post hoc explanation artifact such as
168
+ SHAP, permutation importance, saliency, attention attribution, or a similarly
169
+ standard method for that model family.
170
+
171
+ ## Finalization
172
+
173
+ - Start `report/report.md` as soon as at least two core result families already
174
+ have concrete supporting artifacts in `outputs/` or `report/images/`.
175
+ - Prefer an evidence-backed report draft over one more optional script, one
176
+ more polish pass, or one more non-essential figure.
177
+ - Once the primary quantitative outputs, the main comparison figures, and the
178
+ core validation artifacts exist, write `report/report.md` immediately.
179
+ - Do not postpone the report in order to chase optional supplementary figures,
180
+ extra exploratory analyses, or additional polish that is not required to
181
+ support the task's main claims.
182
+ - Treat optional supplementary work as lower priority than a complete,
183
+ evidence-backed report. If the report can already answer the task directly,
184
+ finish the report first and only then consider extras if there is clear
185
+ remaining need.
186
+ - The final report should be tightly traceable.
187
+ - Important numbers should be reproducible from saved artifacts in the
188
+ workspace.
189
+ - Do not claim exact reproduction if only a rough approximation was achieved.
190
+ - Before finalizing, check that the report contains direct answers to the main
191
+ requested outputs in the named variables, units, and confidence language of
192
+ the task, not only nearby surrogate quantities.
193
+ - Before finalizing, verify that every primary entry in
194
+ `outputs/target_artifact_inventory.json` is either satisfied by a concrete
195
+ saved artifact or explicitly marked as unsatisfied with a reason.
docs/tutorial_en.md ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ResearchHarness Tutorial
2
+
3
+ This tutorial explains how to use ResearchHarness from the command line and as
4
+ an OpenAI-compatible API service.
5
+
6
+ ResearchHarness is a lightweight, general-purpose harness for tool-using LLM
7
+ agents. It can be used as:
8
+
9
+ - a command-line local agent,
10
+ - a fair execution substrate for agent benchmarks,
11
+ - an OpenAI-compatible synchronous API backend,
12
+ - a personal assistant runtime for files, code, reports, PDFs, images, and web tasks.
13
+
14
+ ## 1. Install
15
+
16
+ Clone the repository and install dependencies:
17
+
18
+ ```bash
19
+ python3 -m pip install -r requirements.txt
20
+ ```
21
+
22
+ Python 3.10+ is recommended.
23
+
24
+ ## 2. Configure Environment Variables
25
+
26
+ Copy `.env.example` to `.env` and fill in the required values.
27
+
28
+ Required variables:
29
+
30
+ | Variable | Meaning |
31
+ | --- | --- |
32
+ | `API_KEY` | API key for your OpenAI-compatible LLM provider. |
33
+ | `API_BASE` | Base URL for the OpenAI-compatible chat-completions endpoint. |
34
+ | `MODEL_NAME` | Main model used by ResearchHarness. |
35
+ | `SERPER_KEY_ID` | Serper key for `WebSearch` and `ScholarSearch`: https://serper.dev/ |
36
+ | `JINA_API_KEYS` | Jina key for `WebFetch`: https://jina.ai/ |
37
+ | `MINERU_TOKEN` | MinerU token for `ReadPDF`: https://mineru.net/ |
38
+
39
+ Optional variables:
40
+
41
+ | Variable | Default | Meaning |
42
+ | --- | --- | --- |
43
+ | `WORKSPACE_ROOT` | `./workspace` | Default workspace root when no explicit workspace is passed. |
44
+ | `MAX_LLM_CALL_PER_RUN` | `100` | Maximum LLM calls in one agent run. |
45
+ | `MAX_AGENT_ROUNDS` | `100` | Maximum ReAct loop rounds. |
46
+ | `MAX_AGENT_RUNTIME_SECONDS` | `9000` | Maximum wall-clock runtime for one agent run. |
47
+ | `LLM_TIMEOUT_SECONDS` | `600` | Timeout for each LLM API request. |
48
+ | `LLM_MAX_OUTPUT_TOKENS` | `10000` | Requested maximum output tokens. |
49
+ | `MAX_INPUT_TOKENS` | `320000` | Input-token budget used by runtime accounting. |
50
+ | `LLM_MAX_RETRIES` | `10` | Maximum retries for transient LLM API errors. |
51
+ | `TEMPERATURE` | `0.6` | Main model temperature. |
52
+ | `TOP_P` | `0.95` | Main model top-p. |
53
+ | `PRESENCE_PENALTY` | `1.1` | Main model presence penalty when supported. |
54
+ | `AUTO_COMPACT_TRIGGER_TOKENS` | `128k` | Context length threshold for automatic compaction. |
55
+ | `IMAGE_PART_TOKEN_ESTIMATE` | `1536` | Token estimate for each image content part. |
56
+ | `LLM_IMAGE_MAX_EDGE` | `1568` | Maximum image edge sent to multimodal models. |
57
+ | `LLM_IMAGE_MAX_BYTES` | `524288` | Maximum compressed image payload size. |
58
+ | `LLM_IMAGE_JPEG_QUALITY` | `85` | Initial JPEG quality for image compression. |
59
+ | `DEBUG_AGENT` | `false` | Verbose agent-loop logs. |
60
+ | `DEBUG_SEARCH` | `false` | Verbose WebSearch logs. |
61
+ | `DEBUG_SCHOLAR` | `false` | Verbose ScholarSearch logs. |
62
+ | `DEBUG_VISIT` | `false` | Verbose WebFetch logs. |
63
+
64
+ Before real use, run:
65
+
66
+ ```bash
67
+ python3 tests/test_tool_availability.py
68
+ ```
69
+
70
+ All tools should pass. Missing service keys, missing dependencies, exhausted
71
+ credits, or unavailable external tools should be treated as failures.
72
+
73
+ If `WebSearch`, `ScholarSearch`, `WebFetch`, or `ReadPDF` fails with network,
74
+ TLS, upload, download, or parsing errors, try disabling VPN/proxy and rerun the
75
+ test.
76
+
77
+ ## 3. Command-Line Usage
78
+
79
+ Run a simple prompt:
80
+
81
+ ```bash
82
+ python3 run_agent.py "Who proposed the transformer architecture, and in what year was the paper published?"
83
+ ```
84
+
85
+ Use an explicit workspace:
86
+
87
+ ```bash
88
+ python3 run_agent.py "Summarize this project." \
89
+ --workspace-root ./workspace
90
+ ```
91
+
92
+ You can replace `./workspace` with any other workspace directory.
93
+
94
+ Save traces to a directory:
95
+
96
+ ```bash
97
+ python3 run_agent.py "Summarize this project." \
98
+ --workspace-root ./workspace \
99
+ --trace-dir ./traces
100
+ ```
101
+
102
+ You can replace `./traces` with any other trace directory.
103
+
104
+ Without `--trace-dir`, CLI runs do not write a trace file.
105
+
106
+ Append a role prompt:
107
+
108
+ ```bash
109
+ python3 run_agent.py "Answer this QA task." \
110
+ --workspace-root ./workspace \
111
+ --role-prompt-file benchmarks/QA/role_prompt.md
112
+ ```
113
+
114
+ Attach one or more local images:
115
+
116
+ ```bash
117
+ python3 run_agent.py "Read the image and return JSON." \
118
+ --workspace-root ./workspace \
119
+ --images /path/to/image.png /path/to/second-image.png
120
+ ```
121
+
122
+ Each image path must exist. ResearchHarness copies images into `./workspace/inputs/images/`,
123
+ sends them as initial `image_url` content parts, and adds each saved relative
124
+ path to the user text so later rounds can call `ReadImage` on the same files.
125
+
126
+ In an interactive terminal, CLI runs continue after a final answer and prompt
127
+ for a follow-up. The follow-up run keeps the prior messages, tool results, and
128
+ saved image path hints. During a running step, `Ctrl+C` interrupts the current
129
+ run at the next safe point and returns to follow-up mode with context preserved.
130
+ Press `Ctrl+C` at the follow-up prompt or send EOF to exit. Use `--no-chat` for
131
+ strict one-shot behavior, or `--chat` to force follow-up mode.
132
+
133
+ For browser-based local use, run `python3 run_frontend.py`. The frontend uses an
134
+ existing workspace selected in the page, streams tool steps live, accepts one or
135
+ more image attachments, and continues the current conversation after each final
136
+ answer until you click **New chat**. While running, the send button becomes
137
+ **Stop**; it interrupts at the next safe point and keeps the conversation
138
+ context for the next message.
139
+
140
+ ### CLI Parameters
141
+
142
+ | Parameter | Required | Meaning |
143
+ | --- | --- | --- |
144
+ | positional `prompt` | yes, unless `--prompt-file` is used | Prompt text. |
145
+ | `--prompt-file PATH` | no | Read prompt text from a UTF-8 file. |
146
+ | `--workspace-root PATH` | no | Workspace root for local file tools, Bash, and terminal sessions. Created if missing. |
147
+ | `--trace-dir PATH` | no | Directory where `trace_*.jsonl` is written. |
148
+ | `--role-prompt-file PATH` | no, repeatable | Append role-specific prompt text to the base system prompt. |
149
+ | `--images PATH [PATH ...]` | no | Copy one or more local images into `inputs/images/` and attach them to the initial user message. |
150
+ | `--chat` / `--no-chat` | no | Enable or disable CLI follow-up mode. Default: enabled only when stdin and stdout are interactive terminals. |
151
+
152
+ ## 4. OpenAI-Compatible API Server
153
+
154
+ ResearchHarness can serve a synchronous OpenAI-compatible endpoint:
155
+
156
+ ```http
157
+ POST /v1/chat/completions
158
+ ```
159
+
160
+ This allows existing OpenAI SDK clients to call ResearchHarness by changing only
161
+ `base_url`.
162
+
163
+ ### Start the Server
164
+
165
+ Default deployment:
166
+
167
+ ```bash
168
+ python3 run_server.py \
169
+ --api-runs-dir ./api_runs \
170
+ --host 127.0.0.1 \
171
+ --port 8686
172
+ ```
173
+
174
+ QA/VQA benchmark deployment with a benchmark role overlay:
175
+
176
+ ```bash
177
+ python3 run_server.py \
178
+ --api-runs-dir ./api_runs \
179
+ --host 127.0.0.1 \
180
+ --port 8686 \
181
+ --role-prompt-file benchmarks/QA/role_prompt.md
182
+ ```
183
+
184
+ ### API Server Parameters
185
+
186
+ | Parameter | Required | Default | Meaning |
187
+ | --- | --- | --- | --- |
188
+ | `--api-runs-dir PATH` | yes | none | Parent directory for API runs. Each request gets one subdirectory. |
189
+ | `--host HOST` | no | `127.0.0.1` | Host to bind. |
190
+ | `--port PORT` | no | `8686` | Port to bind. |
191
+ | `--role-prompt-file PATH` | no, repeatable | none | Append role prompt text to the base ResearchHarness prompt. |
192
+ | `--input-wrapper` / `--no-input-wrapper` | no | enabled | Enable or disable the input LLM wrapper. |
193
+ | `--output-wrapper` / `--no-output-wrapper` | no | enabled | Enable or disable the output LLM wrapper. |
194
+
195
+ ### Wrapper Modes
196
+
197
+ Both wrappers are enabled by default.
198
+
199
+ Strict-format benchmark mode:
200
+
201
+ ```bash
202
+ python3 run_server.py \
203
+ --api-runs-dir ./api_runs \
204
+ --role-prompt-file benchmarks/QA/role_prompt.md \
205
+ --input-wrapper \
206
+ --output-wrapper
207
+ ```
208
+
209
+ Direct agent mode:
210
+
211
+ ```bash
212
+ python3 run_server.py \
213
+ --api-runs-dir ./api_runs \
214
+ --no-input-wrapper \
215
+ --no-output-wrapper
216
+ ```
217
+
218
+ Simple input plus strict final formatting:
219
+
220
+ ```bash
221
+ python3 run_server.py \
222
+ --api-runs-dir ./api_runs \
223
+ --no-input-wrapper \
224
+ --output-wrapper
225
+ ```
226
+
227
+ The input wrapper rewrites the original user request into a stable task for the
228
+ agent. The output wrapper formats the agent result to match the user's requested
229
+ answer contract. Wrappers must not invent new facts; they only normalize input
230
+ and format output.
231
+
232
+ The API server is intentionally one request -> one answer. It does not keep a
233
+ server-side conversation between HTTP requests. If an application needs API
234
+ multi-turn behavior, keep that state in the client and send the needed prior
235
+ context in later requests.
236
+
237
+ ```mermaid
238
+ flowchart LR
239
+ U[User Input] --> IW[Input Wrapper LLM]
240
+ IW --> A[ResearchHarness Agent]
241
+ A --> OW[Output Wrapper LLM]
242
+ OW --> O[Output]
243
+ ```
244
+
245
+ ## 5. API Workspace Layout
246
+
247
+ Each API request creates one run directory:
248
+
249
+ ```text
250
+ ./api_runs/
251
+ `-- run_YYYYMMDD_HHMMSS_<random>/
252
+ |-- agent_workspace/
253
+ | `-- inputs/
254
+ | `-- images/
255
+ `-- agent_trace/
256
+ |-- api_trace.jsonl
257
+ |-- trace_*.jsonl
258
+ `-- _session_state.json
259
+ ```
260
+
261
+ Meaning:
262
+
263
+ | Path | Meaning |
264
+ | --- | --- |
265
+ | `run_YYYYMMDD_HHMMSS_<random>/` | Per-request run root. |
266
+ | `agent_workspace/` | The only workspace visible to the agent. File tools, Bash, `ls`, and `cat` start here. |
267
+ | `agent_workspace/inputs/images/` | User-provided images saved from API requests. |
268
+ | `agent_trace/` | API trace, agent trace, and runtime records. |
269
+
270
+ For multimodal requests, image inputs are handled in two ways at the same time:
271
+ the image content is passed to the backend model as initial multimodal input
272
+ when the selected model supports it, and each image is saved under
273
+ `agent_workspace/inputs/images/`. Each saved relative path is also included in
274
+ the agent-visible text, so later rounds can call `ReadImage` on a stable local
275
+ path without repeatedly resending image bytes.
276
+
277
+ This layout keeps the agent-visible workspace separate from server-side trace files.
278
+ In API deployment mode, traces are saved by default: every request writes
279
+ `api_trace.jsonl`, `trace_*.jsonl`, and `_session_state.json` under that run's `agent_trace/`
280
+ directory.
281
+
282
+ ## 6. Text Request with OpenAI SDK
283
+
284
+ ```python
285
+ from openai import OpenAI
286
+
287
+ client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
288
+
289
+ response = client.chat.completions.create(
290
+ model="researchharness",
291
+ messages=[
292
+ {"role": "user", "content": "Answer in one sentence: what is 2 + 2?"}
293
+ ],
294
+ )
295
+
296
+ print(response.choices[0].message.content)
297
+ ```
298
+
299
+ ## 7. Multimodal Request with OpenAI SDK
300
+
301
+ The first API version supports one or more `data:image/...;base64,...` image
302
+ URLs in the same request. Remote image URLs and local file paths are
303
+ intentionally not supported by the API server.
304
+
305
+ The example below generates an image in memory and asks for JSON output.
306
+
307
+ ```python
308
+ import base64
309
+ from io import BytesIO
310
+
311
+ from PIL import Image, ImageDraw
312
+ from openai import OpenAI
313
+
314
+ image = Image.new("RGB", (320, 120), "white")
315
+ draw = ImageDraw.Draw(image)
316
+ draw.text((40, 45), "7 + 5 = ?", fill="black")
317
+ buffer = BytesIO()
318
+ image.save(buffer, format="PNG")
319
+ data_url = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")
320
+
321
+ client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
322
+
323
+ response = client.chat.completions.create(
324
+ model="researchharness",
325
+ messages=[
326
+ {
327
+ "role": "user",
328
+ "content": [
329
+ {
330
+ "type": "text",
331
+ "text": (
332
+ "The image contains a simple arithmetic expression. "
333
+ "Return JSON with exactly two keys: expression and answer."
334
+ ),
335
+ },
336
+ {"type": "image_url", "image_url": {"url": data_url}},
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ print(response.choices[0].message.content)
343
+ ```
344
+
345
+ Expected answer shape:
346
+
347
+ ```json
348
+ {"expression":"7 + 5","answer":12}
349
+ ```
350
+
351
+ ## 8. API Request and Response Contract
352
+
353
+ ### `POST /v1/chat/completions`
354
+
355
+ Supported request fields:
356
+
357
+ | Field | Required | Meaning |
358
+ | --- | --- | --- |
359
+ | `model` | yes | Client-visible model label. It does not override `MODEL_NAME`; the backend model comes from `.env`. |
360
+ | `messages` | yes | OpenAI-style chat messages. |
361
+ | `stream` | no | Must be absent or `false`; streaming is not supported. |
362
+ | `n` | no | Must be absent or `1`. |
363
+ | `max_tokens` | no | Maximum output tokens for the output wrapper. |
364
+ | `max_completion_tokens` | no | Alias accepted for output-wrapper max tokens. |
365
+ | `response_format` | no | Passed to the wrappers as an output-format hint. |
366
+
367
+ Supported message roles:
368
+
369
+ | Role | Supported |
370
+ | --- | --- |
371
+ | `system` | yes |
372
+ | `user` | yes |
373
+ | `assistant` | yes |
374
+ | `tool` | no |
375
+
376
+ Supported content forms:
377
+
378
+ ```json
379
+ {"role": "user", "content": "plain text"}
380
+ ```
381
+
382
+ ```json
383
+ {
384
+ "role": "user",
385
+ "content": [
386
+ {"type": "text", "text": "question"},
387
+ {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
388
+ ]
389
+ }
390
+ ```
391
+
392
+ Response shape:
393
+
394
+ ```json
395
+ {
396
+ "id": "chatcmpl_...",
397
+ "object": "chat.completion",
398
+ "created": 1770000000,
399
+ "model": "researchharness",
400
+ "choices": [
401
+ {
402
+ "index": 0,
403
+ "message": {
404
+ "role": "assistant",
405
+ "content": "final answer"
406
+ },
407
+ "finish_reason": "stop"
408
+ }
409
+ ]
410
+ }
411
+ ```
412
+
413
+ Callers usually only need:
414
+
415
+ ```python
416
+ response.choices[0].message.content
417
+ ```
418
+
419
+ ### `GET /v1/health`
420
+
421
+ Returns:
422
+
423
+ ```json
424
+ {
425
+ "status": "ok",
426
+ "api_runs_dir": "./api_runs",
427
+ "input_wrapper": true,
428
+ "output_wrapper": true
429
+ }
430
+ ```
431
+
432
+ ## 9. Tool Surface
433
+
434
+ ResearchHarness currently includes:
435
+
436
+ | Tool | Purpose |
437
+ | --- | --- |
438
+ | `Glob` | Discover files by pattern. |
439
+ | `Grep` | Search text in files. |
440
+ | `Read` | Read text files with bounds. |
441
+ | `ReadPDF` | Parse PDFs with MinerU/structai. |
442
+ | `ReadImage` | Inspect local image files and forward image content to vision-capable models. |
443
+ | `Write` | Write files inside the workspace. |
444
+ | `Edit` | Patch files inside the workspace. |
445
+ | `Bash` | Run shell commands inside the workspace. |
446
+ | `WebSearch` | Web search through Serper. |
447
+ | `ScholarSearch` | Scholar-style search through Serper. |
448
+ | `WebFetch` | Fetch and summarize webpages through Jina and the configured model. |
449
+ | `AskUser` | Ask a human for clarification in interactive runs. Disabled by some benchmark adapters. |
450
+ | `TerminalStart` / `TerminalWrite` / `TerminalRead` / `TerminalInterrupt` / `TerminalKill` | Persistent terminal sessions. |
451
+
452
+ ## 10. Traces and Records
453
+
454
+ CLI runs write traces only when `--trace-dir` is provided. Without
455
+ `--trace-dir`, CLI runs do not write a trace file.
456
+
457
+ API runs write traces under:
458
+
459
+ ```text
460
+ ./api_runs/run_.../agent_trace/
461
+ ```
462
+
463
+ Important files:
464
+
465
+ | File | Meaning |
466
+ | --- | --- |
467
+ | `api_trace.jsonl` | Input wrapper, agent result, and output wrapper records. |
468
+ | `trace_*.jsonl` | Flat agent runtime trace. |
469
+ | `_session_state.json` | Current session state, written next to `trace_*.jsonl` when tracing is enabled. |
470
+
471
+ The trace stores tool calls, tool results, LLM call capture payloads, compaction
472
+ events, errors, and final termination state.
473
+
474
+ ## 11. Benchmark Adapters
475
+
476
+ Tracked benchmark contracts live under `benchmarks/`.
477
+
478
+ Current tracked adapters:
479
+
480
+ | Benchmark | Directory | Notes |
481
+ | --- | --- | --- |
482
+ | ResearchClawBench | `benchmarks/ResearchClawBench/` | CLI integration with role prompt and adapter. |
483
+ | QA / VQA | `benchmarks/QA/` | OpenAI-compatible API integration for text and multimodal QA. |
484
+
485
+ Benchmark-specific behavior should stay outside `agent_base/`.
486
+
487
+ ## 12. Testing
488
+
489
+ Recommended checks:
490
+
491
+ ```bash
492
+ python3 tests/test_tool_availability.py
493
+ python3 tests/test_openai_api_checks.py
494
+ python3 tests/test_agent_extension_checks.py
495
+ python3 tests/test_edge_case_checks.py
496
+ python3 tests/test_toolchain_validation.py
497
+ ```
498
+
499
+ If using conda:
500
+
501
+ ```bash
502
+ conda run -n agent python3 tests/test_openai_api_checks.py
503
+ ```
504
+
505
+ ## 13. Troubleshooting
506
+
507
+ Common issues:
508
+
509
+ | Symptom | Likely cause | Action |
510
+ | --- | --- | --- |
511
+ | Missing required env error | `.env` is incomplete | Fill required variables. |
512
+ | Web/PDF tools fail | VPN/proxy/TLS/service issue | Disable VPN/proxy and rerun tool availability tests. |
513
+ | Image request returns 400 | Image URL is not a `data:image/...;base64,...` URL | Convert the image to a base64 data URL. |
514
+ | Backend model rejects images | Model endpoint is not vision-capable | Use a vision-capable model or send text-only tasks. |
515
+ | API request fails with streaming error | `stream=true` was sent | Use synchronous requests only. |
516
+ | Unexpected output format | Output wrapper disabled or prompt under-specified | Enable `--output-wrapper` and state the desired format clearly. |
517
+
518
+ ## 14. Current Boundaries
519
+
520
+ The first API version intentionally does not include:
521
+
522
+ - streaming,
523
+ - async run status,
524
+ - cancellation,
525
+ - artifact download endpoints,
526
+ - remote image URL downloading,
527
+ - user authentication,
528
+ - multi-tenant access control.
529
+
530
+ These can be added later as separate layers without changing the core harness
531
+ loop.
docs/tutorial_zh.md ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ResearchHarness 教程
2
+
3
+ 本文介绍如何通过命令行和 OpenAI-compatible API 使用 ResearchHarness。
4
+
5
+ ResearchHarness 是一个轻量、通用的 tool-using LLM agent harness。它可以作为:
6
+
7
+ - 命令行本地 agent,
8
+ - agent benchmark 的公平执行底座,
9
+ - OpenAI-compatible 同步 API 后端,
10
+ - 面向代码、文件、报告、PDF、图片、网页任务的个人助手运行时。
11
+
12
+ ## 1. 安装
13
+
14
+ 安装依赖:
15
+
16
+ ```bash
17
+ python3 -m pip install -r requirements.txt
18
+ ```
19
+
20
+ 推荐使用 Python 3.10+。
21
+
22
+ ## 2. 配置环境变量
23
+
24
+ 复制 `.env.example` 为 `.env`,并填写必需变量。
25
+
26
+ 必需变量:
27
+
28
+ | 变量 | 含义 |
29
+ | --- | --- |
30
+ | `API_KEY` | OpenAI-compatible LLM 服务的 API key。 |
31
+ | `API_BASE` | OpenAI-compatible chat-completions endpoint 的 base URL。 |
32
+ | `MODEL_NAME` | ResearchHarness 使用的主模型。 |
33
+ | `SERPER_KEY_ID` | `WebSearch` 和 `ScholarSearch` 使用的 Serper key:https://serper.dev/ |
34
+ | `JINA_API_KEYS` | `WebFetch` 使用的 Jina key:https://jina.ai/ |
35
+ | `MINERU_TOKEN` | `ReadPDF` 使用的 MinerU token:https://mineru.net/ |
36
+
37
+ 可选变量:
38
+
39
+ | 变量 | 默认值 | 含义 |
40
+ | --- | --- | --- |
41
+ | `WORKSPACE_ROOT` | `./workspace` | 未显式传入 workspace 时使用的默认 workspace root。 |
42
+ | `MAX_LLM_CALL_PER_RUN` | `100` | 单次 agent run 最多允许的 LLM 调用次数。 |
43
+ | `MAX_AGENT_ROUNDS` | `100` | ReAct loop 最大轮次。 |
44
+ | `MAX_AGENT_RUNTIME_SECONDS` | `9000` | 单次 agent run 的最大运行秒数。 |
45
+ | `LLM_TIMEOUT_SECONDS` | `600` | 单次 LLM API 请求超时时间。 |
46
+ | `LLM_MAX_OUTPUT_TOKENS` | `10000` | 请求模型输出的最大 token 数。 |
47
+ | `MAX_INPUT_TOKENS` | `320000` | runtime token accounting 使用的输入 token 预算。 |
48
+ | `LLM_MAX_RETRIES` | `10` | 瞬时 LLM API 错误最大重试次数。 |
49
+ | `TEMPERATURE` | `0.6` | 主模型 temperature。 |
50
+ | `TOP_P` | `0.95` | 主模型 top-p。 |
51
+ | `PRESENCE_PENALTY` | `1.1` | provider 支持时使用的 presence penalty。 |
52
+ | `AUTO_COMPACT_TRIGGER_TOKENS` | `128k` | 自动上下文压缩触发阈值。 |
53
+ | `IMAGE_PART_TOKEN_ESTIMATE` | `1536` | 每个 image content part 的 token 估计。 |
54
+ | `LLM_IMAGE_MAX_EDGE` | `1568` | 发送给多模态模型的图片最大边长。 |
55
+ | `LLM_IMAGE_MAX_BYTES` | `524288` | 发送给多模态模型的压缩图片最大字节数。 |
56
+ | `LLM_IMAGE_JPEG_QUALITY` | `85` | 图片压缩时的初始 JPEG 质量。 |
57
+ | `DEBUG_AGENT` | `false` | 打印 agent loop 详细调试日志。 |
58
+ | `DEBUG_SEARCH` | `false` | 打印 WebSearch 调试日志。 |
59
+ | `DEBUG_SCHOLAR` | `false` | 打印 ScholarSearch 调试日志。 |
60
+ | `DEBUG_VISIT` | `false` | 打印 WebFetch 调试日志。 |
61
+
62
+ 正式使用前,先运行:
63
+
64
+ ```bash
65
+ python3 tests/test_tool_availability.py
66
+ ```
67
+
68
+ 预期结果是全部工具通过。缺 key、缺依赖、服务额度耗尽、外部工具不可用都应该视为失败,不应 skip。
69
+
70
+ 如果 `WebSearch`、`ScholarSearch`、`WebFetch` 或 `ReadPDF` 出现 network、TLS、upload、download、PDF parsing 相关错误,优先尝试关闭 VPN / proxy 后重跑测试。
71
+
72
+ ## 3. 命令行使用
73
+
74
+ 直接运行一个 prompt:
75
+
76
+ ```bash
77
+ python3 run_agent.py "Who proposed the transformer architecture, and in what year was the paper published?"
78
+ ```
79
+
80
+ 指定 workspace:
81
+
82
+ ```bash
83
+ python3 run_agent.py "Summarize this project." \
84
+ --workspace-root ./workspace
85
+ ```
86
+
87
+ `./workspace` 可以替换为任何其他 workspace 目录。
88
+
89
+ 保存 trace:
90
+
91
+ ```bash
92
+ python3 run_agent.py "Summarize this project." \
93
+ --workspace-root ./workspace \
94
+ --trace-dir ./traces
95
+ ```
96
+
97
+ `./traces` 可以替换为任何其他 trace 目录。
98
+
99
+ 如果不传 `--trace-dir`,CLI 运行不会写 trace 文件。
100
+
101
+ 追加 role prompt:
102
+
103
+ ```bash
104
+ python3 run_agent.py "Answer this QA task." \
105
+ --workspace-root ./workspace \
106
+ --role-prompt-file benchmarks/QA/role_prompt.md
107
+ ```
108
+
109
+ 附加本地图片:
110
+
111
+ ```bash
112
+ python3 run_agent.py "Read the image and return JSON." \
113
+ --workspace-root ./workspace \
114
+ --images /path/to/image.png /path/to/second-image.png
115
+ ```
116
+
117
+ 每个图片路径都必须存在。RH 会把图片复制到 `./workspace/inputs/images/`,
118
+ 作为初始 `image_url` content part 传给模型,同时把每个保存后的相对路径写进
119
+ 用户文本,让后续轮次可以用 `ReadImage` 重新读取这些图片。
120
+
121
+ 在交互式终端中,CLI 会在最终回答后继续等待 follow-up。下一轮会保留之前的
122
+ messages、工具结果和图片保存路径提示。运行过程中按 `Ctrl+C` 会在下一个安全点
123
+ 中断当前 run,并带着上下文回到 follow-up 模式。在 follow-up 输入处按 `Ctrl+C`
124
+ 或发送 EOF 可退出。脚本或 benchmark 如果需要严格的一问一答行为,使用
125
+ `--no-chat`;需要强制开启时使用 `--chat`。
126
+
127
+ 如果需要浏览器本地界面,运行 `python3 run_frontend.py`。前端使用页面中选择的
128
+ 已有 workspace,实时显示工具步骤,支持一张或多张图片附件,并在每次最终回答后
129
+ 继续当前对话,直到点击 **New chat**。运行中发送按钮会变成 **Stop**;它会在下一个
130
+ 安全点中断,并保留上下文用于下一条消息。
131
+
132
+ ### CLI 参数
133
+
134
+ | 参数 | 是否必需 | 含义 |
135
+ | --- | --- | --- |
136
+ | 位置参数 `prompt` | 是,除非使用 `--prompt-file` | prompt 文本。 |
137
+ | `--prompt-file PATH` | 否 | 从 UTF-8 文件读取 prompt。 |
138
+ | `--workspace-root PATH` | 否 | 本地文件工具、Bash、Terminal 使用的 workspace root;不存在会自动创建。 |
139
+ | `--trace-dir PATH` | 否 | 写入 `trace_*.jsonl` 的目录。 |
140
+ | `--role-prompt-file PATH` | 否,可重复 | 追加 role-specific prompt 到 base system prompt。 |
141
+ | `--images PATH [PATH ...]` | 否 | 把一张或多张本地图片复制到 `inputs/images/` 并附加到初始用户消息。 |
142
+ | `--chat` / `--no-chat` | 否 | 开启或关闭 CLI follow-up 模式。默认只在 stdin 和 stdout 都是交互式终端时开启。 |
143
+
144
+ ## 4. OpenAI-Compatible API Server
145
+
146
+ ResearchHarness 可以部署为同步 OpenAI-compatible endpoint:
147
+
148
+ ```http
149
+ POST /v1/chat/completions
150
+ ```
151
+
152
+ 这样,现有 OpenAI SDK 客户端只需要修改 `base_url` 就可以调用 ResearchHarness。
153
+
154
+ ### 启动服务
155
+
156
+ 默认部署:
157
+
158
+ ```bash
159
+ python3 run_server.py \
160
+ --api-runs-dir ./api_runs \
161
+ --host 127.0.0.1 \
162
+ --port 8686
163
+ ```
164
+
165
+ QA/VQA benchmark 部署,可以额外加 benchmark role overlay:
166
+
167
+ ```bash
168
+ python3 run_server.py \
169
+ --api-runs-dir ./api_runs \
170
+ --host 127.0.0.1 \
171
+ --port 8686 \
172
+ --role-prompt-file benchmarks/QA/role_prompt.md
173
+ ```
174
+
175
+ ### API Server 参数
176
+
177
+ | 参数 | 是否必需 | 默认值 | 含义 |
178
+ | --- | --- | --- | --- |
179
+ | `--api-runs-dir PATH` | 是 | 无 | API runs 的父目录;每个请求会创建一个子目录。 |
180
+ | `--host HOST` | 否 | `127.0.0.1` | 服务监听 host。 |
181
+ | `--port PORT` | 否 | `8686` | 服务监听端口。 |
182
+ | `--role-prompt-file PATH` | 否,可重复 | 无 | 追加 role prompt 到 base ResearchHarness prompt。 |
183
+ | `--input-wrapper` / `--no-input-wrapper` | 否 | 开启 | 开启或关闭输入 LLM wrapper。 |
184
+ | `--output-wrapper` / `--no-output-wrapper` | 否 | 开启 | 开启或关闭输出 LLM wrapper。 |
185
+
186
+ ### Wrapper 模式
187
+
188
+ 默认两个 wrapper 都开启。
189
+
190
+ 严格格式 benchmark 模式:
191
+
192
+ ```bash
193
+ python3 run_server.py \
194
+ --api-runs-dir ./api_runs \
195
+ --role-prompt-file benchmarks/QA/role_prompt.md \
196
+ --input-wrapper \
197
+ --output-wrapper
198
+ ```
199
+
200
+ 直接 agent 模式:
201
+
202
+ ```bash
203
+ python3 run_server.py \
204
+ --api-runs-dir ./api_runs \
205
+ --no-input-wrapper \
206
+ --no-output-wrapper
207
+ ```
208
+
209
+ 输入简单但最终答案需要严格格式:
210
+
211
+ ```bash
212
+ python3 run_server.py \
213
+ --api-runs-dir ./api_runs \
214
+ --no-input-wrapper \
215
+ --output-wrapper
216
+ ```
217
+
218
+ input wrapper 的作用是把原始用户请求整理为适合 agent 稳定执行的任务。output wrapper 的作用是把 agent 的最终结果整理为用户要求的答案格式。wrapper 不应该引入新事实,只做输入规范化和输出格式化。
219
+
220
+ API server 有意保持一问一答:每个 HTTP 请求创建一次隔离 run,并返回一个最终
221
+ assistant message。服务端不会跨请求保存 conversation state。如果应用需要 API
222
+ 多轮对话,应由客户端保存状态,并在后续请求中传入需要的上下文。
223
+
224
+ ```mermaid
225
+ flowchart LR
226
+ U[User Input] --> IW[Input Wrapper LLM]
227
+ IW --> A[ResearchHarness Agent]
228
+ A --> OW[Output Wrapper LLM]
229
+ OW --> O[Output]
230
+ ```
231
+
232
+ ## 5. API Workspace 结构
233
+
234
+ 每个 API 请求会创建一个 run 目录:
235
+
236
+ ```text
237
+ ./api_runs/
238
+ `-- run_YYYYMMDD_HHMMSS_<random>/
239
+ |-- agent_workspace/
240
+ | `-- inputs/
241
+ | `-- images/
242
+ `-- agent_trace/
243
+ |-- api_trace.jsonl
244
+ |-- trace_*.jsonl
245
+ `-- _session_state.json
246
+ ```
247
+
248
+ 含义:
249
+
250
+ | 路径 | 含义 |
251
+ | --- | --- |
252
+ | `run_YYYYMMDD_HHMMSS_<random>/` | 单个请求对应的 run 根目录。 |
253
+ | `agent_workspace/` | agent 唯一可见的 workspace;文件工具、Bash、`ls`、`cat` 都从这里开始。 |
254
+ | `agent_workspace/inputs/images/` | API 请求中用户提交的图片。 |
255
+ | `agent_trace/` | API trace、agent trace 和 runtime 记录。 |
256
+
257
+ 对于多模态请求,每张图片会同时走两条路径:当底层模型支持多模态输入时,
258
+ 图片内容会作为初始多模态输入直接传给模型;每张图片也会保存到
259
+ `agent_workspace/inputs/images/`。每个保存后的相对路径也会写进 agent 可见文本,
260
+ 让后续轮次可以用 `ReadImage` 读取稳定的本地路径,而不是反复依赖内联图片字节。
261
+
262
+ 这个结构把 agent 可见工作目录和服务端记录目录隔离开。
263
+ 在 API 部署模式下,trace 默认保存:每个请求都会在自己的 `agent_trace/`
264
+ 目录下写入 `api_trace.jsonl`、`trace_*.jsonl` 和 `_session_state.json`。
265
+
266
+ ## 6. 纯文本 OpenAI SDK 请求
267
+
268
+ ```python
269
+ from openai import OpenAI
270
+
271
+ client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
272
+
273
+ response = client.chat.completions.create(
274
+ model="researchharness",
275
+ messages=[
276
+ {"role": "user", "content": "Answer in one sentence: what is 2 + 2?"}
277
+ ],
278
+ )
279
+
280
+ print(response.choices[0].message.content)
281
+ ```
282
+
283
+ ## 7. 多模态 OpenAI SDK 请求
284
+
285
+ 第一版 API 支持同一个请求中包含一张或多张 `data:image/...;base64,...` 形式的图片 URL。API server 不支持远程图片 URL,也不支持让外部请求直接传本地文件路径。
286
+
287
+ 下面的示例在代码中生成一张图片,并要求返回 JSON。
288
+
289
+ ```python
290
+ import base64
291
+ from io import BytesIO
292
+
293
+ from PIL import Image, ImageDraw
294
+ from openai import OpenAI
295
+
296
+ image = Image.new("RGB", (320, 120), "white")
297
+ draw = ImageDraw.Draw(image)
298
+ draw.text((40, 45), "7 + 5 = ?", fill="black")
299
+ buffer = BytesIO()
300
+ image.save(buffer, format="PNG")
301
+ data_url = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")
302
+
303
+ client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
304
+
305
+ response = client.chat.completions.create(
306
+ model="researchharness",
307
+ messages=[
308
+ {
309
+ "role": "user",
310
+ "content": [
311
+ {
312
+ "type": "text",
313
+ "text": (
314
+ "The image contains a simple arithmetic expression. "
315
+ "Return JSON with exactly two keys: expression and answer."
316
+ ),
317
+ },
318
+ {"type": "image_url", "image_url": {"url": data_url}},
319
+ ],
320
+ }
321
+ ],
322
+ )
323
+
324
+ print(response.choices[0].message.content)
325
+ ```
326
+
327
+ 预期答案形状:
328
+
329
+ ```json
330
+ {"expression":"7 + 5","answer":12}
331
+ ```
332
+
333
+ ## 8. API 请求与返回协议
334
+
335
+ ### `POST /v1/chat/completions`
336
+
337
+ 支持的请求字段:
338
+
339
+ | 字段 | 是否必需 | 含义 |
340
+ | --- | --- | --- |
341
+ | `model` | 是 | 客户端看到的 model label;不会覆盖 `.env` 中的 `MODEL_NAME`。 |
342
+ | `messages` | 是 | OpenAI-style chat messages。 |
343
+ | `stream` | 否 | 必须不存在或为 `false`;当前不支持 streaming。 |
344
+ | `n` | 否 | 必须不存在或为 `1`。 |
345
+ | `max_tokens` | 否 | output wrapper 最大输出 token。 |
346
+ | `max_completion_tokens` | 否 | output wrapper 最大输出 token 的兼容别名。 |
347
+ | `response_format` | 否 | 作为输出格式提示传给 wrapper。 |
348
+
349
+ 支持的 message role:
350
+
351
+ | Role | 是否支持 |
352
+ | --- | --- |
353
+ | `system` | 支持 |
354
+ | `user` | 支持 |
355
+ | `assistant` | 支持 |
356
+ | `tool` | 不支持 |
357
+
358
+ 支持的 content 形式:
359
+
360
+ ```json
361
+ {"role": "user", "content": "plain text"}
362
+ ```
363
+
364
+ ```json
365
+ {
366
+ "role": "user",
367
+ "content": [
368
+ {"type": "text", "text": "question"},
369
+ {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
370
+ ]
371
+ }
372
+ ```
373
+
374
+ 返回结构:
375
+
376
+ ```json
377
+ {
378
+ "id": "chatcmpl_...",
379
+ "object": "chat.completion",
380
+ "created": 1770000000,
381
+ "model": "researchharness",
382
+ "choices": [
383
+ {
384
+ "index": 0,
385
+ "message": {
386
+ "role": "assistant",
387
+ "content": "final answer"
388
+ },
389
+ "finish_reason": "stop"
390
+ }
391
+ ]
392
+ }
393
+ ```
394
+
395
+ 调用方通常只需要读取:
396
+
397
+ ```python
398
+ response.choices[0].message.content
399
+ ```
400
+
401
+ ### `GET /v1/health`
402
+
403
+ 返回:
404
+
405
+ ```json
406
+ {
407
+ "status": "ok",
408
+ "api_runs_dir": "./api_runs",
409
+ "input_wrapper": true,
410
+ "output_wrapper": true
411
+ }
412
+ ```
413
+
414
+ ## 9. 工具能力
415
+
416
+ ResearchHarness 当前包含:
417
+
418
+ | 工具 | 用途 |
419
+ | --- | --- |
420
+ | `Glob` | 按模式发现文件。 |
421
+ | `Grep` | 在文件中搜索文本。 |
422
+ | `Read` | 有边界地读取文本文件。 |
423
+ | `ReadPDF` | 通过 MinerU/structai 解析 PDF。 |
424
+ | `ReadImage` | 读取本地图片,并把图片内容传给支持 vision 的模型。 |
425
+ | `Write` | 在 workspace 内写文件。 |
426
+ | `Edit` | 在 workspace 内 patch 文件。 |
427
+ | `Bash` | 在 workspace 内执行 shell 命令。 |
428
+ | `WebSearch` | 通过 Serper 进行网页搜索。 |
429
+ | `ScholarSearch` | 通过 Serper 进行学术搜索。 |
430
+ | `WebFetch` | 通过 Jina 和配置模型抓取、总结网页。 |
431
+ | `AskUser` | 交互式运行中向用户提问;某些 benchmark adapter 会禁用。 |
432
+ | `TerminalStart` / `TerminalWrite` / `TerminalRead` / `TerminalInterrupt` / `TerminalKill` | 持久终端会话。 |
433
+
434
+ ## 10. Trace 与记录
435
+
436
+ CLI 运行只有在传入 `--trace-dir` 时才会写 trace。如果不传
437
+ `--trace-dir`,CLI 运行不会写 trace 文件。
438
+
439
+ API 运行时,记录在:
440
+
441
+ ```text
442
+ ./api_runs/run_.../agent_trace/
443
+ ```
444
+
445
+ 重要文件:
446
+
447
+ | 文件 | 含义 |
448
+ | --- | --- |
449
+ | `api_trace.jsonl` | input wrapper、agent result、output wrapper 记录。 |
450
+ | `trace_*.jsonl` | agent runtime 的 flat trace。 |
451
+ | `_session_state.json` | 当前 session state;启用 trace 时和 `trace_*.jsonl` 写在同一目录。 |
452
+
453
+ trace 会记录工具调用、工具结果、LLM call capture payload、context compaction、错误和终止状态。
454
+
455
+ ## 11. Benchmark Adapter
456
+
457
+ tracked benchmark contract 放在 `benchmarks/` 下。
458
+
459
+ 当前 tracked adapter:
460
+
461
+ | Benchmark | 目录 | 说明 |
462
+ | --- | --- | --- |
463
+ | ResearchClawBench | `benchmarks/ResearchClawBench/` | CLI 方式接入,包含 role prompt 和 adapter。 |
464
+ | QA / VQA | `benchmarks/QA/` | OpenAI-compatible API 方式接入,支持纯文本和多模态 QA。 |
465
+
466
+ benchmark-specific 行为应放在 `benchmarks/`,不要塞进 `agent_base/`。
467
+
468
+ ## 12. 测试
469
+
470
+ 推荐检查:
471
+
472
+ ```bash
473
+ python3 tests/test_tool_availability.py
474
+ python3 tests/test_openai_api_checks.py
475
+ python3 tests/test_agent_extension_checks.py
476
+ python3 tests/test_edge_case_checks.py
477
+ python3 tests/test_toolchain_validation.py
478
+ ```
479
+
480
+ 如果使用 conda:
481
+
482
+ ```bash
483
+ conda run -n agent python3 tests/test_openai_api_checks.py
484
+ ```
485
+
486
+ ## 13. 排障
487
+
488
+ 常见问题:
489
+
490
+ | 现象 | 可能原因 | 处理 |
491
+ | --- | --- | --- |
492
+ | 缺少 required env | `.env` 不完整 | 填写所有必需变量。 |
493
+ | Web/PDF 工具失败 | VPN/proxy/TLS/服务问题 | 关闭 VPN/proxy 后重跑工具可用性测试。 |
494
+ | 图片请求返回 400 | 图片不是 `data:image/...;base64,...` | 把图片转成 base64 data URL。 |
495
+ | 后端模型拒绝图片 | 当前模型 endpoint 不支持 vision | 换用支持 vision 的模型,或改为纯文本任务。 |
496
+ | API 报 streaming 错误 | 请求里传了 `stream=true` | 当前只支持同步请求。 |
497
+ | 输出格式不符合预期 | output wrapper 关闭,或用户格式要求不明确 | 开启 `--output-wrapper`,并清楚说明输出格式。 |
498
+
499
+ ## 14. 当前边界
500
+
501
+ 第一版 API 暂不包括:
502
+
503
+ - streaming,
504
+ - async run status,
505
+ - cancellation,
506
+ - artifact download endpoint,
507
+ - 远程图片 URL 下载,
508
+ - 用户认证,
509
+ - 多租户访问控制。
510
+
511
+ 这些能力以后可以作为外层服务继续扩展,不需要破坏核心 harness loop。
frontend/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Local browser UI for ResearchHarness."""
frontend/local_server.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ import datetime as _dt
6
+ import os
7
+ import re
8
+ import shutil
9
+ import threading
10
+ import time
11
+ import traceback
12
+ from pathlib import Path
13
+ from typing import Any
14
+ from uuid import uuid4
15
+
16
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
17
+ from fastapi.responses import FileResponse, JSONResponse
18
+ from fastapi.staticfiles import StaticFiles
19
+
20
+ from agent_base.react_agent import MultiTurnReactAgent, default_llm_config
21
+ from agent_base.utils import (
22
+ MissingRequiredEnvError,
23
+ PROJECT_ROOT,
24
+ append_saved_image_paths_to_prompt,
25
+ image_input_content_parts,
26
+ load_dotenv,
27
+ require_required_env,
28
+ safe_jsonable,
29
+ stage_image_bytes_for_input,
30
+ )
31
+
32
+
33
# Bundled browser assets live in the "static" folder next to this module.
STATIC_DIR: Path = Path(__file__).resolve().parent.joinpath("static")

# Upload / listing limits. How they are enforced is defined outside this view.
MAX_UPLOAD_IMAGES: int = 12
MAX_IMAGE_BYTES: int = 12 * 1024**2  # 12 MiB
MAX_DIRECTORY_ENTRIES: int = 800

# Process-wide frontend settings; configure_frontend() overwrites these.
FRONTEND_ROLE_PROMPT: str = ""
FRONTEND_TRACE_DIR: str | None = None
FRONTEND_MANAGED_RUNS_DIR: str | None = None
FRONTEND_CLEANUP_RETENTION_SECONDS: int = 6 * 3600  # 6 hours
FRONTEND_CLEANUP_MAX_RUNS: int = 40
FRONTEND_CLEANUP_INTERVAL_SECONDS: int = 15 * 60  # 15 minutes

# Bookkeeping for managed-run cleanup.
# NOTE(review): presumably maintained by the cleanup helpers defined later in
# this module -- confirm against _start_managed_cleanup_thread().
_CLEANUP_THREAD_STARTED: bool = False
_ACTIVE_MANAGED_RUNS: set[str] = set()
_ACTIVE_MANAGED_RUNS_LOCK = threading.Lock()

# ASGI application; the static assets are served under the /static prefix.
app = FastAPI(title="ResearchHarness Local UI")
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
49
+
50
+
51
def configure_frontend(
    *,
    role_prompt: str = "",
    trace_dir: str | None = None,
    managed_runs_dir: str | None = None,
    cleanup_retention_seconds: int | None = None,
    cleanup_max_runs: int | None = None,
    cleanup_interval_seconds: int | None = None,
) -> None:
    """Store the frontend's process-wide settings in the module globals.

    Args:
        role_prompt: Extra role prompt text; stored stripped, and any falsy
            value becomes the empty string.
        trace_dir: Directory for traces. Created (with parents) when given;
            a falsy value resets ``FRONTEND_TRACE_DIR`` to ``None``.
        managed_runs_dir: Directory for managed runs. Created (with parents)
            when given; a falsy value resets ``FRONTEND_MANAGED_RUNS_DIR`` to
            ``None`` and the cleanup arguments below are ignored.
        cleanup_retention_seconds: Optional override, clamped to at least 60.
        cleanup_max_runs: Optional override, clamped to at least 1.
        cleanup_interval_seconds: Optional override, clamped to at least 60.

    Raises:
        ValueError: If ``trace_dir`` or ``managed_runs_dir`` names an existing
            path that is not a directory.
    """
    global FRONTEND_ROLE_PROMPT, FRONTEND_TRACE_DIR, FRONTEND_MANAGED_RUNS_DIR
    global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS

    FRONTEND_ROLE_PROMPT = str(role_prompt or "").strip()

    # --- trace directory ---------------------------------------------------
    if not trace_dir:
        FRONTEND_TRACE_DIR = None
    else:
        path = Path(trace_dir).expanduser()
        if path.exists() and not path.is_dir():
            raise ValueError(f"trace-dir is not a directory: {path}")
        path.mkdir(parents=True, exist_ok=True)
        FRONTEND_TRACE_DIR = str(path)

    # --- managed runs directory --------------------------------------------
    if not managed_runs_dir:
        # Without a managed-runs directory no cleanup is configured or started.
        FRONTEND_MANAGED_RUNS_DIR = None
        return

    path = Path(managed_runs_dir).expanduser()
    if path.exists() and not path.is_dir():
        raise ValueError(f"managed-runs-dir is not a directory: {path}")
    path.mkdir(parents=True, exist_ok=True)
    FRONTEND_MANAGED_RUNS_DIR = str(path)

    # Apply only the overrides that were passed, each clamped to its floor.
    if cleanup_retention_seconds is not None:
        FRONTEND_CLEANUP_RETENTION_SECONDS = max(60, int(cleanup_retention_seconds))
    if cleanup_max_runs is not None:
        FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
    if cleanup_interval_seconds is not None:
        FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))

    # Run one cleanup pass now, then hand off to the cleanup thread helper.
    cleanup_managed_runs_once()
    _start_managed_cleanup_thread()
88
+
89
+
90
class FrontendRunBridge:
    """Per-connection state shared by the websocket handler and the agent thread.

    Outgoing payloads are queued on ``outbound`` via the event loop supplied at
    construction; ``cancelled`` signals that the current run should stop; the
    ``conversation_*`` and ``managed_*`` attributes remember the last
    conversation and the managed run bound to this connection. Pending
    AskUser requests are tracked in two dicts guarded by ``_lock``.
    """

    def __init__(self, *, loop: asyncio.AbstractEventLoop):
        self.loop = loop
        self.outbound: asyncio.Queue[dict[str, Any]] = asyncio.Queue()
        self.cancelled = threading.Event()
        self.conversation_messages: list[dict[str, Any]] | None = None
        self.conversation_workspace_root: str = ""
        self.managed_run_root: str = ""
        self.managed_workspace_root: str = ""
        self.managed_trace_dir: str = ""
        # request_id -> answer text / wake-up event for in-flight AskUser calls.
        self._pending_answers: dict[str, str] = {}
        self._pending_events: dict[str, threading.Event] = {}
        self._lock = threading.Lock()

    def send(self, payload: dict[str, Any]) -> None:
        """Queue ``payload`` (passed through ``safe_jsonable``) on the event loop."""
        # call_soon_threadsafe lets non-loop threads enqueue onto the asyncio queue.
        self.loop.call_soon_threadsafe(self.outbound.put_nowait, safe_jsonable(payload))

    def trace_event(self, row: dict[str, Any]) -> None:
        """Forward one trace row to the client as a ``trace`` message."""
        self.send({"type": "trace", "row": row})

    def submit_answer(self, request_id: str, answer: str) -> bool:
        """Deliver the user's answer for a pending AskUser request.

        Returns:
            ``True`` if ``request_id`` matched a pending request and the waiter
            was woken, ``False`` otherwise.
        """
        with self._lock:
            event = self._pending_events.get(request_id)
            if event is None:
                return False
            self._pending_answers[request_id] = str(answer)
            event.set()
            return True

    def ask_user(self, *, question: str, context: str = "") -> str:
        """Send a question to the client and block until it is answered or cancelled.

        Args:
            question: Question text shown to the user.
            context: Optional supporting context.

        Returns:
            A ``[AskUser] ...`` string: the stripped answer, a note that the
            answer was empty, or a cancellation notice.
        """
        request_id = uuid4().hex
        event = threading.Event()
        with self._lock:
            self._pending_events[request_id] = event
        try:
            self.send(
                {
                    "type": "ask_user",
                    "request_id": request_id,
                    "question": question,
                    "context": context,
                }
            )
            # Poll in short slices so a cancellation is noticed promptly.
            while not event.wait(0.2):
                if self.cancelled.is_set():
                    return "[AskUser] Cancelled before user answer was received."
        finally:
            # Always unregister the request, including on cancellation, so the
            # bookkeeping dicts do not grow and a late answer is rejected by
            # submit_answer() instead of being stored for nobody.
            with self._lock:
                answer = self._pending_answers.pop(request_id, "")
                self._pending_events.pop(request_id, None)
        answer = str(answer).strip()
        if not answer:
            return "[AskUser] User answer was empty."
        return f"[AskUser] User answer:\n{answer}"
142
+
143
+
144
def _managed_runs_root() -> Path | None:
    """Return the resolved managed-runs root, or ``None`` when managed mode is off."""
    configured = FRONTEND_MANAGED_RUNS_DIR
    return Path(configured).expanduser().resolve() if configured else None
148
+
149
+
150
def _new_managed_run_root() -> Path:
    """Build a fresh ``run_<timestamp>_<8 hex chars>`` path under the managed root.

    The directory itself is not created here.

    Raises:
        ValueError: If managed workspace mode is not configured.
    """
    base = _managed_runs_root()
    if base is None:
        raise ValueError("managed workspace mode is not configured")
    stamp = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    unique = uuid4().hex[:8]
    return base / f"run_{stamp}_{unique}"
156
+
157
+
158
def _mark_managed_run_active(run_root: Path) -> None:
    """Record ``run_root`` (resolved) as in use so cleanup skips it."""
    key = str(run_root.resolve())
    with _ACTIVE_MANAGED_RUNS_LOCK:
        _ACTIVE_MANAGED_RUNS.add(key)
161
+
162
+
163
+ def _release_managed_run(bridge: FrontendRunBridge) -> None:
164
+ if bridge.managed_run_root:
165
+ with _ACTIVE_MANAGED_RUNS_LOCK:
166
+ _ACTIVE_MANAGED_RUNS.discard(str(Path(bridge.managed_run_root).resolve()))
167
+ bridge.managed_run_root = ""
168
+ bridge.managed_workspace_root = ""
169
+ bridge.managed_trace_dir = ""
170
+
171
+
172
def _create_managed_run(bridge: FrontendRunBridge) -> tuple[Path, str]:
    """Create a new managed run directory tree and bind it to ``bridge``.

    Creates ``agent_workspace`` and ``agent_trace`` under a fresh run root,
    stores the three paths on the bridge, and marks the run active.

    Returns:
        ``(workspace_root, trace_dir)`` — the workspace as a ``Path`` and the
        trace directory as a string.
    """
    run_root = _new_managed_run_root()
    workspace_root = run_root / "agent_workspace"
    trace_dir = run_root / "agent_trace"
    for directory in (workspace_root, trace_dir):
        directory.mkdir(parents=True, exist_ok=True)
    trace_dir_text = str(trace_dir)
    bridge.managed_run_root = str(run_root)
    bridge.managed_workspace_root = str(workspace_root)
    bridge.managed_trace_dir = trace_dir_text
    _mark_managed_run_active(run_root)
    return workspace_root, trace_dir_text
183
+
184
+
185
def cleanup_managed_runs_once() -> None:
    """Run one cleanup sweep over the managed-runs root.

    First removes inactive ``run_*`` directories whose mtime is older than
    ``FRONTEND_CLEANUP_RETENTION_SECONDS``; then, ordering the surviving runs
    newest first, removes inactive ones beyond the first
    ``FRONTEND_CLEANUP_MAX_RUNS``. Does nothing when managed mode is off or
    the root does not exist. Deletion errors are ignored.
    """
    root = _managed_runs_root()
    if root is None or not root.exists():
        return

    def _active_snapshot() -> set[str]:
        # Copy under the lock so the sweep works on a stable view.
        with _ACTIVE_MANAGED_RUNS_LOCK:
            return set(_ACTIVE_MANAGED_RUNS)

    def _scan_runs() -> list[tuple[float, Path, str]]:
        # (mtime, path, resolved path) for every run_* directory that can be stat'ed.
        found: list[tuple[float, Path, str]] = []
        for entry in root.iterdir():
            if not (entry.is_dir() and entry.name.startswith("run_")):
                continue
            try:
                found.append((entry.stat().st_mtime, entry, str(entry.resolve())))
            except OSError:
                continue
        return found

    started = time.time()
    in_use = _active_snapshot()
    for modified, run_dir, key in _scan_runs():
        if key in in_use:
            continue
        if FRONTEND_CLEANUP_RETENTION_SECONDS and started - modified > FRONTEND_CLEANUP_RETENTION_SECONDS:
            shutil.rmtree(run_dir, ignore_errors=True)

    in_use = _active_snapshot()
    newest_first = sorted(_scan_runs(), key=lambda item: item[0], reverse=True)
    for _, run_dir, key in newest_first[FRONTEND_CLEANUP_MAX_RUNS:]:
        if key not in in_use:
            shutil.rmtree(run_dir, ignore_errors=True)
223
+
224
+
225
def _managed_cleanup_loop() -> None:
    """Run cleanup sweeps forever, sleeping the configured interval between them.

    Intended as the target of the background cleanup thread. A failing sweep
    is reported on stderr and the loop keeps going; without this guard a
    single exception (for example the runs root disappearing between the
    existence check and the directory scan) would end the thread and stop all
    future cleanup.
    """
    while True:
        time.sleep(FRONTEND_CLEANUP_INTERVAL_SECONDS)
        try:
            cleanup_managed_runs_once()
        except Exception:
            # Top-level boundary of the worker thread: log and continue.
            traceback.print_exc()
229
+
230
+
231
def _start_managed_cleanup_thread() -> None:
    """Start the daemon cleanup thread unless it has already been started."""
    global _CLEANUP_THREAD_STARTED
    if not _CLEANUP_THREAD_STARTED:
        worker = threading.Thread(target=_managed_cleanup_loop, daemon=True)
        worker.start()
        _CLEANUP_THREAD_STARTED = True
238
+
239
+
240
class FrontendInteractiveAgent(MultiTurnReactAgent):
    """Agent variant that routes the ``AskUser`` tool through a ``FrontendRunBridge``."""

    def __init__(self, *, bridge: FrontendRunBridge, **kwargs: Any):
        super().__init__(**kwargs)
        self.bridge = bridge

    def custom_call_tool(self, tool_name: str, tool_args: Any, **kwargs: Any):
        """Handle ``AskUser`` via the bridge; delegate every other tool to the base class.

        For ``AskUser`` the arguments are parsed with the registered tool's
        ``parse_json_args``; problems are reported as ``[AskUser] ...`` strings
        rather than raised.
        """
        if tool_name != "AskUser":
            return super().custom_call_tool(tool_name, tool_args, **kwargs)

        ask_tool = self.tool_map.get("AskUser")
        if ask_tool is None:
            return "[AskUser] Tool is not available in this run."

        try:
            arguments = ask_tool.parse_json_args(tool_args)
        except ValueError as exc:
            return f"[AskUser] {exc}"

        question = str(arguments.get("question", "")).strip()
        if not question:
            return "[AskUser] question must be a non-empty string."
        context = str(arguments.get("context", "") or "").strip()
        return self.bridge.ask_user(question=question, context=context)
260
+
261
+
262
+ def _safe_image_suffix(mime: str, filename: str = "") -> str:
263
+ suffix = Path(filename).suffix.lower()
264
+ if suffix in {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}:
265
+ return suffix
266
+ mapping = {
267
+ "image/png": ".png",
268
+ "image/jpeg": ".jpg",
269
+ "image/gif": ".gif",
270
+ "image/webp": ".webp",
271
+ "image/bmp": ".bmp",
272
+ }
273
+ return mapping.get(mime.lower(), ".png")
274
+
275
+
276
def decode_image_data_url(data_url: str, *, filename: str = "") -> tuple[str, bytes]:
    """Decode a ``data:image/...;base64,...`` URL into ``(suffix, raw_bytes)``.

    Args:
        data_url: The data URL to decode.
        filename: Optional original file name, used when choosing the suffix.

    Raises:
        ValueError: If the URL does not have the expected form, the base64
            payload is invalid, the decoded image is empty, or it exceeds
            ``MAX_IMAGE_BYTES``.
    """
    parsed = re.fullmatch(r"data:(image/[A-Za-z0-9.+-]+);base64,(.*)", str(data_url), flags=re.DOTALL)
    if parsed is None:
        raise ValueError("image must be a data:image/...;base64,... URL")
    mime, encoded = parsed.groups()
    try:
        payload = base64.b64decode(encoded, validate=True)
    except ValueError as exc:
        raise ValueError(f"invalid base64 image data: {exc}") from exc
    size = len(payload)
    if size == 0:
        raise ValueError("image upload is empty")
    if size > MAX_IMAGE_BYTES:
        raise ValueError(f"image upload exceeds {MAX_IMAGE_BYTES} bytes")
    return _safe_image_suffix(mime, filename), payload
290
+
291
+
292
def save_uploaded_images(workspace_root: Path, images: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[str]]:
    """Decode uploaded images, stage them for the agent, and build model input parts.

    Args:
        workspace_root: Workspace passed to ``stage_image_bytes_for_input``.
        images: Items with a ``data_url`` and an optional ``name``.

    Returns:
        ``(content_parts, saved_paths)``; both empty when no images are given.

    Raises:
        ValueError: If there are more than ``MAX_UPLOAD_IMAGES`` items, an item
            is not a dict, or an image fails to decode/validate.
    """
    if not images:
        return [], []
    if len(images) > MAX_UPLOAD_IMAGES:
        raise ValueError(f"at most {MAX_UPLOAD_IMAGES} images are supported per run")
    timestamp = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    content_parts: list[dict[str, Any]] = []
    saved_paths: list[str] = []
    for idx, item in enumerate(images, start=1):
        if not isinstance(item, dict):
            raise ValueError("each image item must be an object")
        data_url = str(item.get("data_url", "")).strip()
        filename = str(item.get("name", "") or f"image_{idx}")
        suffix, raw = decode_image_data_url(data_url, filename=filename)
        # The name comes from the client. Neutralise path separators and NUL
        # so it cannot steer the staged file outside its intended directory.
        # NOTE(review): stage_image_bytes_for_input may already sanitise the
        # name; this is a defensive second layer — confirm against the helper.
        safe_name = re.sub(r"[\\/\x00]", "_", filename)
        saved_path = stage_image_bytes_for_input(
            raw,
            workspace_root=workspace_root,
            filename=f"{timestamp}_{safe_name}",
            image_index=idx - 1,
            suffix=suffix,
        )
        saved_paths.append(saved_path)
        content_parts.extend(image_input_content_parts(data_url, saved_path))
    return content_parts, saved_paths
316
+
317
+
318
def _prompt_with_uploaded_image_paths(prompt: str, saved_paths: list[str]) -> str:
    """Return ``prompt`` as processed by ``append_saved_image_paths_to_prompt``."""
    augmented = append_saved_image_paths_to_prompt(prompt, saved_paths)
    return augmented
320
+
321
+
322
def _run_agent_thread(
    *,
    bridge: FrontendRunBridge,
    prompt: str,
    workspace_root: Path,
    initial_content_parts: list[dict[str, Any]],
    trace_dir: str | None = None,
    prior_messages: list[dict[str, Any]] | None = None,
) -> None:
    """Thread target: run one agent session and report progress through ``bridge``.

    Loads the project ``.env``, checks required environment variables, builds a
    ``FrontendInteractiveAgent``, and runs the session. Sends ``run_started``
    before and ``run_finished`` after the session, and stores the resulting
    messages and workspace on the bridge. Every exception is converted into a
    ``run_error`` message; unexpected ones also carry a traceback.

    Args:
        bridge: Connection bridge used for events, AskUser, and cancellation.
        prompt: User prompt for this run.
        workspace_root: Workspace directory for the session.
        initial_content_parts: Extra content parts; an empty list is passed on as ``None``.
        trace_dir: Trace directory; ``None`` falls back to ``FRONTEND_TRACE_DIR``.
        prior_messages: Earlier conversation messages when continuing a conversation.
    """
    try:
        load_dotenv(PROJECT_ROOT / ".env")
        require_required_env("ResearchHarness frontend")
        if trace_dir is None:
            effective_trace_dir = FRONTEND_TRACE_DIR
        else:
            effective_trace_dir = trace_dir
        agent = FrontendInteractiveAgent(
            bridge=bridge,
            llm=default_llm_config(),
            trace_dir=effective_trace_dir,
            role_prompt=FRONTEND_ROLE_PROMPT or None,
        )
        workspace = str(workspace_root)
        started = {
            "type": "run_started",
            "model": agent.model,
            "workspace_root": workspace,
            "trace_dir": effective_trace_dir or "",
        }
        bridge.send(started)
        outcome = agent._run_session(
            prompt,
            workspace_root=workspace,
            event_callback=bridge.trace_event,
            initial_content_parts=initial_content_parts or None,
            prior_messages=prior_messages,
            interrupt_event=bridge.cancelled,
        )
        # Remember the conversation so a follow-up turn can continue it.
        bridge.conversation_messages = outcome.get("messages", [])
        bridge.conversation_workspace_root = workspace
        finished = {
            "type": "run_finished",
            "result_text": outcome.get("result_text", ""),
            "termination": outcome.get("termination", ""),
        }
        bridge.send(finished)
    except (MissingRequiredEnvError, ValueError) as exc:
        bridge.send({"type": "run_error", "error": str(exc)})
    except Exception as exc:
        failure = {"type": "run_error", "error": str(exc), "traceback": traceback.format_exc()}
        bridge.send(failure)
370
+
371
+
372
+ def _resolve_existing_workspace(raw_path: str) -> Path:
373
+ if not str(raw_path or "").strip():
374
+ raise ValueError("workspace path is required")
375
+ path = Path(raw_path).expanduser()
376
+ if not path.is_absolute():
377
+ path = (Path.cwd() / path).resolve()
378
+ else:
379
+ path = path.resolve()
380
+ if not path.exists() or not path.is_dir():
381
+ raise ValueError(f"workspace must be an existing directory: {path}")
382
+ return path
383
+
384
+
385
+ def _resolve_directory_browser_path(raw_path: str = "") -> Path:
386
+ text = str(raw_path or "").strip()
387
+ if text:
388
+ path = Path(text).expanduser()
389
+ else:
390
+ path = Path.home() if Path.home().exists() else PROJECT_ROOT
391
+ if not path.is_absolute():
392
+ path = (Path.cwd() / path).resolve()
393
+ else:
394
+ path = path.resolve()
395
+ if not path.exists() or not path.is_dir():
396
+ raise ValueError(f"directory does not exist: {path}")
397
+ return path
398
+
399
+
400
def _directory_root_choices() -> list[dict[str, str]]:
    """List shortcut directories offered by the workspace browser.

    Candidates are the home directory, ``PROJECT_ROOT``, its ``workspace``
    subdirectory, the current working directory, ``/mnt`` and ``/`` (plus
    every drive letter on Windows). Only existing directories are kept, each
    resolved path appears once in first-seen order, and the home directory is
    labelled ``"Home"``.
    """
    candidates = [Path.home(), PROJECT_ROOT, PROJECT_ROOT / "workspace", Path.cwd(), Path("/mnt"), Path("/")]
    if os.name == "nt":
        candidates.extend(Path(f"{letter}:\\") for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    # Keyed by resolved path; dict insertion order preserves first-seen order.
    label_by_path: dict[str, str] = {}
    for candidate in candidates:
        try:
            resolved = candidate.expanduser().resolve()
        except (OSError, RuntimeError):
            continue
        if not (resolved.exists() and resolved.is_dir()):
            continue
        key = str(resolved)
        if key in label_by_path:
            continue
        if resolved == Path.home().resolve():
            label_by_path[key] = "Home"
        else:
            label_by_path[key] = resolved.name or key
    return [{"label": label, "path": path} for path, label in label_by_path.items()]
422
+
423
+
424
def _workspace_directory_payload(raw_path: str = "") -> dict[str, Any]:
    """Build the JSON payload describing one directory for the workspace browser.

    Args:
        raw_path: Directory to list; blank selects the browser's default.

    Returns:
        A dict with the resolved ``path``, its ``parent`` (empty string at the
        filesystem root), the child directories as ``entries`` (sorted
        case-insensitively by name, at most ``MAX_DIRECTORY_ENTRIES``),
        ``truncated`` (whether further child directories were left out), and
        the shortcut ``roots``.

    Raises:
        ValueError: If the directory cannot be resolved or read.
    """
    directory = _resolve_directory_browser_path(raw_path)
    entries: list[dict[str, str]] = []
    truncated = False
    try:
        children = sorted(directory.iterdir(), key=lambda item: item.name.casefold())
    except PermissionError as exc:
        raise ValueError(f"permission denied: {directory}") from exc
    except OSError as exc:
        raise ValueError(f"cannot read directory {directory}: {exc}") from exc

    for child in children:
        try:
            if not child.is_dir():
                continue
        except OSError:
            continue
        # Check the cap only for real directories, so `truncated` is set
        # exactly when a directory was omitted — not when only files remain.
        if len(entries) >= MAX_DIRECTORY_ENTRIES:
            truncated = True
            break
        entries.append({"name": child.name or str(child), "path": str(child)})

    # At the filesystem root the parent equals the directory itself.
    parent = directory.parent if directory.parent != directory else None
    return {
        "path": str(directory),
        "parent": str(parent) if parent else "",
        "entries": entries,
        "truncated": truncated,
        "roots": _directory_root_choices(),
    }
454
+
455
+
456
@app.get("/api/workspace-directories")
def workspace_directories(path: str = "") -> JSONResponse:
    """Return the directory-browser payload for ``path``, or HTTP 400 with an error message."""
    try:
        listing = _workspace_directory_payload(path)
        return JSONResponse(listing)
    except ValueError as exc:
        problem = {"error": str(exc)}
        return JSONResponse(problem, status_code=400)
462
+
463
+
464
@app.get("/")
def index() -> FileResponse:
    """Serve the frontend's ``index.html`` from the static directory."""
    page = STATIC_DIR / "index.html"
    return FileResponse(page)
467
+
468
+
469
@app.get("/favicon.ico")
def favicon() -> FileResponse:
    """Serve the SVG favicon for ``/favicon.ico`` requests."""
    icon = STATIC_DIR / "favicon.svg"
    return FileResponse(icon, media_type="image/svg+xml")
472
+
473
+
474
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket) -> None:
    """Serve one frontend websocket connection.

    Accepts the socket, starts a task that forwards everything queued on the
    connection's ``FrontendRunBridge`` to the client, announces ``ready``, and
    then dispatches incoming messages by their ``type``:

    * ``start`` — validate the request, pick the workspace (a managed run
      directory or a user-supplied existing directory), stage uploaded images,
      and launch the agent in a daemon thread. Only one run may be active.
    * ``ask_user_answer`` — deliver an answer to a pending AskUser request.
    * ``interrupt`` — request cancellation of the active run.
    * ``new`` — forget the current conversation (only while no run is active).

    Malformed frames and unknown types are answered with a ``run_error``
    message instead of closing the connection. On disconnect the run is
    cancelled, the managed run is released, and the sender task is cancelled.
    """
    await websocket.accept()
    bridge = FrontendRunBridge(loop=asyncio.get_running_loop())
    run_thread: threading.Thread | None = None

    async def sender() -> None:
        # Pump queued payloads to the client until the task is cancelled.
        while True:
            payload = await bridge.outbound.get()
            await websocket.send_json(payload)

    sender_task = asyncio.create_task(sender())
    try:
        await websocket.send_json({"type": "ready", "managed_workspace": bool(FRONTEND_MANAGED_RUNS_DIR)})
        while True:
            try:
                message = await websocket.receive_json()
            except ValueError:
                # json.JSONDecodeError is a ValueError: a frame that is not
                # valid JSON should not tear down the whole connection.
                bridge.send({"type": "run_error", "error": "Websocket message is not valid JSON."})
                continue
            if not isinstance(message, dict):
                # Valid JSON that is not an object has no .get(); reject it
                # explicitly instead of failing with AttributeError.
                bridge.send({"type": "run_error", "error": "Websocket message must be a JSON object."})
                continue
            message_type = str(message.get("type", "")).strip()
            if message_type == "start":
                if run_thread is not None and run_thread.is_alive():
                    bridge.send({"type": "run_error", "error": "A run is already active. Wait for it to finish before starting a new conversation."})
                    continue
                prompt = str(message.get("prompt", "")).strip()
                if not prompt:
                    bridge.send({"type": "run_error", "error": "Prompt is required."})
                    continue
                try:
                    continue_conversation = bool(message.get("continue_conversation"))
                    prior_messages = None
                    effective_trace_dir = FRONTEND_TRACE_DIR
                    if FRONTEND_MANAGED_RUNS_DIR:
                        if continue_conversation:
                            if not bridge.conversation_messages or not bridge.managed_workspace_root:
                                bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
                                continue
                            workspace_root = Path(bridge.managed_workspace_root)
                            effective_trace_dir = bridge.managed_trace_dir or FRONTEND_TRACE_DIR
                            prior_messages = bridge.conversation_messages
                        else:
                            _release_managed_run(bridge)
                            # The previous managed run is gone; drop its history
                            # right away so that a failure further down (e.g. a
                            # rejected image) cannot leave old messages paired
                            # with the new, empty workspace.
                            bridge.conversation_messages = None
                            workspace_root, effective_trace_dir = _create_managed_run(bridge)
                    else:
                        workspace_root = _resolve_existing_workspace(str(message.get("workspace_root", "")))
                        if continue_conversation:
                            if not bridge.conversation_messages:
                                bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
                                continue
                            elif bridge.conversation_workspace_root and bridge.conversation_workspace_root != str(workspace_root):
                                bridge.send({"type": "run_error", "error": "Workspace changed. Start a new chat before using a different workspace."})
                                continue
                            else:
                                prior_messages = bridge.conversation_messages
                    image_parts, saved_paths = save_uploaded_images(
                        workspace_root,
                        message.get("images", []) if isinstance(message.get("images", []), list) else [],
                    )
                    run_prompt = _prompt_with_uploaded_image_paths(prompt, saved_paths)
                except ValueError as exc:
                    bridge.send({"type": "run_error", "error": str(exc)})
                    continue
                # A previous interrupt must not cancel the run about to start.
                bridge.cancelled.clear()
                if not continue_conversation:
                    bridge.conversation_messages = None
                    bridge.conversation_workspace_root = str(workspace_root)
                    bridge.send({"type": "conversation_reset"})
                if saved_paths:
                    bridge.send({"type": "uploaded_images", "paths": saved_paths})
                run_thread = threading.Thread(
                    target=_run_agent_thread,
                    kwargs={
                        "bridge": bridge,
                        "prompt": run_prompt,
                        "workspace_root": workspace_root,
                        "initial_content_parts": image_parts,
                        "trace_dir": effective_trace_dir,
                        "prior_messages": prior_messages,
                    },
                    daemon=True,
                )
                run_thread.start()
            elif message_type == "ask_user_answer":
                ok = bridge.submit_answer(str(message.get("request_id", "")), str(message.get("answer", "")))
                if not ok:
                    bridge.send({"type": "run_error", "error": "No pending AskUser request matched that answer."})
            elif message_type == "interrupt":
                if run_thread is not None and run_thread.is_alive():
                    bridge.cancelled.set()
                    bridge.send({"type": "interrupt_requested"})
                else:
                    bridge.send({"type": "run_error", "error": "No active run is available to interrupt."})
            elif message_type == "new":
                if run_thread is not None and run_thread.is_alive():
                    bridge.send({"type": "run_error", "error": "The current run is still active. Start a new conversation after it finishes."})
                else:
                    _release_managed_run(bridge)
                    bridge.conversation_messages = None
                    bridge.conversation_workspace_root = ""
                    bridge.send({"type": "conversation_reset"})
            else:
                bridge.send({"type": "run_error", "error": f"Unknown websocket message type: {message_type}"})
    except WebSocketDisconnect:
        bridge.cancelled.set()
    finally:
        # Runs on every exit path: stop the agent, free the managed run so
        # cleanup may reclaim it, and stop the outbound pump.
        bridge.cancelled.set()
        _release_managed_run(bridge)
        sender_task.cancel()
frontend/static/app.css ADDED
@@ -0,0 +1,955 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --bg: #ffffff;
3
+ --bar: #f5f5f5;
4
+ --border: #e8e8e8;
5
+ --panel: rgba(255, 255, 255, 0.82);
6
+ --panel-strong: rgba(255, 255, 255, 0.96);
7
+ --hover: #f7f7f7;
8
+ --text: #171717;
9
+ --muted: #747474;
10
+ --accent-start: #1a1a1a;
11
+ --accent-end: #333333;
12
+ --accent-text: #ffffff;
13
+ --glow-rgb: 0, 0, 0;
14
+ --danger: #b42318;
15
+ --ok: #1f7a42;
16
+ --warn: #9a6700;
17
+ --shadow: 0 18px 70px rgba(0, 0, 0, 0.08);
18
+ }
19
+
20
+ [data-theme="yellow"] {
21
+ --bg: #faf8f4;
22
+ --bar: #f0ebe1;
23
+ --border: #e5ddd0;
24
+ --panel: rgba(255, 252, 246, 0.84);
25
+ --panel-strong: rgba(255, 252, 246, 0.96);
26
+ --hover: #f0ece4;
27
+ --text: #2f2113;
28
+ --muted: #8a7055;
29
+ --accent-start: #1c1208;
30
+ --accent-end: #3a2410;
31
+ --accent-text: #ffffff;
32
+ --glow-rgb: 180, 128, 40;
33
+ }
34
+
35
+ [data-theme="blue"] {
36
+ --bg: #f3f5f8;
37
+ --bar: #e3e8ef;
38
+ --border: #d3dae3;
39
+ --panel: rgba(248, 251, 255, 0.84);
40
+ --panel-strong: rgba(248, 251, 255, 0.96);
41
+ --hover: #e8eef5;
42
+ --text: #172f4a;
43
+ --muted: #6a8aaa;
44
+ --accent-start: #1a3654;
45
+ --accent-end: #1e4a7a;
46
+ --accent-text: #ffffff;
47
+ --glow-rgb: 38, 88, 155;
48
+ }
49
+
50
+ [data-theme="dark"] {
51
+ --bg: #111110;
52
+ --bar: #1c1c1a;
53
+ --border: #2e2e2b;
54
+ --panel: rgba(28, 28, 26, 0.86);
55
+ --panel-strong: rgba(28, 28, 26, 0.98);
56
+ --hover: #242420;
57
+ --text: #e8e6df;
58
+ --muted: #8a8a80;
59
+ --accent-start: #e8e6df;
60
+ --accent-end: #d0cec7;
61
+ --accent-text: #111110;
62
+ --glow-rgb: 220, 210, 180;
63
+ --danger: #ffb4a9;
64
+ --ok: #9de8b5;
65
+ --warn: #f7d36f;
66
+ --shadow: 0 18px 70px rgba(0, 0, 0, 0.34);
67
+ }
68
+
69
+ * {
70
+ box-sizing: border-box;
71
+ }
72
+
73
+ html,
74
+ body {
75
+ height: 100%;
76
+ }
77
+
78
+ body {
79
+ margin: 0;
80
+ overflow: hidden;
81
+ background: var(--bg);
82
+ color: var(--text);
83
+ font-family: "IBM Plex Sans", "Aptos", "Segoe UI Variable", "Noto Sans CJK SC", "Microsoft YaHei", "PingFang SC", sans-serif;
84
+ transition: background 0.3s ease, color 0.3s ease;
85
+ }
86
+
87
+ button,
88
+ input,
89
+ textarea {
90
+ font: inherit;
91
+ }
92
+
93
+ button {
94
+ cursor: pointer;
95
+ }
96
+
97
+ .chat-shell {
98
+ position: relative;
99
+ z-index: 1;
100
+ display: grid;
101
+ grid-template-rows: auto auto minmax(0, 1fr) auto;
102
+ width: min(980px, 100%);
103
+ height: 100vh;
104
+ height: 100dvh;
105
+ min-height: 0;
106
+ overflow: hidden;
107
+ margin: 0 auto;
108
+ padding: 14px 16px 18px;
109
+ }
110
+
111
+ .chat-shell > * {
112
+ min-height: 0;
113
+ }
114
+
115
+ .topbar,
116
+ .workspace-strip,
117
+ .composer {
118
+ border: 1px solid var(--border);
119
+ background: var(--panel);
120
+ backdrop-filter: blur(18px);
121
+ box-shadow: var(--shadow);
122
+ }
123
+
124
+ .topbar {
125
+ position: sticky;
126
+ top: 0;
127
+ z-index: 4;
128
+ display: flex;
129
+ align-items: center;
130
+ justify-content: space-between;
131
+ gap: 12px;
132
+ border-radius: 22px;
133
+ padding: 10px 12px;
134
+ background: var(--panel-strong);
135
+ box-shadow: 0 14px 38px rgba(var(--glow-rgb), 0.15), 0 3px 10px rgba(0, 0, 0, 0.08);
136
+ }
137
+
138
+ .brand {
139
+ display: flex;
140
+ align-items: center;
141
+ gap: 10px;
142
+ min-width: 0;
143
+ }
144
+
145
+ .brand strong {
146
+ display: block;
147
+ letter-spacing: -0.02em;
148
+ }
149
+
150
+ .logo {
151
+ display: grid;
152
+ place-items: center;
153
+ width: 38px;
154
+ height: 38px;
155
+ border-radius: 12px;
156
+ background: linear-gradient(135deg, var(--accent-start), var(--accent-end));
157
+ color: var(--accent-text);
158
+ font-size: 0.82rem;
159
+ font-weight: 900;
160
+ }
161
+
162
+ .status {
163
+ display: inline-flex;
164
+ align-items: center;
165
+ gap: 6px;
166
+ margin-top: 2px;
167
+ color: var(--muted);
168
+ font-size: 0.78rem;
169
+ font-weight: 800;
170
+ }
171
+
172
+ .status.running::before {
173
+ content: "";
174
+ width: 10px;
175
+ height: 10px;
176
+ border: 2px solid currentColor;
177
+ border-top-color: transparent;
178
+ border-radius: 50%;
179
+ animation: spin 0.82s linear infinite;
180
+ }
181
+
182
+ .status.running {
183
+ color: var(--warn);
184
+ }
185
+
186
+ .status.done {
187
+ color: var(--ok);
188
+ }
189
+
190
+ .status.error {
191
+ color: var(--danger);
192
+ }
193
+
194
+ .top-actions {
195
+ display: flex;
196
+ align-items: center;
197
+ gap: 8px;
198
+ flex-wrap: wrap;
199
+ justify-content: flex-end;
200
+ }
201
+
202
+ .plain,
203
+ .send-button,
204
+ .icon-button {
205
+ border: 1px solid var(--border);
206
+ border-radius: 999px;
207
+ background: var(--panel-strong);
208
+ color: var(--text);
209
+ font-weight: 850;
210
+ transition: transform 0.18s ease, border-color 0.18s ease, background 0.18s ease;
211
+ }
212
+
213
+ .plain {
214
+ padding: 8px 12px;
215
+ }
216
+
217
+ .plain:hover,
218
+ .icon-button:hover {
219
+ border-color: rgba(var(--glow-rgb), 0.38);
220
+ transform: translateY(-1px);
221
+ }
222
+
223
+ .workspace-strip {
224
+ position: sticky;
225
+ top: 66px;
226
+ z-index: 4;
227
+ display: grid;
228
+ grid-template-columns: minmax(0, 1fr);
229
+ align-items: center;
230
+ margin-top: 10px;
231
+ border-radius: 18px;
232
+ padding: 9px 12px;
233
+ background: var(--panel-strong);
234
+ box-shadow: 0 14px 38px rgba(var(--glow-rgb), 0.15), 0 3px 10px rgba(0, 0, 0, 0.08);
235
+ }
236
+
237
+ .workspace-strip input {
238
+ display: none;
239
+ }
240
+
241
+ .workspace-strip span {
242
+ display: block;
243
+ min-width: 0;
244
+ max-width: 100%;
245
+ color: var(--muted);
246
+ font-size: 0.82rem;
247
+ overflow-wrap: anywhere;
248
+ word-break: break-word;
249
+ white-space: normal;
250
+ }
251
+
252
+ .messages {
253
+ display: flex;
254
+ flex-direction: column;
255
+ flex: 1 1 auto;
256
+ gap: 14px;
257
+ height: 100%;
258
+ min-height: 0;
259
+ min-width: 0;
260
+ max-height: 100%;
261
+ overflow-x: hidden;
262
+ overflow-y: scroll;
263
+ overscroll-behavior: contain;
264
+ padding: 24px 4px 18px;
265
+ scrollbar-gutter: stable;
266
+ -webkit-overflow-scrolling: touch;
267
+ }
268
+
269
+ .messages::-webkit-scrollbar,
270
+ .workspace-list::-webkit-scrollbar {
271
+ width: 10px;
272
+ }
273
+
274
+ .messages::-webkit-scrollbar-thumb,
275
+ .workspace-list::-webkit-scrollbar-thumb {
276
+ border: 3px solid transparent;
277
+ border-radius: 999px;
278
+ background: rgba(var(--glow-rgb), 0.24);
279
+ background-clip: padding-box;
280
+ }
281
+
282
+ .welcome {
283
+ margin: auto;
284
+ max-width: 650px;
285
+ text-align: center;
286
+ }
287
+
288
+ .welcome h1 {
289
+ margin: 0;
290
+ font-size: clamp(2.2rem, 6vw, 4.7rem);
291
+ line-height: 0.94;
292
+ letter-spacing: -0.055em;
293
+ }
294
+
295
+ .welcome p {
296
+ margin: 18px auto 0;
297
+ max-width: 520px;
298
+ color: var(--muted);
299
+ line-height: 1.6;
300
+ }
301
+
302
+ .message,
303
+ .event {
304
+ flex: 0 0 auto;
305
+ border: 1px solid var(--border);
306
+ border-radius: 22px;
307
+ background: var(--panel);
308
+ backdrop-filter: blur(18px);
309
+ box-shadow: 0 10px 34px rgba(0, 0, 0, 0.05);
310
+ overflow: hidden;
311
+ }
312
+
313
+ .message {
314
+ max-width: min(760px, 92%);
315
+ }
316
+
317
+ .event {
318
+ width: min(760px, 92%);
319
+ }
320
+
321
+ .message.user {
322
+ align-self: flex-end;
323
+ background: linear-gradient(135deg, var(--accent-start), var(--accent-end));
324
+ color: var(--accent-text);
325
+ }
326
+
327
+ .message.assistant,
328
+ .event {
329
+ align-self: flex-start;
330
+ }
331
+
332
+ .message-body {
333
+ padding: 14px 16px;
334
+ }
335
+
336
+ .event-body {
337
+ max-height: none;
338
+ overflow: hidden;
339
+ transition: max-height 0.24s ease;
340
+ }
341
+
342
+ .event-body-inner {
343
+ padding: 14px 16px;
344
+ }
345
+
346
+ .event.collapsed .event-body {
347
+ max-height: 220px;
348
+ }
349
+
350
+ .event.collapsed .event-body-inner {
351
+ position: relative;
352
+ overflow: hidden;
353
+ }
354
+
355
+ .event.collapsed .event-body-inner::after {
356
+ content: "";
357
+ position: absolute;
358
+ right: 0;
359
+ bottom: 0;
360
+ left: 0;
361
+ height: 60px;
362
+ background: linear-gradient(to bottom, transparent, var(--panel-strong));
363
+ pointer-events: none;
364
+ }
365
+
366
+ .event.can-collapse {
367
+ cursor: pointer;
368
+ }
369
+
370
+ .event.latest {
371
+ cursor: default;
372
+ }
373
+
374
+ .event:not(.can-collapse) .event-toggle {
375
+ display: none;
376
+ }
377
+
378
+ .message-body pre,
379
+ .event-body pre {
380
+ margin: 0;
381
+ white-space: pre-wrap;
382
+ word-break: break-word;
383
+ font-family: "IBM Plex Mono", "SFMono-Regular", Consolas, monospace;
384
+ font-size: 0.86rem;
385
+ line-height: 1.5;
386
+ }
387
+
388
+ .markdown-body {
389
+ line-height: 1.6;
390
+ word-break: break-word;
391
+ }
392
+
393
+ .markdown-body > *:first-child {
394
+ margin-top: 0;
395
+ }
396
+
397
+ .markdown-body > *:last-child {
398
+ margin-bottom: 0;
399
+ }
400
+
401
+ .markdown-body p,
402
+ .markdown-body ul,
403
+ .markdown-body ol,
404
+ .markdown-body blockquote,
405
+ .markdown-body table,
406
+ .markdown-body pre {
407
+ margin: 0 0 0.8rem;
408
+ }
409
+
410
+ .markdown-body h1,
411
+ .markdown-body h2,
412
+ .markdown-body h3,
413
+ .markdown-body h4,
414
+ .markdown-body h5,
415
+ .markdown-body h6 {
416
+ margin: 0 0 0.65rem;
417
+ line-height: 1.2;
418
+ }
419
+
420
+ .markdown-body ul,
421
+ .markdown-body ol {
422
+ padding-left: 1.35rem;
423
+ }
424
+
425
+ .markdown-body code {
426
+ padding: 0.1rem 0.28rem;
427
+ border-radius: 6px;
428
+ background: rgba(0, 0, 0, 0.08);
429
+ font-family: "IBM Plex Mono", "SFMono-Regular", Consolas, monospace;
430
+ font-size: 0.88em;
431
+ }
432
+
433
+ .markdown-body pre {
434
+ padding: 12px;
435
+ border-radius: 14px;
436
+ background: rgba(0, 0, 0, 0.08);
437
+ overflow-x: auto;
438
+ }
439
+
440
+ .markdown-body pre code {
441
+ padding: 0;
442
+ background: transparent;
443
+ }
444
+
445
+ .markdown-body blockquote {
446
+ padding-left: 0.85rem;
447
+ border-left: 3px solid rgba(var(--glow-rgb), 0.35);
448
+ color: var(--muted);
449
+ }
450
+
451
+ .markdown-body a {
452
+ color: inherit;
453
+ text-decoration: underline;
454
+ text-underline-offset: 3px;
455
+ }
456
+
457
+ .markdown-body table {
458
+ width: 100%;
459
+ border-collapse: collapse;
460
+ overflow: hidden;
461
+ border-radius: 14px;
462
+ }
463
+
464
+ .markdown-body th,
465
+ .markdown-body td {
466
+ padding: 8px 10px;
467
+ border: 1px solid rgba(0, 0, 0, 0.12);
468
+ text-align: left;
469
+ vertical-align: top;
470
+ }
471
+
472
+ .message-images {
473
+ display: flex;
474
+ flex-wrap: wrap;
475
+ gap: 8px;
476
+ margin-bottom: 10px;
477
+ }
478
+
479
+ .message-image {
480
+ max-width: 180px;
481
+ max-height: 180px;
482
+ border-radius: 16px;
483
+ object-fit: cover;
484
+ border: 1px solid rgba(255, 255, 255, 0.24);
485
+ }
486
+
487
+ .event-head {
488
+ display: flex;
489
+ align-items: center;
490
+ justify-content: space-between;
491
+ gap: 10px;
492
+ padding: 10px 14px;
493
+ border-bottom: 1px solid var(--border);
494
+ }
495
+
496
+ .event-title {
497
+ display: flex;
498
+ flex-wrap: wrap;
499
+ align-items: center;
500
+ gap: 8px;
501
+ font-weight: 900;
502
+ }
503
+
504
+ .event-toggle {
505
+ flex: 0 0 auto;
506
+ border: 1px solid var(--border);
507
+ border-radius: 999px;
508
+ background: var(--panel-strong);
509
+ color: var(--muted);
510
+ font-size: 0.76rem;
511
+ font-weight: 850;
512
+ padding: 5px 9px;
513
+ }
514
+
515
+ .event.latest .event-toggle {
516
+ display: none;
517
+ }
518
+
519
+ .event:not(.collapsed) .event-toggle::after {
520
+ content: "collapse";
521
+ }
522
+
523
+ .event.collapsed .event-toggle::after {
524
+ content: "expand";
525
+ }
526
+
527
+ .badge {
528
+ border-radius: 999px;
529
+ background: rgba(var(--glow-rgb), 0.11);
530
+ color: var(--text);
531
+ font-size: 0.72rem;
532
+ font-weight: 850;
533
+ padding: 4px 8px;
534
+ }
535
+
536
+ .tool-grid {
537
+ display: grid;
538
+ gap: 10px;
539
+ margin-top: 10px;
540
+ }
541
+
542
+ .tool-call {
543
+ border: 1px solid var(--border);
544
+ border-radius: 16px;
545
+ padding: 11px;
546
+ background: color-mix(in srgb, var(--hover), transparent 28%);
547
+ }
548
+
549
+ .tool-name {
550
+ margin-bottom: 8px;
551
+ font-weight: 900;
552
+ }
553
+
554
+ .error-text {
555
+ color: var(--danger);
556
+ }
557
+
558
+ .muted-text {
559
+ color: var(--muted);
560
+ }
561
+
562
+ .composer textarea {
563
+ border: 0;
564
+ outline: 0;
565
+ background: transparent;
566
+ color: var(--text);
567
+ }
568
+
569
+ .composer-wrap {
570
+ position: sticky;
571
+ bottom: 0;
572
+ z-index: 4;
573
+ display: grid;
574
+ gap: 8px;
575
+ }
576
+
577
+ .composer {
578
+ display: flex;
579
+ align-items: flex-end;
580
+ gap: 10px;
581
+ border-radius: 26px;
582
+ padding: 11px;
583
+ background: var(--panel-strong);
584
+ box-shadow: 0 14px 38px rgba(var(--glow-rgb), 0.15), 0 3px 10px rgba(0, 0, 0, 0.08);
585
+ }
586
+
587
+ .composer.dragover {
588
+ border-color: rgba(var(--glow-rgb), 0.44);
589
+ box-shadow: 0 0 0 5px rgba(var(--glow-rgb), 0.09), var(--shadow);
590
+ }
591
+
592
+ .composer textarea {
593
+ flex: 1;
594
+ max-height: 180px;
595
+ min-height: 30px;
596
+ resize: none;
597
+ line-height: 1.5;
598
+ padding: 7px 0;
599
+ }
600
+
601
+ .icon-button,
602
+ .send-button {
603
+ display: grid;
604
+ place-items: center;
605
+ flex: 0 0 auto;
606
+ height: 38px;
607
+ min-width: 38px;
608
+ }
609
+
610
+ .icon-button {
611
+ font-size: 1.35rem;
612
+ line-height: 1;
613
+ }
614
+
615
+ .send-button {
616
+ padding: 0 16px;
617
+ background: linear-gradient(135deg, var(--accent-start), var(--accent-end));
618
+ color: var(--accent-text);
619
+ }
620
+
621
+ .send-button.is-running {
622
+ display: flex;
623
+ align-items: center;
624
+ justify-content: center;
625
+ }
626
+
627
+ .send-button.is-running::before {
628
+ content: "";
629
+ width: 12px;
630
+ height: 12px;
631
+ margin-right: 8px;
632
+ border: 2px solid currentColor;
633
+ border-top-color: transparent;
634
+ border-radius: 50%;
635
+ animation: spin 0.82s linear infinite;
636
+ }
637
+
638
+ button:disabled {
639
+ cursor: not-allowed;
640
+ opacity: 0.58;
641
+ transform: none;
642
+ }
643
+
644
+ #imageInput {
645
+ display: none;
646
+ }
647
+
648
+ .image-preview {
649
+ display: flex;
650
+ flex-wrap: wrap;
651
+ gap: 8px;
652
+ padding: 0 8px;
653
+ }
654
+
655
+ .image-chip {
656
+ display: flex;
657
+ align-items: center;
658
+ gap: 8px;
659
+ max-width: 240px;
660
+ border: 1px solid var(--border);
661
+ border-radius: 999px;
662
+ padding: 5px 10px 5px 5px;
663
+ background: var(--panel);
664
+ color: var(--text);
665
+ }
666
+
667
+ .image-chip img {
668
+ width: 30px;
669
+ height: 30px;
670
+ border-radius: 50%;
671
+ object-fit: cover;
672
+ }
673
+
674
+ .image-chip span {
675
+ overflow: hidden;
676
+ text-overflow: ellipsis;
677
+ white-space: nowrap;
678
+ font-size: 0.82rem;
679
+ }
680
+
681
+ .composer-hint {
682
+ margin: 0;
683
+ color: var(--muted);
684
+ font-size: 0.78rem;
685
+ text-align: center;
686
+ }
687
+
688
/* Full-viewport overlay: centres its child and dims/blurs whatever is behind it. */
.modal {
  position: fixed;
  inset: 0;
  z-index: 30;
  display: grid;
  place-items: center;
  padding: 18px;
  background: rgba(0, 0, 0, 0.24);
  /* Older Safari releases only recognise the prefixed property. */
  -webkit-backdrop-filter: blur(14px);
  backdrop-filter: blur(14px);
}
698
+
699
+ .modal.hidden {
700
+ display: none;
701
+ }
702
+
703
+ .modal-card {
704
+ display: grid;
705
+ grid-template-rows: auto auto auto minmax(0, 1fr) auto;
706
+ gap: 12px;
707
+ width: min(780px, 100%);
708
+ max-height: min(760px, 82vh);
709
+ border: 1px solid var(--border);
710
+ border-radius: 28px;
711
+ background: var(--panel-strong);
712
+ box-shadow: 0 24px 88px rgba(0, 0, 0, 0.22);
713
+ padding: 18px;
714
+ }
715
+
716
+ .modal-head,
717
+ .modal-path-row,
718
+ .modal-actions {
719
+ display: flex;
720
+ align-items: center;
721
+ gap: 12px;
722
+ }
723
+
724
+ .modal-head {
725
+ justify-content: space-between;
726
+ }
727
+
728
+ .modal-head h2,
729
+ .modal-head p {
730
+ margin: 0;
731
+ }
732
+
733
+ .modal-head h2 {
734
+ font-size: 1.18rem;
735
+ letter-spacing: -0.025em;
736
+ }
737
+
738
+ .modal-head p,
739
+ .modal-actions span {
740
+ color: var(--muted);
741
+ font-size: 0.86rem;
742
+ }
743
+
744
+ .modal-path-row {
745
+ border: 1px solid var(--border);
746
+ border-radius: 18px;
747
+ background: var(--hover);
748
+ padding: 8px;
749
+ }
750
+
751
+ .modal-path-row input {
752
+ min-width: 0;
753
+ flex: 1;
754
+ border: 0;
755
+ outline: 0;
756
+ background: transparent;
757
+ color: var(--text);
758
+ }
759
+
760
+ .workspace-roots {
761
+ display: flex;
762
+ flex-wrap: wrap;
763
+ gap: 8px;
764
+ }
765
+
766
+ .root-chip {
767
+ max-width: 190px;
768
+ overflow: hidden;
769
+ border: 1px solid var(--border);
770
+ border-radius: 999px;
771
+ background: var(--panel);
772
+ color: var(--text);
773
+ font-weight: 800;
774
+ padding: 7px 11px;
775
+ text-overflow: ellipsis;
776
+ white-space: nowrap;
777
+ }
778
+
779
+ .workspace-list {
780
+ display: grid;
781
+ align-content: start;
782
+ gap: 7px;
783
+ min-height: 0;
784
+ overflow: auto;
785
+ padding-right: 4px;
786
+ }
787
+
788
+ .dir-row {
789
+ display: grid;
790
+ grid-template-columns: auto minmax(0, 1fr) auto;
791
+ align-items: center;
792
+ gap: 10px;
793
+ width: 100%;
794
+ border: 1px solid var(--border);
795
+ border-radius: 18px;
796
+ background: var(--panel);
797
+ color: var(--text);
798
+ padding: 10px 12px;
799
+ text-align: left;
800
+ }
801
+
802
+ .dir-row:hover,
803
+ .root-chip:hover {
804
+ border-color: rgba(var(--glow-rgb), 0.38);
805
+ background: var(--hover);
806
+ }
807
+
808
+ .dir-icon {
809
+ display: grid;
810
+ place-items: center;
811
+ width: 24px;
812
+ height: 24px;
813
+ border-radius: 50%;
814
+ background: rgba(var(--glow-rgb), 0.1);
815
+ font-weight: 900;
816
+ }
817
+
818
+ .dir-main {
819
+ min-width: 0;
820
+ }
821
+
822
+ .dir-main strong,
823
+ .dir-main small {
824
+ display: block;
825
+ overflow: hidden;
826
+ text-overflow: ellipsis;
827
+ white-space: nowrap;
828
+ }
829
+
830
+ .dir-main small {
831
+ margin-top: 2px;
832
+ color: var(--muted);
833
+ font-size: 0.78rem;
834
+ }
835
+
836
+ .dir-action {
837
+ color: var(--muted);
838
+ font-size: 0.76rem;
839
+ font-weight: 850;
840
+ }
841
+
842
+ .dir-empty {
843
+ border: 1px dashed var(--border);
844
+ border-radius: 18px;
845
+ padding: 18px;
846
+ color: var(--muted);
847
+ text-align: center;
848
+ }
849
+
850
+ .modal-actions {
851
+ justify-content: space-between;
852
+ }
853
+
854
+ #theme-switcher {
855
+ position: fixed;
856
+ right: 22px;
857
+ bottom: 22px;
858
+ z-index: 20;
859
+ display: flex;
860
+ gap: 9px;
861
+ padding: 9px;
862
+ border: 1px solid var(--border);
863
+ border-radius: 999px;
864
+ background: var(--bar);
865
+ box-shadow: 0 12px 34px rgba(0, 0, 0, 0.12);
866
+ }
867
+
868
+ .theme-dot {
869
+ width: 21px;
870
+ height: 21px;
871
+ border: 1.5px solid transparent;
872
+ border-radius: 50%;
873
+ padding: 0;
874
+ transition: transform 0.18s ease, box-shadow 0.18s ease;
875
+ }
876
+
877
+ .theme-dot[data-theme="white"] {
878
+ background: #ffffff;
879
+ border-color: #c8c8c8;
880
+ }
881
+
882
+ .theme-dot[data-theme="yellow"] {
883
+ background: #e8d5a0;
884
+ border-color: #c4a060;
885
+ }
886
+
887
+ .theme-dot[data-theme="blue"] {
888
+ background: #a8c4e8;
889
+ border-color: #5e8ec8;
890
+ }
891
+
892
+ .theme-dot[data-theme="dark"] {
893
+ background: #2a2a26;
894
+ border-color: #585852;
895
+ }
896
+
897
+ .theme-dot.active {
898
+ box-shadow: 0 0 0 2px var(--bg), 0 0 0 4px var(--border);
899
+ transform: scale(1.08);
900
+ }
901
+
902
+ @keyframes spin {
903
+ to {
904
+ transform: rotate(360deg);
905
+ }
906
+ }
907
+
908
+ @media (max-width: 720px) {
909
+ .chat-shell {
910
+ padding: 10px;
911
+ }
912
+
913
+ .topbar,
914
+ .workspace-strip {
915
+ grid-template-columns: 1fr;
916
+ }
917
+
918
+ .topbar {
919
+ align-items: flex-start;
920
+ }
921
+
922
+ .workspace-strip {
923
+ display: grid;
924
+ }
925
+
926
+ .workspace-strip span {
927
+ max-width: none;
928
+ }
929
+
930
+ .modal-card {
931
+ max-height: 88vh;
932
+ padding: 14px;
933
+ }
934
+
935
+ .modal-head,
936
+ .modal-actions {
937
+ align-items: stretch;
938
+ flex-direction: column;
939
+ }
940
+
941
+ .modal-path-row {
942
+ align-items: stretch;
943
+ flex-direction: column;
944
+ }
945
+
946
+ .message,
947
+ .event {
948
+ max-width: 96%;
949
+ }
950
+
951
+ #theme-switcher {
952
+ right: 12px;
953
+ bottom: 12px;
954
+ }
955
+ }
frontend/static/app.js ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Animated backdrop: a fixed, click-through canvas covering the window that is
// repainted with a slowly drifting grid of translucent tiles. The tile colour
// follows the `data-theme` attribute on <html>.
(function () {
  var TILE = 26;            // grid pitch in canvas pixels
  var GAP = 1;              // gutter left between neighbouring tiles
  var FRAME_MS = 1000 / 24; // minimum time between repaints (~24 fps)

  var canvas;
  var ctx;
  var t = 0;                // animation phase, advanced once per painted frame
  var rafId = null;         // pending requestAnimationFrame handle, or null while paused
  var lastDraw = 0;         // timestamp of the last painted frame
  var cachedRgb = "10,10,10";
  var resizeTimer;

  // Return the "r,g,b" triple used for tiles under the current theme.
  function rgb() {
    var theme = document.documentElement.getAttribute("data-theme") || "white";
    switch (theme) {
      case "dark":
        return "220,210,175";
      case "yellow":
        return "120,85,20";
      case "blue":
        return "38,88,155";
      default:
        return "10,10,10";
    }
  }

  // requestAnimationFrame callback: always re-arms itself, but only repaints
  // when at least FRAME_MS has elapsed since the previous paint.
  function frame(ts) {
    rafId = requestAnimationFrame(frame);
    if (ts - lastDraw < FRAME_MS) return;
    lastDraw = ts;

    var w = canvas.width;
    var h = canvas.height;
    var cols = Math.ceil(w / TILE) + 1;
    var rows = Math.ceil(h / TILE) + 1;
    var side = TILE - GAP;
    var pre = "rgba(" + cachedRgb + ",";

    ctx.clearRect(0, 0, w, h);
    for (var r = 0; r < rows; r++) {
      for (var c = 0; c < cols; c++) {
        var wave = 0.6 * Math.sin(c * 0.21 + t * 0.36) * Math.sin(r * 0.17 + t * 0.28)
          + 0.4 * Math.sin(c * 0.11 - r * 0.13 + t * 0.19);
        var norm = (wave + 1) * 0.5;
        var v = norm * norm * norm;
        // Alpha is rounded to two decimals; near-invisible tiles are not drawn.
        var a = Math.round((0.004 + v * 0.186) * 100) / 100;
        if (a >= 0.02) {
          ctx.fillStyle = pre + a + ")";
          ctx.fillRect(c * TILE + GAP, r * TILE + GAP, side, side);
        }
      }
    }
    t += 0.007;
  }

  // Debounced resize handler. The backing store is left alone when the width is
  // unchanged and the height moved by at most 90px.
  // NOTE(review): the 90px tolerance presumably absorbs mobile browser chrome
  // showing/hiding - confirm.
  function resize() {
    clearTimeout(resizeTimer);
    resizeTimer = setTimeout(function () {
      var newW = window.innerWidth;
      var newH = window.innerHeight;
      var sameWidth = newW === canvas.width;
      var smallHeightChange = Math.abs(newH - canvas.height) <= 90;
      if (sameWidth && smallHeightChange) return;
      canvas.width = newW;
      canvas.height = newH;
    }, 120);
  }

  // Stop the animation loop while the document is hidden; restart it when shown.
  function onVisibilityChange() {
    if (!document.hidden) {
      if (!rafId) rafId = requestAnimationFrame(frame);
      return;
    }
    if (rafId) {
      cancelAnimationFrame(rafId);
      rafId = null;
    }
  }

  document.addEventListener("DOMContentLoaded", function () {
    canvas = document.createElement("canvas");
    canvas.style.cssText = [
      "position:fixed;top:0;left:0;width:100%;height:100%;",
      "z-index:0;pointer-events:none;will-change:transform;",
      "-webkit-backface-visibility:hidden;backface-visibility:hidden;"
    ].join("");
    document.body.insertBefore(canvas, document.body.firstChild);
    ctx = canvas.getContext("2d");
    canvas.width = window.innerWidth;
    canvas.height = window.innerHeight;
    cachedRgb = rgb();
    window.addEventListener("resize", resize);
    document.addEventListener("visibilitychange", onVisibilityChange);
    rafId = requestAnimationFrame(frame);
  });

  // Refresh the cached tile colour whenever <html data-theme> changes.
  var themeWatcher = new MutationObserver(function () {
    cachedRgb = rgb();
  });
  themeWatcher.observe(document.documentElement, { attributes: true, attributeFilter: ["data-theme"] });
})();
84
+
85
// Theme switcher: applies the persisted colour theme as early as possible and,
// once the DOM is ready, appends a row of round buttons for choosing a theme.
(function () {
  var THEMES = ["white", "yellow", "blue", "dark"];
  var LABELS = { white: "Pure White", yellow: "Warm Yellow", blue: "Cool Blue", dark: "Dark" };
  var STORAGE_KEY = "rh-ui-theme";
  var DEFAULT_THEME = "white";

  // Map anything that is not a known theme name (null, stale or hand-edited
  // storage values) to the default so an unknown `data-theme` is never applied.
  function normalizeTheme(theme) {
    return THEMES.indexOf(theme) === -1 ? DEFAULT_THEME : theme;
  }

  // Apply `theme` to <html>, persist it, and refresh the active dot.
  // The default theme is expressed by the absence of `data-theme`.
  function applyTheme(theme) {
    theme = normalizeTheme(theme);
    if (theme === DEFAULT_THEME) {
      document.documentElement.removeAttribute("data-theme");
    } else {
      document.documentElement.setAttribute("data-theme", theme);
    }
    try {
      localStorage.setItem(STORAGE_KEY, theme);
    } catch (e) {
      // Storage may be unavailable; the theme still applies to this page.
    }
    document.querySelectorAll(".theme-dot").forEach(function (dot) {
      var isActive = dot.dataset.theme === theme;
      dot.classList.toggle("active", isActive);
      dot.setAttribute("aria-pressed", isActive ? "true" : "false");
    });
  }

  var saved = DEFAULT_THEME;
  try {
    saved = normalizeTheme(localStorage.getItem(STORAGE_KEY));
  } catch (e) {
    // Reading storage can throw; fall back to the default theme.
  }
  // Applied immediately (before the dots exist) so the page does not start in
  // the wrong theme; applied again below to mark the active dot.
  applyTheme(saved);

  document.addEventListener("DOMContentLoaded", function () {
    var switcher = document.createElement("div");
    switcher.id = "theme-switcher";
    // A role is needed for the label on a plain <div> to be exposed.
    switcher.setAttribute("role", "group");
    switcher.setAttribute("aria-label", "Choose colour theme");
    THEMES.forEach(function (theme) {
      var btn = document.createElement("button");
      btn.type = "button"; // never act as a submit button if placed inside a form
      btn.className = "theme-dot";
      btn.dataset.theme = theme;
      btn.title = LABELS[theme];
      btn.setAttribute("aria-label", LABELS[theme]);
      btn.addEventListener("click", function () { applyTheme(theme); });
      switcher.appendChild(btn);
    });
    document.body.appendChild(switcher);
    applyTheme(saved);
  });
})();
122
+
123
+ (function () {
124
// --- Mutable session state -------------------------------------------------
var ws,                                   // active WebSocket, assigned by connect()
  running = false,                        // a run is in progress
  interrupting = false,                   // an interrupt has been requested for the current run
  pendingAskId = "",                      // request id of an unanswered agent question, "" if none
  keepSubmittedMessageOnReset = false,    // keep the just-sent user message on the next conversation_reset
  autoFollowTimeline = true,              // keep the timeline pinned to the bottom as events arrive
  conversationStarted = false,            // the next prompt continues the current conversation
  images = [],                            // attachments queued for the next prompt
  currentWorkspacePath = "";

// Max height in px of a collapsed timeline event body.
var COLLAPSED_STEP_HEIGHT = 220;

// --- Composer and header elements -------------------------------------------
var workspaceInput = document.getElementById("workspaceInput"),
  workspaceStrip = document.getElementById("workspaceStrip"),
  promptInput = document.getElementById("promptInput"),
  runBtn = document.getElementById("runBtn"),
  newBtn = document.getElementById("newBtn"),
  pickWorkspaceBtn = document.getElementById("pickWorkspaceBtn"),
  attachBtn = document.getElementById("attachBtn"),
  imageInput = document.getElementById("imageInput"),
  imagePreview = document.getElementById("imagePreview"),
  dropZone = document.getElementById("dropZone"),
  timeline = document.getElementById("timeline"),
  statusPill = document.getElementById("statusPill"),
  workspaceMeta = document.getElementById("workspaceMeta");

// --- Workspace picker modal elements ----------------------------------------
var workspaceModal = document.getElementById("workspaceModal"),
  workspaceCloseBtn = document.getElementById("workspaceCloseBtn"),
  workspacePathInput = document.getElementById("workspacePathInput"),
  workspaceGoBtn = document.getElementById("workspaceGoBtn"),
  workspaceRoots = document.getElementById("workspaceRoots"),
  workspaceList = document.getElementById("workspaceList"),
  workspaceUseBtn = document.getElementById("workspaceUseBtn"),
  workspacePickerHint = document.getElementById("workspacePickerHint");

// Placeholder captured from the markup so it can be restored later.
var defaultPromptPlaceholder = promptInput.getAttribute("placeholder") || "Message ResearchHarness";
157
+
158
/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * `null` and `undefined` become the empty string; every other value is
 * stringified first, so falsy-but-meaningful values such as `0` and `false`
 * are preserved instead of being dropped.
 *
 * @param {*} value Value to escape.
 * @returns {string} The value with &, <, >, " and ' replaced by entities.
 */
function escapeHtml(value) {
  var text = value === null || value === undefined ? "" : String(value);
  // "&" must be replaced first so the entities added below are not re-escaped.
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#039;");
}
166
+
167
/**
 * Render markdown to sanitized HTML wrapped in a `.markdown-body` div.
 *
 * Relies on `window.marked` and `window.DOMPurify`. When either is missing,
 * or rendering throws, the text is returned escaped inside a <pre> instead.
 *
 * @param {*} text Markdown source.
 * @returns {string} HTML string.
 */
function renderMarkdown(text) {
  function asPlainText() {
    return "<pre>" + escapeHtml(text) + "</pre>";
  }

  var marked = window.marked;
  var purify = window.DOMPurify;
  if (!(marked && purify)) {
    console.warn("Markdown renderer unavailable; falling back to plain text.");
    return asPlainText();
  }

  try {
    var source = String(text || "");
    var rawHtml = marked.parse(source, { gfm: true, breaks: false, async: false });
    // The parser output is never trusted directly; it is sanitized before use.
    var safeHtml = purify.sanitize(rawHtml, { USE_PROFILES: { html: true } });
    return '<div class="markdown-body">' + safeHtml + "</div>";
  } catch (e) {
    console.warn("Markdown rendering failed; falling back to plain text.", e);
    return asPlainText();
  }
}
181
+
182
/**
 * Update the status pill's label and style variant.
 *
 * @param {string} text Label to display.
 * @param {string} [kind] Variant appended to the "status" class; "idle" when omitted.
 */
function setStatus(text, kind) {
  var variant = kind ? kind : "idle";
  statusPill.className = "status " + variant;
  statusPill.textContent = text;
}
186
+
187
/**
 * Record `path` as the chosen workspace: store it in the workspace input and
 * show it in the workspace meta line.
 *
 * @param {string} path Workspace path.
 */
function setWorkspaceSelected(path) {
  var summary = "Workspace selected: " + path;
  workspaceInput.value = path;
  workspaceMeta.textContent = summary;
}
191
+
192
/**
 * Sync the run button and prompt placeholder with the current state.
 *
 * - an agent question is pending: enabled "Reply" button, no spinner class
 * - running: "Stop", or a disabled "Stopping" once an interrupt was requested
 * - otherwise: "Run"
 */
function updateComposerMode() {
  var awaitingReply = Boolean(pendingAskId);

  var label;
  if (awaitingReply) {
    label = "Reply";
  } else if (!running) {
    label = "Run";
  } else {
    label = interrupting ? "Stopping" : "Stop";
  }

  // A pending question always leaves the button usable and spinner-free.
  runBtn.disabled = !awaitingReply && running && interrupting;
  runBtn.classList.toggle("is-running", !awaitingReply && running);
  runBtn.textContent = label;
  promptInput.placeholder = defaultPromptPlaceholder;
}
205
+
206
/**
 * Switch the UI between running and idle.
 *
 * Leaving the running state also clears the interrupt-in-progress flag.
 *
 * @param {boolean} active Whether a run is in progress.
 * @param {string} [statusText] Status label; defaults to "Running" / "Idle".
 */
function setRunning(active, statusText) {
  running = active;
  if (!running) {
    interrupting = false;
  }
  updateComposerMode();
  if (active) {
    setStatus(statusText || "Running", "running");
  } else {
    setStatus(statusText || "Idle", "idle");
  }
}
212
+
213
/**
 * Replace the timeline content with the welcome card and re-enable
 * auto-follow scrolling.
 */
function clearTimeline() {
  var welcomeCard = [
    '<div class="welcome">',
    '<h1>What should the agent do?</h1>',
    '<p>Ask a question, attach images, choose a local workspace, and watch tool calls stream here.</p>',
    '</div>'
  ];
  autoFollowTimeline = true;
  timeline.innerHTML = welcomeCard.join("");
}
221
+
222
/** Remove the welcome card from the timeline, if it is still there. */
function ensureTimelineReady() {
  var placeholder = timeline.querySelector(".welcome");
  if (!placeholder) return;
  placeholder.remove();
}
226
+
227
/**
 * Report whether the timeline is scrolled to within 80px of its bottom edge.
 *
 * @returns {boolean}
 */
function isNearBottom() {
  var distanceFromBottom = timeline.scrollHeight - timeline.scrollTop - timeline.clientHeight;
  return distanceFromBottom < 80;
}
230
+
231
/**
 * Scroll the timeline to its bottom, unless the user has scrolled away.
 *
 * The scroll is applied on two consecutive animation frames, and the follow
 * flag is recomputed from the resulting position after the second one.
 *
 * @param {boolean} [force] Scroll even when auto-follow is currently off.
 */
function scrollTimeline(force) {
  if (!(force || autoFollowTimeline)) return;

  function pinToBottom() {
    timeline.scrollTop = timeline.scrollHeight;
  }

  requestAnimationFrame(function () {
    pinToBottom();
    requestAnimationFrame(function () {
      pinToBottom();
      autoFollowTimeline = isNearBottom();
    });
  });
}
241
+
242
/** Turn auto-follow on when the timeline is near its bottom, off otherwise. */
function syncTimelineFollowMode() {
  var atBottom = isNearBottom();
  autoFollowTimeline = atBottom;
}
245
+
246
/**
 * Mirror an event's collapsed state onto its toggle button's `aria-expanded`.
 * Does nothing when the event has no `.event-toggle` element.
 *
 * @param {Element} node Timeline event element.
 */
function updateEventToggle(node) {
  var button = node.querySelector(".event-toggle");
  if (button) {
    var isExpanded = !node.classList.contains("collapsed");
    button.setAttribute("aria-expanded", isExpanded ? "true" : "false");
  }
}
251
+
252
/**
 * Return the `.event-body` element inside a timeline event, or null.
 *
 * @param {Element} node Timeline event element.
 * @returns {Element|null}
 */
function eventBody(node) {
  var content = node.querySelector(".event-body");
  return content;
}
255
+
256
/**
 * Report whether a timeline event is currently flagged as collapsible
 * (carries the `can-collapse` class).
 *
 * @param {Element} node Timeline event element.
 * @returns {boolean}
 */
function eventCanCollapse(node) {
  var classes = node.classList;
  return classes.contains("can-collapse");
}
259
+
260
/**
 * Re-evaluate whether an event is tall enough to be collapsible.
 *
 * An event qualifies when its body content is more than 8px taller than the
 * collapsed height. Events that do not qualify are forced open and have their
 * toggle hidden.
 *
 * @param {Element} node Timeline event element.
 */
function refreshEventCollapseCapability(node) {
  var content = eventBody(node);
  if (!content) return;

  var button = node.querySelector(".event-toggle");
  var overflowing = content.scrollHeight > COLLAPSED_STEP_HEIGHT + 8;

  if (overflowing) {
    node.classList.add("can-collapse");
  } else {
    node.classList.remove("can-collapse");
  }
  if (button) {
    button.hidden = !overflowing;
  }
  if (!overflowing) {
    // Short content: make sure nothing stays clipped.
    node.classList.remove("collapsed");
    content.style.maxHeight = "none";
  }
  updateEventToggle(node);
}
273
+
274
/**
 * Expand or collapse a timeline event by driving its body's inline max-height.
 *
 * Events without a body only get the `collapsed` class toggled. Events that
 * are not collapsible (see refreshEventCollapseCapability) are left as is.
 *
 * @param {Element} node Timeline event element.
 * @param {boolean} expanded Target state.
 * @param {boolean} animate When expanding, keep a pixel max-height until the
 *   max-height transition ends instead of removing the cap right away.
 */
function setEventExpanded(node, expanded, animate) {
  var content = eventBody(node);
  if (!content) {
    node.classList.toggle("collapsed", !expanded);
    updateEventToggle(node);
    return;
  }

  refreshEventCollapseCapability(node);
  if (!eventCanCollapse(node)) return;

  if (!expanded) {
    // Collapsing: start from a concrete pixel height so there is a numeric
    // value to move away from.
    var current = content.style.maxHeight;
    if (!current || current === "none") {
      content.style.maxHeight = content.scrollHeight + "px";
    }
    // Reading offsetHeight forces layout so the starting height is committed
    // before the target height is set.
    content.offsetHeight;
    node.classList.add("collapsed");
    content.style.maxHeight = COLLAPSED_STEP_HEIGHT + "px";
    updateEventToggle(node);
    return;
  }

  node.classList.remove("collapsed");
  content.style.maxHeight = content.scrollHeight + "px";
  if (animate) {
    // Drop the pixel cap after the transition, unless the event was collapsed
    // again in the meantime.
    content.addEventListener("transitionend", function onEnd(event) {
      if (event.propertyName !== "max-height") return;
      content.removeEventListener("transitionend", onEnd);
      if (!node.classList.contains("collapsed")) {
        content.style.maxHeight = "none";
      }
    });
  } else {
    content.style.maxHeight = "none";
  }
  updateEventToggle(node);
}
308
+
309
/**
 * Flip an event between collapsed and expanded, with animation.
 * The latest event and events that cannot collapse are ignored.
 *
 * @param {Element} node Timeline event element.
 */
function toggleEvent(node) {
  var isLatest = node.classList.contains("latest");
  if (isLatest) return;
  if (!eventCanCollapse(node)) return;

  var isCollapsed = node.classList.contains("collapsed");
  setEventExpanded(node, isCollapsed, true);
}
313
+
314
/**
 * Append an event card to the timeline and make it the "latest" one.
 *
 * Any previously latest event loses that status and is collapsed. The new
 * event starts expanded. The timeline scrolls to the bottom only if it was
 * following (or near the bottom) before the card was added.
 *
 * @param {string} kind Suffix for the `event-<kind>` class.
 * @param {string} title Card title (escaped here).
 * @param {string} bodyHtml Card body as HTML; inserted as-is, so callers must
 *   escape any untrusted text themselves.
 * @param {string[]} [badges] Badge labels shown after the title (escaped here).
 */
function addEvent(kind, title, bodyHtml, badges) {
  var shouldFollow = autoFollowTimeline || isNearBottom();
  ensureTimelineReady();

  var previousLatest = timeline.querySelectorAll(".event.latest");
  previousLatest.forEach(function (eventNode) {
    eventNode.classList.remove("latest");
    setEventExpanded(eventNode, false, true);
    updateEventToggle(eventNode);
  });

  var badgeHtml = "";
  (badges || []).forEach(function (badge) {
    badgeHtml += '<span class="badge">' + escapeHtml(badge) + "</span>";
  });

  var node = document.createElement("article");
  node.className = "event event-" + kind + " latest";
  node.innerHTML = [
    '<div class="event-head">',
    '<div class="event-title">' + escapeHtml(title) + badgeHtml + '</div>',
    '<button class="event-toggle" type="button" aria-label="Toggle step details"></button>',
    '</div>',
    '<div class="event-body"><div class="event-body-inner">' + bodyHtml + '</div></div>'
  ].join("");

  var toggleButton = node.querySelector(".event-toggle");
  toggleButton.addEventListener("click", function (event) {
    // Keep the click from also reaching the card-level handler below.
    event.stopPropagation();
    toggleEvent(node);
  });
  node.addEventListener("click", function () {
    toggleEvent(node);
  });

  timeline.appendChild(node);
  setEventExpanded(node, true, false);
  scrollTimeline(shouldFollow);
}
344
+
345
/**
 * Append a chat message bubble to the timeline and scroll to it.
 *
 * Also turns auto-follow back on.
 *
 * @param {string} kind Class added next to "message" (callers here pass "user").
 * @param {string} text Message text; shown escaped inside a <pre>.
 * @param {Array<{data_url: string}>} [attachedImages] Images rendered above the text.
 */
function addMessage(kind, text, attachedImages) {
  autoFollowTimeline = true;
  ensureTimelineReady();

  var imageHtml = "";
  (attachedImages || []).forEach(function (image) {
    // The URL is interpolated into a quoted attribute, so it is escaped like
    // every other dynamic value; a stray quote must not end the attribute.
    imageHtml += '<img class="message-image" alt="" src="' + escapeHtml(image.data_url) + '">';
  });

  var node = document.createElement("article");
  node.className = "message " + kind;
  node.innerHTML = '<div class="message-body">'
    + (imageHtml ? '<div class="message-images">' + imageHtml + '</div>' : '')
    + '<pre>' + escapeHtml(text) + '</pre>'
    + '</div>';
  timeline.appendChild(node);
  scrollTimeline(true);
}
361
+
362
/**
 * Pretty-print a value as JSON with two-space indentation.
 *
 * Falls back to `String(value)` when serialization throws (for example on a
 * circular structure).
 *
 * @param {*} value Value to format.
 * @returns {string|undefined} Whatever `JSON.stringify` returns, or the
 *   `String(value)` fallback.
 */
function formatJson(value) {
  var rendered;
  try {
    rendered = JSON.stringify(value, null, 2);
  } catch (e) {
    rendered = String(value);
  }
  return rendered;
}
369
+
370
/**
 * Turn one trace row into a timeline event.
 *
 * Rows that are skipped: missing rows, `llm_call` / `compaction` captures,
 * system rows, and the user row of turn 0. Assistant rows show their text plus
 * one card per tool call; tool, runtime and later user rows show their text.
 * A row's `error`, when set, is appended in error styling (not for user rows).
 *
 * @param {Object} row Trace row; reads capture_type, role, turn_index, text,
 *   tool_names, tool_arguments, termination and error.
 */
function renderTrace(row) {
  if (!row) return;
  if (row.capture_type === "llm_call" || row.capture_type === "compaction") return;

  var role = row.role || "";
  var turn = row.turn_index || 0;
  var text = row.text || "";
  var roundLabel = "round " + turn;

  function pre(content) {
    return "<pre>" + escapeHtml(content) + "</pre>";
  }

  function errorSuffix() {
    return row.error ? '<pre class="error-text">' + escapeHtml(row.error) + "</pre>" : "";
  }

  switch (role) {
    case "system":
      return;

    case "assistant": {
      var tools = Array.isArray(row.tool_names) ? row.tool_names : [];
      var args = Array.isArray(row.tool_arguments) ? row.tool_arguments : [];
      var html = "";

      if (text.trim()) {
        // Only a final answer (no tool calls, terminated with "result") is
        // rendered as markdown; other assistant text stays preformatted.
        var isFinalAnswer = tools.length === 0 && row.termination === "result";
        html += isFinalAnswer ? renderMarkdown(text) : pre(text);
      }
      if (tools.length) {
        var cards = "";
        tools.forEach(function (name, idx) {
          cards += '<div class="tool-call"><div class="tool-name">' + escapeHtml(name)
            + '</div><pre>' + escapeHtml(formatJson(args[idx] || {})) + '</pre></div>';
        });
        html += '<div class="tool-grid">' + cards + "</div>";
      }
      if (!html) {
        html = '<pre>(empty assistant output)</pre>';
      }
      html += errorSuffix();
      addEvent("assistant", "Assistant", html, [roundLabel]);
      return;
    }

    case "tool": {
      var hasName = Array.isArray(row.tool_names) && row.tool_names.length;
      var toolName = hasName ? row.tool_names[0] : "Tool";
      addEvent("tool", toolName + " result", pre(text) + errorSuffix(), [roundLabel]);
      return;
    }

    case "runtime": {
      var hasContent = text.trim() || row.error || row.termination;
      if (!hasContent) return;
      var runtimeBody = pre(text || row.termination || "") + errorSuffix();
      addEvent("runtime", "Runtime", runtimeBody, turn ? [roundLabel] : []);
      return;
    }

    case "user":
      if (turn === 0) return;
      addEvent("runtime", "Runtime message", pre(text), [roundLabel]);
      return;

    default:
      return;
  }
}
421
+
422
/**
 * Open the WebSocket to `/ws` on the current host and wire up its handlers.
 *
 * The scheme follows the page: `wss:` on https pages, `ws:` otherwise.
 * Incoming frames are JSON objects dispatched on their `type` field; frames
 * that are not valid JSON objects are logged and ignored rather than allowed
 * to throw out of the handler.
 */
function connect() {
  var protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
  ws = new WebSocket(protocol + "//" + window.location.host + "/ws");

  ws.onopen = function () {
    setStatus("Connected", "idle");
  };

  ws.onclose = function () {
    clearAskRequest();
    setRunning(false, "Disconnected");
    // setRunning leaves the pill in the "idle" style; restyle it as an error.
    setStatus("Disconnected", "error");
  };

  ws.onmessage = function (event) {
    var message;
    try {
      message = JSON.parse(event.data);
    } catch (e) {
      console.warn("Ignoring malformed WebSocket message.", e);
      return;
    }
    // JSON.parse can also yield null or a primitive, which carries no `type`.
    if (!message || typeof message !== "object") return;

    if (message.type === "ready") {
      setStatus("Connected", "idle");
    } else if (message.type === "conversation_reset") {
      if (keepSubmittedMessageOnReset) {
        // The reset answers a prompt that was just submitted: keep its bubble.
        keepSubmittedMessageOnReset = false;
        ensureTimelineReady();
      } else {
        clearTimeline();
      }
      conversationStarted = false;
      clearAskRequest();
    } else if (message.type === "uploaded_images") {
      addEvent("runtime", "Uploaded images saved", "<pre>" + escapeHtml((message.paths || []).join("\n")) + "</pre>", []);
    } else if (message.type === "run_started") {
      setRunning(true, "Running");
    } else if (message.type === "interrupt_requested") {
      interrupting = true;
      updateComposerMode();
      setStatus("Interrupting", "running");
    } else if (message.type === "trace") {
      renderTrace(message.row);
    } else if (message.type === "ask_user") {
      showAskRequest(message);
    } else if (message.type === "run_finished") {
      conversationStarted = true;
      setRunning(false, "Done");
      clearAskRequest();
      setStatus("Done", "done");
    } else if (message.type === "run_error") {
      keepSubmittedMessageOnReset = false;
      clearAskRequest();
      setRunning(false, "Error");
      setStatus("Error", "error");
      addEvent("runtime", "Error", '<pre class="error-text">' + escapeHtml(message.error || "unknown error") + "</pre>", []);
    }
  };
}
472
+
473
/**
 * Show a question from the agent and switch the composer into reply mode.
 *
 * @param {Object} message `ask_user` message; reads request_id, question
 *   (defaults to "Question") and the optional context.
 */
function showAskRequest(message) {
  pendingAskId = message.request_id || "";

  var parts = ["<pre>" + escapeHtml(message.question || "Question") + "</pre>"];
  if (message.context) {
    parts.push('<pre class="muted-text">' + escapeHtml(message.context) + "</pre>");
  }

  addEvent("runtime", "Agent question", parts.join(""), ["AskUser"]);
  setStatus("Waiting for input", "running");
  updateComposerMode();
  promptInput.focus();
}
484
+
485
/** Forget any pending agent question and restore the normal composer mode. */
function clearAskRequest() {
  var noPendingQuestion = "";
  pendingAskId = noPendingQuestion;
  updateComposerMode();
}
489
+
490
/**
 * Handler for the composer's main action. Depending on state it:
 *  - answers a pending agent question,
 *  - reports a disconnected socket,
 *  - requests an interrupt of the current run, or
 *  - submits the prompt (plus queued images) as a new "start" message.
 *
 * A prompt that does not continue an existing conversation first clears the
 * timeline. After sending, the prompt field and image queue are reset.
 */
function sendStart() {
  if (pendingAskId) {
    sendAskUserAnswer();
    return;
  }

  var socketOpen = ws && ws.readyState === WebSocket.OPEN;
  if (!socketOpen) {
    setStatus("Disconnected", "error");
    return;
  }

  if (running) {
    sendInterrupt();
    return;
  }

  var prompt = promptInput.value.trim();
  if (!prompt) return;

  var sentImages = images.slice();
  var continueConversation = conversationStarted;
  if (!continueConversation) {
    clearTimeline();
  }
  addMessage("user", prompt, sentImages);
  // For a fresh conversation, the bubble just added must survive the
  // conversation_reset message handled in connect().
  keepSubmittedMessageOnReset = !continueConversation;
  setRunning(true, "Starting");

  var payload = {
    type: "start",
    prompt: prompt,
    workspace_root: workspaceInput.value,
    images: sentImages,
    continue_conversation: continueConversation
  };
  ws.send(JSON.stringify(payload));

  promptInput.value = "";
  promptInput.style.height = "auto";
  images = [];
  renderImages();
}
523
+
524
/**
 * Ask the server to interrupt the current run.
 *
 * No-op when nothing is running, an interrupt is already in flight, or the
 * socket is not open.
 */
function sendInterrupt() {
  if (!running) return;
  if (interrupting) return;
  if (!ws || ws.readyState !== WebSocket.OPEN) return;

  interrupting = true;
  updateComposerMode();
  setStatus("Interrupting", "running");
  ws.send(JSON.stringify({ type: "interrupt" }));
}
531
+
532
/**
 * Send the composer text as the answer to the pending agent question.
 *
 * No-op when there is no pending question, the socket is not open, or the
 * trimmed answer is empty. On success the pending question is cleared and the
 * composer is reset.
 */
function sendAskUserAnswer() {
  if (!pendingAskId) return;
  if (!ws || ws.readyState !== WebSocket.OPEN) return;

  var answer = promptInput.value.trim();
  if (!answer) return;

  var reply = { type: "ask_user_answer", request_id: pendingAskId, answer: answer };
  addMessage("user", answer, []);
  ws.send(JSON.stringify(reply));

  pendingAskId = "";
  promptInput.value = "";
  promptInput.style.height = "auto";
  updateComposerMode();
  setStatus("Running", "running");
}
545
+
546
// Read each image file in `fileList` as a data URL and append it to the
// pending attachments. Entries whose MIME type is not image/* are ignored.
function addImageFiles(fileList) {
  var candidates = Array.from(fileList || []);
  candidates
    .filter(function (file) {
      return Boolean(file.type) && file.type.startsWith("image/");
    })
    .forEach(function (file) {
      var reader = new FileReader();
      // Reads finish asynchronously, so attachments are appended in
      // completion order.
      reader.onload = function () {
        images.push({ name: file.name, data_url: String(reader.result || "") });
        renderImages();
      };
      reader.readAsDataURL(file);
    });
}
557
+
558
// Rebuild the attachment preview strip from `images`. Each chip is a button
// that removes its image when clicked.
function renderImages() {
  imagePreview.innerHTML = "";
  images.forEach(function (image, idx) {
    var chip = document.createElement("button");
    chip.type = "button";
    chip.className = "image-chip";
    chip.title = "Remove image";

    // Build the thumbnail and label with DOM APIs rather than an HTML string,
    // so neither the data URL nor the file name is ever parsed as markup.
    var thumb = document.createElement("img");
    thumb.alt = "";
    thumb.src = image.data_url;
    var label = document.createElement("span");
    label.textContent = image.name || "image";
    chip.appendChild(thumb);
    chip.appendChild(label);

    chip.addEventListener("click", function () {
      // `idx` stays valid because the strip is fully re-rendered after every change.
      images.splice(idx, 1);
      renderImages();
    });
    imagePreview.appendChild(chip);
  });
}
573
+
574
// Show the workspace picker and list the folder currently held in the
// workspace input.
function openWorkspaceModal() {
  var startPath = workspaceInput.value.trim();
  workspaceModal.classList.remove("hidden");
  loadWorkspaceDirectory(startPath);
}
578
+
579
// Hide the workspace picker dialog.
function closeWorkspaceModal() {
  workspaceModal.classList.add("hidden");
}
582
+
583
// Replace the folder list with a placeholder and mirror the same text in the
// picker hint. Falls back to "Loading..." when `text` is empty.
function setWorkspacePickerBusy(text) {
  var label = text || "Loading...";
  workspaceList.innerHTML = '<div class="dir-empty">' + escapeHtml(label) + "</div>";
  workspacePickerHint.textContent = label;
}
587
+
588
// Show `message` (HTML-escaped) in place of the folder list and reset the
// hint to tell the user how to recover.
function renderWorkspaceError(message) {
  var safeMessage = escapeHtml(message);
  workspaceList.innerHTML = '<div class="dir-empty error-text">' + safeMessage + "</div>";
  workspacePickerHint.textContent = "Paste a valid existing folder path, then press Go.";
}
592
+
593
// Create one clickable row for the workspace picker list.
//   label       - main text of the row
//   path        - secondary text shown under the label
//   actionLabel - text of the action badge; "Open" when empty
//   onClick     - click handler for the row
// Returns the <button> element; the caller appends it to the list.
function directoryRow(label, path, actionLabel, onClick) {
  var row = document.createElement("button");
  row.type = "button";
  row.className = "dir-row";

  // All dynamic text is escaped before being placed in the markup.
  var markup = [
    '<span class="dir-icon">&rsaquo;</span>',
    '<span class="dir-main"><strong>' + escapeHtml(label) + '</strong><small>' + escapeHtml(path) + '</small></span>',
    '<span class="dir-action">' + escapeHtml(actionLabel || "Open") + '</span>'
  ];
  row.innerHTML = markup.join("");

  row.addEventListener("click", onClick);
  return row;
}
604
+
605
// Render a directory-listing payload into the workspace picker: the current
// path, the root shortcut chips, the parent row, the child folder rows, and
// the hint line. Fields read from `payload`: path, roots, parent, entries,
// truncated.
function renderWorkspacePicker(payload) {
  var roots = payload.roots || [];
  var entries = payload.entries || [];

  // Returns a click handler that navigates the picker to `target`.
  var navigateTo = function (target) {
    return function () {
      loadWorkspaceDirectory(target);
    };
  };

  currentWorkspacePath = payload.path || "";
  workspacePathInput.value = currentWorkspacePath;

  // Root shortcut chips.
  workspaceRoots.innerHTML = "";
  roots.forEach(function (root) {
    var chip = document.createElement("button");
    chip.type = "button";
    chip.className = "root-chip";
    chip.textContent = root.label || root.path;
    chip.title = root.path || "";
    chip.addEventListener("click", navigateTo(root.path || ""));
    workspaceRoots.appendChild(chip);
  });

  // Parent row first, then one row per child folder.
  workspaceList.innerHTML = "";
  if (payload.parent) {
    workspaceList.appendChild(
      directoryRow("..", payload.parent, "Parent", navigateTo(payload.parent))
    );
  }
  entries.forEach(function (entry) {
    workspaceList.appendChild(
      directoryRow(entry.name, entry.path, "Open", navigateTo(entry.path))
    );
  });

  // The empty placeholder is shown only when there is nothing to navigate to at all.
  if (!payload.parent && entries.length === 0) {
    workspaceList.innerHTML = '<div class="dir-empty">No readable child folders.</div>';
  }

  if (payload.truncated) {
    workspacePickerHint.textContent = "Directory list was truncated. Paste a deeper path if needed.";
  } else {
    workspacePickerHint.textContent = "Current folder will be used when you click Use this folder.";
  }
}
639
+
640
// Fetch the listing for `path` from /api/workspace-directories and render it
// in the picker. When `path` is empty the query parameter is omitted. Any
// failure is rendered inside the picker and never thrown to the caller.
async function loadWorkspaceDirectory(path) {
  setWorkspacePickerBusy("Loading folders...");
  try {
    var url = "/api/workspace-directories";
    if (path) url += "?path=" + encodeURIComponent(path);
    var response = await fetch(url);

    // An error response may carry a non-JSON body (for example an HTML error
    // page). Treat an unparseable or null body as "no payload" so the user
    // gets the intended message instead of a raw parser or TypeError string.
    var payload = null;
    try {
      payload = await response.json();
    } catch (parseError) {
      payload = null;
    }

    if (!response.ok || !payload || payload.error) {
      renderWorkspaceError((payload && payload.error) || "Cannot open this folder.");
      return;
    }
    renderWorkspacePicker(payload);
  } catch (error) {
    // Network-level failures from fetch() and rendering errors end up here.
    renderWorkspaceError(String(error));
  }
}
656
+
657
// --- Composer and timeline wiring ---

runBtn.addEventListener("click", sendStart);

// Stop auto-following the timeline once the user scrolls up or drags it.
timeline.addEventListener("scroll", syncTimelineFollowMode);
timeline.addEventListener("wheel", function (event) {
  if (event.deltaY < 0) autoFollowTimeline = false;
}, { passive: true });
timeline.addEventListener("touchmove", function () {
  autoFollowTimeline = false;
}, { passive: true });

// Enter sends; Shift+Enter and Ctrl+Enter insert a newline.
promptInput.addEventListener("keydown", function (event) {
  // Ignore Enter while an IME composition is in progress.
  if (event.isComposing || event.key !== "Enter") return;
  // Shift+Enter keeps the native textarea newline; Meta+Enter is left untouched.
  if (event.shiftKey || event.metaKey) return;

  event.preventDefault();
  if (event.ctrlKey) {
    // Browsers do not consistently insert a newline for Ctrl+Enter in a
    // textarea, so insert it explicitly at the caret (replacing any selection)
    // and fire `input` so the auto-grow handler below runs.
    promptInput.setRangeText("\n", promptInput.selectionStart, promptInput.selectionEnd, "end");
    promptInput.dispatchEvent(new Event("input", { bubbles: true }));
    return;
  }
  sendStart();
});

// Auto-grow the prompt box up to 180px.
promptInput.addEventListener("input", function () {
  promptInput.style.height = "auto";
  promptInput.style.height = Math.min(promptInput.scrollHeight, 180) + "px";
});

// "New chat": always notify the server; reset the local UI immediately only
// when no run is active.
newBtn.addEventListener("click", function () {
  if (ws && ws.readyState === WebSocket.OPEN) ws.send(JSON.stringify({ type: "new" }));
  if (!running) {
    promptInput.value = "";
    images = [];
    renderImages();
    clearTimeline();
    clearAskRequest();
    conversationStarted = false;
    setRunning(false, "Idle");
  }
});

// The "+" button forwards the click to the file input.
attachBtn.addEventListener("click", function () {
  imageInput.click();
});
imageInput.addEventListener("change", function (event) { addImageFiles(event.target.files); });
692
+
693
// --- Workspace picker wiring ---

// Navigate the picker to whatever path is currently typed in its input.
var goToTypedWorkspacePath = function () {
  loadWorkspaceDirectory(workspacePathInput.value.trim());
};

pickWorkspaceBtn.addEventListener("click", function () {
  openWorkspaceModal();
});

workspaceCloseBtn.addEventListener("click", closeWorkspaceModal);

// A click on the backdrop (the modal element itself, not its card) closes the dialog.
workspaceModal.addEventListener("click", function (event) {
  if (event.target !== workspaceModal) return;
  closeWorkspaceModal();
});

workspaceGoBtn.addEventListener("click", function () {
  goToTypedWorkspacePath();
});
workspacePathInput.addEventListener("keydown", function (event) {
  if (event.key !== "Enter") return;
  event.preventDefault();
  goToTypedWorkspacePath();
});

// Commit the folder currently shown in the picker as the workspace.
workspaceUseBtn.addEventListener("click", function () {
  if (!currentWorkspacePath) return;
  setWorkspaceSelected(currentWorkspacePath);
  closeWorkspaceModal();
});
715
+
716
// --- Image drag/drop and paste wiring ---

// Build a handler that cancels the browser default and switches the
// "dragover" highlight on the composer on or off.
var setDragHighlight = function (active) {
  return function (event) {
    event.preventDefault();
    if (active) {
      dropZone.classList.add("dragover");
    } else {
      dropZone.classList.remove("dragover");
    }
  };
};

["dragenter", "dragover"].forEach(function (name) {
  dropZone.addEventListener(name, setDragHighlight(true));
});
["dragleave", "drop"].forEach(function (name) {
  dropZone.addEventListener(name, setDragHighlight(false));
});

// Registered after the highlight handler, so on drop the highlight is cleared first.
dropZone.addEventListener("drop", function (event) {
  addImageFiles(event.dataTransfer.files);
});

// Attach any files found on the clipboard; addImageFiles keeps only images.
document.addEventListener("paste", function (event) {
  var clipboardItems = event.clipboardData ? Array.from(event.clipboardData.items) : [];
  var pastedFiles = clipboardItems
    .filter(function (item) { return item.kind === "file"; })
    .map(function (item) { return item.getAsFile(); })
    .filter(Boolean);
  if (pastedFiles.length) addImageFiles(pastedFiles);
});

// Open the connection once all handlers are in place.
connect();
743
+ })();
frontend/static/favicon.svg ADDED
frontend/static/index.html ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>ResearchHarness Chat</title>
7
+ <link rel="icon" type="image/svg+xml" href="/static/favicon.svg?v=rocket-1" />
8
+ <link rel="stylesheet" href="/static/app.css" />
9
+ </head>
10
+ <body>
11
+ <main class="chat-shell">
12
+ <header class="topbar">
13
+ <div class="brand">
14
+ <div class="logo">RH</div>
15
+ <div>
16
+ <strong>ResearchHarness</strong>
17
+ <span id="statusPill" class="status idle">Idle</span>
18
+ </div>
19
+ </div>
20
+ <div class="top-actions">
21
+ <button id="pickWorkspaceBtn" class="plain" type="button" hidden>Open workspace</button>
22
+ <button id="newBtn" class="plain" type="button">New chat</button>
23
+ </div>
24
+ </header>
25
+
26
+ <section id="workspaceStrip" class="workspace-strip">
27
+ <input id="workspaceInput" type="hidden" value="" />
28
+ <span id="workspaceMeta">Managed temporary workspace. Each chat uses an isolated runtime directory.</span>
29
+ </section>
30
+
31
+ <section id="timeline" class="messages">
32
+ <div class="welcome">
33
+ <h1>What should the agent do?</h1>
34
+ <p>Ask a question, attach images, and watch tool calls stream from an isolated temporary workspace.</p>
35
+ </div>
36
+ </section>
37
+
38
+ <footer class="composer-wrap">
39
+ <div id="imagePreview" class="image-preview"></div>
40
+ <div id="dropZone" class="composer">
41
+ <button id="attachBtn" class="icon-button" type="button" title="Click + to add one or more images">+</button>
42
+ <input id="imageInput" type="file" accept="image/*" multiple />
43
+ <textarea id="promptInput" rows="1" placeholder="Message ResearchHarness"></textarea>
44
+ <button id="runBtn" class="send-button" type="button">Run</button>
45
+ </div>
46
+ <p class="composer-hint">Enter sends. Ctrl+Enter or Shift+Enter inserts a newline. Click + to add one or more images; paste or drop images also works.</p>
47
+ </footer>
48
+ </main>
49
+
50
+ <section id="workspaceModal" class="modal hidden" role="dialog" aria-modal="true" aria-labelledby="workspaceModalTitle">
51
+ <div class="modal-card">
52
+ <header class="modal-head">
53
+ <div>
54
+ <h2 id="workspaceModalTitle">Open workspace</h2>
55
+ <p>Choose an existing local folder. Unicode paths are supported.</p>
56
+ </div>
57
+ <button id="workspaceCloseBtn" class="plain" type="button" aria-label="Close workspace picker">Close</button>
58
+ </header>
59
+ <div class="modal-path-row">
60
+ <input id="workspacePathInput" type="text" autocomplete="off" placeholder="Paste a folder path..." />
61
+ <button id="workspaceGoBtn" class="plain" type="button">Go</button>
62
+ </div>
63
+ <div id="workspaceRoots" class="workspace-roots"></div>
64
+ <div id="workspaceList" class="workspace-list"></div>
65
+ <footer class="modal-actions">
66
+ <span id="workspacePickerHint">Select a folder to use as the agent workspace.</span>
67
+ <button id="workspaceUseBtn" class="send-button" type="button">Use this folder</button>
68
+ </footer>
69
+ </div>
70
+ </section>
71
+ <script src="https://cdn.jsdelivr.net/npm/dompurify@3.2.6/dist/purify.min.js"></script>
72
+ <script src="https://cdn.jsdelivr.net/npm/marked@15.0.12/marked.min.js"></script>
73
+ <script src="/static/app.js"></script>
74
+ </body>
75
+ </html>
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.6
2
+ json5==0.14.0
3
+ openai==2.3.0
4
+ Pillow==11.3.0
5
+ requests==2.32.5
6
+ structai==0.1.22
7
+ tiktoken==0.12.0
8
+ uvicorn==0.34.0
run_agent.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Thin top-level CLI entrypoint for the ResearchHarness agent."""
2
+
3
+ from agent_base.react_agent import main
4
+
5
+
6
+ if __name__ == "__main__":
7
+ raise SystemExit(main())
run_frontend.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Launch the local ResearchHarness browser UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ import threading
8
+ import webbrowser
9
+
10
+ import uvicorn
11
+
12
+ from agent_base.utils import read_role_prompt_files
13
+ from frontend.local_server import app, configure_frontend
14
+
15
+
16
def _browser_url(host: str, port: int) -> str:
    """Return a URL a local browser can open for a server bound to *host*."""
    # Wildcard bind addresses are not valid destinations; use loopback instead.
    if host in ("", "0.0.0.0"):
        host = "127.0.0.1"
    elif host == "::":
        host = "::1"
    # IPv6 literals must be bracketed inside a URL authority.
    if ":" in host and not host.startswith("["):
        host = f"[{host}]"
    return f"http://{host}:{port}"


def main(argv: list[str] | None = None) -> int:
    """Parse CLI options, configure the frontend, and serve it with uvicorn.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` after the server stops, or ``1`` when the role prompt files or
        the frontend configuration are rejected with ``OSError``/``ValueError``.
    """
    parser = argparse.ArgumentParser(description="Run the local ResearchHarness frontend.")
    parser.add_argument("--host", default="127.0.0.1", help="Host to bind. Default: 127.0.0.1")
    parser.add_argument("--port", type=int, default=8765, help="Port to bind. Default: 8765")
    parser.add_argument("--no-browser", action="store_true", help="Do not open the browser automatically.")
    parser.add_argument("--trace-dir", help="Optional directory where frontend agent traces are written.")
    parser.add_argument(
        "--role-prompt-file",
        action="append",
        default=[],
        dest="role_prompt_files",
        metavar="PATH",
        help="Append one role-specific prompt file to the frontend agent. May be passed multiple times.",
    )
    args = parser.parse_args(argv)

    try:
        role_prompt = read_role_prompt_files(args.role_prompt_files)
        configure_frontend(role_prompt=role_prompt, trace_dir=args.trace_dir)
    except (OSError, ValueError) as exc:
        print(str(exc), file=sys.stderr)
        return 1

    # Build the URL from a browsable host: binding to 0.0.0.0 or an IPv6
    # address would otherwise yield a URL that browsers cannot open.
    url = _browser_url(args.host, args.port)
    if not args.no_browser:
        # Open the browser shortly after startup. The timer is a daemon so that
        # a server that fails to start does not still pop open a dead page.
        opener = threading.Timer(0.8, webbrowser.open, args=(url,))
        opener.daemon = True
        opener.start()
    print(f"ResearchHarness frontend: {url}")
    uvicorn.run(app, host=args.host, port=args.port, reload=False)
    return 0
45
+
46
+
47
+ if __name__ == "__main__":
48
+ raise SystemExit(main())
run_server.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run ResearchHarness as a minimal OpenAI-compatible API server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from agent_base.utils import PROJECT_ROOT, MissingRequiredEnvError, load_dotenv, require_required_env
9
+ from api.openai_server import serve
10
+
11
+
12
def main(argv: list[str] | None = None) -> int:
    """Parse CLI options and serve ResearchHarness through /v1/chat/completions.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` when ``serve`` returns normally, or ``1`` when a required
        environment variable is missing, an argument is rejected, or an
        operating-system error is raised.
    """
    parser = argparse.ArgumentParser(description="Serve ResearchHarness through /v1/chat/completions.")
    parser.add_argument(
        "--api-runs-dir",
        required=True,
        dest="api_runs_dir",
        help="Directory where the server creates one isolated subdirectory per request.",
    )
    parser.add_argument("--host", default="127.0.0.1", help="Host to bind. Defaults to 127.0.0.1.")
    parser.add_argument("--port", type=int, default=8686, help="Port to bind. Defaults to 8686.")
    parser.add_argument(
        "--role-prompt-file",
        action="append",
        default=[],
        dest="role_prompt_files",
        help="Optional role prompt file appended to the base ResearchHarness prompt.",
    )
    parser.add_argument(
        "--input-wrapper",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Enable or disable the input LLM wrapper. Enabled by default.",
    )
    parser.add_argument(
        "--output-wrapper",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Enable or disable the output LLM wrapper. Enabled by default.",
    )
    args = parser.parse_args(argv)

    # Load .env before validating that the required variables are present.
    load_dotenv(PROJECT_ROOT / ".env")
    try:
        require_required_env("ResearchHarness API server")
        serve(
            api_runs_dir=args.api_runs_dir,
            host=args.host,
            port=args.port,
            role_prompt_files=list(args.role_prompt_files),
            input_wrapper=args.input_wrapper,
            output_wrapper=args.output_wrapper,
        )
    # OSError is reported the same way as in the frontend launcher, so that
    # filesystem or socket errors print one line instead of a traceback.
    # NOTE(review): assumes serve() can raise OSError (role prompt files,
    # port bind) - confirm against api/openai_server.py.
    except (MissingRequiredEnvError, OSError, ValueError) as exc:
        print(str(exc), file=sys.stderr)
        return 1
    return 0
58
+
59
+
60
+ if __name__ == "__main__":
61
+ raise SystemExit(main())
traces/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+