victordibia committed on
Commit
a08910d
·
1 Parent(s): a23ff80

Deploy 2026-02-03 00:28:32

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +180 -69
  2. pyproject.toml +9 -2
  3. src/flow/cli/app.py +45 -8
  4. src/flow/cli/optimize.py +386 -29
  5. src/flow/cli/repl.py +12 -10
  6. src/flow/experiments/__init__.py +0 -2
  7. src/flow/experiments/ablation.py +10 -31
  8. src/flow/experiments/data/tasks/coding.jsonl +5 -10
  9. src/flow/experiments/data/tasks/gaia_all.jsonl +0 -0
  10. src/flow/experiments/data/tasks/gaia_level1.jsonl +106 -0
  11. src/flow/experiments/data/tasks/gaia_level2.jsonl +172 -0
  12. src/flow/experiments/data/tasks/gaia_level3.jsonl +52 -0
  13. src/flow/experiments/evaluators/heuristic.py +1 -1
  14. src/flow/experiments/evaluators/llm.py +14 -5
  15. src/flow/experiments/models.py +311 -32
  16. src/flow/experiments/optimizer.py +65 -13
  17. src/flow/experiments/runner.py +11 -5
  18. src/flow/experiments/types.py +50 -0
  19. src/flow/harness/__init__.py +23 -1
  20. src/flow/harness/base.py +24 -21
  21. src/flow/harness/langgraph/__init__.py +37 -0
  22. src/flow/harness/langgraph/compaction.py +51 -0
  23. src/flow/harness/langgraph/harness.py +257 -0
  24. src/flow/harness/langgraph/otel_callback.py +173 -0
  25. src/flow/harness/langgraph/wrappers.py +76 -0
  26. src/flow/harness/maf/__init__.py +4 -0
  27. src/flow/harness/maf/agent.py +15 -18
  28. src/flow/harness/maf/harness.py +68 -51
  29. src/flow/harness/maf/tools/__init__.py +96 -115
  30. src/flow/harness/maf/tools/coding.py +0 -391
  31. src/flow/harness/maf/tools/core.py +0 -100
  32. src/flow/harness/maf/tools/execution.py +0 -479
  33. src/flow/harness/maf/tools/memory.py +0 -260
  34. src/flow/harness/maf/tools/sub_agent.py +0 -196
  35. src/flow/harness/maf/wrappers.py +64 -0
  36. src/flow/harness/miniagent/__init__.py +139 -0
  37. src/flow/harness/miniagent/agent.py +604 -0
  38. src/flow/harness/miniagent/client.py +185 -0
  39. src/flow/harness/miniagent/context.py +664 -0
  40. src/flow/harness/miniagent/harness.py +403 -0
  41. src/flow/harness/miniagent/hooks.py +209 -0
  42. src/flow/harness/miniagent/instructions.py +207 -0
  43. src/flow/harness/miniagent/messages.py +88 -0
  44. src/flow/harness/miniagent/otel.py +258 -0
  45. src/flow/harness/miniagent/tool.py +173 -0
  46. src/flow/harness/miniagent/tools/__init__.py +125 -0
  47. src/flow/harness/miniagent/workspace.py +198 -0
  48. src/flow/harness/registry.py +80 -0
  49. src/flow/llm/__init__.py +49 -0
  50. src/flow/llm/config.py +227 -0
README.md CHANGED
@@ -1,124 +1,235 @@
1
- ---
2
- title: Flow
3
- emoji: 🔄
4
- colorFrom: blue
5
- colorTo: purple
6
- sdk: docker
7
- app_port: 7860
8
- pinned: false
9
- ---
10
-
11
  # Flow
12
 
13
- **Evaluate and Optimize Coding Agent Configurations**
 
 
 
14
 
15
- Flow is a framework for running experiments on LLM coding agents. Compare context engineering strategies (message compaction, agent memory, sub-agents), evaluate results with LLM-as-Judge, and find optimal configurations that balance quality and token cost.
 
 
 
16
 
17
  ![Flow UI](docs/flow.png)
18
 
19
- ## Features
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- - **Ablation Studies**: Test different agent configurations side-by-side
22
- - **LLM-as-Judge Evaluation**: Automatically score agent outputs for correctness
23
- - **Pareto Analysis**: Find optimal quality vs. cost tradeoffs
24
- - **Web UI**: Visual interface for managing experiments and viewing results
25
- - **Config Export**: Export winning configurations for production use
 
26
 
27
  ## Quick Start
28
 
29
  ### 1. Install
30
 
31
  ```bash
32
- # Clone and install with uv
33
  git clone https://github.com/victordibia/flow
34
  cd flow
35
  uv sync
36
  ```
37
 
38
- ### 2. Configure Azure OpenAI
 
 
39
 
40
  ```bash
41
- export AZURE_OPENAI_API_KEY="your-api-key"
42
- export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
43
- export AZURE_OPENAI_DEPLOYMENT="gpt-4o"
44
  ```
45
 
46
- ### 3. Run Optimization
 
 
 
 
 
 
 
 
47
 
48
  ```bash
49
- # Run with built-in task suite
50
- uv run flow optimize --suite coding
 
 
 
51
 
52
- # Or with custom tasks
53
- uv run flow optimize --tasks my_tasks.jsonl
54
  ```
55
 
56
- ### 4. Launch Web UI
 
 
57
 
58
  ```bash
 
59
  uv run flow serve
60
- # Opens at http://localhost:8091
 
 
61
  ```
62
 
63
- ## CLI Commands
64
 
65
- ```bash
66
- flow optimize [OPTIONS] # Run optimization experiments
67
- flow serve # Start the web UI
68
- flow run [TASK] # Run a single agent task
69
- flow config # Show current configuration
70
- flow init # Initialize Flow directories
 
 
 
 
 
 
71
  ```
72
 
73
- ## What Gets Optimized
74
 
75
- Flow tests different **context engineering strategies**:
 
 
76
 
77
- | Strategy | Description |
78
- |----------|-------------|
79
- | **Message Compaction** | Keep first N + last M messages, discard middle |
80
- | **Agent Memory** | Persistent storage the agent controls |
81
- | **Sub-Agent Isolation** | Delegate research to isolated sub-agent |
82
 
83
- Example configurations:
84
 
85
- ```python
86
- from flow.experiments.models import Agent, CompactionConfig, GridSearchStrategy
 
 
 
 
 
 
 
 
 
87
 
88
- # Define a base agent
89
- base = Agent(name="my_agent", enable_memory=True)
90
 
91
- # Generate candidates via grid search
92
- strategy = GridSearchStrategy(variations={
93
- "enable_memory": [True, False],
94
- "compaction": [CompactionConfig.head_tail(10, 40), CompactionConfig.none()],
95
- })
96
- candidates = strategy.generate(base, budget=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  ```
98
 
99
- ## Task Format
100
 
101
- Tasks are defined in JSONL format:
102
 
103
- ```json
104
- {"name": "fizzbuzz", "prompt": "Create fizzbuzz.py and run it", "criteria": [{"name": "correct", "instruction": "Output shows FizzBuzz pattern"}]}
 
 
 
 
 
 
 
 
105
  ```
106
 
107
- ## Development
 
 
108
 
109
  ```bash
110
- # Install dev dependencies
111
- uv sync --dev
 
 
 
 
 
 
112
 
113
- # Run tests
114
- uv run pytest tests/ -v
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- # Type checking
117
- uv run pyright src/
118
 
119
- # Linting
120
- uv run ruff check src/
121
- uv run ruff format src/
 
 
122
  ```
123
 
124
  ## License
 
 
 
 
 
 
 
 
 
 
 
1
  # Flow
2
 
3
+ > [!NOTE]
4
+ > Flow is an experimental prototype and changing rapidly.
5
+
6
+ Flow helps you find the best configuration for your AI coding agent. Define your agent spec, provide evaluation tasks, and Flow automatically generates variants, scores them, and shows you the quality vs. cost tradeoffs.
7
 
8
+ - **Simplified experimentation** — Automates the search for optimal agent configurations
9
+ - **Transparency** — See exactly what was tested, scores, and tradeoffs on a Pareto chart
10
+ - **User control** — Choose your tasks, evaluation criteria, and approve variants
11
+ - **Framework agnostic** — Standardized agent spec with pluggable runtime adapters (MAF built-in, extensible)
12
 
13
  ![Flow UI](docs/flow.png)
14
 
15
+ ## How It Works
16
+
17
+ ```mermaid
18
+ flowchart LR
19
+ A[Agent Spec] --> D[Optimizer]
20
+ B[Tasks] --> D
21
+ C[Evaluator] --> D
22
+ D --> E[Agent Variants/Candidates]
23
+ E --> F[Pareto Graph]
24
+ ```
25
+
26
+ ## Core Concepts
27
 
28
+ | Component | What It Is |
29
+ | -------------- | ----------------------------------------------------------------------------------- |
30
+ | **Agent Spec** | Agent configuration (model, tools, compaction, instructions) with pluggable runtime |
31
+ | **Task** | A coding challenge with evaluation criteria |
32
+ | **Evaluator** | Scores agent output (LLM-as-Judge, heuristics, or trace-based) |
33
+ | **Optimizer** | Generates variants and runs experiments (GridSearch, extensible) |
34
 
35
  ## Quick Start
36
 
37
  ### 1. Install
38
 
39
  ```bash
 
40
  git clone https://github.com/victordibia/flow
41
  cd flow
42
  uv sync
43
  ```
44
 
45
+ ### 2. Configure
46
+
47
+ Create a `.env` file in the project root:
48
 
49
  ```bash
50
+ AZURE_OPENAI_API_KEY=your-api-key-here
51
+ AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
52
+ AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-4o-mini
53
  ```
54
 
55
+ **Important:** Make sure your Azure OpenAI deployment has adequate rate limits:
56
+ - **Minimum:** 10,000 tokens per minute (TPM)
57
+ - **Recommended:** 30,000+ TPM for optimization runs
58
+
59
+ See [Azure Portal](https://portal.azure.com) → Your OpenAI resource → Deployments to adjust rate limits.
60
+
61
+ ### 3. Test Your Setup
62
+
63
+ Before running optimization, verify your Azure OpenAI connection:
64
 
65
  ```bash
66
+ # Test Azure OpenAI connection
67
+ uv run python scripts/test_azure_connection.py
68
+
69
+ # Test basic agent execution
70
+ uv run python scripts/test_basic_agent.py
71
 
72
+ # Test LLM evaluator
73
+ uv run python scripts/test_evaluator.py
74
  ```
75
 
76
+ All tests should pass with non-zero scores and token counts.
77
+
78
+ ### 4. Run
79
 
80
  ```bash
81
+ # Launch the web UI
82
  uv run flow serve
83
+
84
+ # Or run optimization from CLI (base agent + variations + tasks)
85
+ uv run flow optimize --agent base.yaml --vary compaction,tools --tasks tasks.jsonl
86
  ```
87
 
88
+ ## Agent Spec
89
 
90
+ Define your agent configuration:
91
+
92
+ ```python
93
+ from flow.experiments.models import Agent, CompactionConfig
94
+
95
+ agent = Agent(
96
+ name="my-agent",
97
+ framework="maf", # default; extensible to other runtimes
98
+ instructions="You are a coding assistant",
99
+ tools="standard", # or "minimal", "full", "readonly"
100
+ compaction=CompactionConfig.head_tail(10, 40), # keep first 10 + last 40 messages
101
+ )
102
  ```
103
 
104
+ Flow tests variations like:
105
 
106
+ - **Compaction strategies** — `none`, `head_tail(N, M)`, `last_n(N)`
107
+ - **Tool configurations** — different tool sets
108
+ - **Instructions** — prompt variations
109
 
110
+ ## Task Format
 
 
 
 
111
 
112
+ Tasks are JSONL with evaluation criteria:
113
 
114
+ ```json
115
+ {
116
+ "name": "fizzbuzz",
117
+ "prompt": "Create fizzbuzz.py and run it",
118
+ "criteria": [
119
+ { "name": "correct", "instruction": "Output shows FizzBuzz pattern" }
120
+ ]
121
+ }
122
+ ```
123
+
124
+ ## Web UI
125
 
126
+ Launch with `uv run flow serve`. Create agents, import task suites, run optimization jobs, and view results with Pareto analysis. Test agents interactively with live trace streaming.
 
127
 
128
+ ## CLI Commands
129
+
130
+ ```bash
131
+ # Web UI
132
+ flow serve # Start the web UI
133
+
134
+ # Optimization
135
+ flow optimize --agent base.yaml --tasks tasks.jsonl # Optimize base agent
136
+ flow optimize --vary compaction,tools # Vary specific parameters
137
+ flow optimize --suite coding # Use built-in task suite
138
+
139
+ # Single Task Execution
140
+ flow run "Create hello.py" # Run a single task
141
+ flow run --config best.yaml "task" # Run with optimized config
142
+
143
+ # Testing & Diagnostics
144
+ python scripts/test_azure_connection.py # Test Azure OpenAI connection
145
+ python scripts/test_basic_agent.py # Test basic agent execution
146
+ python scripts/test_evaluator.py # Test LLM evaluator
147
  ```
148
 
149
+ ## Optimizer
150
 
151
+ Flow includes multiple optimization strategies for finding the best agent configuration.
152
 
153
+ ### Grid Search (Default)
154
+
155
+ Test predefined variations of your agent:
156
+
157
+ ```bash
158
+ # Vary compaction and tool settings
159
+ flow optimize --agent examples/base_agent.yaml --vary compaction,tools --tasks examples/coding_tasks.jsonl
160
+
161
+ # Or define variations in a config file
162
+ flow optimize --config variations.yaml --agent base_agent.yaml --tasks tasks.jsonl
163
  ```
164
 
165
+ ### GEPA (Active Learning)
166
+
167
+ Use GEPA (Genetic-Pareto, a reflective prompt-evolution optimizer) for automatic prompt optimization:
168
 
169
  ```bash
170
+ # Run GEPA optimization
171
+ flow optimize \
172
+ --config examples/gepa_strategy.yaml \
173
+ --agent examples/base_agent.yaml \
174
+ --tasks examples/coding_tasks.jsonl \
175
+ --budget 10 \
176
+ --parallel 2
177
+ ```
178
 
179
+ **GEPA Configuration:**
180
+
181
+ 1. **Strategy Config** (`examples/gepa_strategy.yaml`):
182
+ ```yaml
183
+ strategy_type: gepa
184
+ config:
185
+ reflection_lm: gpt-4o-mini # Model for GEPA's reflection
186
+ ```
187
+
188
+ 2. **Base Agent** (`examples/base_agent.yaml`):
189
+ ```yaml
190
+ name: coding-assistant
191
+ model: gpt-4o-mini # Model for agent execution
192
+ tools: standard
193
+ instructions: |
194
+ Your initial prompt that GEPA will optimize...
195
+ ```
196
+
197
+ 3. **Run Optimization:**
198
+ - `--budget`: Number of optimization iterations (default: 10)
199
+ - `--parallel`: Concurrent evaluations (default: 4)
200
+ - Tasks must include evaluation criteria for LLM scoring
201
+
202
+ **Example Output:**
203
+ ```
204
+ [1/10] coding-assistant_gepa_eval/fibonacci: ✓ score=0.85 tokens=1,245
205
+ [2/10] coding-assistant_gepa_eval/palindrome: ✓ score=0.78 tokens=982
206
+ ...
207
+ Best agent exported to: ~/.flow/optimizations/<timestamp>/agents/best_score.yaml
208
+ ```
209
+
210
+ ### Requirements for Optimization
211
+
212
+ - **Azure OpenAI Deployment:** Create a deployment with your chosen model (e.g., `gpt-4o-mini`)
213
+ - **Rate Limits:** Minimum 10K TPM; 30K+ recommended for smooth runs
214
+ - **Task Criteria:** Tasks need evaluation criteria for LLM-based scoring:
215
+ ```json
216
+ {
217
+ "name": "task_name",
218
+ "prompt": "Task description",
219
+ "criteria": [
220
+ {"name": "correctness", "instruction": "Solution is correct", "weight": 1.0},
221
+ {"name": "quality", "instruction": "Code is clean and documented", "weight": 0.7}
222
+ ]
223
+ }
224
+ ```
225
 
226
+ ## Development
 
227
 
228
+ ```bash
229
+ uv sync --dev # Install dev dependencies
230
+ uv run pytest tests/ -v # Run tests
231
+ uv run pyright src/ # Type checking
232
+ uv run ruff check src/ # Linting
233
  ```
234
 
235
  ## License
pyproject.toml CHANGED
@@ -26,7 +26,7 @@ dependencies = [
26
  "typer>=0.9.0",
27
  "httpx>=0.25.0",
28
  "python-dotenv>=1.0.0",
29
- "agent-framework-core>=1.0.0b0",
30
  "azure-identity>=1.15.0",
31
  "pyyaml>=6.0.0",
32
  # OpenTelemetry for experiments tracing
@@ -38,14 +38,21 @@ dependencies = [
38
  "uvicorn>=0.27.0",
39
  "sqlmodel>=0.0.14",
40
  "aiosqlite>=0.19.0",
 
41
  ]
42
 
43
  [project.optional-dependencies]
44
  # Optional features
45
  research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
 
 
 
 
 
 
46
 
47
  # Bundles
48
- all = ["flow-agent[research]"]
49
  dev = [
50
  "pytest>=8.0.0",
51
  "pytest-asyncio>=0.23.0",
 
26
  "typer>=0.9.0",
27
  "httpx>=0.25.0",
28
  "python-dotenv>=1.0.0",
29
+ "agent-framework-core>=1.0.0b5",
30
  "azure-identity>=1.15.0",
31
  "pyyaml>=6.0.0",
32
  # OpenTelemetry for experiments tracing
 
38
  "uvicorn>=0.27.0",
39
  "sqlmodel>=0.0.14",
40
  "aiosqlite>=0.19.0",
41
+ "tiktoken>=0.12.0",
42
  ]
43
 
44
  [project.optional-dependencies]
45
  # Optional features
46
  research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
47
+ langgraph = [
48
+ "langgraph>=0.2.0",
49
+ "langchain-core>=0.3.0",
50
+ "langchain-openai>=0.2.0",
51
+ ]
52
+ optimizer = ["gepa>=0.0.20"]
53
 
54
  # Bundles
55
+ all = ["flow-agent[research,langgraph,optimizer]"]
56
  dev = [
57
  "pytest>=8.0.0",
58
  "pytest-asyncio>=0.23.0",
src/flow/cli/app.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
11
  from typing import Annotated
12
 
13
  import typer
 
14
  from rich.console import Console
15
 
16
  from flow import __version__
@@ -61,6 +62,10 @@ def run(
61
  Path | None,
62
  typer.Option("--config", "-c", help="Config file from optimization (YAML)"),
63
  ] = None,
 
 
 
 
64
  interactive: Annotated[
65
  bool,
66
  typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
@@ -82,9 +87,14 @@ def run(
82
  workspace_path.mkdir(parents=True, exist_ok=True)
83
  memory_path.mkdir(parents=True, exist_ok=True)
84
 
 
 
 
 
 
85
  if task:
86
  # Single task mode
87
- asyncio.run(_run_single_task(workspace_path, memory_path, task, config))
88
  elif interactive:
89
  # Interactive REPL mode
90
  from flow.cli.repl import FlowREPL
@@ -100,22 +110,47 @@ async def _run_single_task(
100
  memory_path: Path,
101
  task: str,
102
  config_path: Path | None = None,
 
103
  ) -> None:
104
  """Run a single task and print the result."""
105
  from flow.cli.output import print_event
106
  from flow.harness.base import EventType
107
- from flow.harness.maf import MAFHarness
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  if config_path:
110
  # Load agent config from optimization result
111
  from flow.experiments.models import load_agent
112
- from flow.experiments.ablation import create_harness_from_agent
113
 
114
  agent_config = load_agent(config_path)
115
- console.print(f"[dim]Using agent config: {agent_config.name}[/]")
116
- harness = create_harness_from_agent(agent_config, workspace)
 
 
 
 
 
 
 
 
 
 
117
  else:
118
- harness = MAFHarness(workspace=workspace, memory_path=memory_path)
 
119
 
120
  try:
121
  console.print("\n[bold blue]Flow[/] - Executing task...\n")
@@ -237,7 +272,7 @@ def config() -> None:
237
  table.add_row("Workspace", str(DEFAULT_WORKSPACE))
238
  table.add_row("Memory Path", str(DEFAULT_MEMORY_PATH))
239
  table.add_row("Azure Endpoint", os.environ.get("AZURE_OPENAI_ENDPOINT", "(not set)"))
240
- table.add_row("Azure Deployment", os.environ.get("AZURE_OPENAI_DEPLOYMENT", "(not set)"))
241
 
242
  console.print(table)
243
 
@@ -256,7 +291,7 @@ def init() -> None:
256
  console.print(" 1. Set your Azure OpenAI credentials:")
257
  console.print(" [dim]export AZURE_OPENAI_API_KEY=your-key[/]")
258
  console.print(" [dim]export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/[/]")
259
- console.print(" [dim]export AZURE_OPENAI_DEPLOYMENT=your-deployment[/]")
260
  console.print("\n 2. Run Flow:")
261
  console.print(' [dim]flow run "Create a hello world Python script"[/]')
262
  console.print(" [dim]flow run -i # Interactive mode[/]")
@@ -264,6 +299,8 @@ def init() -> None:
264
 
265
  def main() -> None:
266
  """Main entry point."""
 
 
267
  app()
268
 
269
 
 
11
  from typing import Annotated
12
 
13
  import typer
14
+ from dotenv import load_dotenv
15
  from rich.console import Console
16
 
17
  from flow import __version__
 
62
  Path | None,
63
  typer.Option("--config", "-c", help="Config file from optimization (YAML)"),
64
  ] = None,
65
+ framework: Annotated[
66
+ str,
67
+ typer.Option("--framework", "-f", help="Agent framework: 'maf', 'miniagent', or 'langgraph'"),
68
+ ] = "maf",
69
  interactive: Annotated[
70
  bool,
71
  typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
 
87
  workspace_path.mkdir(parents=True, exist_ok=True)
88
  memory_path.mkdir(parents=True, exist_ok=True)
89
 
90
+ # Validate framework
91
+ if framework not in ("maf", "miniagent", "langgraph"):
92
+ console.print(f"[red]Error:[/] Unknown framework '{framework}'. Use 'maf', 'miniagent', or 'langgraph'.")
93
+ raise typer.Exit(1)
94
+
95
  if task:
96
  # Single task mode
97
+ asyncio.run(_run_single_task(workspace_path, memory_path, task, config, framework))
98
  elif interactive:
99
  # Interactive REPL mode
100
  from flow.cli.repl import FlowREPL
 
110
  memory_path: Path,
111
  task: str,
112
  config_path: Path | None = None,
113
+ framework: str = "maf",
114
  ) -> None:
115
  """Run a single task and print the result."""
116
  from flow.cli.output import print_event
117
  from flow.harness.base import EventType
118
+
119
+ # Import harness modules to register them
120
+ import flow.harness.maf # noqa: F401
121
+ import flow.harness.miniagent # noqa: F401 # pyright: ignore[reportUnusedImport]
122
+
123
+ if framework == "langgraph":
124
+ try:
125
+ import flow.harness.langgraph # noqa: F401
126
+ except ImportError:
127
+ console.print("[red]Error:[/] LangGraph dependencies not installed.")
128
+ console.print("[dim]Install with: pip install flow-agent[langgraph][/]")
129
+ raise typer.Exit(1)
130
+
131
+ from flow.harness import create_harness
132
+ from flow.experiments.models import Agent
133
 
134
  if config_path:
135
  # Load agent config from optimization result
136
  from flow.experiments.models import load_agent
 
137
 
138
  agent_config = load_agent(config_path)
139
+ # Override framework if specified
140
+ if framework != "maf":
141
+ agent_config = Agent(
142
+ name=agent_config.name,
143
+ framework=framework,
144
+ tools=agent_config.tools,
145
+ model=agent_config.model,
146
+ instructions=agent_config.instructions,
147
+ compaction=agent_config.compaction,
148
+ )
149
+ console.print(f"[dim]Using agent config: {agent_config.name} ({framework})[/]")
150
+ harness = create_harness(agent_config, workspace)
151
  else:
152
+ agent = Agent(name="flow-cli", framework=framework)
153
+ harness = create_harness(agent, workspace)
154
 
155
  try:
156
  console.print("\n[bold blue]Flow[/] - Executing task...\n")
 
272
  table.add_row("Workspace", str(DEFAULT_WORKSPACE))
273
  table.add_row("Memory Path", str(DEFAULT_MEMORY_PATH))
274
  table.add_row("Azure Endpoint", os.environ.get("AZURE_OPENAI_ENDPOINT", "(not set)"))
275
+ table.add_row("Azure Deployment", os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME", "(not set)"))
276
 
277
  console.print(table)
278
 
 
291
  console.print(" 1. Set your Azure OpenAI credentials:")
292
  console.print(" [dim]export AZURE_OPENAI_API_KEY=your-key[/]")
293
  console.print(" [dim]export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/[/]")
294
+ console.print(" [dim]export AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=your-deployment[/]")
295
  console.print("\n 2. Run Flow:")
296
  console.print(' [dim]flow run "Create a hello world Python script"[/]')
297
  console.print(" [dim]flow run -i # Interactive mode[/]")
 
299
 
300
  def main() -> None:
301
  """Main entry point."""
302
+ # Load environment variables from .env file if present
303
+ load_dotenv()
304
  app()
305
 
306
 
src/flow/cli/optimize.py CHANGED
@@ -6,6 +6,7 @@ from __future__ import annotations
6
 
7
  import asyncio
8
  import importlib.util
 
9
  import sys
10
  from pathlib import Path
11
  from typing import Annotated, Any
@@ -13,7 +14,15 @@ from typing import Annotated, Any
13
  import typer
14
  from rich.console import Console
15
 
16
- from flow.experiments.models import Agent, Candidate, CompactionConfig, GridSearchStrategy
 
 
 
 
 
 
 
 
17
  from flow.experiments.optimizer import FlowOptimizer, load_tasks_from_jsonl
18
  from flow.experiments.types import Task, get_task_suite
19
 
@@ -32,7 +41,14 @@ def optimize(
32
  Path | None,
33
  typer.Option(
34
  "--config", "-c",
35
- help="Path to Python config file with CANDIDATES or VARIATIONS",
 
 
 
 
 
 
 
36
  ),
37
  ] = None,
38
  agent: Annotated[
@@ -60,7 +76,7 @@ def optimize(
60
  str | None,
61
  typer.Option(
62
  "--vary", "-v",
63
- help="Comma-separated params to vary: compaction,memory,subagent",
64
  ),
65
  ] = None,
66
  output: Annotated[
@@ -92,24 +108,31 @@ def optimize(
92
 
93
  Examples:
94
 
 
 
 
95
  # Run with task file and default candidates
96
  flow optimize --tasks tasks.jsonl
97
 
98
- # Use custom candidates from Python file
99
- flow optimize --config my_configs.py --tasks tasks.jsonl
100
-
101
  # Vary specific parameters
102
- flow optimize --vary compaction,memory --tasks tasks.jsonl
 
 
 
103
 
104
  # Use built-in task suite
105
  flow optimize --suite coding --parallel 2
106
 
107
  # Start from a base agent definition
108
- flow optimize --agent base_agent.yaml --vary compaction,memory --tasks tasks.jsonl
 
 
 
109
  """
110
  asyncio.run(_run_optimize(
111
  tasks_path=tasks,
112
  config_path=config,
 
113
  agent_path=agent,
114
  suite=suite,
115
  parallel=parallel,
@@ -123,6 +146,7 @@ def optimize(
123
  async def _run_optimize(
124
  tasks_path: Path | None,
125
  config_path: Path | None,
 
126
  agent_path: Path | None,
127
  suite: str | None,
128
  parallel: int,
@@ -132,6 +156,11 @@ async def _run_optimize(
132
  budget: int,
133
  ) -> None:
134
  """Run the optimization."""
 
 
 
 
 
135
  # Load tasks
136
  tasks = _load_tasks(tasks_path, suite)
137
  if not tasks:
@@ -141,8 +170,24 @@ async def _run_optimize(
141
  # Load base agent
142
  base = _load_base_agent(agent_path)
143
 
144
- # Load/generate candidates
145
- candidates = _load_candidates(config_path, vary, base, budget)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  if not candidates:
147
  console.print("[red]Error:[/] No candidates to test. Use --config or --vary")
148
  raise typer.Exit(1)
@@ -176,6 +221,94 @@ async def _run_optimize(
176
  raise typer.Exit(1)
177
 
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
180
  """Load tasks from file or built-in suite."""
181
  if tasks_path:
@@ -211,47 +344,119 @@ def _load_base_agent(agent_path: Path | None) -> Agent:
211
  return Agent(name="flow_agent")
212
 
213
 
214
- def _load_candidates(
215
  config_path: Path | None,
216
  vary: str | None,
217
  base: Agent,
218
  budget: int,
219
- ) -> list[Candidate]:
220
- """Load candidates from file or generate from variations."""
 
 
 
 
 
 
 
 
 
 
221
  if config_path:
222
  if not config_path.exists():
223
  console.print(f"[red]Error:[/] Config file not found: {config_path}")
224
  raise typer.Exit(1)
225
 
226
- candidates, variations = _load_python_config(config_path)
227
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  if variations:
229
  strategy = GridSearchStrategy(variations)
230
- return strategy.generate(base, budget)
231
  elif candidates:
232
- return candidates
233
  else:
234
- console.print("[red]Error:[/] Config file has no CANDIDATES or VARIATIONS")
235
  raise typer.Exit(1)
236
 
237
  if vary:
238
  variations = _parse_vary_flag(vary)
239
  strategy = GridSearchStrategy(variations)
240
- return strategy.generate(base, budget)
241
 
242
  # Default: explore context engineering dimensions
243
  strategy = GridSearchStrategy(variations={
244
- "enable_memory": [True, False],
245
  "compaction": [
246
  CompactionConfig.head_tail(10, 40),
247
  CompactionConfig.none(),
248
  ],
 
249
  })
250
- return strategy.generate(base, budget)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
 
253
- def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any]]:
254
- """Load CANDIDATES and VARIATIONS from a Python file."""
 
 
 
 
 
 
 
255
  spec = importlib.util.spec_from_file_location("config_module", path)
256
  if spec is None or spec.loader is None:
257
  raise ValueError(f"Cannot load {path}")
@@ -262,12 +467,21 @@ def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any]]:
262
 
263
  candidates = getattr(module, "CANDIDATES", [])
264
  variations = getattr(module, "VARIATIONS", {})
 
265
 
266
- return candidates, variations
267
 
268
 
269
  def _parse_vary_flag(vary: str) -> dict[str, Any]:
270
- """Parse --vary flag into variations dict."""
 
 
 
 
 
 
 
 
271
  variations: dict[str, Any] = {}
272
 
273
  for param in vary.split(","):
@@ -278,10 +492,17 @@ def _parse_vary_flag(vary: str) -> dict[str, Any]:
278
  CompactionConfig.head_tail(10, 40),
279
  CompactionConfig.none(),
280
  ]
281
- elif param in ("memory", "mem"):
282
- variations["enable_memory"] = [True, False]
283
- elif param in ("subagent", "sub"):
284
- variations["enable_sub_agent"] = [True, False]
 
 
 
 
 
 
 
285
  elif param in ("head", "head_size"):
286
  variations["compaction"] = [
287
  CompactionConfig.head_tail(h, 40) for h in [5, 10, 20]
@@ -294,3 +515,139 @@ def _parse_vary_flag(vary: str) -> dict[str, Any]:
294
  console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
295
 
296
  return variations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  import asyncio
8
  import importlib.util
9
+ import logging
10
  import sys
11
  from pathlib import Path
12
  from typing import Annotated, Any
 
14
  import typer
15
  from rich.console import Console
16
 
17
+ from flow.experiments.models import (
18
+ Agent,
19
+ Candidate,
20
+ CompactionConfig,
21
+ Experiment,
22
+ ExperimentResult,
23
+ GridSearchStrategy,
24
+ load_experiment,
25
+ )
26
  from flow.experiments.optimizer import FlowOptimizer, load_tasks_from_jsonl
27
  from flow.experiments.types import Task, get_task_suite
28
 
 
41
  Path | None,
42
  typer.Option(
43
  "--config", "-c",
44
+ help="Path to config file (YAML or Python) with STRATEGY, CANDIDATES, or VARIATIONS",
45
+ ),
46
+ ] = None,
47
+ experiment: Annotated[
48
+ Path | None,
49
+ typer.Option(
50
+ "--experiment", "-e",
51
+ help="Path to experiment YAML file (defines agent, tasks, and variations)",
52
  ),
53
  ] = None,
54
  agent: Annotated[
 
76
  str | None,
77
  typer.Option(
78
  "--vary", "-v",
79
+ help="Comma-separated params to vary: compaction, strategy, tools, head, tail",
80
  ),
81
  ] = None,
82
  output: Annotated[
 
108
 
109
  Examples:
110
 
111
+ # Use experiment YAML (recommended - defines agent, tasks, and variations)
112
+ flow optimize --experiment experiment.yaml
113
+
114
  # Run with task file and default candidates
115
  flow optimize --tasks tasks.jsonl
116
 
 
 
 
117
  # Vary specific parameters
118
+ flow optimize --vary compaction,tools --tasks tasks.jsonl
119
+
120
+ # Test all compaction strategies
121
+ flow optimize --vary strategy --suite coding
122
 
123
  # Use built-in task suite
124
  flow optimize --suite coding --parallel 2
125
 
126
  # Start from a base agent definition
127
+ flow optimize --agent base_agent.yaml --vary compaction,tools --tasks tasks.jsonl
128
+
129
+ # Use GEPA for active prompt optimization (via YAML config)
130
+ flow optimize --config gepa_strategy.yaml --agent base_agent.yaml --tasks tasks.jsonl
131
  """
132
  asyncio.run(_run_optimize(
133
  tasks_path=tasks,
134
  config_path=config,
135
+ experiment_path=experiment,
136
  agent_path=agent,
137
  suite=suite,
138
  parallel=parallel,
 
146
  async def _run_optimize(
147
  tasks_path: Path | None,
148
  config_path: Path | None,
149
+ experiment_path: Path | None,
150
  agent_path: Path | None,
151
  suite: str | None,
152
  parallel: int,
 
156
  budget: int,
157
  ) -> None:
158
  """Run the optimization."""
159
+ # If experiment YAML provided, use it as the source of truth
160
+ if experiment_path:
161
+ await _run_from_experiment(experiment_path, output_dir)
162
+ return
163
+
164
  # Load tasks
165
  tasks = _load_tasks(tasks_path, suite)
166
  if not tasks:
 
170
  # Load base agent
171
  base = _load_base_agent(agent_path)
172
 
173
+ # Load candidates and check if a strategy is defined in config
174
+ candidates, strategy_instance = _load_candidates_and_strategy(config_path, vary, base, budget)
175
+
176
+ # If a strategy was provided (like GepaStrategy), run it directly
177
+ if strategy_instance is not None:
178
+ console.print("\n[bold]Running active optimization strategy...[/]")
179
+ await _run_active_strategy(
180
+ strategy=strategy_instance,
181
+ base_agent=base,
182
+ tasks=tasks,
183
+ output_dir=output_dir,
184
+ parallel=parallel,
185
+ use_llm_eval=use_llm_eval,
186
+ budget=budget
187
+ )
188
+ return
189
+
190
+ # Otherwise, use traditional grid search with candidates
191
  if not candidates:
192
  console.print("[red]Error:[/] No candidates to test. Use --config or --vary")
193
  raise typer.Exit(1)
 
221
  raise typer.Exit(1)
222
 
223
 
224
async def _run_from_experiment(experiment_path: Path, output_dir: Path | None) -> None:
    """Run optimization from an experiment YAML file.

    The experiment YAML defines:
    - base_agent: Path to agent YAML
    - suite/tasks: Which tasks to run
    - variations: Parameter variations for grid search
    - parallel, budget, use_llm_eval: Optimization settings

    Relative ``base_agent`` and ``tasks`` paths are resolved against the
    directory that contains the experiment file.

    Args:
        experiment_path: Path to the experiment YAML file.
        output_dir: Output directory passed through to ``FlowOptimizer``.

    Raises:
        typer.Exit: With code 1 when the experiment, base agent or tasks file
            is missing, the suite name is rejected, neither ``suite`` nor
            ``tasks`` is given, or the run is interrupted from the keyboard.
    """
    if not experiment_path.exists():
        console.print(f"[red]Error:[/] Experiment file not found: {experiment_path}")
        raise typer.Exit(1)

    exp = load_experiment(experiment_path)

    def _resolve(raw: str | Path) -> Path:
        """Anchor a relative path at the experiment file's directory."""
        resolved = Path(raw)
        if not resolved.is_absolute():
            resolved = experiment_path.parent / resolved
        return resolved

    # Load base agent
    if exp.base_agent:
        base_agent_path = _resolve(exp.base_agent)
        if not base_agent_path.exists():
            console.print(f"[red]Error:[/] Base agent file not found: {base_agent_path}")
            raise typer.Exit(1)
        from flow.experiments.models import load_agent

        base = load_agent(base_agent_path)
    else:
        base = Agent(name="flow_agent")

    # Load tasks: an explicit tasks file takes precedence over a named suite
    tasks: list[Task] = []
    if exp.tasks:
        tasks_path = _resolve(exp.tasks)
        if not tasks_path.exists():
            console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
            raise typer.Exit(1)
        tasks = load_tasks_from_jsonl(tasks_path)
    elif exp.suite:
        try:
            tasks = get_task_suite(exp.suite)
        except ValueError as e:
            console.print(f"[red]Error:[/] {e}")
            raise typer.Exit(1) from e
    else:
        console.print("[red]Error:[/] Experiment must specify 'suite' or 'tasks'")
        raise typer.Exit(1)

    # Generate candidates from variations. Normalise a missing/None value to
    # an empty mapping so the summary loop below cannot fail on `.items()`.
    variations = exp.variations or {}
    if variations:
        strategy = GridSearchStrategy(variations)
        candidates = strategy.generate(base, exp.budget)
    else:
        # No variations: evaluate the base agent alone as the baseline
        candidates = [Candidate(agent=base, mutations={}, rationale="baseline")]

    console.print(f"\n[bold]Experiment:[/] {experiment_path.name}")
    console.print(f"[bold]Base Agent:[/] {base.name}")
    console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
    for t in tasks:
        console.print(f" - {t.name}")

    console.print("\n[bold]Variations:[/]")
    for key, values in variations.items():
        console.print(f" - {key}: {len(values)} variants")

    console.print(f"\n[bold]Candidates:[/] {len(candidates)}")

    # Run optimizer
    optimizer = FlowOptimizer(
        parallel=exp.parallel,
        use_llm_evaluator=exp.use_llm_eval,
        output_dir=output_dir,
    )

    try:
        result = await optimizer.optimize(candidates, tasks)

        console.print("\n[bold green]Optimization complete![/]")
        console.print(f"\nBest agents exported to: [cyan]{result.output_dir / 'agents'}[/]")
        console.print("\nTo use an agent config:")
        console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Optimization cancelled.[/]")
        # The interrupt itself is not an error worth a chained traceback
        raise typer.Exit(1) from None
310
+
311
+
312
  def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
313
  """Load tasks from file or built-in suite."""
314
  if tasks_path:
 
344
  return Agent(name="flow_agent")
345
 
346
 
347
def _load_candidates_and_strategy(
    config_path: Path | None,
    vary: str | None,
    base: Agent,
    budget: int,
) -> tuple[list[Candidate], Any | None]:
    """Load candidates from file or generate from variations.

    Supports both YAML and Python config files:
    - YAML: strategy configuration (strategy_type, config)
    - Python: STRATEGY object, CANDIDATES list, or VARIATIONS dict

    Resolution order: ``config_path`` first, then the ``--vary`` flag, then a
    built-in default grid over compaction and tool sets.

    Args:
        config_path: Optional YAML or Python config file.
        vary: Optional comma-separated ``--vary`` value.
        base: Base agent the grid search mutates.
        budget: Budget forwarded to ``GridSearchStrategy.generate``.

    Returns:
        Tuple of (candidates, strategy_instance)
        - If a STRATEGY is defined in config, returns ([], strategy_instance)
        - Otherwise returns (candidates, None) for traditional grid search

    Raises:
        typer.Exit: With code 1 when the config file is missing, a YAML config
            defines no strategy, or a Python config defines none of
            CANDIDATES, VARIATIONS or STRATEGY.
    """
    if config_path:
        if not config_path.exists():
            console.print(f"[red]Error:[/] Config file not found: {config_path}")
            raise typer.Exit(1)

        # Check file extension to determine format. Compare case-insensitively
        # so "CONFIG.YAML" is not mistaken for a Python module and executed.
        if config_path.suffix.lower() in (".yaml", ".yml"):
            strategy_obj = _load_yaml_strategy(config_path)
            if strategy_obj is not None:
                return [], strategy_obj
            # YAML files currently only support strategy definitions
            console.print("[red]Error:[/] YAML config must define a strategy")
            raise typer.Exit(1)

        # Python config file
        candidates, variations, strategy_obj = _load_python_config(config_path)

        # If a strategy object was provided (e.g., GepaStrategy), return it
        if strategy_obj is not None:
            return [], strategy_obj

        # VARIATIONS wins over CANDIDATES when a config defines both
        if variations:
            strategy = GridSearchStrategy(variations)
            return strategy.generate(base, budget), None
        if candidates:
            return candidates, None

        console.print("[red]Error:[/] Config file has no CANDIDATES, VARIATIONS, or STRATEGY")
        raise typer.Exit(1)

    if vary:
        variations = _parse_vary_flag(vary)
        strategy = GridSearchStrategy(variations)
        return strategy.generate(base, budget), None

    # Default: explore context engineering dimensions
    strategy = GridSearchStrategy(variations={
        "compaction": [
            CompactionConfig.head_tail(10, 40),
            CompactionConfig.none(),
        ],
        "tools": ["minimal", "standard"],
    })
    return strategy.generate(base, budget), None
408
+
409
+
410
def _load_yaml_strategy(path: Path) -> Any | None:
    """Load strategy configuration from a YAML file.

    Expected YAML format:
    ```yaml
    strategy_type: gepa # or other strategy types
    config:
      reflection_lm: gpt-4o
      population_size: 5
      optimize_fields:
        - instructions
    ```

    Args:
        path: Path to the YAML file.

    Returns:
        Strategy instance or None if file doesn't define a strategy

    Raises:
        typer.Exit: With code 1 when the strategy type is unknown or the GEPA
            optimizer cannot be imported.
    """
    import yaml

    # Pin the encoding so parsing does not depend on the platform default
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # An empty file parses to None, and a top-level list or scalar cannot carry
    # a strategy definition; all of these mean "no strategy here".
    if not isinstance(data, dict) or "strategy_type" not in data:
        return None

    # str() keeps a non-string value (e.g. `strategy_type: 1`) on the
    # "unknown strategy" path instead of failing on `.lower()`.
    strategy_type = str(data["strategy_type"]).lower()
    # A bare `config:` key parses to None; hand the strategy an empty dict instead
    strategy_config = data.get("config") or {}

    if strategy_type == "gepa":
        try:
            from flow.optimizers import GepaStrategy
            return GepaStrategy(config=strategy_config)
        except ImportError as e:
            console.print("[red]Error:[/] GEPA optimizer not available.")
            console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
            raise typer.Exit(1) from e

    console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
    console.print("[dim]Supported: gepa[/]")
    raise typer.Exit(1)
449
 
450
 
451
+ def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any], Any | None]:
452
+ """Load CANDIDATES, VARIATIONS, and STRATEGY from a Python file.
453
+
454
+ Returns:
455
+ Tuple of (candidates, variations, strategy)
456
+ - candidates: List of Candidate objects
457
+ - variations: Dict of parameter variations for GridSearchStrategy
458
+ - strategy: Strategy instance (e.g., GepaStrategy) or None
459
+ """
460
  spec = importlib.util.spec_from_file_location("config_module", path)
461
  if spec is None or spec.loader is None:
462
  raise ValueError(f"Cannot load {path}")
 
467
 
468
  candidates = getattr(module, "CANDIDATES", [])
469
  variations = getattr(module, "VARIATIONS", {})
470
+ strategy = getattr(module, "STRATEGY", None)
471
 
472
+ return candidates, variations, strategy
473
 
474
 
475
  def _parse_vary_flag(vary: str) -> dict[str, Any]:
476
+ """Parse --vary flag into variations dict.
477
+
478
+ Supported parameters:
479
+ compaction, compact: Test head_tail vs none
480
+ strategy: Test all compaction strategies (none, head_tail, sliding_window, summarization)
481
+ tools: Test minimal vs standard tool sets
482
+ head, head_size: Vary head sizes (5, 10, 20)
483
+ tail, tail_size: Vary tail sizes (20, 40, 60)
484
+ """
485
  variations: dict[str, Any] = {}
486
 
487
  for param in vary.split(","):
 
492
  CompactionConfig.head_tail(10, 40),
493
  CompactionConfig.none(),
494
  ]
495
+ elif param in ("strategy", "strategies"):
496
+ # Test all compaction strategies
497
+ variations["compaction"] = [
498
+ CompactionConfig.none(),
499
+ CompactionConfig.head_tail(10, 40),
500
+ CompactionConfig(strategy="sliding_window", token_budget=50_000),
501
+ CompactionConfig(strategy="summarization", token_budget=50_000),
502
+ ]
503
+ elif param in ("tools", "toolset"):
504
+ # Tool variations - memory and subagent are just tools
505
+ variations["tools"] = ["minimal", "standard"]
506
  elif param in ("head", "head_size"):
507
  variations["compaction"] = [
508
  CompactionConfig.head_tail(h, 40) for h in [5, 10, 20]
 
515
  console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
516
 
517
  return variations
518
+
519
+
520
async def _run_active_strategy(
    strategy: Any,
    base_agent: Agent,
    tasks: list[Task],
    output_dir: Path | None,
    parallel: int,
    use_llm_eval: bool,
    budget: int,
) -> None:
    """Run an active optimization strategy (like GEPA).

    The strategy's synchronous ``generate`` call runs in an executor thread.
    Whenever it needs a score it calls the injected ``evaluator``, which
    schedules ``FlowOptimizer.optimize`` back onto this coroutine's event loop
    and blocks the worker thread until the result is available.

    Args:
        strategy: Strategy object exposing ``generate(base_agent, budget)``.
            ``evaluator`` and ``dataset`` attributes are filled in when they
            exist and are ``None``.
        base_agent: Agent the strategy starts from.
        tasks: Tasks used when the strategy passes no minibatch; also injected
            as the strategy's dataset.
        output_dir: When set, generated agents are exported to
            ``output_dir / "agents"``.
        parallel: Parallelism forwarded to ``FlowOptimizer``.
        use_llm_eval: Whether ``FlowOptimizer`` uses the LLM evaluator.
        budget: Budget forwarded to ``strategy.generate``.
    """
    logger = logging.getLogger(__name__)

    # Create optimizer instance to run evaluations
    optimizer_runner = FlowOptimizer(
        parallel=parallel,
        use_llm_evaluator=use_llm_eval,
        output_dir=None,  # Don't export every intermediate run result
    )

    main_loop = asyncio.get_running_loop()

    def _failed_result(candidate: Candidate, error: str, reasoning: str) -> ExperimentResult:
        """Build the zero-score result returned when evaluation fails."""
        return ExperimentResult(
            candidate=candidate,
            run_result=None,
            metrics={"score": 0.0, "error": error},
            eval_score=0.0,
            eval_passed=False,
            eval_reasoning=reasoning,
            traces={},
        )

    # Define evaluator function to inject into strategy
    def evaluator(candidate: Candidate, minibatch: list[Task] | None = None) -> ExperimentResult:
        """Evaluate a candidate on a minibatch of tasks."""
        eval_tasks = minibatch if minibatch else tasks
        agent_name = candidate.agent.name

        # Lazy %-style logging: a formatting problem is handled by the logging
        # module rather than raising here and zeroing a valid score.
        logger.info("[EVALUATOR] Evaluating candidate '%s' on %d tasks", agent_name, len(eval_tasks))
        logger.info("[EVALUATOR] Using LLM evaluator: %s", use_llm_eval)
        logger.debug("[EVALUATOR] Tasks: %s", [t.name for t in eval_tasks])

        try:
            # Run async evaluation on the main loop and wait for result.
            # This is safe because strategy.generate (which calls this)
            # is running in an executor thread.
            future = asyncio.run_coroutine_threadsafe(
                optimizer_runner.optimize([candidate], eval_tasks),
                main_loop,
            )
            optimization_result = future.result()

            # Check if we got any results
            if not optimization_result.summaries:
                logger.warning(
                    "[EVALUATOR] Optimization produced no summaries for candidate '%s'",
                    agent_name,
                )
                # Return a fallback result with zero score instead of raising
                return _failed_result(
                    candidate,
                    "No summaries produced",
                    "Evaluation failed to produce results",
                )

            summary = optimization_result.summaries[0]
            logger.info(
                "[EVALUATOR] Candidate '%s' avg_score=%.3f, pass_rate=%.2f",
                agent_name,
                summary.avg_score,
                summary.pass_rate,
            )

            task_results = summary.task_results or []

            # Log individual task results for debugging
            for tr in task_results:
                logger.info(
                    "[EVALUATOR] Task '%s': score=%.3f, passed=%s",
                    tr.task_name,
                    tr.eval_score,
                    tr.eval_passed,
                )
                logger.debug("[EVALUATOR] Reasoning: '%s'", (tr.eval_reasoning or "")[:150])
                logger.debug(
                    "[EVALUATOR] Metrics: tokens=%s, duration=%.2fs",
                    getattr(tr.metrics, "total_tokens", "n/a"),
                    tr.run_result.duration_seconds if tr.run_result else 0,
                )

            # Convert CandidateSummary to ExperimentResult for GEPA
            if task_results:
                first = task_results[0]
                run_result = first.run_result
                traces = run_result.trace if run_result and isinstance(run_result.trace, dict) else {}

                if len(task_results) == 1:
                    return ExperimentResult(
                        candidate=candidate,
                        run_result=run_result,
                        metrics=first.metrics,
                        eval_score=first.eval_score,
                        eval_passed=first.eval_passed,
                        eval_reasoning=first.eval_reasoning,
                        traces=traces,
                    )

                # Several tasks were evaluated: report the aggregate score so
                # the strategy is not steered by the first task alone. The
                # pass threshold mirrors the aggregate fallback below.
                # NOTE(review): assumes the strategy wants one score per
                # minibatch - confirm against GepaStrategy.
                return ExperimentResult(
                    candidate=candidate,
                    run_result=run_result,
                    metrics=first.metrics,
                    eval_score=summary.avg_score,
                    eval_passed=summary.pass_rate > 0.5,
                    eval_reasoning="\n".join(
                        f"[{tr.task_name}] {tr.eval_reasoning}" for tr in task_results
                    ),
                    traces=traces,
                )

            # Fallback to aggregate metrics if no individual task results
            return ExperimentResult(
                candidate=candidate,
                run_result=None,
                metrics={"score": summary.avg_score},
                eval_score=summary.avg_score,
                eval_passed=summary.pass_rate > 0.5,
                eval_reasoning=f"Aggregate pass rate: {summary.pass_rate}",
                traces={},
            )

        except Exception as e:
            # Deliberate best-effort boundary: a failed evaluation must not
            # abort the whole strategy run, so log it and score it as zero.
            logger.error("Error evaluating candidate '%s': %s", agent_name, e, exc_info=True)
            return _failed_result(candidate, str(e), f"Evaluation error: {str(e)}")

    # Inject dependencies into strategy if supported
    # GepaStrategy accepts them in __init__, but we might have loaded it from config
    # without them.
    if hasattr(strategy, "evaluator") and strategy.evaluator is None:
        strategy.evaluator = evaluator
    if hasattr(strategy, "dataset") and strategy.dataset is None:
        strategy.dataset = tasks

    # Execute strategy (blocking/sync) in an executor thread. This is required,
    # not optional: the evaluator blocks on work scheduled onto this event
    # loop, so running generate() on the loop thread would deadlock.
    candidates = await main_loop.run_in_executor(None, strategy.generate, base_agent, budget)

    console.print("\n[bold green]Optimization complete![/]")
    console.print(f"Generated {len(candidates)} candidates.")

    # Export results
    if output_dir:
        from flow.experiments.models import export_agent

        agents_dir = output_dir / "agents"
        agents_dir.mkdir(parents=True, exist_ok=True)

        exported_names: set[str] = set()
        for i, cand in enumerate(candidates):
            # Basic export
            name = cand.agent.name or f"candidate_{i}"
            # Candidates that share an agent name would overwrite each other's
            # file; suffix later duplicates with their index.
            if name in exported_names:
                name = f"{name}_{i}"
            exported_names.add(name)
            export_agent(cand.agent, agents_dir / f"{name}.yaml", metrics={"rationale": cand.rationale})

        console.print(f"\nAgents exported to: [cyan]{output_dir / 'agents'}[/]")
653
+
src/flow/cli/repl.py CHANGED
@@ -11,8 +11,8 @@ from pathlib import Path
11
  from rich.console import Console
12
 
13
  from flow.cli.output import print_event, print_welcome
14
- from flow.harness.base import EventType
15
- from flow.harness.maf import MAFHarness
16
 
17
  # Default paths
18
  DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
@@ -40,16 +40,18 @@ class FlowREPL:
40
  self._workspace = workspace or DEFAULT_WORKSPACE
41
  self._memory_path = memory_path or DEFAULT_MEMORY_PATH
42
  self._console = Console()
43
- self._harness: MAFHarness | None = None
44
  self._thread_id: str | None = None
45
 
46
- def _get_harness(self) -> MAFHarness:
47
  """Get or create the harness instance."""
48
  if self._harness is None:
49
- self._harness = MAFHarness(
50
- workspace=self._workspace,
51
- memory_path=self._memory_path,
52
- )
 
 
53
  return self._harness
54
 
55
  async def run(self) -> None:
@@ -112,7 +114,7 @@ class FlowREPL:
112
  except EOFError:
113
  return None
114
 
115
- async def _run_task(self, harness: MAFHarness, task: str) -> None:
116
  """Run a task and stream the output.
117
 
118
  Args:
@@ -122,7 +124,7 @@ class FlowREPL:
122
  self._console.print() # Blank line before output
123
 
124
  try:
125
- async for event in harness.run_stream(task, self._thread_id):
126
  print_event(self._console, event)
127
 
128
  # Store thread ID for conversation continuity
 
11
  from rich.console import Console
12
 
13
  from flow.cli.output import print_event, print_welcome
14
+ from flow.experiments.models import Agent
15
+ from flow.harness.base import BaseHarness, EventType
16
 
17
  # Default paths
18
  DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
 
40
  self._workspace = workspace or DEFAULT_WORKSPACE
41
  self._memory_path = memory_path or DEFAULT_MEMORY_PATH
42
  self._console = Console()
43
+ self._harness: BaseHarness | None = None
44
  self._thread_id: str | None = None
45
 
46
def _get_harness(self) -> BaseHarness:
    """Return the cached harness, building it on first use."""
    harness = self._harness
    if harness is not None:
        return harness

    # Importing the maf module registers its harness; the registry's
    # create_harness is then used to build the instance.
    import flow.harness.maf  # noqa: F401
    from flow.harness import create_harness

    harness = create_harness(Agent(name="flow-repl"), self._workspace)
    self._harness = harness
    return harness
56
 
57
  async def run(self) -> None:
 
114
  except EOFError:
115
  return None
116
 
117
+ async def _run_task(self, harness: BaseHarness, task: str) -> None:
118
  """Run a task and stream the output.
119
 
120
  Args:
 
124
  self._console.print() # Blank line before output
125
 
126
  try:
127
+ async for event in harness.run_stream(task):
128
  print_event(self._console, event)
129
 
130
  # Store thread ID for conversation continuity
src/flow/experiments/__init__.py CHANGED
@@ -52,7 +52,6 @@ from .models import (
52
  # Experiment runner + Pareto analysis
53
  from .ablation import (
54
  compute_pareto_frontier,
55
- create_harness_from_agent,
56
  generate_recommendation,
57
  run_experiments,
58
  run_single_experiment,
@@ -146,7 +145,6 @@ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
146
  "print_comparison_table",
147
  "print_eval_result",
148
  # Experiment runner
149
- "create_harness_from_agent",
150
  "run_experiments",
151
  "run_single_experiment",
152
  "compute_pareto_frontier",
 
52
  # Experiment runner + Pareto analysis
53
  from .ablation import (
54
  compute_pareto_frontier,
 
55
  generate_recommendation,
56
  run_experiments,
57
  run_single_experiment,
 
145
  "print_comparison_table",
146
  "print_eval_result",
147
  # Experiment runner
 
148
  "run_experiments",
149
  "run_single_experiment",
150
  "compute_pareto_frontier",
src/flow/experiments/ablation.py CHANGED
@@ -19,46 +19,17 @@ from typing import TYPE_CHECKING, Any
19
 
20
  from .evaluators import HeuristicEvaluator
21
  from .metrics import extract_metrics, metrics_to_dict
22
- from .models import Agent, Candidate, ExperimentResult
23
  from .reporters import print_comparison_table, save_run_result
24
  from .runner import FlowExperimentRunner, setup_tracing
25
  from .types import EvalCriterion, Task
26
 
27
  if TYPE_CHECKING:
28
- from flow.harness.maf import MAFHarness
29
-
30
  from .optimizer import CandidateSummary
31
 
32
  logger = logging.getLogger(__name__)
33
 
34
 
35
- def create_harness_from_agent(agent: Agent, workspace: Path) -> MAFHarness:
36
- """Create a MAFHarness from an Agent definition.
37
-
38
- Args:
39
- agent: The agent definition
40
- workspace: Working directory
41
-
42
- Returns:
43
- A configured MAFHarness
44
- """
45
- from flow.experiments.models import resolve_tools
46
- from flow.harness.maf import MAFHarness
47
-
48
- # Resolve tools to dict form
49
- tools_spec = resolve_tools(agent.tools)
50
-
51
- return MAFHarness(
52
- workspace=workspace,
53
- memory_path=workspace / "memory",
54
- enable_compaction=agent.compaction.enabled,
55
- compaction_head_size=agent.compaction.head_size,
56
- compaction_tail_size=agent.compaction.tail_size,
57
- tools=tools_spec,
58
- instructions=agent.instructions,
59
- )
60
-
61
-
62
  async def run_single_experiment(
63
  candidate: Candidate,
64
  task: Task,
@@ -74,7 +45,15 @@ async def run_single_experiment(
74
  Returns:
75
  ExperimentResult with metrics and evaluation
76
  """
77
- harness = create_harness_from_agent(candidate.agent, workspace)
 
 
 
 
 
 
 
 
78
 
79
  try:
80
  runner = FlowExperimentRunner(keep_workspace=True)
 
19
 
20
  from .evaluators import HeuristicEvaluator
21
  from .metrics import extract_metrics, metrics_to_dict
22
+ from .models import Candidate, ExperimentResult
23
  from .reporters import print_comparison_table, save_run_result
24
  from .runner import FlowExperimentRunner, setup_tracing
25
  from .types import EvalCriterion, Task
26
 
27
  if TYPE_CHECKING:
 
 
28
  from .optimizer import CandidateSummary
29
 
30
  logger = logging.getLogger(__name__)
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  async def run_single_experiment(
34
  candidate: Candidate,
35
  task: Task,
 
45
  Returns:
46
  ExperimentResult with metrics and evaluation
47
  """
48
+ # Import harness modules to register them, then use registry
49
+ import flow.harness.maf # noqa: F401
50
+ try:
51
+ import flow.harness.miniagent # noqa: F401
52
+ except ImportError:
53
+ pass # miniagent harness is optional
54
+ from flow.harness import create_harness
55
+
56
+ harness = create_harness(candidate.agent, workspace)
57
 
58
  try:
59
  runner = FlowExperimentRunner(keep_workspace=True)
src/flow/experiments/data/tasks/coding.jsonl CHANGED
@@ -1,10 +1,5 @@
1
- {"name": "fizzbuzz", "prompt": "Create fizzbuzz.py that prints FizzBuzz 1-100 and run it.", "criteria": [{"name": "correct", "instruction": "Correct FizzBuzz output"}], "category": "short"}
2
- {"name": "rest_api", "prompt": "Create a FastAPI CRUD TODO app with GET/POST/DELETE endpoints.", "criteria": [{"name": "has_crud", "instruction": "Has working CRUD"}], "category": "medium"}
3
- {"name": "cli_tool", "prompt": "Create an argparse CLI that counts lines/words/chars in a file.", "criteria": [{"name": "works", "instruction": "CLI works correctly"}], "category": "medium"}
4
- {"name": "data_pipeline", "prompt": "Create a script that reads CSV data, filters rows, aggregates, and outputs JSON.", "criteria": [{"name": "works", "instruction": "Pipeline produces correct output"}], "category": "medium"}
5
- {"name": "unit_tests", "prompt": "Create calc.py with math functions and test_calc.py with pytest tests.", "criteria": [{"name": "tests_pass", "instruction": "Tests pass"}], "category": "medium"}
6
- {"name": "web_scraper", "prompt": "Create a script that fetches a webpage and extracts all links.", "criteria": [{"name": "extracts_links", "instruction": "Extracts links correctly"}], "category": "medium"}
7
- {"name": "async_downloader", "prompt": "Create an async script that downloads multiple URLs concurrently using aiohttp.", "criteria": [{"name": "uses_async", "instruction": "Uses async/await correctly"}], "category": "complex"}
8
- {"name": "database_orm", "prompt": "Create a SQLAlchemy model for Users with CRUD operations.", "criteria": [{"name": "has_orm", "instruction": "Uses SQLAlchemy ORM correctly"}], "category": "complex"}
9
- {"name": "decorator_lib", "prompt": "Create a library with timing, retry, and caching decorators.", "criteria": [{"name": "decorators_work", "instruction": "Decorators function correctly"}], "category": "complex"}
10
- {"name": "config_parser", "prompt": "Create a config parser that supports YAML, JSON, and env vars with validation.", "criteria": [{"name": "multi_format", "instruction": "Supports multiple formats"}], "category": "complex"}
 
1
+ {"name": "repo_documentation", "prompt": "Clone the repository https://github.com/microsoft-foundry/ai-tutorials and generate comprehensive documentation.\n\nSTEP 1: Clone the repository\n- Use bash to clone: git clone https://github.com/microsoft-foundry/ai-tutorials\n- Confirm the clone succeeded by listing the directory contents\n\nSTEP 2: Explore the structure\n- List all files and directories recursively\n- Identify the main components, tutorials, and examples\n\nSTEP 3: Generate documentation\nFor EVERY file in the repository:\n1. Read the complete file\n2. Document its purpose (1-2 sentences)\n3. List key functions/classes if code, or sections if markdown\n4. Note dependencies or prerequisites\n\nSTEP 4: Create a comprehensive report\n- Overall repository purpose and structure\n- Table of contents of all tutorials/examples\n- Prerequisites for running each tutorial\n- Suggested learning path for beginners\n\nBe thorough. Read every file completely. Document everything.", "criteria": [{"name": "clone_success", "instruction": "Repository was successfully cloned", "weight": 1.0}, {"name": "file_coverage", "instruction": "All files in the repository were read and documented", "weight": 0.9}, {"name": "documentation_quality", "instruction": "Each file has meaningful description, not just filenames", "weight": 0.8}, {"name": "synthesis", "instruction": "Final report provides useful overview and learning path", "weight": 0.7}], "metadata": {"expected_iterations": 20, "min_tokens": 50000, "category": "context_stress"}}
2
+ {"name": "code_review", "prompt": "Clone https://github.com/microsoft-foundry/ai-tutorials and perform an exhaustive code review.\n\nSTEP 1: Clone the repository\n- git clone https://github.com/microsoft-foundry/ai-tutorials\n- Verify the clone succeeded\n\nSTEP 2: Inventory all code files\n- Find all Python files (.py)\n- Find all Jupyter notebooks (.ipynb)\n- Find all configuration files\n\nSTEP 3: Review each code file\nFor EVERY Python file and notebook:\n1. Read the complete file\n2. For each function/method, document:\n - Name and signature\n - What it does (1-2 sentences)\n - Any potential issues (edge cases, missing error handling)\n3. For each class, document:\n - Purpose\n - All methods with their purposes\n\nSTEP 4: Synthesize findings\n- Summary table of all modules and their relationships\n- Top 10 code quality issues found\n- Recommendations for improvement\n- Best practices observed worth replicating\n\nRead every file. Be thorough and systematic.", "criteria": [{"name": "clone_success", "instruction": "Repository was successfully cloned", "weight": 1.0}, {"name": "completeness", "instruction": "All code files were read and reviewed", "weight": 0.9}, {"name": "depth", "instruction": "Each function/class has meaningful analysis, not just signatures", "weight": 0.8}, {"name": "issues_found", "instruction": "Identified real code quality issues with specific examples", "weight": 0.7}], "metadata": {"expected_iterations": 22, "min_tokens": 55000, "category": "context_stress"}}
3
+ {"name": "tutorial_analysis", "prompt": "Clone https://github.com/microsoft-foundry/ai-tutorials and analyze the tutorial content for educational effectiveness.\n\nSTEP 1: Clone the repository\n- git clone https://github.com/microsoft-foundry/ai-tutorials\n- List the repository contents to understand structure\n\nSTEP 2: Read ALL tutorials\nFor EACH tutorial or example:\n1. Read the complete content\n2. Identify the learning objectives\n3. List prerequisites assumed\n4. Note the teaching approach used\n\nSTEP 3: Evaluate educational quality\nFor each tutorial, assess:\n- Clarity of explanations\n- Code-to-explanation ratio\n- Progression of difficulty\n- Hands-on exercises included\n- Common pitfalls addressed\n\nSTEP 4: Create improvement report\n- Rank tutorials by educational effectiveness\n- Identify gaps in coverage\n- Suggest specific improvements for each tutorial\n- Recommend additional tutorials that should be added\n- Create an optimal learning sequence\n\nBe thorough. Read every file. Provide specific examples.", "criteria": [{"name": "clone_success", "instruction": "Repository was successfully cloned", "weight": 1.0}, {"name": "tutorial_coverage", "instruction": "All tutorials were read and analyzed", "weight": 0.9}, {"name": "evaluation_depth", "instruction": "Evaluation criteria applied consistently across tutorials", "weight": 0.8}, {"name": "actionable_recommendations", "instruction": "Improvement suggestions are specific and implementable", "weight": 0.7}], "metadata": {"expected_iterations": 20, "min_tokens": 50000, "category": "context_stress"}}
4
+ {"name": "dependency_audit", "prompt": "Clone https://github.com/microsoft-foundry/ai-tutorials and perform a thorough dependency and compatibility audit.\n\nSTEP 1: Clone the repository\n- git clone https://github.com/microsoft-foundry/ai-tutorials\n- Confirm successful clone\n\nSTEP 2: Find all dependency specifications\n- Search for requirements.txt files\n- Search for pyproject.toml files\n- Search for setup.py files\n- Search for environment.yml files\n- Check imports in Python files for implicit dependencies\n\nSTEP 3: Analyze each dependency\nFor EVERY dependency found:\n1. Current version specified (or 'unpinned' if none)\n2. Latest available version\n3. Known security vulnerabilities\n4. Compatibility with Python 3.10, 3.11, 3.12\n5. Transitive dependencies introduced\n\nSTEP 4: Generate audit report\n- Dependency tree visualization (text format)\n- Security vulnerabilities found with severity\n- Version conflicts or incompatibilities\n- Recommendations for updates\n- Suggested requirements.txt with pinned versions\n\nRead all relevant files. Be thorough and specific.", "criteria": [{"name": "clone_success", "instruction": "Repository was successfully cloned", "weight": 1.0}, {"name": "dependency_discovery", "instruction": "All dependency specifications were found and analyzed", "weight": 0.9}, {"name": "analysis_depth", "instruction": "Each dependency was analyzed for versions and compatibility", "weight": 0.8}, {"name": "actionable_report", "instruction": "Report includes specific version recommendations", "weight": 0.7}], "metadata": {"expected_iterations": 18, "min_tokens": 45000, "category": "context_stress"}}
5
+ {"name": "architecture_analysis", "prompt": "Clone https://github.com/microsoft-foundry/ai-tutorials and analyze the overall architecture and design patterns.\n\nSTEP 1: Clone the repository\n- git clone https://github.com/microsoft-foundry/ai-tutorials\n- Verify clone success\n\nSTEP 2: Map the repository structure\n- Create a complete directory tree\n- Identify major components/modules\n- Document file organization patterns\n\nSTEP 3: Analyze design patterns\nFor EACH significant code file:\n1. Read the complete file\n2. Identify design patterns used (factory, singleton, observer, etc.)\n3. Note coding conventions and style\n4. Document error handling approaches\n5. Analyze how components interact\n\nSTEP 4: Create architecture document\n- High-level architecture diagram (text format)\n- Component interaction map\n- Data flow descriptions\n- Design pattern catalog with examples from code\n- Evaluation of architectural decisions\n- Suggestions for architectural improvements\n\nRead every file. Document patterns with specific code references.", "criteria": [{"name": "clone_success", "instruction": "Repository was successfully cloned", "weight": 1.0}, {"name": "structure_mapped", "instruction": "Complete directory structure documented", "weight": 0.8}, {"name": "patterns_identified", "instruction": "Design patterns identified with specific code examples", "weight": 0.9}, {"name": "architecture_doc", "instruction": "Architecture document is comprehensive and accurate", "weight": 0.8}], "metadata": {"expected_iterations": 22, "min_tokens": 55000, "category": "context_stress"}}
 
 
 
 
 
src/flow/experiments/data/tasks/gaia_all.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
src/flow/experiments/data/tasks/gaia_level1.jsonl ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"name": "e1fc63a2-da7a-432f-be78-7c4a95598703", "prompt": "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 17", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "17", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
2
+ {"name": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "prompt": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
3
+ {"name": "ec09fa32-d03f-4bf8-84b0-1f16922c3ae4", "prompt": "Here's a fun riddle that I think you'll enjoy.\n\nYou have been selected to play the final round of the hit new game show \"Pick That Ping-Pong\". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.\n\nA device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.\n\nThe ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. 
If your pick is ejected by one of the pistons, you win the grand prize, $10,000.\n\nWhich ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
4
+ {"name": "5d0080cb-90d7-4712-bc33-848150e917d3", "prompt": "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon\u2019s Diet?\"", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.1777", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "0.1777", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
5
+ {"name": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "prompt": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
6
+ {"name": "46719c30-f4c3-4cad-be07-d5cb21eee6bb", "prompt": "Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus, Which Is Better?\" in 2015, what was the title of the first paper authored by the one that had authored prior papers?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Mapping Human Oriented Information to Software Agents for Online Systems Usage", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Mapping Human Oriented Information to Software Agents for Online Systems Usage", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
7
+ {"name": "4b6bb5f7-f634-410e-815d-e673ab7f8632", "prompt": "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: THE CASTLE", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "THE CASTLE", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
8
+ {"name": "cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb", "prompt": "An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Fred", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Fred", "gaia_level": 1, "gaia_file": "cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx", "source": "gaia-benchmark"}}
9
+ {"name": "2d83110e-a098-4ebb-9987-066c06fa42d0", "prompt": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Right", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Right", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
10
+ {"name": "5cfb274c-0207-4aa7-9575-6ac0bd95d9b2", "prompt": "Each cell in the attached spreadsheet represents a plot of land. The color of the cell indicates who owns that plot. Green cells are plots owned by Earl Smith. Can Earl walk through every plot he owns (and no other plots) and return to his starting plot without backtracking? For this question, consider backtracking to be any instance where Earl would enter a plot of land he had already entered since leaving his starting plot.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: No", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "No", "gaia_level": 1, "gaia_file": "5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx", "source": "gaia-benchmark"}}
11
+ {"name": "27d5d136-8563-469e-92bf-fd103c28b57c", "prompt": "\u00ac(A \u2227 B) \u2194 (\u00acA \u2228 \u00acB)\n\u00ac(A \u2228 B) \u2194 (\u00acA \u2227 \u00acB)\n(A \u2192 B) \u2194 (\u00acB \u2192 \u00acA)\n(A \u2192 B) \u2194 (\u00acA \u2228 B)\n(\u00acA \u2192 B) \u2194 (A \u2228 \u00acB)\n\u00ac(A \u2192 B) \u2194 (A \u2227 \u00acB)\n\nWhich of the above is not logically equivalent to the rest? Provide the full statement that doesn't fit.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: (\u00acA \u2192 B) \u2194 (A \u2228 \u00acB)", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "(\u00acA \u2192 B) \u2194 (A \u2228 \u00acB)", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
12
+ {"name": "dc28cf18-6431-458b-83ef-64b3ce566c10", "prompt": "My family reunion is this week, and I was assigned the mashed potatoes to bring. The attendees include my married mother and father, my twin brother and his family, my aunt and her family, my grandma and her brother, her brother's daughter, and his daughter's family. All the adults but me have been married, and no one is divorced or remarried, but my grandpa and my grandma's sister-in-law passed away last year. All living spouses are attending. My brother has two children that are still kids, my aunt has one six-year-old, and my grandma's brother's daughter has three kids under 12. I figure each adult will eat about 1.5 potatoes of mashed potatoes and each kid will eat about 1/2 a potato of mashed potatoes, except my second cousins don't eat carbs. The average potato is about half a pound, and potatoes are sold in 5-pound bags. How many whole bags of potatoes do I need? Just give the number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "2", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
13
+ {"name": "b816bfce-3d80-4913-a07d-69b752ce6377", "prompt": "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: fluffy", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "fluffy", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
14
+ {"name": "72e110e7-464c-453c-a309-90a95aed6538", "prompt": "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Guatemala", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Guatemala", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
15
+ {"name": "42576abe-0deb-4869-8c63-225c2d75a95a", "prompt": "In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend. \n\nThe word that indicates oneself is \"Pa\" is the nominative form, \"Mato\" is the accusative form, and \"Sing\" is the genitive form. \n\nThe root verb that indicates an intense like for something is \"Maktay\". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is \"Tay\", and when it is used in the imperfect past, it is \"Aktay\". It is used differently than in English, and is better translated as \"is pleasing to\", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.\n\nThe word for apples is borrowed from English in Tizin, and so it is \"Apple\" is the nominative form, \"Zapple\" is the accusative form, and \"Izapple\" is the genitive form. \n\nPlease translate \"I like apples\" to Tizin.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Maktay mato apple", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Maktay mato apple", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
16
+ {"name": "b415aba4-4b68-4fc6-9b89-2c812e55a3e1", "prompt": "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: diamond", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "diamond", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
17
+ {"name": "cca530fc-4052-43b2-b130-b30968d8aa44", "prompt": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Rd5", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Rd5", "gaia_level": 1, "gaia_file": "cca530fc-4052-43b2-b130-b30968d8aa44.png", "source": "gaia-benchmark"}}
18
+ {"name": "935e2cff-ae78-4218-b3f5-115589b19dae", "prompt": "In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: research", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "research", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
19
+ {"name": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8", "prompt": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: FunkMonk", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "FunkMonk", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
20
+ {"name": "5188369a-3bbe-43d8-8b94-11558f909a08", "prompt": "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Annie Levin", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Annie Levin", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
21
+ {"name": "6f37996b-2ac7-44b0-8e68-6d28256631b4", "prompt": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: b, e", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "b, e", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
22
+ {"name": "9318445f-fe6a-4e1b-acbf-c68228c9906a", "prompt": "As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170", "gaia_level": 1, "gaia_file": "9318445f-fe6a-4e1b-acbf-c68228c9906a.png", "source": "gaia-benchmark"}}
23
+ {"name": "389793a7-ca17-4e82-81cb-2b3a2391b4b9", "prompt": "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": "389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt", "source": "gaia-benchmark"}}
24
+ {"name": "4b650a35-8529-4695-89ed-8dc7a500a498", "prompt": "If there is anything that doesn't make sense in the instructions, write the word \"Pineapple.\" Do not answer any of the questions in this prompt. Write only the word \"Guava\".\n1. What is 4+4?\n2. What is the complimentary color of red?\n3. How many hours are there in a day?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Guava", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Guava", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
25
+ {"name": "a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c", "prompt": "How many slides in this PowerPoint presentation mention crustaceans?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "4", "gaia_level": 1, "gaia_file": "a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx", "source": "gaia-benchmark"}}
26
+ {"name": "c714ab3a-da30-4603-bacd-d008800188b9", "prompt": "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 100", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "100", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
27
+ {"name": "9d191bce-651d-4746-be2d-7ef8ecadb9c2", "prompt": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Extremely", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Extremely", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
28
+ {"name": "65afbc8a-89ca-4ad5-8d62-355bb401f61d", "prompt": "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: F478A7", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "F478A7", "gaia_level": 1, "gaia_file": "65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx", "source": "gaia-benchmark"}}
29
+ {"name": "cabe07ed-9eca-40ea-8ead-410ef5e83f91", "prompt": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Louvrier", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Louvrier", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
30
+ {"name": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7", "prompt": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: broccoli, celery, fresh basil, lettuce, sweet potatoes", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "broccoli, celery, fresh basil, lettuce, sweet potatoes", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
31
+ {"name": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3", "prompt": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries", "gaia_level": 1, "gaia_file": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3", "source": "gaia-benchmark"}}
32
+ {"name": "d0633230-7067-47a9-9dbf-ee11e0a2cdd6", "prompt": "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: BaseLabelPropagation", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "BaseLabelPropagation", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
33
+ {"name": "305ac316-eef6-4446-960a-92d80d542f82", "prompt": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Wojciech", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Wojciech", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
34
+ {"name": "0383a3ee-47a7-41a4-b493-519bdefe0488", "prompt": "On the BBC Earth YouTube video of the Top 5 Silliest Animal Moments, what species of bird is featured?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Rockhopper penguin", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Rockhopper penguin", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
35
+ {"name": "f918266a-b3e0-4914-865d-4faa564f1aef", "prompt": "What is the final numeric output from the attached Python code?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "0", "gaia_level": 1, "gaia_file": "f918266a-b3e0-4914-865d-4faa564f1aef.py", "source": "gaia-benchmark"}}
36
+ {"name": "11af4e1a-5f45-467d-9aeb-46f4bb0bf034", "prompt": "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "6", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
37
+ {"name": "e142056d-56ab-4352-b091-b56054bd1359", "prompt": "Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule restricting the host's coin placement is that one box must contain at least 2 coins, and one box must contain 6 more coins than another box. In order to play, Bob must submit three guesses, one guess for the number of coins in each box. The box is then opened and the number of coins is revealed. If Bob's guess is a number greater than the number of coins in the box, Bob earns no coins. If Bob guesses a number equal to or less than the number of coins in the box, Bob wins a number of coins equal to his guess.\n\nIf Bob plays uses the optimal strategy, what's the minimum amount of money he can win from the game?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 16000", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "16000", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
38
+ {"name": "50ad0280-0819-4bd9-b275-5de32d3b5bcb", "prompt": "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:\n\nTHESE\nAGULL\nGLIDE\nDPEAC\nEFULL\nYTOMY\nCHAIR", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: The seagull glided peacefully to my chair.", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "The seagull glided peacefully to my chair.", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
39
+ {"name": "7673d772-ef80-4f0f-a602-1bf4485c9b43", "prompt": "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has \"witnesses\" in the most titles as of 2021?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: inference", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "inference", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
40
+ {"name": "c365c1c7-a3db-4d5e-a9a1-66f56eae7865", "prompt": "Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? Give them to me in alphabetical order, in a comma-separated list", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Braintree, Honolulu", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Braintree, Honolulu", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
41
+ {"name": "7d4a7d1d-cac6-44a8-96e8-ea9584a70825", "prompt": "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 22", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "22", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
42
+ {"name": "dc22a632-937f-4e6a-b72f-ba0ff3f5ff97", "prompt": "What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
43
+ {"name": "3f57289b-8c60-48be-bd80-01f8099ca449", "prompt": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 519", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "519", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
44
+ {"name": "23dd907f-1261-4488-b21c-e9185af91d5e", "prompt": "In Audre Lorde\u2019s poem \u201cFather Son and Holy Ghost\u201d, what is the number of the stanza in which some lines are indented?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "2", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
45
+ {"name": "1f975693-876d-457b-a649-393859e79bf3", "prompt": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 132, 133, 134, 197, 245", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "132, 133, 134, 197, 245", "gaia_level": 1, "gaia_file": "1f975693-876d-457b-a649-393859e79bf3.mp3", "source": "gaia-benchmark"}}
46
+ {"name": "840bfca7-4f7b-481a-8794-c560c340185d", "prompt": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 80GSFC21M0002", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "80GSFC21M0002", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
47
+ {"name": "a0068077-79f4-461a-adfe-75c1a4148545", "prompt": "What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 90", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "90", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
48
+ {"name": "bda648d7-d618-4883-88f4-3466eabd860e", "prompt": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Saint Petersburg", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Saint Petersburg", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
49
+ {"name": "50ec8903-b81f-4257-9450-1085afd2c319", "prompt": "A standard Rubik\u2019s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face\u2019s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: green, white", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "green, white", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
50
+ {"name": "cf106601-ab4f-4af9-b045-5295fe67b37d", "prompt": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: CUB", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "CUB", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
51
+ {"name": "a0c07678-e491-4bbc-8f0b-07405144218f", "prompt": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Yoshida, Uehara", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Yoshida, Uehara", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
52
+ {"name": "7bd855d8-463d-4ed5-93ca-5fe35145f733", "prompt": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 89706.00", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "89706.00", "gaia_level": 1, "gaia_file": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx", "source": "gaia-benchmark"}}
53
+ {"name": "5a0c1adf-205e-4841-a666-7c3ef95def9d", "prompt": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Claus", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Claus", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
54
+ {"name": "e1fc63a2-da7a-432f-be78-7c4a95598703", "prompt": "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 17", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "17", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
55
+ {"name": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "prompt": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
56
+ {"name": "ec09fa32-d03f-4bf8-84b0-1f16922c3ae4", "prompt": "Here's a fun riddle that I think you'll enjoy.\n\nYou have been selected to play the final round of the hit new game show \"Pick That Ping-Pong\". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.\n\nA device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.\n\nThe ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. 
If your pick is ejected by one of the pistons, you win the grand prize, $10,000.\n\nWhich ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
57
+ {"name": "5d0080cb-90d7-4712-bc33-848150e917d3", "prompt": "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon\u2019s Diet?\"", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.1777", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "0.1777", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
58
+ {"name": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "prompt": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
59
+ {"name": "46719c30-f4c3-4cad-be07-d5cb21eee6bb", "prompt": "Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus, Which Is Better?\" in 2015, what was the title of the first paper authored by the one that had authored prior papers?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Mapping Human Oriented Information to Software Agents for Online Systems Usage", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Mapping Human Oriented Information to Software Agents for Online Systems Usage", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
60
+ {"name": "4b6bb5f7-f634-410e-815d-e673ab7f8632", "prompt": "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: THE CASTLE", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "THE CASTLE", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
61
+ {"name": "cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb", "prompt": "An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Fred", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Fred", "gaia_level": 1, "gaia_file": "cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx", "source": "gaia-benchmark"}}
62
+ {"name": "2d83110e-a098-4ebb-9987-066c06fa42d0", "prompt": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Right", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Right", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
63
+ {"name": "5cfb274c-0207-4aa7-9575-6ac0bd95d9b2", "prompt": "Each cell in the attached spreadsheet represents a plot of land. The color of the cell indicates who owns that plot. Green cells are plots owned by Earl Smith. Can Earl walk through every plot he owns (and no other plots) and return to his starting plot without backtracking? For this question, consider backtracking to be any instance where Earl would enter a plot of land he had already entered since leaving his starting plot.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: No", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "No", "gaia_level": 1, "gaia_file": "5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx", "source": "gaia-benchmark"}}
64
+ {"name": "27d5d136-8563-469e-92bf-fd103c28b57c", "prompt": "\u00ac(A \u2227 B) \u2194 (\u00acA \u2228 \u00acB)\n\u00ac(A \u2228 B) \u2194 (\u00acA \u2227 \u00acB)\n(A \u2192 B) \u2194 (\u00acB \u2192 \u00acA)\n(A \u2192 B) \u2194 (\u00acA \u2228 B)\n(\u00acA \u2192 B) \u2194 (A \u2228 \u00acB)\n\u00ac(A \u2192 B) \u2194 (A \u2227 \u00acB)\n\nWhich of the above is not logically equivalent to the rest? Provide the full statement that doesn't fit.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: (\u00acA \u2192 B) \u2194 (A \u2228 \u00acB)", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "(\u00acA \u2192 B) \u2194 (A \u2228 \u00acB)", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
65
+ {"name": "dc28cf18-6431-458b-83ef-64b3ce566c10", "prompt": "My family reunion is this week, and I was assigned the mashed potatoes to bring. The attendees include my married mother and father, my twin brother and his family, my aunt and her family, my grandma and her brother, her brother's daughter, and his daughter's family. All the adults but me have been married, and no one is divorced or remarried, but my grandpa and my grandma's sister-in-law passed away last year. All living spouses are attending. My brother has two children that are still kids, my aunt has one six-year-old, and my grandma's brother's daughter has three kids under 12. I figure each adult will eat about 1.5 potatoes of mashed potatoes and each kid will eat about 1/2 a potato of mashed potatoes, except my second cousins don't eat carbs. The average potato is about half a pound, and potatoes are sold in 5-pound bags. How many whole bags of potatoes do I need? Just give the number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "2", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
66
+ {"name": "b816bfce-3d80-4913-a07d-69b752ce6377", "prompt": "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: fluffy", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "fluffy", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
67
+ {"name": "72e110e7-464c-453c-a309-90a95aed6538", "prompt": "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Guatemala", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Guatemala", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
68
+ {"name": "42576abe-0deb-4869-8c63-225c2d75a95a", "prompt": "In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend. \n\nThe word that indicates oneself is \"Pa\" is the nominative form, \"Mato\" is the accusative form, and \"Sing\" is the genitive form. \n\nThe root verb that indicates an intense like for something is \"Maktay\". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is \"Tay\", and when it is used in the imperfect past, it is \"Aktay\". It is used differently than in English, and is better translated as \"is pleasing to\", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.\n\nThe word for apples is borrowed from English in Tizin, and so it is \"Apple\" is the nominative form, \"Zapple\" is the accusative form, and \"Izapple\" is the genitive form. \n\nPlease translate \"I like apples\" to Tizin.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Maktay mato apple", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Maktay mato apple", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
69
+ {"name": "b415aba4-4b68-4fc6-9b89-2c812e55a3e1", "prompt": "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: diamond", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "diamond", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
70
+ {"name": "cca530fc-4052-43b2-b130-b30968d8aa44", "prompt": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Rd5", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Rd5", "gaia_level": 1, "gaia_file": "cca530fc-4052-43b2-b130-b30968d8aa44.png", "source": "gaia-benchmark"}}
71
+ {"name": "935e2cff-ae78-4218-b3f5-115589b19dae", "prompt": "In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: research", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "research", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
72
+ {"name": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8", "prompt": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: FunkMonk", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "FunkMonk", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
73
+ {"name": "5188369a-3bbe-43d8-8b94-11558f909a08", "prompt": "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Annie Levin", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Annie Levin", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
74
+ {"name": "6f37996b-2ac7-44b0-8e68-6d28256631b4", "prompt": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: b, e", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "b, e", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
75
+ {"name": "9318445f-fe6a-4e1b-acbf-c68228c9906a", "prompt": "As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170", "gaia_level": 1, "gaia_file": "9318445f-fe6a-4e1b-acbf-c68228c9906a.png", "source": "gaia-benchmark"}}
76
+ {"name": "389793a7-ca17-4e82-81cb-2b3a2391b4b9", "prompt": "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "3", "gaia_level": 1, "gaia_file": "389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt", "source": "gaia-benchmark"}}
77
+ {"name": "4b650a35-8529-4695-89ed-8dc7a500a498", "prompt": "If there is anything that doesn't make sense in the instructions, write the word \"Pineapple.\" Do not answer any of the questions in this prompt. Write only the word \"Guava\".\n1. What is 4+4?\n2. What is the complimentary color of red?\n3. How many hours are there in a day?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Guava", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Guava", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
78
+ {"name": "a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c", "prompt": "How many slides in this PowerPoint presentation mention crustaceans?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "4", "gaia_level": 1, "gaia_file": "a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx", "source": "gaia-benchmark"}}
79
+ {"name": "c714ab3a-da30-4603-bacd-d008800188b9", "prompt": "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 100", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "100", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
80
+ {"name": "9d191bce-651d-4746-be2d-7ef8ecadb9c2", "prompt": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Extremely", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Extremely", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
81
+ {"name": "65afbc8a-89ca-4ad5-8d62-355bb401f61d", "prompt": "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: F478A7", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "F478A7", "gaia_level": 1, "gaia_file": "65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx", "source": "gaia-benchmark"}}
82
+ {"name": "cabe07ed-9eca-40ea-8ead-410ef5e83f91", "prompt": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Louvrier", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Louvrier", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
83
+ {"name": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7", "prompt": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: broccoli, celery, fresh basil, lettuce, sweet potatoes", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "broccoli, celery, fresh basil, lettuce, sweet potatoes", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
84
+ {"name": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3", "prompt": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries", "gaia_level": 1, "gaia_file": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3", "source": "gaia-benchmark"}}
85
+ {"name": "d0633230-7067-47a9-9dbf-ee11e0a2cdd6", "prompt": "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: BaseLabelPropagation", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "BaseLabelPropagation", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
86
+ {"name": "305ac316-eef6-4446-960a-92d80d542f82", "prompt": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Wojciech", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Wojciech", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
87
+ {"name": "0383a3ee-47a7-41a4-b493-519bdefe0488", "prompt": "On the BBC Earth YouTube video of the Top 5 Silliest Animal Moments, what species of bird is featured?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Rockhopper penguin", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Rockhopper penguin", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
88
+ {"name": "f918266a-b3e0-4914-865d-4faa564f1aef", "prompt": "What is the final numeric output from the attached Python code?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "0", "gaia_level": 1, "gaia_file": "f918266a-b3e0-4914-865d-4faa564f1aef.py", "source": "gaia-benchmark"}}
89
+ {"name": "11af4e1a-5f45-467d-9aeb-46f4bb0bf034", "prompt": "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "6", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
90
+ {"name": "e142056d-56ab-4352-b091-b56054bd1359", "prompt": "Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule restricting the host's coin placement is that one box must contain at least 2 coins, and one box must contain 6 more coins than another box. In order to play, Bob must submit three guesses, one guess for the number of coins in each box. The box is then opened and the number of coins is revealed. If Bob's guess is a number greater than the number of coins in the box, Bob earns no coins. If Bob guesses a number equal to or less than the number of coins in the box, Bob wins a number of coins equal to his guess.\n\nIf Bob plays uses the optimal strategy, what's the minimum amount of money he can win from the game?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 16000", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "16000", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
91
+ {"name": "50ad0280-0819-4bd9-b275-5de32d3b5bcb", "prompt": "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:\n\nTHESE\nAGULL\nGLIDE\nDPEAC\nEFULL\nYTOMY\nCHAIR", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: The seagull glided peacefully to my chair.", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "The seagull glided peacefully to my chair.", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
92
+ {"name": "7673d772-ef80-4f0f-a602-1bf4485c9b43", "prompt": "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has \"witnesses\" in the most titles as of 2021?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: inference", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "inference", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
93
+ {"name": "c365c1c7-a3db-4d5e-a9a1-66f56eae7865", "prompt": "Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? Give them to me in alphabetical order, in a comma-separated list", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Braintree, Honolulu", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Braintree, Honolulu", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
94
+ {"name": "7d4a7d1d-cac6-44a8-96e8-ea9584a70825", "prompt": "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 22", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "22", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
95
+ {"name": "dc22a632-937f-4e6a-b72f-ba0ff3f5ff97", "prompt": "What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
96
+ {"name": "3f57289b-8c60-48be-bd80-01f8099ca449", "prompt": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 519", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "519", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
97
+ {"name": "23dd907f-1261-4488-b21c-e9185af91d5e", "prompt": "In Audre Lorde\u2019s poem \u201cFather Son and Holy Ghost\u201d, what is the number of the stanza in which some lines are indented?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "2", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
98
+ {"name": "1f975693-876d-457b-a649-393859e79bf3", "prompt": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 132, 133, 134, 197, 245", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "132, 133, 134, 197, 245", "gaia_level": 1, "gaia_file": "1f975693-876d-457b-a649-393859e79bf3.mp3", "source": "gaia-benchmark"}}
99
+ {"name": "840bfca7-4f7b-481a-8794-c560c340185d", "prompt": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 80GSFC21M0002", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "80GSFC21M0002", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
100
+ {"name": "a0068077-79f4-461a-adfe-75c1a4148545", "prompt": "What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 90", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "90", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
101
+ {"name": "bda648d7-d618-4883-88f4-3466eabd860e", "prompt": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Saint Petersburg", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Saint Petersburg", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
102
+ {"name": "50ec8903-b81f-4257-9450-1085afd2c319", "prompt": "A standard Rubik\u2019s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face\u2019s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: green, white", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "green, white", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
103
+ {"name": "cf106601-ab4f-4af9-b045-5295fe67b37d", "prompt": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: CUB", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "CUB", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
104
+ {"name": "a0c07678-e491-4bbc-8f0b-07405144218f", "prompt": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Yoshida, Uehara", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Yoshida, Uehara", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
105
+ {"name": "7bd855d8-463d-4ed5-93ca-5fe35145f733", "prompt": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 89706.00", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "89706.00", "gaia_level": 1, "gaia_file": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx", "source": "gaia-benchmark"}}
106
+ {"name": "5a0c1adf-205e-4841-a666-7c3ef95def9d", "prompt": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Claus", "weight": 1.0}], "category": "level_1", "metadata": {"gaia_answer": "Claus", "gaia_level": 1, "gaia_file": null, "source": "gaia-benchmark"}}
src/flow/experiments/data/tasks/gaia_level2.jsonl ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"name": "c61d22de-5f6c-4958-a7f6-5e9707bd3466", "prompt": "A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: egalitarian", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "egalitarian", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
2
+ {"name": "17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc", "prompt": "I\u2019m researching species that became invasive after people who kept them as pets released them. There\u2019s a certain species of fish that was popularized as a pet by being the main character of the movie Finding Nemo. According to the USGS, where was this fish found as a nonnative species, before the year 2020? I need the answer formatted as the five-digit zip codes of the places the species was found, separated by commas if there is more than one place.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 34689", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "34689", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
3
+ {"name": "04a04a9b-226c-43fd-b319-d5e89743676f", "prompt": "If we assume all articles published by Nature in 2020 (articles, only, not book reviews/columns, etc) relied on statistical significance to justify their findings and they on average came to a p-value of 0.04, how many papers would be incorrect as to their claims of statistical significance? Round the value up to the next integer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 41", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "41", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
4
+ {"name": "14569e28-c88c-43e4-8c32-097d35b9a67d", "prompt": "In Unlambda, what exact charcter or text needs to be added to correct the following code to output \"For penguins\"? If what is needed is a character, answer with the name of the character. If there are different names for the character, use the shortest. The text location is not needed. Code:\n\n`r```````````.F.o.r. .p.e.n.g.u.i.n.si", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: backtick", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "backtick", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
5
+ {"name": "32102e3e-d12a-4209-9163-7b3a104efe5d", "prompt": "The attached spreadsheet shows the inventory for a movie and video game rental store in Seattle, Washington. What is the title of the oldest Blu-Ray recorded in this spreadsheet? Return it as appearing in the spreadsheet.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Time-Parking 2: Parallel Universe", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Time-Parking 2: Parallel Universe", "gaia_level": 2, "gaia_file": "32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx", "source": "gaia-benchmark"}}
6
+ {"name": "3627a8be-a77f-41bb-b807-7e1bd4c0ebdf", "prompt": "The object in the British Museum's collection with a museum number of 2012,5015.17 is the shell of a particular mollusk species. According to the abstract of a research article published in Science Advances in 2021, beads made from the shells of this species were found that are at least how many thousands of years old?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 142", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "142", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
7
+ {"name": "7619a514-5fa8-43ef-9143-83b66a43d7a4", "prompt": "According to github, when was Regression added to the oldest closed numpy.polynomial issue that has the Regression label in MM/DD/YY?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 04/15/18", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "04/15/18", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
8
+ {"name": "7dd30055-0198-452e-8c25-f73dbe27dcb8", "prompt": "Using the Biopython library in Python, parse the PDB file of the protein identified by the PDB ID 5wb7 from the RCSB Protein Data Bank. Calculate the distance between the first and second atoms as they are listed in the PDB file. Report the answer in Angstroms, rounded to the nearest picometer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1.456", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1.456", "gaia_level": 2, "gaia_file": "7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb", "source": "gaia-benchmark"}}
9
+ {"name": "2a649bb1-795f-4a01-b3be-9a01868dae73", "prompt": "What are the EC numbers of the two most commonly used chemicals for the virus testing method in the paper about SPFMV and SPCSV in the Pearl Of Africa from 2016? Return the semicolon-separated numbers in the order of the alphabetized chemicals.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3.1.3.1; 1.11.1.7", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "3.1.3.1; 1.11.1.7", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
10
+ {"name": "87c610df-bef7-4932-b950-1d83ef4e282b", "prompt": "In April of 1977, who was the Prime Minister of the first place mentioned by name in the Book of Esther (in the New International Version)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Morarji Desai", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Morarji Desai", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
11
+ {"name": "624cbf11-6a41-4692-af9c-36b3e5ca3130", "prompt": "What's the last line of the rhyme under the flavor name on the headstone visible in the background of the photo of the oldest flavor's headstone in the Ben & Jerry's online flavor graveyard as of the end of 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: So we had to let it die.", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "So we had to let it die.", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
12
+ {"name": "dd3c7503-f62a-4bd0-9f67-1b63b94194cc", "prompt": "Use density measures from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023.\n\nI have a gallon of honey and a gallon of mayonnaise at 25C. I remove one cup of honey at a time from the gallon of honey. How many times will I need to remove a cup to have the honey weigh less than the mayonaise? Assume the containers themselves weigh the same.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
13
+ {"name": "df6561b2-7ee5-4540-baab-5095f742716a", "prompt": "When you take the average of the standard population deviation of the red numbers and the standard sample deviation of the green numbers in this image using the statistics module in Python 3.11, what is the result rounded to the nearest three decimal points?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 17.056", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "17.056", "gaia_level": 2, "gaia_file": "df6561b2-7ee5-4540-baab-5095f742716a.png", "source": "gaia-benchmark"}}
14
+ {"name": "f0f46385-fc03-4599-b5d3-f56496c3e69f", "prompt": "In terms of geographical distance between capital cities, which 2 countries are the furthest from each other within the ASEAN bloc according to wikipedia? Answer using a comma separated list, ordering the countries by alphabetical order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Indonesia, Myanmar", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Indonesia, Myanmar", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
15
+ {"name": "e4e91f1c-1dcd-439e-9fdd-cb976f5293fd", "prompt": "I need to fact-check a citation. This is the citation from the bibliography:\n\nGreetham, David. \"Uncoupled: OR, How I Lost My Author(s).\" Textual Cultures: Texts, Contexts, Interpretation, vol. 3 no. 1, 2008, p. 45-46. Project MUSE, doi:10.2979/tex.2008.3.1.44.\n\nAnd this is the in-line citation:\n\nOur relationship with the authors of the works we read can often be \u201cobscured not by a \"cloak of print\" but by the veil of scribal confusion and mis-transmission\u201d (Greetham 45-46).\n\nDoes the quoted text match what is actually in the article? If Yes, answer Yes, otherwise, give me the word in my citation that does not match with the correct one (without any article).", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: cloak", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "cloak", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
16
+ {"name": "56137764-b4e0-45b8-9c52-1866420c3df5", "prompt": "Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Li Peng", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Li Peng", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
17
+ {"name": "8b3379c0-0981-4f5b-8407-6444610cb212", "prompt": "What is the maximum length in meters of #9 in the first National Geographic short on YouTube that was ever released according to the Monterey Bay Aquarium website? Just give the number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1.8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1.8", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
18
+ {"name": "0ff53813-3367-4f43-bcbd-3fd725c1bf4b", "prompt": "What two-word type of model did Manash Pratim Kashyap's and PS Fader's studies in customer retention studies published during 2018-2019 have in common (no punctuation)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: beta geometric", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "beta geometric", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
19
+ {"name": "a7feb290-76bb-4cb7-8800-7edaf7954f2f", "prompt": "How many High Energy Physics - Lattice articles listed in January 2020 on Arxiv had ps versions available?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 31", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "31", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
20
+ {"name": "b4cc024b-3f5e-480e-b96a-6656493255b5", "prompt": "The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Russian-German Legion", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Russian-German Legion", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
21
+ {"name": "33d8ea3b-6c6b-4ff1-803d-7e270dea8a57", "prompt": "What is the minimum number of page links a person must click on to go from the english Wikipedia page on The Lord of the Rings (the book) to the english Wikipedia page on A Song of Ice and Fire (the book series)? In your count, include each link you would click on to get to the page. Use the pages as they appeared at the end of the day on July 3, 2023.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
22
+ {"name": "e8cb5b03-41e0-4086-99e5-f6806cd97211", "prompt": "I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu. Using the Wayback Machine, can you help me figure out which main course was on the dinner menu for Virtue on March 22, 2021 but not April 21, 2021? Answer using the singular form, without articles.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: shrimp", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "shrimp", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
23
+ {"name": "f46b4380-207e-4434-820b-f32ce04ae2a4", "prompt": "It is 1999. Before you party like it is 1999, please assist me in settling a bet.\n\nFiona Apple and Paula Cole released albums prior to 1999. Of these albums, which didn't receive a letter grade from Robert Christgau? Provide your answer as a comma delimited list of album titles, sorted alphabetically.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Harbinger, Tidal", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Harbinger, Tidal", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
24
+ {"name": "05407167-39ec-4d3a-a234-73a9120c325d", "prompt": "In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Format Document", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Format Document", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
25
+ {"name": "b9763138-c053-4832-9f55-86200cb1f99c", "prompt": "Compute the check digit the Tropicos ID for the Order Helotiales would have if it were an ISBN-10 number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "3", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
26
+ {"name": "16d825ff-1623-4176-a5b5-42e0f5c2b0ac", "prompt": "What time was the Tri-Rail train that carried the most passengers on May 27, 2019 scheduled to arrive in Pompano Beach? Express your answer in the 12-hour digital clock format without leading zero if any, and include whether it is AM or PM.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6:41 PM", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6:41 PM", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
27
+ {"name": "2b3ef98c-cc05-450b-a719-711aee40ac65", "prompt": "Could you help me out with this assignment? Our professor sprung it on us at the end of class Friday, and I'm still trying to figure it out. The question he asked us was about an anagram. I've attached an audio recording of the question that he asked, so if you could please take a listen and give me the answer, I'd really appreciate the help. Please limit your response to the anagram text that could be generated from the original line which fulfills the professor's request, without any other commentary. Also, please don't include any punctuation in your response.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune", "gaia_level": 2, "gaia_file": "2b3ef98c-cc05-450b-a719-711aee40ac65.mp3", "source": "gaia-benchmark"}}
28
+ {"name": "bfcd99e1-0690-4b53-a85c-0174a8629083", "prompt": "How many applicants for the job in the PDF are only missing a single qualification?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 17", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "17", "gaia_level": 2, "gaia_file": "bfcd99e1-0690-4b53-a85c-0174a8629083.zip", "source": "gaia-benchmark"}}
29
+ {"name": "544b7f0c-173a-4377-8d56-57b36eb26ddf", "prompt": "In Valentina Re\u2019s contribution to the 2017 book \u201cWorld Building: Transmedia, Fans, Industries\u201d, what horror movie does the author cite as having popularized metalepsis between a dream world and reality? Use the complete name with article if any.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: A Nightmare on Elm Street", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "A Nightmare on Elm Street", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
30
+ {"name": "6b078778-0b90-464d-83f6-59511c811b01", "prompt": "The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators of this portrait's subject as a bishop, what is the name of the one who never became pope?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Alfonso Visconti", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Alfonso Visconti", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
31
+ {"name": "076c8171-9b3b-49b9-a477-244d2a532826", "prompt": "The attached file contains a list of vendors in the Liminal Springs mall, along with each vendor\u2019s monthly revenue and the rent they pay the mall. I want you to find the vendor that makes the least money, relative to the rent it pays. Then, tell me what is listed in the \u201ctype\u201d column for that vendor.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Finance", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Finance", "gaia_level": 2, "gaia_file": "076c8171-9b3b-49b9-a477-244d2a532826.xlsx", "source": "gaia-benchmark"}}
32
+ {"name": "08cae58d-4084-4616-b6dd-dd6534e4825b", "prompt": "According to Google Finance, when was the first year the Apple stock went above $50 (without adjusting for stock split)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2018", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2018", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
33
+ {"name": "2dfc4c37-fec1-4518-84a7-10095d30ad75", "prompt": "According to Box Office Mojo's 2020 Worldwide Box Office list, how many of the top 10 highest-grossing worldwide movies are also on the top 10 highest-grossing domestic movies? Your answer should be a numerical integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
34
+ {"name": "9f41b083-683e-4dcf-9185-ccfeaa88fa45", "prompt": "How many pages if the 2023 IPCC report (85 pages version) mentions nuclear energy?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "0", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
35
+ {"name": "ecbc4f94-95a3-4cc7-b255-6741a458a625", "prompt": "How many images are there in the latest 2022 Lego english wikipedia article?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 13", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "13", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
36
+ {"name": "e9a2c537-8232-4c3f-85b0-b52de6bcba99", "prompt": "The attached file shows a list of books in the collection of Scribe County Public Library. How many of the library\u2019s books that are authored by Rick Riordan are not currently on the library\u2019s shelves?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 7", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "7", "gaia_level": 2, "gaia_file": "e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf", "source": "gaia-benchmark"}}
37
+ {"name": "71345b0a-9c7d-4b50-b2bf-937ec5879845", "prompt": "On a leap day before the year 2008, a joke was removed from the Wikipedia page for \u201cDragon\u201d. What was the phrase that was removed? Give the phrase as it appeared on the page, but without punctuation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Here be dragons", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Here be dragons", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
38
+ {"name": "7b5377b0-3f38-4103-8ad2-90fe89864c04", "prompt": "Find the value of x to the nearest tenth: Lx = (d/dx * (A * x-squared)) + 4-thousand'n'ninety-7 minus C\nWhere L is the last two digits of the year of the Venezuelan Declaration of Independence,\nA is the number of colors in the TikTok logo as of July 2023, excluding black and white,\nand C is the height of the average woman in the Philippines according to a July 2023 Business Insider article, rounded to the nearest whole centimeter", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 563.9", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "563.9", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
39
+ {"name": "114d5fd0-e2ae-4b6d-a65a-870da2d19c08", "prompt": "In the endnote found in the second-to-last paragraph of page 11 of the book with the doi 10.2307/j.ctv9b2xdv, what date in November was the Wikipedia article accessed? Just give the day of the month.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "4", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
40
+ {"name": "8f80e01c-1296-4371-9486-bb3d68651a60", "prompt": "Using bass clef notes, what is the age of someone who has experienced the word spelled out in the sheet music by the note letters the total number of lines and notes minus the number of notes on lines in the image?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 90", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "90", "gaia_level": 2, "gaia_file": "8f80e01c-1296-4371-9486-bb3d68651a60.png", "source": "gaia-benchmark"}}
41
+ {"name": "ad37a656-079a-49f9-a493-7b739c9167d1", "prompt": "On July 15, 2008, Phys.org published an article about a catastrophe. Find the explosive force of this catastrophe according to Encyclopedia Britannica, then find the name of the US nuclear test that had the same yield. Your answer should only be the last word of the name of the test.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Bravo", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Bravo", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
42
+ {"name": "366e2f2b-8632-4ef2-81eb-bc3877489217", "prompt": "The attached file lists accommodations in the resort town of Seahorse Island. Based on the information in this file, which seems like the better available place to stay for a family that enjoys swimming and wants a full house?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Shelley's place", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Shelley's place", "gaia_level": 2, "gaia_file": "366e2f2b-8632-4ef2-81eb-bc3877489217.pdf", "source": "gaia-benchmark"}}
43
+ {"name": "f3917a3d-1d17-4ee2-90c5-683b072218fe", "prompt": "How many edits were made to the Wikipedia page on Antidisestablishmentarianism from its inception until June of 2023?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2732", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2732", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
44
+ {"name": "48eb8242-1099-4c26-95d4-ef22b002457a", "prompt": "How many nonindigenous crocodiles were found in Florida from the year 2000 through 2020? You can get the data from the USGS Nonindigenous Aquatic Species database.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
45
+ {"name": "c8b7e059-c60d-472e-ad64-3b04ae1166dc", "prompt": "The work referenced in footnote 397 of Federico Lauria's 2014 dissertation is also the source for the titles of two paintings in the Smithsonian American Art Museum's collection, as of August 2023. What is the absolute difference between the chapter numbers of the chapters that the titles of these two paintings quote?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "8", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
46
+ {"name": "d1af70ea-a9a4-421a-b9cc-94b5e02f1788", "prompt": "As of the 2020 census, what was the population difference between the largest county seat and smallest county seat, by land area of the county seat, in Washington state? For population figures, please use the official data from data.census.gov. Please report the integer difference.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 736455", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "736455", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
47
+ {"name": "08f3a05f-5947-4089-a4c4-d4bcfaa6b7a0", "prompt": "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$, what is the smallest $n$ where using Newton's Method $n = n+1$ after rounding to four decimal places?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
48
+ {"name": "54612da3-fd56-4941-80f4-5eb82330de25", "prompt": "The attached file shows the locomotives in the collection of a North American railroad museum. How many wheels do the listed steam locomotives have in total?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 60", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "60", "gaia_level": 2, "gaia_file": "54612da3-fd56-4941-80f4-5eb82330de25.xlsx", "source": "gaia-benchmark"}}
49
+ {"name": "ded28325-3447-4c56-860f-e497d6fb3577", "prompt": "This is a secret message my friend gave me. It says where we should meet for our picnic on Friday. The only problem is, it\u2019s encrypted in the Caesar cipher, so I can\u2019t read it. Can you tell me what it says? This is the message:\n\nZsmxsm sc sx Zyvilsec Zvkjk.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Picnic is in Ploybius Plaza.", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Picnic is in Ploybius Plaza.", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
50
+ {"name": "6359a0b1-8f7b-499b-9336-840f9ab90688", "prompt": "What is the area of the green polygon in the attached file? The numbers in purple represent the lengths of the side they are next to.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 39", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "39", "gaia_level": 2, "gaia_file": "6359a0b1-8f7b-499b-9336-840f9ab90688.png", "source": "gaia-benchmark"}}
51
+ {"name": "7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f", "prompt": "The attached spreadsheet contains the sales of menu items for a regional fast-food chain. Which city had the greater total sales: Wharvton or Algrimand?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Wharvton", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Wharvton", "gaia_level": 2, "gaia_file": "7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx", "source": "gaia-benchmark"}}
52
+ {"name": "d700d50d-c707-4dca-90dc-4528cddd0c80", "prompt": "Who composed the song that was performed by a rooster and a hamster in separate animated videos at separate tempos with different lyrics? Answer using the format First name Last name.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Roger Miller", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Roger Miller", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
53
+ {"name": "0a3cd321-3e76-4622-911b-0fda2e5d6b1a", "prompt": "According to the World Bank, which countries had gross savings of over 35% of GDP for every year in the period 2001-2010? Give your answer as a comma-separated list of countries in alphabetical order. Use the countries most common names in english when answering.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Brunei, China, Morocco, Singapore", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Brunei, China, Morocco, Singapore", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
54
+ {"name": "f2feb6a4-363c-4c09-a804-0db564eafd68", "prompt": "I\u2019m thinking about selling my home, so I want to learn more about how homes in my area sold recently. I live in Pearl City, Hawaii, which is on the island of Oahu. I know two homes near me that sold in 2022 were 2072 Akaikai Loop, and 2017 Komo Mai Drive. Find which of those homes sold for more in 2022, and tell me how much it sold for. Don\u2019t put commas or decimal places in the answer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 900000", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "900000", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
55
+ {"name": "0b260a57-3f3a-4405-9f29-6d7a1012dbfb", "prompt": "On ScienceDirect, what is the difference to 3 decimal places in the sample standard deviations of the number of Reference Works in each Life Science domain compared to Health Sciences as of 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.269", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "0.269", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
56
+ {"name": "ed58682d-bc52-4baa-9eb0-4eb81e1edacc", "prompt": "What is the last word before the second chorus of the King of Pop's fifth single from his sixth studio album?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: stare", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "stare", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
57
+ {"name": "cca70ce6-1952-45d2-acd4-80c903b0bc49", "prompt": "Look at the attached image. The quiz is scored as follows:\n\nProblems that ask the student to add or subtract fractions: 5 points\nProblems that ask the student to multiply or divide fractions: 10 points\nProblems that ask the student to form an improper fraction: 15 points\nProblems that ask the student to form a mixed number: 20 points\n\nDue to a technical issue that delayed having students take the quiz, the teacher is giving everyone 5 bonus points.\n\nIf you graded the quiz in the attached image, how many points would the student have earned? There is no partial credit.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 85", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "85", "gaia_level": 2, "gaia_file": "cca70ce6-1952-45d2-acd4-80c903b0bc49.png", "source": "gaia-benchmark"}}
58
+ {"name": "b7f857e4-d8aa-4387-af2a-0e844df5b9d8", "prompt": "The attached image contains a Python script. Run the Python code against an array of strings, listed below. The output of the Python script will be a URL containing C++ source code. Compile and run this C++ code against the array [35, 12, 8, 99, 21, 5] and return the sum of the third and fifth integers in the sorted list.\n\narr = ['_alg', 'ghi', 'C++', 'jkl', 'tps', '/Q', 'pqr', 'stu', ':', '//', 'rose', 'vwx', 'yz1', '234', 'tta', '567', '890', 'cod', 'e.', 'or', 'g/', 'wiki', '/', 'ing', 'sort', 'abc' , 'or', 'it', 'hms', 'mno' , 'uic', 'ksort', '#', 'ht' ]", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 47", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "47", "gaia_level": 2, "gaia_file": "b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png", "source": "gaia-benchmark"}}
59
+ {"name": "d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de", "prompt": "I have the Standard plan in the image below, and I just uploaded 60 equally sized files and got a message that I'm 100GB over the limit. I have 980 more files of the same size to upload. What is the average additional cost per file in dollar that goes over my current plan limit rounded to the nearest cent if I have to upgrade to the minimum possible plan to store them all? Answer with the following format: x.xx", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.03", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "0.03", "gaia_level": 2, "gaia_file": "d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png", "source": "gaia-benchmark"}}
60
+ {"name": "67e8878b-5cef-4375-804e-e6291fdbe78a", "prompt": "The attached PDF lists accommodations in the resort community of Seahorse Island. Which type of accommodation has a higher average rating in Seahorse Island?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Hotels", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Hotels", "gaia_level": 2, "gaia_file": "67e8878b-5cef-4375-804e-e6291fdbe78a.pdf", "source": "gaia-benchmark"}}
61
+ {"name": "023e9d44-96ae-4eed-b912-244ee8c3b994", "prompt": "It's May 2023, and I'm about to drive across the U.S. from California to Maine. I always recycle my water bottles at the end of a trip, and I drink 5 12-ounce water bottles for every 100 miles I travel, rounded to the nearest 100. Assuming I follow I-40 from Los Angeles to Cincinnati, then take I-90 from Cincinnati to Augusta, how many dollars will I get back according to Wikipedia?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "8", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
62
+ {"name": "0e9e85b8-52b9-4de4-b402-5f635ab9631f", "prompt": "What is the latest chronological year date written in the image on the webpage found when following the first citation reference link on the latest version of Carl Nebel's Wikipedia page as of August 2023?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1927", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1927", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
63
+ {"name": "20194330-9976-4043-8632-f8485c6c71b2", "prompt": "The YouTube channel Game Grumps began a Let\u2019s Play of the game Sonic the Hedgehog (2006) in the year 2012. Thirty seconds into the first episode, a phrase is shown on the screen in white letters on a red background. How many times does the letter \"E\" appear in this phrase?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "4", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
64
+ {"name": "4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2", "prompt": "This spreadsheet contains a list of clients for a retractable awning company. Each client has ordered a new awning for the back of their house within the last 90 days. The company makes different designs depending on whether the awning is made to block sunrises or sunsets. In this region, houses with odd-numbered street addresses face east, and houses with even-numbered street addresses face west. How many of these clients will be receiving the sunset awning design?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "8", "gaia_level": 2, "gaia_file": "4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx", "source": "gaia-benchmark"}}
65
+ {"name": "65638e28-7f37-4fa7-b7b9-8c19bb609879", "prompt": "The book with the doi 10.1353/book.24372 concerns a certain neurologist. According to chapter 2 of the book, what author influenced this neurologist\u2019s belief in \u201cendopsychic myths\u201d? Give the last name only.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Kleinpaul", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Kleinpaul", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
66
+ {"name": "3ff6b7a9-a5bd-4412-ad92-0cd0d45c0fee", "prompt": "The longest-lived vertebrate is named after an island. According to Wikipedia as of January 1, 2021, what is the 2020 estimated population of that island, to the nearest thousand?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 56000", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "56000", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
67
+ {"name": "708b99c5-e4a7-49cb-a5cf-933c8d46470d", "prompt": "On the DeepFruits fruit detection graph on Connected Papers from 2016, what feature caused the largest bubble to be the size it is?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Citations", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Citations", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
68
+ {"name": "0a65cb96-cb6e-4a6a-8aae-c1084f613456", "prompt": "During the first week of August 2015, one of the NASA Astronomy Pictures of the Day shows the lights of a city on the horizon. The namesake of this city also has a landmark building in Chicago named after him. What is the name of the architectural firm that designed this landmark building? Give the first name appearing in the name of the firm as of June 2023.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Holabird", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Holabird", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
69
+ {"name": "65da0822-a48a-4a68-bbad-8ed1b835a834", "prompt": "All of the individuals who formally held the position of United States secretary of homeland security prior to April 2019, excluding those who held the position in an acting capacity, have a bachelor's degree. Of the universities that these bachelor's degrees were from, which is the westernmost university and which is the easternmost university? Give them to me as a comma-separated list, I only want the name of the cities where the universities are located, with the westernmost city listed first.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Santa Clara, Boston", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Santa Clara, Boston", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
70
+ {"name": "0bb3b44a-ede5-4db5-a520-4e844b0079c5", "prompt": "Consider the following symbols: \ud809\udc1c \ud809\udc10\ud809\udc1a\n\nThis is a number written using the Mesopotamian/Babylonian number system and represented with Sumerian cuneiform. Convert this number into Arabic numerals as a decimal number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 536", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "536", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
71
+ {"name": "73c1b9fe-ee1d-4cf4-96ca-35c08f97b054", "prompt": "According to the USGS, in what year was the American Alligator first found west of Texas (not including Texas)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1954", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1954", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
72
+ {"name": "e2d69698-bc99-4e85-9880-67eaccd66e6c", "prompt": "As of August 2023, who is the only winner of the US version of Survivor to be born in the month of May?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Michele Fitzgerald", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Michele Fitzgerald", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
73
+ {"name": "a56f1527-3abf-41d6-91f8-7296d6336c3f", "prompt": "The cover of the August 2021 issue of Vogue shows a famous landmark in the background behind some trees. How tall is this monument in yards, rounded to the nearest yard? Give the number only.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 185", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "185", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
74
+ {"name": "42d4198c-5895-4f0a-b0c0-424a66465d83", "prompt": "I'm curious about how much information is available for popular video games before their release. Find the Wikipedia page for the 2019 game that won the British Academy Games Awards. How many revisions did that page have before the month listed as the game's release date on that Wikipedia page (as of the most recent entry from 2022)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 60", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "60", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
75
+ {"name": "edd4d4f2-1a58-45c4-b038-67337af4e029", "prompt": "The attached spreadsheet lists the locomotives owned by a local railroad museum. What is the typical American name for the type of locomotive this museum uses for the Murder Mystery Express?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Berkshire", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Berkshire", "gaia_level": 2, "gaia_file": "edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx", "source": "gaia-benchmark"}}
76
+ {"name": "a26649c6-1cb2-470a-871e-6910c64c3e53", "prompt": "What is the absolute difference in tens of thousands between the population of chinstrap penguins on the Wikipedia page for penguin species populations as of the end of 2018 and the population recorded in the Nature.com \"global population assessment of the Chinstrap penguin\" article from 2020, assuming two penguins per breeding pair?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 116", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "116", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
77
+ {"name": "4d0aa727-86b1-406b-9b33-f870dd14a4a5", "prompt": "The attached file lists the locomotives owned by a local railroad museum. It gives each locomotive\u2019s identifying number, operating status, and the name of the daily excursion it heads, if operational. What are the odds that today\u2019s Sunset Picnic Trip will use a steam locomotive? Assume that each day\u2019s excursion picks one of its assigned locomotives at random, and express the answer in the form \u201c1 in 4\u201d, \u201c1 in 5\u201d, etc.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1 in 3", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1 in 3", "gaia_level": 2, "gaia_file": "4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx", "source": "gaia-benchmark"}}
78
+ {"name": "d5141ca5-e7a0-469f-bf3e-e773507c86e2", "prompt": "When was a picture of St. Thomas Aquinas first added to the Wikipedia page on the Principle of double effect? Answer using the format DD/MM/YYYY.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 19/02/2009", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "19/02/2009", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
79
+ {"name": "1dcc160f-c187-48c2-b68e-319bd4354f3d", "prompt": "According to Openreview.net, at the NeurIPS 2022 Conference, how many papers by an author named Yuri were accepted with a \"certain\" recommendation?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "3", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
80
+ {"name": "b2c257e0-3ad7-4f05-b8e3-d9da973be36e", "prompt": "If this whole pint is made up of ice cream, how many percent above or below the US federal standards for butterfat content is it when using the standards as reported by Wikipedia in 2020? Answer as + or - a number rounded to one decimal place.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: +4.6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "+4.6", "gaia_level": 2, "gaia_file": "b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg", "source": "gaia-benchmark"}}
81
+ {"name": "e0c10771-d627-4fd7-9694-05348e54ee36", "prompt": "Take the gender split from the 2011 Bulgarian census about those who have completed tertiary education. Subtract the smaller number from the larger number, then return the difference in thousands of women. So if there were 30.1 thousand more men, you'd give \"30.1\"", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 234.9", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "234.9", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
82
+ {"name": "e29834fd-413a-455c-a33e-c3915b07401c", "prompt": "I'd like to learn more about some popular reality television competition shows. As of the end of the 44th season of the American version of Survivor, how many more unique winners have there been compared to the number of winners of American Idol?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 21", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "21", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
83
+ {"name": "08c0b6e9-1b43-4c2e-ae55-4e3fce2c2715", "prompt": "In the film Goldfinger, what color was the object that James Bond concealed himself and his companion Pussy Galore at the end of the film? If there are multiple colors, put them in a comma-separated list in alphabetical order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: orange, white", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "orange, white", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
84
+ {"name": "db4fd70a-2d37-40ea-873f-9433dc5e301f", "prompt": "As of May 2023, how many stops are between South Station and Windsor Gardens on MBTA\u2019s Franklin-Foxboro line (not included)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 10", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "10", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
85
+ {"name": "853c8244-429e-46ca-89f2-addf40dfb2bd", "prompt": "In the 2015 Metropolitan Museum of Art exhibition titled after the Chinese zodiac animal of 2015, how many of the \"twelve animals of the Chinese zodiac\" have a hand visible?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 11", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "11", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
86
+ {"name": "7a4a336d-dcfa-45a0-b014-824c7619e8de", "prompt": "At the two-minute mark in the YouTube video uploaded by the channel \u201cGameGrumps\u201d on May 14, 2017 as part of their playthrough of the game Mario Kart 8 Deluxe, the shows\u2019 hosts are competing on one of the game\u2019s racetracks. What was the world record time for that track in the game\u2019s 150cc mode as of June 7, 2023? Express your answer in minutes and seconds, rounding the seconds to the nearest hundredth, e.g. 1:01.001.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1:41.614", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1:41.614", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
87
+ {"name": "c61d22de-5f6c-4958-a7f6-5e9707bd3466", "prompt": "A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: egalitarian", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "egalitarian", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
88
+ {"name": "17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc", "prompt": "I\u2019m researching species that became invasive after people who kept them as pets released them. There\u2019s a certain species of fish that was popularized as a pet by being the main character of the movie Finding Nemo. According to the USGS, where was this fish found as a nonnative species, before the year 2020? I need the answer formatted as the five-digit zip codes of the places the species was found, separated by commas if there is more than one place.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 34689", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "34689", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
89
+ {"name": "04a04a9b-226c-43fd-b319-d5e89743676f", "prompt": "If we assume all articles published by Nature in 2020 (articles, only, not book reviews/columns, etc) relied on statistical significance to justify their findings and they on average came to a p-value of 0.04, how many papers would be incorrect as to their claims of statistical significance? Round the value up to the next integer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 41", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "41", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
90
+ {"name": "14569e28-c88c-43e4-8c32-097d35b9a67d", "prompt": "In Unlambda, what exact charcter or text needs to be added to correct the following code to output \"For penguins\"? If what is needed is a character, answer with the name of the character. If there are different names for the character, use the shortest. The text location is not needed. Code:\n\n`r```````````.F.o.r. .p.e.n.g.u.i.n.si", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: backtick", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "backtick", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
91
+ {"name": "32102e3e-d12a-4209-9163-7b3a104efe5d", "prompt": "The attached spreadsheet shows the inventory for a movie and video game rental store in Seattle, Washington. What is the title of the oldest Blu-Ray recorded in this spreadsheet? Return it as appearing in the spreadsheet.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Time-Parking 2: Parallel Universe", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Time-Parking 2: Parallel Universe", "gaia_level": 2, "gaia_file": "32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx", "source": "gaia-benchmark"}}
92
+ {"name": "3627a8be-a77f-41bb-b807-7e1bd4c0ebdf", "prompt": "The object in the British Museum's collection with a museum number of 2012,5015.17 is the shell of a particular mollusk species. According to the abstract of a research article published in Science Advances in 2021, beads made from the shells of this species were found that are at least how many thousands of years old?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 142", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "142", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
93
+ {"name": "7619a514-5fa8-43ef-9143-83b66a43d7a4", "prompt": "According to github, when was Regression added to the oldest closed numpy.polynomial issue that has the Regression label in MM/DD/YY?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 04/15/18", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "04/15/18", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
94
+ {"name": "7dd30055-0198-452e-8c25-f73dbe27dcb8", "prompt": "Using the Biopython library in Python, parse the PDB file of the protein identified by the PDB ID 5wb7 from the RCSB Protein Data Bank. Calculate the distance between the first and second atoms as they are listed in the PDB file. Report the answer in Angstroms, rounded to the nearest picometer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1.456", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1.456", "gaia_level": 2, "gaia_file": "7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb", "source": "gaia-benchmark"}}
95
+ {"name": "2a649bb1-795f-4a01-b3be-9a01868dae73", "prompt": "What are the EC numbers of the two most commonly used chemicals for the virus testing method in the paper about SPFMV and SPCSV in the Pearl Of Africa from 2016? Return the semicolon-separated numbers in the order of the alphabetized chemicals.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3.1.3.1; 1.11.1.7", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "3.1.3.1; 1.11.1.7", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
96
+ {"name": "87c610df-bef7-4932-b950-1d83ef4e282b", "prompt": "In April of 1977, who was the Prime Minister of the first place mentioned by name in the Book of Esther (in the New International Version)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Morarji Desai", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Morarji Desai", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
97
+ {"name": "624cbf11-6a41-4692-af9c-36b3e5ca3130", "prompt": "What's the last line of the rhyme under the flavor name on the headstone visible in the background of the photo of the oldest flavor's headstone in the Ben & Jerry's online flavor graveyard as of the end of 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: So we had to let it die.", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "So we had to let it die.", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
98
+ {"name": "dd3c7503-f62a-4bd0-9f67-1b63b94194cc", "prompt": "Use density measures from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023.\n\nI have a gallon of honey and a gallon of mayonnaise at 25C. I remove one cup of honey at a time from the gallon of honey. How many times will I need to remove a cup to have the honey weigh less than the mayonaise? Assume the containers themselves weigh the same.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
99
+ {"name": "df6561b2-7ee5-4540-baab-5095f742716a", "prompt": "When you take the average of the standard population deviation of the red numbers and the standard sample deviation of the green numbers in this image using the statistics module in Python 3.11, what is the result rounded to the nearest three decimal points?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 17.056", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "17.056", "gaia_level": 2, "gaia_file": "df6561b2-7ee5-4540-baab-5095f742716a.png", "source": "gaia-benchmark"}}
100
+ {"name": "f0f46385-fc03-4599-b5d3-f56496c3e69f", "prompt": "In terms of geographical distance between capital cities, which 2 countries are the furthest from each other within the ASEAN bloc according to wikipedia? Answer using a comma separated list, ordering the countries by alphabetical order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Indonesia, Myanmar", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Indonesia, Myanmar", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
101
+ {"name": "e4e91f1c-1dcd-439e-9fdd-cb976f5293fd", "prompt": "I need to fact-check a citation. This is the citation from the bibliography:\n\nGreetham, David. \"Uncoupled: OR, How I Lost My Author(s).\" Textual Cultures: Texts, Contexts, Interpretation, vol. 3 no. 1, 2008, p. 45-46. Project MUSE, doi:10.2979/tex.2008.3.1.44.\n\nAnd this is the in-line citation:\n\nOur relationship with the authors of the works we read can often be \u201cobscured not by a \"cloak of print\" but by the veil of scribal confusion and mis-transmission\u201d (Greetham 45-46).\n\nDoes the quoted text match what is actually in the article? If Yes, answer Yes, otherwise, give me the word in my citation that does not match with the correct one (without any article).", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: cloak", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "cloak", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
102
+ {"name": "56137764-b4e0-45b8-9c52-1866420c3df5", "prompt": "Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Li Peng", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Li Peng", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
103
+ {"name": "8b3379c0-0981-4f5b-8407-6444610cb212", "prompt": "What is the maximum length in meters of #9 in the first National Geographic short on YouTube that was ever released according to the Monterey Bay Aquarium website? Just give the number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1.8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1.8", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
104
+ {"name": "0ff53813-3367-4f43-bcbd-3fd725c1bf4b", "prompt": "What two-word type of model did Manash Pratim Kashyap's and PS Fader's studies in customer retention studies published during 2018-2019 have in common (no punctuation)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: beta geometric", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "beta geometric", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
105
+ {"name": "a7feb290-76bb-4cb7-8800-7edaf7954f2f", "prompt": "How many High Energy Physics - Lattice articles listed in January 2020 on Arxiv had ps versions available?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 31", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "31", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
106
+ {"name": "b4cc024b-3f5e-480e-b96a-6656493255b5", "prompt": "The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Russian-German Legion", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Russian-German Legion", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
107
+ {"name": "33d8ea3b-6c6b-4ff1-803d-7e270dea8a57", "prompt": "What is the minimum number of page links a person must click on to go from the english Wikipedia page on The Lord of the Rings (the book) to the english Wikipedia page on A Song of Ice and Fire (the book series)? In your count, include each link you would click on to get to the page. Use the pages as they appeared at the end of the day on July 3, 2023.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
108
+ {"name": "e8cb5b03-41e0-4086-99e5-f6806cd97211", "prompt": "I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu. Using the Wayback Machine, can you help me figure out which main course was on the dinner menu for Virtue on March 22, 2021 but not April 21, 2021? Answer using the singular form, without articles.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: shrimp", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "shrimp", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
109
+ {"name": "f46b4380-207e-4434-820b-f32ce04ae2a4", "prompt": "It is 1999. Before you party like it is 1999, please assist me in settling a bet.\n\nFiona Apple and Paula Cole released albums prior to 1999. Of these albums, which didn't receive a letter grade from Robert Christgau? Provide your answer as a comma delimited list of album titles, sorted alphabetically.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Harbinger, Tidal", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Harbinger, Tidal", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
110
+ {"name": "05407167-39ec-4d3a-a234-73a9120c325d", "prompt": "In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Format Document", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Format Document", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
111
+ {"name": "b9763138-c053-4832-9f55-86200cb1f99c", "prompt": "Compute the check digit the Tropicos ID for the Order Helotiales would have if it were an ISBN-10 number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "3", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
112
+ {"name": "16d825ff-1623-4176-a5b5-42e0f5c2b0ac", "prompt": "What time was the Tri-Rail train that carried the most passengers on May 27, 2019 scheduled to arrive in Pompano Beach? Express your answer in the 12-hour digital clock format without leading zero if any, and include whether it is AM or PM.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6:41 PM", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6:41 PM", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
113
+ {"name": "2b3ef98c-cc05-450b-a719-711aee40ac65", "prompt": "Could you help me out with this assignment? Our professor sprung it on us at the end of class Friday, and I'm still trying to figure it out. The question he asked us was about an anagram. I've attached an audio recording of the question that he asked, so if you could please take a listen and give me the answer, I'd really appreciate the help. Please limit your response to the anagram text that could be generated from the original line which fulfills the professor's request, without any other commentary. Also, please don't include any punctuation in your response.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune", "gaia_level": 2, "gaia_file": "2b3ef98c-cc05-450b-a719-711aee40ac65.mp3", "source": "gaia-benchmark"}}
114
+ {"name": "bfcd99e1-0690-4b53-a85c-0174a8629083", "prompt": "How many applicants for the job in the PDF are only missing a single qualification?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 17", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "17", "gaia_level": 2, "gaia_file": "bfcd99e1-0690-4b53-a85c-0174a8629083.zip", "source": "gaia-benchmark"}}
115
+ {"name": "544b7f0c-173a-4377-8d56-57b36eb26ddf", "prompt": "In Valentina Re\u2019s contribution to the 2017 book \u201cWorld Building: Transmedia, Fans, Industries\u201d, what horror movie does the author cite as having popularized metalepsis between a dream world and reality? Use the complete name with article if any.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: A Nightmare on Elm Street", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "A Nightmare on Elm Street", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
116
+ {"name": "6b078778-0b90-464d-83f6-59511c811b01", "prompt": "The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators of this portrait's subject as a bishop, what is the name of the one who never became pope?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Alfonso Visconti", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Alfonso Visconti", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
117
+ {"name": "076c8171-9b3b-49b9-a477-244d2a532826", "prompt": "The attached file contains a list of vendors in the Liminal Springs mall, along with each vendor\u2019s monthly revenue and the rent they pay the mall. I want you to find the vendor that makes the least money, relative to the rent it pays. Then, tell me what is listed in the \u201ctype\u201d column for that vendor.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Finance", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Finance", "gaia_level": 2, "gaia_file": "076c8171-9b3b-49b9-a477-244d2a532826.xlsx", "source": "gaia-benchmark"}}
118
+ {"name": "08cae58d-4084-4616-b6dd-dd6534e4825b", "prompt": "According to Google Finance, when was the first year the Apple stock went above $50 (without adjusting for stock split)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2018", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2018", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
119
+ {"name": "2dfc4c37-fec1-4518-84a7-10095d30ad75", "prompt": "According to Box Office Mojo's 2020 Worldwide Box Office list, how many of the top 10 highest-grossing worldwide movies are also on the top 10 highest-grossing domestic movies? Your answer should be a numerical integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
120
+ {"name": "9f41b083-683e-4dcf-9185-ccfeaa88fa45", "prompt": "How many pages if the 2023 IPCC report (85 pages version) mentions nuclear energy?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "0", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
121
+ {"name": "ecbc4f94-95a3-4cc7-b255-6741a458a625", "prompt": "How many images are there in the latest 2022 Lego english wikipedia article?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 13", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "13", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
122
+ {"name": "e9a2c537-8232-4c3f-85b0-b52de6bcba99", "prompt": "The attached file shows a list of books in the collection of Scribe County Public Library. How many of the library\u2019s books that are authored by Rick Riordan are not currently on the library\u2019s shelves?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 7", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "7", "gaia_level": 2, "gaia_file": "e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf", "source": "gaia-benchmark"}}
123
+ {"name": "71345b0a-9c7d-4b50-b2bf-937ec5879845", "prompt": "On a leap day before the year 2008, a joke was removed from the Wikipedia page for \u201cDragon\u201d. What was the phrase that was removed? Give the phrase as it appeared on the page, but without punctuation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Here be dragons", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Here be dragons", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
124
+ {"name": "7b5377b0-3f38-4103-8ad2-90fe89864c04", "prompt": "Find the value of x to the nearest tenth: Lx = (d/dx * (A * x-squared)) + 4-thousand'n'ninety-7 minus C\nWhere L is the last two digits of the year of the Venezuelan Declaration of Independence,\nA is the number of colors in the TikTok logo as of July 2023, excluding black and white,\nand C is the height of the average woman in the Philippines according to a July 2023 Business Insider article, rounded to the nearest whole centimeter", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 563.9", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "563.9", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
125
+ {"name": "114d5fd0-e2ae-4b6d-a65a-870da2d19c08", "prompt": "In the endnote found in the second-to-last paragraph of page 11 of the book with the doi 10.2307/j.ctv9b2xdv, what date in November was the Wikipedia article accessed? Just give the day of the month.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "4", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
126
+ {"name": "8f80e01c-1296-4371-9486-bb3d68651a60", "prompt": "Using bass clef notes, what is the age of someone who has experienced the word spelled out in the sheet music by the note letters the total number of lines and notes minus the number of notes on lines in the image?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 90", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "90", "gaia_level": 2, "gaia_file": "8f80e01c-1296-4371-9486-bb3d68651a60.png", "source": "gaia-benchmark"}}
127
+ {"name": "ad37a656-079a-49f9-a493-7b739c9167d1", "prompt": "On July 15, 2008, Phys.org published an article about a catastrophe. Find the explosive force of this catastrophe according to Encyclopedia Britannica, then find the name of the US nuclear test that had the same yield. Your answer should only be the last word of the name of the test.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Bravo", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Bravo", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
128
+ {"name": "366e2f2b-8632-4ef2-81eb-bc3877489217", "prompt": "The attached file lists accommodations in the resort town of Seahorse Island. Based on the information in this file, which seems like the better available place to stay for a family that enjoys swimming and wants a full house?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Shelley's place", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Shelley's place", "gaia_level": 2, "gaia_file": "366e2f2b-8632-4ef2-81eb-bc3877489217.pdf", "source": "gaia-benchmark"}}
129
+ {"name": "f3917a3d-1d17-4ee2-90c5-683b072218fe", "prompt": "How many edits were made to the Wikipedia page on Antidisestablishmentarianism from its inception until June of 2023?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2732", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2732", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
130
+ {"name": "48eb8242-1099-4c26-95d4-ef22b002457a", "prompt": "How many nonindigenous crocodiles were found in Florida from the year 2000 through 2020? You can get the data from the USGS Nonindigenous Aquatic Species database.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "6", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
131
+ {"name": "c8b7e059-c60d-472e-ad64-3b04ae1166dc", "prompt": "The work referenced in footnote 397 of Federico Lauria's 2014 dissertation is also the source for the titles of two paintings in the Smithsonian American Art Museum's collection, as of August 2023. What is the absolute difference between the chapter numbers of the chapters that the titles of these two paintings quote?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "8", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
132
+ {"name": "d1af70ea-a9a4-421a-b9cc-94b5e02f1788", "prompt": "As of the 2020 census, what was the population difference between the largest county seat and smallest county seat, by land area of the county seat, in Washington state? For population figures, please use the official data from data.census.gov. Please report the integer difference.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 736455", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "736455", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
133
+ {"name": "08f3a05f-5947-4089-a4c4-d4bcfaa6b7a0", "prompt": "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$, what is the smallest $n$ where using Newton's Method $n = n+1$ after rounding to four decimal places?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 2", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "2", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
134
+ {"name": "54612da3-fd56-4941-80f4-5eb82330de25", "prompt": "The attached file shows the locomotives in the collection of a North American railroad museum. How many wheels do the listed steam locomotives have in total?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 60", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "60", "gaia_level": 2, "gaia_file": "54612da3-fd56-4941-80f4-5eb82330de25.xlsx", "source": "gaia-benchmark"}}
135
+ {"name": "ded28325-3447-4c56-860f-e497d6fb3577", "prompt": "This is a secret message my friend gave me. It says where we should meet for our picnic on Friday. The only problem is, it\u2019s encrypted in the Caesar cipher, so I can\u2019t read it. Can you tell me what it says? This is the message:\n\nZsmxsm sc sx Zyvilsec Zvkjk.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Picnic is in Ploybius Plaza.", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Picnic is in Ploybius Plaza.", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
136
+ {"name": "6359a0b1-8f7b-499b-9336-840f9ab90688", "prompt": "What is the area of the green polygon in the attached file? The numbers in purple represent the lengths of the side they are next to.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 39", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "39", "gaia_level": 2, "gaia_file": "6359a0b1-8f7b-499b-9336-840f9ab90688.png", "source": "gaia-benchmark"}}
137
+ {"name": "7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f", "prompt": "The attached spreadsheet contains the sales of menu items for a regional fast-food chain. Which city had the greater total sales: Wharvton or Algrimand?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Wharvton", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Wharvton", "gaia_level": 2, "gaia_file": "7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx", "source": "gaia-benchmark"}}
138
+ {"name": "d700d50d-c707-4dca-90dc-4528cddd0c80", "prompt": "Who composed the song that was performed by a rooster and a hamster in separate animated videos at separate tempos with different lyrics? Answer using the format First name Last name.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Roger Miller", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Roger Miller", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
139
+ {"name": "0a3cd321-3e76-4622-911b-0fda2e5d6b1a", "prompt": "According to the World Bank, which countries had gross savings of over 35% of GDP for every year in the period 2001-2010? Give your answer as a comma-separated list of countries in alphabetical order. Use the countries most common names in english when answering.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Brunei, China, Morocco, Singapore", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Brunei, China, Morocco, Singapore", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
140
+ {"name": "f2feb6a4-363c-4c09-a804-0db564eafd68", "prompt": "I\u2019m thinking about selling my home, so I want to learn more about how homes in my area sold recently. I live in Pearl City, Hawaii, which is on the island of Oahu. I know two homes near me that sold in 2022 were 2072 Akaikai Loop, and 2017 Komo Mai Drive. Find which of those homes sold for more in 2022, and tell me how much it sold for. Don\u2019t put commas or decimal places in the answer.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 900000", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "900000", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
141
+ {"name": "0b260a57-3f3a-4405-9f29-6d7a1012dbfb", "prompt": "On ScienceDirect, what is the difference to 3 decimal places in the sample standard deviations of the number of Reference Works in each Life Science domain compared to Health Sciences as of 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.269", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "0.269", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
142
+ {"name": "ed58682d-bc52-4baa-9eb0-4eb81e1edacc", "prompt": "What is the last word before the second chorus of the King of Pop's fifth single from his sixth studio album?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: stare", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "stare", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
143
+ {"name": "cca70ce6-1952-45d2-acd4-80c903b0bc49", "prompt": "Look at the attached image. The quiz is scored as follows:\n\nProblems that ask the student to add or subtract fractions: 5 points\nProblems that ask the student to multiply or divide fractions: 10 points\nProblems that ask the student to form an improper fraction: 15 points\nProblems that ask the student to form a mixed number: 20 points\n\nDue to a technical issue that delayed having students take the quiz, the teacher is giving everyone 5 bonus points.\n\nIf you graded the quiz in the attached image, how many points would the student have earned? There is no partial credit.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 85", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "85", "gaia_level": 2, "gaia_file": "cca70ce6-1952-45d2-acd4-80c903b0bc49.png", "source": "gaia-benchmark"}}
144
+ {"name": "b7f857e4-d8aa-4387-af2a-0e844df5b9d8", "prompt": "The attached image contains a Python script. Run the Python code against an array of strings, listed below. The output of the Python script will be a URL containing C++ source code. Compile and run this C++ code against the array [35, 12, 8, 99, 21, 5] and return the sum of the third and fifth integers in the sorted list.\n\narr = ['_alg', 'ghi', 'C++', 'jkl', 'tps', '/Q', 'pqr', 'stu', ':', '//', 'rose', 'vwx', 'yz1', '234', 'tta', '567', '890', 'cod', 'e.', 'or', 'g/', 'wiki', '/', 'ing', 'sort', 'abc' , 'or', 'it', 'hms', 'mno' , 'uic', 'ksort', '#', 'ht' ]", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 47", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "47", "gaia_level": 2, "gaia_file": "b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png", "source": "gaia-benchmark"}}
145
+ {"name": "d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de", "prompt": "I have the Standard plan in the image below, and I just uploaded 60 equally sized files and got a message that I'm 100GB over the limit. I have 980 more files of the same size to upload. What is the average additional cost per file in dollar that goes over my current plan limit rounded to the nearest cent if I have to upgrade to the minimum possible plan to store them all? Answer with the following format: x.xx", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.03", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "0.03", "gaia_level": 2, "gaia_file": "d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png", "source": "gaia-benchmark"}}
146
+ {"name": "67e8878b-5cef-4375-804e-e6291fdbe78a", "prompt": "The attached PDF lists accommodations in the resort community of Seahorse Island. Which type of accommodation has a higher average rating in Seahorse Island?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Hotels", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Hotels", "gaia_level": 2, "gaia_file": "67e8878b-5cef-4375-804e-e6291fdbe78a.pdf", "source": "gaia-benchmark"}}
147
+ {"name": "023e9d44-96ae-4eed-b912-244ee8c3b994", "prompt": "It's May 2023, and I'm about to drive across the U.S. from California to Maine. I always recycle my water bottles at the end of a trip, and I drink 5 12-ounce water bottles for every 100 miles I travel, rounded to the nearest 100. Assuming I follow I-40 from Los Angeles to Cincinnati, then take I-90 from Cincinnati to Augusta, how many dollars will I get back according to Wikipedia?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "8", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
148
+ {"name": "0e9e85b8-52b9-4de4-b402-5f635ab9631f", "prompt": "What is the latest chronological year date written in the image on the webpage found when following the first citation reference link on the latest version of Carl Nebel's Wikipedia page as of August 2023?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1927", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1927", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
149
+ {"name": "20194330-9976-4043-8632-f8485c6c71b2", "prompt": "The YouTube channel Game Grumps began a Let\u2019s Play of the game Sonic the Hedgehog (2006) in the year 2012. Thirty seconds into the first episode, a phrase is shown on the screen in white letters on a red background. How many times does the letter \"E\" appear in this phrase?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "4", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
150
+ {"name": "4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2", "prompt": "This spreadsheet contains a list of clients for a retractable awning company. Each client has ordered a new awning for the back of their house within the last 90 days. The company makes different designs depending on whether the awning is made to block sunrises or sunsets. In this region, houses with odd-numbered street addresses face east, and houses with even-numbered street addresses face west. How many of these clients will be receiving the sunset awning design?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "8", "gaia_level": 2, "gaia_file": "4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx", "source": "gaia-benchmark"}}
151
+ {"name": "65638e28-7f37-4fa7-b7b9-8c19bb609879", "prompt": "The book with the doi 10.1353/book.24372 concerns a certain neurologist. According to chapter 2 of the book, what author influenced this neurologist\u2019s belief in \u201cendopsychic myths\u201d? Give the last name only.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Kleinpaul", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Kleinpaul", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
152
+ {"name": "3ff6b7a9-a5bd-4412-ad92-0cd0d45c0fee", "prompt": "The longest-lived vertebrate is named after an island. According to Wikipedia as of January 1, 2021, what is the 2020 estimated population of that island, to the nearest thousand?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 56000", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "56000", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
153
+ {"name": "708b99c5-e4a7-49cb-a5cf-933c8d46470d", "prompt": "On the DeepFruits fruit detection graph on Connected Papers from 2016, what feature caused the largest bubble to be the size it is?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Citations", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Citations", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
154
+ {"name": "0a65cb96-cb6e-4a6a-8aae-c1084f613456", "prompt": "During the first week of August 2015, one of the NASA Astronomy Pictures of the Day shows the lights of a city on the horizon. The namesake of this city also has a landmark building in Chicago named after him. What is the name of the architectural firm that designed this landmark building? Give the first name appearing in the name of the firm as of June 2023.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Holabird", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Holabird", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
155
+ {"name": "65da0822-a48a-4a68-bbad-8ed1b835a834", "prompt": "All of the individuals who formally held the position of United States secretary of homeland security prior to April 2019, excluding those who held the position in an acting capacity, have a bachelor's degree. Of the universities that these bachelor's degrees were from, which is the westernmost university and which is the easternmost university? Give them to me as a comma-separated list, I only want the name of the cities where the universities are located, with the westernmost city listed first.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Santa Clara, Boston", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Santa Clara, Boston", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
156
+ {"name": "0bb3b44a-ede5-4db5-a520-4e844b0079c5", "prompt": "Consider the following symbols: \ud809\udc1c \ud809\udc10\ud809\udc1a\n\nThis is a number written using the Mesopotamian/Babylonian number system and represented with Sumerian cuneiform. Convert this number into Arabic numerals as a decimal number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 536", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "536", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
157
+ {"name": "73c1b9fe-ee1d-4cf4-96ca-35c08f97b054", "prompt": "According to the USGS, in what year was the American Alligator first found west of Texas (not including Texas)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1954", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1954", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
158
+ {"name": "e2d69698-bc99-4e85-9880-67eaccd66e6c", "prompt": "As of August 2023, who is the only winner of the US version of Survivor to be born in the month of May?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Michele Fitzgerald", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Michele Fitzgerald", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
159
+ {"name": "a56f1527-3abf-41d6-91f8-7296d6336c3f", "prompt": "The cover of the August 2021 issue of Vogue shows a famous landmark in the background behind some trees. How tall is this monument in yards, rounded to the nearest yard? Give the number only.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 185", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "185", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
160
+ {"name": "42d4198c-5895-4f0a-b0c0-424a66465d83", "prompt": "I'm curious about how much information is available for popular video games before their release. Find the Wikipedia page for the 2019 game that won the British Academy Games Awards. How many revisions did that page have before the month listed as the game's release date on that Wikipedia page (as of the most recent entry from 2022)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 60", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "60", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
161
+ {"name": "edd4d4f2-1a58-45c4-b038-67337af4e029", "prompt": "The attached spreadsheet lists the locomotives owned by a local railroad museum. What is the typical American name for the type of locomotive this museum uses for the Murder Mystery Express?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Berkshire", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "Berkshire", "gaia_level": 2, "gaia_file": "edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx", "source": "gaia-benchmark"}}
162
+ {"name": "a26649c6-1cb2-470a-871e-6910c64c3e53", "prompt": "What is the absolute difference in tens of thousands between the population of chinstrap penguins on the Wikipedia page for penguin species populations as of the end of 2018 and the population recorded in the Nature.com \"global population assessment of the Chinstrap penguin\" article from 2020, assuming two penguins per breeding pair?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 116", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "116", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
163
+ {"name": "4d0aa727-86b1-406b-9b33-f870dd14a4a5", "prompt": "The attached file lists the locomotives owned by a local railroad museum. It gives each locomotive\u2019s identifying number, operating status, and the name of the daily excursion it heads, if operational. What are the odds that today\u2019s Sunset Picnic Trip will use a steam locomotive? Assume that each day\u2019s excursion picks one of its assigned locomotives at random, and express the answer in the form \u201c1 in 4\u201d, \u201c1 in 5\u201d, etc.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1 in 3", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1 in 3", "gaia_level": 2, "gaia_file": "4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx", "source": "gaia-benchmark"}}
164
+ {"name": "d5141ca5-e7a0-469f-bf3e-e773507c86e2", "prompt": "When was a picture of St. Thomas Aquinas first added to the Wikipedia page on the Principle of double effect? Answer using the format DD/MM/YYYY.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 19/02/2009", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "19/02/2009", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
165
+ {"name": "1dcc160f-c187-48c2-b68e-319bd4354f3d", "prompt": "According to Openreview.net, at the NeurIPS 2022 Conference, how many papers by an author named Yuri were accepted with a \"certain\" recommendation?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "3", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
166
+ {"name": "b2c257e0-3ad7-4f05-b8e3-d9da973be36e", "prompt": "If this whole pint is made up of ice cream, how many percent above or below the US federal standards for butterfat content is it when using the standards as reported by Wikipedia in 2020? Answer as + or - a number rounded to one decimal place.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: +4.6", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "+4.6", "gaia_level": 2, "gaia_file": "b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg", "source": "gaia-benchmark"}}
167
+ {"name": "e0c10771-d627-4fd7-9694-05348e54ee36", "prompt": "Take the gender split from the 2011 Bulgarian census about those who have completed tertiary education. Subtract the smaller number from the larger number, then return the difference in thousands of women. So if there were 30.1 thousand more men, you'd give \"30.1\"", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 234.9", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "234.9", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
168
+ {"name": "e29834fd-413a-455c-a33e-c3915b07401c", "prompt": "I'd like to learn more about some popular reality television competition shows. As of the end of the 44th season of the American version of Survivor, how many more unique winners have there been compared to the number of winners of American Idol?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 21", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "21", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
169
+ {"name": "08c0b6e9-1b43-4c2e-ae55-4e3fce2c2715", "prompt": "In the film Goldfinger, what color was the object that James Bond concealed himself and his companion Pussy Galore at the end of the film? If there are multiple colors, put them in a comma-separated list in alphabetical order.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: orange, white", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "orange, white", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
170
+ {"name": "db4fd70a-2d37-40ea-873f-9433dc5e301f", "prompt": "As of May 2023, how many stops are between South Station and Windsor Gardens on MBTA\u2019s Franklin-Foxboro line (not included)?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 10", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "10", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
171
+ {"name": "853c8244-429e-46ca-89f2-addf40dfb2bd", "prompt": "In the 2015 Metropolitan Museum of Art exhibition titled after the Chinese zodiac animal of 2015, how many of the \"twelve animals of the Chinese zodiac\" have a hand visible?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 11", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "11", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
172
+ {"name": "7a4a336d-dcfa-45a0-b014-824c7619e8de", "prompt": "At the two-minute mark in the YouTube video uploaded by the channel \u201cGameGrumps\u201d on May 14, 2017 as part of their playthrough of the game Mario Kart 8 Deluxe, the shows\u2019 hosts are competing on one of the game\u2019s racetracks. What was the world record time for that track in the game\u2019s 150cc mode as of June 7, 2023? Express your answer in minutes and seconds, rounding the seconds to the nearest hundredth, e.g. 1:01.001.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 1:41.614", "weight": 1.0}], "category": "level_2", "metadata": {"gaia_answer": "1:41.614", "gaia_level": 2, "gaia_file": null, "source": "gaia-benchmark"}}
src/flow/experiments/data/tasks/gaia_level3.jsonl ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"name": "676e5e31-a554-4acc-9286-b60d90a92d26", "prompt": "In July 2, 1959 United States standards for grades of processed fruits, vegetables, and certain other products listed as dehydrated, consider the items in the \"dried and dehydrated section\" specifically marked as dehydrated along with any items in the Frozen/Chilled section that contain the whole name of the item, but not if they're marked Chilled. As of August 2023, what is the percentage (to the nearest percent) of those standards that have been superseded by a new version since the date given in the 1959 standards?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 86", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "86", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
2
+ {"name": "bec74516-02fc-48dc-b202-55e78d0e17cf", "prompt": "What is the average number of pre-2020 works on the open researcher and contributor identification pages of the people whose identification is in this file?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 26.4", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "26.4", "gaia_level": 3, "gaia_file": "bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld", "source": "gaia-benchmark"}}
3
+ {"name": "00d579ea-0889-4fd9-a771-2c8d79835c8d", "prompt": "Assuming scientists in the famous youtube video The Thinking Machine (Artificial Intelligence in the 1960s) were interviewed the same year, what is the name of the scientist predicting the sooner thinking machines or robots? Answer using the format First name Last name", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Claude Shannon", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Claude Shannon", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
4
+ {"name": "384d0dd8-e8a4-4cfe-963c-d37f256e7662", "prompt": "In the NCATS PubChem compound database for Food Additive Status classification, find the compound that has a molecular weight of 100 g/mol or less, 6 heavy atoms, 1 or fewer hydrogen bond acceptors, and a complexity between 10 and 15. Of the shared gene-chemical co-occurrences between its two possible enzyme transformations, what is the PubChem CID of the heaviest by molecular weight?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4192", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "4192", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
5
+ {"name": "de9887f5-ead8-4727-876f-5a4078f8598c", "prompt": "What integer-rounded percentage of the total length of the harlequin shrimp recorded in Omar Valencfia-Mendez 2017 paper was the sea star fed to the same type of shrimp in G. Curt Fiedler's 2002 paper?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 22", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "22", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
6
+ {"name": "983bba7c-c092-455f-b6c9-7857003d48fc", "prompt": "What animals that were mentioned in both Ilias Lagkouvardos's and Olga Tapia's papers on the alvei species of the genus named for Copenhagen outside the bibliographies were also present in the 2021 article cited on the alvei species' Wikipedia page about a multicenter, randomized, double-blind study?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: mice", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "mice", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
7
+ {"name": "9b54f9d9-35ee-4a14-b62f-d130ea00317f", "prompt": "Which of the text elements under CATEGORIES in the XML would contain the one food in the spreadsheet that does not appear a second time under a different name?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Soups and Stews", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Soups and Stews", "gaia_level": 3, "gaia_file": "9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip", "source": "gaia-benchmark"}}
8
+ {"name": "56db2318-640f-477a-a82f-bc93ad13e882", "prompt": "The following numbers function similarly to ISBN 13 numbers, however, their validation methods are slightly different. Rather than using alternate weights of 1 and 3, the checksum digit is calculated with an alternate weight of 1 and some other positive integer less than 10. Otherwise, the checksum digit is calculated as expected. Unfortunately, there is an error in the data. Two adjacent columns have been transposed. These errored columns do not involve the final column or one of the first three columns. Using this information, please provide all potential solutions with the unknown weight and the smaller index of the two errored columns (assume we start our indexing at 0 and ignore hyphens). Give your answer in the form x, y where x is the weight and y is the smaller index of the two transposed columns.\n\n978-354181391-9\n978-946669746-1\n978-398036139-6\n978-447656680-4\n978-279586664-7\n978-595073693-3\n978-976647652-6\n978-591178125-5\n978-728465924-5\n978-414825155-9", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 7, 9", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "7, 9", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
9
+ {"name": "8131e2c0-0083-4265-9ce7-78c2d568425d", "prompt": "I was trying to remember how well the Cheater Beater performed in comparison to the Cheater when James tested it on his channel. I know that the Cheater still outperformed the Cheater Beater in terms of CFM. Could you please look that up for me, and report the CFM of both the Cheater and the Cheater Beater? I'm not sure if he made any changes to his testing, but this was back in season 4, so just report the value from that season. Please format your response like this: CFM number for Cheater, CFM number for Cheater beater", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 101.376, 84.348", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "101.376, 84.348", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
10
+ {"name": "72c06643-a2fa-4186-aa5c-9ec33ae9b445", "prompt": "What is the volume in milliliters of a system comprised of 0.312 kg Freon-12 refrigerant when placed at the bottom of the Marianas Trench and allowed to stabilize at the Trench's peak temperature, rounded to the nearest mL? Provide your answer as just an integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 55", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "55", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
11
+ {"name": "ebbc1f13-d24d-40df-9068-adcf735b4240", "prompt": "The Latin root of the Yola word \"gimlie\" shares a spelling with a Spanish word. What is the Google translation of the source title for the 1994 example sentence for that word in the Collins Spanish-to-English dictionary online? Answer in plain text, without punctuation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: The World of the Twenty First Century", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "The World of the Twenty First Century", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
12
+ {"name": "c526d8d6-5987-4da9-b24c-83466fa172f3", "prompt": "In the NIH translation of the original 1913 Michaelis-Menten Paper, what is the velocity of a reaction to four decimal places using the final equation in the paper based on the information for Reaction 7 in the Excel file?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.0424", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "0.0424", "gaia_level": 3, "gaia_file": "c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx", "source": "gaia-benchmark"}}
13
+ {"name": "3da89939-209c-4086-8520-7eb734e6b4ef", "prompt": "I was referencing each of the tables in the file from papers that were cited by the \"Trans fatty acid contents in chocolates and chocolate wafers in Turkey\" paper. I lost my own reference sheet and need to know which of the papers each table came from. The file may not use the full table caption. If the references in the\"Trans fatty acid\" paper bibliography were numbered starting with 1, give me the numbers in the order that they would be used to fill the cells in the Excel file from top to bottom, as a comma separated list.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8, 29, 22, 1, 8, 26", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "8, 29, 22, 1, 8, 26", "gaia_level": 3, "gaia_file": "3da89939-209c-4086-8520-7eb734e6b4ef.xlsx", "source": "gaia-benchmark"}}
14
+ {"name": "8d46b8d6-b38a-47ff-ac74-cda14cf2d19b", "prompt": "What percentage of the total penguin population according to the upper estimates on english Wikipedia at the end of 2012 is made up by the penguins in this file that don't live on Dream Island or have beaks longer than 42mm? Round to the nearest five decimal places.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.00033", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "0.00033", "gaia_level": 3, "gaia_file": "8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv", "source": "gaia-benchmark"}}
15
+ {"name": "e961a717-6b25-4175-8a68-874d28190ee4", "prompt": "According to wikipedia, how many Asian countries still have a monarchy and access to the sea in 2021?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 12", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "12", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
16
+ {"name": "851e570a-e3de-4d84-bcfa-cc85578baa59", "prompt": "I thought we could try a fun word puzzle together :)\n\nI've got a Boggle board here:\n\nABRL\nEITE\nIONS\nFPEI\n\nI'd like to know the longest word that can be generated from the board. Please find the longest English language word that can be generated from this board. If more than one word of the same length exists at the maximum word length, please report the longest word that comes first, alphabetically. Oh, and I know that there might be different wordlists available for Boggle, so let's please just use the words_alpha dictionary found at https://github.com/dwyl/english-words as the dictionary for our game.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Briniest", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Briniest", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
17
+ {"name": "50f58759-7bd6-406f-9b0d-5692beb2a926", "prompt": "How many times was a Twitter/X post cited as a reference on the english Wikipedia pages for each day of August in the last June 2023 versions of the pages?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "3", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
18
+ {"name": "872bfbb1-9ccf-49f6-8c5f-aa22818ccd66", "prompt": "Which of the fruits shown in the 2008 painting \"Embroidery from Uzbekistan\" were served as part of the October 1949 breakfast menu for the ocean liner that was later used as a floating prop for the film \"The Last Voyage\"? Give the items as a comma-separated list, ordering them in clockwise order based on their arrangement in the painting starting from the 12 o'clock position. Use the plural form of each fruit.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: pears, bananas", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "pears, bananas", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
19
+ {"name": "c3a79cfe-8206-451f-aca8-3fec8ebe51d3", "prompt": "The year is 2022. I am at the National Air and Space Museum east of the Potomac River. I want to go to Fire Station 301 DCA ARFF using the metro. I go in the wrong direction and end up at the station closest to Cleveland Elementary School. How many metro stations am I away from my original destination if I don't change lines? Your answer should be a numerical integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "8", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
20
+ {"name": "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", "prompt": "The attached spreadsheet contains a list of books I read in the year 2022. What is the title of the book that I read the slowest, using the rate of words per day?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Out of the Silent Planet", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Out of the Silent Planet", "gaia_level": 3, "gaia_file": "da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx", "source": "gaia-benchmark"}}
21
+ {"name": "ad2b4d70-9314-4fe6-bfbe-894a45f6055f", "prompt": "Eva Draconis has a personal website which can be accessed on her YouTube page. What is the meaning of the only symbol seen in the top banner that has a curved line that isn't a circle or a portion of a circle? Answer without punctuation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: War is not here this is a land of peace", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "War is not here this is a land of peace", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
22
+ {"name": "5b2a14e8-6e59-479c-80e3-4696e8980152", "prompt": "The brand that makes these harnesses the dogs are wearing in the attached pic shares stories from their ambassadors on their website. What meat is mentioned in the story added Dec 8th 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: bacon", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "bacon", "gaia_level": 3, "gaia_file": "5b2a14e8-6e59-479c-80e3-4696e8980152.jpg", "source": "gaia-benchmark"}}
23
+ {"name": "9e1fc53b-46ff-49a1-9d05-9e6faac34cc5", "prompt": "A 5-man group made up of one tank, one healer, and three DPS is doing a dungeon that was just released in World of Warcraft. Two are plate wearers and two are cloth wearers. At the final boss, both the tank and the healer are casting holy spells. Ice and fire are being used, each one by a different DPS. A bear from the group is attacking the boss. Metamorphosis is cast. The Kilt of the Forgotten One drops as loot, but no one can use it. If all classes were using their class abilities and all classes are unique, what are the five classes in the group in alphabetical order separated by commas?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Death Knight, Hunter, Paladin, Priest, Warlock", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Death Knight, Hunter, Paladin, Priest, Warlock", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
24
+ {"name": "5f982798-16b9-4051-ab57-cfc7ebdb2a91", "prompt": "I read a paper about multiwavelength observations of fast radio bursts back in March 2021 on Arxiv, and it had a fascinating diagram of an X-ray time profile. There was a similar burst-1 diagram in another paper from one of the same authors about fast radio bursts back in July 2020, but I can't recall what the difference in seconds in the measured time span was. How many more seconds did one measure than the other? Just give the number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.2", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "0.2", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
25
+ {"name": "0512426f-4d28-49f0-be77-06d05daec096", "prompt": "In the YouTube 360 VR video from March 2018 narrated by the voice actor of Lord of the Rings' Gollum, what number was mentioned by the narrator directly after dinosaurs were first shown in the video?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 100000000", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "100000000", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
26
+ {"name": "0bdb7c40-671d-4ad1-9ce3-986b159c0ddc", "prompt": "In NASA's Astronomy Picture of the Day on 2006 January 21, two astronauts are visible, with one appearing much smaller than the other. As of August 2023, out of the astronauts in the NASA Astronaut Group that the smaller astronaut was a member of, which one spent the least time in space, and how many minutes did he spend in space, rounded to the nearest minute? Exclude any astronauts who did not spend any time in space. Give the last name of the astronaut, separated from the number of minutes by a semicolon.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: White; 5876", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "White; 5876", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
27
+ {"name": "676e5e31-a554-4acc-9286-b60d90a92d26", "prompt": "In July 2, 1959 United States standards for grades of processed fruits, vegetables, and certain other products listed as dehydrated, consider the items in the \"dried and dehydrated section\" specifically marked as dehydrated along with any items in the Frozen/Chilled section that contain the whole name of the item, but not if they're marked Chilled. As of August 2023, what is the percentage (to the nearest percent) of those standards that have been superseded by a new version since the date given in the 1959 standards?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 86", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "86", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
28
+ {"name": "bec74516-02fc-48dc-b202-55e78d0e17cf", "prompt": "What is the average number of pre-2020 works on the open researcher and contributor identification pages of the people whose identification is in this file?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 26.4", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "26.4", "gaia_level": 3, "gaia_file": "bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld", "source": "gaia-benchmark"}}
29
+ {"name": "00d579ea-0889-4fd9-a771-2c8d79835c8d", "prompt": "Assuming scientists in the famous youtube video The Thinking Machine (Artificial Intelligence in the 1960s) were interviewed the same year, what is the name of the scientist predicting the sooner thinking machines or robots? Answer using the format First name Last name", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Claude Shannon", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Claude Shannon", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
30
+ {"name": "384d0dd8-e8a4-4cfe-963c-d37f256e7662", "prompt": "In the NCATS PubChem compound database for Food Additive Status classification, find the compound that has a molecular weight of 100 g/mol or less, 6 heavy atoms, 1 or fewer hydrogen bond acceptors, and a complexity between 10 and 15. Of the shared gene-chemical co-occurrences between its two possible enzyme transformations, what is the PubChem CID of the heaviest by molecular weight?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 4192", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "4192", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
31
+ {"name": "de9887f5-ead8-4727-876f-5a4078f8598c", "prompt": "What integer-rounded percentage of the total length of the harlequin shrimp recorded in Omar Valencfia-Mendez 2017 paper was the sea star fed to the same type of shrimp in G. Curt Fiedler's 2002 paper?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 22", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "22", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
32
+ {"name": "983bba7c-c092-455f-b6c9-7857003d48fc", "prompt": "What animals that were mentioned in both Ilias Lagkouvardos's and Olga Tapia's papers on the alvei species of the genus named for Copenhagen outside the bibliographies were also present in the 2021 article cited on the alvei species' Wikipedia page about a multicenter, randomized, double-blind study?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: mice", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "mice", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
33
+ {"name": "9b54f9d9-35ee-4a14-b62f-d130ea00317f", "prompt": "Which of the text elements under CATEGORIES in the XML would contain the one food in the spreadsheet that does not appear a second time under a different name?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Soups and Stews", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Soups and Stews", "gaia_level": 3, "gaia_file": "9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip", "source": "gaia-benchmark"}}
34
+ {"name": "56db2318-640f-477a-a82f-bc93ad13e882", "prompt": "The following numbers function similarly to ISBN 13 numbers, however, their validation methods are slightly different. Rather than using alternate weights of 1 and 3, the checksum digit is calculated with an alternate weight of 1 and some other positive integer less than 10. Otherwise, the checksum digit is calculated as expected. Unfortunately, there is an error in the data. Two adjacent columns have been transposed. These errored columns do not involve the final column or one of the first three columns. Using this information, please provide all potential solutions with the unknown weight and the smaller index of the two errored columns (assume we start our indexing at 0 and ignore hyphens). Give your answer in the form x, y where x is the weight and y is the smaller index of the two transposed columns.\n\n978-354181391-9\n978-946669746-1\n978-398036139-6\n978-447656680-4\n978-279586664-7\n978-595073693-3\n978-976647652-6\n978-591178125-5\n978-728465924-5\n978-414825155-9", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 7, 9", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "7, 9", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
35
+ {"name": "8131e2c0-0083-4265-9ce7-78c2d568425d", "prompt": "I was trying to remember how well the Cheater Beater performed in comparison to the Cheater when James tested it on his channel. I know that the Cheater still outperformed the Cheater Beater in terms of CFM. Could you please look that up for me, and report the CFM of both the Cheater and the Cheater Beater? I'm not sure if he made any changes to his testing, but this was back in season 4, so just report the value from that season. Please format your response like this: CFM number for Cheater, CFM number for Cheater beater", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 101.376, 84.348", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "101.376, 84.348", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
36
+ {"name": "72c06643-a2fa-4186-aa5c-9ec33ae9b445", "prompt": "What is the volume in milliliters of a system comprised of 0.312 kg Freon-12 refrigerant when placed at the bottom of the Marianas Trench and allowed to stabilize at the Trench's peak temperature, rounded to the nearest mL? Provide your answer as just an integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 55", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "55", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
37
+ {"name": "ebbc1f13-d24d-40df-9068-adcf735b4240", "prompt": "The Latin root of the Yola word \"gimlie\" shares a spelling with a Spanish word. What is the Google translation of the source title for the 1994 example sentence for that word in the Collins Spanish-to-English dictionary online? Answer in plain text, without punctuation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: The World of the Twenty First Century", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "The World of the Twenty First Century", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
38
+ {"name": "c526d8d6-5987-4da9-b24c-83466fa172f3", "prompt": "In the NIH translation of the original 1913 Michaelis-Menten Paper, what is the velocity of a reaction to four decimal places using the final equation in the paper based on the information for Reaction 7 in the Excel file?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.0424", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "0.0424", "gaia_level": 3, "gaia_file": "c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx", "source": "gaia-benchmark"}}
39
+ {"name": "3da89939-209c-4086-8520-7eb734e6b4ef", "prompt": "I was referencing each of the tables in the file from papers that were cited by the \"Trans fatty acid contents in chocolates and chocolate wafers in Turkey\" paper. I lost my own reference sheet and need to know which of the papers each table came from. The file may not use the full table caption. If the references in the\"Trans fatty acid\" paper bibliography were numbered starting with 1, give me the numbers in the order that they would be used to fill the cells in the Excel file from top to bottom, as a comma separated list.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8, 29, 22, 1, 8, 26", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "8, 29, 22, 1, 8, 26", "gaia_level": 3, "gaia_file": "3da89939-209c-4086-8520-7eb734e6b4ef.xlsx", "source": "gaia-benchmark"}}
40
+ {"name": "8d46b8d6-b38a-47ff-ac74-cda14cf2d19b", "prompt": "What percentage of the total penguin population according to the upper estimates on english Wikipedia at the end of 2012 is made up by the penguins in this file that don't live on Dream Island or have beaks longer than 42mm? Round to the nearest five decimal places.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.00033", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "0.00033", "gaia_level": 3, "gaia_file": "8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv", "source": "gaia-benchmark"}}
41
+ {"name": "e961a717-6b25-4175-8a68-874d28190ee4", "prompt": "According to wikipedia, how many Asian countries still have a monarchy and access to the sea in 2021?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 12", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "12", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
42
+ {"name": "851e570a-e3de-4d84-bcfa-cc85578baa59", "prompt": "I thought we could try a fun word puzzle together :)\n\nI've got a Boggle board here:\n\nABRL\nEITE\nIONS\nFPEI\n\nI'd like to know the longest word that can be generated from the board. Please find the longest English language word that can be generated from this board. If more than one word of the same length exists at the maximum word length, please report the longest word that comes first, alphabetically. Oh, and I know that there might be different wordlists available for Boggle, so let's please just use the words_alpha dictionary found at https://github.com/dwyl/english-words as the dictionary for our game.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Briniest", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Briniest", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
43
+ {"name": "50f58759-7bd6-406f-9b0d-5692beb2a926", "prompt": "How many times was a Twitter/X post cited as a reference on the english Wikipedia pages for each day of August in the last June 2023 versions of the pages?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 3", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "3", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
44
+ {"name": "872bfbb1-9ccf-49f6-8c5f-aa22818ccd66", "prompt": "Which of the fruits shown in the 2008 painting \"Embroidery from Uzbekistan\" were served as part of the October 1949 breakfast menu for the ocean liner that was later used as a floating prop for the film \"The Last Voyage\"? Give the items as a comma-separated list, ordering them in clockwise order based on their arrangement in the painting starting from the 12 o'clock position. Use the plural form of each fruit.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: pears, bananas", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "pears, bananas", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
45
+ {"name": "c3a79cfe-8206-451f-aca8-3fec8ebe51d3", "prompt": "The year is 2022. I am at the National Air and Space Museum east of the Potomac River. I want to go to Fire Station 301 DCA ARFF using the metro. I go in the wrong direction and end up at the station closest to Cleveland Elementary School. How many metro stations am I away from my original destination if I don't change lines? Your answer should be a numerical integer value.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 8", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "8", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
46
+ {"name": "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", "prompt": "The attached spreadsheet contains a list of books I read in the year 2022. What is the title of the book that I read the slowest, using the rate of words per day?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Out of the Silent Planet", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Out of the Silent Planet", "gaia_level": 3, "gaia_file": "da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx", "source": "gaia-benchmark"}}
47
+ {"name": "ad2b4d70-9314-4fe6-bfbe-894a45f6055f", "prompt": "Eva Draconis has a personal website which can be accessed on her YouTube page. What is the meaning of the only symbol seen in the top banner that has a curved line that isn't a circle or a portion of a circle? Answer without punctuation.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: War is not here this is a land of peace", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "War is not here this is a land of peace", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
48
+ {"name": "5b2a14e8-6e59-479c-80e3-4696e8980152", "prompt": "The brand that makes these harnesses the dogs are wearing in the attached pic shares stories from their ambassadors on their website. What meat is mentioned in the story added Dec 8th 2022?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: bacon", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "bacon", "gaia_level": 3, "gaia_file": "5b2a14e8-6e59-479c-80e3-4696e8980152.jpg", "source": "gaia-benchmark"}}
49
+ {"name": "9e1fc53b-46ff-49a1-9d05-9e6faac34cc5", "prompt": "A 5-man group made up of one tank, one healer, and three DPS is doing a dungeon that was just released in World of Warcraft. Two are plate wearers and two are cloth wearers. At the final boss, both the tank and the healer are casting holy spells. Ice and fire are being used, each one by a different DPS. A bear from the group is attacking the boss. Metamorphosis is cast. The Kilt of the Forgotten One drops as loot, but no one can use it. If all classes were using their class abilities and all classes are unique, what are the five classes in the group in alphabetical order separated by commas?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: Death Knight, Hunter, Paladin, Priest, Warlock", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "Death Knight, Hunter, Paladin, Priest, Warlock", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
50
+ {"name": "5f982798-16b9-4051-ab57-cfc7ebdb2a91", "prompt": "I read a paper about multiwavelength observations of fast radio bursts back in March 2021 on Arxiv, and it had a fascinating diagram of an X-ray time profile. There was a similar burst-1 diagram in another paper from one of the same authors about fast radio bursts back in July 2020, but I can't recall what the difference in seconds in the measured time span was. How many more seconds did one measure than the other? Just give the number.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 0.2", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "0.2", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
51
+ {"name": "0512426f-4d28-49f0-be77-06d05daec096", "prompt": "In the YouTube 360 VR video from March 2018 narrated by the voice actor of Lord of the Rings' Gollum, what number was mentioned by the narrator directly after dinosaurs were first shown in the video?", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: 100000000", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "100000000", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
52
+ {"name": "0bdb7c40-671d-4ad1-9ce3-986b159c0ddc", "prompt": "In NASA's Astronomy Picture of the Day on 2006 January 21, two astronauts are visible, with one appearing much smaller than the other. As of August 2023, out of the astronauts in the NASA Astronaut Group that the smaller astronaut was a member of, which one spent the least time in space, and how many minutes did he spend in space, rounded to the nearest minute? Exclude any astronauts who did not spend any time in space. Give the last name of the astronaut, separated from the number of minutes by a semicolon.", "criteria": [{"name": "correct_answer", "instruction": "The agent's final answer must match: White; 5876", "weight": 1.0}], "category": "level_3", "metadata": {"gaia_answer": "White; 5876", "gaia_level": 3, "gaia_file": null, "source": "gaia-benchmark"}}
src/flow/experiments/evaluators/heuristic.py CHANGED
@@ -73,7 +73,7 @@ class HeuristicEvaluator:
73
 
74
  # Check if agent reported task complete
75
  output_lower = run_result.output.lower()
76
- if "task_done" in output_lower or "complete" in output_lower or "finished" in output_lower:
77
  criteria_results.append(
78
  CriterionResult(
79
  name="task_completed",
 
73
 
74
  # Check if agent reported task complete
75
  output_lower = run_result.output.lower()
76
+ if "complete" in output_lower or "complete" in output_lower or "finished" in output_lower:
77
  criteria_results.append(
78
  CriterionResult(
79
  name="task_completed",
src/flow/experiments/evaluators/llm.py CHANGED
@@ -38,6 +38,7 @@ class LLMEvaluator:
38
  model_client: Any,
39
  model_name: str = "gpt-4o",
40
  passing_threshold: float = 0.7,
 
41
  ) -> None:
42
  """Initialize the LLM evaluator.
43
 
@@ -46,10 +47,14 @@ class LLMEvaluator:
46
  (e.g., AsyncOpenAI, AsyncAzureOpenAI)
47
  model_name: Model name/deployment to use for evaluation
48
  passing_threshold: Minimum score to pass (0.0 to 1.0)
 
 
 
49
  """
50
  self.model_client = model_client
51
  self.model_name = model_name
52
  self.passing_threshold = passing_threshold
 
53
 
54
  def _get_evaluation_prompt(self, run_result: RunResult) -> str:
55
  """Build the evaluation prompt for the LLM."""
@@ -156,17 +161,21 @@ Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {met
156
  prompt = self._get_evaluation_prompt(run_result)
157
 
158
  try:
159
- response = await self.model_client.chat.completions.create(
160
- model=self.model_name,
161
- messages=[
 
162
  {
163
  "role": "system",
164
  "content": "You are an expert evaluator. Respond only with valid JSON.",
165
  },
166
  {"role": "user", "content": prompt},
167
  ],
168
- temperature=0.1, # Low temperature for consistent evaluation
169
- )
 
 
 
170
 
171
  # Extract the response text
172
  response_text = response.choices[0].message.content or ""
 
38
  model_client: Any,
39
  model_name: str = "gpt-4o",
40
  passing_threshold: float = 0.7,
41
+ temperature: float | None = None,
42
  ) -> None:
43
  """Initialize the LLM evaluator.
44
 
 
47
  (e.g., AsyncOpenAI, AsyncAzureOpenAI)
48
  model_name: Model name/deployment to use for evaluation
49
  passing_threshold: Minimum score to pass (0.0 to 1.0)
50
+ temperature: Temperature for LLM calls. None means don't specify
51
+ (use model default). Some models like gpt-5.2-chat
52
+ only support temperature=1.0.
53
  """
54
  self.model_client = model_client
55
  self.model_name = model_name
56
  self.passing_threshold = passing_threshold
57
+ self.temperature = temperature
58
 
59
  def _get_evaluation_prompt(self, run_result: RunResult) -> str:
60
  """Build the evaluation prompt for the LLM."""
 
161
  prompt = self._get_evaluation_prompt(run_result)
162
 
163
  try:
164
+ # Build params - only include temperature if explicitly set
165
+ params: dict[str, Any] = {
166
+ "model": self.model_name,
167
+ "messages": [
168
  {
169
  "role": "system",
170
  "content": "You are an expert evaluator. Respond only with valid JSON.",
171
  },
172
  {"role": "user", "content": prompt},
173
  ],
174
+ }
175
+ if self.temperature is not None:
176
+ params["temperature"] = self.temperature
177
+
178
+ response = await self.model_client.chat.completions.create(**params)
179
 
180
  # Extract the response text
181
  response_text = response.choices[0].message.content or ""
src/flow/experiments/models.py CHANGED
@@ -17,10 +17,16 @@ from __future__ import annotations
17
  from dataclasses import asdict, dataclass, field
18
  from itertools import product as itertools_product
19
  from pathlib import Path
20
- from typing import Any, Protocol, runtime_checkable
21
 
22
  import yaml
23
 
 
 
 
 
 
 
24
 
25
  # =============================================================================
26
  # Tool Configuration
@@ -32,40 +38,55 @@ TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
32
  "full": {
33
  "read_file": {},
34
  "write_file": {},
35
- "list_directory": {},
36
- "grep_search": {},
37
- "bash_execute": {"timeout": 120},
 
 
 
38
  "check_processes": {},
39
  "python_repl": {},
40
  "think": {},
41
- "task_done": {},
 
42
  "memory": {},
43
- "sub_agent": {"model": "gpt-4o-mini"},
 
 
 
 
 
44
  },
45
  "standard": {
46
  "read_file": {},
47
  "write_file": {},
48
- "list_directory": {},
49
- "grep_search": {},
50
- "bash_execute": {"timeout": 120},
 
 
 
51
  "check_processes": {},
52
  "python_repl": {},
53
  "think": {},
54
- "task_done": {},
 
55
  "memory": {},
 
56
  },
57
  "minimal": {
58
  "read_file": {},
59
  "write_file": {},
60
- "bash_execute": {"timeout": 120},
61
- "task_done": {},
 
62
  },
63
  "readonly": {
64
  "read_file": {},
65
- "list_directory": {},
66
- "grep_search": {},
 
67
  "think": {},
68
- "task_done": {},
69
  },
70
  }
71
 
@@ -91,11 +112,11 @@ def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]]) -> dict[st
91
  >>> resolve_tools("standard")
92
  {"read_file": {}, "write_file": {}, ...}
93
 
94
- >>> resolve_tools(["read_file", "bash_execute"])
95
- {"read_file": {}, "bash_execute": {}}
96
 
97
- >>> resolve_tools({"bash_execute": {"timeout": 60}})
98
- {"bash_execute": {"timeout": 60}}
99
  """
100
  if isinstance(tools, str):
101
  if tools not in TOOL_PRESETS:
@@ -114,24 +135,30 @@ class CompactionConfig:
114
  """Extensible compaction strategy configuration.
115
 
116
  Supports multiple strategies via a tagged-union pattern:
117
- - "head_tail": Keep first N + last M messages (default)
 
 
 
118
  - "last_n": Keep only the last N messages
119
  - "none": No compaction
120
 
121
- Future strategies (e.g., "summarize") can be added without
122
- changing existing code.
123
-
124
  Attributes:
125
  strategy: The compaction strategy name
126
  params: Strategy-specific parameters
 
127
  """
128
 
129
  strategy: str = "head_tail"
130
  params: dict[str, Any] = field(default_factory=lambda: {"head_size": 10, "tail_size": 40})
 
 
 
 
 
131
 
132
  @staticmethod
133
  def head_tail(head_size: int = 10, tail_size: int = 40) -> CompactionConfig:
134
- """Create a head+tail compaction config."""
135
  return CompactionConfig(strategy="head_tail", params={"head_size": head_size, "tail_size": tail_size})
136
 
137
  @staticmethod
@@ -144,6 +171,92 @@ class CompactionConfig:
144
  """Create a no-compaction config."""
145
  return CompactionConfig(strategy="none", params={})
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  @property
148
  def enabled(self) -> bool:
149
  """Whether compaction is enabled."""
@@ -159,6 +272,11 @@ class CompactionConfig:
159
  """Tail size for head_tail strategy. Returns 0 for other strategies."""
160
  return self.params.get("tail_size", 0)
161
 
 
 
 
 
 
162
 
163
  @dataclass
164
  class Agent:
@@ -171,8 +289,10 @@ class Agent:
171
 
172
  Attributes:
173
  name: Unique identifier for this agent
 
174
  description: Human-readable description
175
  instructions: System prompt / instructions (optional, uses framework default if None)
 
176
  model: Model deployment name (e.g., "gpt-4o")
177
  compaction: Compaction strategy configuration
178
  tools: Tool configuration - can be:
@@ -182,8 +302,10 @@ class Agent:
182
  """
183
 
184
  name: str
 
185
  description: str = ""
186
  instructions: str | None = None
 
187
  model: str | None = None
188
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
189
  tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
@@ -218,27 +340,50 @@ class ExperimentResult:
218
  eval_score: float = 0.0
219
  eval_passed: bool = False
220
  eval_reasoning: str = ""
 
221
 
222
 
223
  @runtime_checkable
224
  class CandidateStrategy(Protocol):
225
  """Protocol for generating candidate variants from a base agent.
226
 
227
- Implementations explore different regions of the optimization space:
 
 
 
 
 
 
 
 
228
  - GridSearchStrategy: Exhaustive grid over parameter combinations
229
- - (Future) HeuristicStrategy: Rule-based mutations from telemetry
230
  - (Future) BayesianStrategy: Bayesian optimization over parameters
231
  """
232
 
233
- def generate(self, base: Agent, budget: int) -> list[Candidate]:
 
 
 
 
 
 
 
 
234
  """Generate candidate variants from a base agent.
235
 
236
  Args:
237
- base: The base agent to mutate
238
- budget: Maximum number of candidates to generate
 
 
 
 
239
 
240
  Returns:
241
- List of Candidate objects (at most `budget` items)
 
 
242
  """
243
  ...
244
 
@@ -272,8 +417,24 @@ class GridSearchStrategy:
272
  """
273
  self.variations = variations
274
 
275
- def generate(self, base: Agent, budget: int) -> list[Candidate]:
276
- """Generate all grid combinations up to budget."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  if not self.variations:
278
  return [Candidate(agent=base, mutations={}, rationale="baseline")]
279
 
@@ -515,3 +676,121 @@ def _extract_metrics(
515
  "pareto_rank": summary.get("pareto_rank"),
516
  "is_pareto_optimal": summary.get("is_pareto_optimal", False),
517
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  from dataclasses import asdict, dataclass, field
18
  from itertools import product as itertools_product
19
  from pathlib import Path
20
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
21
 
22
  import yaml
23
 
24
+ if TYPE_CHECKING:
25
+ from collections.abc import Awaitable, Callable
26
+
27
+ from .evaluators.base import Evaluator
28
+ from .types import Task
29
+
30
 
31
  # =============================================================================
32
  # Tool Configuration
 
38
  "full": {
39
  "read_file": {},
40
  "write_file": {},
41
+ "edit_file": {},
42
+ "multi_edit": {},
43
+ "glob_files": {},
44
+ "ls": {},
45
+ "grep": {},
46
+ "bash": {"timeout": 120},
47
  "check_processes": {},
48
  "python_repl": {},
49
  "think": {},
50
+ "todo_write": {},
51
+ "todo_read": {},
52
  "memory": {},
53
+ "skills": {},
54
+ "task": {"model": "gpt-4o-mini"},
55
+ "web_search": {},
56
+ "web_fetch": {},
57
+ "notebook_edit": {},
58
+ "notebook_read": {},
59
  },
60
  "standard": {
61
  "read_file": {},
62
  "write_file": {},
63
+ "edit_file": {},
64
+ "multi_edit": {},
65
+ "glob_files": {},
66
+ "ls": {},
67
+ "grep": {},
68
+ "bash": {"timeout": 120},
69
  "check_processes": {},
70
  "python_repl": {},
71
  "think": {},
72
+ "todo_write": {},
73
+ "todo_read": {},
74
  "memory": {},
75
+ "skills": {},
76
  },
77
  "minimal": {
78
  "read_file": {},
79
  "write_file": {},
80
+ "edit_file": {},
81
+ "bash": {"timeout": 120},
82
+ "think": {},
83
  },
84
  "readonly": {
85
  "read_file": {},
86
+ "glob_files": {},
87
+ "ls": {},
88
+ "grep": {},
89
  "think": {},
 
90
  },
91
  }
92
 
 
112
  >>> resolve_tools("standard")
113
  {"read_file": {}, "write_file": {}, ...}
114
 
115
+ >>> resolve_tools(["read_file", "bash"])
116
+ {"read_file": {}, "bash": {}}
117
 
118
+ >>> resolve_tools({"bash": {"timeout": 60}})
119
+ {"bash": {"timeout": 60}}
120
  """
121
  if isinstance(tools, str):
122
  if tools not in TOOL_PRESETS:
 
135
  """Extensible compaction strategy configuration.
136
 
137
  Supports multiple strategies via a tagged-union pattern:
138
+ - "head_tail": Keep first N + last M messages (message-count based)
139
+ - "head_tail_tokens": Token-aware head+tail (miniagent's HeadTailStrategy)
140
+ - "sliding_window": Keep system + recent messages within token budget
141
+ - "summarization": Summarize middle messages using LLM
142
  - "last_n": Keep only the last N messages
143
  - "none": No compaction
144
 
 
 
 
145
  Attributes:
146
  strategy: The compaction strategy name
147
  params: Strategy-specific parameters
148
+ token_budget: Maximum tokens for context window (used by token-based strategies)
149
  """
150
 
151
  strategy: str = "head_tail"
152
  params: dict[str, Any] = field(default_factory=lambda: {"head_size": 10, "tail_size": 40})
153
+ token_budget: int = 100_000
154
+
155
+ # =========================================================================
156
+ # Message-count based strategies (legacy, for MAF/LangGraph)
157
+ # =========================================================================
158
 
159
  @staticmethod
160
  def head_tail(head_size: int = 10, tail_size: int = 40) -> CompactionConfig:
161
+ """Create a message-count based head+tail compaction config."""
162
  return CompactionConfig(strategy="head_tail", params={"head_size": head_size, "tail_size": tail_size})
163
 
164
  @staticmethod
 
171
  """Create a no-compaction config."""
172
  return CompactionConfig(strategy="none", params={})
173
 
174
+ # =========================================================================
175
+ # Token-based strategies (for miniagent)
176
+ # =========================================================================
177
+
178
+ @staticmethod
179
+ def head_tail_tokens(head_ratio: float = 0.2, token_budget: int = 100_000) -> CompactionConfig:
180
+ """Create a token-aware head+tail compaction config.
181
+
182
+ This maps to miniagent's HeadTailStrategy which:
183
+ - Preserves head (system prompt, initial context) using head_ratio of budget
184
+ - Preserves tail (recent tool calls/results) using remaining budget
185
+ - Drops middle messages when over budget
186
+ - Respects atomic groups (tool calls + results stay together)
187
+
188
+ Args:
189
+ head_ratio: Fraction of budget for head messages (default 0.2 = 20%)
190
+ token_budget: Maximum tokens for context window
191
+
192
+ Returns:
193
+ CompactionConfig for token-based head+tail strategy
194
+ """
195
+ return CompactionConfig(
196
+ strategy="head_tail_tokens",
197
+ params={"head_ratio": head_ratio},
198
+ token_budget=token_budget,
199
+ )
200
+
201
+ @staticmethod
202
+ def sliding_window(token_budget: int = 100_000) -> CompactionConfig:
203
+ """Create a sliding window compaction config.
204
+
205
+ This maps to miniagent's SlidingWindowStrategy which:
206
+ - Always keeps system message(s)
207
+ - Keeps most recent messages that fit within token budget
208
+ - Respects atomic groups (tool calls + results stay together)
209
+
210
+ Args:
211
+ token_budget: Maximum tokens for context window
212
+
213
+ Returns:
214
+ CompactionConfig for sliding window strategy
215
+ """
216
+ return CompactionConfig(
217
+ strategy="sliding_window",
218
+ params={},
219
+ token_budget=token_budget,
220
+ )
221
+
222
+ @staticmethod
223
+ def summarization(
224
+ head_messages: int = 2,
225
+ tail_messages: int = 4,
226
+ summary_max_tokens: int = 1000,
227
+ token_budget: int = 100_000,
228
+ ) -> CompactionConfig:
229
+ """Create a summarization compaction config.
230
+
231
+ This maps to miniagent's SummarizationStrategy which:
232
+ - Keeps head messages (system + initial user message)
233
+ - Keeps tail messages (recent context)
234
+ - Summarizes middle messages using LLM instead of dropping them
235
+ - Preserves critical state (files read, findings, progress)
236
+
237
+ Args:
238
+ head_messages: Number of messages to keep at head (default 2)
239
+ tail_messages: Number of messages to keep at tail (default 4)
240
+ summary_max_tokens: Max tokens for the summary (default 1000)
241
+ token_budget: Maximum tokens for context window
242
+
243
+ Returns:
244
+ CompactionConfig for summarization strategy
245
+ """
246
+ return CompactionConfig(
247
+ strategy="summarization",
248
+ params={
249
+ "head_messages": head_messages,
250
+ "tail_messages": tail_messages,
251
+ "summary_max_tokens": summary_max_tokens,
252
+ },
253
+ token_budget=token_budget,
254
+ )
255
+
256
+ # =========================================================================
257
+ # Properties
258
+ # =========================================================================
259
+
260
  @property
261
  def enabled(self) -> bool:
262
  """Whether compaction is enabled."""
 
272
  """Tail size for head_tail strategy. Returns 0 for other strategies."""
273
  return self.params.get("tail_size", 0)
274
 
275
+ @property
276
+ def head_ratio(self) -> float:
277
+ """Head ratio for head_tail_tokens strategy. Returns 0.2 default."""
278
+ return self.params.get("head_ratio", 0.2)
279
+
280
 
281
  @dataclass
282
  class Agent:
 
289
 
290
  Attributes:
291
  name: Unique identifier for this agent
292
+ framework: Which harness to use ("maf", "langgraph", "claude")
293
  description: Human-readable description
294
  instructions: System prompt / instructions (optional, uses framework default if None)
295
+ instructions_preset: Preset name for instructions ("coding", "benchmark", etc.)
296
  model: Model deployment name (e.g., "gpt-4o")
297
  compaction: Compaction strategy configuration
298
  tools: Tool configuration - can be:
 
302
  """
303
 
304
  name: str
305
+ framework: str = "maf"
306
  description: str = ""
307
  instructions: str | None = None
308
+ instructions_preset: str | None = None # e.g., "coding", "benchmark", "research"
309
  model: str | None = None
310
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
311
  tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
 
340
  eval_score: float = 0.0
341
  eval_passed: bool = False
342
  eval_reasoning: str = ""
343
+ traces: dict[str, Any] = field(default_factory=dict)
344
 
345
 
346
  @runtime_checkable
347
  class CandidateStrategy(Protocol):
348
  """Protocol for generating candidate variants from a base agent.
349
 
350
+ Implementations can be:
351
+ - Simple (single-shot): GridSearchStrategy ignores optional params
352
+ - Complex (iterative): Runs internal experiments, checks convergence,
353
+ distills failures, etc. using the provided callbacks
354
+
355
+ All logic is internal to the strategy - the caller just calls generate()
356
+ and receives the final list of candidates.
357
+
358
+ Examples:
359
  - GridSearchStrategy: Exhaustive grid over parameter combinations
360
+ - (Future) AdaptivePromptOptimizer: Iteratively improves prompts from failures
361
  - (Future) BayesianStrategy: Bayesian optimization over parameters
362
  """
363
 
364
+ def generate(
365
+ self,
366
+ base: Agent,
367
+ budget: int,
368
+ *,
369
+ tasks: list[Task] | None = None,
370
+ evaluator: Evaluator | None = None,
371
+ run_experiment: Callable[[Candidate, Task], Awaitable[ExperimentResult]] | None = None,
372
+ ) -> list[Candidate]:
373
  """Generate candidate variants from a base agent.
374
 
375
  Args:
376
+ base: The base agent to optimize
377
+ budget: Maximum number of candidates to return
378
+ tasks: Optional tasks for strategies that run internal experiments
379
+ evaluator: Optional evaluator for strategies that need scoring
380
+ run_experiment: Optional async callback to execute a candidate on a task.
381
+ Signature: async (candidate, task) -> ExperimentResult
382
 
383
  Returns:
384
+ List of Candidate objects (at most `budget` items).
385
+ For iterative strategies, returns the final/best candidates after
386
+ internal optimization loops complete.
387
  """
388
  ...
389
 
 
417
  """
418
  self.variations = variations
419
 
420
+ def generate(
421
+ self,
422
+ base: Agent,
423
+ budget: int,
424
+ *,
425
+ tasks: list[Task] | None = None,
426
+ evaluator: Evaluator | None = None,
427
+ run_experiment: Callable[[Candidate, Task], Awaitable[ExperimentResult]] | None = None,
428
+ ) -> list[Candidate]:
429
+ """Generate all grid combinations up to budget.
430
+
431
+ Note: tasks, evaluator, and run_experiment are accepted for protocol
432
+ compatibility but ignored - GridSearchStrategy is a simple single-shot
433
+ strategy that doesn't run experiments internally.
434
+ """
435
+ # Delete unused params to satisfy linters
436
+ del tasks, evaluator, run_experiment
437
+
438
  if not self.variations:
439
  return [Candidate(agent=base, mutations={}, rationale="baseline")]
440
 
 
676
  "pareto_rank": summary.get("pareto_rank"),
677
  "is_pareto_optimal": summary.get("is_pareto_optimal", False),
678
  }
679
+
680
+
681
+ # =============================================================================
682
+ # Experiment YAML - Defines variations for optimization
683
+ # =============================================================================
684
+
685
+
686
+ @dataclass
687
+ class Experiment:
688
+ """Experiment configuration for optimization.
689
+
690
+ Separates concerns:
691
+ - Agent YAML: What the agent is (model, instructions, defaults)
692
+ - Experiment YAML: How to test it (variations, tasks, evaluation settings)
693
+
694
+ Attributes:
695
+ base_agent: Path to base agent YAML file
696
+ suite: Built-in task suite name (e.g., "coding", "quick")
697
+ tasks: Path to custom tasks JSONL file (alternative to suite)
698
+ variations: Dict of parameter variations for grid search
699
+ parallel: Max concurrent experiments
700
+ budget: Maximum candidates to generate
701
+ use_llm_eval: Whether to use LLM-as-Judge evaluation
702
+
703
+ Example YAML:
704
+ ```yaml
705
+ base_agent: examples/miniagent_base.yaml
706
+ suite: coding
707
+
708
+ variations:
709
+ compaction:
710
+ - strategy: none
711
+ - strategy: head_tail
712
+ params: { head_size: 10, tail_size: 40 }
713
+ - strategy: sliding_window
714
+ token_budget: 50000
715
+ - strategy: summarization
716
+ token_budget: 50000
717
+
718
+ tools:
719
+ - minimal
720
+ - standard
721
+ - [read_file, write_file, bash, memory]
722
+
723
+ parallel: 4
724
+ budget: 20
725
+ use_llm_eval: true
726
+ ```
727
+ """
728
+
729
+ base_agent: str | None = None
730
+ suite: str | None = None
731
+ tasks: str | None = None
732
+ variations: dict[str, list[Any]] = field(default_factory=dict)
733
+ parallel: int = 4
734
+ budget: int = 100
735
+ use_llm_eval: bool = True
736
+
737
+
738
+ def load_experiment(path: Path) -> Experiment:
739
+ """Load an Experiment from a YAML file.
740
+
741
+ Handles conversion of compaction variations from dict to CompactionConfig.
742
+
743
+ Args:
744
+ path: Path to the experiment YAML file
745
+
746
+ Returns:
747
+ Experiment instance with parsed variations
748
+
749
+ Raises:
750
+ FileNotFoundError: If the file doesn't exist
751
+ ValueError: If the config is invalid
752
+ """
753
+ if not path.exists():
754
+ raise FileNotFoundError(f"Experiment config file not found: {path}")
755
+
756
+ data = yaml.safe_load(path.read_text())
757
+
758
+ # Parse variations - convert compaction dicts to CompactionConfig
759
+ variations: dict[str, list[Any]] = {}
760
+ raw_variations = data.get("variations", {})
761
+
762
+ for key, values in raw_variations.items():
763
+ if key == "compaction":
764
+ # Convert each compaction dict to CompactionConfig
765
+ parsed_compactions = []
766
+ for v in values:
767
+ if isinstance(v, dict):
768
+ parsed_compactions.append(CompactionConfig(**v))
769
+ elif isinstance(v, str):
770
+ # Handle shorthand: "none", "head_tail", etc.
771
+ if v == "none":
772
+ parsed_compactions.append(CompactionConfig.none())
773
+ elif v == "head_tail":
774
+ parsed_compactions.append(CompactionConfig.head_tail())
775
+ elif v == "sliding_window":
776
+ parsed_compactions.append(CompactionConfig.sliding_window())
777
+ elif v == "summarization":
778
+ parsed_compactions.append(CompactionConfig.summarization())
779
+ else:
780
+ raise ValueError(f"Unknown compaction shorthand: {v}")
781
+ else:
782
+ parsed_compactions.append(v)
783
+ variations["compaction"] = parsed_compactions
784
+ else:
785
+ # Other variations pass through as-is
786
+ variations[key] = values
787
+
788
+ return Experiment(
789
+ base_agent=data.get("base_agent"),
790
+ suite=data.get("suite"),
791
+ tasks=data.get("tasks"),
792
+ variations=variations,
793
+ parallel=data.get("parallel", 4),
794
+ budget=data.get("budget", 100),
795
+ use_llm_eval=data.get("use_llm_eval", True),
796
+ )
src/flow/experiments/optimizer.py CHANGED
@@ -20,10 +20,7 @@ from typing import Any
20
 
21
  from openai import AsyncAzureOpenAI
22
 
23
- from .ablation import (
24
- compute_pareto_frontier,
25
- create_harness_from_agent,
26
- )
27
  from .evaluators import LLMEvaluator
28
  from .metrics import TraceMetrics, extract_metrics
29
  from .models import (
@@ -47,6 +44,7 @@ class TaskResult:
47
  eval_score: float
48
  eval_passed: bool
49
  eval_reasoning: str
 
50
 
51
 
52
  @dataclass
@@ -84,6 +82,18 @@ class CandidateSummary:
84
  "task_count": self.task_count,
85
  "pareto_rank": self.pareto_rank,
86
  "is_pareto_optimal": self.is_pareto_optimal,
 
 
 
 
 
 
 
 
 
 
 
 
87
  }
88
 
89
 
@@ -287,18 +297,37 @@ class FlowOptimizer:
287
  evaluator: LLMEvaluator | None,
288
  ) -> TaskResult:
289
  """Run a single candidate-task experiment."""
290
- harness = create_harness_from_agent(candidate.agent, workspace)
 
 
 
 
 
 
 
 
291
 
292
  try:
293
  runner = FlowExperimentRunner(keep_workspace=True)
294
  run_result = await runner.run(harness, task, workspace=workspace)
295
  metrics = extract_metrics(run_result.trace)
296
 
 
297
  if evaluator:
298
  eval_result = await evaluator.evaluate(run_result)
299
  eval_score = eval_result.score
300
  eval_passed = eval_result.passed
301
  eval_reasoning = eval_result.reasoning
 
 
 
 
 
 
 
 
 
 
302
  else:
303
  eval_score = 1.0 if run_result.success else 0.0
304
  eval_passed = run_result.success
@@ -312,6 +341,7 @@ class FlowOptimizer:
312
  eval_score=eval_score,
313
  eval_passed=eval_passed,
314
  eval_reasoning=eval_reasoning,
 
315
  )
316
  finally:
317
  await harness.close()
@@ -366,26 +396,48 @@ class FlowOptimizer:
366
 
367
  def _create_evaluator(self) -> LLMEvaluator | None:
368
  """Create LLM evaluator if credentials available."""
 
 
369
  api_key = os.environ.get("AZURE_OPENAI_API_KEY")
370
  endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
371
- deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")
 
 
 
 
 
 
 
372
 
373
  if not api_key or not endpoint:
374
- logger.warning("No Azure OpenAI credentials, using heuristic evaluation")
375
  return None
376
 
377
- client = AsyncAzureOpenAI(
378
- api_key=api_key,
379
- api_version="2024-02-15-preview",
380
- azure_endpoint=endpoint,
381
- )
 
 
 
 
 
 
 
 
 
 
382
 
383
- return LLMEvaluator(
384
  model_client=client,
385
  model_name=deployment,
386
  passing_threshold=0.7,
387
  )
388
 
 
 
 
389
  def _save_config(
390
  self,
391
  candidates: list[Candidate],
 
20
 
21
  from openai import AsyncAzureOpenAI
22
 
23
+ from .ablation import compute_pareto_frontier
 
 
 
24
  from .evaluators import LLMEvaluator
25
  from .metrics import TraceMetrics, extract_metrics
26
  from .models import (
 
44
  eval_score: float
45
  eval_passed: bool
46
  eval_reasoning: str
47
+ criteria_results: list[dict[str, Any]] = field(default_factory=list) # Per-criterion scores
48
 
49
 
50
  @dataclass
 
82
  "task_count": self.task_count,
83
  "pareto_rank": self.pareto_rank,
84
  "is_pareto_optimal": self.is_pareto_optimal,
85
+ # Include per-task results with eval reasoning
86
+ "task_results": [
87
+ {
88
+ "task_name": tr.task_name,
89
+ "eval_score": tr.eval_score,
90
+ "eval_passed": tr.eval_passed,
91
+ "eval_reasoning": tr.eval_reasoning,
92
+ "tokens": tr.metrics.total_tokens,
93
+ "duration": tr.run_result.duration_seconds,
94
+ }
95
+ for tr in self.task_results
96
+ ],
97
  }
98
 
99
 
 
297
  evaluator: LLMEvaluator | None,
298
  ) -> TaskResult:
299
  """Run a single candidate-task experiment."""
300
+ # Import harness modules to register them, then use registry
301
+ import flow.harness.maf # noqa: F401
302
+ try:
303
+ import flow.harness.miniagent # noqa: F401
304
+ except ImportError:
305
+ pass # miniagent harness is optional
306
+ from flow.harness import create_harness
307
+
308
+ harness = create_harness(candidate.agent, workspace)
309
 
310
  try:
311
  runner = FlowExperimentRunner(keep_workspace=True)
312
  run_result = await runner.run(harness, task, workspace=workspace)
313
  metrics = extract_metrics(run_result.trace)
314
 
315
+ criteria_results: list[dict[str, Any]] = []
316
  if evaluator:
317
  eval_result = await evaluator.evaluate(run_result)
318
  eval_score = eval_result.score
319
  eval_passed = eval_result.passed
320
  eval_reasoning = eval_result.reasoning
321
+ # Convert criteria results to dicts for serialization
322
+ criteria_results = [
323
+ {
324
+ "name": cr.name,
325
+ "score": cr.score,
326
+ "passed": cr.passed,
327
+ "reasoning": cr.reasoning,
328
+ }
329
+ for cr in eval_result.criteria_results
330
+ ]
331
  else:
332
  eval_score = 1.0 if run_result.success else 0.0
333
  eval_passed = run_result.success
 
341
  eval_score=eval_score,
342
  eval_passed=eval_passed,
343
  eval_reasoning=eval_reasoning,
344
+ criteria_results=criteria_results,
345
  )
346
  finally:
347
  await harness.close()
 
396
 
397
  def _create_evaluator(self) -> LLMEvaluator | None:
398
  """Create LLM evaluator if credentials available."""
399
+ from openai import AsyncOpenAI
400
+
401
  api_key = os.environ.get("AZURE_OPENAI_API_KEY")
402
  endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
403
+ deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT") or os.environ.get(
404
+ "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME", "gpt-4o"
405
+ )
406
+
407
+ logger.info("Creating LLM evaluator...")
408
+ logger.debug(f" API Key: {'[SET]' if api_key else '[NOT SET]'}")
409
+ logger.debug(f" Endpoint: {endpoint if endpoint else '[NOT SET]'}")
410
+ logger.debug(f" Deployment: {deployment}")
411
 
412
  if not api_key or not endpoint:
413
+ logger.warning("No Azure OpenAI credentials, using heuristic evaluation (binary 0/1 scores)")
414
  return None
415
 
416
+ # Check if using OpenAI-compatible endpoint (e.g., /openai/v1/)
417
+ # vs traditional Azure OpenAI endpoint
418
+ if "/v1" in endpoint:
419
+ logger.info("Creating AsyncOpenAI client for evaluator (OpenAI-compatible endpoint)")
420
+ client = AsyncOpenAI(
421
+ base_url=endpoint,
422
+ api_key=api_key,
423
+ )
424
+ else:
425
+ logger.info("Creating AsyncAzureOpenAI client for evaluator")
426
+ client = AsyncAzureOpenAI(
427
+ api_key=api_key,
428
+ api_version="2024-02-15-preview",
429
+ azure_endpoint=endpoint,
430
+ )
431
 
432
+ evaluator = LLMEvaluator(
433
  model_client=client,
434
  model_name=deployment,
435
  passing_threshold=0.7,
436
  )
437
 
438
+ logger.info(f"LLM evaluator created successfully (model={deployment}, threshold=0.7)")
439
+ return evaluator
440
+
441
  def _save_config(
442
  self,
443
  candidates: list[Candidate],
src/flow/experiments/runner.py CHANGED
@@ -21,7 +21,7 @@ from .trace_collector import FlowTraceCollector
21
  from .types import RunResult, Task
22
 
23
  if TYPE_CHECKING:
24
- from flow.harness.maf import MAFHarness
25
 
26
  logger = logging.getLogger(__name__)
27
 
@@ -66,10 +66,12 @@ class FlowExperimentRunner:
66
  - Supporting streaming execution
67
 
68
  Example:
69
- from flow.harness.maf import MAFHarness
70
  from flow.experiments import FlowExperimentRunner, Task
 
71
 
72
- harness = MAFHarness()
 
73
  runner = FlowExperimentRunner(keep_workspace=True)
74
 
75
  task = Task(name="hello", prompt="Create a hello world script")
@@ -95,7 +97,7 @@ class FlowExperimentRunner:
95
 
96
  async def run(
97
  self,
98
- harness: MAFHarness,
99
  task: Task,
100
  workspace: Path | None = None,
101
  ) -> RunResult:
@@ -109,7 +111,7 @@ class FlowExperimentRunner:
109
  5. Returns a RunResult with all data
110
 
111
  Args:
112
- harness: The MAFHarness to run
113
  task: The task to execute
114
  workspace: Optional workspace directory (creates temp if None)
115
 
@@ -167,6 +169,10 @@ class FlowExperimentRunner:
167
  elif event.type == EventType.TOOL_RESULT:
168
  # Optionally capture tool results
169
  pass
 
 
 
 
170
  finally:
171
  os.chdir(original_cwd)
172
 
 
21
  from .types import RunResult, Task
22
 
23
  if TYPE_CHECKING:
24
+ from flow.harness.base import BaseHarness
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
66
  - Supporting streaming execution
67
 
68
  Example:
69
+ from flow.harness import create_harness
70
  from flow.experiments import FlowExperimentRunner, Task
71
+ from flow.experiments.models import Agent
72
 
73
+ agent = Agent(name="my-agent")
74
+ harness = create_harness(agent, workspace=Path("/tmp"))
75
  runner = FlowExperimentRunner(keep_workspace=True)
76
 
77
  task = Task(name="hello", prompt="Create a hello world script")
 
97
 
98
  async def run(
99
  self,
100
+ harness: "BaseHarness",
101
  task: Task,
102
  workspace: Path | None = None,
103
  ) -> RunResult:
 
111
  5. Returns a RunResult with all data
112
 
113
  Args:
114
+ harness: The harness to run (any BaseHarness implementation)
115
  task: The task to execute
116
  workspace: Optional workspace directory (creates temp if None)
117
 
 
169
  elif event.type == EventType.TOOL_RESULT:
170
  # Optionally capture tool results
171
  pass
172
+ elif event.type == EventType.ERROR:
173
+ # Capture error from harness
174
+ error = event.content
175
+ logger.error(f"Harness error: {error}")
176
  finally:
177
  os.chdir(original_cwd)
178
 
src/flow/experiments/types.py CHANGED
@@ -168,6 +168,56 @@ def get_available_suites() -> list[str]:
168
  return sorted(p.stem for p in _DATA_DIR.glob("*.jsonl"))
169
 
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def get_task_suite(suite_name: str) -> list[Task]:
172
  """Get a built-in task suite by name.
173
 
 
168
  return sorted(p.stem for p in _DATA_DIR.glob("*.jsonl"))
169
 
170
 
171
@dataclass
class SuiteInfo:
    """Summary record for a single task suite.

    Holds the suite's name, how many tasks it contains, and a short
    human-readable description.
    """

    # Suite identifier (the stem of the suite's .jsonl file).
    name: str
    # Number of tasks in the suite.
    task_count: int
    # One-line description shown to users.
    description: str
178
+
179
+
180
+ # Suite descriptions for known suites
181
+ _SUITE_DESCRIPTIONS: dict[str, str] = {
182
+ "quick": "Fast testing",
183
+ "core": "Standard evaluation",
184
+ "coding": "Self-contained repo analysis tasks (clone, analyze, report)",
185
+ "gaia_level1": "GAIA easy benchmark",
186
+ "gaia_level2": "GAIA medium benchmark",
187
+ "gaia_level3": "GAIA hard benchmark",
188
+ "gaia_all": "GAIA full benchmark",
189
+ }
190
+
191
+
192
def get_suite_info(suite_name: str) -> SuiteInfo:
    """Get information about a specific suite.

    Args:
        suite_name: Name of the suite (the stem of its ``.jsonl`` file
            inside the bundled task data directory).

    Returns:
        SuiteInfo with name, task count, and description. Suites without a
        registered description are labelled "Custom task suite".

    Raises:
        ValueError: If no ``<suite_name>.jsonl`` file exists.
    """
    path = _DATA_DIR / f"{suite_name}.jsonl"
    if not path.exists():
        raise ValueError(f"Suite not found: {suite_name}")

    # One task per non-blank line. Open inside a context manager so the
    # handle is closed deterministically, and decode as UTF-8 explicitly:
    # JSON Lines files are UTF-8, and the platform default encoding can
    # fail on non-ASCII task text.
    with path.open(encoding="utf-8") as handle:
        task_count = sum(1 for line in handle if line.strip())

    description = _SUITE_DESCRIPTIONS.get(suite_name, "Custom task suite")

    return SuiteInfo(name=suite_name, task_count=task_count, description=description)
210
+
211
+
212
def get_all_suite_info() -> list[SuiteInfo]:
    """Collect SuiteInfo records for every available suite.

    Returns:
        One SuiteInfo per name reported by get_available_suites(), in the
        same order.
    """
    infos: list[SuiteInfo] = []
    for suite_name in get_available_suites():
        infos.append(get_suite_info(suite_name))
    return infos
219
+
220
+
221
  def get_task_suite(suite_name: str) -> list[Task]:
222
  """Get a built-in task suite by name.
223
 
src/flow/harness/__init__.py CHANGED
@@ -5,14 +5,36 @@ events to a uniform Event format for CLI/UI consumption.
5
 
6
  Available harnesses:
7
  - maf: Microsoft Agent Framework harness
8
- - (future) langchain: LangChain harness
9
  - (future) claude: Claude SDK harness
 
 
 
 
 
 
 
10
  """
11
 
12
  from flow.harness.base import BaseHarness, Event, EventType
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  __all__ = [
15
  "BaseHarness",
16
  "Event",
17
  "EventType",
 
 
 
 
18
  ]
 
5
 
6
  Available harnesses:
7
  - maf: Microsoft Agent Framework harness
8
+ - langgraph: LangGraph harness (registered when flow.harness.langgraph is imported)
9
  - (future) claude: Claude SDK harness
10
+
11
+ Usage:
12
+ from flow.harness import create_harness
13
+ from flow.experiments.models import Agent
14
+
15
+ agent = Agent(name="my-agent", framework="maf")
16
+ harness = create_harness(agent, workspace=Path("/tmp"))
17
  """
18
 
19
  from flow.harness.base import BaseHarness, Event, EventType
20
+ from flow.harness.registry import (
21
+ available_frameworks,
22
+ create_harness,
23
+ get_harness_class,
24
+ register,
25
+ )
26
+
27
+ # Auto-register harnesses by importing them
28
+ # Each harness module calls register() on import
29
+ from flow.harness import maf as _maf # noqa: F401
30
+ from flow.harness import miniagent as _miniagent # noqa: F401
31
 
32
  __all__ = [
33
  "BaseHarness",
34
  "Event",
35
  "EventType",
36
+ "available_frameworks",
37
+ "create_harness",
38
+ "get_harness_class",
39
+ "register",
40
  ]
src/flow/harness/base.py CHANGED
@@ -7,10 +7,16 @@ allowing Flow to run on different agent frameworks.
7
  from __future__ import annotations
8
 
9
  from abc import ABC, abstractmethod
10
- from collections.abc import AsyncIterator, Callable, Coroutine
11
  from dataclasses import dataclass, field
12
  from enum import Enum
13
- from typing import Any
 
 
 
 
 
 
14
 
15
 
16
  class EventType(Enum):
@@ -49,52 +55,49 @@ class BaseHarness(ABC):
49
  to the uniform Flow Event format for CLI/UI consumption.
50
 
51
  Each harness implementation handles:
52
- - Taking a pre-configured agent from the framework
53
- - Running tasks on the agent
54
  - Converting framework-specific events to Flow Events
55
  - Managing conversation threads
56
 
57
  Implementations:
58
  - MAFHarness (flow.harness.maf): Microsoft Agent Framework
59
- - (Future) LangChainHarness: LangChain
60
  - (Future) ClaudeHarness: Claude SDK
61
  """
62
 
 
63
  @abstractmethod
64
- async def run(self, task: str, thread_id: str | None = None) -> str:
65
- """Run a task and return the final response.
 
 
 
 
 
66
 
67
  Args:
68
- task: The task/prompt to execute
69
- thread_id: Optional thread ID for conversation continuity
 
70
 
71
  Returns:
72
- The agent's final response text
73
  """
74
  ...
75
 
76
  @abstractmethod
77
- def run_stream(self, task: str, thread_id: str | None = None) -> AsyncIterator[Event]:
78
  """Run a task with streaming events.
79
 
80
  Args:
81
  task: The task/prompt to execute
82
- thread_id: Optional thread ID for conversation continuity
83
 
84
  Yields:
85
  Event objects representing agent activity
86
  """
87
  ...
88
 
89
- @abstractmethod
90
- def register_tools(self, tools: list[Callable[..., Coroutine[Any, Any, str]]]) -> None:
91
- """Register tools with the harness.
92
-
93
- Args:
94
- tools: List of tool functions to register
95
- """
96
- ...
97
-
98
  @abstractmethod
99
  def get_thread_id(self) -> str:
100
  """Get the current thread ID.
 
7
  from __future__ import annotations
8
 
9
  from abc import ABC, abstractmethod
10
+ from collections.abc import AsyncIterator
11
  from dataclasses import dataclass, field
12
  from enum import Enum
13
+ from typing import TYPE_CHECKING
14
+
15
+ if TYPE_CHECKING:
16
+ from pathlib import Path
17
+
18
+ from flow.experiments.models import Agent
19
+ from flow.llm import LLMClientConfig
20
 
21
 
22
  class EventType(Enum):
 
55
  to the uniform Flow Event format for CLI/UI consumption.
56
 
57
  Each harness implementation handles:
58
+ - Creating an agent from an Agent spec via from_agent()
59
+ - Running tasks on the agent with streaming events
60
  - Converting framework-specific events to Flow Events
61
  - Managing conversation threads
62
 
63
  Implementations:
64
  - MAFHarness (flow.harness.maf): Microsoft Agent Framework
65
+ - (Future) LangGraphHarness: LangGraph
66
  - (Future) ClaudeHarness: Claude SDK
67
  """
68
 
69
+ @classmethod
70
  @abstractmethod
71
+ def from_agent(
72
+ cls,
73
+ agent: "Agent",
74
+ workspace: "Path",
75
+ llm_config: "LLMClientConfig | None" = None,
76
+ ) -> "BaseHarness":
77
+ """Create a harness from an Agent definition.
78
 
79
  Args:
80
+ agent: The Agent spec defining the configuration
81
+ workspace: Working directory for the agent
82
+ llm_config: Optional LLM configuration (falls back to env vars if not provided)
83
 
84
  Returns:
85
+ A configured harness instance
86
  """
87
  ...
88
 
89
  @abstractmethod
90
+ def run_stream(self, task: str) -> AsyncIterator[Event]:
91
  """Run a task with streaming events.
92
 
93
  Args:
94
  task: The task/prompt to execute
 
95
 
96
  Yields:
97
  Event objects representing agent activity
98
  """
99
  ...
100
 
 
 
 
 
 
 
 
 
 
101
  @abstractmethod
102
  def get_thread_id(self) -> str:
103
  """Get the current thread ID.
src/flow/harness/langgraph/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph harness for Flow.
2
+
3
+ This module provides a harness adapter that allows LangGraph agents
4
+ to be used within the Flow experimentation framework.
5
+
6
+ Usage:
7
+ from flow.experiments.models import Agent
8
+ from flow.harness import create_harness
9
+
10
+ agent = Agent(
11
+ name="my-langgraph-agent",
12
+ framework="langgraph", # <-- Use LangGraph harness
13
+ tools="standard",
14
+ model="openai:gpt-4o",
15
+ )
16
+ harness = create_harness(agent, workspace=Path("/tmp/workspace"))
17
+
18
+ async for event in harness.run_stream("Create hello.py"):
19
+ print(event.type, event.content)
20
+ """
21
+
22
+ from flow.harness.langgraph.compaction import create_compaction_hook
23
+ from flow.harness.langgraph.harness import LangGraphHarness
24
+ from flow.harness.langgraph.otel_callback import OTelCallbackHandler
25
+ from flow.harness.langgraph.wrappers import build_langgraph_tools, wrap_for_langgraph
26
+ from flow.harness.registry import register
27
+
28
+ # Register the harness with Flow
29
+ register("langgraph", LangGraphHarness)
30
+
31
+ __all__ = [
32
+ "LangGraphHarness",
33
+ "OTelCallbackHandler",
34
+ "build_langgraph_tools",
35
+ "create_compaction_hook",
36
+ "wrap_for_langgraph",
37
+ ]
src/flow/harness/langgraph/compaction.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Message compaction for LangGraph.
2
+
3
+ Provides a pre-model hook that implements head-tail message compaction,
4
+ similar to MAF's HeadTailCompactingChatMessageStore.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ __all__ = ["create_compaction_hook"]
12
+
13
+
14
def create_compaction_hook(head_size: int, tail_size: int):
    """Create a pre-model hook for head-tail message compaction.

    The returned hook keeps the first ``head_size`` messages and the last
    ``tail_size`` messages of the conversation and drops everything in
    between. The stored conversation is not modified; only the messages
    handed to the model (``llm_input_messages``) are trimmed.

    Args:
        head_size: Number of messages to keep from the start (>= 0)
        tail_size: Number of messages to keep from the end (>= 0)

    Returns:
        A function that can be used as a pre_model_hook in create_react_agent

    Raises:
        ValueError: If ``head_size`` or ``tail_size`` is negative.

    Example:
        hook = create_compaction_hook(10, 40)
        graph = create_react_agent(
            model=model,
            tools=tools,
            pre_model_hook=hook,
        )
    """
    if head_size < 0 or tail_size < 0:
        raise ValueError(
            f"head_size and tail_size must be non-negative, got head_size={head_size}, tail_size={tail_size}"
        )

    def compact_messages(state: dict[str, Any]) -> dict[str, Any]:
        """Compact messages keeping head and tail, dropping middle."""
        messages = state.get("messages", [])
        total = len(messages)

        # No compaction needed if within limits
        if total <= head_size + tail_size:
            return {"llm_input_messages": messages}

        head = messages[:head_size]
        # Slice from an explicit start index rather than messages[-tail_size:]:
        # with tail_size == 0 the negative form is messages[-0:], i.e. the
        # WHOLE list, which would return more messages than it started with.
        tail = messages[total - tail_size:]

        # NOTE(review): the cut does not look at message roles, so a tool
        # result could be separated from the assistant message that requested
        # it -- confirm the target model API tolerates that.
        return {"llm_input_messages": head + tail}

    return compact_messages
src/flow/harness/langgraph/harness.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph harness for Flow.
2
+
3
+ Provides a harness adapter that allows LangGraph agents to be used
4
+ within the Flow experimentation framework.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import uuid
11
+ from collections.abc import AsyncIterator
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any
14
+
15
+ from opentelemetry import trace
16
+
17
+ from flow.harness.base import BaseHarness, Event, EventType
18
+
19
+ if TYPE_CHECKING:
20
+ from flow.experiments.models import Agent
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Get tracer for LangGraph instrumentation
25
+ _tracer = trace.get_tracer("flow.langgraph", "0.1.0")
26
+
27
+ __all__ = ["LangGraphHarness"]
28
+
29
+
30
class LangGraphHarness(BaseHarness):
    """Harness adapter for LangGraph.

    This harness allows LangGraph agents to be used within the Flow
    experimentation framework. It converts LangGraph streaming events
    to Flow's uniform Event format and emits OpenTelemetry spans.

    Example:
        from flow.experiments.models import Agent
        from flow.harness import create_harness

        agent = Agent(
            name="my-langgraph-agent",
            framework="langgraph",
            tools="standard",
            model="openai:gpt-4o",
        )
        harness = create_harness(agent, workspace=Path("/tmp/workspace"))

        async for event in harness.run_stream("Create hello.py"):
            print(event.type, event.content)
    """

    @classmethod
    def from_agent(
        cls,
        agent: Agent,
        workspace: Path,
        llm_config: Any = None,
    ) -> LangGraphHarness:
        """Create a LangGraph harness from an Agent spec.

        Args:
            agent: Agent configuration
            workspace: Working directory for file operations
            llm_config: Accepted so the signature matches
                ``BaseHarness.from_agent``. It is currently not applied:
                the model is built from ``agent.model`` and environment
                variables (see ``_create_model``).

        Returns:
            Configured LangGraphHarness instance
        """
        from flow.experiments.models import resolve_tools
        from flow.harness.langgraph.compaction import create_compaction_hook
        from flow.harness.langgraph.wrappers import build_langgraph_tools
        from langgraph.checkpoint.memory import InMemorySaver
        from langgraph.prebuilt import create_react_agent

        if llm_config is not None:
            logger.warning(
                "llm_config is not supported by the LangGraph harness yet; "
                "using agent.model / environment variables instead"
            )

        # Work on a copy so removing an entry cannot alter a mapping that
        # resolve_tools may share with other callers.
        tools_spec = dict(resolve_tools(agent.tools))
        # Build tools (skip sub_agent - MAF-specific)
        if "sub_agent" in tools_spec:
            logger.warning("sub_agent tool not supported in LangGraph harness, skipping")
            del tools_spec["sub_agent"]

        memory_path = workspace / "memory"
        memory_path.mkdir(parents=True, exist_ok=True)
        tools = build_langgraph_tools(tools_spec, workspace, memory_path)

        # Create model
        model = cls._create_model(agent.model)

        # Create compaction hook if enabled.
        # NOTE(review): every strategy other than "none" is mapped onto
        # head-tail compaction here -- confirm that is intended for
        # strategies such as sliding-window or summarization.
        pre_model_hook = None
        if agent.compaction and agent.compaction.strategy != "none":
            params = agent.compaction.params or {}
            head_size = params.get("head_size", 10)
            tail_size = params.get("tail_size", 40)
            pre_model_hook = create_compaction_hook(head_size, tail_size)

        # Build graph
        graph = create_react_agent(
            model=model,
            tools=tools,
            prompt=agent.instructions,
            pre_model_hook=pre_model_hook,
            checkpointer=InMemorySaver(),
        )

        return cls(graph=graph, agent_name=agent.name, workspace=workspace)

    @staticmethod
    def _create_model(model_spec: str | None):
        """Create a LangChain chat model from spec.

        Args:
            model_spec: Model specification. A ``"provider:model"`` string
                (e.g. ``"openai:gpt-4o"``) is passed to ``init_chat_model``.
                A bare name (e.g. ``"gpt-4o"``) or None selects Azure OpenAI
                configured from environment variables; the bare name is used
                as the deployment only when
                ``AZURE_OPENAI_CHAT_DEPLOYMENT_NAME`` is not set.

        Returns:
            A LangChain chat model instance
        """
        import os

        if model_spec and ":" in model_spec:
            # "provider:model" syntax - use init_chat_model
            from langchain.chat_models import init_chat_model

            return init_chat_model(model_spec)

        # Default: Azure OpenAI from environment
        from langchain_openai import AzureChatOpenAI

        # The environment variable keeps priority; the bare spec is only a
        # fallback so that a plain "gpt-4o" is no longer silently ignored.
        deployment = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME") or model_spec

        return AzureChatOpenAI(
            deployment_name=deployment,
            api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
            api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
        )

    def __init__(
        self,
        graph: Any = None,
        agent_name: str = "LangGraphAgent",
        workspace: Path | None = None,
    ) -> None:
        """Initialize the harness.

        Args:
            graph: A compiled LangGraph StateGraph
            agent_name: Name of the agent (for tracing)
            workspace: Working directory
        """
        from flow.harness.langgraph.otel_callback import OTelCallbackHandler

        self._graph = graph
        self._agent_name = agent_name
        self._workspace = workspace
        # One conversation thread per harness instance.
        self._thread_id = str(uuid.uuid4())
        self._otel_callback = OTelCallbackHandler()

    async def run_stream(self, task: str) -> AsyncIterator[Event]:
        """Run a task with streaming events.

        Any exception raised while streaming is logged, recorded on the
        agent span, and reported as a single ERROR event; in that case no
        DONE event follows.

        Args:
            task: The task/prompt to execute

        Yields:
            Event objects representing the agent's actions
        """
        from langchain_core.messages import HumanMessage

        config = {
            "configurable": {"thread_id": self._thread_id},
            "callbacks": [self._otel_callback],
        }
        input_state = {"messages": [HumanMessage(content=task)]}

        # Wrap in agent span for tracing
        with _tracer.start_as_current_span(
            f"invoke_agent {self._agent_name}",
            kind=trace.SpanKind.INTERNAL,
        ) as span:
            span.set_attribute("gen_ai.operation.name", "invoke_agent")
            span.set_attribute("gen_ai.agent.name", self._agent_name)
            span.set_attribute("gen_ai.conversation.id", self._thread_id)

            try:
                async for chunk in self._graph.astream(
                    input_state,
                    config,
                    stream_mode=["messages", "updates"],
                ):
                    for event in self._convert_chunk(chunk):
                        yield event

                yield Event(type=EventType.DONE)

            except Exception as e:
                logger.exception("Error during LangGraph execution")
                span.record_exception(e)
                span.set_status(trace.StatusCode.ERROR, str(e))
                yield Event(type=EventType.ERROR, content=str(e))

    def _convert_chunk(self, chunk: tuple) -> list[Event]:
        """Convert a LangGraph stream chunk to Flow Events.

        Args:
            chunk: A tuple of (stream_mode, data) from LangGraph

        Returns:
            List of Flow Event objects (empty for unrecognized chunks)
        """
        from langchain_core.messages import ToolMessage

        events: list[Event] = []

        if not isinstance(chunk, tuple) or len(chunk) != 2:
            return events

        mode, data = chunk

        if mode == "messages":
            msg_chunk, metadata = data

            # Tool results are reported from the "updates" stream below.
            # If one also arrives on the "messages" stream, skip it so tool
            # output is not surfaced as assistant text.
            if isinstance(msg_chunk, ToolMessage):
                return events

            # Text content
            if hasattr(msg_chunk, "content") and msg_chunk.content:
                events.append(Event(
                    type=EventType.TEXT_DELTA,
                    content=msg_chunk.content,
                ))

            # Tool call chunks
            if hasattr(msg_chunk, "tool_call_chunks"):
                for tc in msg_chunk.tool_call_chunks or []:
                    if tc.get("name"):
                        events.append(Event(
                            type=EventType.TOOL_CALL_START,
                            tool_name=tc["name"],
                            tool_call_id=tc.get("id"),
                        ))
                    if tc.get("args"):
                        events.append(Event(
                            type=EventType.TOOL_CALL_ARGS,
                            content=tc["args"],
                        ))

        elif mode == "updates":
            for node_name, update in data.items():
                if node_name == "tools" and "messages" in update:
                    for msg in update["messages"]:
                        if isinstance(msg, ToolMessage):
                            events.append(Event(
                                type=EventType.TOOL_RESULT,
                                content=str(msg.content),
                                tool_call_id=msg.tool_call_id,
                            ))
                            events.append(Event(type=EventType.TOOL_CALL_DONE))

        return events

    def get_thread_id(self) -> str:
        """Get the current thread/conversation ID."""
        return self._thread_id

    async def close(self) -> None:
        """Clean up resources.

        The harness holds no external resources to release. The thread ID
        is deliberately left intact so that get_thread_id() keeps returning
        a string after close().
        """
        return None
src/flow/harness/langgraph/otel_callback.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OTel callback for LangGraph - emits GenAI semantic convention spans.
2
+
3
+ This module provides a LangChain callback handler that emits OpenTelemetry
4
+ spans conforming to the GenAI semantic conventions. This fills the gap
5
+ that LangGraph doesn't have native GenAI OTel support like MAF does.
6
+
7
+ Reference: https://opentelemetry.io/docs/specs/semconv/gen-ai/
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any
13
+
14
+ from langchain_core.callbacks import BaseCallbackHandler
15
+ from opentelemetry import trace
16
+
17
+ __all__ = ["GenAIAttr", "OTelCallbackHandler"]
18
+
19
+
20
class GenAIAttr:
    """Attribute keys from the OpenTelemetry GenAI semantic conventions.

    Kept identical to the keys emitted by MAF so traces from both
    harnesses can be processed the same way.
    Reference: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/
    """

    # --- Operation / provider -------------------------------------------
    OPERATION_NAME = "gen_ai.operation.name"
    PROVIDER_NAME = "gen_ai.provider.name"

    # --- Model identity --------------------------------------------------
    REQUEST_MODEL = "gen_ai.request.model"
    RESPONSE_MODEL = "gen_ai.response.model"

    # --- Token usage -----------------------------------------------------
    INPUT_TOKENS = "gen_ai.usage.input_tokens"
    OUTPUT_TOKENS = "gen_ai.usage.output_tokens"

    # --- Tool execution --------------------------------------------------
    TOOL_NAME = "gen_ai.tool.name"
    TOOL_TYPE = "gen_ai.tool.type"
    TOOL_CALL_ID = "gen_ai.tool.call.id"

    # --- Errors ----------------------------------------------------------
    ERROR_TYPE = "error.type"
46
+
47
+
48
+ # Get tracer for LangGraph instrumentation
49
+ _tracer = trace.get_tracer("flow.langgraph", "0.1.0")
50
+
51
+
52
class OTelCallbackHandler(BaseCallbackHandler):
    """Emit OpenTelemetry spans for LangGraph LLM and tool calls.

    This callback handler hooks into LangChain's callback system and
    emits spans that conform to the GenAI semantic conventions. One span
    is opened per LLM or tool run (keyed by the run ID) and closed by the
    matching end/error callback.

    Usage:
        callback = OTelCallbackHandler()
        config = {"callbacks": [callback]}
        graph.invoke(input, config)
    """

    def __init__(self) -> None:
        """Initialize the callback handler."""
        # Open spans keyed by str(run_id); an entry is removed when the run
        # ends or errors.
        self._spans: dict[str, trace.Span] = {}

    @staticmethod
    def _extract_token_usage(response: Any) -> tuple[int, int] | None:
        """Return (input_tokens, output_tokens) from an LLM result, if any.

        Looks first at ``llm_output["token_usage"]`` and, when that is
        missing, at the ``usage_metadata`` of the generated messages.
        Returns None when neither source carries usage data.
        """
        llm_output = getattr(response, "llm_output", None) or {}
        usage = llm_output.get("token_usage") or {}
        if usage:
            return usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0)

        # Fallback: the per-message usage_metadata, for results whose
        # llm_output carries no token_usage.
        for candidates in getattr(response, "generations", None) or []:
            for generation in candidates:
                message = getattr(generation, "message", None)
                metadata = getattr(message, "usage_metadata", None)
                if metadata:
                    return metadata.get("input_tokens", 0), metadata.get("output_tokens", 0)

        return None

    def on_llm_start(
        self,
        serialized: dict[str, Any],
        prompts: list[str],
        *,
        run_id: Any,
        **kwargs: Any,
    ) -> None:
        """Called when LLM starts generating; opens a ``chat`` span."""
        # Guard against a missing serialized payload so telemetry can never
        # break the actual model call.
        serialized = serialized or {}
        model_kwargs = serialized.get("kwargs") or {}
        invocation_params = kwargs.get("invocation_params") or {}

        # Extract model from serialized data, then from invocation params
        model = (
            model_kwargs.get("model")
            or model_kwargs.get("model_name")
            or invocation_params.get("model")
            or invocation_params.get("model_name")
            or "unknown"
        )

        # Try to get provider from serialized id (its last element)
        serialized_id = serialized.get("id") or []
        provider = serialized_id[-1] if serialized_id else "unknown"

        # Start span
        span = _tracer.start_span(f"chat {model}", kind=trace.SpanKind.CLIENT)
        span.set_attribute(GenAIAttr.OPERATION_NAME, "chat")
        span.set_attribute(GenAIAttr.REQUEST_MODEL, model)
        span.set_attribute(GenAIAttr.PROVIDER_NAME, provider)

        self._spans[str(run_id)] = span

    def on_llm_end(
        self,
        response: Any,
        *,
        run_id: Any,
        **kwargs: Any,
    ) -> None:
        """Called when LLM finishes generating; records usage and ends the span."""
        span = self._spans.pop(str(run_id), None)
        if span:
            usage = self._extract_token_usage(response)
            if usage is not None:
                input_tokens, output_tokens = usage
                span.set_attribute(GenAIAttr.INPUT_TOKENS, input_tokens)
                span.set_attribute(GenAIAttr.OUTPUT_TOKENS, output_tokens)

            span.end()

    def on_llm_error(
        self,
        error: BaseException,
        *,
        run_id: Any,
        **kwargs: Any,
    ) -> None:
        """Called when LLM encounters an error; marks the span as failed."""
        span = self._spans.pop(str(run_id), None)
        if span:
            span.set_attribute(GenAIAttr.ERROR_TYPE, type(error).__name__)
            span.record_exception(error)
            span.set_status(trace.StatusCode.ERROR, str(error))
            span.end()

    def on_tool_start(
        self,
        serialized: dict[str, Any],
        input_str: str,
        *,
        run_id: Any,
        **kwargs: Any,
    ) -> None:
        """Called when a tool starts executing; opens an ``execute_tool`` span."""
        # Prefer the serialized name; fall back to the run name passed as a
        # keyword argument, then to "unknown".
        tool_name = (serialized or {}).get("name") or kwargs.get("name") or "unknown"

        span = _tracer.start_span(f"execute_tool {tool_name}", kind=trace.SpanKind.INTERNAL)
        span.set_attribute(GenAIAttr.OPERATION_NAME, "execute_tool")
        span.set_attribute(GenAIAttr.TOOL_NAME, tool_name)
        span.set_attribute(GenAIAttr.TOOL_TYPE, "function")

        self._spans[str(run_id)] = span

    def on_tool_end(
        self,
        output: str,
        *,
        run_id: Any,
        **kwargs: Any,
    ) -> None:
        """Called when a tool finishes executing; ends its span."""
        span = self._spans.pop(str(run_id), None)
        if span:
            span.end()

    def on_tool_error(
        self,
        error: BaseException,
        *,
        run_id: Any,
        **kwargs: Any,
    ) -> None:
        """Called when a tool encounters an error; marks the span as failed."""
        span = self._spans.pop(str(run_id), None)
        if span:
            span.set_attribute(GenAIAttr.ERROR_TYPE, type(error).__name__)
            span.record_exception(error)
            span.set_status(trace.StatusCode.ERROR, str(error))
            span.end()
src/flow/harness/langgraph/wrappers.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph-specific tool wrappers.
2
+
3
+ This module wraps shared tools for use with LangGraph/LangChain.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from collections.abc import Callable, Coroutine
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from langchain_core.tools import tool as langchain_tool
14
+
15
+ from flow.tools import build_tools, get_tool_meta
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ __all__ = ["build_langgraph_tools", "wrap_for_langgraph"]
20
+
21
+
22
def wrap_for_langgraph(
    tool_func: Callable[..., Coroutine[Any, Any, str]]
) -> Any:  # Returns whatever LangChain's @tool produces (a LangChain tool object)
    """Wrap a Flow tool for LangGraph/LangChain.

    Looks up the metadata attached by Flow's ``@tool`` decorator (via
    ``get_tool_meta``) and applies LangChain's ``@tool`` decorator using that
    name and description.

    Args:
        tool_func: A tool function decorated with Flow's ``@tool``.

    Returns:
        The result of applying LangChain's ``@tool`` to ``tool_func``. The
        return annotation is ``Any`` to match ``build_langgraph_tools``, since
        the result is a LangChain tool object rather than a coroutine function.

    Raises:
        ValueError: If the function has no tool metadata.
    """
    meta = get_tool_meta(tool_func)
    if meta is None:
        raise ValueError(f"Function {tool_func} has no tool metadata. Decorate with @tool first.")

    # LangChain's @tool takes the tool name as its first positional argument
    # and the description as a keyword argument; the returned decorator is
    # then applied to the function itself.
    decorate = langchain_tool(meta.name, description=meta.description)
    return decorate(tool_func)
45
+
46
+
47
def build_langgraph_tools(
    tools_spec: dict[str, dict[str, Any]],
    workspace: Path,
    memory_path: Path,
) -> list[Any]:  # Returns list of LangChain BaseTool
    """Build LangGraph-compatible tools from a specification dict.

    Builds the shared tools with ``build_tools`` and passes each one through
    ``wrap_for_langgraph``. A tool whose wrapping raises ``ValueError`` is
    logged at warning level and left out of the result.

    Args:
        tools_spec: Dict mapping tool names to their config dicts.
        workspace: Root directory for file operations.
        memory_path: Directory for persistent memory.

    Returns:
        List of tools wrapped for LangGraph.
    """
    wrapped_tools = []
    for raw_tool in build_tools(tools_spec, workspace, memory_path):
        try:
            adapted = wrap_for_langgraph(raw_tool)
        except ValueError as e:
            logger.warning(f"Could not wrap tool: {e}")
            continue
        wrapped_tools.append(adapted)
    return wrapped_tools
src/flow/harness/maf/__init__.py CHANGED
@@ -6,6 +6,10 @@ Provides integration with Microsoft Agent Framework for running Flow agents.
6
  from flow.harness.maf.agent import create_agent
7
  from flow.harness.maf.harness import MAFHarness
8
  from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
 
 
 
 
9
 
10
  __all__ = [
11
  "create_agent",
 
6
  from flow.harness.maf.agent import create_agent
7
  from flow.harness.maf.harness import MAFHarness
8
  from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
9
+ from flow.harness.registry import register
10
+
11
+ # Auto-register MAFHarness as the "maf" framework
12
+ register("maf", MAFHarness)
13
 
14
  __all__ = [
15
  "create_agent",
src/flow/harness/maf/agent.py CHANGED
@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING, Any
11
 
12
  from flow.experiments.models import TOOL_PRESETS, resolve_tools
13
  from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
14
- from flow.harness.maf.tools import build_tools
15
  from flow.prompts import build_instructions
16
 
17
  if TYPE_CHECKING:
@@ -54,7 +54,7 @@ def create_agent(
54
  Args:
55
  endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
56
  api_key: Azure OpenAI API key. Defaults to AZURE_OPENAI_API_KEY env var.
57
- deployment: Azure OpenAI deployment name. Defaults to AZURE_OPENAI_DEPLOYMENT env var.
58
  api_version: Azure OpenAI API version.
59
  name: Agent name.
60
  instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
@@ -86,7 +86,7 @@ def create_agent(
86
  >>> agent = create_agent(tools={"bash_execute": {"timeout": 60}, "memory": {}})
87
  """
88
  try:
89
- from agent_framework import ChatAgent, ai_function
90
  from agent_framework.azure import AzureOpenAIChatClient
91
  except ImportError as e:
92
  raise ImportError(
@@ -97,7 +97,7 @@ def create_agent(
97
  # Resolve configuration from environment if not provided
98
  endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
99
  api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
100
- deployment = deployment or os.environ.get("AZURE_OPENAI_DEPLOYMENT")
101
 
102
  if not endpoint:
103
  raise ValueError(
@@ -112,7 +112,7 @@ def create_agent(
112
  if not deployment:
113
  raise ValueError(
114
  "Azure OpenAI deployment is required. "
115
- "Set AZURE_OPENAI_DEPLOYMENT or pass deployment parameter."
116
  )
117
 
118
  # Resolve paths
@@ -125,26 +125,23 @@ def create_agent(
125
 
126
  # Create tools from specification or use provided functions
127
  if isinstance(tools, (str, list, dict)):
128
- # Resolve to dict form and build tools
129
  tools_spec = resolve_tools(tools)
130
- tool_functions = build_tools(tools_spec, workspace, memory_path)
131
  else:
132
- # Already a sequence of callable tools
133
- tool_functions = tools
134
-
135
- # Wrap tools with ai_function decorator for Agent Framework
136
- converted_tools = []
137
- for tool_func in tool_functions:
138
- tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
139
- tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
140
- wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
141
- converted_tools.append(wrapped)
142
 
143
  # Create the chat client
144
  client = AzureOpenAIChatClient(
145
  api_key=api_key,
146
  endpoint=endpoint,
147
- deployment=deployment,
148
  api_version=api_version,
149
  )
150
 
 
11
 
12
  from flow.experiments.models import TOOL_PRESETS, resolve_tools
13
  from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
14
+ from flow.harness.maf.wrappers import build_maf_tools
15
  from flow.prompts import build_instructions
16
 
17
  if TYPE_CHECKING:
 
54
  Args:
55
  endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
56
  api_key: Azure OpenAI API key. Defaults to AZURE_OPENAI_API_KEY env var.
57
+ deployment: Azure OpenAI deployment name. Defaults to AZURE_OPENAI_CHAT_DEPLOYMENT_NAME env var.
58
  api_version: Azure OpenAI API version.
59
  name: Agent name.
60
  instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
 
86
  >>> agent = create_agent(tools={"bash_execute": {"timeout": 60}, "memory": {}})
87
  """
88
  try:
89
+ from agent_framework import ChatAgent, tool
90
  from agent_framework.azure import AzureOpenAIChatClient
91
  except ImportError as e:
92
  raise ImportError(
 
97
  # Resolve configuration from environment if not provided
98
  endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
99
  api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
100
+ deployment = deployment or os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
101
 
102
  if not endpoint:
103
  raise ValueError(
 
112
  if not deployment:
113
  raise ValueError(
114
  "Azure OpenAI deployment is required. "
115
+ "Set AZURE_OPENAI_CHAT_DEPLOYMENT_NAME or pass deployment parameter."
116
  )
117
 
118
  # Resolve paths
 
125
 
126
  # Create tools from specification or use provided functions
127
  if isinstance(tools, (str, list, dict)):
128
+ # Resolve to dict form and build MAF-wrapped tools
129
  tools_spec = resolve_tools(tools)
130
+ converted_tools = build_maf_tools(tools_spec, workspace, memory_path)
131
  else:
132
+ # Already a sequence of callable tools - wrap them with tool decorator
133
+ converted_tools = []
134
+ for tool_func in tools:
135
+ tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
136
+ tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
137
+ wrapped = tool(name=tool_name, description=tool_description)(tool_func)
138
+ converted_tools.append(wrapped)
 
 
 
139
 
140
  # Create the chat client
141
  client = AzureOpenAIChatClient(
142
  api_key=api_key,
143
  endpoint=endpoint,
144
+ deployment_name=deployment,
145
  api_version=api_version,
146
  )
147
 
src/flow/harness/maf/harness.py CHANGED
@@ -3,9 +3,12 @@
3
  A thin adapter that converts Agent Framework events to the uniform Flow Event format.
4
  """
5
 
 
 
6
  import logging
7
  import uuid
8
  from collections.abc import AsyncIterator
 
9
  from typing import TYPE_CHECKING, Any
10
 
11
  from flow.harness.base import BaseHarness, Event, EventType
@@ -13,6 +16,9 @@ from flow.harness.base import BaseHarness, Event, EventType
13
  if TYPE_CHECKING:
14
  from agent_framework import ChatAgent
15
 
 
 
 
16
  logger = logging.getLogger(__name__)
17
 
18
  # Track if instrumentation has been enabled globally
@@ -55,12 +61,69 @@ class MAFHarness(BaseHarness):
55
  >>> async for event in harness.run_stream("Create a hello world script"):
56
  ... print(event)
57
 
58
- >>> # Or with custom agent
59
- >>> from flow.harness.maf import create_agent
60
- >>> agent = create_agent(enable_compaction=False)
61
- >>> harness = MAFHarness(agent)
62
  """
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def __init__(
65
  self,
66
  agent: "ChatAgent | None" = None,
@@ -87,61 +150,15 @@ class MAFHarness(BaseHarness):
87
  # Enable OpenTelemetry instrumentation for trace collection
88
  _enable_instrumentation()
89
 
90
- def register_tools(self, tools: list[Any]) -> None:
91
- """Register tools with the harness.
92
-
93
- Note: For MAFHarness, tools should be configured when creating the agent
94
- via create_agent(). This method is provided for interface compatibility
95
- but will log a warning if called.
96
-
97
- Args:
98
- tools: List of tool functions (ignored - configure via create_agent)
99
- """
100
- logger.warning(
101
- "MAFHarness.register_tools() called but tools should be configured "
102
- "via create_agent(). These tools will be ignored."
103
- )
104
-
105
- async def run(self, task: str, thread_id: str | None = None) -> str:
106
- """Run a task and return the final response.
107
-
108
- Args:
109
- task: The task/prompt to execute
110
- thread_id: Optional thread ID for conversation continuity
111
-
112
- Returns:
113
- The agent's final response text
114
- """
115
- if thread_id:
116
- self._thread_id = thread_id
117
-
118
- # Get or create an AgentThread for conversation continuity
119
- if self._thread is None:
120
- self._thread = self._agent.get_new_thread()
121
-
122
- response = await self._agent.run(task, thread=self._thread)
123
-
124
- # Extract text content from response
125
- content = getattr(response, "content", None)
126
- if content is not None:
127
- return str(content)
128
- return str(response)
129
-
130
- async def run_stream(
131
- self, task: str, thread_id: str | None = None
132
- ) -> AsyncIterator[Event]:
133
  """Run a task with streaming events.
134
 
135
  Args:
136
  task: The task/prompt to execute
137
- thread_id: Optional thread ID for conversation continuity
138
 
139
  Yields:
140
  Event objects representing agent activity
141
  """
142
- if thread_id:
143
- self._thread_id = thread_id
144
-
145
  # Get or create an AgentThread for conversation continuity
146
  if self._thread is None:
147
  self._thread = self._agent.get_new_thread()
 
3
  A thin adapter that converts Agent Framework events to the uniform Flow Event format.
4
  """
5
 
6
+ from __future__ import annotations
7
+
8
  import logging
9
  import uuid
10
  from collections.abc import AsyncIterator
11
+ from pathlib import Path
12
  from typing import TYPE_CHECKING, Any
13
 
14
  from flow.harness.base import BaseHarness, Event, EventType
 
16
  if TYPE_CHECKING:
17
  from agent_framework import ChatAgent
18
 
19
+ from flow.experiments.models import Agent
20
+ from flow.llm import LLMClientConfig
21
+
22
  logger = logging.getLogger(__name__)
23
 
24
  # Track if instrumentation has been enabled globally
 
61
  >>> async for event in harness.run_stream("Create a hello world script"):
62
  ... print(event)
63
 
64
+ >>> # Or from Agent spec
65
+ >>> from flow.experiments.models import Agent
66
+ >>> agent = Agent(name="my-agent", tools="standard")
67
+ >>> harness = MAFHarness.from_agent(agent, workspace=Path("/tmp"))
68
  """
69
 
70
@classmethod
def from_agent(
    cls,
    agent: "Agent",
    workspace: Path,
    llm_config: "LLMClientConfig | None" = None,
) -> "MAFHarness":
    """Create a MAFHarness from an Agent definition.

    The agent's tools, instructions and compaction settings are turned into
    keyword arguments for the harness constructor. Azure OpenAI credentials
    are taken from ``llm_config`` when it carries them; in every other case a
    warning is logged and no credential kwargs are passed.

    Args:
        agent: The Agent spec defining the configuration.
        workspace: Working directory for the agent; memory is placed under
            ``workspace / "memory"``.
        llm_config: Optional LLM configuration (falls back to env vars if not
            provided).

    Returns:
        A configured MAFHarness instance.
    """
    from flow.experiments.models import resolve_tools

    tools_spec = resolve_tools(agent.tools)

    # Keyword arguments forwarded to the harness constructor.
    kwargs: dict[str, Any] = {
        "workspace": workspace,
        "memory_path": workspace / "memory",
        "enable_compaction": agent.compaction.enabled,
        "compaction_head_size": agent.compaction.head_size,
        "compaction_tail_size": agent.compaction.tail_size,
        "tools": tools_spec,
        "instructions": agent.instructions,
    }

    # Extract credentials from LLM config if provided
    if llm_config is not None:
        from flow.llm import LLMProvider

        provider = llm_config.provider
        if provider == LLMProvider.AZURE_OPENAI:
            azure = llm_config.azure_openai
            if azure:
                kwargs["endpoint"] = azure.get_endpoint()
                kwargs["api_key"] = azure.get_api_key()
                kwargs["deployment"] = azure.deployment
                kwargs["api_version"] = azure.api_version
            else:
                # Previously this case fell into the generic branch below and
                # logged that "only Azure OpenAI is supported" for an Azure
                # provider, which was contradictory.
                logger.warning(
                    "Azure OpenAI provider selected but no azure_openai settings "
                    "were supplied. Falling back to env vars."
                )
        elif provider == LLMProvider.OPENAI and llm_config.openai:
            # OpenAI uses a different endpoint/auth pattern; for now MAF only
            # supports Azure OpenAI natively, so warn and fall back to env vars.
            logger.warning(
                "MAF harness only supports Azure OpenAI natively. "
                f"Provider {provider.value} will fall back to env vars."
            )
        else:
            logger.warning(
                "MAF harness only supports Azure OpenAI. "
                f"Provider {provider.value} will fall back to env vars."
            )

    return cls(**kwargs)
126
+
127
  def __init__(
128
  self,
129
  agent: "ChatAgent | None" = None,
 
150
  # Enable OpenTelemetry instrumentation for trace collection
151
  _enable_instrumentation()
152
 
153
+ async def run_stream(self, task: str) -> AsyncIterator[Event]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  """Run a task with streaming events.
155
 
156
  Args:
157
  task: The task/prompt to execute
 
158
 
159
  Yields:
160
  Event objects representing agent activity
161
  """
 
 
 
162
  # Get or create an AgentThread for conversation continuity
163
  if self._thread is None:
164
  self._thread = self._agent.get_new_thread()
src/flow/harness/maf/tools/__init__.py CHANGED
@@ -1,86 +1,74 @@
1
  """MAF-specific tools for the Flow agent.
2
 
3
  This module provides tools that work with the Microsoft Agent Framework harness.
4
- Tools are created based on a specification dict that maps tool names to their configs.
 
5
 
6
  Available tools:
7
- - read_file: Read file contents
8
- - write_file: Write/edit file content
9
- - list_directory: List directory contents
10
- - grep_search: Search for text patterns
11
- - bash_execute: Execute bash commands (config: timeout)
12
- - check_processes: Manage background processes
13
- - python_repl: Execute Python code
14
- - think: Explicit reasoning tool
15
- - task_done: Task completion marker
16
- - memory: Persistent memory storage
17
- - sub_agent: Isolated research sub-agent (config: model)
18
  """
19
 
20
- from collections.abc import Callable, Coroutine, Sequence
 
21
  from pathlib import Path
22
  from typing import Any
23
 
24
- from flow.harness.maf.tools.coding import (
25
- create_grep_search_tool,
26
- create_list_directory_tool,
27
- create_read_file_tool,
28
- create_write_file_tool,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  )
30
- from flow.harness.maf.tools.core import task_done, think
31
- from flow.harness.maf.tools.execution import (
32
- create_bash_execute_tool,
33
- create_check_processes_tool,
34
- create_python_repl_tool,
35
- )
36
- from flow.harness.maf.tools.memory import create_memory_tool
37
- from flow.harness.maf.tools.sub_agent import create_sub_agent_tool
38
 
39
  __all__ = [
40
  "build_tools",
41
- "create_bash_execute_tool",
42
- "create_check_processes_tool",
43
- "create_grep_search_tool",
44
- "create_list_directory_tool",
45
- "create_memory_tool",
46
- "create_python_repl_tool",
47
- "create_read_file_tool",
48
- "create_sub_agent_tool",
49
- "create_write_file_tool",
50
- "task_done",
51
- "think",
52
  ]
53
 
54
-
55
- # Registry of tool factories that don't require config
56
- # Maps tool name -> factory function(workspace, memory_path) -> tool
57
- _SIMPLE_TOOL_FACTORIES: dict[str, Callable[..., Any]] = {}
58
-
59
- # Registry of tools that are standalone (no factory needed)
60
- _STANDALONE_TOOLS: dict[str, Callable[..., Coroutine[Any, Any, str]]] = {
61
- "think": think,
62
- "task_done": task_done,
63
- }
64
 
65
 
66
  def build_tools(
67
  tools_spec: dict[str, dict[str, Any]],
68
  workspace: Path,
69
  memory_path: Path,
70
- ) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
71
- """Build tool functions from a specification dict.
72
 
73
  This is the main entry point for creating tools based on a resolved
74
- tool specification (from resolve_tools()).
 
75
 
76
  Args:
77
  tools_spec: Dict mapping tool names to their config dicts.
78
- e.g., {"bash_execute": {"timeout": 60}, "read_file": {}}
79
  workspace: Root directory for file operations
80
- memory_path: Directory for persistent memory
81
 
82
  Returns:
83
- List of tool functions ready to use with MAF
84
 
85
  Example:
86
  >>> from flow.experiments.models import resolve_tools
@@ -88,70 +76,63 @@ def build_tools(
88
  >>> tools = build_tools(tools_spec, workspace, memory_path)
89
  """
90
  workspace = Path(workspace).resolve()
91
- memory_path = Path(memory_path).resolve()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  tools: list[Callable[..., Coroutine[Any, Any, str]]] = []
94
 
95
- for tool_name, config in tools_spec.items():
96
- tool = _create_tool(tool_name, config, workspace, memory_path)
97
- if tool is not None:
98
- tools.append(tool)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  return tools
101
-
102
-
103
def _create_tool(
    name: str,
    config: dict[str, Any],
    workspace: Path,
    memory_path: Path,
) -> Callable[..., Coroutine[Any, Any, str]] | None:
    """Create a single tool by name with the given config.

    Standalone tools are returned as-is from ``_STANDALONE_TOOLS``; every other
    known name is built by its factory. An unrecognised name is logged at
    warning level and yields ``None``.

    Args:
        name: Tool name (e.g., "read_file", "bash_execute")
        config: Tool-specific configuration dict ("timeout" is read for
            bash_execute, "model" for sub_agent)
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory

    Returns:
        Tool function or None if unknown tool name
    """
    # Standalone tools need neither a factory nor any config.
    if name in _STANDALONE_TOOLS:
        return _STANDALONE_TOOLS[name]

    # Deferred constructors: only the entry selected by ``name`` is invoked,
    # so config lookups happen solely for the requested tool.
    builders: dict[str, Callable[[], Callable[..., Coroutine[Any, Any, str]]]] = {
        # Coding tools
        "read_file": lambda: create_read_file_tool(workspace),
        "write_file": lambda: create_write_file_tool(workspace),
        "list_directory": lambda: create_list_directory_tool(workspace),
        "grep_search": lambda: create_grep_search_tool(workspace),
        # Execution tools
        "bash_execute": lambda: create_bash_execute_tool(
            workspace, memory_path, config.get("timeout", 120)
        ),
        "check_processes": lambda: create_check_processes_tool(workspace, memory_path),
        "python_repl": lambda: create_python_repl_tool(workspace),
        # Memory tool
        "memory": lambda: create_memory_tool(memory_path),
        # Sub-agent tool
        "sub_agent": lambda: create_sub_agent_tool(
            workspace, model=config.get("model", "gpt-4o-mini")
        ),
    }

    builder = builders.get(name)
    if builder is not None:
        return builder()

    # Unknown tool - log warning and skip
    import logging

    logging.getLogger(__name__).warning(f"Unknown tool name: {name}. Skipping.")
    return None
 
1
  """MAF-specific tools for the Flow agent.
2
 
3
  This module provides tools that work with the Microsoft Agent Framework harness.
4
+ Tools are created from the shared flow.tools module and adapted for MAF using
5
+ the to_maf_tool adapter.
6
 
7
  Available tools:
8
+ - read_file, write_file, edit_file, multi_edit, glob_files, grep, ls
9
+ - bash, check_processes, python_repl
10
+ - think, todo_write, todo_read
11
+ - memory, skills, task
12
+ - web_search, web_fetch
13
+ - notebook_edit, notebook_read
 
 
 
 
 
14
  """
15
 
16
+ import logging
17
+ from collections.abc import Callable, Coroutine
18
  from pathlib import Path
19
  from typing import Any
20
 
21
+ from flow.tools import (
22
+ # Coding
23
+ read_file, write_file, edit_file, multi_edit, glob_files, grep, ls,
24
+ # Execution
25
+ bash, check_processes, python_repl,
26
+ # Planning
27
+ think, todo_write, todo_read,
28
+ # Memory
29
+ memory, create_memory_tool,
30
+ # Web
31
+ web_search, web_fetch,
32
+ # Notebooks
33
+ notebook_edit, notebook_read,
34
+ # Skills
35
+ skills, create_skills_tool,
36
+ # Sub-agent
37
+ task, create_task_tool,
38
+ # Workspace management
39
+ set_workspace, Workspace,
40
+ # Adapters
41
+ to_maf_tool,
42
+ # Base
43
+ Tool,
44
  )
 
 
 
 
 
 
 
 
45
 
46
  __all__ = [
47
  "build_tools",
 
 
 
 
 
 
 
 
 
 
 
48
  ]
49
 
50
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
51
 
52
 
53
def build_tools(
    tools_spec: dict[str, dict[str, Any]],
    workspace: Path,
    memory_path: Path,
) -> list[Callable[..., Coroutine[Any, Any, str]]]:
    """Build MAF-compatible tool functions from a specification dict.

    This is the main entry point for creating tools based on a resolved
    tool specification (from resolve_tools()). It uses the shared tools
    from flow.tools and adapts each one with ``to_maf_tool``.

    Side effect: the resolved ``workspace`` is installed through
    ``set_workspace`` before any tool is built.

    Args:
        tools_spec: Dict mapping tool names to their config dicts.
            e.g., {"bash": {"timeout": 60}, "read_file": {}}
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory (deprecated; not read by
            this function and kept only so existing callers keep working)

    Returns:
        List of tools adapted for MAF, in the order the names appear in
        ``tools_spec``. Unknown names are logged and skipped.

    Example:
        >>> from flow.experiments.models import resolve_tools
        >>> tools_spec = resolve_tools("standard")
        >>> tools = build_tools(tools_spec, workspace, memory_path)
    """
    workspace = Path(workspace).resolve()

    # Set workspace for tools that need it (memory, todos, etc.)
    set_workspace(Workspace(workspace))

    # Map tool names → default Tool instances
    tool_map: dict[str, Tool] = {
        # Coding/Filesystem
        "read_file": read_file,
        "write_file": write_file,
        "edit_file": edit_file,
        "multi_edit": multi_edit,
        "glob_files": glob_files,
        "ls": ls,
        "grep": grep,
        # Execution
        "bash": bash,
        "check_processes": check_processes,
        "python_repl": python_repl,
        # Planning
        "think": think,
        "todo_write": todo_write,
        "todo_read": todo_read,
        # Web
        "web_search": web_search,
        "web_fetch": web_fetch,
        # Notebooks
        "notebook_edit": notebook_edit,
        "notebook_read": notebook_read,
        # Memory (default instance)
        "memory": memory,
        # Skills (default instance)
        "skills": skills,
        # Task/sub-agent (default instance)
        "task": task,
    }

    tools: list[Callable[..., Coroutine[Any, Any, str]]] = []

    for name, config in tools_spec.items():
        # Tolerate a missing/None config so the lookups below cannot fail.
        config = config or {}

        # The custom-config cases must be tested BEFORE the generic lookup:
        # "task" and "skills" are both keys of tool_map, so checking
        # `name in tool_map` first made these branches unreachable and the
        # supplied config was silently ignored.
        if name == "task" and config:
            # Task tool with custom config
            custom_task = create_task_tool(
                coordinator_tools=list(tool_map.values()),
                model=config.get("model"),
            )
            tools.append(to_maf_tool(custom_task))
        elif name == "skills" and config.get("additional_paths"):
            # Skills with custom paths (only the first path is used)
            custom_skills = create_skills_tool(
                project_path=Path(config["additional_paths"][0])
            )
            tools.append(to_maf_tool(custom_skills))
        elif name in tool_map:
            # Convert the shared default Tool to its MAF form
            tools.append(to_maf_tool(tool_map[name]))
        else:
            logger.warning(f"Unknown tool name: {name}. Skipping.")

    return tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/harness/maf/tools/coding.py DELETED
@@ -1,391 +0,0 @@
1
- """Coding tools for file operations and code search.
2
-
3
- These tools enable agents to read/write files, list directories,
4
- and search for patterns in code.
5
-
6
- The agent can read and write to any path the user has access to.
7
- The workspace serves as the default working directory for relative paths.
8
- """
9
-
10
- import re
11
- from collections.abc import Callable, Coroutine, Sequence
12
- from pathlib import Path
13
- from typing import Annotated, Any
14
-
15
-
16
def create_read_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a read_file tool that can read from any path.

    The returned coroutine function never raises for file problems; every
    failure is reported as an ``"Error: ..."`` string so the agent can react.

    Args:
        workspace: Default directory for relative paths (not a restriction)

    Returns:
        An async ``read_file`` function carrying ``_tool_name``,
        ``_tool_description`` and ``_is_tool`` attributes.
    """

    async def read_file(
        file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
        max_lines: Annotated[int, "Maximum lines to return (default: 500)"] = 500,
    ) -> str:
        """Read the contents of a file. Can read from any path on the system."""
        # A negative limit would slice from the END of the file (lines[:-k])
        # and report a nonsensical "showing first -k" message, so reject it.
        if max_lines < 0:
            return f"Error: Invalid max_lines: {max_lines}. Must be >= 0."

        try:
            # Support both absolute and relative paths
            path = Path(file_path)
            if path.is_absolute():
                full_path = path.resolve()
            else:
                full_path = (workspace / file_path).resolve()

            if not full_path.exists():
                return f"Error: File not found: {file_path}"

            if not full_path.is_file():
                return f"Error: Not a file: {file_path}"

            lines = full_path.read_text(encoding="utf-8").splitlines()
            total_lines = len(lines)

            # Apply line limit
            truncated_msg = ""
            if total_lines > max_lines:
                lines = lines[:max_lines]
                truncated_msg = f"\n... (truncated, showing first {max_lines} of {total_lines} lines)"

            # Format with 1-based line numbers
            numbered_lines = [f"{number:5d}: {line}" for number, line in enumerate(lines, start=1)]
            result = "\n".join(numbered_lines) + truncated_msg

            return f"File: {full_path} ({total_lines} lines)\n{'=' * 40}\n{result}"

        except UnicodeDecodeError:
            return f"Error: Cannot read file (binary or non-UTF-8): {file_path}"
        except PermissionError:
            return f"Error: Permission denied: {file_path}"
        except Exception as e:
            # Best-effort tool boundary: report instead of crashing the agent.
            return f"Error reading file: {e}"

    # Add tool metadata
    read_file._tool_name = "read_file"  # type: ignore[attr-defined]
    read_file._tool_description = (  # type: ignore[attr-defined]
        "Read the contents of a file. Accepts absolute paths (e.g., /path/to/file) "
        "or relative paths (relative to workspace). Returns content with line numbers."
    )
    read_file._is_tool = True  # type: ignore[attr-defined]

    return read_file
75
-
76
-
77
- def create_write_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
78
- """Create a write_file tool.
79
-
80
- Args:
81
- workspace: Default directory for relative paths
82
- """
83
-
84
- async def write_file(
85
- file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
86
- content: Annotated[str | None, "Full content to write (for complete file write)"] = None,
87
- old_str: Annotated[str | None, "Text to replace (for str_replace operation)"] = None,
88
- new_str: Annotated[str | None, "Replacement text (for str_replace operation)"] = None,
89
- insert_line: Annotated[int | None, "Line number to insert at (1-indexed)"] = None,
90
- insert_content: Annotated[str | None, "Content to insert at line"] = None,
91
- ) -> str:
92
- """Write or edit file content.
93
-
94
- Supports: (1) full file write with 'content',
95
- (2) str_replace to replace specific text,
96
- (3) insert_at_line to add content at a specific line.
97
- Creates parent directories if needed.
98
- """
99
- try:
100
- # Support both absolute and relative paths
101
- path = Path(file_path)
102
- if path.is_absolute():
103
- full_path = path.resolve()
104
- else:
105
- full_path = (workspace / file_path).resolve()
106
-
107
- # Create parent directories
108
- full_path.parent.mkdir(parents=True, exist_ok=True)
109
-
110
- # Operation 1: Full file write
111
- if content is not None:
112
- full_path.write_text(content, encoding="utf-8")
113
- return f"Successfully wrote {len(content)} characters to {file_path}"
114
-
115
- # Operation 2: str_replace
116
- if old_str is not None and new_str is not None:
117
- if not full_path.exists():
118
- return f"Error: File not found for str_replace: {file_path}"
119
-
120
- current_content = full_path.read_text(encoding="utf-8")
121
-
122
- if old_str not in current_content:
123
- # Show a snippet of the file to help debug
124
- if len(current_content) > 500:
125
- snippet = current_content[:500] + "..."
126
- else:
127
- snippet = current_content
128
- return (
129
- f"Error: String to replace not found in file.\n"
130
- f"Searching for: '{old_str[:100]}...'\n"
131
- f"File content preview:\n{snippet}"
132
- )
133
-
134
- # Replace first occurrence only
135
- new_content = current_content.replace(old_str, new_str, 1)
136
- full_path.write_text(new_content, encoding="utf-8")
137
- return f"Successfully replaced text in {file_path}"
138
-
139
- # Operation 3: insert_at_line
140
- if insert_line is not None and insert_content is not None:
141
- if full_path.exists():
142
- current_content = full_path.read_text(encoding="utf-8")
143
- lines = current_content.splitlines(keepends=True)
144
- else:
145
- lines = []
146
-
147
- # Ensure insert_content ends with newline
148
- if not insert_content.endswith("\n"):
149
- insert_content += "\n"
150
-
151
- # Insert at specified line (1-indexed)
152
- insert_index = insert_line - 1
153
- if insert_index < 0:
154
- return f"Error: Invalid line number: {insert_line}. Must be >= 1."
155
-
156
- # Allow inserting at end
157
- if insert_index > len(lines):
158
- insert_index = len(lines)
159
-
160
- lines.insert(insert_index, insert_content)
161
- new_content = "".join(lines)
162
- full_path.write_text(new_content, encoding="utf-8")
163
- return f"Successfully inserted content at line {insert_line} in {file_path}"
164
-
165
- return "Error: Must provide either 'content', 'old_str' + 'new_str', or 'insert_line' + 'insert_content'"
166
-
167
- except Exception as e:
168
- return f"Error writing file: {e}"
169
-
170
- # Add tool metadata
171
- write_file._tool_name = "write_file" # type: ignore[attr-defined]
172
- write_file._tool_description = ( # type: ignore[attr-defined]
173
- "Write or edit file content. Accepts absolute paths or relative paths (relative to workspace). "
174
- "Supports: (1) full file write with 'content', (2) str_replace to replace specific text, "
175
- "(3) insert_at_line to add content at a specific line. Creates parent directories if needed."
176
- )
177
- write_file._is_tool = True # type: ignore[attr-defined]
178
-
179
- return write_file
180
-
181
-
182
- def create_list_directory_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
183
- """Create a list_directory tool that can list any directory.
184
-
185
- Args:
186
- workspace: Default directory for relative paths (not a restriction)
187
- """
188
-
189
- async def list_directory(
190
- directory_path: Annotated[str, "Path to directory (absolute or relative to workspace, default: '.')"] = ".",
191
- recursive: Annotated[bool, "List subdirectories recursively (default: false)"] = False,
192
- max_entries: Annotated[int, "Maximum entries to return (default: 200)"] = 200,
193
- ) -> str:
194
- """List files and directories at a given path. Can list any directory on the system."""
195
- try:
196
- # Support both absolute and relative paths
197
- path = Path(directory_path)
198
- if path.is_absolute():
199
- full_path = path.resolve()
200
- else:
201
- full_path = (workspace / directory_path).resolve()
202
-
203
- if not full_path.exists():
204
- return f"Error: Directory not found: {directory_path}"
205
-
206
- if not full_path.is_dir():
207
- return f"Error: Not a directory: {directory_path}"
208
-
209
- entries: list[tuple[str, str, int]] = []
210
-
211
- if recursive:
212
- for item in full_path.rglob("*"):
213
- if len(entries) >= max_entries:
214
- break
215
- # Skip common non-essential directories
216
- skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
217
- if any(part in item.parts for part in skip_dirs):
218
- continue
219
- rel_path = item.relative_to(full_path)
220
- item_type = "file" if item.is_file() else "dir"
221
- size = item.stat().st_size if item.is_file() else 0
222
- entries.append((str(rel_path), item_type, size))
223
- else:
224
- for item in full_path.iterdir():
225
- if len(entries) >= max_entries:
226
- break
227
- item_type = "file" if item.is_file() else "dir"
228
- size = item.stat().st_size if item.is_file() else 0
229
- entries.append((item.name, item_type, size))
230
-
231
- # Sort: directories first, then by name
232
- entries.sort(key=lambda x: (x[1] != "dir", x[0]))
233
-
234
- # Format output
235
- result_lines = [f"Directory: {directory_path} ({len(entries)} entries)"]
236
- result_lines.append("=" * 50)
237
-
238
- for name, item_type, size in entries:
239
- if item_type == "dir":
240
- result_lines.append(f" [DIR] {name}/")
241
- else:
242
- size_str = f"{size:,} bytes" if size < 10000 else f"{size / 1024:.1f} KB"
243
- result_lines.append(f" [FILE] {name} ({size_str})")
244
-
245
- if len(entries) >= max_entries:
246
- result_lines.append(f"\n... (truncated at {max_entries} entries)")
247
-
248
- return "\n".join(result_lines)
249
-
250
- except Exception as e:
251
- return f"Error listing directory: {e}"
252
-
253
- # Add tool metadata
254
- list_directory._tool_name = "list_directory" # type: ignore[attr-defined]
255
- list_directory._tool_description = ( # type: ignore[attr-defined]
256
- "List files and directories at a given path. Accepts absolute paths (e.g., /path/to/dir) "
257
- "or relative paths (relative to workspace). Returns names, types, and sizes."
258
- )
259
- list_directory._is_tool = True # type: ignore[attr-defined]
260
-
261
- return list_directory
262
-
263
-
264
- def create_grep_search_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
265
- """Create a grep_search tool that can search any directory.
266
-
267
- Args:
268
- workspace: Default directory for relative paths (not a restriction)
269
- """
270
-
271
- async def grep_search(
272
- pattern: Annotated[str, "Pattern to search for (regex supported)"],
273
- path: Annotated[str, "Path to search in (absolute or relative to workspace, default: '.')"] = ".",
274
- file_pattern: Annotated[str | None, "File pattern to filter (e.g., '*.py', '*.js')"] = None,
275
- case_sensitive: Annotated[bool, "Case sensitive search (default: true)"] = True,
276
- max_matches: Annotated[int, "Maximum matches to return (default: 50)"] = 50,
277
- ) -> str:
278
- """Search for text patterns in files. Can search any path on the system."""
279
- try:
280
- # Support both absolute and relative paths
281
- search_path = Path(path)
282
- if search_path.is_absolute():
283
- full_path = search_path.resolve()
284
- else:
285
- full_path = (workspace / path).resolve()
286
-
287
- if not full_path.exists():
288
- return f"Error: Path not found: {path}"
289
-
290
- # Compile regex
291
- flags = 0 if case_sensitive else re.IGNORECASE
292
- try:
293
- regex = re.compile(pattern, flags)
294
- except re.error as e:
295
- return f"Error: Invalid regex pattern: {e}"
296
-
297
- matches: list[dict[str, Any]] = []
298
-
299
- # Get files to search
300
- if full_path.is_file():
301
- files = [full_path]
302
- else:
303
- if file_pattern:
304
- files = list(full_path.rglob(file_pattern))
305
- else:
306
- files = [f for f in full_path.rglob("*") if f.is_file()]
307
-
308
- # Search each file
309
- for file_path_item in files:
310
- if len(matches) >= max_matches:
311
- break
312
-
313
- # Skip common non-essential directories and binary files
314
- skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
315
- if any(part in file_path_item.parts for part in skip_dirs):
316
- continue
317
-
318
- try:
319
- # Skip large files (> 1MB)
320
- if file_path_item.stat().st_size > 1_000_000:
321
- continue
322
-
323
- file_content = file_path_item.read_text(encoding="utf-8", errors="ignore")
324
- lines = file_content.splitlines()
325
-
326
- for line_num, line in enumerate(lines, 1):
327
- if len(matches) >= max_matches:
328
- break
329
- if regex.search(line):
330
- # Compute relative path from search root
331
- try:
332
- rel_path = file_path_item.relative_to(full_path)
333
- except ValueError:
334
- # If file is the search path itself, use filename
335
- rel_path = file_path_item.name
336
- matches.append({
337
- "file": str(rel_path),
338
- "line": line_num,
339
- "text": line.strip()[:200],
340
- })
341
- except (UnicodeDecodeError, PermissionError):
342
- continue
343
-
344
- # Format output
345
- if not matches:
346
- return f"No matches found for pattern '{pattern}' in {path}"
347
-
348
- result_lines = [f"Found {len(matches)} match(es) for '{pattern}'"]
349
- result_lines.append("=" * 50)
350
-
351
- for match in matches:
352
- result_lines.append(f"{match['file']}:{match['line']}: {match['text']}")
353
-
354
- if len(matches) >= max_matches:
355
- result_lines.append(f"\n... (truncated at {max_matches} matches)")
356
-
357
- return "\n".join(result_lines)
358
-
359
- except Exception as e:
360
- return f"Error searching: {e}"
361
-
362
- # Add tool metadata
363
- grep_search._tool_name = "grep_search" # type: ignore[attr-defined]
364
- grep_search._tool_description = ( # type: ignore[attr-defined]
365
- "Search for text patterns in files. Accepts absolute paths (e.g., /path/to/dir) "
366
- "or relative paths (relative to workspace). Supports regex patterns and file filtering."
367
- )
368
- grep_search._is_tool = True # type: ignore[attr-defined]
369
-
370
- return grep_search
371
-
372
-
373
- def create_coding_tools(workspace: Path) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
374
- """Create all coding tools bound to a workspace.
375
-
376
- Args:
377
- workspace: Root directory for file operations
378
-
379
- Returns:
380
- List of coding tool functions
381
- """
382
- workspace = Path(workspace).resolve()
383
-
384
- return [
385
- create_read_file_tool(workspace),
386
- create_write_file_tool(workspace),
387
- create_list_directory_tool(workspace),
388
- create_grep_search_tool(workspace),
389
- ]
390
-
391
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/harness/maf/tools/core.py DELETED
@@ -1,100 +0,0 @@
1
- """Core metacognitive tools for agent reasoning and task management.
2
-
3
- These tools enable agents to think explicitly, track task status,
4
- and make structured decisions during complex software engineering tasks.
5
- """
6
-
7
- from collections.abc import Callable, Coroutine, Sequence
8
- from typing import Annotated, Any, Literal
9
-
10
-
11
- async def think(
12
- thought: Annotated[
13
- str,
14
- (
15
- "Your detailed reasoning about the current situation. "
16
- "Include: what you've learned, options you're considering, "
17
- "potential risks, and your planned approach."
18
- ),
19
- ],
20
- ) -> str:
21
- """Use this tool to pause and think through a complex problem.
22
-
23
- Helpful when: (1) analyzing tool results, (2) planning multi-step approaches,
24
- (3) making design decisions, (4) debugging issues, (5) avoiding mistakes.
25
- Your reasoning is recorded and helps structure your approach.
26
- """
27
- # The value is in giving the LLM dedicated space to reason
28
- summary = thought[:300] + "..." if len(thought) > 300 else thought
29
- return f"Thought recorded: {summary}"
30
-
31
-
32
- async def task_done(
33
- status: Annotated[
34
- Literal["complete", "incomplete"],
35
- "'complete' if task finished successfully, 'incomplete' if blocked or needs input",
36
- ],
37
- summary: Annotated[
38
- str,
39
- (
40
- "Summary of what was accomplished. "
41
- "If complete: what was done and how to use/test it. "
42
- "If incomplete: what's blocking and what's needed."
43
- ),
44
- ],
45
- files_created: Annotated[
46
- list[str] | None,
47
- "List of files created or modified (if any)",
48
- ] = None,
49
- next_steps: Annotated[
50
- list[str] | None,
51
- "Suggested next steps for the user (if any)",
52
- ] = None,
53
- ) -> str:
54
- """Call this when you have completed the user's task.
55
-
56
- Provide a summary of what was accomplished and any relevant details.
57
- Use 'complete' if all requirements are satisfied,
58
- 'incomplete' if blocked or need more information.
59
- """
60
- result_lines = [
61
- f"Task Status: {status.upper()}",
62
- "",
63
- "Summary:",
64
- summary,
65
- ]
66
-
67
- if files_created:
68
- result_lines.extend([
69
- "",
70
- "Files Created/Modified:",
71
- *[f" - {f}" for f in files_created],
72
- ])
73
-
74
- if next_steps:
75
- result_lines.extend([
76
- "",
77
- "Suggested Next Steps:",
78
- *[f" - {step}" for step in next_steps],
79
- ])
80
-
81
- return "\n".join(result_lines)
82
-
83
-
84
- # Add tool metadata
85
- think._tool_name = "think" # type: ignore[attr-defined]
86
- think._tool_description = think.__doc__ or "" # type: ignore[attr-defined]
87
- think._is_tool = True # type: ignore[attr-defined]
88
-
89
- task_done._tool_name = "task_done" # type: ignore[attr-defined]
90
- task_done._tool_description = task_done.__doc__ or "" # type: ignore[attr-defined]
91
- task_done._is_tool = True # type: ignore[attr-defined]
92
-
93
-
94
- def create_core_tools() -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
95
- """Create all core metacognitive tools.
96
-
97
- Returns:
98
- List of core tool functions
99
- """
100
- return [think, task_done]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/harness/maf/tools/execution.py DELETED
@@ -1,479 +0,0 @@
1
- """Execution tools for running commands and code.
2
-
3
- These tools enable agents to execute bash commands and Python code
4
- with safety controls (timeouts, output limits), and manage background processes.
5
- """
6
-
7
- import asyncio
8
- import os
9
- import re
10
- import signal
11
- import sys
12
- from collections.abc import Callable, Coroutine, Sequence
13
- from datetime import datetime
14
- from io import StringIO
15
- from pathlib import Path
16
- from typing import Annotated, Any, Literal
17
-
18
-
19
- def _get_process_registry_path(memory_path: Path) -> Path:
20
- """Get the path to the process registry file in memory."""
21
- return memory_path / "processes.md"
22
-
23
-
24
- def _ensure_process_registry(memory_path: Path) -> Path:
25
- """Ensure the process registry file exists and return its path."""
26
- registry_path = _get_process_registry_path(memory_path)
27
- registry_path.parent.mkdir(parents=True, exist_ok=True)
28
-
29
- if not registry_path.exists():
30
- registry_path.write_text(
31
- "# Background Processes\n\n"
32
- "This file tracks background processes started by the Flow agent.\n"
33
- "You can view this file with `memory(command='view', path='/memory/processes.md')`\n\n"
34
- "## Running\n\n"
35
- "## Stopped\n\n"
36
- )
37
- return registry_path
38
-
39
-
40
- def _add_process_to_registry(
41
- memory_path: Path,
42
- pid: int,
43
- command: str,
44
- workspace: str,
45
- log_file: str,
46
- port: int | None = None,
47
- ) -> None:
48
- """Add a process to the registry using checklist format."""
49
- registry_path = _ensure_process_registry(memory_path)
50
- content = registry_path.read_text()
51
-
52
- # Extract port from command if not provided
53
- if port is None:
54
- port_match = re.search(r"(?:--port|-p)\s+(\d+)", command)
55
- if port_match:
56
- port = int(port_match.group(1))
57
- elif ":8000" in command or "8000" in command:
58
- port = 8000
59
- elif ":3000" in command or "3000" in command:
60
- port = 3000
61
-
62
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
63
- port_str = f"Port: {port}" if port else "Port: -"
64
- cmd_short = command[:60] + "..." if len(command) > 60 else command
65
- workspace_short = workspace.split("/")[-1] if "/" in workspace else workspace
66
-
67
- # Create checklist entry
68
- entry = f"- [ ] **PID {pid}** | `{cmd_short}` | {timestamp} | {port_str} | {workspace_short}\n"
69
-
70
- # Add under "## Running" section
71
- if "## Running" in content:
72
- content = content.replace("## Running\n\n", f"## Running\n\n{entry}")
73
- else:
74
- # Add Running section if missing
75
- content += f"\n## Running\n\n{entry}"
76
-
77
- registry_path.write_text(content)
78
-
79
-
80
- def _mark_process_stopped(memory_path: Path, pid: int, reason: str = "killed") -> None:
81
- """Mark a process as stopped in the registry (check the box and move to Stopped)."""
82
- registry_path = _get_process_registry_path(memory_path)
83
- if not registry_path.exists():
84
- return
85
-
86
- content = registry_path.read_text()
87
- lines = content.split("\n")
88
- new_lines: list[str] = []
89
- stopped_entry: str | None = None
90
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
91
-
92
- for line in lines:
93
- if f"**PID {pid}**" in line and "- [ ]" in line:
94
- # Found the running process - mark it as checked and prepare for Stopped section
95
- stopped_entry = line.replace("- [ ]", "- [x]") + f" | {reason} @ {timestamp}"
96
- # Don't add to new_lines yet (will move to Stopped section)
97
- else:
98
- new_lines.append(line)
99
-
100
- # Add stopped entry to Stopped section
101
- if stopped_entry:
102
- content = "\n".join(new_lines)
103
- if "## Stopped" in content:
104
- content = content.replace("## Stopped\n\n", f"## Stopped\n\n{stopped_entry}\n")
105
- else:
106
- content += f"\n## Stopped\n\n{stopped_entry}\n"
107
- registry_path.write_text(content)
108
-
109
-
110
- def _is_process_running(pid: int) -> bool:
111
- """Check if a process is still running."""
112
- try:
113
- os.kill(pid, 0)
114
- return True
115
- except (OSError, ProcessLookupError):
116
- return False
117
-
118
-
119
- def _get_running_pids_from_registry(memory_path: Path) -> list[tuple[int, str]]:
120
- """Get list of (pid, line) for processes marked as running in registry."""
121
- registry_path = _get_process_registry_path(memory_path)
122
- if not registry_path.exists():
123
- return []
124
-
125
- content = registry_path.read_text()
126
- running: list[tuple[int, str]] = []
127
-
128
- for line in content.split("\n"):
129
- if "- [ ]" in line and "**PID" in line:
130
- # Extract PID from format: **PID 12345**
131
- match = re.search(r"\*\*PID (\d+)\*\*", line)
132
- if match:
133
- pid = int(match.group(1))
134
- running.append((pid, line))
135
-
136
- return running
137
-
138
-
139
- def create_bash_execute_tool(
140
- workspace: Path, memory_path: Path, default_timeout: int = 120
141
- ) -> Callable[..., Coroutine[Any, Any, str]]:
142
- """Create a bash_execute tool bound to a specific workspace."""
143
-
144
- async def bash_execute(
145
- command: Annotated[str, "Bash command to execute"],
146
- timeout: Annotated[int, f"Command timeout in seconds (default: {default_timeout})"] = default_timeout,
147
- background: Annotated[
148
- bool, "Run in background and return immediately with PID. Use for servers/long-running processes."
149
- ] = False,
150
- ) -> str:
151
- """Execute bash commands in the workspace.
152
-
153
- Returns stdout, stderr, and return code.
154
- Use for running tests, git commands, package managers, builds, etc.
155
- IMPORTANT: Each call runs in a fresh shell from workspace root -
156
- use 'cd dir && command' for commands in subdirectories.
157
- For long-running processes (servers), use background=True to avoid timeout.
158
- """
159
- try:
160
- if background:
161
- # Run in background using nohup and capture PID
162
- # Redirect output to a log file
163
- log_file = workspace / ".background_logs" / f"bg_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
164
- log_file.parent.mkdir(parents=True, exist_ok=True)
165
-
166
- bg_command = f"nohup {command} > {log_file} 2>&1 & echo $!"
167
-
168
- proc = await asyncio.create_subprocess_shell(
169
- bg_command,
170
- stdout=asyncio.subprocess.PIPE,
171
- stderr=asyncio.subprocess.PIPE,
172
- cwd=str(workspace),
173
- )
174
-
175
- stdout, _ = await proc.communicate()
176
- pid_str = stdout.decode().strip()
177
-
178
- try:
179
- pid = int(pid_str)
180
- # Register the process in memory
181
- _add_process_to_registry(
182
- memory_path=memory_path,
183
- pid=pid,
184
- command=command,
185
- workspace=str(workspace),
186
- log_file=str(log_file),
187
- )
188
-
189
- return (
190
- f"Background process started successfully.\n"
191
- f"PID: {pid}\n"
192
- f"Command: {command}\n"
193
- f"Log file: {log_file}\n"
194
- f"\nProcess registered in /memory/processes.md\n"
195
- f"Use check_processes(action='list') to see all background processes.\n"
196
- f"Use check_processes(action='kill', pid={pid}) to stop this process."
197
- )
198
- except ValueError:
199
- return f"Error: Could not get PID. Output: {pid_str}"
200
-
201
- # Regular (blocking) execution
202
- proc = await asyncio.create_subprocess_shell(
203
- command,
204
- stdout=asyncio.subprocess.PIPE,
205
- stderr=asyncio.subprocess.PIPE,
206
- cwd=str(workspace),
207
- )
208
-
209
- try:
210
- stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
211
- except asyncio.TimeoutError:
212
- proc.kill()
213
- await proc.wait()
214
- return (
215
- f"Error: Command timed out after {timeout} seconds.\n"
216
- f"Command: {command}\n\n"
217
- f"TIP: If this is a long-running process (like a server), "
218
- f"use background=True to run it in the background."
219
- )
220
-
221
- stdout_str = stdout.decode("utf-8", errors="replace")
222
- stderr_str = stderr.decode("utf-8", errors="replace")
223
- return_code = proc.returncode
224
-
225
- # Format output
226
- result_parts = [f"Command: {command}"]
227
- result_parts.append(f"Return code: {return_code}")
228
- result_parts.append("=" * 50)
229
-
230
- if stdout_str.strip():
231
- # Truncate very long output
232
- if len(stdout_str) > 15000:
233
- stdout_str = stdout_str[:15000] + "\n... (stdout truncated)"
234
- result_parts.append("STDOUT:")
235
- result_parts.append(stdout_str)
236
-
237
- if stderr_str.strip():
238
- if len(stderr_str) > 5000:
239
- stderr_str = stderr_str[:5000] + "\n... (stderr truncated)"
240
- result_parts.append("STDERR:")
241
- result_parts.append(stderr_str)
242
-
243
- if not stdout_str.strip() and not stderr_str.strip():
244
- result_parts.append("(no output)")
245
-
246
- return "\n".join(result_parts)
247
-
248
- except Exception as e:
249
- return f"Error executing command: {e}"
250
-
251
- # Add tool metadata
252
- bash_execute._tool_name = "bash_execute" # type: ignore[attr-defined]
253
- bash_execute._tool_description = ( # type: ignore[attr-defined]
254
- "Execute bash commands in the workspace. "
255
- "Returns stdout, stderr, and return code. "
256
- "Use for running tests, git commands, package managers, builds, etc."
257
- )
258
- bash_execute._is_tool = True # type: ignore[attr-defined]
259
-
260
- return bash_execute
261
-
262
-
263
- def create_check_processes_tool(
264
- workspace: Path, memory_path: Path
265
- ) -> Callable[..., Coroutine[Any, Any, str]]:
266
- """Create a tool to check and manage background processes."""
267
-
268
- async def check_processes(
269
- action: Annotated[
270
- Literal["list", "kill", "cleanup"],
271
- "'list' to see processes, 'kill' to stop one by PID, 'cleanup' to kill all",
272
- ],
273
- pid: Annotated[int | None, "PID to kill (required for 'kill' action)"] = None,
274
- ) -> str:
275
- """Check and manage background processes.
276
-
277
- Use 'list' to see all background processes (also viewable at /memory/processes.md),
278
- 'kill' to stop a specific process by PID,
279
- 'cleanup' to kill all background processes from this workspace.
280
- """
281
- _ensure_process_registry(memory_path)
282
- registry_path = _get_process_registry_path(memory_path)
283
-
284
- if action == "list":
285
- # Read the registry and update status of running processes
286
- running_pids = _get_running_pids_from_registry(memory_path)
287
- active_count = 0
288
- dead_pids: list[int] = []
289
-
290
- for proc_pid, _ in running_pids:
291
- if _is_process_running(proc_pid):
292
- active_count += 1
293
- else:
294
- dead_pids.append(proc_pid)
295
-
296
- # Mark dead processes as stopped
297
- for dead_pid in dead_pids:
298
- _mark_process_stopped(memory_path, dead_pid, reason="exited")
299
-
300
- # Return the updated registry
301
- content = registry_path.read_text()
302
- return (
303
- f"Active background processes: {active_count}\n"
304
- f"(View full registry at /memory/processes.md)\n\n"
305
- f"{content}"
306
- )
307
-
308
- if action == "kill":
309
- if pid is None:
310
- return "Error: 'pid' is required for 'kill' action."
311
-
312
- try:
313
- os.kill(pid, signal.SIGTERM)
314
- await asyncio.sleep(0.5) # Give it time to terminate
315
-
316
- # Check if it's really dead, if not SIGKILL
317
- if _is_process_running(pid):
318
- os.kill(pid, signal.SIGKILL)
319
- await asyncio.sleep(0.2)
320
-
321
- _mark_process_stopped(memory_path, pid, reason="killed")
322
-
323
- if _is_process_running(pid):
324
- return f"Warning: Process {pid} may still be running after kill attempt."
325
- return f"Successfully killed process {pid}. Updated /memory/processes.md"
326
-
327
- except ProcessLookupError:
328
- _mark_process_stopped(memory_path, pid, reason="not found")
329
- return f"Process {pid} was not running (already terminated). Updated /memory/processes.md"
330
- except PermissionError:
331
- return f"Error: Permission denied to kill process {pid}."
332
- except Exception as e:
333
- return f"Error killing process {pid}: {e}"
334
-
335
- if action == "cleanup":
336
- # Kill all processes from this workspace
337
- running_pids = _get_running_pids_from_registry(memory_path)
338
- workspace_str = str(workspace)
339
- killed: list[int] = []
340
- failed: list[tuple[int, str]] = []
341
-
342
- for proc_pid, line in running_pids:
343
- # Check if this process is from our workspace
344
- workspace_short = workspace_str.split("/")[-1]
345
- if workspace_short in line or workspace_str in line:
346
- try:
347
- os.kill(proc_pid, signal.SIGTERM)
348
- await asyncio.sleep(0.2)
349
- if _is_process_running(proc_pid):
350
- os.kill(proc_pid, signal.SIGKILL)
351
- _mark_process_stopped(memory_path, proc_pid, reason="cleanup")
352
- killed.append(proc_pid)
353
- except (ProcessLookupError, PermissionError) as e:
354
- _mark_process_stopped(memory_path, proc_pid, reason=f"cleanup failed: {e}")
355
- failed.append((proc_pid, str(e)))
356
-
357
- result = "Cleanup complete. Updated /memory/processes.md\n"
358
- if killed:
359
- result += f"Killed processes: {killed}\n"
360
- if failed:
361
- result += f"Failed to kill: {failed}\n"
362
- if not killed and not failed:
363
- result += "No active processes found for this workspace."
364
-
365
- return result
366
-
367
- return f"Unknown action: {action}"
368
-
369
- # Add tool metadata
370
- check_processes._tool_name = "check_processes" # type: ignore[attr-defined]
371
- check_processes._tool_description = ( # type: ignore[attr-defined]
372
- "Check and manage background processes. "
373
- "Use 'list' to see all background processes, "
374
- "'kill' to stop a specific process by PID, "
375
- "'cleanup' to kill all background processes from this workspace."
376
- )
377
- check_processes._is_tool = True # type: ignore[attr-defined]
378
-
379
- return check_processes
380
-
381
-
382
- def create_python_repl_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
383
- """Create a python_repl tool bound to a specific workspace."""
384
-
385
- async def python_repl(
386
- code: Annotated[str, "Python code to execute"],
387
- ) -> str:
388
- """Execute Python code in an isolated namespace.
389
-
390
- Returns the output (stdout) or any errors.
391
- Use for testing code snippets, calculations, data manipulation, or quick validation.
392
- The WORKSPACE variable is available with the workspace path.
393
- """
394
- old_stdout = sys.stdout
395
- old_stderr = sys.stderr
396
-
397
- try:
398
- # Capture stdout and stderr
399
- redirected_output = StringIO()
400
- redirected_error = StringIO()
401
- sys.stdout = redirected_output
402
- sys.stderr = redirected_error
403
-
404
- # Create isolated namespace with builtins
405
- namespace: dict[str, Any] = {
406
- "__builtins__": __builtins__,
407
- "__name__": "__main__",
408
- "WORKSPACE": workspace,
409
- }
410
-
411
- try:
412
- # Try to compile and exec
413
- compiled = compile(code, "<repl>", "exec")
414
- exec(compiled, namespace) # noqa: S102
415
-
416
- output = redirected_output.getvalue()
417
- error = redirected_error.getvalue()
418
-
419
- result_parts = ["Python REPL Output"]
420
- result_parts.append("=" * 50)
421
-
422
- if output.strip():
423
- if len(output) > 15000:
424
- output = output[:15000] + "\n... (output truncated)"
425
- result_parts.append(output)
426
-
427
- if error.strip():
428
- result_parts.append("STDERR:")
429
- result_parts.append(error)
430
-
431
- if not output.strip() and not error.strip():
432
- result_parts.append("(code executed successfully, no output)")
433
-
434
- return "\n".join(result_parts)
435
-
436
- except SyntaxError as e:
437
- return f"SyntaxError: {e}"
438
- except Exception as e:
439
- return f"Error: {type(e).__name__}: {e}"
440
-
441
- finally:
442
- sys.stdout = old_stdout
443
- sys.stderr = old_stderr
444
-
445
- # Add tool metadata
446
- python_repl._tool_name = "python_repl" # type: ignore[attr-defined]
447
- python_repl._tool_description = ( # type: ignore[attr-defined]
448
- "Execute Python code in an isolated namespace. "
449
- "Returns the output (stdout) or any errors. "
450
- "Use for testing code snippets, calculations, data manipulation, or quick validation."
451
- )
452
- python_repl._is_tool = True # type: ignore[attr-defined]
453
-
454
- return python_repl
455
-
456
-
457
- def create_execution_tools(
458
- workspace: Path,
459
- memory_path: Path,
460
- bash_timeout: int = 120,
461
- ) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
462
- """Create all execution tools bound to a workspace.
463
-
464
- Args:
465
- workspace: Root directory for command execution
466
- memory_path: Path to memory directory for process registry
467
- bash_timeout: Default timeout for bash commands in seconds
468
-
469
- Returns:
470
- List of execution tool functions
471
- """
472
- workspace = Path(workspace).resolve()
473
- memory_path = Path(memory_path).resolve()
474
-
475
- return [
476
- create_bash_execute_tool(workspace, memory_path, bash_timeout),
477
- create_check_processes_tool(workspace, memory_path),
478
- create_python_repl_tool(workspace),
479
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/harness/maf/tools/memory.py DELETED
@@ -1,260 +0,0 @@
1
- """Memory tool for persistent storage across sessions.
2
-
3
- Provides file-based memory storage allowing agents to store and retrieve
4
- information, patterns, and decisions across conversations.
5
- """
6
-
7
- from collections.abc import Callable, Coroutine
8
- from pathlib import Path
9
- from typing import Annotated, Any, Literal
10
-
11
-
12
- class MemoryBackend:
13
- """File-based memory storage backend with security controls."""
14
-
15
- def __init__(self, base_path: Path) -> None:
16
- """Initialize memory backend."""
17
- self.base_path = Path(base_path).resolve()
18
- self.base_path.mkdir(parents=True, exist_ok=True)
19
-
20
- def _validate_path(self, path: str) -> Path:
21
- """Validate and resolve a memory path."""
22
- # Normalize path (remove /memory prefix if present)
23
- if path.startswith("/memory"):
24
- path = path[len("/memory") :]
25
- path = path.lstrip("/")
26
-
27
- # Handle empty path
28
- if not path:
29
- return self.base_path
30
-
31
- # Resolve to absolute path
32
- full_path = (self.base_path / path).resolve()
33
-
34
- # Security: Ensure path is within base_path
35
- try:
36
- full_path.relative_to(self.base_path)
37
- except ValueError as err:
38
- raise ValueError(f"Access denied: path '{path}' is outside memory directory") from err
39
-
40
- return full_path
41
-
42
- def view(self, path: str, view_range: list[int] | None = None) -> str:
43
- """View directory contents or file contents."""
44
- full_path = self._validate_path(path)
45
-
46
- if not full_path.exists():
47
- return f"Path not found: {path}\nUse 'create' to create new files."
48
-
49
- # Directory listing
50
- if full_path.is_dir():
51
- contents = [f"Directory: {path or '/memory'}"]
52
- items = sorted(full_path.iterdir(), key=lambda x: (x.is_file(), x.name))
53
-
54
- if not items:
55
- contents.append("(empty directory)")
56
- else:
57
- for item in items:
58
- suffix = "/" if item.is_dir() else ""
59
- contents.append(f" - {item.name}{suffix}")
60
-
61
- return "\n".join(contents)
62
-
63
- # File contents
64
- if full_path.is_file():
65
- content = full_path.read_text(encoding="utf-8")
66
- lines = content.splitlines()
67
-
68
- if view_range:
69
- start, end = view_range
70
- start = max(1, start)
71
- end = min(len(lines), end)
72
- lines = lines[start - 1 : end]
73
- numbered_lines = [f"{i + start:5d}: {line}" for i, line in enumerate(lines)]
74
- else:
75
- numbered_lines = [f"{i + 1:5d}: {line}" for i, line in enumerate(lines)]
76
-
77
- return "\n".join(numbered_lines) if numbered_lines else "(empty file)"
78
-
79
- return f"Unknown path type: {path}"
80
-
81
- def create(self, path: str, file_text: str) -> str:
82
- """Create or overwrite a file."""
83
- full_path = self._validate_path(path)
84
- full_path.parent.mkdir(parents=True, exist_ok=True)
85
- full_path.write_text(file_text, encoding="utf-8")
86
- return f"File created successfully at {path}"
87
-
88
- def str_replace(self, path: str, old_str: str, new_str: str) -> str:
89
- """Replace text in a file."""
90
- full_path = self._validate_path(path)
91
-
92
- if not full_path.is_file():
93
- raise FileNotFoundError(f"File not found: {path}")
94
-
95
- content = full_path.read_text(encoding="utf-8")
96
-
97
- if old_str not in content:
98
- raise ValueError(f"Text not found in file: '{old_str[:50]}...'")
99
-
100
- new_content = content.replace(old_str, new_str, 1)
101
- full_path.write_text(new_content, encoding="utf-8")
102
- return f"File {path} has been edited successfully"
103
-
104
- def append(self, path: str, text: str) -> str:
105
- """Append text to end of file."""
106
- full_path = self._validate_path(path)
107
-
108
- if not full_path.exists():
109
- full_path.parent.mkdir(parents=True, exist_ok=True)
110
- full_path.write_text("", encoding="utf-8")
111
-
112
- # Ensure text starts with newline if file isn't empty
113
- if full_path.stat().st_size > 0:
114
- existing = full_path.read_text(encoding="utf-8")
115
- if existing and not existing.endswith("\n"):
116
- text = "\n" + text
117
-
118
- # Ensure text ends with newline
119
- if not text.endswith("\n"):
120
- text += "\n"
121
-
122
- with full_path.open("a", encoding="utf-8") as f:
123
- f.write(text)
124
-
125
- return f"Text appended to {path}"
126
-
127
- def search(self, query: str, path: str = "") -> str:
128
- """Search for text across memory files."""
129
- full_path = self._validate_path(path)
130
-
131
- if not full_path.exists():
132
- return f"Path not found: {path or '/memory'}"
133
-
134
- if not full_path.is_dir():
135
- # Search single file
136
- files = [full_path]
137
- else:
138
- files = list(full_path.rglob("*"))
139
-
140
- matches: list[dict[str, Any]] = []
141
- query_lower = query.lower()
142
-
143
- for file_path in files:
144
- if not file_path.is_file():
145
- continue
146
- try:
147
- content = file_path.read_text(encoding="utf-8")
148
- lines = content.splitlines()
149
-
150
- for line_num, line in enumerate(lines, 1):
151
- if query_lower in line.lower():
152
- rel_path = file_path.relative_to(self.base_path)
153
- matches.append({
154
- "file": str(rel_path),
155
- "line": line_num,
156
- "content": line.strip()[:100],
157
- })
158
- except (UnicodeDecodeError, PermissionError):
159
- continue
160
-
161
- if not matches:
162
- return f"No matches found for '{query}' in {path or '/memory'}"
163
-
164
- result_lines = [f"Found {len(matches)} match(es) for '{query}':\n"]
165
- for match in matches[:50]:
166
- result_lines.append(f" {match['file']}:{match['line']} - {match['content']}")
167
-
168
- if len(matches) > 50:
169
- result_lines.append(f"\n... and {len(matches) - 50} more matches")
170
-
171
- return "\n".join(result_lines)
172
-
173
- def delete(self, path: str) -> str:
174
- """Delete a file or empty directory."""
175
- full_path = self._validate_path(path)
176
-
177
- if not full_path.exists():
178
- raise FileNotFoundError(f"Path not found: {path}")
179
-
180
- if full_path.is_file():
181
- full_path.unlink()
182
- return f"File deleted: {path}"
183
-
184
- if full_path.is_dir():
185
- if any(full_path.iterdir()):
186
- raise ValueError(f"Directory not empty: {path}. Delete contents first.")
187
- full_path.rmdir()
188
- return f"Directory deleted: {path}"
189
-
190
- return f"Unknown path type: {path}"
191
-
192
-
193
- def create_memory_tool(memory_path: Path) -> Callable[..., Coroutine[Any, Any, str]]:
194
- """Create a memory tool bound to a specific memory directory."""
195
- backend = MemoryBackend(memory_path)
196
-
197
- async def memory(
198
- command: Annotated[
199
- Literal["view", "create", "str_replace", "append", "search", "delete"],
200
- "Operation to perform",
201
- ],
202
- path: Annotated[str, "Path to file or directory (e.g., '/memory/patterns/cors.md')"] = "/memory",
203
- file_text: Annotated[str | None, "Content to write (for create)"] = None,
204
- old_str: Annotated[str | None, "Text to find (for str_replace)"] = None,
205
- new_str: Annotated[str | None, "Replacement text (for str_replace)"] = None,
206
- append_text: Annotated[str | None, "Text to append (for append)"] = None,
207
- query: Annotated[str | None, "Search query (for search)"] = None,
208
- view_range: Annotated[list[int] | None, "Line range [start, end] (for view)"] = None,
209
- ) -> str:
210
- """Store and retrieve information in persistent memory.
211
-
212
- Memory persists across conversations - use it to remember patterns,
213
- insights, project context, and decisions.
214
- Operations: view (show directory/file), create (new file),
215
- str_replace (edit file), append (add to file),
216
- search (find text), delete (remove file/dir).
217
- Organize by: /memory/patterns/, /memory/projects/, /memory/decisions/
218
- """
219
- try:
220
- if command == "view":
221
- return backend.view(path, view_range)
222
-
223
- if command == "create":
224
- if file_text is None:
225
- return "Error: 'file_text' is required for create operation"
226
- return backend.create(path, file_text)
227
-
228
- if command == "str_replace":
229
- if old_str is None or new_str is None:
230
- return "Error: 'old_str' and 'new_str' are required for str_replace"
231
- return backend.str_replace(path, old_str, new_str)
232
-
233
- if command == "append":
234
- if append_text is None:
235
- return "Error: 'append_text' is required for append operation"
236
- return backend.append(path, append_text)
237
-
238
- if command == "search":
239
- if query is None:
240
- return "Error: 'query' is required for search operation"
241
- return backend.search(query, path)
242
-
243
- if command == "delete":
244
- return backend.delete(path)
245
-
246
- return f"Error: Unknown command: {command}"
247
-
248
- except Exception as e:
249
- return f"Memory operation failed: {e}"
250
-
251
- # Add tool metadata
252
- memory._tool_name = "memory" # type: ignore[attr-defined]
253
- memory._tool_description = ( # type: ignore[attr-defined]
254
- "Store and retrieve information in persistent memory. "
255
- "Memory persists across conversations - use it to remember patterns, "
256
- "insights, project context, and decisions."
257
- )
258
- memory._is_tool = True # type: ignore[attr-defined]
259
-
260
- return memory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/harness/maf/tools/sub_agent.py DELETED
@@ -1,196 +0,0 @@
1
- """Sub-agent tool for isolated research tasks.
2
-
3
- Provides context isolation by delegating complex research tasks to a
4
- separate agent that operates in its own context window. The sub-agent
5
- processes the request and returns only a concise summary, preventing
6
- context pollution in the main agent.
7
-
8
- This implements the "Isolation" strategy for context engineering:
9
- - Coordinator agent stays lean with minimal context
10
- - Sub-agent can use 30K+ tokens internally for research
11
- - Only the distilled result (200-500 tokens) returns to coordinator
12
- """
13
-
14
- from __future__ import annotations
15
-
16
- import os
17
- from collections.abc import Callable, Coroutine
18
- from pathlib import Path
19
- from typing import Annotated, Any
20
-
21
- # Sub-agent system prompt focused on research and summarization
22
- SUB_AGENT_INSTRUCTIONS = """You are a research assistant that helps with complex information gathering tasks.
23
-
24
- Your role:
25
- 1. Thoroughly research the given topic or question
26
- 2. Gather relevant information from available tools
27
- 3. Synthesize findings into a clear, concise summary
28
- 4. Return ONLY the essential information needed by the requesting agent
29
-
30
- Guidelines:
31
- - Be thorough in your research but concise in your response
32
- - Focus on facts and actionable information
33
- - If you can't find information, say so clearly
34
- - Your response will be passed to another agent, so make it self-contained
35
- - Target 200-500 tokens for your final response unless more detail is explicitly requested
36
-
37
- Do NOT:
38
- - Include conversational fluff or preamble
39
- - Repeat the original question back
40
- - Add disclaimers about your limitations
41
- - Include information that wasn't requested
42
- """
43
-
44
-
45
def create_sub_agent_tool(
    workspace: Path,
    model: str = "gpt-4o-mini",
    endpoint: str | None = None,
    api_key: str | None = None,
    api_version: str = "2024-02-15-preview",
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build the ``research`` tool, which delegates work to a sub-agent.

    The sub-agent is constructed lazily on the first ``research`` call and
    reused afterwards. It is given read-file, list-directory and grep tools
    bound to ``workspace`` plus ``think`` and ``task_done``.

    Args:
        workspace: Workspace directory for the sub-agent's file tools.
        model: Deployment name passed to the Azure OpenAI chat client.
        endpoint: Azure OpenAI endpoint; falls back to the
            ``AZURE_OPENAI_ENDPOINT`` environment variable.
        api_key: Azure OpenAI API key; falls back to the
            ``AZURE_OPENAI_API_KEY`` environment variable.
        api_version: Azure OpenAI API version.

    Returns:
        The async ``research`` function, tagged with tool metadata.
    """
    # Credentials are resolved once, when the tool is created.
    resolved_endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT", "")
    resolved_api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY", "")

    # Populated by _ensure_sub_agent() on first use.
    cached_agent: Any = None

    async def _ensure_sub_agent() -> Any:
        """Return the cached sub-agent, building it on the first call."""
        nonlocal cached_agent
        if cached_agent is not None:
            return cached_agent

        # Imported here rather than at module level (lazy import).
        try:
            from agent_framework import ChatAgent
            from agent_framework.azure import AzureOpenAIChatClient
        except ImportError as e:
            raise ImportError(
                "Microsoft Agent Framework is required for sub-agent. "
                "Install with: pip install agent-framework-core"
            ) from e

        chat_client = AzureOpenAIChatClient(
            api_key=resolved_api_key,
            endpoint=resolved_endpoint,
            deployment=model,
            api_version=api_version,
        )

        from flow.harness.maf.tools.coding import (
            create_grep_search_tool,
            create_list_directory_tool,
            create_read_file_tool,
        )
        from flow.harness.maf.tools.core import task_done, think

        # Minimal, read-oriented tool set for research.
        sub_tools: list[Callable[..., Any]] = [
            create_read_file_tool(workspace),
            create_list_directory_tool(workspace),
            create_grep_search_tool(workspace),
            think,
            task_done,
        ]

        from agent_framework import ai_function

        # Wrap each tool, preferring its attached metadata over the plain
        # function name and docstring.
        converted_tools = [
            ai_function(
                name=getattr(fn, "_tool_name", fn.__name__),
                description=getattr(fn, "_tool_description", fn.__doc__ or ""),
            )(fn)
            for fn in sub_tools
        ]

        cached_agent = ChatAgent(
            name="ResearchAssistant",
            description="Research assistant for complex information gathering",
            instructions=SUB_AGENT_INSTRUCTIONS,
            chat_client=chat_client,
            tools=converted_tools,
        )
        return cached_agent

    async def research(
        task: Annotated[
            str,
            "The research task or question to investigate. Be specific about what information you need.",
        ],
        context: Annotated[
            str | None,
            "Optional context to help the sub-agent understand the broader goal.",
        ] = None,
    ) -> str:
        """Delegate a research task to a sub-agent with isolated context.

        Use this tool when you need to:
        - Research a complex topic that may require multiple steps
        - Gather information without polluting your main context
        - Get a summarized answer to a specific question

        The sub-agent operates in its own context window, so it can
        use many tokens internally while only returning a concise summary.
        This keeps your main context lean and focused.

        Examples:
        - "Find all Python files that import the requests library and summarize their purpose"
        - "Research how authentication is implemented in this codebase"
        - "Analyze the error handling patterns used across the project"
        """
        # Built outside the try block below, so a missing framework surfaces
        # as ImportError instead of a "Research failed" string.
        sub_agent = await _ensure_sub_agent()

        # Optional context goes first, then the task, then the closing cue.
        sections = [f"Context: {context}"] if context else []
        sections += [
            f"Research task: {task}",
            "\nProvide a concise summary of your findings.",
        ]
        full_prompt = "\n\n".join(sections)

        try:
            response = await sub_agent.run(full_prompt)
            # Prefer a `content` attribute when the response exposes one.
            return str(getattr(response, "content", response))
        except Exception as e:
            return f"Research failed: {e}"

    # Metadata attributes attached to the function object.
    research._tool_name = "research"  # type: ignore[attr-defined]
    research._tool_description = (  # type: ignore[attr-defined]
        "Delegate a research task to a sub-agent with isolated context. "
        "The sub-agent can thoroughly investigate a topic using many tool calls "
        "internally, then return only a concise summary. Use this for complex "
        "research that would otherwise pollute your main context."
    )
    research._is_tool = True  # type: ignore[attr-defined]

    return research
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/harness/maf/wrappers.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MAF-specific tool wrappers.
2
+
3
+ This module provides utilities for wrapping shared tools for use with
4
+ Microsoft Agent Framework. The main functionality is now handled by
5
+ the shared adapters in flow.tools.adapters.
6
+
7
+ This module is maintained for backward compatibility.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from collections.abc import Callable, Coroutine
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from flow.tools import Tool, to_maf_tool
18
+ from flow.harness.maf.tools import build_tools as build_maf_tools_impl
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ __all__ = ["build_maf_tools", "wrap_for_maf"]
23
+
24
+
25
def wrap_for_maf(tool: Tool) -> Callable[..., Coroutine[Any, Any, str]]:
    """Convert a Flow ``Tool`` into its Microsoft Agent Framework form.

    The conversion itself is delegated to ``to_maf_tool``; this function
    only validates the argument type first.

    Args:
        tool: A Flow ``Tool`` instance.

    Returns:
        Whatever ``to_maf_tool`` returns for ``tool``.

    Raises:
        ValueError: If ``tool`` is not a ``Tool`` instance.
    """
    if isinstance(tool, Tool):
        return to_maf_tool(tool)
    raise ValueError(f"Expected Tool instance, got {type(tool)}")
43
+
44
+
45
def build_maf_tools(
    tools_spec: dict[str, dict[str, Any]],
    workspace: Path,
    memory_path: Path,
) -> list[Callable[..., Coroutine[Any, Any, str]]]:
    """Build MAF-compatible tools from a specification dict.

    Thin pass-through to ``flow.harness.maf.tools.build_tools``; all three
    arguments are forwarded unchanged.

    Args:
        tools_spec: Dict mapping tool names to their config dicts.
        workspace: Root directory for file operations.
        memory_path: Directory for persistent memory.

    Returns:
        The list produced by the MAF tools module.
    """
    maf_tools = build_maf_tools_impl(tools_spec, workspace, memory_path)
    return maf_tools
src/flow/harness/miniagent/__init__.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MiniAgent harness for Flow - correct context compaction.
2
+
3
+ MiniAgent fixes Agent Framework's broken context compaction by:
4
+ 1. Applying compaction BEFORE each LLM call in the tool loop
5
+ 2. Reassigning the message list (not modifying a copy)
6
+ 3. Supporting token-budget-based strategies
7
+
8
+ ## Usage in Flow
9
+
10
+ from flow.experiments.models import Agent, CompactionConfig
11
+
12
+ agent = Agent(
13
+ name="my-agent",
14
+ framework="miniagent", # Use this harness
15
+ compaction=CompactionConfig.head_tail_tokens(head_ratio=0.2, token_budget=50_000),
16
+ tools="standard",
17
+ )
18
+
19
+ ## Direct Usage
20
+
21
+ from flow.harness.miniagent import ChatAgent, HeadTailStrategy, tools
22
+
23
+ agent = ChatAgent(
24
+ instructions="You are a helpful assistant.",
25
+ tools=tools.coding_tools(),
26
+ context_strategy=HeadTailStrategy(),
27
+ token_budget=100_000,
28
+ )
29
+ response = await agent.run("Find all Python files with TODO comments")
30
+
31
+ ## Context Strategies
32
+
33
+ - NoCompactionStrategy: Baseline (no management)
34
+ - HeadTailStrategy: Keep head (20%) + tail (80%), drop middle (token-aware)
35
+ - SlidingWindowStrategy: Keep system + recent messages within budget
36
+ - SummarizationStrategy: Compress old messages using LLM
37
+
38
+ ## Key Difference from Agent Framework
39
+
40
+ Agent Framework's tool loop:
41
+ prepped_messages = prepare_messages(messages) # Copy made ONCE
42
+ for iteration in range(max_iterations):
43
+ middleware(context) # Modifies a DIFFERENT copy
44
+ response = llm_call(prepped_messages)
45
+ prepped_messages.extend(results) # List grows unbounded
46
+
47
+ MiniAgent's tool loop:
48
+ for iteration in range(max_iterations):
49
+ messages = compact(messages) # Compacted list REPLACES original
50
+ response = llm_call(messages)
51
+ messages.extend(results) # Next iteration will compact again
52
+ """
53
+
54
+ from .agent import ChatAgent, AgentThread, AgentResponse, UsageStats, StreamEvent, StreamEventType
55
+ from .tool import Tool, tool
56
+ from .messages import ChatMessage, ToolCall, ToolResult
57
+ from .context import (
58
+ ContextStrategy,
59
+ NoCompactionStrategy,
60
+ HeadTailStrategy,
61
+ SlidingWindowStrategy,
62
+ SummarizationStrategy,
63
+ )
64
+ from .client import ChatClient, ClientConfig, ChatCompletionResult
65
+ from .hooks import (
66
+ Hooks,
67
+ HookEvent,
68
+ PreToolUseEvent,
69
+ PreToolUseResult,
70
+ PostToolUseEvent,
71
+ PostToolUseResult,
72
+ PreModelCallEvent,
73
+ PostModelCallEvent,
74
+ PreCompactEvent,
75
+ PostCompactEvent,
76
+ AgentStartEvent,
77
+ AgentEndEvent,
78
+ )
79
+ from .instructions import get_instructions, INSTRUCTIONS
80
+ from .workspace import Workspace, get_workspace, set_workspace
81
+ from . import tools
82
+
83
+ # Register with Flow's harness system
84
+ from flow.harness.registry import register
85
+ from .harness import MiniAgentHarness
86
+
87
+ register("miniagent", MiniAgentHarness)
88
+
89
+ __version__ = "0.1.0"
90
+
91
+ __all__ = [
92
+ # Harness
93
+ "MiniAgentHarness",
94
+ # Core
95
+ "ChatAgent",
96
+ "AgentThread",
97
+ "AgentResponse",
98
+ "UsageStats",
99
+ "StreamEvent",
100
+ "StreamEventType",
101
+ # Tools
102
+ "Tool",
103
+ "tool",
104
+ "tools",
105
+ # Messages
106
+ "ChatMessage",
107
+ "ToolCall",
108
+ "ToolResult",
109
+ # Context strategies
110
+ "ContextStrategy",
111
+ "NoCompactionStrategy",
112
+ "HeadTailStrategy",
113
+ "SlidingWindowStrategy",
114
+ "SummarizationStrategy",
115
+ # Client
116
+ "ChatClient",
117
+ "ClientConfig",
118
+ "ChatCompletionResult",
119
+ # Hooks
120
+ "Hooks",
121
+ "HookEvent",
122
+ "PreToolUseEvent",
123
+ "PreToolUseResult",
124
+ "PostToolUseEvent",
125
+ "PostToolUseResult",
126
+ "PreModelCallEvent",
127
+ "PostModelCallEvent",
128
+ "PreCompactEvent",
129
+ "PostCompactEvent",
130
+ "AgentStartEvent",
131
+ "AgentEndEvent",
132
+ # Instructions
133
+ "get_instructions",
134
+ "INSTRUCTIONS",
135
+ # Workspace
136
+ "Workspace",
137
+ "get_workspace",
138
+ "set_workspace",
139
+ ]
src/flow/harness/miniagent/agent.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ChatAgent - the core agent implementation for MiniAgent.
2
+
3
+ This is the CRITICAL module that fixes Agent Framework's broken compaction.
4
+ The key difference: context strategy is called BEFORE each LLM call in the
5
+ tool loop, and the compacted list continues to the next iteration.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Any, AsyncGenerator
10
+ from enum import Enum
11
+ import json
12
+
13
+ from .messages import ChatMessage, ToolCall
14
+ from .tool import Tool
15
+ from .client import ChatClient, ChatCompletionResult
16
+ from .context import ContextStrategy, NoCompactionStrategy
17
+ from .hooks import (
18
+ Hooks,
19
+ PreToolUseEvent,
20
+ PreToolUseResult,
21
+ PostToolUseEvent,
22
+ PostToolUseResult,
23
+ PreModelCallEvent,
24
+ PostModelCallEvent,
25
+ PreCompactEvent,
26
+ PostCompactEvent,
27
+ AgentStartEvent,
28
+ AgentEndEvent,
29
+ )
30
+
31
+
32
class StreamEventType(str, Enum):
    """Kinds of events emitted during run_stream().

    Members are also ``str`` instances, so each compares equal to its
    plain string value.
    """

    # Agent lifecycle: start of a run.
    AGENT_START = "agent_start"

    # Around a model call.
    MODEL_START = "model_start"
    MODEL_END = "model_end"

    # Around a tool invocation.
    TOOL_START = "tool_start"
    TOOL_END = "tool_end"

    # Text content.
    TEXT = "text"

    # Agent lifecycle: end of a run.
    AGENT_END = "agent_end"
41
+
42
+
43
def _dict_factory() -> dict[str, Any]:
    """Return a new, empty dict (typed ``default_factory`` for dataclass fields)."""
    return dict()
45
+
46
def _list_factory() -> list[dict[str, int]]:
    """Return a new, empty list (typed ``default_factory`` for dataclass fields)."""
    return list()
48
+
49
+
50
@dataclass
class StreamEvent:
    """A single event emitted while the agent run is streamed.

    ``type`` identifies the event kind; ``data`` is a free-form payload
    whose keys depend on that kind (see ``__str__`` for the keys read).
    """

    type: StreamEventType
    data: dict[str, Any] = field(default_factory=_dict_factory)

    def __str__(self) -> str:
        """Render a short, human-readable line for ``print(event)``."""
        kind = self.type
        payload = self.data

        if kind == StreamEventType.AGENT_START:
            # Only the first 50 characters of the user message are shown.
            preview = payload.get("user_message", "")[:50]
            return f"🚀 Agent started: {preview}..."

        if kind == StreamEventType.MODEL_START:
            # The stored iteration is displayed one higher (1-based).
            return f"🧠 Model call (iteration {payload.get('iteration', 0) + 1})"

        if kind == StreamEventType.MODEL_END:
            call_usage = payload.get("usage", {})
            input_tokens = call_usage.get("input_tokens", 0)
            tool_info = " → calling tools" if payload.get("has_tool_calls", False) else ""
            return f" ✓ Response ({input_tokens} tokens){tool_info}"

        if kind == StreamEventType.TOOL_START:
            tool_name = payload.get("tool_name", "unknown")
            return f"🔧 Tool: {tool_name}"

        if kind == StreamEventType.TOOL_END:
            tool_name = payload.get("tool_name", "unknown")
            # Tool output is truncated to 100 characters.
            snippet = payload.get("tool_output", "")[:100]
            return f" → {tool_name}: {snippet}..."

        if kind == StreamEventType.TEXT:
            # Text content is truncated to 200 characters.
            text = payload.get("content", "")[:200]
            return f"💬 {text}"

        if kind == StreamEventType.AGENT_END:
            totals = payload.get("usage", {})
            iteration_count = payload.get("iterations", 0)
            tool_count = totals.get("tool_calls", 0)
            token_count = totals.get("total_input_tokens", 0) + totals.get("total_output_tokens", 0)
            return f"✅ Done ({iteration_count} iterations, {tool_count} tool calls, {token_count} tokens)"

        # Fallback for any event type not handled above.
        return f"Event({self.type.value}): {self.data}"

    def __repr__(self) -> str:
        """Return an unambiguous representation showing type value and payload."""
        return f"StreamEvent(type={self.type.value!r}, data={self.data!r})"
91
+
92
+
93
@dataclass
class UsageStats:
    """Token and call counters for an agent run."""

    # Token totals; both start at zero.
    total_input_tokens: int = 0
    total_output_tokens: int = 0

    # Call counters.
    llm_calls: int = 0
    tool_calls: int = 0

    # Per-call records; each instance gets its own list via the factory.
    per_call: list[dict[str, int]] = field(default_factory=_list_factory)
102
+
103
+
104
@dataclass
class AgentResponse:
    """Result object returned by agent.run()."""

    # Final response text; may be None.
    content: str | None

    # Message list associated with the run.
    messages: list[ChatMessage]

    # Usage counters for the run.
    usage: UsageStats

    # Iteration count reported for the run.
    iterations: int
112
+
113
+
114
class AgentThread:
    """Conversation thread with message history.

    Threads allow multi-turn conversations by preserving history
    between agent.run() calls.
    """

    def __init__(self, messages: list[ChatMessage] | None = None):
        """Create a thread, optionally seeded with an existing message list.

        Args:
            messages: Initial history. When given, the thread uses this very
                list object (no copy); when None, a new empty list is used.
        """
        # Identity check on purpose: `messages or []` would discard a
        # caller-supplied *empty* list, so the thread would share the
        # caller's list only when it happened to be non-empty.
        self.messages: list[ChatMessage] = messages if messages is not None else []

    def add(self, message: ChatMessage) -> None:
        """Add a single message to the thread."""
        self.messages.append(message)

    def add_many(self, messages: list[ChatMessage]) -> None:
        """Add multiple messages to the thread."""
        self.messages.extend(messages)

    def clear(self) -> None:
        """Clear all messages from the thread."""
        # Rebinds to a new list rather than clearing in place, so other
        # references to the previous list keep their contents.
        self.messages = []

    def __len__(self) -> int:
        """Return the number of messages in the thread."""
        return len(self.messages)

    def __bool__(self) -> bool:
        """Report the thread as truthy even when it holds no messages."""
        # Without this, __len__ would make an empty thread falsy and
        # `thread or get_new_thread()` would replace it.
        return True
142
+
143
+
144
+ class ChatAgent:
145
+ """Minimal agent with correct context compaction and hooks.
146
+
147
+ The key difference from Agent Framework:
148
+ - Context strategy is called BEFORE each LLM call in the tool loop
149
+ - The compacted messages are used for both the call AND next iteration
150
+ - This ensures cumulative token usage actually decreases with compaction
151
+
152
+ Example:
153
+ from flow.harness.miniagent import ChatAgent, tools
154
+
155
+ agent = ChatAgent(
156
+ instructions="You are a helpful assistant.",
157
+ tools=tools.coding_tools(),
158
+ )
159
+ response = await agent.run("List files in the current directory")
160
+ """
161
+
162
+ DEFAULT_MAX_ITERATIONS = 40
163
+ DEFAULT_TOKEN_BUDGET = 100_000
164
+
165
def __init__(
    self,
    client: ChatClient | None = None,
    instructions: str | None = None,
    tools: list[Tool] | None = None,
    context_strategy: ContextStrategy | None = None,
    token_budget: int = DEFAULT_TOKEN_BUDGET,
    max_iterations: int = DEFAULT_MAX_ITERATIONS,
    hooks: Hooks | None = None,
):
    """Set up the agent's client, tools, context strategy and hooks.

    Args:
        client: Chat client for LLM calls; a default ``ChatClient()`` is
            created when omitted.
        instructions: System prompt for the agent.
        tools: Tools the agent can use; stored as a dict keyed by each
            tool's ``name``.
        context_strategy: Context management strategy; defaults to
            ``NoCompactionStrategy()``.
        token_budget: Maximum tokens for the context window.
        max_iterations: Maximum tool loop iterations.
        hooks: Hook configuration; defaults to an empty ``Hooks()``.
    """
    if not client:
        client = ChatClient()
    self.client = client
    self.instructions = instructions

    # Index tools by name; a later tool with the same name replaces an
    # earlier one.
    tool_index = {}
    for entry in tools or []:
        tool_index[entry.name] = entry
    self.tools = tool_index

    if not context_strategy:
        context_strategy = NoCompactionStrategy()
    self.context_strategy = context_strategy
    self.token_budget = token_budget
    self.max_iterations = max_iterations

    if not hooks:
        hooks = Hooks()
    self.hooks = hooks
193
+
194
+ def get_new_thread(self) -> AgentThread:
195
+ """Create a new conversation thread."""
196
+ return AgentThread()
197
+
198
+ async def run(
199
+ self,
200
+ message: str,
201
+ thread: AgentThread | None = None,
202
+ ) -> AgentResponse:
203
+ """Run the agent on a message (non-streaming).
204
+
205
+ This method delegates to run_stream() and collects the results.
206
+ All logic lives in run_stream() - this is just a convenience wrapper.
207
+
208
+ THE CRITICAL FIX (in run_stream):
209
+ - Messages are compacted BEFORE each LLM call
210
+ - The compacted list is used for both the call AND continues
211
+ - Unlike Agent Framework where prepped_messages grows unbounded
212
+
213
+ Args:
214
+ message: The user message to process.
215
+ thread: Optional thread for conversation continuity.
216
+
217
+ Returns:
218
+ AgentResponse with the result and statistics.
219
+ """
220
+ thread = thread or self.get_new_thread()
221
+
222
+ # Consume the stream and extract final results
223
+ final_content: str | None = None
224
+ iterations: int = 0
225
+ usage_data: dict[str, int] = {}
226
+
227
+ async for event in self.run_stream(message, thread):
228
+ if event.type == StreamEventType.AGENT_END:
229
+ final_content = event.data.get("final_response")
230
+ iterations = event.data.get("iterations", 0)
231
+ usage_data = event.data.get("usage", {})
232
+
233
+ # Build UsageStats from the collected data
234
+ usage = UsageStats(
235
+ llm_calls=usage_data.get("llm_calls", 0),
236
+ tool_calls=usage_data.get("tool_calls", 0),
237
+ total_input_tokens=usage_data.get("total_input_tokens", 0),
238
+ total_output_tokens=usage_data.get("total_output_tokens", 0),
239
+ )
240
+
241
+ return AgentResponse(
242
+ content=final_content,
243
+ messages=thread.messages, # Thread was updated by run_stream
244
+ usage=usage,
245
+ iterations=iterations,
246
+ )
247
+
248
+ async def run_stream(
249
+ self,
250
+ message: str,
251
+ thread: AgentThread | None = None,
252
+ ) -> AsyncGenerator[StreamEvent, None]:
253
+ """Run the agent and yield events as they occur.
254
+
255
+ This is useful for building interactive UIs that need to show
256
+ progress in real-time.
257
+
258
+ Args:
259
+ message: The user message to process.
260
+ thread: Optional thread for conversation continuity.
261
+
262
+ Yields:
263
+ StreamEvent objects for each step of execution.
264
+ """
265
+ thread = thread or self.get_new_thread()
266
+ usage = UsageStats()
267
+
268
+ # Emit AgentStart hook
269
+ await self._emit_agent_start(message, thread)
270
+
271
+ # Emit start event
272
+ yield StreamEvent(
273
+ type=StreamEventType.AGENT_START,
274
+ data={"user_message": message, "thread_length": len(thread)},
275
+ )
276
+
277
+ # Build initial messages
278
+ messages: list[ChatMessage] = []
279
+ if self.instructions:
280
+ messages.append(ChatMessage.system(self.instructions))
281
+ messages.extend(thread.messages)
282
+ user_msg = ChatMessage.user(message)
283
+ messages.append(user_msg)
284
+
285
+ openai_tools = (
286
+ [t.to_openai_tool() for t in self.tools.values()] if self.tools else None
287
+ )
288
+
289
+ final_content: str | None = None
290
+ iteration = 0
291
+
292
+ for iteration in range(self.max_iterations):
293
+ # Apply context strategy
294
+ messages = await self._compact_with_hooks(messages, iteration)
295
+
296
+ # Model call start
297
+ yield StreamEvent(
298
+ type=StreamEventType.MODEL_START,
299
+ data={"iteration": iteration, "message_count": len(messages)},
300
+ )
301
+
302
+ # Emit PreModelCall hook for OTEL tracing
303
+ await self._emit_pre_model_call(messages, iteration)
304
+
305
+ # Make LLM call
306
+ result = await self.client.chat_completion(
307
+ messages=[m.to_openai_format() for m in messages],
308
+ tools=openai_tools,
309
+ )
310
+
311
+ # Track usage
312
+ usage.llm_calls += 1
313
+ usage.total_input_tokens += result.usage["input_tokens"]
314
+ usage.total_output_tokens += result.usage["output_tokens"]
315
+ usage.per_call.append(result.usage)
316
+
317
+ # Emit PostModelCall hook for OTEL tracing
318
+ await self._emit_post_model_call(result, iteration)
319
+
320
+ # Model call end
321
+ yield StreamEvent(
322
+ type=StreamEventType.MODEL_END,
323
+ data={
324
+ "iteration": iteration,
325
+ "usage": result.usage,
326
+ "has_tool_calls": bool(result.tool_calls),
327
+ },
328
+ )
329
+
330
+ # Parse response
331
+ assistant_msg = self._parse_assistant_message(result)
332
+ messages.append(assistant_msg)
333
+
334
+ # Emit text if present
335
+ if assistant_msg.content:
336
+ yield StreamEvent(
337
+ type=StreamEventType.TEXT,
338
+ data={"content": assistant_msg.content},
339
+ )
340
+
341
+ # Check if done
342
+ if not assistant_msg.tool_calls:
343
+ final_content = assistant_msg.content
344
+ break
345
+
346
+ # Execute tools
347
+ should_stop = False
348
+ for tool_call in assistant_msg.tool_calls:
349
+ # Pre-tool hook
350
+ hook_result = await self._emit_pre_tool_use(tool_call, messages, iteration)
351
+
352
+ if hook_result and hook_result.decision == "block":
353
+ tool_msg = ChatMessage.tool(
354
+ tool_call.id,
355
+ f"Tool call blocked: {hook_result.reason or 'No reason provided'}",
356
+ )
357
+ messages.append(tool_msg)
358
+ continue
359
+
360
+ tool_input = json.loads(tool_call.arguments)
361
+ if hook_result and hook_result.decision == "modify" and hook_result.modified_input:
362
+ tool_input = hook_result.modified_input
363
+
364
+ # Tool start
365
+ yield StreamEvent(
366
+ type=StreamEventType.TOOL_START,
367
+ data={"tool_name": tool_call.name, "tool_input": tool_input},
368
+ )
369
+
370
+ # Execute
371
+ tool_result = await self._execute_tool(tool_call.name, tool_input)
372
+ usage.tool_calls += 1
373
+
374
+ # Tool end
375
+ yield StreamEvent(
376
+ type=StreamEventType.TOOL_END,
377
+ data={
378
+ "tool_name": tool_call.name,
379
+ "tool_output": tool_result[:500], # Truncate for streaming
380
+ },
381
+ )
382
+
383
+ # Post-tool hook
384
+ post_result = await self._emit_post_tool_use(
385
+ tool_call, tool_input, tool_result, iteration
386
+ )
387
+
388
+ # Add tool result
389
+ tool_msg = ChatMessage.tool(tool_call.id, tool_result)
390
+ messages.append(tool_msg)
391
+
392
+ if post_result:
393
+ if post_result.additional_context:
394
+ messages.append(ChatMessage.system(post_result.additional_context))
395
+ if post_result.stop_execution:
396
+ should_stop = True
397
+ break
398
+
399
+ if should_stop:
400
+ break
401
+
402
+ # Update thread
403
+ start_idx = 1 if self.instructions else 0
404
+ thread.messages = messages[start_idx:]
405
+
406
+ # Get final content
407
+ if final_content is None:
408
+ for msg in reversed(messages):
409
+ if msg.role == "assistant" and msg.content:
410
+ final_content = msg.content
411
+ break
412
+
413
+ # Emit AgentEnd hook
414
+ await self._emit_agent_end(final_content, iteration + 1, usage)
415
+
416
+ # End event
417
+ yield StreamEvent(
418
+ type=StreamEventType.AGENT_END,
419
+ data={
420
+ "final_response": final_content,
421
+ "iterations": iteration + 1,
422
+ "usage": {
423
+ "total_input_tokens": usage.total_input_tokens,
424
+ "total_output_tokens": usage.total_output_tokens,
425
+ "llm_calls": usage.llm_calls,
426
+ "tool_calls": usage.tool_calls,
427
+ },
428
+ },
429
+ )
430
+
431
+ def _parse_assistant_message(self, result: "ChatCompletionResult") -> ChatMessage:
432
+ """Parse the LLM response into a ChatMessage."""
433
+ tool_calls = None
434
+ if result.tool_calls:
435
+ tool_calls = [
436
+ ToolCall(
437
+ id=tc["id"],
438
+ name=tc["name"],
439
+ arguments=tc["arguments"],
440
+ )
441
+ for tc in result.tool_calls
442
+ ]
443
+
444
+ return ChatMessage.assistant(content=result.content, tool_calls=tool_calls)
445
+
446
+ async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> str:
447
+ """Execute a tool call."""
448
+ tool = self.tools.get(name)
449
+ if not tool:
450
+ return f"Error: Unknown tool '{name}'"
451
+
452
+ try:
453
+ return await tool.invoke(**arguments)
454
+ except Exception as e:
455
+ return f"Error executing {name}: {str(e)}"
456
+
457
+ # === Hook emission methods ===
458
+
459
+ async def _emit_agent_start(self, message: str, thread: AgentThread) -> None:
460
+ """Emit AgentStart event to hooks."""
461
+ event = AgentStartEvent(
462
+ user_message=message,
463
+ thread_message_count=len(thread),
464
+ )
465
+ for hook in self.hooks.agent_start:
466
+ await hook(event)
467
+
468
+ async def _emit_agent_end(
469
+ self, final_response: str | None, iterations: int, usage: UsageStats
470
+ ) -> None:
471
+ """Emit AgentEnd event to hooks."""
472
+ event = AgentEndEvent(
473
+ final_response=final_response,
474
+ total_iterations=iterations,
475
+ total_input_tokens=usage.total_input_tokens,
476
+ total_output_tokens=usage.total_output_tokens,
477
+ tool_calls_made=usage.tool_calls,
478
+ )
479
+ for hook in self.hooks.agent_end:
480
+ await hook(event)
481
+
482
+ async def _emit_pre_model_call(
483
+ self, messages: list[ChatMessage], iteration: int
484
+ ) -> None:
485
+ """Emit PreModelCall event to hooks."""
486
+ event = PreModelCallEvent(
487
+ message_count=len(messages),
488
+ iteration=iteration,
489
+ )
490
+ for hook in self.hooks.pre_model_call:
491
+ await hook(event)
492
+
493
+ async def _emit_post_model_call(self, result: "ChatCompletionResult", iteration: int) -> None:
494
+ """Emit PostModelCall event to hooks."""
495
+ # Extract text content from response
496
+ response_text = result.content or ""
497
+
498
+ event = PostModelCallEvent(
499
+ usage=result.usage,
500
+ iteration=iteration,
501
+ has_tool_calls=bool(result.tool_calls),
502
+ finish_reason=result.finish_reason,
503
+ response_text=response_text,
504
+ )
505
+ for hook in self.hooks.post_model_call:
506
+ await hook(event)
507
+
508
+ async def _emit_pre_tool_use(
509
+ self, tool_call: ToolCall, messages: list[ChatMessage], iteration: int
510
+ ) -> PreToolUseResult | None:
511
+ """Emit PreToolUse event to hooks. Returns combined result."""
512
+ event = PreToolUseEvent(
513
+ tool_name=tool_call.name,
514
+ tool_input=json.loads(tool_call.arguments),
515
+ tool_call_id=tool_call.id,
516
+ iteration=iteration,
517
+ )
518
+
519
+ result: PreToolUseResult | None = None
520
+ for hook in self.hooks.pre_tool_use:
521
+ hook_result = await hook(event)
522
+ if hook_result:
523
+ # First non-allow result wins
524
+ if hook_result.decision != "allow":
525
+ return hook_result
526
+ result = hook_result
527
+
528
+ return result
529
+
530
+ async def _emit_post_tool_use(
531
+ self,
532
+ tool_call: ToolCall,
533
+ tool_input: dict[str, Any],
534
+ tool_output: str,
535
+ iteration: int,
536
+ ) -> PostToolUseResult | None:
537
+ """Emit PostToolUse event to hooks. Returns combined result."""
538
+ error = tool_output if tool_output.startswith("Error") else None
539
+ event = PostToolUseEvent(
540
+ tool_name=tool_call.name,
541
+ tool_input=tool_input,
542
+ tool_output=tool_output,
543
+ tool_call_id=tool_call.id,
544
+ iteration=iteration,
545
+ error=error,
546
+ )
547
+
548
+ combined = PostToolUseResult()
549
+ for hook in self.hooks.post_tool_use:
550
+ hook_result = await hook(event)
551
+ if hook_result:
552
+ if hook_result.additional_context:
553
+ combined.additional_context = hook_result.additional_context
554
+ if hook_result.stop_execution:
555
+ combined.stop_execution = True
556
+ combined.stop_reason = hook_result.stop_reason
557
+
558
+ return combined if (combined.additional_context or combined.stop_execution) else None
559
+
560
+ async def _compact_with_hooks(
561
+ self, messages: list[ChatMessage], iteration: int
562
+ ) -> list[ChatMessage]:
563
+ """Apply context strategy with hooks."""
564
+ # Estimate current tokens (rough)
565
+ current_tokens = sum(
566
+ len(str(m.content or "")) // 4 + 10 for m in messages
567
+ )
568
+
569
+ # Emit PreCompact hook
570
+ pre_event = PreCompactEvent(
571
+ message_count=len(messages),
572
+ current_tokens=current_tokens,
573
+ budget=self.token_budget,
574
+ trigger="auto",
575
+ )
576
+ for hook in self.hooks.pre_compact:
577
+ await hook(pre_event)
578
+
579
+ # Apply context strategy (use async if available for summarization)
580
+ compacted: list[ChatMessage]
581
+ if hasattr(self.context_strategy, "prepare_context_async"):
582
+ # Cast to Any to access optional async method
583
+ strategy: Any = self.context_strategy
584
+ compacted = await strategy.prepare_context_async(
585
+ messages, self.token_budget
586
+ )
587
+ else:
588
+ compacted = self.context_strategy.prepare_context(messages, self.token_budget)
589
+
590
+ # Emit PostCompact hook if something changed
591
+ if len(compacted) != len(messages):
592
+ compacted_tokens = sum(
593
+ len(str(m.content or "")) // 4 + 10 for m in compacted
594
+ )
595
+ post_event = PostCompactEvent(
596
+ messages_before=len(messages),
597
+ messages_after=len(compacted),
598
+ tokens_before=current_tokens,
599
+ tokens_after=compacted_tokens,
600
+ )
601
+ for hook in self.hooks.post_compact:
602
+ await hook(post_event)
603
+
604
+ return compacted
src/flow/harness/miniagent/client.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenAI/Azure OpenAI client wrapper for MiniAgent.
2
+
3
+ Provides a unified interface for both OpenAI and Azure OpenAI APIs.
4
+ Auto-detects configuration from environment variables.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+ import os
10
+
11
+ # Load .env file if present (override=True to prefer .env over shell env)
12
+ try:
13
+ from dotenv import load_dotenv
14
+ load_dotenv(override=True)
15
+ except ImportError:
16
+ pass # dotenv not installed, use existing env vars
17
+
18
+
19
+ @dataclass
20
+ class ClientConfig:
21
+ """Configuration for the chat client.
22
+
23
+ Can be provided explicitly or auto-detected from environment variables.
24
+ """
25
+
26
+ api_key: str
27
+ model: str = "gpt-4o"
28
+ endpoint: str | None = None # For Azure OpenAI
29
+ api_version: str = "2024-02-15-preview" # For Azure OpenAI
30
+ temperature: float = 0.0
31
+ max_tokens: int | None = None
32
+
33
+ @classmethod
34
+ def from_env(cls) -> "ClientConfig":
35
+ """Create config from environment variables.
36
+
37
+ Checks for Azure first, then falls back to OpenAI.
38
+
39
+ Environment variables:
40
+ Azure: AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_DEPLOYMENT
41
+ OpenAI: OPENAI_API_KEY, OPENAI_MODEL
42
+ """
43
+ # Check for Azure OpenAI
44
+ azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
45
+ if azure_endpoint:
46
+ return cls(
47
+ api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
48
+ model=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
49
+ endpoint=azure_endpoint,
50
+ api_version=os.environ.get(
51
+ "AZURE_OPENAI_API_VERSION", "2024-02-15-preview"
52
+ ),
53
+ )
54
+
55
+ # Fall back to OpenAI
56
+ return cls(
57
+ api_key=os.environ.get("OPENAI_API_KEY", ""),
58
+ model=os.environ.get("OPENAI_MODEL", "gpt-4o"),
59
+ )
60
+
61
+
62
+ @dataclass
63
+ class ChatCompletionResult:
64
+ """Result from a chat completion call."""
65
+
66
+ content: str | None
67
+ tool_calls: list[dict[str, Any]] | None
68
+ usage: dict[str, int]
69
+ finish_reason: str | None
70
+ raw_response: Any
71
+
72
+
73
class ChatClient:
    """Async client for OpenAI/Azure OpenAI chat completions.

    Wraps the openai Python SDK and provides a simplified interface.
    """

    # Keyword overrides handled explicitly when building a request; they
    # must not be copied through a second time.
    _HANDLED_OVERRIDES = ("temperature", "tool_choice", "parallel_tool_calls")

    def __init__(self, config: ClientConfig | None = None):
        """Initialize the client.

        Args:
            config: Client configuration. If None, auto-detects from env.
        """
        self.config = config or ClientConfig.from_env()
        self._client = self._create_client()

    def _create_client(self):
        """Create the SDK client that matches the configured endpoint.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        try:
            from openai import AsyncAzureOpenAI, AsyncOpenAI
        except ImportError as exc:
            raise ImportError(
                "openai package is required. Install with: pip install openai"
            ) from exc

        endpoint = self.config.endpoint
        if not endpoint:
            # Standard OpenAI
            return AsyncOpenAI(api_key=self.config.api_key)

        if "/v1" in endpoint:
            # OpenAI-compatible endpoint (e.g. a URL ending in /openai/v1/)
            return AsyncOpenAI(base_url=endpoint, api_key=self.config.api_key)

        # Traditional Azure OpenAI
        return AsyncAzureOpenAI(
            api_key=self.config.api_key,
            azure_endpoint=endpoint,
            api_version=self.config.api_version,
        )

    def _build_request_params(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None,
        overrides: dict[str, Any],
    ) -> dict[str, Any]:
        """Assemble the keyword arguments for ``chat.completions.create``.

        Args:
            messages: Messages in OpenAI format.
            tools: Tools in OpenAI format, or None/empty for a tool-free call.
            overrides: Per-call keyword arguments from the caller.

        Returns:
            The request parameters. Tool options (``tool_choice``,
            ``parallel_tool_calls``) are included only when tools are sent.
        """
        params: dict[str, Any] = {
            "model": self.config.model,
            "messages": messages,
        }

        # Temperature is omitted at the value 1.0 and for "5.2" models,
        # which do not accept the parameter.
        temp = overrides.get("temperature", self.config.temperature)
        if temp is not None and temp != 1.0 and "5.2" not in self.config.model:
            params["temperature"] = temp

        if self.config.max_tokens:
            params["max_tokens"] = self.config.max_tokens

        if tools:
            params["tools"] = tools
            choice = overrides.get("tool_choice")
            params["tool_choice"] = "auto" if choice is None else choice
            parallel = overrides.get("parallel_tool_calls")
            params["parallel_tool_calls"] = True if parallel is None else parallel

        # Remaining overrides pass straight through; None means "not set".
        for key, value in overrides.items():
            if key not in self._HANDLED_OVERRIDES and value is not None:
                params[key] = value

        return params

    async def chat_completion(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        **kwargs: Any,
    ) -> ChatCompletionResult:
        """Make a chat completion request.

        Args:
            messages: List of messages in OpenAI format
            tools: Optional list of tools in OpenAI format
            **kwargs: Additional parameters to pass to the API

        Returns:
            ChatCompletionResult built from the first choice of the response.
        """
        params = self._build_request_params(messages, tools, kwargs)

        response = await self._client.chat.completions.create(**params)  # type: ignore[union-attr]

        choice = response.choices[0]  # type: ignore[index]
        message = choice.message  # type: ignore[union-attr]

        # Flatten SDK tool-call objects into plain dicts of strings.
        tool_calls: list[dict[str, Any]] | None = None
        if message.tool_calls:  # type: ignore[union-attr]
            tool_calls = [
                {
                    "id": str(tc.id),  # type: ignore[union-attr]
                    "name": str(tc.function.name),  # type: ignore[union-attr]
                    "arguments": str(tc.function.arguments),  # type: ignore[union-attr]
                }
                for tc in message.tool_calls  # type: ignore[union-attr]
            ]

        token_usage = response.usage
        return ChatCompletionResult(
            content=str(message.content) if message.content else None,  # type: ignore[union-attr]
            tool_calls=tool_calls,
            usage={
                # A response without usage data counts as zero tokens.
                "input_tokens": token_usage.prompt_tokens if token_usage else 0,  # type: ignore[union-attr]
                "output_tokens": token_usage.completion_tokens if token_usage else 0,  # type: ignore[union-attr]
            },
            finish_reason=str(choice.finish_reason) if choice.finish_reason else None,  # type: ignore[union-attr]
            raw_response=response,
        )
src/flow/harness/miniagent/context.py ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Context strategies for MiniAgent.
2
+
3
+ This is the KEY module that fixes Agent Framework's broken compaction.
4
+ Strategies are called BEFORE each LLM call, and the returned (potentially
5
+ compacted) list continues to the next iteration.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Protocol, Any
10
+ import tiktoken
11
+
12
+ from .messages import ChatMessage
13
+
14
+
15
class ContextStrategy(Protocol):
    """Structural interface implemented by context management strategies.

    The agent invokes the strategy BEFORE every LLM call inside the tool
    loop, giving it the chance to return a different message list.
    """

    def prepare_context(
        self,
        messages: list[ChatMessage],
        token_budget: int,
    ) -> list[ChatMessage]:
        """Return the messages that the next LLM call should use.

        Args:
            messages: The conversation as it currently stands.
            token_budget: Upper bound on context tokens.

        Returns:
            The message list to send, which may be a compacted version.
        """
        ...
37
+
38
+
39
class NoCompactionStrategy:
    """Pass-through baseline that never compacts.

    Context is left to grow without limit, which makes this useful as a
    benchmark reference for how large it gets when nothing manages it.
    """

    def prepare_context(
        self,
        messages: list[ChatMessage],
        token_budget: int,
    ) -> list[ChatMessage]:
        """Hand back the very same list; ``token_budget`` is ignored."""
        return messages
51
+
52
+
53
@dataclass
class HeadTailStrategy:
    """Token-aware head+tail compaction.

    Preserves:
    - Head: System prompt, initial user message (critical context)
    - Tail: Recent tool calls and results (working memory)

    Drops middle messages when over budget, respecting atomic groups
    (tool calls and their results must stay together).

    This is the recommended strategy for most use cases.
    """

    head_ratio: float = 0.2  # 20% for head by default
    model: str = "gpt-4o"
    _encoder: tiktoken.Encoding | None = field(default=None, repr=False)

    # Statistics
    compaction_count: int = field(default=0, repr=False)
    total_tokens_saved: int = field(default=0, repr=False)

    def __post_init__(self):
        """Select the tokenizer for ``model`` (cl100k_base if it is unknown)."""
        try:
            self._encoder = tiktoken.encoding_for_model(self.model)
        except KeyError:
            # Fallback for unknown models
            self._encoder = tiktoken.get_encoding("cl100k_base")

    def _count_tokens(self, messages: list[ChatMessage]) -> int:
        """Count tokens in messages.

        With an encoder: 4 tokens of overhead per message and per tool call,
        plus the encoded length of content, tool names and tool arguments.
        Without one: a rough 4-characters-per-token estimate of content only.
        """
        if not self._encoder:
            return sum(len(str(m.content or "")) // 4 for m in messages)

        encode = self._encoder.encode
        total = 0
        for msg in messages:
            total += 4  # per-message role overhead (approximate)
            if msg.content:
                total += len(encode(msg.content))
            for tc in msg.tool_calls or ():
                total += 4  # per-call overhead
                total += len(encode(tc.name))
                total += len(encode(tc.arguments))
        return total

    def _find_atomic_groups(
        self, messages: list[ChatMessage]
    ) -> list[tuple[int, ...]]:
        """Partition message indices into groups that must stay together.

        OpenAI requires every tool_call to have a corresponding result, so
        an assistant message with tool calls is grouped with its results.

        Each group is a contiguous run of indices and every index appears in
        exactly one group. A message that sits between two results of the
        same call (for example a system note added after the first result)
        therefore travels with the group instead of being left out.

        Returns:
            Groups in message order; each tuple holds ascending indices.
        """
        groups: list[tuple[int, ...]] = []
        total = len(messages)
        i = 0

        while i < total:
            msg = messages[i]

            if not msg.tool_calls:
                groups.append((i,))
                i += 1
                continue

            # Scan ahead until every call id has found its result.
            unanswered = {tc.id for tc in msg.tool_calls}
            last_result = i
            j = i + 1
            while j < total and unanswered:
                candidate = messages[j]
                if candidate.role == "tool" and candidate.tool_call_id in unanswered:
                    unanswered.discard(candidate.tool_call_id)
                    last_result = j
                j += 1

            # Cover the whole span so nothing in between is orphaned.
            groups.append(tuple(range(i, last_result + 1)))
            i = last_result + 1

        return groups

    def prepare_context(
        self,
        messages: list[ChatMessage],
        token_budget: int,
    ) -> list[ChatMessage]:
        """Compact if over budget.

        Args:
            messages: Current messages.
            token_budget: Maximum tokens for context; ``head_ratio`` of it is
                reserved for the head and the remainder for the tail.

        Returns:
            ``messages`` itself when it is empty or within budget; otherwise
            a new list holding the leading groups that fit the head budget
            followed by the trailing groups that fit the tail budget.
        """
        if not messages:
            return messages

        current_tokens = self._count_tokens(messages)
        if current_tokens <= token_budget:
            return messages

        # COMPACTION NEEDED
        self.compaction_count += 1

        groups = self._find_atomic_groups(messages)
        head_budget = int(token_budget * self.head_ratio)
        tail_budget = token_budget - head_budget

        # Head: take whole groups from the start until one does not fit.
        head_count = 0
        head_tokens = 0
        for group in groups:
            cost = self._count_tokens([messages[i] for i in group])
            if head_tokens + cost > head_budget:
                break
            head_tokens += cost
            head_count += 1

        # Tail: take whole groups from the end, never re-using head groups.
        tail_start = len(groups)
        tail_tokens = 0
        for pos in range(len(groups) - 1, head_count - 1, -1):
            cost = self._count_tokens([messages[i] for i in groups[pos]])
            if tail_tokens + cost > tail_budget:
                break
            tail_tokens += cost
            tail_start = pos

        # Groups are ordered and disjoint, so flattening keeps message order.
        kept_groups = groups[:head_count] + groups[tail_start:]
        compacted = [messages[i] for group in kept_groups for i in group]

        # Track savings
        self.total_tokens_saved += current_tokens - self._count_tokens(compacted)

        return compacted
205
+
206
+
207
@dataclass
class SlidingWindowStrategy:
    """Keep only recent messages within budget.

    Always preserves the leading system messages (if present) plus the most
    recent messages that fit in the budget. Respects atomic groups
    (tool calls and their results must stay together).

    Simpler than HeadTailStrategy but may lose important early context.
    """

    model: str = "gpt-4o"
    _encoder: tiktoken.Encoding | None = field(default=None, repr=False)

    def __post_init__(self):
        """Select the tokenizer for ``model`` (cl100k_base if it is unknown)."""
        try:
            self._encoder = tiktoken.encoding_for_model(self.model)
        except KeyError:
            self._encoder = tiktoken.get_encoding("cl100k_base")

    def _count_tokens(self, messages: list[ChatMessage]) -> int:
        """Count tokens in messages.

        With an encoder: 4 tokens of overhead per message and per tool call,
        plus the encoded length of content, tool names and tool arguments.
        Without one: a rough 4-characters-per-token estimate of content only.
        """
        if not self._encoder:
            return sum(len(str(m.content or "")) // 4 for m in messages)

        encode = self._encoder.encode
        total = 0
        for msg in messages:
            total += 4
            if msg.content:
                total += len(encode(msg.content))
            for tc in msg.tool_calls or ():
                total += 4 + len(encode(tc.name))
                total += len(encode(tc.arguments))
        return total

    def _find_atomic_groups(
        self, messages: list[ChatMessage]
    ) -> list[tuple[int, ...]]:
        """Partition message indices into groups that must stay together.

        OpenAI requires every tool_call to have a corresponding result, so
        an assistant message with tool calls is grouped with its results.

        Each group is a contiguous run of indices and every index appears in
        exactly one group. This matters here because prepare_context rebuilds
        its output purely from groups: an index missing from all groups
        would be dropped even when the conversation is within budget.

        Returns:
            Groups in message order; each tuple holds ascending indices.
        """
        groups: list[tuple[int, ...]] = []
        total = len(messages)
        i = 0

        while i < total:
            msg = messages[i]

            if not msg.tool_calls:
                groups.append((i,))
                i += 1
                continue

            # Scan ahead until every call id has found its result.
            unanswered = {tc.id for tc in msg.tool_calls}
            last_result = i
            j = i + 1
            while j < total and unanswered:
                candidate = messages[j]
                if candidate.role == "tool" and candidate.tool_call_id in unanswered:
                    unanswered.discard(candidate.tool_call_id)
                    last_result = j
                j += 1

            # Cover the whole span so nothing in between is orphaned.
            groups.append(tuple(range(i, last_result + 1)))
            i = last_result + 1

        return groups

    def prepare_context(
        self,
        messages: list[ChatMessage],
        token_budget: int,
    ) -> list[ChatMessage]:
        """Keep system message + most recent messages within budget.

        Args:
            messages: Current messages.
            token_budget: Maximum tokens for context.

        Returns:
            ``messages`` itself when empty. Otherwise a new list: the leading
            run of system messages, then the longest suffix of atomic groups
            whose tokens fit in what the system messages leave of the budget.
            Only the system messages are returned when they alone use up the
            budget.
        """
        if not messages:
            return messages

        # Split off the run of system messages at the very start.
        lead = 0
        while lead < len(messages) and messages[lead].role == "system":
            lead += 1
        system_msgs: list[ChatMessage] = list(messages[:lead])
        other_msgs = messages[lead:]

        remaining_budget = token_budget - self._count_tokens(system_msgs)
        if remaining_budget <= 0:
            return system_msgs

        groups = self._find_atomic_groups(other_msgs)

        # Walk backwards, extending the window one whole group at a time.
        window_start = len(groups)
        kept_tokens = 0
        for pos in range(len(groups) - 1, -1, -1):
            cost = self._count_tokens([other_msgs[i] for i in groups[pos]])
            if kept_tokens + cost > remaining_budget:
                break
            kept_tokens += cost
            window_start = pos

        # Groups are ordered and disjoint, so flattening keeps message order.
        recent = [other_msgs[i] for group in groups[window_start:] for i in group]

        return system_msgs + recent
331
+
332
+
333
@dataclass
class SummarizationStrategy:
    """Summarize old messages instead of dropping them.

    When over budget, this strategy:
    1. Keeps: System message + initial user message (head)
    2. Keeps: Most recent messages (tail)
    3. Summarizes: Everything in between into a single "context so far" message

    This preserves critical state (files read, findings, progress) that would
    otherwise be lost with simple truncation strategies.

    The async entry point (``prepare_context_async``) uses an LLM call, which
    adds latency but preserves meaning. The sync entry point
    (``prepare_context``) never calls the LLM and relies on a cheap,
    rule-based extraction instead.
    """

    # Client used for summarization calls. When left as None the strategy
    # silently falls back to the rule-based summary in _extract_key_info.
    client: Any = None  # ChatClient instance

    # Configuration
    head_messages: int = 2  # Keep first N message groups (system + initial user)
    tail_messages: int = 4  # Keep last N message groups (recent context)
    summary_max_tokens: int = 1000  # Max tokens for the summary
    model: str = "gpt-4o"  # Only used here to pick the tiktoken encoding

    # Statistics
    compaction_count: int = field(default=0, repr=False)
    total_tokens_saved: int = field(default=0, repr=False)

    _encoder: tiktoken.Encoding | None = field(default=None, repr=False)

    def __post_init__(self):
        """Pick the tokenizer for ``model``, falling back to ``cl100k_base``."""
        try:
            self._encoder = tiktoken.encoding_for_model(self.model)
        except KeyError:
            self._encoder = tiktoken.get_encoding("cl100k_base")

    def _count_tokens(self, messages: list[ChatMessage]) -> int:
        """Estimate the token cost of ``messages``.

        Each message costs a flat 4 tokens plus its encoded content; each tool
        call costs another 4 plus its encoded name and arguments. Without an
        encoder, a rough ``len(content) // 4`` heuristic is used instead.
        """
        if not self._encoder:
            return sum(len(str(m.content or "")) // 4 for m in messages)

        encoder = self._encoder

        def token_len(text: str) -> int:
            # Tool output and file contents may legitimately contain strings
            # such as "<|endoftext|>". tiktoken raises ValueError on those by
            # default; disallowed_special=() encodes them as ordinary text.
            return len(encoder.encode(text, disallowed_special=()))

        total = 0
        for msg in messages:
            total += 4
            if msg.content:
                total += token_len(msg.content)
            if msg.tool_calls:
                for tc in msg.tool_calls:
                    total += 4 + token_len(tc.name)
                    total += token_len(tc.arguments)
        return total

    def _format_messages_for_summary(self, messages: list[ChatMessage]) -> str:
        """Format messages into text for summarization.

        Assistant text, tool calls (arguments cut to 200 chars), tool results
        (cut to 500 chars) and user text are rendered one per paragraph.
        Messages with any other role are omitted.
        """
        parts: list[str] = []
        for msg in messages:
            if msg.role == "assistant":
                if msg.content:
                    parts.append(f"Assistant: {msg.content}")
                if msg.tool_calls:
                    for tc in msg.tool_calls:
                        parts.append(f"Tool call: {tc.name}({tc.arguments[:200]}...)")
            elif msg.role == "tool":
                # Truncate long tool outputs
                output = msg.content or ""
                if len(output) > 500:
                    output = output[:500] + "... [truncated]"
                parts.append(f"Tool result ({msg.name}): {output}")
            elif msg.role == "user" and msg.content:
                parts.append(f"User: {msg.content}")

        return "\n\n".join(parts)

    async def _generate_summary(
        self, messages: list[ChatMessage], original_task: str = ""
    ) -> str:
        """Generate a summary of the messages using the LLM.

        Falls back to the rule-based ``_extract_key_info`` when no client is
        configured, when the LLM returns empty content, or when the call fails.

        Args:
            messages: The middle messages to summarize
            original_task: The original user task (for context)

        Returns:
            The summary text to embed in the checkpoint message.
        """
        if not self.client:
            return self._extract_key_info(messages)

        content = self._format_messages_for_summary(messages)

        # Extract files that were read (to prevent re-reading)
        files_read = self._extract_files_read(messages)
        files_list = "\n".join(f" - {f}" for f in files_read) if files_read else " (none identified)"

        summary_prompt = f"""You are helping an AI agent that is working on a task but hit context limits.
The agent needs to continue from a summary of what was done so far.

ORIGINAL TASK:
{original_task if original_task else "(not provided)"}

The conversation below shows {len(messages)} messages of work that needs to be summarized.
The agent will continue working after receiving this summary.

CRITICAL: Your summary MUST include:
1. **FILES ALREADY READ** - List EVERY file that was read. The agent must NOT re-read these:
{files_list}

2. **KEY FINDINGS** - What was discovered in each file (brief, 1-2 lines each)

3. **PROGRESS** - What's been accomplished toward the task

4. **WHAT REMAINS** - What still needs to be done to complete the task

Keep summary under {self.summary_max_tokens} tokens. Be specific - vague summaries cause the agent to repeat work.

CONVERSATION TO SUMMARIZE:
{content}

SUMMARY:"""

        try:
            # chat_completion expects messages as dicts, not ChatMessage objects
            # Use max_completion_tokens for newer models, fall back to max_tokens
            response = await self.client.chat_completion(
                messages=[{"role": "user", "content": summary_prompt}],
                max_completion_tokens=self.summary_max_tokens,
            )
            # ChatCompletionResult has .content attribute
            if response.content:
                return response.content
            return self._extract_key_info(messages)
        except Exception as e:
            # Deliberate best-effort: a failed summary call must not abort the
            # agent run, so report the error and degrade to the rule-based summary.
            import sys
            print(f"[SummarizationStrategy] LLM call failed: {e}", file=sys.stderr)
            return self._extract_key_info(messages)

    def _extract_files_read(self, messages: list[ChatMessage]) -> list[str]:
        """Extract list of files that were read from the messages.

        Looks at ``read_file`` / ``Read`` tool calls and pulls the path out of
        their JSON arguments (``path``, ``file_path`` or ``filename``). Calls
        whose arguments are not a JSON object, or carry no string path, are
        skipped.

        Returns:
            Unique paths in first-seen order.
        """
        import json

        files: list[str] = []
        for msg in messages:
            for tc in msg.tool_calls or ():
                if tc.name not in ("read_file", "Read"):
                    continue
                try:
                    args = json.loads(tc.arguments)
                except (TypeError, ValueError):
                    # Not a string, or not valid JSON - nothing to extract.
                    continue
                if not isinstance(args, dict):
                    continue
                path = args.get("path") or args.get("file_path") or args.get("filename")
                if isinstance(path, str) and path:
                    files.append(path)
        return list(dict.fromkeys(files))  # Remove duplicates, preserve order

    def _extract_key_info(self, messages: list[ChatMessage]) -> str:
        """Extract key info without LLM (fallback).

        Reports the files read (taken from the tool-call arguments, so the
        summary names real paths rather than the tool) and up to five short
        (< 200 chars) assistant messages as key points.
        """
        files_read = self._extract_files_read(messages)
        key_findings = [
            msg.content
            for msg in messages
            # Keep short assistant messages as findings
            if msg.role == "assistant" and msg.content and len(msg.content) < 200
        ]

        parts: list[str] = []
        if files_read:
            parts.append(f"Files accessed: {', '.join(files_read)}")
        if key_findings:
            parts.append(f"Key points: {'; '.join(key_findings[:5])}")

        return "\n".join(parts) if parts else "Previous context was processed."

    def prepare_context(
        self,
        messages: list[ChatMessage],
        token_budget: int,
    ) -> list[ChatMessage]:
        """Summarize middle messages if over budget.

        NOTE: This is synchronous but summarization needs async.
        The actual summarization happens in prepare_context_async.
        This method uses a simple fallback for sync contexts.
        """
        if not messages:
            return messages

        current_tokens = self._count_tokens(messages)
        if current_tokens <= token_budget:
            return messages

        # For sync context, use simple extraction (no LLM call)
        return self._compact_with_summary_sync(messages, token_budget)

    def _find_safe_split_points(self, messages: list[ChatMessage]) -> tuple[int, int]:
        """Find safe points to split messages without breaking tool call/result pairs.

        Messages are first grouped so that an assistant message with tool calls
        stays together with everything up to its last matching tool result.
        ``head_messages`` / ``tail_messages`` are then counted in groups.

        Returns (head_end, tail_start) indices where it's safe to summarize between.
        ``(len(messages), len(messages))`` means "nothing can be summarized".
        """
        # Find atomic groups (tool calls must stay with their results)
        groups: list[tuple[int, int]] = []  # (start, end) indices
        i = 0

        while i < len(messages):
            msg = messages[i]
            if msg.tool_calls:
                # Find all results for this tool call
                call_ids = {tc.id for tc in msg.tool_calls}
                end = i
                j = i + 1
                while j < len(messages) and call_ids:
                    if messages[j].role == "tool" and messages[j].tool_call_id in call_ids:
                        call_ids.discard(messages[j].tool_call_id)
                        end = j
                    j += 1
                groups.append((i, end + 1))
                i = end + 1
            else:
                groups.append((i, i + 1))
                i += 1

        # Find safe head end (after self.head_messages worth of groups)
        head_end = 0
        for idx, (_start, end) in enumerate(groups):
            if idx < self.head_messages:
                head_end = end
            else:
                break

        # Find safe tail start (before last self.tail_messages groups)
        # NOTE(review): when there are no more groups than tail_messages the
        # tail stays empty, so everything after the head is summarized.
        tail_start = len(messages)
        tail_groups = min(self.tail_messages, len(groups))
        if tail_groups > 0 and len(groups) > tail_groups:
            tail_start = groups[-tail_groups][0]

        # Ensure we don't overlap
        if head_end >= tail_start:
            # Not enough room - just keep everything
            return len(messages), len(messages)

        return head_end, tail_start

    async def prepare_context_async(
        self,
        messages: list[ChatMessage],
        token_budget: int,
    ) -> list[ChatMessage]:
        """Async version that can use LLM for summarization.

        Returns ``messages`` unchanged when within budget or when there is no
        middle section to summarize; otherwise returns head + one summary
        message + tail and updates the compaction statistics.
        """
        if not messages:
            return messages

        current_tokens = self._count_tokens(messages)
        if current_tokens <= token_budget:
            return messages

        # Find safe split points that don't break tool call/result pairs
        head_end, tail_start = self._find_safe_split_points(messages)

        head = messages[:head_end]
        tail = messages[tail_start:]
        middle = messages[head_end:tail_start]

        if not middle:
            # Nothing to summarize - return as is
            return messages

        # Count only compactions that actually rewrite the context.
        self.compaction_count += 1

        # Extract the original task from the first user message
        original_task = ""
        for msg in head:
            if msg.role == "user" and msg.content:
                original_task = msg.content
                break

        # Generate summary of middle section with task context
        summary_text = await self._generate_summary(middle, original_task)

        # Create a user message that clearly instructs continuation
        # This works better than assistant role because it's a clear directive
        summary_message = ChatMessage(
            role="user",
            content=f"""[CONTEXT CHECKPOINT - Your previous work has been summarized below]

{summary_text}

---
IMPORTANT: The files listed above have ALREADY been read and analyzed.
DO NOT re-read them - that would waste tokens and duplicate work.
Continue from where you left off, completing any remaining items listed in "WHAT REMAINS".
If all files have been read, proceed to generate the final output.""",
        )

        # Build compacted message list
        compacted = head + [summary_message] + tail

        # Track savings
        compacted_tokens = self._count_tokens(compacted)
        self.total_tokens_saved += current_tokens - compacted_tokens

        return compacted

    def _compact_with_summary_sync(
        self, messages: list[ChatMessage], token_budget: int
    ) -> list[ChatMessage]:
        """Synchronous compaction with simple summary extraction.

        ``token_budget`` is accepted for signature parity with the async path
        but is not consulted here; the caller has already decided to compact.
        """
        # Find safe split points that don't break tool call/result pairs
        head_end, tail_start = self._find_safe_split_points(messages)

        head = messages[:head_end]
        tail = messages[tail_start:]
        middle = messages[head_end:tail_start]

        if not middle:
            return messages

        # Count only compactions that actually rewrite the context.
        self.compaction_count += 1

        # Extract key info without LLM
        summary_text = self._extract_key_info(middle)

        summary_message = ChatMessage(
            role="user",
            content=f"[CONTEXT SUMMARY - Previous {len(middle)} messages compressed]\n\n{summary_text}\n\n[END SUMMARY - Continue from here]",
        )

        compacted = head + [summary_message] + tail

        compacted_tokens = self._count_tokens(compacted)
        current_tokens = self._count_tokens(messages)
        self.total_tokens_saved += current_tokens - compacted_tokens

        return compacted
src/flow/harness/miniagent/harness.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MiniAgent harness - implements BaseHarness for Flow integration.
2
+
3
+ This harness adapts MiniAgent's ChatAgent to Flow's harness interface,
4
+ enabling experiments with correct context compaction.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import uuid
11
+ from collections.abc import AsyncIterator
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any
14
+
15
+ from flow.harness.base import BaseHarness, Event, EventType
16
+
17
+ if TYPE_CHECKING:
18
+ from flow.experiments.models import Agent
19
+ from flow.llm import LLMClientConfig
20
+
21
+ from .agent import ChatAgent, AgentThread, StreamEvent, StreamEventType
22
+ from .context import (
23
+ ContextStrategy,
24
+ NoCompactionStrategy,
25
+ HeadTailStrategy,
26
+ SlidingWindowStrategy,
27
+ SummarizationStrategy,
28
+ )
29
+ from .client import ChatClient
30
+ from .otel import enable_instrumentation
31
+ from .instructions import get_instructions
32
+
33
+ from flow.tools import Tool
34
+
35
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Enable instrumentation on module load (like MAF does).
# NOTE: this is an import-time side effect - importing the harness is enough
# to trigger it.
enable_instrumentation()
39
+
40
+
41
+ class MiniAgentHarness(BaseHarness):
42
+ """Harness adapter for MiniAgent.
43
+
44
+ This adapter:
45
+ 1. Maps Flow's Agent spec to MiniAgent's ChatAgent
46
+ 2. Maps CompactionConfig to ContextStrategy
47
+ 3. Converts StreamEvents to Flow Events
48
+ 4. Injects OTEL hooks for trace collection
49
+
50
+ Example:
51
+ >>> from flow.harness.miniagent import MiniAgentHarness
52
+ >>> from flow.experiments.models import Agent, CompactionConfig
53
+ >>>
54
+ >>> agent = Agent(
55
+ ... name="test",
56
+ ... framework="miniagent",
57
+ ... compaction=CompactionConfig.head_tail_tokens(0.2, 50_000),
58
+ ... )
59
+ >>> harness = MiniAgentHarness.from_agent(agent, workspace=Path("/tmp"))
60
+ >>> async for event in harness.run_stream("Hello"):
61
+ ... print(event)
62
+ """
63
+
64
+ @classmethod
65
+ def from_agent(
66
+ cls,
67
+ agent: "Agent",
68
+ workspace: Path,
69
+ llm_config: "LLMClientConfig | None" = None,
70
+ ) -> "MiniAgentHarness":
71
+ """Create a MiniAgentHarness from an Agent definition.
72
+
73
+ Args:
74
+ agent: The Agent spec defining the configuration
75
+ workspace: Working directory for the agent
76
+ llm_config: Optional LLM configuration (falls back to env vars if not provided)
77
+
78
+ Returns:
79
+ A configured MiniAgentHarness instance
80
+ """
81
+ from flow.experiments.models import resolve_tools
82
+
83
+ # 1. Map CompactionConfig → ContextStrategy
84
+ context_strategy = cls._create_context_strategy(agent)
85
+
86
+ # 2. Build tools from spec
87
+ tools_spec = resolve_tools(agent.tools)
88
+ tools = cls._build_tools(tools_spec, workspace)
89
+
90
+ # 3. Create OTEL hooks for trace collection
91
+ from .otel import create_otel_hooks
92
+ otel_hooks = create_otel_hooks(model=agent.model or "gpt-4o")
93
+
94
+ # 4. Create ChatClient from LLM config or env
95
+ from .client import ClientConfig
96
+ if llm_config is not None:
97
+ # Use provided LLM config
98
+ config = cls._create_client_config_from_llm_config(llm_config)
99
+ else:
100
+ # Fall back to env vars
101
+ config = ClientConfig.from_env()
102
+ if agent.model:
103
+ config.model = agent.model
104
+
105
+ chat_client = ChatClient(config)
106
+
107
+ # Resolve instructions: explicit > preset > default "coding"
108
+ if agent.instructions:
109
+ instructions = agent.instructions
110
+ elif agent.instructions_preset:
111
+ instructions = get_instructions(agent.instructions_preset)
112
+ else:
113
+ instructions = get_instructions("coding")
114
+
115
+ chat_agent = ChatAgent(
116
+ client=chat_client,
117
+ instructions=instructions,
118
+ tools=tools,
119
+ context_strategy=context_strategy,
120
+ token_budget=agent.compaction.token_budget,
121
+ hooks=otel_hooks,
122
+ )
123
+
124
+ return cls(chat_agent, workspace)
125
+
126
+ @classmethod
127
+ def _create_client_config_from_llm_config(
128
+ cls, llm_config: "LLMClientConfig"
129
+ ) -> "ClientConfig":
130
+ """Create MiniAgent ClientConfig from Flow LLMClientConfig.
131
+
132
+ Args:
133
+ llm_config: Flow's LLM client configuration
134
+
135
+ Returns:
136
+ MiniAgent ClientConfig
137
+ """
138
+ from flow.llm import LLMProvider
139
+ from .client import ClientConfig
140
+
141
+ match llm_config.provider:
142
+ case LLMProvider.AZURE_OPENAI:
143
+ if not llm_config.azure_openai:
144
+ raise ValueError("azure_openai config required for Azure OpenAI provider")
145
+ return ClientConfig(
146
+ api_key=llm_config.azure_openai.get_api_key(),
147
+ model=llm_config.azure_openai.deployment,
148
+ endpoint=llm_config.azure_openai.get_endpoint(),
149
+ api_version=llm_config.azure_openai.api_version,
150
+ )
151
+
152
+ case LLMProvider.OPENAI:
153
+ if not llm_config.openai:
154
+ raise ValueError("openai config required for OpenAI provider")
155
+ return ClientConfig(
156
+ api_key=llm_config.openai.get_api_key(),
157
+ model=llm_config.openai.model_id,
158
+ endpoint=llm_config.openai.base_url,
159
+ )
160
+
161
+ case LLMProvider.CUSTOM:
162
+ if not llm_config.custom:
163
+ raise ValueError("custom config required for custom provider")
164
+ return ClientConfig(
165
+ api_key=llm_config.custom.get_api_key(),
166
+ model=llm_config.custom.model_id,
167
+ endpoint=llm_config.custom.base_url,
168
+ )
169
+
170
+ case _:
171
+ raise ValueError(
172
+ f"MiniAgent does not support provider: {llm_config.provider.value}. "
173
+ f"Supported: openai, azure_openai, custom"
174
+ )
175
+
176
+ @classmethod
177
+ def _create_context_strategy(cls, agent: "Agent") -> ContextStrategy:
178
+ """Map Flow's CompactionConfig to MiniAgent's ContextStrategy."""
179
+ config = agent.compaction
180
+
181
+ match config.strategy:
182
+ case "none":
183
+ return NoCompactionStrategy()
184
+
185
+ case "head_tail":
186
+ # Legacy message-count based → convert to ratio
187
+ total = config.head_size + config.tail_size
188
+ ratio = config.head_size / total if total > 0 else 0.2
189
+ return HeadTailStrategy(head_ratio=ratio)
190
+
191
+ case "head_tail_tokens":
192
+ return HeadTailStrategy(
193
+ head_ratio=config.params.get("head_ratio", 0.2)
194
+ )
195
+
196
+ case "sliding_window":
197
+ return SlidingWindowStrategy()
198
+
199
+ case "summarization":
200
+ # SummarizationStrategy needs a client for LLM calls
201
+ return SummarizationStrategy(
202
+ client=ChatClient(),
203
+ head_messages=config.params.get("head_messages", 2),
204
+ tail_messages=config.params.get("tail_messages", 4),
205
+ summary_max_tokens=config.params.get("summary_max_tokens", 1000),
206
+ )
207
+
208
+ case "last_n":
209
+ # Map to sliding window as closest equivalent
210
+ return SlidingWindowStrategy()
211
+
212
+ case _:
213
+ logger.warning(f"Unknown compaction strategy: {config.strategy}, using none")
214
+ return NoCompactionStrategy()
215
+
216
+ @classmethod
217
+ def _build_tools(cls, tools_spec: dict[str, dict[str, Any]], workspace: Path) -> list[Tool]:
218
+ """Build MiniAgent Tools from Flow tool spec.
219
+
220
+ Uses the shared tools from flow.tools, setting up the workspace
221
+ for tools that need persistent state.
222
+
223
+ Args:
224
+ tools_spec: Dict mapping tool names to their configs
225
+ workspace: Working directory for tools
226
+
227
+ Returns:
228
+ List of Tool instances
229
+ """
230
+ # Import shared tools
231
+ from flow.tools import (
232
+ # Coding
233
+ read_file, write_file, edit_file, multi_edit, glob_files, grep, ls,
234
+ # Execution
235
+ bash, check_processes, python_repl,
236
+ # Planning
237
+ think, todo_write, todo_read,
238
+ # Memory
239
+ memory, create_memory_tool,
240
+ # Web
241
+ web_search, web_fetch,
242
+ # Notebooks
243
+ notebook_edit, notebook_read,
244
+ # Skills
245
+ skills, create_skills_tool,
246
+ # Sub-agent
247
+ task, create_task_tool,
248
+ # Workspace management
249
+ set_workspace, Workspace,
250
+ )
251
+
252
+ # Set workspace for tools that need it (memory, todos, etc.)
253
+ set_workspace(Workspace(workspace))
254
+
255
+ # Map tool names → Tool instances
256
+ tool_map: dict[str, Tool] = {
257
+ # Coding/Filesystem
258
+ "read_file": read_file,
259
+ "write_file": write_file,
260
+ "edit_file": edit_file,
261
+ "multi_edit": multi_edit,
262
+ "glob_files": glob_files,
263
+ "ls": ls,
264
+ "grep": grep,
265
+ # Execution
266
+ "bash": bash,
267
+ "check_processes": check_processes,
268
+ "python_repl": python_repl,
269
+ # Planning
270
+ "think": think,
271
+ "todo_write": todo_write,
272
+ "todo_read": todo_read,
273
+ # Web
274
+ "web_search": web_search,
275
+ "web_fetch": web_fetch,
276
+ # Notebooks
277
+ "notebook_edit": notebook_edit,
278
+ "notebook_read": notebook_read,
279
+ # Memory (default instance)
280
+ "memory": memory,
281
+ # Skills (default instance)
282
+ "skills": skills,
283
+ # Task/sub-agent (default instance)
284
+ "task": task,
285
+ }
286
+
287
+ tools: list[Tool] = []
288
+
289
+ for name, config in tools_spec.items():
290
+ if name in tool_map:
291
+ tools.append(tool_map[name])
292
+ elif name == "task" and config:
293
+ # Task tool with custom config
294
+ tools.append(create_task_tool(
295
+ coordinator_tools=list(tool_map.values()),
296
+ model=config.get("model"),
297
+ ))
298
+ else:
299
+ logger.warning(f"Unknown tool: {name}")
300
+
301
+ return tools
302
+
303
+ def __init__(self, agent: ChatAgent, workspace: Path) -> None:
304
+ """Initialize the harness.
305
+
306
+ Args:
307
+ agent: The MiniAgent ChatAgent instance
308
+ workspace: Working directory
309
+ """
310
+ self._agent = agent
311
+ self._workspace = workspace
312
+ self._thread: AgentThread | None = None
313
+ self._thread_id: str | None = None
314
+
315
+ async def run_stream(self, task: str) -> AsyncIterator[Event]:
316
+ """Run a task with streaming events.
317
+
318
+ Args:
319
+ task: The task/prompt to execute
320
+
321
+ Yields:
322
+ Event objects representing agent activity
323
+ """
324
+ if self._thread is None:
325
+ self._thread = self._agent.get_new_thread()
326
+
327
+ try:
328
+ async for event in self._agent.run_stream(task, thread=self._thread):
329
+ flow_event = self._convert_event(event)
330
+ if flow_event:
331
+ yield flow_event
332
+
333
+ yield Event(type=EventType.DONE)
334
+
335
+ except Exception as e:
336
+ logger.exception(f"Error in MiniAgent execution: {e}")
337
+ yield Event(type=EventType.ERROR, content=str(e))
338
+
339
+ def _convert_event(self, event: StreamEvent) -> Event | None:
340
+ """Convert a MiniAgent StreamEvent to a Flow Event.
341
+
342
+ Args:
343
+ event: StreamEvent from MiniAgent
344
+
345
+ Returns:
346
+ Flow Event or None if no conversion needed
347
+ """
348
+ match event.type:
349
+ case StreamEventType.AGENT_START:
350
+ # Could emit a thinking event
351
+ return None
352
+
353
+ case StreamEventType.MODEL_START:
354
+ return Event(
355
+ type=EventType.THINKING,
356
+ content=f"Iteration {event.data.get('iteration', 0) + 1}",
357
+ )
358
+
359
+ case StreamEventType.MODEL_END:
360
+ # Token usage tracked via OTEL, no event needed
361
+ return None
362
+
363
+ case StreamEventType.TOOL_START:
364
+ return Event(
365
+ type=EventType.TOOL_CALL_START,
366
+ tool_name=str(event.data.get("tool_name", "")),
367
+ )
368
+
369
+ case StreamEventType.TOOL_END:
370
+ return Event(
371
+ type=EventType.TOOL_RESULT,
372
+ content=str(event.data.get("tool_output", ""))[:1000], # Truncate
373
+ tool_name=str(event.data.get("tool_name", "")),
374
+ )
375
+
376
+ case StreamEventType.TEXT:
377
+ content = event.data.get("content", "")
378
+ if content:
379
+ return Event(type=EventType.TEXT_DELTA, content=str(content))
380
+ return None
381
+
382
+ case StreamEventType.AGENT_END:
383
+ # Don't include content - it was already streamed via TEXT events
384
+ # TEXT_DONE just signals completion
385
+ return Event(type=EventType.TEXT_DONE, content="")
386
+
387
+ case _:
388
+ return None
389
+
390
+ def get_thread_id(self) -> str:
391
+ """Get the current thread ID.
392
+
393
+ Returns:
394
+ The current conversation thread ID
395
+ """
396
+ if self._thread_id is None:
397
+ self._thread_id = str(uuid.uuid4())
398
+ return self._thread_id
399
+
400
+ async def close(self) -> None:
401
+ """Clean up resources used by the harness."""
402
+ self._thread = None
403
+ self._thread_id = None
src/flow/harness/miniagent/hooks.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hook types and event definitions for MiniAgent.
2
+
3
+ Inspired by Claude Agent SDK's hooks system. Hooks allow applications to:
4
+ - Observe: Monitor what's happening (logging, metrics)
5
+ - Modify: Change inputs, inject context
6
+ - Control: Block tool calls, stop execution
7
+ """
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Callable, Awaitable, Literal
11
+ from enum import Enum
12
+
13
+
14
class HookEvent(str, Enum):
    """Names of every hook event.

    Members are also plain strings, so they compare equal to their values.
    """

    # Tool execution
    PRE_TOOL_USE = "pre_tool_use"
    POST_TOOL_USE = "post_tool_use"

    # Model (LLM) calls
    PRE_MODEL_CALL = "pre_model_call"
    POST_MODEL_CALL = "post_model_call"

    # Context compaction
    PRE_COMPACT = "pre_compact"
    POST_COMPACT = "post_compact"

    # Agent run lifecycle
    AGENT_START = "agent_start"
    AGENT_END = "agent_end"
25
+
26
+
27
+ # === Event Data Classes ===
28
+
29
+
30
@dataclass
class PreToolUseEvent:
    """Payload delivered to hooks before a tool runs.

    A hook receiving this event may inspect the pending call and, through its
    return value, block or modify it.
    """

    tool_name: str  # Name of the tool about to run
    tool_input: dict[str, Any]  # Arguments for the tool call
    tool_call_id: str  # Identifier of this tool call
    iteration: int  # presumably the agent-loop iteration - confirm against ChatAgent
41
+
42
+
43
@dataclass
class PreToolUseResult:
    """Verdict returned by a PreToolUse hook.

    Determines whether the pending tool call goes ahead.
    """

    # "allow" is the default verdict.
    decision: Literal["allow", "block", "modify"] = "allow"
    # Shown to model if blocked
    reason: str | None = None
    # If decision="modify"
    modified_input: dict[str, Any] | None = None
53
+
54
+
55
@dataclass
class PostToolUseEvent:
    """Payload delivered to hooks after a tool has run.

    A hook receiving this event may, through its return value, inject extra
    context or stop execution.
    """

    tool_name: str  # Name of the tool that ran
    tool_input: dict[str, Any]  # Arguments the tool was called with
    tool_output: str  # Output produced by the tool
    tool_call_id: str  # Identifier of this tool call
    iteration: int  # presumably the agent-loop iteration - confirm against ChatAgent
    error: str | None = None  # presumably set when the tool failed - confirm
68
+
69
+
70
@dataclass
class PostToolUseResult:
    """Outcome returned by a PostToolUse hook."""

    # Injected into next message
    additional_context: str | None = None
    # Request that execution stops; stop_reason optionally explains why
    stop_execution: bool = False
    stop_reason: str | None = None
77
+
78
+
79
@dataclass
class PreModelCallEvent:
    """Payload delivered to hooks before an LLM call.

    Intended for logging, metrics, or inspecting the context.
    """

    message_count: int  # Number of messages in the context
    iteration: int  # presumably the agent-loop iteration - confirm against ChatAgent
    estimated_tokens: int | None = None  # Token estimate, when one is available
89
+
90
+
91
@dataclass
class PostModelCallEvent:
    """Payload delivered to hooks after an LLM call.

    Carries usage information and the model's text response.
    """

    usage: dict[str, int]  # Usage counters keyed by name (e.g. 'input_tokens')
    iteration: int  # presumably the agent-loop iteration - confirm against ChatAgent
    has_tool_calls: bool  # Whether the response contained tool calls
    finish_reason: str | None = None
    response_text: str = ""  # The model's text response (non-tool content)
103
+
104
+
105
@dataclass
class PreCompactEvent:
    """Payload delivered to hooks before context compaction.

    Lets observers monitor when compaction is triggered.
    """

    message_count: int  # Messages in the context before compaction
    current_tokens: int  # Token count before compaction
    budget: int  # Token budget the context is held to
    trigger: Literal["auto", "manual"]  # What initiated the compaction
116
+
117
+
118
@dataclass
class PostCompactEvent:
    """Payload delivered to hooks after context compaction.

    Reports message and token counts from before and after the compaction.
    """

    messages_before: int
    messages_after: int
    tokens_before: int
    tokens_after: int
129
+
130
+
131
@dataclass
class AgentStartEvent:
    """Payload delivered to hooks when agent.run() starts."""

    user_message: str  # The user message that starts this run
    thread_message_count: int  # Number of messages in the thread
137
+
138
+
139
@dataclass
class AgentEndEvent:
    """Payload delivered to hooks when agent.run() completes."""

    final_response: str | None  # Final response text; may be None
    total_iterations: int
    total_input_tokens: int
    total_output_tokens: int
    tool_calls_made: int
148
+
149
+
150
+ # === Hook Type Aliases ===
151
+
152
# Every hook is an async callable that takes the matching event object.
# Only the two tool hooks may return a result object (or None); all other
# hooks return nothing.
PreToolUseHook = Callable[[PreToolUseEvent], Awaitable[PreToolUseResult | None]]
PostToolUseHook = Callable[[PostToolUseEvent], Awaitable[PostToolUseResult | None]]

PreModelCallHook = Callable[[PreModelCallEvent], Awaitable[None]]
PostModelCallHook = Callable[[PostModelCallEvent], Awaitable[None]]

PreCompactHook = Callable[[PreCompactEvent], Awaitable[None]]
PostCompactHook = Callable[[PostCompactEvent], Awaitable[None]]

AgentStartHook = Callable[[AgentStartEvent], Awaitable[None]]
AgentEndHook = Callable[[AgentEndEvent], Awaitable[None]]
160
+
161
+
162
# Typed default factories for the Hooks dataclass: each call returns a new,
# empty list annotated with the matching hook type.


def _pre_tool_use_factory() -> "list[PreToolUseHook]":
    """Return a new empty list of pre-tool-use hooks."""
    return list()


def _post_tool_use_factory() -> "list[PostToolUseHook]":
    """Return a new empty list of post-tool-use hooks."""
    return list()


def _pre_model_call_factory() -> "list[PreModelCallHook]":
    """Return a new empty list of pre-model-call hooks."""
    return list()


def _post_model_call_factory() -> "list[PostModelCallHook]":
    """Return a new empty list of post-model-call hooks."""
    return list()


def _pre_compact_factory() -> "list[PreCompactHook]":
    """Return a new empty list of pre-compact hooks."""
    return list()


def _post_compact_factory() -> "list[PostCompactHook]":
    """Return a new empty list of post-compact hooks."""
    return list()


def _agent_start_factory() -> "list[AgentStartHook]":
    """Return a new empty list of agent-start hooks."""
    return list()


def _agent_end_factory() -> "list[AgentEndHook]":
    """Return a new empty list of agent-end hooks."""
    return list()
185
+
186
+
187
@dataclass
class Hooks:
    """Hook configuration for ChatAgent.

    Every field is a list of async callables for one event and defaults to an
    empty list, so all of them are optional. Multiple hooks can be registered
    for each event - they are called in order.

    Example:
        async def log_tokens(event: PostModelCallEvent) -> None:
            print(f"Used {event.usage['input_tokens']} input tokens")

        hooks = Hooks(post_model_call=[log_tokens])
        agent = ChatAgent(hooks=hooks)
    """

    # Tool execution
    pre_tool_use: "list[PreToolUseHook]" = field(default_factory=_pre_tool_use_factory)
    post_tool_use: "list[PostToolUseHook]" = field(default_factory=_post_tool_use_factory)

    # Model (LLM) calls
    pre_model_call: "list[PreModelCallHook]" = field(default_factory=_pre_model_call_factory)
    post_model_call: "list[PostModelCallHook]" = field(default_factory=_post_model_call_factory)

    # Context compaction
    pre_compact: "list[PreCompactHook]" = field(default_factory=_pre_compact_factory)
    post_compact: "list[PostCompactHook]" = field(default_factory=_post_compact_factory)

    # Agent run lifecycle
    agent_start: "list[AgentStartHook]" = field(default_factory=_agent_start_factory)
    agent_end: "list[AgentEndHook]" = field(default_factory=_agent_end_factory)
src/flow/harness/miniagent/instructions.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System instructions for MiniAgent.
2
+
3
+ Provides well-crafted system prompts that guide the agent on:
4
+ - How to use each tool effectively
5
+ - When to use todo_write for task tracking
6
+ - When to delegate with the task tool
7
+ - Best practices for coding tasks
8
+
9
+ These instructions are informed by Claude Code's system prompt design.
10
+ """
11
+
12
+ # =============================================================================
13
+ # Shared instruction blocks - reused across presets
14
+ # =============================================================================
15
+
16
+ TASK_COMPLETION_INSTRUCTIONS = """
17
+ ## CRITICAL: Task Completion
18
+
19
+ You MUST complete tasks fully and autonomously.
20
+
21
+ ### Execution Rules
22
+ - DO NOT ask "Should I proceed?" or "Would you like me to continue?"
23
+ - DO NOT propose alternatives or phased approaches when the task is clear
24
+ - DO NOT decline tasks due to scope concerns - break them into steps instead
25
+ - DO NOT stop halfway - keep iterating until the task is DONE
26
+ - If something fails, try alternatives automatically
27
+
28
+ ### Use Todos for Multi-Step Tasks
29
+ For any task with 3+ steps:
30
+ 1. Create a todo list with `todo_write` at the START
31
+ 2. Mark tasks `in_progress` as you work on them (only ONE at a time)
32
+ 3. Mark tasks `completed` IMMEDIATELY when done
33
+ 4. Keep working until ALL todos are checked off
34
+ 5. Do NOT finish until the todo list shows everything complete
35
+
36
+ ### Iteration Until Success
37
+ - If a step fails, diagnose and retry with a different approach
38
+ - If you hit a blocker, note it and move to the next step
39
+ - Circle back to unfinished items
40
+ - The task is NOT done until all requirements are met
41
+ """
42
+
43
+ EFFICIENCY_INSTRUCTIONS = """
44
+ ## Efficiency Guidelines
45
+
46
+ ### Batch Tool Calls
47
+ Call ALL independent tools in a SINGLE response:
48
+ - Read 5 files? Call read_file 5 times in one response.
49
+ - Search multiple patterns? Call grep multiple times in one response.
50
+ - List directories and read files? Call both in one response.
51
+
52
+ ### Read Files Fully
53
+ Read ENTIRE files (default limit 2000 lines). Do NOT chunk files into small pieces (40-60 lines) - this wastes API calls and context.
54
+
55
+ ### Search Then Batch Read
56
+ 1. Use glob_files or grep to find relevant files
57
+ 2. Read ALL matching files in a single response
58
+ """
59
+
60
+ BEST_PRACTICES_INSTRUCTIONS = """
61
+ ## Best Practices
62
+
63
+ ### Before Editing
64
+ NEVER edit a file you haven't read. Always use `read_file` first.
65
+
66
+ ### Follow Existing Patterns
67
+ Before writing new code, examine neighboring files to understand:
68
+ - Naming conventions
69
+ - Import style
70
+ - Error handling patterns
71
+ - Framework usage
72
+
73
+ ### Don't Over-Engineer
74
+ - Solve the current problem, not hypothetical future ones
75
+ - Prefer editing existing files over creating new ones
76
+ - NEVER proactively create documentation files unless explicitly requested
77
+ - Don't add features beyond what was asked
78
+
79
+ ### Verify Dependencies
80
+ Never assume libraries exist. Check package.json, requirements.txt, or equivalent before importing.
81
+
82
+ ### Security
83
+ - Refuse to write code that could be used maliciously
84
+ - Never expose secrets, API keys, or credentials in code
85
+ - If files seem related to malware, refuse to help
86
+ """
87
+
88
+ # =============================================================================
89
+ # Preset-specific instructions
90
+ # =============================================================================
91
+
92
+ CODING_AGENT_INSTRUCTIONS = f"""You are an expert coding assistant. You help users with software engineering tasks including writing code, debugging, refactoring, and explaining code.
93
+
94
+ ## Response Style
95
+
96
+ - Be concise and direct in explanations, but thorough in execution.
97
+ - Use GitHub-flavored markdown for formatting.
98
+ - When referencing code, use the pattern `file_path:line_number` (e.g., `src/utils.py:42`).
99
+ - Don't add unnecessary preamble or postamble. Get to work.
100
+ - Only use emojis if explicitly requested.
101
+ {TASK_COMPLETION_INSTRUCTIONS}
102
+ ## Tool Usage
103
+
104
+ ### File Operations
105
+ - **read_file**: Read file contents with line numbers. Always read before editing.
106
+ - **write_file**: Create new files or completely replace file contents.
107
+ - **edit_file**: Make targeted edits by replacing specific text (must be unique in file).
108
+ - **multi_edit**: Make multiple edits to a file atomically (all succeed or all fail).
109
+ - **glob_files**: Find files by pattern (e.g., `**/*.py`, `src/**/*.ts`).
110
+ - **grep**: Search file contents with regex. Returns matching lines with context.
111
+ - **ls**: List directory contents.
112
+
113
+ ### Execution
114
+ - **bash**: Execute shell commands. Use for git, running tests, installing packages.
115
+
116
+ ### Planning
117
+ - **think**: Reason through complex problems before acting.
118
+ - **todo_write**: Track progress on multi-step tasks. USE THIS FREQUENTLY.
119
+ - **todo_read**: Check current task status.
120
+
121
+ ### Delegation (if available)
122
+ - **task**: Delegate complex sub-tasks to a specialist agent with isolated context.
123
+ {EFFICIENCY_INSTRUCTIONS}
124
+ {BEST_PRACTICES_INSTRUCTIONS}
125
+ """
126
+
127
+ RESEARCH_AGENT_INSTRUCTIONS = f"""You are a research assistant. You help users find information, synthesize knowledge, and answer questions.
128
+
129
+ ## Response Style
130
+
131
+ - Be thorough in research, concise in presentation.
132
+ - Cite sources with URLs when reporting findings.
133
+ - Synthesize information - don't just list results.
134
+ {TASK_COMPLETION_INSTRUCTIONS}
135
+ ## Tools
136
+
137
+ ### Search & Fetch
138
+ - **web_search**: Search the web for information.
139
+ - **web_fetch**: Fetch and read web page contents.
140
+
141
+ ### Planning
142
+ - **think**: Work through complex questions step by step.
143
+ - **todo_write**: Track research progress on multi-part questions.
144
+
145
+ ## Research Strategy
146
+
147
+ 1. Start with broad searches to identify relevant sources
148
+ 2. Fetch multiple promising URLs in parallel (batch web_fetch calls)
149
+ 3. Synthesize findings into a coherent answer
150
+ 4. If initial searches don't answer the question, refine and search again
151
+
152
+ ## Guidelines
153
+
154
+ 1. **Be thorough**: Search multiple queries if needed - batch them.
155
+ 2. **Cite sources**: Include URLs when reporting findings.
156
+ 3. **Synthesize**: Draw conclusions, don't just list results.
157
+ 4. **Keep going**: If first searches don't work, try different queries.
158
+ 5. **Acknowledge uncertainty**: If information is unclear, say so.
159
+ """
160
+
161
+ EXPLORE_AGENT_INSTRUCTIONS = f"""You are a codebase exploration specialist. Your job is to quickly find and understand code.
162
+
163
+ ## Response Style
164
+
165
+ - Be concise. Your response goes to another agent, so be self-contained.
166
+ - Include file paths and line numbers in findings.
167
+ - Summarize what you found, don't dump raw content.
168
+ {TASK_COMPLETION_INSTRUCTIONS}
169
+ ## Tools
170
+
171
+ - **read_file**: Read file contents (read fully, don't chunk).
172
+ - **glob_files**: Find files by pattern.
173
+ - **grep**: Search file contents with regex.
174
+ - **ls**: List directory contents.
175
+ - **think**: Reason about what you're finding.
176
+ - **todo_write**: Track exploration progress for complex searches.
177
+ {EFFICIENCY_INSTRUCTIONS}
178
+ ## Guidelines
179
+
180
+ 1. **Start broad, then narrow**: Use glob/grep to find candidates, then batch-read.
181
+ 2. **Be efficient**: Don't read files you don't need.
182
+ 3. **Report clearly**: Include file paths and line numbers.
183
+ 4. **Keep searching**: If first attempt doesn't find what's needed, try different patterns.
184
+ 5. **Summarize**: Be self-contained for the calling agent.
185
+ """
186
+
187
+ # =============================================================================
188
+ # Instruction presets registry
189
+ # =============================================================================
190
+
191
+ INSTRUCTIONS = {
192
+ "coding": CODING_AGENT_INSTRUCTIONS,
193
+ "research": RESEARCH_AGENT_INSTRUCTIONS,
194
+ "explore": EXPLORE_AGENT_INSTRUCTIONS,
195
+ }
196
+
197
+
198
def get_instructions(preset: str = "coding") -> str:
    """Look up the system prompt registered under a preset name.

    Args:
        preset: Preset key; the registry defines 'coding', 'research'
            and 'explore'.

    Returns:
        The matching instruction string, or the coding-agent instructions
        when the preset is not in the registry.
    """
    try:
        return INSTRUCTIONS[preset]
    except KeyError:
        # Unknown presets fall back to the coding prompt rather than failing.
        return CODING_AGENT_INSTRUCTIONS
src/flow/harness/miniagent/messages.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Message types for MiniAgent.
2
+
3
+ Defines the core message structures used in the agent loop.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Any, Literal
8
+
9
+ Role = Literal["system", "user", "assistant", "tool"]
10
+
11
+
12
@dataclass
class ToolCall:
    """One tool invocation requested by the model."""

    # Identifier of this call.
    id: str
    # Name of the tool to run.
    name: str
    # Tool arguments, kept as a JSON-encoded string.
    arguments: str
19
+
20
+
21
+ @dataclass
22
+ class ToolResult:
23
+ """Result of executing a tool."""
24
+
25
+ call_id: str
26
+ result: str
27
+ error: str | None = None
28
+
29
+
30
@dataclass
class ChatMessage:
    """One conversation message, convertible to the OpenAI wire format.

    Covers the roles listed in ``Role`` as well as tool calling: an
    assistant message may carry ``tool_calls`` and a tool message carries
    the ``tool_call_id`` it answers.
    """

    role: Role
    content: str | None = None
    tool_calls: list[ToolCall] | None = None
    # Set on tool-role messages: the call this message responds to.
    tool_call_id: str | None = None
    # Optional name of the message author.
    name: str | None = None

    def to_openai_format(self) -> dict[str, Any]:
        """Build the OpenAI API dict for this message.

        ``role`` is always present. ``content`` is included whenever it is
        not None (an empty string is kept); ``tool_calls``,
        ``tool_call_id`` and ``name`` are included only when truthy.
        """
        payload: dict[str, Any] = {"role": self.role}

        if self.content is not None:
            payload["content"] = self.content

        if self.tool_calls:
            serialized_calls = []
            for call in self.tool_calls:
                serialized_calls.append(
                    {
                        "id": call.id,
                        "type": "function",
                        "function": {"name": call.name, "arguments": call.arguments},
                    }
                )
            payload["tool_calls"] = serialized_calls

        # Remaining optional fields share the same "only when set" rule.
        for key in ("tool_call_id", "name"):
            value = getattr(self, key)
            if value:
                payload[key] = value

        return payload

    @classmethod
    def system(cls, content: str) -> "ChatMessage":
        """Build a system-role message."""
        return cls("system", content)

    @classmethod
    def user(cls, content: str) -> "ChatMessage":
        """Build a user-role message."""
        return cls("user", content)

    @classmethod
    def assistant(
        cls, content: str | None = None, tool_calls: list[ToolCall] | None = None
    ) -> "ChatMessage":
        """Build an assistant-role message, optionally with tool calls."""
        return cls("assistant", content, tool_calls)

    @classmethod
    def tool(cls, call_id: str, content: str) -> "ChatMessage":
        """Build a tool-role message answering the call ``call_id``."""
        return cls("tool", content, tool_call_id=call_id)
src/flow/harness/miniagent/otel.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenTelemetry instrumentation for MiniAgent.
2
+
3
+ This module provides OTEL span emission that conforms to GenAI semantic
4
+ conventions, enabling Flow's metrics extraction pipeline to work with
5
+ MiniAgent traces.
6
+
7
+ Reference: https://opentelemetry.io/docs/specs/semconv/gen-ai/
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import TYPE_CHECKING
13
+
14
+ from opentelemetry import trace
15
+
16
+ if TYPE_CHECKING:
17
+ from .hooks import (
18
+ Hooks,
19
+ PreModelCallEvent,
20
+ PostModelCallEvent,
21
+ PreToolUseEvent,
22
+ PreToolUseResult,
23
+ PostToolUseEvent,
24
+ PostToolUseResult,
25
+ )
26
+
27
+ __all__ = ["GenAIAttr", "create_otel_hooks", "enable_instrumentation"]
28
+
29
+ # Track if instrumentation has been enabled
30
+ _instrumentation_enabled = False
31
+
32
+
33
class GenAIAttr:
    """Attribute keys from the OpenTelemetry GenAI semantic conventions.

    Kept identical to the keys used by the MAF/LangGraph harnesses so that
    traces from every harness look the same.
    Reference: https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/
    """

    # --- operation ---
    OPERATION_NAME = "gen_ai.operation.name"
    PROVIDER_NAME = "gen_ai.provider.name"

    # --- model ---
    REQUEST_MODEL = "gen_ai.request.model"
    RESPONSE_MODEL = "gen_ai.response.model"

    # --- token usage ---
    INPUT_TOKENS = "gen_ai.usage.input_tokens"
    OUTPUT_TOKENS = "gen_ai.usage.output_tokens"

    # --- tool execution ---
    TOOL_NAME = "gen_ai.tool.name"
    TOOL_TYPE = "gen_ai.tool.type"
    TOOL_CALL_ID = "gen_ai.tool.call.id"

    # --- errors ---
    ERROR_TYPE = "error.type"
59
+
60
+
61
def _get_tracer() -> trace.Tracer:
    """Fetch the MiniAgent tracer at call time.

    The tracer is looked up on every call instead of being cached at
    import, because the TracerProvider may be installed after this module
    is imported (e.g., by Flow's experiment runner).
    """
    tracer_name, tracer_version = "flow.miniagent", "0.1.0"
    return trace.get_tracer(tracer_name, tracer_version)
68
+
69
+
70
def start_llm_span(model: str) -> trace.Span:
    """Open a CLIENT span named ``chat {model}`` for one LLM call.

    Args:
        model: The model name being called

    Returns:
        The started span; the caller is responsible for ending it.
    """
    llm_span = _get_tracer().start_span(f"chat {model}", kind=trace.SpanKind.CLIENT)
    initial_attributes = (
        (GenAIAttr.OPERATION_NAME, "chat"),
        (GenAIAttr.REQUEST_MODEL, model),
        (GenAIAttr.PROVIDER_NAME, "openai"),
    )
    for key, value in initial_attributes:
        llm_span.set_attribute(key, value)
    return llm_span
84
+
85
+
86
def end_llm_span(span: trace.Span, input_tokens: int, output_tokens: int) -> None:
    """Record token usage on an LLM span and close it.

    Args:
        span: The span to end
        input_tokens: Number of input tokens used
        output_tokens: Number of output tokens generated
    """
    token_usage = (
        (GenAIAttr.INPUT_TOKENS, input_tokens),
        (GenAIAttr.OUTPUT_TOKENS, output_tokens),
    )
    for key, count in token_usage:
        span.set_attribute(key, count)
    span.end()
97
+
98
+
99
def start_tool_span(tool_name: str, call_id: str = "") -> trace.Span:
    """Open an INTERNAL span named ``execute_tool {tool_name}``.

    Args:
        tool_name: Name of the tool being called
        call_id: Optional tool call ID; recorded only when non-empty

    Returns:
        The started span; the caller is responsible for ending it.
    """
    tool_span = _get_tracer().start_span(
        f"execute_tool {tool_name}", kind=trace.SpanKind.INTERNAL
    )
    initial_attributes = [
        (GenAIAttr.OPERATION_NAME, "execute_tool"),
        (GenAIAttr.TOOL_NAME, tool_name),
        (GenAIAttr.TOOL_TYPE, "function"),
    ]
    if call_id:
        initial_attributes.append((GenAIAttr.TOOL_CALL_ID, call_id))
    for key, value in initial_attributes:
        tool_span.set_attribute(key, value)
    return tool_span
116
+
117
+
118
def end_tool_span(span: trace.Span, error: Exception | None = None) -> None:
    """Close a tool span, marking it as failed when an error is given.

    Args:
        span: The span to end
        error: Optional exception if the tool failed
    """
    if not error:
        span.end()
        return

    # Failure path: record error type, exception and ERROR status, then close.
    span.set_attribute(GenAIAttr.ERROR_TYPE, type(error).__name__)
    span.record_exception(error)
    span.set_status(trace.StatusCode.ERROR, str(error))
    span.end()
130
+
131
+
132
class OTelHooks:
    """Hook callbacks that turn MiniAgent events into OTEL spans.

    A span is opened in each ``pre_*`` callback and closed in the matching
    ``post_*`` callback. Open LLM spans are tracked by iteration number and
    open tool spans by tool call ID.

    Usage:
        otel = OTelHooks(model="gpt-4o")
        hooks = Hooks(
            pre_model_call=[otel.on_pre_model_call],
            post_model_call=[otel.on_post_model_call],
            pre_tool_use=[otel.on_pre_tool_use],
            post_tool_use=[otel.on_post_tool_use],
        )
        agent = ChatAgent(..., hooks=hooks)
    """

    def __init__(self, model: str = "gpt-4o"):
        """Set up empty span tables.

        Args:
            model: Default model name for spans
        """
        self.model = model
        # Open LLM spans, keyed by iteration number.
        self._llm_spans: dict[int, trace.Span] = {}
        # Open tool spans, keyed by tool call ID.
        self._tool_spans: dict[str, trace.Span] = {}

    async def on_pre_model_call(self, event: "PreModelCallEvent") -> None:
        """Open an LLM span for the upcoming model call.

        Args:
            event: Pre-model call event with iteration info
        """
        self._llm_spans[event.iteration] = start_llm_span(model=self.model)

    async def on_post_model_call(self, event: "PostModelCallEvent") -> None:
        """Close the LLM span opened for this iteration, if there is one.

        Args:
            event: Post-model call event with usage info
        """
        llm_span = self._llm_spans.pop(event.iteration, None)
        if not llm_span:
            return
        usage = event.usage
        end_llm_span(
            llm_span,
            usage.get("input_tokens", 0),
            usage.get("output_tokens", 0),
        )

    async def on_pre_tool_use(self, event: "PreToolUseEvent") -> "PreToolUseResult | None":
        """Open a tool span for the upcoming tool execution.

        Args:
            event: Pre-tool use event with tool info

        Returns:
            None (don't block or modify)
        """
        call_id = event.tool_call_id
        self._tool_spans[call_id] = start_tool_span(event.tool_name, call_id)
        return None

    async def on_post_tool_use(self, event: "PostToolUseEvent") -> "PostToolUseResult | None":
        """Close the tool span opened for this tool call, if there is one.

        Args:
            event: Post-tool use event with result info

        Returns:
            None (don't inject context or stop)
        """
        tool_span = self._tool_spans.pop(event.tool_call_id, None)
        if not tool_span:
            return None
        # The event's error value is wrapped in an Exception for the span.
        failure = None
        if event.error:
            failure = Exception(event.error)
        end_tool_span(tool_span, failure)
        return None
208
+
209
+
210
def enable_instrumentation() -> None:
    """Mark MiniAgent OpenTelemetry instrumentation as enabled.

    Call this once before running agents. It is the MiniAgent counterpart of
    agent_framework.observability.enable_instrumentation().

    MiniAgent instrumentation is hook-based: the spans themselves come from
    OTelHooks attached to agents, so this function only sets a module-level
    flag. It is idempotent - calling it multiple times is safe.

    Example:
        from flow.harness.miniagent.otel import enable_instrumentation
        enable_instrumentation()
        # Now traces will be collected when agents run
    """
    global _instrumentation_enabled
    if not _instrumentation_enabled:
        _instrumentation_enabled = True
231
+
232
+
233
def create_otel_hooks(model: str = "gpt-4o") -> "Hooks":
    """Build a Hooks object wired to a fresh OTelHooks instance.

    This is the main entry point for adding OTEL tracing to a MiniAgent;
    the result can be passed directly to ChatAgent.

    Args:
        model: Model name to use in LLM spans

    Returns:
        Hooks with the model-call and tool-use events bound to the tracer
        callbacks; all other events keep their defaults.

    Example:
        hooks = create_otel_hooks(model="gpt-4o")
        agent = ChatAgent(..., hooks=hooks)
    """
    from .hooks import Hooks

    handlers = OTelHooks(model=model)
    callbacks = {
        "pre_model_call": [handlers.on_pre_model_call],
        "post_model_call": [handlers.on_post_model_call],
        "pre_tool_use": [handlers.on_pre_tool_use],
        "post_tool_use": [handlers.on_post_tool_use],
    }
    return Hooks(**callbacks)
src/flow/harness/miniagent/tool.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool definition and @tool decorator for MiniAgent.
2
+
3
+ Provides a simple way to define tools that can be called by the LLM.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Any, Callable, Literal, get_type_hints, get_origin, get_args, Annotated
8
+ import inspect
9
+
10
+
11
@dataclass
class Tool:
    """A callable exposed to the LLM together with its metadata.

    The name, description and JSON Schema parameters tell the model how to
    call the tool; ``func`` is the implementation that gets run.
    """

    name: str
    description: str
    parameters: dict[str, Any]  # JSON Schema
    func: Callable[..., Any]

    def to_openai_tool(self) -> dict[str, Any]:
        """Return this tool's definition in OpenAI tool format."""
        return {
            "type": "function",
            "function": {
                "name": self.name,
                "description": self.description,
                "parameters": self.parameters,
            },
        }

    async def invoke(self, **kwargs: Any) -> str:
        """Run the tool and return its result as a string.

        Supports sync functions, async functions, and sync functions that
        return any awaitable (coroutine, Future, Task, or an object with
        ``__await__``).

        Args:
            **kwargs: Arguments passed through to ``func``.

        Returns:
            The result (passed through ``str()`` unless already a string),
            or an ``"Error executing ..."`` message if the tool raised.
            Exceptions are reported as text rather than propagated.
        """
        try:
            result = self.func(**kwargs)
            # isawaitable covers Futures/Tasks and custom awaitables too,
            # not only coroutine objects.
            if inspect.isawaitable(result):
                result = await result
            return result if isinstance(result, str) else str(result)
        except Exception as e:
            return f"Error executing {self.name}: {str(e)}"
47
+
48
+
49
+ def _python_type_to_json_schema(py_type: Any) -> dict[str, Any]:
50
+ """Convert a Python type hint to JSON Schema."""
51
+ # Handle None/NoneType
52
+ if py_type is None or py_type is type(None):
53
+ return {"type": "null"}
54
+
55
+ # Handle basic types
56
+ if py_type == str:
57
+ return {"type": "string"}
58
+ if py_type == int:
59
+ return {"type": "integer"}
60
+ if py_type == float:
61
+ return {"type": "number"}
62
+ if py_type == bool:
63
+ return {"type": "boolean"}
64
+
65
+ # Handle dict without type args
66
+ if py_type is dict:
67
+ return {"type": "object"}
68
+
69
+ # Handle Optional (Union with None)
70
+ origin = get_origin(py_type)
71
+ args = get_args(py_type)
72
+
73
+ if origin is list:
74
+ if args:
75
+ return {"type": "array", "items": _python_type_to_json_schema(args[0])}
76
+ return {"type": "array"}
77
+
78
+ if origin is dict:
79
+ return {"type": "object"}
80
+
81
+ # Handle Union types (including Optional)
82
+ # In Python 3.10+, Optional[X] is Union[X, None]
83
+ if origin is type(int | str): # Union type
84
+ non_none_args = [a for a in args if a is not type(None)]
85
+ if len(non_none_args) == 1:
86
+ # This is Optional[X]
87
+ return _python_type_to_json_schema(non_none_args[0])
88
+ # Multiple types - use anyOf
89
+ return {"anyOf": [_python_type_to_json_schema(a) for a in non_none_args]}
90
+
91
+ # Handle Literal
92
+ if origin is Literal:
93
+ return {"type": "string", "enum": list(args)}
94
+
95
+ # Default to string
96
+ return {"type": "string"}
97
+
98
+
99
def tool(func: Callable[..., Any]) -> Tool:
    """Decorator to convert a function into a Tool.

    The parameter schema is built from the function signature: type hints
    give each parameter's JSON type, the first string inside an
    ``Annotated[...]`` hint gives its description, and parameters without
    a default are listed as required. ``self``/``cls`` and variadic
    ``*args``/``**kwargs`` parameters are left out of the schema because
    they cannot be expressed as named properties. The first paragraph of
    the docstring becomes the tool description.

    Usage:
        @tool
        def search(query: Annotated[str, "The search query"]) -> str:
            '''Search the web for information.'''
            return f"Results for: {query}"

    Args:
        func: The function to wrap.

    Returns:
        A Tool named after ``func`` that calls ``func`` when invoked.
    """
    signature = inspect.signature(func)

    # include_extras=True keeps Annotated metadata on the hints.
    try:
        hints = get_type_hints(func, include_extras=True)
    except Exception:
        # Best effort: if the hints cannot be resolved, every parameter is
        # described as a string instead of failing the decoration.
        hints = {}

    variadic_kinds = (
        inspect.Parameter.VAR_POSITIONAL,
        inspect.Parameter.VAR_KEYWORD,
    )
    properties: dict[str, Any] = {}
    required: list[str] = []

    for param_name, param in signature.parameters.items():
        if param_name in ("self", "cls") or param.kind in variadic_kinds:
            continue

        hint = hints.get(param_name, str)
        param_description = ""

        if get_origin(hint) is Annotated:
            actual_type, *metadata = get_args(hint)
            # The first string annotation is the parameter description.
            param_description = next(
                (item for item in metadata if isinstance(item, str)), ""
            )
        else:
            actual_type = hint

        prop_schema = _python_type_to_json_schema(actual_type)
        if param_description:
            prop_schema["description"] = param_description
        properties[param_name] = prop_schema

        # No default value means the model must supply the argument.
        if param.default is inspect.Parameter.empty:
            required.append(param_name)

    parameters_schema: dict[str, Any] = {
        "type": "object",
        "properties": properties,
    }
    if required:
        parameters_schema["required"] = required

    # Use the first paragraph of the docstring; a missing or blank
    # docstring falls back to a generic description.
    docstring = (func.__doc__ or "").strip()
    if docstring:
        description = docstring.split("\n\n")[0].strip()
    else:
        description = f"Call the {func.__name__} function"

    return Tool(
        name=func.__name__,
        description=description,
        parameters=parameters_schema,
        func=func,
    )
src/flow/harness/miniagent/tools/__init__.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Built-in tool library for MiniAgent.
2
+
3
+ This module re-exports tools from the shared flow.tools module
4
+ for backward compatibility with existing MiniAgent code.
5
+
6
+ All tools are now implemented in flow.tools and shared across
7
+ all harnesses (MiniAgent, MAF, etc.).
8
+
9
+ Tool Categories:
10
+ - File Operations: read_file, write_file, edit_file, multi_edit, glob_files, grep, ls
11
+ - Notebooks: notebook_edit, notebook_read
12
+ - Execution: bash
13
+ - Planning: think, todo_write, todo_read
14
+ - Memory: memory (agentic memory for persistence)
15
+ - Web: web_search, web_fetch
16
+ - Sub-agents: task (for context isolation)
17
+
18
+ Presets:
19
+ - coding_tools(): Core tools for coding tasks
20
+ - research_tools(): Tools for research and exploration
21
+ - all_tools(): Everything
22
+
23
+ Example:
24
+ from flow.harness.miniagent.tools import coding_tools, task
25
+
26
+ agent = ChatAgent(
27
+ instructions="You are a helpful coding assistant.",
28
+ tools=coding_tools() + [task],
29
+ )
30
+ """
31
+
32
+ # Re-export everything from shared tools
33
+ from flow.tools import (
34
+ # Base
35
+ Tool,
36
+ # File operations
37
+ read_file,
38
+ write_file,
39
+ edit_file,
40
+ multi_edit,
41
+ glob_files,
42
+ grep,
43
+ ls,
44
+ # Notebook operations
45
+ notebook_edit,
46
+ notebook_read,
47
+ # Execution
48
+ bash,
49
+ check_processes,
50
+ python_repl,
51
+ # Planning and reasoning
52
+ think,
53
+ todo_write,
54
+ todo_read,
55
+ # Web operations
56
+ web_search,
57
+ web_fetch,
58
+ # Memory
59
+ memory,
60
+ create_memory_tool,
61
+ # Skills
62
+ skills,
63
+ create_skills_tool,
64
+ # Sub-agent
65
+ task,
66
+ create_task_tool,
67
+ # Presets
68
+ coding_tools,
69
+ planning_tools,
70
+ web_tools as research_tools,
71
+ notebook_tools,
72
+ all_tools,
73
+ )
74
+
75
+ # Compatibility: reset_todos from planning module
76
+ from flow.tools.planning import reset_todos, get_todos
77
+
78
+ # Compatibility: reset_memory from memory module
79
+ from flow.tools.memory import reset_memory
80
+
81
+
82
+ __all__ = [
83
+ # Base
84
+ "Tool",
85
+ # Presets
86
+ "coding_tools",
87
+ "research_tools",
88
+ "notebook_tools",
89
+ "planning_tools",
90
+ "all_tools",
91
+ # File operations
92
+ "read_file",
93
+ "write_file",
94
+ "edit_file",
95
+ "multi_edit",
96
+ "glob_files",
97
+ "grep",
98
+ "ls",
99
+ # Notebook operations
100
+ "notebook_edit",
101
+ "notebook_read",
102
+ # Execution
103
+ "bash",
104
+ "check_processes",
105
+ "python_repl",
106
+ # Planning
107
+ "think",
108
+ "todo_write",
109
+ "todo_read",
110
+ "reset_todos",
111
+ "get_todos",
112
+ # Memory
113
+ "memory",
114
+ "create_memory_tool",
115
+ "reset_memory",
116
+ # Web
117
+ "web_search",
118
+ "web_fetch",
119
+ # Sub-agent
120
+ "task",
121
+ "create_task_tool",
122
+ # Skills
123
+ "skills",
124
+ "create_skills_tool",
125
+ ]
src/flow/harness/miniagent/workspace.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Workspace management for MiniAgent.
2
+
3
+ Provides a simple convention for where agent-managed data lives:
4
+ - Working directory is the workspace (or explicitly set)
5
+ - Agent data goes in `{workspace}/.miniagent/`
6
+ - No restrictions on file access - agent can read/write anywhere
7
+
8
+ Structure:
9
+ {workspace}/
10
+ ├── .miniagent/
11
+ │ ├── todos.json # Persisted task list
12
+ │ ├── memory/ # Memory entries
13
+ │ │ ├── {id}.json
14
+ │ │ └── ...
15
+ │ └── config.json # Optional agent config
16
+ └── ... (rest of project)
17
+
18
+ Usage:
19
+ from flow.harness.miniagent.workspace import Workspace
20
+
21
+ # Use current directory
22
+ ws = Workspace()
23
+
24
+ # Or explicit path
25
+ ws = Workspace("/path/to/project")
26
+
27
+ # Get paths
28
+ ws.root # /path/to/project
29
+ ws.data_dir # /path/to/project/.miniagent
30
+ ws.todos_file # /path/to/project/.miniagent/todos.json
31
+ ws.memory_dir # /path/to/project/.miniagent/memory
32
+ """
33
+
34
+ import json
35
+ from pathlib import Path
36
+ from typing import Any
37
+
38
+
39
+ class Workspace:
40
+ """Manages workspace paths and agent data storage.
41
+
42
+ The workspace is where the agent operates. Agent-managed data
43
+ (todos, memories, etc.) is stored in a `.miniagent/` subdirectory.
44
+ """
45
+
46
+ def __init__(self, root: str | Path | None = None):
47
+ """Initialize workspace.
48
+
49
+ Args:
50
+ root: Workspace root directory. Defaults to current working directory.
51
+ """
52
+ if root is None:
53
+ root = Path.cwd()
54
+ self._root = Path(root).resolve()
55
+
56
+ @property
57
+ def root(self) -> Path:
58
+ """Workspace root directory."""
59
+ return self._root
60
+
61
+ @property
62
+ def data_dir(self) -> Path:
63
+ """Agent data directory (.miniagent/)."""
64
+ return self._root / ".miniagent"
65
+
66
+ @property
67
+ def todos_file(self) -> Path:
68
+ """Path to todos.json."""
69
+ return self.data_dir / "todos.json"
70
+
71
+ @property
72
+ def memory_dir(self) -> Path:
73
+ """Path to memory/ directory."""
74
+ return self.data_dir / "memory"
75
+
76
+ @property
77
+ def config_file(self) -> Path:
78
+ """Path to config.json."""
79
+ return self.data_dir / "config.json"
80
+
81
+ def ensure_data_dir(self) -> Path:
82
+ """Create data directory if it doesn't exist."""
83
+ self.data_dir.mkdir(parents=True, exist_ok=True)
84
+ return self.data_dir
85
+
86
+ def ensure_memory_dir(self) -> Path:
87
+ """Create memory directory if it doesn't exist."""
88
+ self.memory_dir.mkdir(parents=True, exist_ok=True)
89
+ return self.memory_dir
90
+
91
+ # --- Todos ---
92
+
93
+ def load_todos(self) -> list[dict[str, Any]]:
94
+ """Load todos from workspace."""
95
+ if not self.todos_file.exists():
96
+ return []
97
+ try:
98
+ with open(self.todos_file) as f:
99
+ return json.load(f) # type: ignore[no-any-return]
100
+ except (json.JSONDecodeError, IOError):
101
+ return []
102
+
103
+ def save_todos(self, todos: list[dict[str, Any]]) -> None:
104
+ """Save todos to workspace."""
105
+ self.ensure_data_dir()
106
+ with open(self.todos_file, "w") as f:
107
+ json.dump(todos, f, indent=2)
108
+
109
+ # --- Memory ---
110
+
111
+ def list_memories(self) -> list[dict[str, Any]]:
112
+ """List all memory entries."""
113
+ if not self.memory_dir.exists():
114
+ return []
115
+
116
+ memories: list[dict[str, Any]] = []
117
+ for filepath in self.memory_dir.glob("*.json"):
118
+ try:
119
+ with open(filepath) as f:
120
+ memories.append(json.load(f))
121
+ except (json.JSONDecodeError, IOError):
122
+ continue
123
+ return memories
124
+
125
+ def load_memory(self, memory_id: str) -> dict[str, Any] | None:
126
+ """Load a specific memory entry."""
127
+ filepath = self.memory_dir / f"{memory_id}.json"
128
+ if not filepath.exists():
129
+ return None
130
+ try:
131
+ with open(filepath) as f:
132
+ return json.load(f) # type: ignore[no-any-return]
133
+ except (json.JSONDecodeError, IOError):
134
+ return None
135
+
136
+ def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
137
+ """Save a memory entry."""
138
+ self.ensure_memory_dir()
139
+ filepath = self.memory_dir / f"{memory_id}.json"
140
+ with open(filepath, "w") as f:
141
+ json.dump(data, f, indent=2, default=str)
142
+
143
+ def delete_memory(self, memory_id: str) -> bool:
144
+ """Delete a memory entry. Returns True if deleted."""
145
+ filepath = self.memory_dir / f"{memory_id}.json"
146
+ if filepath.exists():
147
+ filepath.unlink()
148
+ return True
149
+ return False
150
+
151
+ # --- Config ---
152
+
153
+ def load_config(self) -> dict[str, Any]:
154
+ """Load workspace config."""
155
+ if not self.config_file.exists():
156
+ return {}
157
+ try:
158
+ with open(self.config_file) as f:
159
+ return json.load(f)
160
+ except (json.JSONDecodeError, IOError):
161
+ return {}
162
+
163
+ def save_config(self, config: dict[str, Any]) -> None:
164
+ """Save workspace config."""
165
+ self.ensure_data_dir()
166
+ with open(self.config_file, "w") as f:
167
+ json.dump(config, f, indent=2)
168
+
169
+ def __repr__(self) -> str:
170
+ return f"Workspace({self._root})"
171
+
172
+
173
+ # Default workspace (current directory)
174
+ _default_workspace: Workspace | None = None
175
+
176
+
177
def get_workspace() -> Workspace:
    """Return the process-wide default workspace.

    On first use (or after a reset) a Workspace with no explicit root is
    created and remembered.
    """
    global _default_workspace
    if _default_workspace is not None:
        return _default_workspace
    _default_workspace = Workspace()
    return _default_workspace
183
+
184
+
185
def set_workspace(workspace: Workspace | str | Path) -> Workspace:
    """Replace the default workspace and return it.

    A Workspace instance is stored as given; any other value is used as
    the root of a new Workspace.
    """
    global _default_workspace
    _default_workspace = (
        workspace if isinstance(workspace, Workspace) else Workspace(workspace)
    )
    return _default_workspace
193
+
194
+
195
def reset_workspace() -> None:
    """Reset default workspace (for testing).

    Clears the module-level default so that the next get_workspace() call
    creates a new Workspace.
    """
    global _default_workspace
    _default_workspace = None
src/flow/harness/registry.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Harness registry for multi-framework support.
2
+
3
+ This module provides a simple registry pattern for harness implementations,
4
+ allowing Flow to support multiple agent frameworks (MAF, LangGraph, Claude SDK).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from flow.experiments.models import Agent
14
+ from flow.harness.base import BaseHarness
15
+ from flow.llm import LLMClientConfig
16
+
17
# Framework name -> harness class; filled in through register().
_HARNESSES: dict[str, type["BaseHarness"]] = {}


def register(name: str, harness_class: type["BaseHarness"]) -> None:
    """Associate a harness class with a framework name.

    An existing entry under the same name is replaced.

    Args:
        name: Framework name (e.g., "maf", "langgraph", "claude")
        harness_class: The harness class to store under ``name``
    """
    _HARNESSES.update({name: harness_class})
28
+
29
+
30
def get_harness_class(name: str) -> type["BaseHarness"]:
    """Look up the harness class registered for a framework.

    Args:
        name: Framework name

    Returns:
        The class stored under ``name`` in the registry

    Raises:
        ValueError: If no harness is registered under ``name``; the message
            lists the names that are registered
    """
    if name in _HARNESSES:
        return _HARNESSES[name]
    available = list(_HARNESSES)
    raise ValueError(f"Unknown framework: {name}. Available: {available}")
46
+
47
+
48
def create_harness(
    agent: "Agent",
    workspace: Path,
    llm_config: "LLMClientConfig | None" = None,
) -> "BaseHarness":
    """Build a harness for the framework named by ``agent.framework``.

    The harness class is resolved through ``get_harness_class`` and then
    constructed via its ``from_agent()`` classmethod.

    Args:
        agent: The Agent spec defining the configuration
        workspace: Working directory for the agent
        llm_config: Optional LLM configuration, forwarded to ``from_agent``
            as a keyword argument

    Returns:
        Whatever ``from_agent()`` of the resolved harness class returns

    Raises:
        ValueError: If agent.framework is not registered
    """
    framework_harness = get_harness_class(agent.framework)
    harness = framework_harness.from_agent(agent, workspace, llm_config=llm_config)
    return harness
72
+
73
+
74
def available_frameworks() -> list[str]:
    """List the framework names currently present in the registry.

    Returns:
        A new list of registered names, in registration order
    """
    return [*_HARNESSES]
src/flow/llm/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """LLM client configuration and factory.
4
+
5
+ This package provides a unified way to configure and create LLM clients
6
+ for different providers and frameworks.
7
+
8
+ Example:
9
+ from flow.llm import LLMClientConfig, LLMProvider, LLMClientFactory
10
+ from flow.llm.config import AzureOpenAIConfig
11
+
12
+ # Create config
13
+ config = LLMClientConfig(
14
+ provider=LLMProvider.AZURE_OPENAI,
15
+ name="My Azure GPT-4o",
16
+ azure_openai=AzureOpenAIConfig(deployment="gpt-4o"),
17
+ )
18
+
19
+ # Create client for MAF
20
+ client = LLMClientFactory.create_maf_client(config)
21
+
22
+ # Create client for LangGraph
23
+ llm = LLMClientFactory.create_langgraph_client(config)
24
+ """
25
+
26
+ from .config import (
27
+ AnthropicConfig,
28
+ AzureOpenAIConfig,
29
+ CustomConfig,
30
+ LLMClientConfig,
31
+ LLMProvider,
32
+ OllamaConfig,
33
+ OpenAIConfig,
34
+ )
35
+ from .factory import LLMClientFactory
36
+
37
# Names re-exported from .config and .factory as the package's public API.
__all__ = [
    # Provider enum
    "LLMProvider",
    # Top-level config followed by the per-provider config models
    "LLMClientConfig",
    "OpenAIConfig",
    "AzureOpenAIConfig",
    "AnthropicConfig",
    "OllamaConfig",
    "CustomConfig",
    # Client factory
    "LLMClientFactory",
]
src/flow/llm/config.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """LLM client configuration models.
4
+
5
+ This module defines provider-agnostic configuration for LLM clients.
6
+ Secrets are stored as environment variable references, not actual values.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from enum import Enum
13
+ from typing import Any
14
+
15
+ from pydantic import BaseModel, Field, model_validator
16
+
17
+
18
class LLMProvider(str, Enum):
    """Closed set of LLM provider identifiers.

    Members are also ``str`` instances, so each compares equal to its
    lowercase string value.
    """

    OPENAI = "openai"
    AZURE_OPENAI = "azure_openai"
    ANTHROPIC = "anthropic"
    OLLAMA = "ollama"
    # Any endpoint that speaks the OpenAI-compatible API.
    CUSTOM = "custom"
26
+
27
+
28
class OpenAIConfig(BaseModel):
    """Settings for talking to the OpenAI API.

    The API key itself is not stored; only the name of the environment
    variable that holds it.
    """

    api_key_env_var: str = Field(
        default="OPENAI_API_KEY",
        description="Environment variable name containing the API key",
    )
    model_id: str = Field(
        default="gpt-4o",
        description="Model ID to use (e.g., gpt-4o, gpt-4-turbo)",
    )
    base_url: str | None = Field(
        default=None,
        description="Optional base URL for API (for proxies)",
    )

    def get_api_key(self) -> str:
        """Read the API key from the variable named by ``api_key_env_var``.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        api_key = os.getenv(self.api_key_env_var)
        if api_key:
            return api_key
        raise ValueError(f"Environment variable {self.api_key_env_var} is not set")
50
+
51
+
52
class AzureOpenAIConfig(BaseModel):
    """Settings for talking to the Azure OpenAI API.

    Endpoint and API key are referenced by environment variable name;
    ``deployment`` is the only field without a default.
    """

    endpoint_env_var: str = Field(
        default="AZURE_OPENAI_ENDPOINT",
        description="Environment variable name containing the endpoint URL",
    )
    api_key_env_var: str = Field(
        default="AZURE_OPENAI_API_KEY",
        description="Environment variable name containing the API key",
    )
    deployment: str = Field(
        description="Azure OpenAI deployment name",
    )
    api_version: str = Field(
        default="2024-02-15-preview",
        description="Azure OpenAI API version",
    )

    def get_endpoint(self) -> str:
        """Read the endpoint URL from the variable named by ``endpoint_env_var``.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        endpoint = os.getenv(self.endpoint_env_var)
        if endpoint:
            return endpoint
        raise ValueError(f"Environment variable {self.endpoint_env_var} is not set")

    def get_api_key(self) -> str:
        """Read the API key from the variable named by ``api_key_env_var``.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        api_key = os.getenv(self.api_key_env_var)
        if api_key:
            return api_key
        raise ValueError(f"Environment variable {self.api_key_env_var} is not set")
84
+
85
+
86
class AnthropicConfig(BaseModel):
    """Settings for talking to the Anthropic API.

    The API key is referenced by environment variable name rather than
    stored directly.
    """

    api_key_env_var: str = Field(
        default="ANTHROPIC_API_KEY",
        description="Environment variable name containing the API key",
    )
    model_id: str = Field(
        default="claude-3-5-sonnet-20241022",
        description="Model ID to use",
    )

    def get_api_key(self) -> str:
        """Read the API key from the variable named by ``api_key_env_var``.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        api_key = os.getenv(self.api_key_env_var)
        if api_key:
            return api_key
        raise ValueError(f"Environment variable {self.api_key_env_var} is not set")
104
+
105
+
106
class OllamaConfig(BaseModel):
    """Settings for an Ollama server (local models).

    Both fields have defaults and no API key is involved.
    """

    host: str = Field(
        default="http://localhost:11434",
        description="Ollama server URL",
    )
    model_id: str = Field(
        default="llama3.2",
        description="Model ID to use",
    )
117
+
118
+
119
class CustomConfig(BaseModel):
    """Settings for a custom OpenAI-compatible endpoint.

    ``base_url`` and ``model_id`` have no defaults and must be supplied; the
    API key is referenced by environment variable name.
    """

    base_url: str = Field(
        description="Base URL for the API",
    )
    api_key_env_var: str = Field(
        default="CUSTOM_API_KEY",
        description="Environment variable name containing the API key",
    )
    model_id: str = Field(
        description="Model ID to use",
    )

    def get_api_key(self) -> str:
        """Read the API key from the variable named by ``api_key_env_var``.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        api_key = os.getenv(self.api_key_env_var)
        if api_key:
            return api_key
        raise ValueError(f"Environment variable {self.api_key_env_var} is not set")
139
+
140
+
141
class LLMClientConfig(BaseModel):
    """Provider-tagged LLM client configuration.

    ``provider`` selects which of the optional provider sub-configs is in
    effect. Validation requires the sub-config matching ``provider`` to be
    present; it does not reject additional sub-configs, although only one is
    meant to be set.

    Example:
        # Azure OpenAI
        config = LLMClientConfig(
            provider=LLMProvider.AZURE_OPENAI,
            name="My Azure GPT-4o",
            azure_openai=AzureOpenAIConfig(deployment="gpt-4o"),
        )

        # OpenAI
        config = LLMClientConfig(
            provider=LLMProvider.OPENAI,
            name="OpenAI GPT-4o",
            openai=OpenAIConfig(model_id="gpt-4o"),
        )
    """

    id: str | None = Field(default=None, description="Unique identifier (set when stored in DB)")
    provider: LLMProvider = Field(description="The LLM provider type")
    name: str = Field(description="User-friendly name for this configuration")
    is_default: bool = Field(default=False, description="Whether this is the default configuration")

    # One optional slot per provider; the slot matching `provider` must be set.
    openai: OpenAIConfig | None = None
    azure_openai: AzureOpenAIConfig | None = None
    anthropic: AnthropicConfig | None = None
    ollama: OllamaConfig | None = None
    custom: CustomConfig | None = None

    @model_validator(mode="after")
    def validate_provider_config(self) -> "LLMClientConfig":
        """Reject a config whose ``provider`` has no matching sub-config.

        Raises:
            ValueError: If the sub-config for ``provider`` is None.
        """
        selected = {
            LLMProvider.OPENAI: self.openai,
            LLMProvider.AZURE_OPENAI: self.azure_openai,
            LLMProvider.ANTHROPIC: self.anthropic,
            LLMProvider.OLLAMA: self.ollama,
            LLMProvider.CUSTOM: self.custom,
        }.get(self.provider)

        if selected is not None:
            return self
        raise ValueError(
            f"Provider {self.provider.value} requires {self.provider.value} config to be set"
        )

    def get_model_id(self) -> str:
        """Return the model ID of the active provider, for display.

        Azure OpenAI reports its deployment name instead of a model ID. An
        empty string is returned when the active sub-config is missing or the
        provider is not one of the known members.
        """
        provider = self.provider
        if provider == LLMProvider.AZURE_OPENAI:
            # Azure is the one provider identified by deployment, not model_id.
            return self.azure_openai.deployment if self.azure_openai else ""

        if provider == LLMProvider.OPENAI:
            section = self.openai
        elif provider == LLMProvider.ANTHROPIC:
            section = self.anthropic
        elif provider == LLMProvider.OLLAMA:
            section = self.ollama
        elif provider == LLMProvider.CUSTOM:
            section = self.custom
        else:
            return ""
        return section.model_id if section else ""

    def to_dict(self) -> dict[str, Any]:
        """Dump the model to a dict, leaving out fields whose value is None."""
        payload = self.model_dump(exclude_none=True)
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "LLMClientConfig":
        """Build and validate an instance from a plain dictionary."""
        instance = cls.model_validate(data)
        return instance