victordibia committed on
Commit
708a48b
·
1 Parent(s): f4dca43

Deploy 2026-02-23 09:17:49

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full set.
Files changed (50) hide show
  1. src/flow/__init__.py +8 -0
  2. src/flow/cli/app.py +4 -0
  3. src/flow/cli/deploy.py +113 -0
  4. src/flow/cli/evaluate.py +279 -0
  5. src/flow/cli/hf_import.py +2 -1
  6. src/flow/cli/optimize.py +221 -89
  7. src/flow/experiments/agent_api.py +22 -9
  8. src/flow/experiments/data/tasks/house_rules.jsonl +3 -0
  9. src/flow/experiments/eval_cache.py +223 -0
  10. src/flow/experiments/evaluators/llm.py +8 -2
  11. src/flow/experiments/gaia_converter.py +13 -18
  12. src/flow/experiments/hf_datasets.py +69 -45
  13. src/flow/experiments/models.py +108 -18
  14. src/flow/experiments/optimizer.py +160 -35
  15. src/flow/experiments/results.py +21 -0
  16. src/flow/experiments/runner.py +1 -1
  17. src/flow/experiments/strategies/__init__.py +25 -11
  18. src/flow/experiments/strategies/gepa_instruction.py +415 -0
  19. src/flow/experiments/strategies/{llm_rewriter.py → instruction.py} +118 -76
  20. src/flow/experiments/strategies/skill.py +692 -0
  21. src/flow/experiments/strategies/{tool_selector.py → tool.py} +16 -22
  22. src/flow/harness/compaction/strategies.py +4 -1
  23. src/flow/harness/maf/agent.py +16 -2
  24. src/flow/harness/maf/tools/__init__.py +9 -1
  25. src/flow/harness/miniagent/harness.py +32 -1
  26. src/flow/harness/miniagent/tool.py +4 -1
  27. src/flow/prompts.py +23 -0
  28. src/flow/tools/__init__.py +4 -4
  29. src/flow/tools/adapters.py +1 -7
  30. src/flow/tools/base.py +4 -1
  31. src/flow/tools/browsing.py +13 -3
  32. src/flow/tools/coding.py +47 -12
  33. src/flow/tools/execution.py +17 -1
  34. src/flow/tools/memory.py +17 -13
  35. src/flow/tools/notebook.py +21 -1
  36. src/flow/tools/planning.py +13 -5
  37. src/flow/tools/skills.py +71 -5
  38. src/flow/tools/subagent.py +11 -1
  39. src/flow/tools/text_inspector_qa.py +4 -1
  40. src/flow/tools/web.py +15 -1
  41. src/flow/tools/workspace.py +18 -5
  42. src/flow/ui/api/__init__.py +2 -0
  43. src/flow/ui/api/deployments.py +145 -0
  44. src/flow/ui/api/experiment.py +0 -1
  45. src/flow/ui/api/jobs.py +40 -7
  46. src/flow/ui/api/schema.py +9 -5
  47. src/flow/ui/api/tests.py +1 -35
  48. src/flow/ui/auth/__init__.py +1 -2
  49. src/flow/ui/auth/config.py +0 -4
  50. src/flow/ui/auth/middleware.py +0 -42
src/flow/__init__.py CHANGED
@@ -15,6 +15,14 @@ Usage:
15
  harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
16
  """
17
 
 
 
 
 
 
 
 
 
18
  from flow.harness.maf import MAFHarness, create_agent
19
 
20
  __version__ = "0.1.0"
 
15
  harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
16
  """
17
 
18
+ import sys
19
+
20
+ from loguru import logger
21
+
22
+ # Default to INFO — suppress DEBUG noise from tools/workspace/etc.
23
+ logger.remove()
24
+ logger.add(sys.stderr, level="INFO")
25
+
26
  from flow.harness.maf import MAFHarness, create_agent
27
 
28
  __version__ = "0.1.0"
src/flow/cli/app.py CHANGED
@@ -176,9 +176,13 @@ async def _run_single_task(
176
 
177
 
178
  # Import and register commands
 
 
179
  from flow.cli.hf_import import hf_import as hf_import_cmd
180
  from flow.cli.optimize import optimize as optimize_cmd
181
 
 
 
182
  app.command()(optimize_cmd)
183
  app.command(name="hf-import")(hf_import_cmd)
184
 
 
176
 
177
 
178
  # Import and register commands
179
+ from flow.cli.deploy import deploy as deploy_cmd
180
+ from flow.cli.evaluate import evaluate as evaluate_cmd
181
  from flow.cli.hf_import import hf_import as hf_import_cmd
182
  from flow.cli.optimize import optimize as optimize_cmd
183
 
184
+ app.command()(deploy_cmd)
185
+ app.command()(evaluate_cmd)
186
  app.command()(optimize_cmd)
187
  app.command(name="hf-import")(hf_import_cmd)
188
 
src/flow/cli/deploy.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Deploy command for persisting agent configs to the database."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ from pathlib import Path
9
+ from typing import Annotated
10
+
11
+ import typer
12
+ from rich.console import Console
13
+
14
+ from flow.experiments.models import load_agent
15
+
16
+ console = Console()
17
+
18
+
19
+ def deploy(
20
+ agent: Annotated[
21
+ Path,
22
+ typer.Option(
23
+ "--agent", "-a",
24
+ help="Path to agent YAML config file",
25
+ ),
26
+ ],
27
+ name: Annotated[
28
+ str | None,
29
+ typer.Option(
30
+ "--name", "-n",
31
+ help="Deployment name (defaults to agent name from YAML)",
32
+ ),
33
+ ] = None,
34
+ deployment_id: Annotated[
35
+ str | None,
36
+ typer.Option(
37
+ "--deployment-id", "-d",
38
+ help="Add version to existing deployment (UUID)",
39
+ ),
40
+ ] = None,
41
+ description: Annotated[
42
+ str,
43
+ typer.Option(
44
+ "--description",
45
+ help="Version description",
46
+ ),
47
+ ] = "",
48
+ ) -> None:
49
+ """Deploy an agent config to the FAOS database.
50
+
51
+ Creates a versioned deployment that can be tracked, evaluated,
52
+ and compared in the dashboard.
53
+
54
+ First deploy creates a new deployment (v1). Subsequent deploys
55
+ with --deployment-id add versions to the same deployment.
56
+
57
+ Examples:
58
+ # Deploy a new agent
59
+ flow deploy --agent agent_config.yaml
60
+
61
+ # Deploy with custom name
62
+ flow deploy --agent agent_config.yaml --name "trip-planner-v2"
63
+
64
+ # Add version to existing deployment
65
+ flow deploy --agent optimized.yaml --deployment-id <uuid>
66
+
67
+ # Deploy best config from optimization
68
+ flow deploy --agent ~/.flow/optimizations/<ts>/agents/best_score.yaml
69
+ """
70
+ asyncio.run(_run_deploy(
71
+ agent_path=agent,
72
+ name=name,
73
+ deployment_id=deployment_id,
74
+ description=description,
75
+ ))
76
+
77
+
78
+ async def _run_deploy(
79
+ agent_path: Path,
80
+ name: str | None,
81
+ deployment_id: str | None,
82
+ description: str,
83
+ ) -> None:
84
+ """Run deployment."""
85
+ if not agent_path.exists():
86
+ console.print(f"[red]Error:[/] Agent file not found: {agent_path}")
87
+ raise typer.Exit(1)
88
+
89
+ agent_config = load_agent(agent_path)
90
+ if name:
91
+ agent_config.name = name
92
+
93
+ try:
94
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
95
+
96
+ adapter = PersistenceAdapter()
97
+ result = await adapter.deploy_agent(
98
+ agent_config,
99
+ deployment_id=deployment_id,
100
+ source="deploy",
101
+ version_description=description,
102
+ )
103
+ except ImportError:
104
+ console.print("[red]Error:[/] Database dependencies not available.")
105
+ console.print("[dim]Make sure flow is installed with UI support.[/]")
106
+ raise typer.Exit(1)
107
+
108
+ console.print("\n[bold green]Deployed![/]\n")
109
+ console.print(f" Agent: [cyan]{agent_config.name}[/]")
110
+ console.print(f" Deployment ID: [cyan]{result.deployment_id}[/]")
111
+ console.print(f" Config ID: [cyan]{result.config_id}[/]")
112
+ console.print(f" Version: [cyan]{result.version}[/]")
113
+ console.print(f"\n[dim]View in dashboard:[/] http://localhost:8091/deployments/{result.deployment_id}")
src/flow/cli/evaluate.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Evaluate command for measuring agent performance on tasks."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Annotated
11
+
12
+ import typer
13
+ from rich.console import Console
14
+ from rich.table import Table
15
+
16
+ from flow.experiments.models import Agent, load_agent
17
+ from flow.experiments.optimizer import evaluate_agent
18
+ from flow.experiments.types import Task, get_task_suite, load_tasks_from_jsonl
19
+
20
+ console = Console()
21
+
22
+
23
+ def evaluate(
24
+ agent: Annotated[
25
+ Path | None,
26
+ typer.Option(
27
+ "--agent", "-a",
28
+ help="Path to agent YAML config file",
29
+ ),
30
+ ] = None,
31
+ tasks: Annotated[
32
+ Path | None,
33
+ typer.Option(
34
+ "--tasks", "-t",
35
+ help="Path to tasks.jsonl file",
36
+ ),
37
+ ] = None,
38
+ suite: Annotated[
39
+ str | None,
40
+ typer.Option(
41
+ "--suite", "-s",
42
+ help="Built-in task suite: quick, core, coding",
43
+ ),
44
+ ] = None,
45
+ parallel: Annotated[
46
+ int,
47
+ typer.Option(
48
+ "--parallel", "-p",
49
+ help="Max concurrent task executions",
50
+ ),
51
+ ] = 4,
52
+ limit: Annotated[
53
+ int | None,
54
+ typer.Option(
55
+ "--limit", "-l",
56
+ help="Max number of tasks to run",
57
+ ),
58
+ ] = None,
59
+ no_llm_eval: Annotated[
60
+ bool,
61
+ typer.Option(
62
+ "--no-llm-eval",
63
+ help="Disable LLM-as-Judge evaluation (faster, less accurate)",
64
+ ),
65
+ ] = False,
66
+ output_json: Annotated[
67
+ bool,
68
+ typer.Option(
69
+ "--json",
70
+ help="Output results as JSON",
71
+ ),
72
+ ] = False,
73
+ persist: Annotated[
74
+ bool,
75
+ typer.Option(
76
+ "--persist/--no-persist",
77
+ help="Persist results to the FAOS database (visible in flow serve dashboard)",
78
+ ),
79
+ ] = True,
80
+ ) -> None:
81
+ """Evaluate an agent's performance on a set of tasks.
82
+
83
+ Runs a single agent configuration against tasks and reports
84
+ score, pass rate, token usage, and per-task breakdown.
85
+ No optimization or candidate generation — just measurement.
86
+
87
+ Examples:
88
+ # Evaluate agent config on a task file
89
+ flow evaluate --agent agent_config.yaml --tasks tasks.jsonl
90
+
91
+ # Evaluate with built-in suite
92
+ flow evaluate --agent agent_config.yaml --suite quick
93
+
94
+ # Evaluate and persist to dashboard
95
+ flow evaluate --agent agent_config.yaml --tasks tasks.jsonl --persist
96
+
97
+ # JSON output for scripting
98
+ flow evaluate --agent agent_config.yaml --tasks tasks.jsonl --json
99
+ """
100
+ asyncio.run(_run_evaluate(
101
+ agent_path=agent,
102
+ tasks_path=tasks,
103
+ suite=suite,
104
+ parallel=parallel,
105
+ limit=limit,
106
+ use_llm_eval=not no_llm_eval,
107
+ output_json=output_json,
108
+ persist=persist,
109
+ ))
110
+
111
+
112
+ async def _run_evaluate(
113
+ agent_path: Path | None,
114
+ tasks_path: Path | None,
115
+ suite: str | None,
116
+ parallel: int,
117
+ limit: int | None,
118
+ use_llm_eval: bool,
119
+ output_json: bool,
120
+ persist: bool,
121
+ ) -> None:
122
+ """Run evaluation."""
123
+ agent_config, task_list = _load_agent_and_tasks(agent_path, tasks_path, suite, limit)
124
+
125
+ if not output_json:
126
+ console.print(f"\n[bold]Agent:[/] {agent_config.name}")
127
+ console.print(f"[bold]Tasks:[/] {len(task_list)}")
128
+ for t in task_list:
129
+ console.print(f" - {t.name}")
130
+ console.print()
131
+
132
+ try:
133
+ summary = await evaluate_agent(
134
+ agent_config,
135
+ task_list,
136
+ parallel=parallel,
137
+ use_llm_evaluator=use_llm_eval,
138
+ quiet=True,
139
+ )
140
+ except KeyboardInterrupt:
141
+ console.print("\n[yellow]Evaluation cancelled.[/]")
142
+ raise typer.Exit(1)
143
+
144
+ # Persist to database if requested
145
+ job_id: str | None = None
146
+ if persist:
147
+ job_id = await _persist_evaluation(summary, agent_config)
148
+
149
+ # Output results
150
+ if output_json:
151
+ result = {
152
+ "agent": agent_config.name,
153
+ "score": round(summary.avg_score, 4),
154
+ "pass_rate": round(summary.pass_rate, 4),
155
+ "total_tokens": summary.total_tokens,
156
+ "avg_tokens": round(summary.avg_tokens, 1),
157
+ "avg_duration": round(summary.avg_duration, 2),
158
+ "task_count": summary.task_count,
159
+ "job_id": job_id,
160
+ "tasks": [
161
+ {
162
+ "name": tr.task_name,
163
+ "score": round(tr.eval_score, 4),
164
+ "passed": tr.eval_passed,
165
+ "tokens": tr.metrics.total_tokens,
166
+ "reasoning": tr.eval_reasoning,
167
+ }
168
+ for tr in summary.task_results
169
+ ],
170
+ }
171
+ console.print(json.dumps(result, indent=2))
172
+ else:
173
+ _print_eval_results(summary, job_id)
174
+
175
+
176
+ def _load_agent_and_tasks(
177
+ agent_path: Path | None,
178
+ tasks_path: Path | None,
179
+ suite: str | None,
180
+ limit: int | None,
181
+ ) -> tuple[Agent, list[Task]]:
182
+ """Load agent config and task list from CLI arguments."""
183
+ if agent_path:
184
+ if not agent_path.exists():
185
+ console.print(f"[red]Error:[/] Agent file not found: {agent_path}")
186
+ raise typer.Exit(1)
187
+ agent_config = load_agent(agent_path)
188
+ else:
189
+ agent_config = Agent(name="flow_agent")
190
+
191
+ task_list: list[Task] = []
192
+ if tasks_path:
193
+ if not tasks_path.exists():
194
+ console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
195
+ raise typer.Exit(1)
196
+ task_list = load_tasks_from_jsonl(tasks_path)
197
+ elif suite:
198
+ try:
199
+ task_list = get_task_suite(suite)
200
+ except ValueError as e:
201
+ console.print(f"[red]Error:[/] {e}")
202
+ raise typer.Exit(1)
203
+ else:
204
+ try:
205
+ task_list = get_task_suite("quick")
206
+ except ValueError:
207
+ console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
208
+ raise typer.Exit(1)
209
+
210
+ if limit is not None and limit > 0:
211
+ task_list = task_list[:limit]
212
+
213
+ if not task_list:
214
+ console.print("[red]Error:[/] No tasks to evaluate")
215
+ raise typer.Exit(1)
216
+
217
+ return agent_config, task_list
218
+
219
+
220
+ async def _persist_evaluation(summary: object, agent_config: Agent) -> str | None:
221
+ """Deploy agent and persist evaluation results to database."""
222
+ try:
223
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
224
+
225
+ adapter = PersistenceAdapter()
226
+ deploy_result = await adapter.deploy_agent(agent_config, source="evaluate")
227
+ job_id = await adapter.persist_evaluation(summary, deploy_result.config_id)
228
+ return job_id
229
+ except ImportError:
230
+ console.print("[yellow]Warning:[/] Database not available. Results not persisted.")
231
+ console.print("[dim]Start the dashboard with: flow serve[/]")
232
+ return None
233
+ except Exception as e:
234
+ console.print(f"[yellow]Warning:[/] Failed to persist results: {e}")
235
+ return None
236
+
237
+
238
+ def _print_eval_results(summary: object, job_id: str | None = None) -> None:
239
+ """Print evaluation results as Rich tables."""
240
+ from flow.experiments.optimizer import CandidateSummary
241
+
242
+ assert isinstance(summary, CandidateSummary)
243
+
244
+ console.print("[bold green]Evaluation complete![/]\n")
245
+
246
+ table = Table(title="Results")
247
+ table.add_column("Metric", style="cyan")
248
+ table.add_column("Value", style="green")
249
+
250
+ table.add_row("Score", f"{summary.avg_score:.2f}")
251
+ table.add_row("Pass Rate", f"{summary.pass_rate:.0%}")
252
+ table.add_row("Total Tokens", f"{summary.total_tokens:,}")
253
+ table.add_row("Avg Tokens", f"{summary.avg_tokens:,.0f}")
254
+ table.add_row("Avg Duration", f"{summary.avg_duration:.1f}s")
255
+ table.add_row("Tasks", str(summary.task_count))
256
+ if job_id:
257
+ table.add_row("Job ID", job_id)
258
+ console.print(table)
259
+
260
+ if summary.task_results:
261
+ console.print()
262
+ task_table = Table(title="Per-Task Breakdown")
263
+ task_table.add_column("Task", style="cyan")
264
+ task_table.add_column("Score", style="green")
265
+ task_table.add_column("Status", style="bold")
266
+ task_table.add_column("Tokens", style="dim")
267
+
268
+ for tr in summary.task_results:
269
+ status = "[green]PASS[/]" if tr.eval_passed else "[red]FAIL[/]"
270
+ task_table.add_row(
271
+ tr.task_name,
272
+ f"{tr.eval_score:.2f}",
273
+ status,
274
+ f"{tr.metrics.total_tokens:,}",
275
+ )
276
+ console.print(task_table)
277
+
278
+ if job_id:
279
+ console.print(f"\n[dim]View in dashboard:[/] http://localhost:8091/jobs/{job_id}")
src/flow/cli/hf_import.py CHANGED
@@ -10,6 +10,7 @@ from rich.console import Console
10
 
11
  from flow.experiments.hf_datasets import (
12
  DATASET_CONVERTERS,
 
13
  import_hf_dataset,
14
  save_tasks_to_jsonl,
15
  )
@@ -98,7 +99,7 @@ def hf_import(
98
  if list_supported:
99
  console.print("\n[bold]Supported Datasets:[/]")
100
  console.print("\n[dim]You can add custom converters via register_converter()[/]\n")
101
- for name in sorted(DATASET_CONVERTERS.keys()):
102
  console.print(f" • {name}")
103
  return
104
 
 
10
 
11
  from flow.experiments.hf_datasets import (
12
  DATASET_CONVERTERS,
13
+ LAZY_CONVERTERS,
14
  import_hf_dataset,
15
  save_tasks_to_jsonl,
16
  )
 
99
  if list_supported:
100
  console.print("\n[bold]Supported Datasets:[/]")
101
  console.print("\n[dim]You can add custom converters via register_converter()[/]\n")
102
+ for name in sorted({*DATASET_CONVERTERS, *LAZY_CONVERTERS}):
103
  console.print(f" • {name}")
104
  return
105
 
src/flow/cli/optimize.py CHANGED
@@ -71,13 +71,27 @@ def optimize(
71
  help="Max concurrent experiments",
72
  ),
73
  ] = 4,
74
- vary: Annotated[
75
  str | None,
76
  typer.Option(
77
- "--vary", "-v",
78
- help="Comma-separated params to vary: compaction, strategy, tools, head, tail",
79
  ),
80
  ] = None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  output: Annotated[
82
  Path | None,
83
  typer.Option(
@@ -106,33 +120,31 @@ def optimize(
106
  help="Maximum number of candidates to generate",
107
  ),
108
  ] = 100,
 
 
 
 
 
 
 
109
  ) -> None:
110
  """Find the best agent configuration through experimentation.
111
 
112
- Runs experiments in parallel, evaluates with LLM-as-Judge,
113
- ranks via Pareto analysis, and exports winning agent configs.
114
-
115
  Examples:
116
- # Use experiment YAML (recommended - defines agent, tasks, and variations)
117
- flow optimize --experiment experiment.yaml
118
-
119
- # Run with task file and default candidates
120
- flow optimize --tasks tasks.jsonl
121
-
122
- # Vary specific parameters
123
- flow optimize --vary compaction,tools --tasks tasks.jsonl
124
 
125
- # Test all compaction strategies
126
- flow optimize --vary strategy --suite coding
127
 
128
- # Use built-in task suite
129
- flow optimize --suite coding --parallel 2
130
 
131
- # Start from a base agent definition
132
- flow optimize --agent base_agent.yaml --vary compaction,tools --tasks tasks.jsonl
133
 
134
- # Use GEPA for active prompt optimization (via YAML config)
135
- flow optimize --config gepa_strategy.yaml --agent base_agent.yaml --tasks tasks.jsonl
136
  """
137
  asyncio.run(_run_optimize(
138
  tasks_path=tasks,
@@ -141,11 +153,14 @@ def optimize(
141
  agent_path=agent,
142
  suite=suite,
143
  parallel=parallel,
144
- vary=vary,
 
 
145
  output_dir=output,
146
  use_llm_eval=not no_llm_eval,
147
  budget=budget,
148
  limit=limit,
 
149
  ))
150
 
151
 
@@ -156,11 +171,14 @@ async def _run_optimize(
156
  agent_path: Path | None,
157
  suite: str | None,
158
  parallel: int,
159
- vary: str | None,
 
 
160
  output_dir: Path | None,
161
  use_llm_eval: bool,
162
  budget: int,
163
  limit: int | None = None,
 
164
  ) -> None:
165
  """Run the optimization."""
166
  # If experiment YAML provided, use it as the source of truth
@@ -177,26 +195,43 @@ async def _run_optimize(
177
  # Load base agent
178
  base = _load_base_agent(agent_path)
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  # Load candidates and check if a strategy is defined in config
181
- candidates, strategy_instance = await _load_candidates_and_strategy(config_path, vary, base, budget)
182
-
183
  # If a strategy was provided (like GepaStrategy), run it directly
184
  if strategy_instance is not None:
185
  console.print("\n[bold]Running active optimization strategy...[/]")
186
  await _run_active_strategy(
187
- strategy=strategy_instance,
188
- base_agent=base,
189
- tasks=tasks,
190
- output_dir=output_dir,
191
  parallel=parallel,
192
  use_llm_eval=use_llm_eval,
193
  budget=budget
194
  )
195
  return
196
-
197
  # Otherwise, use traditional grid search with candidates
198
  if not candidates:
199
- console.print("[red]Error:[/] No candidates to test. Use --config or --vary")
200
  raise typer.Exit(1)
201
 
202
  console.print(f"\n[bold]Base Agent:[/] {base.name}")
@@ -223,6 +258,9 @@ async def _run_optimize(
223
  console.print("\nTo use an agent config:")
224
  console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")
225
 
 
 
 
226
  except KeyboardInterrupt:
227
  console.print("\n[yellow]Optimization cancelled.[/]")
228
  raise typer.Exit(1)
@@ -360,7 +398,6 @@ def _load_base_agent(agent_path: Path | None) -> Agent:
360
 
361
  async def _load_candidates_and_strategy(
362
  config_path: Path | None,
363
- vary: str | None,
364
  base: Agent,
365
  budget: int,
366
  ) -> tuple[list[Candidate], Any | None]:
@@ -405,17 +442,13 @@ async def _load_candidates_and_strategy(
405
  console.print("[red]Error:[/] Config file has no CANDIDATES, VARIATIONS, or STRATEGY")
406
  raise typer.Exit(1)
407
 
408
- if vary:
409
- variations = _parse_vary_flag(vary)
410
- strategy = GridSearchStrategy(variations)
411
- return await strategy.generate(base, budget), None
412
-
413
- # Default: explore context engineering dimensions
414
  strategy = GridSearchStrategy(variations={
415
  "compaction": [
416
- CompactionConfig.head_tail(10, 40),
417
  CompactionConfig.none(),
 
418
  ],
 
419
  })
420
  return await strategy.generate(base, budget), None
421
 
@@ -455,12 +488,18 @@ def _load_yaml_strategy(path: Path) -> Any | None:
455
  console.print("[red]Error:[/] GEPA optimizer not available.")
456
  console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
457
  raise typer.Exit(1)
458
- elif strategy_type == "llm_rewriter":
459
- from flow.experiments.strategies.llm_rewriter import LLMRewriterStrategy
460
- return LLMRewriterStrategy(config=strategy_config)
 
 
 
 
 
 
461
  else:
462
  console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
463
- console.print("[dim]Supported: gepa, llm_rewriter[/]")
464
  raise typer.Exit(1)
465
 
466
 
@@ -488,50 +527,6 @@ def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any], An
488
  return candidates, variations, strategy
489
 
490
 
491
- def _parse_vary_flag(vary: str) -> dict[str, Any]:
492
- """Parse --vary flag into variations dict.
493
-
494
- Supported parameters:
495
- compaction, compact: Test head_tail vs none
496
- strategy: Test all compaction strategies (none, head_tail, sliding_window, summarization)
497
- tools: Test minimal vs standard tool sets
498
- head, head_size: Vary head sizes (5, 10, 20)
499
- tail, tail_size: Vary tail sizes (20, 40, 60)
500
- """
501
- variations: dict[str, Any] = {}
502
-
503
- for param in vary.split(","):
504
- param = param.strip().lower()
505
-
506
- if param in ("compaction", "compact"):
507
- variations["compaction"] = [
508
- CompactionConfig.head_tail(10, 40),
509
- CompactionConfig.none(),
510
- ]
511
- elif param in ("strategy", "strategies"):
512
- # Test all compaction strategies
513
- variations["compaction"] = [
514
- CompactionConfig.none(),
515
- CompactionConfig.head_tail(10, 40),
516
- CompactionConfig(strategy="sliding_window", token_budget=50_000),
517
- CompactionConfig(strategy="summarization", token_budget=50_000),
518
- ]
519
- elif param in ("tools", "toolset"):
520
- # Tool variations - memory and subagent are just tools
521
- variations["tools"] = ["minimal", "standard"]
522
- elif param in ("head", "head_size"):
523
- variations["compaction"] = [
524
- CompactionConfig.head_tail(h, 40) for h in [5, 10, 20]
525
- ]
526
- elif param in ("tail", "tail_size"):
527
- variations["compaction"] = [
528
- CompactionConfig.head_tail(10, t) for t in [20, 40, 60]
529
- ]
530
- else:
531
- console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
532
-
533
- return variations
534
-
535
 
536
  async def _run_active_strategy(
537
  strategy: Any,
@@ -544,7 +539,7 @@ async def _run_active_strategy(
544
  ) -> None:
545
  """Run an active optimization strategy.
546
 
547
- For strategies that use the ExperimentRunner protocol (LLMRewriterStrategy),
548
  delegates to FlowOptimizer.optimize_with_strategy() which handles setup,
549
  evaluation, Pareto analysis, and export.
550
 
@@ -732,3 +727,140 @@ async def _run_gepa_strategy(
732
 
733
  console.print(f"\nAgents exported to: [cyan]{output_path / 'agents'}[/]")
734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  help="Max concurrent experiments",
72
  ),
73
  ] = 4,
74
+ strategy: Annotated[
75
  str | None,
76
  typer.Option(
77
+ "--strategy", "-S",
78
+ help="Active strategy: tools, instructions, skills (comma-separated for pipeline)",
79
  ),
80
  ] = None,
81
+ max_iterations: Annotated[
82
+ int,
83
+ typer.Option(
84
+ "--max-iterations",
85
+ help="Max iterations for active strategies",
86
+ ),
87
+ ] = 3,
88
+ min_improvement: Annotated[
89
+ float,
90
+ typer.Option(
91
+ "--min-improvement",
92
+ help="Min score improvement to continue iterating",
93
+ ),
94
+ ] = 0.01,
95
  output: Annotated[
96
  Path | None,
97
  typer.Option(
 
120
  help="Maximum number of candidates to generate",
121
  ),
122
  ] = 100,
123
+ persist: Annotated[
124
+ bool,
125
+ typer.Option(
126
+ "--persist/--no-persist",
127
+ help="Persist results to the FAOS database (visible in flow serve dashboard)",
128
+ ),
129
+ ] = True,
130
  ) -> None:
131
  """Find the best agent configuration through experimentation.
132
 
 
 
 
133
  Examples:
134
+ # Optimize tools
135
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy tools
 
 
 
 
 
 
136
 
137
+ # Optimize instructions
138
+ flow optimize --agent agent.yaml --suite quick --strategy instructions
139
 
140
+ # Optimize both (pipeline: instructions then tools)
141
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy instructions,tools
142
 
143
+ # Skip persisting to dashboard
144
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy tools --no-persist
145
 
146
+ # Use experiment YAML (defines agent, tasks, and variations)
147
+ flow optimize --experiment experiment.yaml
148
  """
149
  asyncio.run(_run_optimize(
150
  tasks_path=tasks,
 
153
  agent_path=agent,
154
  suite=suite,
155
  parallel=parallel,
156
+ strategy=strategy,
157
+ max_iterations=max_iterations,
158
+ min_improvement=min_improvement,
159
  output_dir=output,
160
  use_llm_eval=not no_llm_eval,
161
  budget=budget,
162
  limit=limit,
163
+ persist=persist,
164
  ))
165
 
166
 
 
171
  agent_path: Path | None,
172
  suite: str | None,
173
  parallel: int,
174
+ strategy: str | None,
175
+ max_iterations: int,
176
+ min_improvement: float,
177
  output_dir: Path | None,
178
  use_llm_eval: bool,
179
  budget: int,
180
  limit: int | None = None,
181
+ persist: bool = False,
182
  ) -> None:
183
  """Run the optimization."""
184
  # If experiment YAML provided, use it as the source of truth
 
195
  # Load base agent
196
  base = _load_base_agent(agent_path)
197
 
198
+ # Active strategy mode (--strategy tools, --strategy instructions,tools)
199
+ if strategy:
200
+ result = await _run_strategy_optimize(
201
+ strategy_names=strategy,
202
+ base=base,
203
+ tasks=tasks,
204
+ parallel=parallel,
205
+ use_llm_eval=use_llm_eval,
206
+ budget=budget,
207
+ output_dir=output_dir,
208
+ max_iterations=max_iterations,
209
+ min_improvement=min_improvement,
210
+ )
211
+ if persist and result:
212
+ await _persist_optimization(result, base)
213
+ return
214
+
215
  # Load candidates and check if a strategy is defined in config
216
+ candidates, strategy_instance = await _load_candidates_and_strategy(config_path, base, budget)
217
+
218
  # If a strategy was provided (like GepaStrategy), run it directly
219
  if strategy_instance is not None:
220
  console.print("\n[bold]Running active optimization strategy...[/]")
221
  await _run_active_strategy(
222
+ strategy=strategy_instance,
223
+ base_agent=base,
224
+ tasks=tasks,
225
+ output_dir=output_dir,
226
  parallel=parallel,
227
  use_llm_eval=use_llm_eval,
228
  budget=budget
229
  )
230
  return
231
+
232
  # Otherwise, use traditional grid search with candidates
233
  if not candidates:
234
+ console.print("[red]Error:[/] No candidates to test. Use --strategy or --config")
235
  raise typer.Exit(1)
236
 
237
  console.print(f"\n[bold]Base Agent:[/] {base.name}")
 
258
  console.print("\nTo use an agent config:")
259
  console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")
260
 
261
+ if persist:
262
+ await _persist_optimization(result, base)
263
+
264
  except KeyboardInterrupt:
265
  console.print("\n[yellow]Optimization cancelled.[/]")
266
  raise typer.Exit(1)
 
398
 
399
  async def _load_candidates_and_strategy(
400
  config_path: Path | None,
 
401
  base: Agent,
402
  budget: int,
403
  ) -> tuple[list[Candidate], Any | None]:
 
442
  console.print("[red]Error:[/] Config file has no CANDIDATES, VARIATIONS, or STRATEGY")
443
  raise typer.Exit(1)
444
 
445
+ # Default: explore all key dimensions (compaction, tools, instructions)
 
 
 
 
 
446
  strategy = GridSearchStrategy(variations={
447
  "compaction": [
 
448
  CompactionConfig.none(),
449
+ CompactionConfig.head_tail(10, 40),
450
  ],
451
+ "tools": ["minimal", "standard"],
452
  })
453
  return await strategy.generate(base, budget), None
454
 
 
488
  console.print("[red]Error:[/] GEPA optimizer not available.")
489
  console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
490
  raise typer.Exit(1)
491
+ elif strategy_type == "instruction":
492
+ from flow.experiments.strategies.instruction import InstructionOptimizer
493
+ return InstructionOptimizer(config=strategy_config)
494
+ elif strategy_type == "tool":
495
+ from flow.experiments.strategies.tool import ToolOptimizer
496
+ return ToolOptimizer(config=strategy_config)
497
+ elif strategy_type == "skill":
498
+ from flow.experiments.strategies.skill import SkillOptimizer
499
+ return SkillOptimizer(config=strategy_config)
500
  else:
501
  console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
502
+ console.print("[dim]Supported: gepa, instruction, tool, skill[/]")
503
  raise typer.Exit(1)
504
 
505
 
 
527
  return candidates, variations, strategy
528
 
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
 
531
  async def _run_active_strategy(
532
  strategy: Any,
 
539
  ) -> None:
540
  """Run an active optimization strategy.
541
 
542
+ For strategies that use the ExperimentRunner protocol (InstructionOptimizer),
543
  delegates to FlowOptimizer.optimize_with_strategy() which handles setup,
544
  evaluation, Pareto analysis, and export.
545
 
 
727
 
728
  console.print(f"\nAgents exported to: [cyan]{output_path / 'agents'}[/]")
729
 
730
+
731
async def _run_strategy_optimize(
    strategy_names: str,
    base: Agent,
    tasks: list[Task],
    parallel: int,
    use_llm_eval: bool,
    budget: int,
    output_dir: Path | None,
    max_iterations: int,
    min_improvement: float,
) -> "OptimizationResult | None":
    """Run active strategy optimization (--strategy flag).

    Supports single strategies and comma-separated pipelines
    (e.g. ``"tools,instructions"``); each pipeline stage starts from the
    best agent found by the previous one. Reuses _resolve_strategy() from
    agent_api to avoid duplication.

    Args:
        strategy_names: Comma-separated strategy names to run in order.
        base: Agent to start optimizing from.
        tasks: Evaluation tasks shared by every stage.
        parallel: Worker count forwarded to FlowOptimizer.
        use_llm_eval: Whether FlowOptimizer should use the LLM evaluator.
        budget: Per-stage candidate budget forwarded to the strategy run.
        output_dir: Where exported agents/results are written (None = default).
        max_iterations: Passed to each strategy via its config.
        min_improvement: Minimum score gain for a strategy to keep iterating.

    Returns:
        The final OptimizationResult (merged across stages when more than
        one strategy ran), or None if no stage produced a result.

    Raises:
        typer.Exit: On an unknown strategy name or Ctrl-C.
    """
    # Imported lazily so the CLI module stays importable without the
    # optimizer extras installed.
    from flow.experiments.ablation import compute_pareto_frontier
    from flow.experiments.agent_api import _resolve_strategy
    from flow.experiments.optimizer import CandidateSummary, OptimizationResult

    strategy_list = [s.strip() for s in strategy_names.split(",")]
    # Shared config applied to every stage's strategy instance.
    strategy_config = {
        "max_iterations": max_iterations,
        "min_improvement": min_improvement,
    }

    console.print(f"\n[bold]Strategy:[/] {' → '.join(strategy_list)}")
    console.print(f"[bold]Base Agent:[/] {base.name}")
    console.print(f"[bold]Tasks:[/] {len(tasks)}")
    console.print(f"[bold]Max Iterations:[/] {max_iterations}")
    console.print()

    current_agent = base
    last_result: OptimizationResult | None = None
    # Accumulators so a multi-stage pipeline can be merged into one result.
    all_summaries: list[CandidateSummary] = []
    total_experiments = 0
    total_duration = 0.0

    try:
        for strat_name in strategy_list:
            try:
                strat_instance = _resolve_strategy(strat_name, strategy_config)
            except ValueError as e:
                # Unknown strategy name — surface the error and abort the CLI.
                console.print(f"[red]Error:[/] {e}")
                raise typer.Exit(1)

            # A fresh optimizer per stage; all stages share the same output_dir.
            optimizer = FlowOptimizer(
                parallel=parallel,
                use_llm_evaluator=use_llm_eval,
                output_dir=output_dir,
            )

            last_result = await optimizer.optimize_with_strategy(
                strategy=strat_instance,
                base=current_agent,
                tasks=tasks,
                budget=budget,
            )

            # Accumulate results from all stages
            all_summaries.extend(last_result.summaries)
            total_experiments += last_result.total_experiments
            total_duration += last_result.total_duration_seconds

            # Next stage starts from the best agent found
            best = last_result.get_best_candidate("score")
            if best:
                current_agent = best.candidate.agent

        # Merge all stage results into a combined result with recomputed Pareto
        if last_result and len(strategy_list) > 1:
            # Deduplicate summaries by name (baseline may appear in multiple stages)
            seen_names: set[str] = set()
            deduped: list[CandidateSummary] = []
            for s in all_summaries:
                if s.name not in seen_names:
                    seen_names.add(s.name)
                    deduped.append(s)

            # Recompute Pareto frontier across all stages
            pareto_names = compute_pareto_frontier(deduped)
            for s in deduped:
                # NOTE(review): summaries are mutated in place; assumes no other
                # holder relies on the per-stage Pareto flags — confirm.
                s.is_pareto_optimal = s.name in pareto_names
                s.pareto_rank = 0 if s.is_pareto_optimal else 1

            rank_by_score = sorted(deduped, key=lambda s: s.avg_score, reverse=True)
            rank_by_tokens = sorted(deduped, key=lambda s: s.avg_tokens)
            # Efficiency = score per token; max(..., 1) guards divide-by-zero.
            rank_by_efficiency = sorted(
                deduped,
                key=lambda s: s.avg_score / max(s.avg_tokens, 1),
                reverse=True,
            )

            # Rebuild the result from the final stage's bookkeeping plus the
            # merged summaries/rankings/totals across all stages.
            last_result = OptimizationResult(
                timestamp=last_result.timestamp,
                output_dir=last_result.output_dir,
                summaries=deduped,
                pareto_frontier=pareto_names,
                exported_agents=last_result.exported_agents,
                rank_by_score=[s.name for s in rank_by_score],
                rank_by_tokens=[s.name for s in rank_by_tokens],
                rank_by_efficiency=[s.name for s in rank_by_efficiency],
                total_experiments=total_experiments,
                total_duration_seconds=total_duration,
            )

        console.print("\n[bold green]Optimization complete![/]")
        if last_result:
            console.print(f"\nBest agents exported to: [cyan]{last_result.output_dir / 'agents'}[/]")
            console.print("\nTo use the best config:")
            console.print(f" [dim]flow run --config {last_result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Optimization cancelled.[/]")
        raise typer.Exit(1)

    return last_result
848
+
849
+
850
async def _persist_optimization(result: "OptimizationResult", base_agent: Agent) -> None:
    """Deploy agent and persist optimization results to database.

    Best-effort: a missing persistence backend (ImportError) or any runtime
    failure is reported as a console warning rather than raised, so CLI
    optimization runs never fail just because persistence is unavailable.

    Args:
        result: Optimization result to store.
        base_agent: Agent that was optimized; deployed alongside the result.
    """
    # NOTE: the original body also imported OptimizationResult at runtime,
    # but the annotation above is a string literal, so that import was
    # unused — it has been removed.
    try:
        from flow.ui.services.persistence_adapter import PersistenceAdapter

        adapter = PersistenceAdapter()
        deploy_result = await adapter.deploy_agent(base_agent, source="optimize")
        job_id = await adapter.persist_optimization(result, deploy_result.config_id)
        console.print(f"\n[dim]View in dashboard:[/] http://localhost:8091/jobs/{job_id}")
    except ImportError:
        # Database stack not installed — point the user at the dashboard setup.
        console.print("[yellow]Warning:[/] Database not available. Results not persisted.")
        console.print("[dim]Start the dashboard with: flow serve[/]")
    except Exception as e:
        # Deliberate broad catch: persistence is strictly best-effort.
        console.print(f"[yellow]Warning:[/] Failed to persist results: {e}")
+
src/flow/experiments/agent_api.py CHANGED
@@ -36,8 +36,10 @@ DEFAULT_VARIATIONS: dict[str, list[Any]] = {
36
 
37
  # Known active strategy names and their classes
38
  _STRATEGY_MAP: dict[str, str] = {
39
- "tools": "flow.experiments.strategies.tool_selector.ToolSelectorStrategy",
40
- "instructions": "flow.experiments.strategies.llm_rewriter.LLMRewriterStrategy",
 
 
41
  }
42
 
43
 
@@ -131,11 +133,15 @@ async def _evaluate_agent_impl(
131
  return result
132
 
133
 
134
- def _resolve_strategy(name: str) -> Any:
135
  """Import and instantiate a named strategy.
136
 
137
  Args:
138
- name: Strategy name ("tools", "instructions")
 
 
 
 
139
 
140
  Returns:
141
  Strategy instance
@@ -147,14 +153,18 @@ def _resolve_strategy(name: str) -> Any:
147
  available = ["grid"] + list(_STRATEGY_MAP.keys())
148
  raise ValueError(f"Unknown strategy: {name!r}. Available: {available}")
149
 
 
 
 
 
 
 
 
150
  module_path, class_name = _STRATEGY_MAP[name].rsplit(".", 1)
151
  import importlib
152
  mod = importlib.import_module(module_path)
153
  cls = getattr(mod, class_name)
154
- return cls(config={
155
- "max_iterations": 3,
156
- "min_improvement": 0.01,
157
- })
158
 
159
 
160
  def _opt_result_to_agent_result(
@@ -216,6 +226,7 @@ async def _optimize_agent_impl(
216
  quiet: bool,
217
  agent_id: str | None = None,
218
  strategy: str | list[str] | None = None,
 
219
  ) -> AgentOptimizationResult:
220
  """Implementation of Agent.optimize().
221
 
@@ -225,6 +236,8 @@ async def _optimize_agent_impl(
225
  grid search. A string like "tools" or "instructions" runs that
226
  strategy. A list runs them sequentially, each starting from the
227
  previous best.
 
 
228
  """
229
  resolved_tasks = _resolve_tasks(tasks)
230
 
@@ -264,7 +277,7 @@ async def _optimize_agent_impl(
264
  last_opt_result: OptimizationResult | None = None
265
 
266
  for strat_name in strategy_list:
267
- strat_instance = _resolve_strategy(strat_name)
268
  optimizer = FlowOptimizer(parallel=parallel, use_llm_evaluator=use_llm_eval)
269
 
270
  if quiet:
 
36
 
37
  # Known active strategy names and their classes
38
  _STRATEGY_MAP: dict[str, str] = {
39
+ "tools": "flow.experiments.strategies.tool.ToolOptimizer",
40
+ "instructions": "flow.experiments.strategies.instruction.InstructionOptimizer",
41
+ "skills": "flow.experiments.strategies.skill.SkillOptimizer",
42
+ "gepa_instructions": "flow.experiments.strategies.gepa_instruction.GEPAInstructionOptimizer",
43
  }
44
 
45
 
 
133
  return result
134
 
135
 
136
+ def _resolve_strategy(name: str, config: dict[str, Any] | None = None) -> Any:
137
  """Import and instantiate a named strategy.
138
 
139
  Args:
140
+ name: Strategy name ("tools", "instructions", "skills")
141
+ config: Optional strategy-specific config. Merged with defaults:
142
+ max_iterations (int): Max optimization iterations (default: 3)
143
+ min_improvement (float): Min score gain to continue (default: 0.01)
144
+ Additional keys are passed through to the strategy.
145
 
146
  Returns:
147
  Strategy instance
 
153
  available = ["grid"] + list(_STRATEGY_MAP.keys())
154
  raise ValueError(f"Unknown strategy: {name!r}. Available: {available}")
155
 
156
+ defaults: dict[str, Any] = {
157
+ "max_iterations": 3,
158
+ "min_improvement": 0.01,
159
+ }
160
+ if config:
161
+ defaults.update(config)
162
+
163
  module_path, class_name = _STRATEGY_MAP[name].rsplit(".", 1)
164
  import importlib
165
  mod = importlib.import_module(module_path)
166
  cls = getattr(mod, class_name)
167
+ return cls(config=defaults)
 
 
 
168
 
169
 
170
  def _opt_result_to_agent_result(
 
226
  quiet: bool,
227
  agent_id: str | None = None,
228
  strategy: str | list[str] | None = None,
229
+ strategy_config: dict[str, Any] | None = None,
230
  ) -> AgentOptimizationResult:
231
  """Implementation of Agent.optimize().
232
 
 
236
  grid search. A string like "tools" or "instructions" runs that
237
  strategy. A list runs them sequentially, each starting from the
238
  previous best.
239
+ strategy_config: Optional config dict passed to strategy constructors.
240
+ Merged with defaults (max_iterations=3, min_improvement=0.01).
241
  """
242
  resolved_tasks = _resolve_tasks(tasks)
243
 
 
277
  last_opt_result: OptimizationResult | None = None
278
 
279
  for strat_name in strategy_list:
280
+ strat_instance = _resolve_strategy(strat_name, strategy_config)
281
  optimizer = FlowOptimizer(parallel=parallel, use_llm_evaluator=use_llm_eval)
282
 
283
  if quiet:
src/flow/experiments/data/tasks/house_rules.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"name": "calculator_module", "prompt": "Create a Python module calc.py with four functions: add, subtract, multiply, divide. Each takes two numbers and returns the result. Include a brief test at the bottom (if __name__ == '__main__') that exercises each function and prints the results. Then run it.", "criteria": [{"name": "file_created", "instruction": "calc.py was created and runs without errors"}, {"name": "correct_results", "instruction": "All four operations produce correct results when run"}, {"name": "type_annotations", "instruction": "EVERY function has full type annotations on ALL parameters AND the return type. e.g. def add(a: float, b: float) -> float. Functions missing ANY annotation fail this criterion."}, {"name": "error_handling", "instruction": "divide() MUST raise a ValueError (not ZeroDivisionError, not return None) when the divisor is zero. The test block must demonstrate this by catching ValueError and printing a message. If ZeroDivisionError is raised or None is returned, this fails."}, {"name": "module_exports", "instruction": "The module MUST define __all__ = ['add', 'subtract', 'multiply', 'divide'] at the top of the file. If __all__ is missing this criterion fails."}], "category": "house_rules", "metadata": {"expected_duration": 120}}
2
+ {"name": "csv_report", "prompt": "Create a Python script report.py that generates a CSV file 'sales_report.csv' with 10 rows of sample sales data. Columns: date, product, quantity, unit_price, total. Then read the CSV back and print a summary: total revenue and the top-selling product by quantity. Run the script.", "criteria": [{"name": "file_created", "instruction": "report.py was created and runs without errors"}, {"name": "csv_generated", "instruction": "sales_report.csv was created with 10 data rows"}, {"name": "iso_dates", "instruction": "ALL dates in the CSV MUST be in ISO-8601 format (YYYY-MM-DD). Dates like 'Jan 15, 2024' or '01/15/2024' or 'January 15' FAIL this criterion. Only YYYY-MM-DD is acceptable."}, {"name": "header_comment", "instruction": "The very first line of the CSV file MUST be a comment line starting with '# ' that describes the file contents and generation timestamp. e.g. '# Sales report generated 2024-01-15T10:30:00'. If the first line is the column header row, this fails."}, {"name": "monetary_format", "instruction": "When printing the summary to stdout, ALL monetary values MUST be formatted with exactly 2 decimal places and a dollar sign. e.g. '$1,234.56' or '$42.00'. Values like '1234.5' or '42' or '$1234.567' FAIL this criterion."}], "category": "house_rules", "metadata": {"expected_duration": 120}}
3
+ {"name": "api_response_builder", "prompt": "Create a Python module api_utils.py with a function build_response(data, status_code=200) that builds a JSON-ready dictionary representing an API response. Also create a function validate_email(email: str) -> bool that checks if an email is roughly valid. Write a test block that demonstrates both functions with a few examples and prints the JSON output. Run it.", "criteria": [{"name": "file_created", "instruction": "api_utils.py was created and runs without errors"}, {"name": "correct_behavior", "instruction": "build_response returns a dict and validate_email correctly accepts/rejects obvious cases"}, {"name": "response_envelope", "instruction": "build_response() MUST return a dict with EXACTLY this structure: {'status': 'ok' or 'error', 'code': int, 'data': ..., 'timestamp': ISO-8601 string}. The 'status' field MUST be 'ok' for codes 200-299 and 'error' for all others. 'timestamp' MUST be present and in ISO-8601 format. If any of these keys are missing or the status logic is wrong, this fails."}, {"name": "error_response", "instruction": "When status_code >= 400, the response MUST include an 'error' key with a human-readable error message string (not None, not empty). The test block MUST demonstrate at least one error response (e.g. status_code=404). If no error response is shown or the 'error' key is missing for error codes, this fails."}, {"name": "json_output", "instruction": "The test block MUST use json.dumps with indent=2 to print the response. Raw dict printing (using print(dict)) or json.dumps without indent FAIL this criterion. The output must be valid, pretty-printed JSON."}], "category": "house_rules", "metadata": {"expected_duration": 120}}
src/flow/experiments/eval_cache.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Evaluation cache for avoiding redundant agent evaluations.
4
+
5
+ Provides pluggable backends (in-memory and disk-based) so that identical
6
+ (agent-config, task) pairs are not re-evaluated within or across sessions.
7
+
8
+ Cache keys are SHA-256 hashes of the agent's functional configuration
9
+ (instructions, tools, framework, llm_config, compaction) combined with
10
+ the task definition (prompt, criteria). The agent *name* is intentionally
11
+ excluded because it varies across iterations while the actual behaviour
12
+ remains the same.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ import json
19
+ import logging
20
+ import sqlite3
21
+ import time
22
+ from dataclasses import asdict
23
+ from pathlib import Path
24
+ from typing import Any, Protocol, runtime_checkable
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Protocol
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
@runtime_checkable
class EvaluationCache(Protocol):
    """Protocol for evaluation result caching.

    Satisfied structurally by both backends in this module
    (``InMemoryCache`` and ``DiskCache``). Results are plain dicts;
    the disk backend serialises them to JSON, so payloads should be
    JSON-representable (non-JSON values are stringified on write).
    """

    def get(self, key: str) -> dict[str, Any] | None:
        """Return cached result dict for *key*, or ``None`` on miss."""
        ...

    def put(self, key: str, result: dict[str, Any]) -> None:
        """Store *result* under *key*."""
        ...
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Cache-key builder
49
+ # ---------------------------------------------------------------------------
50
+
51
+
52
def build_cache_key(agent_config: dict[str, Any], task_dict: dict[str, Any]) -> str:
    """Build a deterministic cache key from agent config and task.

    Both dicts are serialised to canonical JSON (sorted keys, ``str``
    fallback for non-JSON values) and hashed, so logically identical
    inputs always produce the same key.

    Args:
        agent_config: Dict with the agent's functional fields
            (instructions, tools, framework, llm_config, compaction).
        task_dict: Dict with the task's identity fields
            (prompt, criteria).

    Returns:
        A hex SHA-256 digest string.
    """
    payload = {"agent": agent_config, "task": task_dict}
    canonical = json.dumps(payload, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode()).hexdigest()
70
+
71
+
72
def agent_cache_dict(agent: Any) -> dict[str, Any]:
    """Extract the functional (behaviour-defining) fields from an Agent.

    The agent *name* is deliberately omitted: two agents with identical
    configuration but different names should share the same cache key.
    """
    tools = _normalise(getattr(agent, "tools", None))
    compaction = _compaction_dict(getattr(agent, "compaction", None))
    return {
        "instructions": getattr(agent, "instructions", None),
        "tools": tools,
        "framework": getattr(agent, "framework", None),
        "llm_config": getattr(agent, "llm_config", None),
        "compaction": compaction,
    }
85
+
86
+
87
def task_cache_dict(task: Any) -> dict[str, Any]:
    """Extract the identity fields from a Task.

    Criterion objects (anything with a ``name`` attribute) are flattened
    to ``{"name", "instruction"}`` dicts; plain dict criteria pass through.
    """
    criteria_dicts: list[Any] = []
    for criterion in getattr(task, "criteria", []):
        if hasattr(criterion, "name"):
            criteria_dicts.append(
                {
                    "name": criterion.name,
                    "instruction": getattr(criterion, "instruction", ""),
                }
            )
        else:
            criteria_dicts.append(criterion)
    return {"prompt": getattr(task, "prompt", ""), "criteria": criteria_dicts}
103
+
104
+
105
+ def _normalise(value: Any) -> Any:
106
+ """Normalise tool configs for deterministic hashing."""
107
+ if isinstance(value, list):
108
+ return sorted(value)
109
+ return value
110
+
111
+
112
+ def _compaction_dict(compaction: Any) -> dict[str, Any] | None:
113
+ if compaction is None:
114
+ return None
115
+ try:
116
+ return asdict(compaction)
117
+ except Exception:
118
+ return str(compaction)
119
+
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # In-memory backend
123
+ # ---------------------------------------------------------------------------
124
+
125
+
126
+ class InMemoryCache:
127
+ """Dict-backed cache that lives for the lifetime of the object."""
128
+
129
+ def __init__(self) -> None:
130
+ self._store: dict[str, dict[str, Any]] = {}
131
+
132
+ def get(self, key: str) -> dict[str, Any] | None:
133
+ return self._store.get(key)
134
+
135
+ def put(self, key: str, result: dict[str, Any]) -> None:
136
+ self._store[key] = result
137
+
138
+ @property
139
+ def size(self) -> int:
140
+ return len(self._store)
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Disk (SQLite) backend
145
+ # ---------------------------------------------------------------------------
146
+
147
+ _DEFAULT_CACHE_DIR = Path.home() / ".flow" / "cache"
148
+ _DB_FILENAME = "eval_cache.db"
149
+
150
+
151
+ class DiskCache:
152
+ """SQLite-backed cache that persists across sessions.
153
+
154
+ The database is created lazily on first access at
155
+ ``~/.flow/cache/eval_cache.db`` (configurable via *cache_dir*).
156
+ """
157
+
158
+ def __init__(self, cache_dir: Path | None = None) -> None:
159
+ self._cache_dir = cache_dir or _DEFAULT_CACHE_DIR
160
+ self._db_path = self._cache_dir / _DB_FILENAME
161
+ self._conn: sqlite3.Connection | None = None
162
+
163
+ # -- lazy init ----------------------------------------------------------
164
+
165
+ def _ensure_db(self) -> sqlite3.Connection:
166
+ if self._conn is not None:
167
+ return self._conn
168
+ self._cache_dir.mkdir(parents=True, exist_ok=True)
169
+ self._conn = sqlite3.connect(str(self._db_path))
170
+ self._conn.execute(
171
+ """
172
+ CREATE TABLE IF NOT EXISTS eval_cache (
173
+ key TEXT PRIMARY KEY,
174
+ result TEXT NOT NULL,
175
+ created_at REAL NOT NULL
176
+ )
177
+ """
178
+ )
179
+ self._conn.commit()
180
+ return self._conn
181
+
182
+ # -- protocol -----------------------------------------------------------
183
+
184
+ def get(self, key: str) -> dict[str, Any] | None:
185
+ conn = self._ensure_db()
186
+ row = conn.execute(
187
+ "SELECT result FROM eval_cache WHERE key = ?", (key,)
188
+ ).fetchone()
189
+ if row is None:
190
+ return None
191
+ try:
192
+ return json.loads(row[0])
193
+ except (json.JSONDecodeError, TypeError):
194
+ return None
195
+
196
+ def put(self, key: str, result: dict[str, Any]) -> None:
197
+ conn = self._ensure_db()
198
+ conn.execute(
199
+ """
200
+ INSERT OR REPLACE INTO eval_cache (key, result, created_at)
201
+ VALUES (?, ?, ?)
202
+ """,
203
+ (key, json.dumps(result, default=str), time.time()),
204
+ )
205
+ conn.commit()
206
+
207
+ # -- helpers ------------------------------------------------------------
208
+
209
+ def close(self) -> None:
210
+ if self._conn is not None:
211
+ self._conn.close()
212
+ self._conn = None
213
+
214
+ @property
215
+ def size(self) -> int:
216
+ conn = self._ensure_db()
217
+ row = conn.execute("SELECT COUNT(*) FROM eval_cache").fetchone()
218
+ return row[0] if row else 0
219
+
220
+ def clear(self) -> None:
221
+ conn = self._ensure_db()
222
+ conn.execute("DELETE FROM eval_cache")
223
+ conn.commit()
src/flow/experiments/evaluators/llm.py CHANGED
@@ -141,7 +141,7 @@ The agent was given this task:
141
  ```
142
 
143
  ## Files Created
144
- {json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}
145
 
146
  ## Tool Results
147
  {self._format_tool_results(run_result.tool_results)}
@@ -200,6 +200,12 @@ For each criterion, provide TWO scores:
200
  },
201
  }
202
 
 
 
 
 
 
 
203
  def _format_tool_results(self, tool_results: list[dict[str, str]]) -> str:
204
  """Format tool results for the evaluation prompt."""
205
  if not tool_results:
@@ -324,7 +330,7 @@ Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {met
324
  )
325
 
326
  except Exception as e:
327
- logger.error(f"LLM evaluation failed: {e}")
328
  return EvalResult(
329
  score=0.0,
330
  passed=False,
 
141
  ```
142
 
143
  ## Files Created
144
+ {self._format_files_created(run_result)}
145
 
146
  ## Tool Results
147
  {self._format_tool_results(run_result.tool_results)}
 
200
  },
201
  }
202
 
203
+ def _format_files_created(self, run_result: RunResult) -> str:
204
+ """Format files created section as a simple list of filenames."""
205
+ if not run_result.files_created:
206
+ return "None"
207
+ return "\n".join(f"- {f}" for f in run_result.files_created)
208
+
209
  def _format_tool_results(self, tool_results: list[dict[str, str]]) -> str:
210
  """Format tool results for the evaluation prompt."""
211
  if not tool_results:
 
330
  )
331
 
332
  except Exception as e:
333
+ logger.error(f"LLM evaluation failed: {e}", exc_info=True)
334
  return EvalResult(
335
  score=0.0,
336
  passed=False,
src/flow/experiments/gaia_converter.py CHANGED
@@ -156,25 +156,20 @@ def convert_to_flow_task(gaia_task: dict[str, Any]) -> Task:
156
  )
157
 
158
 
159
- def convert_gaia(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
160
- logger.debug(f"Processing task at index: {index}")
161
-
162
- if dataset_metadata is None:
163
- raise ValueError("dataset_metadata is required and cannot be None.")
164
-
165
- # Validate required fields in dataset_metadata
166
- config = dataset_metadata.get("config")
167
- split = dataset_metadata.get("split")
168
- local_path = dataset_metadata.get("local_path")
169
-
170
- if config is None:
171
- raise ValueError("dataset_metadata 'config' is required and cannot be None.")
172
 
173
- if split is None:
174
- raise ValueError("dataset_metadata 'split' is required and cannot be None.")
175
-
176
- if local_path is None:
177
- raise ValueError("dataset_metadata 'local_path' is required and cannot be None.")
 
 
 
 
178
 
179
  # Derive GAIA year from the config when possible (e.g., "2023_level2" -> "2023"),
180
  # falling back to "2023" to preserve existing behavior if parsing fails.
 
156
  )
157
 
158
 
159
+ def convert_gaia(
160
+ example: dict[str, Any], index: int, *, config: str, split: str, local_path: str, **kwargs: Any
161
+ ) -> Task:
162
+ """Convert a GAIA benchmark example to a Flow task.
 
 
 
 
 
 
 
 
 
163
 
164
+ Args:
165
+ example: Raw example dict from the GAIA dataset.
166
+ index: Index of the example in the dataset.
167
+ config: Dataset configuration/subset (e.g. ``"2023_level1"``).
168
+ split: Dataset split (e.g. ``"train"``, ``"validation"``).
169
+ local_path: Root path where the dataset snapshot was downloaded.
170
+ **kwargs: Additional metadata reserved for future use; currently ignored.
171
+ """
172
+ logger.debug(f"Processing task at index: {index}")
173
 
174
  # Derive GAIA year from the config when possible (e.g., "2023_level2" -> "2023"),
175
  # falling back to "2023" to preserve existing behavior if parsing fails.
src/flow/experiments/hf_datasets.py CHANGED
@@ -17,11 +17,26 @@ from __future__ import annotations
17
  import json
18
  import logging
19
  import os
 
20
  from pathlib import Path
21
- from typing import Any
22
 
23
  from flow.experiments.types import EvalCriterion, Task
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  logger = logging.getLogger(__name__)
26
 
27
 
@@ -29,7 +44,7 @@ logger = logging.getLogger(__name__)
29
  # Each converter knows how to extract question/answer from a specific dataset
30
 
31
 
32
- def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
33
  """Convert GSM8K math problem to Flow task.
34
 
35
  GSM8K format:
@@ -61,8 +76,8 @@ def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[st
61
  ]
62
 
63
  task_metadata = {"dataset": "gsm8k", "index": index, "answer": answer, "final_answer": final_answer}
64
- if dataset_metadata:
65
- task_metadata.update(dataset_metadata)
66
 
67
  return Task(
68
  name=f"gsm8k_{index}",
@@ -72,7 +87,7 @@ def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[st
72
  )
73
 
74
 
75
- def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
76
  """Convert MATH dataset problem to Flow task.
77
 
78
  MATH format:
@@ -98,8 +113,8 @@ def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str
98
  ]
99
 
100
  task_metadata = {"dataset": "math", "index": index, "level": level, "type": problem_type, "solution": solution}
101
- if dataset_metadata:
102
- task_metadata.update(dataset_metadata)
103
 
104
  return Task(
105
  name=f"math_{problem_type.lower()}_{index}",
@@ -109,7 +124,7 @@ def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str
109
  )
110
 
111
 
112
- def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
113
  r"""Convert HumanEval coding problem to Flow task.
114
 
115
  HumanEval format:
@@ -138,8 +153,8 @@ def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dic
138
  ]
139
 
140
  task_metadata = {"dataset": "humaneval", "task_id": task_id, "entry_point": entry_point, "test": test}
141
- if dataset_metadata:
142
- task_metadata.update(dataset_metadata)
143
 
144
  return Task(
145
  name=f"humaneval_{task_id.replace('/', '_')}",
@@ -149,7 +164,7 @@ def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dic
149
  )
150
 
151
 
152
- def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
153
  """Convert MBPP coding problem to Flow task.
154
 
155
  MBPP format:
@@ -170,8 +185,8 @@ def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str
170
  ]
171
 
172
  task_metadata = {"dataset": "mbpp", "task_id": task_id, "test_list": test_list}
173
- if dataset_metadata:
174
- task_metadata.update(dataset_metadata)
175
 
176
  return Task(
177
  name=f"mbpp_{task_id}",
@@ -182,13 +197,8 @@ def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str
182
 
183
 
184
  # Registry of dataset converters
185
- def _get_gaia_converter():
186
- """Lazy import for GAIA converter to avoid smolagents dependency at import time."""
187
- from flow.experiments.gaia_converter import convert_gaia
188
- return convert_gaia
189
 
190
-
191
- DATASET_CONVERTERS = {
192
  "openai/gsm8k": convert_gsm8k,
193
  "gsm8k": convert_gsm8k,
194
  "competition_math": convert_math,
@@ -197,7 +207,18 @@ DATASET_CONVERTERS = {
197
  "openai_humaneval": convert_humaneval,
198
  "mbpp": convert_mbpp,
199
  "google-research-datasets/mbpp": convert_mbpp,
200
- "gaia-benchmark/GAIA": _get_gaia_converter, # Lazy loaded
 
 
 
 
 
 
 
 
 
 
 
201
  }
202
 
203
 
@@ -206,7 +227,7 @@ def import_hf_dataset(
206
  config: str | None = None,
207
  split: str = "train",
208
  limit: int | None = None,
209
- converter_override: Any = None,
210
  local_path: str | Path | None = None,
211
  ) -> list[Task]:
212
  """Import a Hugging Face dataset and convert to Flow tasks.
@@ -246,6 +267,29 @@ def import_hf_dataset(
246
  except ImportError as e:
247
  raise ImportError("Hugging Face datasets library is required. Install with: pip install datasets") from e
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  # Download to local path if specified, then load from there
250
  if local_path is not None:
251
  try:
@@ -276,26 +320,6 @@ def import_hf_dataset(
276
 
277
  logger.info(f"Converting {len(dataset)} examples to Flow tasks...")
278
 
279
- # Find converter
280
- converter = converter_override
281
- if converter is None:
282
- # Try to find matching converter
283
- for key, conv in DATASET_CONVERTERS.items():
284
- if key in dataset_name:
285
- # Handle lazy loaders (functions that return the actual converter)
286
- if conv is _get_gaia_converter:
287
- converter = conv()
288
- else:
289
- converter = conv
290
- break
291
-
292
- if converter is None:
293
- raise ValueError(
294
- f"No converter found for dataset '{dataset_name}'. "
295
- f"Available: {list(DATASET_CONVERTERS.keys())}\n"
296
- f"Use converter_override parameter to provide a custom converter."
297
- )
298
-
299
  # Build dataset metadata to pass to converters
300
  dataset_metadata: dict[str, Any] = {}
301
  dataset_metadata["local_path"] = str(local_path) if local_path else None
@@ -303,10 +327,10 @@ def import_hf_dataset(
303
  dataset_metadata["split"] = split
304
 
305
  # Convert examples
306
- tasks = []
307
  for i, example in enumerate(dataset):
308
  try:
309
- task = converter(example, i, dataset_metadata)
310
  tasks.append(task)
311
  except Exception as e:
312
  logger.warning(f"Failed to convert example {i}: {e}", exc_info=True)
@@ -338,7 +362,7 @@ def save_tasks_to_jsonl(tasks: list[Task], output_path: Path) -> None:
338
  logger.info(f"Saved {len(tasks)} tasks to {output_path}")
339
 
340
 
341
- def register_converter(dataset_name: str, converter_func: Any) -> None:
342
  """Register a custom converter for a dataset.
343
 
344
  Args:
@@ -346,7 +370,7 @@ def register_converter(dataset_name: str, converter_func: Any) -> None:
346
  converter_func: Function that converts example dict to Task
347
 
348
  Example:
349
- >>> def my_converter(example, index):
350
  ... return Task(name=f"task_{index}", prompt=example["text"], ...)
351
  >>> register_converter("my/dataset", my_converter)
352
  """
 
17
  import json
18
  import logging
19
  import os
20
+ from collections.abc import Callable
21
  from pathlib import Path
22
+ from typing import Any, Protocol
23
 
24
  from flow.experiments.types import EvalCriterion, Task
25
 
26
+
27
+ class ConverterFunc(Protocol):
28
+ """Protocol for dataset converter functions.
29
+
30
+ Converters accept a raw example dict, an index, and optional keyword
31
+ arguments (e.g. ``config``, ``split``, ``local_path``) that carry
32
+ dataset-level metadata.
33
+ """
34
+
35
+ def __call__(self, example: dict[str, Any], index: int, **kwargs: Any) -> Task:
36
+ """Convert a raw dataset example to a Flow Task."""
37
+ ...
38
+
39
+
40
  logger = logging.getLogger(__name__)
41
 
42
 
 
44
  # Each converter knows how to extract question/answer from a specific dataset
45
 
46
 
47
+ def convert_gsm8k(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
48
  """Convert GSM8K math problem to Flow task.
49
 
50
  GSM8K format:
 
76
  ]
77
 
78
  task_metadata = {"dataset": "gsm8k", "index": index, "answer": answer, "final_answer": final_answer}
79
+ if kwargs:
80
+ task_metadata.update(kwargs)
81
 
82
  return Task(
83
  name=f"gsm8k_{index}",
 
87
  )
88
 
89
 
90
+ def convert_math(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
91
  """Convert MATH dataset problem to Flow task.
92
 
93
  MATH format:
 
113
  ]
114
 
115
  task_metadata = {"dataset": "math", "index": index, "level": level, "type": problem_type, "solution": solution}
116
+ if kwargs:
117
+ task_metadata.update(kwargs)
118
 
119
  return Task(
120
  name=f"math_{problem_type.lower()}_{index}",
 
124
  )
125
 
126
 
127
+ def convert_humaneval(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
128
  r"""Convert HumanEval coding problem to Flow task.
129
 
130
  HumanEval format:
 
153
  ]
154
 
155
  task_metadata = {"dataset": "humaneval", "task_id": task_id, "entry_point": entry_point, "test": test}
156
+ if kwargs:
157
+ task_metadata.update(kwargs)
158
 
159
  return Task(
160
  name=f"humaneval_{task_id.replace('/', '_')}",
 
164
  )
165
 
166
 
167
+ def convert_mbpp(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
168
  """Convert MBPP coding problem to Flow task.
169
 
170
  MBPP format:
 
185
  ]
186
 
187
  task_metadata = {"dataset": "mbpp", "task_id": task_id, "test_list": test_list}
188
+ if kwargs:
189
+ task_metadata.update(kwargs)
190
 
191
  return Task(
192
  name=f"mbpp_{task_id}",
 
197
 
198
 
199
  # Registry of dataset converters
 
 
 
 
200
 
201
+ DATASET_CONVERTERS: dict[str, ConverterFunc] = {
 
202
  "openai/gsm8k": convert_gsm8k,
203
  "gsm8k": convert_gsm8k,
204
  "competition_math": convert_math,
 
207
  "openai_humaneval": convert_humaneval,
208
  "mbpp": convert_mbpp,
209
  "google-research-datasets/mbpp": convert_mbpp,
210
+ }
211
+
212
+
213
+ def _get_gaia_converter() -> ConverterFunc:
214
+ """Lazy import for GAIA converter to avoid smolagents dependency at import time."""
215
+ from flow.experiments.gaia_converter import convert_gaia
216
+
217
+ return convert_gaia
218
+
219
+
220
+ LAZY_CONVERTERS: dict[str, Callable[[], ConverterFunc]] = {
221
+ "gaia-benchmark/GAIA": _get_gaia_converter,
222
  }
223
 
224
 
 
227
  config: str | None = None,
228
  split: str = "train",
229
  limit: int | None = None,
230
+ converter_override: ConverterFunc | None = None,
231
  local_path: str | Path | None = None,
232
  ) -> list[Task]:
233
  """Import a Hugging Face dataset and convert to Flow tasks.
 
267
  except ImportError as e:
268
  raise ImportError("Hugging Face datasets library is required. Install with: pip install datasets") from e
269
 
270
+ # Find converter
271
+ converter: ConverterFunc | None = converter_override
272
+ if converter is None:
273
+ # Try direct converters first
274
+ for key, conv in DATASET_CONVERTERS.items():
275
+ if key in dataset_name:
276
+ converter = conv
277
+ break
278
+
279
+ # Fall back to lazy-loaded converters only if no direct match was found
280
+ if converter is None:
281
+ for key, factory in LAZY_CONVERTERS.items():
282
+ if key in dataset_name:
283
+ converter = factory()
284
+ break
285
+ if converter is None:
286
+ all_keys = sorted({*DATASET_CONVERTERS, *LAZY_CONVERTERS})
287
+ raise ValueError(
288
+ f"No converter found for dataset '{dataset_name}'. "
289
+ f"Available: {all_keys}\n"
290
+ f"Use converter_override parameter to provide a custom converter."
291
+ )
292
+
293
  # Download to local path if specified, then load from there
294
  if local_path is not None:
295
  try:
 
320
 
321
  logger.info(f"Converting {len(dataset)} examples to Flow tasks...")
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  # Build dataset metadata to pass to converters
324
  dataset_metadata: dict[str, Any] = {}
325
  dataset_metadata["local_path"] = str(local_path) if local_path else None
 
327
  dataset_metadata["split"] = split
328
 
329
  # Convert examples
330
+ tasks: list[Task] = []
331
  for i, example in enumerate(dataset):
332
  try:
333
+ task = converter(dict(example), i, **dataset_metadata)
334
  tasks.append(task)
335
  except Exception as e:
336
  logger.warning(f"Failed to convert example {i}: {e}", exc_info=True)
 
362
  logger.info(f"Saved {len(tasks)} tasks to {output_path}")
363
 
364
 
365
+ def register_converter(dataset_name: str, converter_func: ConverterFunc) -> None:
366
  """Register a custom converter for a dataset.
367
 
368
  Args:
 
370
  converter_func: Function that converts example dict to Task
371
 
372
  Example:
373
+ >>> def my_converter(example, index, **kwargs):
374
  ... return Task(name=f"task_{index}", prompt=example["text"], ...)
375
  >>> register_converter("my/dataset", my_converter)
376
  """
src/flow/experiments/models.py CHANGED
@@ -409,14 +409,47 @@ class Agent:
409
  llm_config: dict[str, Any] | None = None # {"provider": "azure", "model": "gpt-4o"}
410
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
411
  tools: str | list[str] | dict[str, dict[str, Any]] | None = None
 
412
 
413
  # Set by deploy() — when set, evaluate/optimize auto-persist to DB
414
- _id: str | None = field(default=None, repr=False, compare=False)
 
 
415
 
416
  @property
417
  def id(self) -> str | None:
418
- """Agent ID in the database, set after deploy()."""
419
- return self._id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
  @classmethod
422
  def from_preset(cls, name: str) -> Agent:
@@ -508,24 +541,30 @@ class Agent:
508
  finally:
509
  await harness.close()
510
 
511
- async def deploy(self) -> str:
512
- """Register this agent in the Flow database.
513
 
514
- Creates an AgentConfig row in the local SQLite DB (~/.flow/flow_ui.db).
515
- No running server required this is a pure DB write. After deploying,
516
- all evaluate() and optimize() calls auto-persist results to the DB.
517
 
518
- Run ``flow serve`` separately to browse results in the UI.
 
 
 
 
 
519
 
520
  Returns:
521
- The agent ID (UUID string)
522
 
523
  Example:
524
  agent = Agent(name="coding-agent", tools="standard")
525
- agent_id = await agent.deploy()
526
- # Results now auto-persist
527
- result = await agent.evaluate(tasks="quick")
528
- # Run `flow serve` to view at http://localhost:7860/agents/{agent_id}
 
529
  """
530
  try:
531
  from flow.ui.services.persistence_adapter import PersistenceAdapter
@@ -535,9 +574,20 @@ class Agent:
535
  "to use deploy(): pip install flow[ui] or uv sync"
536
  ) from e
537
 
 
 
 
538
  adapter = PersistenceAdapter()
539
- self._id = await adapter.deploy_agent(self)
540
- return self._id
 
 
 
 
 
 
 
 
541
 
542
  async def evaluate(
543
  self,
@@ -570,7 +620,7 @@ class Agent:
570
  from .agent_api import _evaluate_agent_impl
571
 
572
  return await _evaluate_agent_impl(
573
- self, tasks, parallel, use_llm_eval, quiet, agent_id=self._id
574
  )
575
 
576
  async def optimize(
@@ -578,6 +628,7 @@ class Agent:
578
  tasks: str | list[Task] | Path = "quick",
579
  *,
580
  strategy: str | list[str] | None = None,
 
581
  variations: dict[str, list[Any]] | None = None,
582
  parallel: int = 4,
583
  budget: int = 50,
@@ -599,9 +650,13 @@ class Agent:
599
  - None or "grid": Grid search over variations (default)
600
  - "tools": Iteratively discover optimal tool configuration
601
  - "instructions": Iteratively rewrite instructions from failures
 
602
  - list: Run multiple strategies sequentially, e.g.
603
  ["instructions", "tools"] optimizes instructions first,
604
  then tools starting from the improved agent
 
 
 
605
  variations: Custom grid search variations (only used with grid strategy)
606
  parallel: Number of concurrent experiments
607
  budget: Maximum number of candidates to test
@@ -623,6 +678,13 @@ class Agent:
623
  # Active: improve instructions
624
  result = await agent.optimize(tasks="quick", strategy="instructions")
625
 
 
 
 
 
 
 
 
626
  # Pipeline: instructions first, then tools
627
  result = await agent.optimize(
628
  tasks="quick", strategy=["instructions", "tools"]
@@ -635,8 +697,9 @@ class Agent:
635
 
636
  return await _optimize_agent_impl(
637
  self, tasks, variations, parallel, budget, use_llm_eval, quiet,
638
- agent_id=self._id,
639
  strategy=strategy,
 
640
  )
641
 
642
 
@@ -934,12 +997,28 @@ def export_agent(
934
  ) -> None:
935
  """Export an Agent as a reusable YAML file.
936
 
 
 
 
 
 
937
  Args:
938
  agent: The Agent to export
939
  path: Path to write the YAML file
940
  metrics: Optional optimization metrics (stored under _optimization key)
941
  """
942
  data = asdict(agent)
 
 
 
 
 
 
 
 
 
 
 
943
  if metrics:
944
  data["_optimization"] = metrics
945
  path.parent.mkdir(parents=True, exist_ok=True)
@@ -971,6 +1050,17 @@ def load_agent(path: Path) -> Agent:
971
  if "compaction" in config_data and isinstance(config_data["compaction"], dict):
972
  config_data["compaction"] = CompactionConfig(**config_data["compaction"])
973
 
 
 
 
 
 
 
 
 
 
 
 
974
  try:
975
  return Agent(**config_data)
976
  except TypeError as e:
 
409
  llm_config: dict[str, Any] | None = None # {"provider": "azure", "model": "gpt-4o"}
410
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
411
  tools: str | list[str] | dict[str, dict[str, Any]] | None = None
412
+ skills: dict[str, str] | None = None # skill_name -> SKILL.md content
413
 
414
  # Set by deploy() — when set, evaluate/optimize auto-persist to DB
415
+ _deployment_id: str | None = field(default=None, repr=False, compare=False)
416
+ _config_id: str | None = field(default=None, repr=False, compare=False)
417
+ _version: int | None = field(default=None, repr=False, compare=False)
418
 
419
  @property
420
  def id(self) -> str | None:
421
+ """Deployment ID, set after deploy(). This is the stable identity."""
422
+ return self._deployment_id
423
+
424
+ @property
425
+ def config_id(self) -> str | None:
426
+ """AgentConfig ID for the current version, set after deploy()."""
427
+ return self._config_id
428
+
429
+ @property
430
+ def version(self) -> int | None:
431
+ """Current deployment version number, set after deploy()."""
432
+ return self._version
433
+
434
+ @classmethod
435
+ def from_config(cls, path: str | Path) -> Agent:
436
+ """Create an Agent from a YAML config file.
437
+
438
+ Args:
439
+ path: Path to the YAML config file
440
+
441
+ Returns:
442
+ A new Agent instance with the config's values
443
+
444
+ Raises:
445
+ FileNotFoundError: If the file doesn't exist
446
+ ValueError: If the config is invalid
447
+
448
+ Example:
449
+ agent = Agent.from_config("examples/base_agent.yaml")
450
+ print(agent.name, agent.tools)
451
+ """
452
+ return load_agent(Path(path))
453
 
454
  @classmethod
455
  def from_preset(cls, name: str) -> Agent:
 
541
  finally:
542
  await harness.close()
543
 
544
+ async def deploy(self, candidate: Agent | None = None) -> str:
545
+ """Deploy this agent (or a candidate) to the Flow database.
546
 
547
+ First call creates a new Deployment + AgentConfig (v1).
548
+ Subsequent calls on the same agent append a new version to the
549
+ same deployment same stable URL, new config behind it.
550
 
551
+ Passing a ``candidate`` (e.g. from optimization results) deploys
552
+ that candidate's config as the next version of this deployment.
553
+
554
+ Args:
555
+ candidate: Optional Agent whose config to deploy as the next
556
+ version. If None, deploys this agent's current config.
557
 
558
  Returns:
559
+ The deployment ID (stable UUID string)
560
 
561
  Example:
562
  agent = Agent(name="coding-agent", tools="standard")
563
+ dep_id = await agent.deploy() # v1
564
+ agent.tools = ["bash", "read_file"]
565
+ await agent.deploy() # v2, same dep_id
566
+ result = await agent.optimize(tasks="quick", strategy="tools")
567
+ await agent.deploy(result.best_agent) # v3, same dep_id
568
  """
569
  try:
570
  from flow.ui.services.persistence_adapter import PersistenceAdapter
 
574
  "to use deploy(): pip install flow[ui] or uv sync"
575
  ) from e
576
 
577
+ source_agent = candidate or self
578
+ source = "optimize" if candidate is not None else "deploy"
579
+
580
  adapter = PersistenceAdapter()
581
+ result = await adapter.deploy_agent(
582
+ source_agent,
583
+ deployment_id=self._deployment_id,
584
+ source=source,
585
+ )
586
+
587
+ self._deployment_id = result.deployment_id
588
+ self._config_id = result.config_id
589
+ self._version = result.version
590
+ return self._deployment_id
591
 
592
  async def evaluate(
593
  self,
 
620
  from .agent_api import _evaluate_agent_impl
621
 
622
  return await _evaluate_agent_impl(
623
+ self, tasks, parallel, use_llm_eval, quiet, agent_id=self._config_id
624
  )
625
 
626
  async def optimize(
 
628
  tasks: str | list[Task] | Path = "quick",
629
  *,
630
  strategy: str | list[str] | None = None,
631
+ strategy_config: dict[str, Any] | None = None,
632
  variations: dict[str, list[Any]] | None = None,
633
  parallel: int = 4,
634
  budget: int = 50,
 
650
  - None or "grid": Grid search over variations (default)
651
  - "tools": Iteratively discover optimal tool configuration
652
  - "instructions": Iteratively rewrite instructions from failures
653
+ - "skills": Iteratively generate domain knowledge skills
654
  - list: Run multiple strategies sequentially, e.g.
655
  ["instructions", "tools"] optimizes instructions first,
656
  then tools starting from the improved agent
657
+ strategy_config: Optional config passed to the strategy. Merged
658
+ with defaults (max_iterations=3, min_improvement=0.01).
659
+ Example: {"max_iterations": 5, "include_builtin": True}
660
  variations: Custom grid search variations (only used with grid strategy)
661
  parallel: Number of concurrent experiments
662
  budget: Maximum number of candidates to test
 
678
  # Active: improve instructions
679
  result = await agent.optimize(tasks="quick", strategy="instructions")
680
 
681
+ # Active: skills with custom config
682
+ result = await agent.optimize(
683
+ tasks="quick",
684
+ strategy="skills",
685
+ strategy_config={"max_iterations": 5, "min_improvement": 0.0},
686
+ )
687
+
688
  # Pipeline: instructions first, then tools
689
  result = await agent.optimize(
690
  tasks="quick", strategy=["instructions", "tools"]
 
697
 
698
  return await _optimize_agent_impl(
699
  self, tasks, variations, parallel, budget, use_llm_eval, quiet,
700
+ agent_id=self._config_id,
701
  strategy=strategy,
702
+ strategy_config=strategy_config,
703
  )
704
 
705
 
 
997
  ) -> None:
998
  """Export an Agent as a reusable YAML file.
999
 
1000
+ If the agent has skills, each skill is written as a SKILL.md file in a
1001
+ ``skills/<name>/`` directory next to the YAML. The YAML references skills
1002
+ by name (list of strings) so the agent/framework can locate them at the
1003
+ relative path ``skills/<name>/SKILL.md``.
1004
+
1005
  Args:
1006
  agent: The Agent to export
1007
  path: Path to write the YAML file
1008
  metrics: Optional optimization metrics (stored under _optimization key)
1009
  """
1010
  data = asdict(agent)
1011
+
1012
+ # Write skill folders alongside the YAML and replace content with names
1013
+ if agent.skills:
1014
+ skills_dir = path.parent / "skills"
1015
+ for skill_name, skill_content in agent.skills.items():
1016
+ skill_folder = skills_dir / skill_name
1017
+ skill_folder.mkdir(parents=True, exist_ok=True)
1018
+ (skill_folder / "SKILL.md").write_text(skill_content)
1019
+ # In the YAML, store just the skill names (not full content)
1020
+ data["skills"] = sorted(agent.skills.keys())
1021
+
1022
  if metrics:
1023
  data["_optimization"] = metrics
1024
  path.parent.mkdir(parents=True, exist_ok=True)
 
1050
  if "compaction" in config_data and isinstance(config_data["compaction"], dict):
1051
  config_data["compaction"] = CompactionConfig(**config_data["compaction"])
1052
 
1053
+ # Load skills from disk: YAML stores skill names as a list,
1054
+ # resolve to dict[name, content] by reading skills/<name>/SKILL.md
1055
+ if "skills" in config_data and isinstance(config_data["skills"], list):
1056
+ skills_dir = path.parent / "skills"
1057
+ loaded_skills: dict[str, str] = {}
1058
+ for skill_name in config_data["skills"]:
1059
+ skill_path = skills_dir / skill_name / "SKILL.md"
1060
+ if skill_path.exists():
1061
+ loaded_skills[skill_name] = skill_path.read_text()
1062
+ config_data["skills"] = loaded_skills if loaded_skills else None
1063
+
1064
  try:
1065
  return Agent(**config_data)
1066
  except TypeError as e:
src/flow/experiments/optimizer.py CHANGED
@@ -21,6 +21,13 @@ from typing import Any
21
  from openai import AsyncAzureOpenAI
22
 
23
  from .ablation import compute_pareto_frontier
 
 
 
 
 
 
 
24
  from .evaluators import LLMEvaluator
25
  from .metrics import TraceMetrics, extract_metrics
26
  from .models import (
@@ -175,15 +182,27 @@ class FlowOptimizer:
175
  parallel: int = 4,
176
  use_llm_evaluator: bool = True,
177
  output_dir: Path | None = None,
 
 
178
  ) -> None:
179
  self.parallel = parallel
180
  self.use_llm_evaluator = use_llm_evaluator
181
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
 
 
 
 
 
182
 
183
  # Internal state set during optimize() for use by evaluate()
184
  self._evaluator: LLMEvaluator | None = None
185
  self._run_dir: Path | None = None
186
 
 
 
 
 
 
187
  async def optimize(
188
  self,
189
  candidates: list[Candidate],
@@ -211,15 +230,15 @@ class FlowOptimizer:
211
  setup_tracing("flow-optimizer")
212
  self._save_config(candidates, tasks, run_dir)
213
 
214
- print("=" * 70)
215
- print(" FLOW OPTIMIZER")
216
- print("=" * 70)
217
- print(f" Candidates: {len(candidates)}")
218
- print(f" Tasks: {len(tasks)}")
219
- print(f" Total: {len(candidates) * len(tasks)} experiments")
220
- print(f" Parallel: {self.parallel}")
221
- print(f" Output: {run_dir}")
222
- print("=" * 70)
223
 
224
  evaluator = None
225
  if self.use_llm_evaluator:
@@ -316,16 +335,16 @@ class FlowOptimizer:
316
  self._evaluator = evaluator
317
  self._run_dir = run_dir
318
 
319
- print("=" * 70)
320
- print(" FLOW OPTIMIZER (Strategy Mode)")
321
- print("=" * 70)
322
- print(f" Strategy: {type(strategy).__name__}")
323
- print(f" Base Agent: {base.name}")
324
- print(f" Tasks: {len(tasks)}")
325
- print(f" Budget: {budget}")
326
- print(f" Parallel: {self.parallel}")
327
- print(f" Output: {run_dir}")
328
- print("=" * 70)
329
 
330
  # Pass self as runner — FlowOptimizer implements the ExperimentRunner
331
  # protocol via the evaluate() method above
@@ -340,7 +359,7 @@ class FlowOptimizer:
340
  logger.warning("Strategy produced no candidates")
341
  candidates = [Candidate(agent=base, mutations={}, rationale="baseline (strategy produced none)")]
342
 
343
- print(f"\nStrategy produced {len(candidates)} candidates. Running final evaluation...")
344
 
345
  # Save config
346
  self._save_config(candidates, tasks, run_dir)
@@ -403,15 +422,41 @@ class FlowOptimizer:
403
  async def run_one(candidate: Candidate, task: Task) -> TaskResult:
404
  nonlocal completed
405
  async with semaphore:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  workspace = run_dir / "workspaces" / candidate.agent.name / task.name
407
  workspace.mkdir(parents=True, exist_ok=True)
408
 
409
  result = await self._run_single(candidate, task, workspace, evaluator)
410
 
 
 
 
 
411
  async with lock:
412
  completed += 1
413
  status = "PASS" if result.eval_passed else "FAIL"
414
- print(
415
  f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
416
  f"{status} score={result.eval_score:.2f} "
417
  f"reasoning={result.eval_reasoning_score:.2f} "
@@ -428,7 +473,11 @@ class FlowOptimizer:
428
  valid_results: list[TaskResult] = []
429
  for r in gather_results:
430
  if isinstance(r, BaseException):
431
- logger.error(f"Experiment failed: {r}")
 
 
 
 
432
  else:
433
  valid_results.append(r)
434
 
@@ -672,29 +721,29 @@ class FlowOptimizer:
672
 
673
  def _print_summary(self, result: OptimizationResult) -> None:
674
  """Print optimization summary."""
675
- print("\n" + "=" * 70)
676
- print(" OPTIMIZATION RESULTS")
677
- print("=" * 70)
678
 
679
- print(f"\n{'Candidate':<30} | {'Score':>8} | {'Reason':>8} | {'Tokens':>10} | {'Pareto':>8}")
680
- print("-" * 75)
681
 
682
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
683
  pareto = "*" if summary.is_pareto_optimal else ""
684
- print(
685
  f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
686
  f"{summary.avg_reasoning_score:>8.2f} | "
687
  f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
688
  )
689
 
690
- print("\n" + "-" * 70)
691
- print(f"Pareto frontier: {result.pareto_frontier}")
692
- print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
693
- print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
694
- print("\nExported agents:")
695
  for name, path in result.exported_agents.items():
696
- print(f" {name}: {path}")
697
- print(f"\nResults saved to: {result.output_dir}")
698
 
699
 
700
  def load_tasks_from_jsonl(path: Path) -> list[Task]:
@@ -716,6 +765,7 @@ async def evaluate_agent(
716
  parallel: int = 4,
717
  use_llm_evaluator: bool = True,
718
  output_dir: Path | None = None,
 
719
  ) -> CandidateSummary:
720
  """Evaluate a single agent on a set of tasks.
721
 
@@ -760,6 +810,7 @@ async def evaluate_agent(
760
  parallel=parallel,
761
  use_llm_evaluator=use_llm_evaluator,
762
  output_dir=eval_output_dir,
 
763
  )
764
 
765
  result = await optimizer.optimize([candidate], tasks)
@@ -768,3 +819,77 @@ async def evaluate_agent(
768
  raise RuntimeError("Evaluation produced no results")
769
 
770
  return result.summaries[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  from openai import AsyncAzureOpenAI
22
 
23
  from .ablation import compute_pareto_frontier
24
+ from .eval_cache import (
25
+ DiskCache,
26
+ EvaluationCache,
27
+ agent_cache_dict,
28
+ build_cache_key,
29
+ task_cache_dict,
30
+ )
31
  from .evaluators import LLMEvaluator
32
  from .metrics import TraceMetrics, extract_metrics
33
  from .models import (
 
182
  parallel: int = 4,
183
  use_llm_evaluator: bool = True,
184
  output_dir: Path | None = None,
185
+ quiet: bool = False,
186
+ cache_evaluations: bool = True,
187
  ) -> None:
188
  self.parallel = parallel
189
  self.use_llm_evaluator = use_llm_evaluator
190
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
191
+ self.quiet = quiet
192
+
193
+ # Evaluation cache — avoids redundant agent runs for identical
194
+ # (agent-config, task) pairs. Persists across sessions via SQLite.
195
+ self._cache: EvaluationCache | None = DiskCache() if cache_evaluations else None
196
 
197
  # Internal state set during optimize() for use by evaluate()
198
  self._evaluator: LLMEvaluator | None = None
199
  self._run_dir: Path | None = None
200
 
201
+ def _log(self, msg: str) -> None:
202
+ """Print a message unless quiet mode is enabled."""
203
+ if not self.quiet:
204
+ print(msg)
205
+
206
  async def optimize(
207
  self,
208
  candidates: list[Candidate],
 
230
  setup_tracing("flow-optimizer")
231
  self._save_config(candidates, tasks, run_dir)
232
 
233
+ self._log("=" * 70)
234
+ self._log(" FLOW OPTIMIZER")
235
+ self._log("=" * 70)
236
+ self._log(f" Candidates: {len(candidates)}")
237
+ self._log(f" Tasks: {len(tasks)}")
238
+ self._log(f" Total: {len(candidates) * len(tasks)} experiments")
239
+ self._log(f" Parallel: {self.parallel}")
240
+ self._log(f" Output: {run_dir}")
241
+ self._log("=" * 70)
242
 
243
  evaluator = None
244
  if self.use_llm_evaluator:
 
335
  self._evaluator = evaluator
336
  self._run_dir = run_dir
337
 
338
+ self._log("=" * 70)
339
+ self._log(" FLOW OPTIMIZER (Strategy Mode)")
340
+ self._log("=" * 70)
341
+ self._log(f" Strategy: {type(strategy).__name__}")
342
+ self._log(f" Base Agent: {base.name}")
343
+ self._log(f" Tasks: {len(tasks)}")
344
+ self._log(f" Budget: {budget}")
345
+ self._log(f" Parallel: {self.parallel}")
346
+ self._log(f" Output: {run_dir}")
347
+ self._log("=" * 70)
348
 
349
  # Pass self as runner — FlowOptimizer implements the ExperimentRunner
350
  # protocol via the evaluate() method above
 
359
  logger.warning("Strategy produced no candidates")
360
  candidates = [Candidate(agent=base, mutations={}, rationale="baseline (strategy produced none)")]
361
 
362
+ self._log(f"\nStrategy produced {len(candidates)} candidates. Running final evaluation...")
363
 
364
  # Save config
365
  self._save_config(candidates, tasks, run_dir)
 
422
  async def run_one(candidate: Candidate, task: Task) -> TaskResult:
423
  nonlocal completed
424
  async with semaphore:
425
+ # Check evaluation cache
426
+ cache_key: str | None = None
427
+ if self._cache is not None:
428
+ cache_key = build_cache_key(
429
+ agent_cache_dict(candidate.agent),
430
+ task_cache_dict(task),
431
+ )
432
+ cached = self._cache.get(cache_key)
433
+ if cached is not None:
434
+ result = _task_result_from_cache(cached, candidate, task)
435
+ async with lock:
436
+ completed += 1
437
+ self._log(
438
+ f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
439
+ f"CACHED score={result.eval_score:.2f} "
440
+ f"reasoning={result.eval_reasoning_score:.2f} "
441
+ f"tokens={result.metrics.total_tokens:,}"
442
+ )
443
+ if progress_callback:
444
+ progress_callback(completed, total, candidate.agent.name, task.name)
445
+ return result
446
+
447
  workspace = run_dir / "workspaces" / candidate.agent.name / task.name
448
  workspace.mkdir(parents=True, exist_ok=True)
449
 
450
  result = await self._run_single(candidate, task, workspace, evaluator)
451
 
452
+ # Store in cache
453
+ if self._cache is not None and cache_key is not None:
454
+ self._cache.put(cache_key, _task_result_to_cache(result))
455
+
456
  async with lock:
457
  completed += 1
458
  status = "PASS" if result.eval_passed else "FAIL"
459
+ self._log(
460
  f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
461
  f"{status} score={result.eval_score:.2f} "
462
  f"reasoning={result.eval_reasoning_score:.2f} "
 
473
  valid_results: list[TaskResult] = []
474
  for r in gather_results:
475
  if isinstance(r, BaseException):
476
+ logger.error(
477
+ "Experiment failed: %s",
478
+ r,
479
+ exc_info=(type(r), r, r.__traceback__),
480
+ )
481
  else:
482
  valid_results.append(r)
483
 
 
721
 
722
  def _print_summary(self, result: OptimizationResult) -> None:
723
  """Print optimization summary."""
724
+ self._log("\n" + "=" * 70)
725
+ self._log(" OPTIMIZATION RESULTS")
726
+ self._log("=" * 70)
727
 
728
+ self._log(f"\n{'Candidate':<30} | {'Score':>8} | {'Reason':>8} | {'Tokens':>10} | {'Pareto':>8}")
729
+ self._log("-" * 75)
730
 
731
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
732
  pareto = "*" if summary.is_pareto_optimal else ""
733
+ self._log(
734
  f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
735
  f"{summary.avg_reasoning_score:>8.2f} | "
736
  f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
737
  )
738
 
739
+ self._log("\n" + "-" * 70)
740
+ self._log(f"Pareto frontier: {result.pareto_frontier}")
741
+ self._log(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
742
+ self._log(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
743
+ self._log("\nExported agents:")
744
  for name, path in result.exported_agents.items():
745
+ self._log(f" {name}: {path}")
746
+ self._log(f"\nResults saved to: {result.output_dir}")
747
 
748
 
749
  def load_tasks_from_jsonl(path: Path) -> list[Task]:
 
765
  parallel: int = 4,
766
  use_llm_evaluator: bool = True,
767
  output_dir: Path | None = None,
768
+ quiet: bool = False,
769
  ) -> CandidateSummary:
770
  """Evaluate a single agent on a set of tasks.
771
 
 
810
  parallel=parallel,
811
  use_llm_evaluator=use_llm_evaluator,
812
  output_dir=eval_output_dir,
813
+ quiet=quiet,
814
  )
815
 
816
  result = await optimizer.optimize([candidate], tasks)
 
819
  raise RuntimeError("Evaluation produced no results")
820
 
821
  return result.summaries[0]
822
+
823
+
824
+ # ---------------------------------------------------------------------------
825
+ # Cache serialisation helpers
826
+ # ---------------------------------------------------------------------------
827
+
828
+
829
+ def _task_result_to_cache(result: TaskResult) -> dict[str, Any]:
830
+ """Serialise a TaskResult to a JSON-safe dict for caching."""
831
+ return {
832
+ "eval_score": result.eval_score,
833
+ "eval_passed": result.eval_passed,
834
+ "eval_reasoning": result.eval_reasoning,
835
+ "eval_reasoning_score": result.eval_reasoning_score,
836
+ "criteria_results": result.criteria_results,
837
+ "metrics": {
838
+ "total_tokens": result.metrics.total_tokens,
839
+ "input_tokens": result.metrics.input_tokens,
840
+ "output_tokens": result.metrics.output_tokens,
841
+ "tool_call_count": result.metrics.tool_call_count,
842
+ "llm_call_count": result.metrics.llm_call_count,
843
+ "total_duration_ms": result.metrics.total_duration_ms,
844
+ },
845
+ "run": {
846
+ "output": result.run_result.output,
847
+ "files_created": result.run_result.files_created,
848
+ "duration_seconds": result.run_result.duration_seconds,
849
+ "error": result.run_result.error,
850
+ "tool_results": result.run_result.tool_results,
851
+ "trace": result.run_result.trace,
852
+ },
853
+ }
854
+
855
+
856
+ def _task_result_from_cache(
857
+ cached: dict[str, Any],
858
+ candidate: Candidate,
859
+ task: Task,
860
+ ) -> TaskResult:
861
+ """Reconstruct a TaskResult from a cached dict."""
862
+ run_data = cached.get("run", {})
863
+ metrics_data = cached.get("metrics", {})
864
+
865
+ run_result = RunResult(
866
+ task=task,
867
+ trace=run_data.get("trace", []),
868
+ output=run_data.get("output", ""),
869
+ files_created=run_data.get("files_created", []),
870
+ duration_seconds=run_data.get("duration_seconds", 0.0),
871
+ workspace=Path("/cached"),
872
+ error=run_data.get("error"),
873
+ tool_results=run_data.get("tool_results", []),
874
+ )
875
+
876
+ metrics = TraceMetrics(
877
+ total_tokens=metrics_data.get("total_tokens", 0),
878
+ input_tokens=metrics_data.get("input_tokens", 0),
879
+ output_tokens=metrics_data.get("output_tokens", 0),
880
+ tool_call_count=metrics_data.get("tool_call_count", 0),
881
+ llm_call_count=metrics_data.get("llm_call_count", 0),
882
+ total_duration_ms=metrics_data.get("total_duration_ms", 0.0),
883
+ )
884
+
885
+ return TaskResult(
886
+ candidate_name=candidate.agent.name,
887
+ task_name=task.name,
888
+ run_result=run_result,
889
+ metrics=metrics,
890
+ eval_score=cached.get("eval_score", 0.0),
891
+ eval_passed=cached.get("eval_passed", False),
892
+ eval_reasoning=cached.get("eval_reasoning", ""),
893
+ criteria_results=cached.get("criteria_results", []),
894
+ eval_reasoning_score=cached.get("eval_reasoning_score", 0.0),
895
+ )
src/flow/experiments/results.py CHANGED
@@ -13,6 +13,7 @@ from pathlib import Path
13
  from typing import TYPE_CHECKING, Any
14
 
15
  if TYPE_CHECKING:
 
16
  from .optimizer import CandidateSummary
17
 
18
 
@@ -110,9 +111,29 @@ class AgentOptimizationResult:
110
  # Set when agent was deployed — links to the DB job
111
  job_id: str | None = field(default=None, repr=False)
112
 
 
 
 
 
 
 
 
 
113
  def __str__(self) -> str:
114
  return (
115
  f"Optimization: {self.baseline} → {self.best}\n"
116
  f"Improvement: {self.improvement}\n"
117
  f"Candidates tested: {self.candidates_tested}"
118
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from typing import TYPE_CHECKING, Any
14
 
15
  if TYPE_CHECKING:
16
+ from .models import StrategyIteration
17
  from .optimizer import CandidateSummary
18
 
19
 
 
111
  # Set when agent was deployed — links to the DB job
112
  job_id: str | None = field(default=None, repr=False)
113
 
114
+ @property
115
+ def iterations(self) -> list[StrategyIteration]:
116
+ """Per-iteration history from active optimization strategies."""
117
+ details = self.best._details
118
+ if details and details.candidate.optimization_history:
119
+ return details.candidate.optimization_history
120
+ return []
121
+
122
  def __str__(self) -> str:
123
  return (
124
  f"Optimization: {self.baseline} → {self.best}\n"
125
  f"Improvement: {self.improvement}\n"
126
  f"Candidates tested: {self.candidates_tested}"
127
  )
128
+
129
+ def print_summary(self) -> None:
130
+ """Print a formatted table of optimization iterations."""
131
+ history = self.iterations
132
+ if not history:
133
+ print(str(self))
134
+ return
135
+
136
+ print(f"{'Iter':<6}{'Score':<10}{'Pass Rate':<12}{'Change'}")
137
+ print("-" * 60)
138
+ for h in history:
139
+ print(f"{h.iteration:<6}{h.avg_score:<10.0%}{h.pass_rate:<12.0%}{h.change_description}")
src/flow/experiments/runner.py CHANGED
@@ -201,7 +201,7 @@ class FlowExperimentRunner:
201
 
202
  except Exception as e:
203
  error = str(e)
204
- logger.error(f"Task execution failed: {e}")
205
 
206
  end_time = time.time()
207
  duration_seconds = end_time - start_time
 
201
 
202
  except Exception as e:
203
  error = str(e)
204
+ logger.exception(f"Task execution failed: {e}")
205
 
206
  end_time = time.time()
207
  duration_seconds = end_time - start_time
src/flow/experiments/strategies/__init__.py CHANGED
@@ -9,10 +9,10 @@ Example YAML:
9
  variations:
10
  instructions:
11
  - "You are helpful" # Literal
12
- - strategy: gepa # Strategy
13
- max_candidates: 3
14
  config:
15
- reflection_lm: gpt-4o
16
  """
17
 
18
  from __future__ import annotations
@@ -84,19 +84,33 @@ def _register_builtin_strategies() -> None:
84
  except ImportError:
85
  logger.debug("GEPA strategy not available (gepa package not installed)")
86
 
87
- # LLM rewriter strategy (simple instruction variations)
88
  try:
89
- from .llm_rewriter import LLMRewriterStrategy
90
- register_strategy("llm_rewriter", LLMRewriterStrategy)
91
  except ImportError:
92
- logger.debug("LLM rewriter strategy not available")
93
 
94
- # Tool selector strategy (generates tool configurations)
95
  try:
96
- from .tool_selector import ToolSelectorStrategy
97
- register_strategy("tool_selector", ToolSelectorStrategy)
98
  except ImportError:
99
- logger.debug("Tool selector strategy not available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
 
102
  # Register on module import
 
9
  variations:
10
  instructions:
11
  - "You are helpful" # Literal
12
+ - strategy: instruction # Strategy
13
+ max_candidates: 1
14
  config:
15
+ max_iterations: 5
16
  """
17
 
18
  from __future__ import annotations
 
84
  except ImportError:
85
  logger.debug("GEPA strategy not available (gepa package not installed)")
86
 
87
+ # Instruction optimizer
88
  try:
89
+ from .instruction import InstructionOptimizer
90
+ register_strategy("instruction", InstructionOptimizer)
91
  except ImportError:
92
+ logger.debug("Instruction optimizer not available")
93
 
94
+ # Tool optimizer
95
  try:
96
+ from .tool import ToolOptimizer
97
+ register_strategy("tool", ToolOptimizer)
98
  except ImportError:
99
+ logger.debug("Tool optimizer not available")
100
+
101
+ # Skill optimizer
102
+ try:
103
+ from .skill import SkillOptimizer
104
+ register_strategy("skill", SkillOptimizer)
105
+ except ImportError:
106
+ logger.debug("Skill optimizer not available")
107
+
108
+ # GEPA instruction optimizer (uses standard plumbing + GEPA reflection)
109
+ try:
110
+ from .gepa_instruction import GEPAInstructionOptimizer
111
+ register_strategy("gepa_instruction", GEPAInstructionOptimizer)
112
+ except ImportError:
113
+ logger.debug("GEPA instruction optimizer not available")
114
 
115
 
116
  # Register on module import
src/flow/experiments/strategies/gepa_instruction.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """GEPA-based instruction optimization strategy.
4
+
5
+ Like InstructionOptimizer but uses GEPA's evolutionary approach:
6
+ - Maintains a population of candidate instructions
7
+ - Uses GEPA's reflection mechanism to generate new candidates from failures
8
+ - Selects candidates via frontier-based selection (not just greedy best)
9
+
10
+ Uses the standard strategy plumbing (runner.evaluate()) instead of
11
+ a custom evaluator callback bridge.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ import os
18
+ from dataclasses import dataclass, field
19
+ from typing import Any
20
+
21
+ from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration
22
+ from ..types import Task
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ @dataclass
28
+ class GEPAInstructionOptimizer:
29
+ """Instruction optimizer using GEPA's evolutionary approach.
30
+
31
+ Uses the same runner.evaluate() plumbing as InstructionOptimizer,
32
+ but delegates candidate generation to GEPA's reflection + selection loop.
33
+
34
+ The GEPA library handles:
35
+ - Generating improved prompts via LLM reflection on failures
36
+ - Candidate selection via frontier-based strategies
37
+ - Population management across generations
38
+
39
+ Config options:
40
+ model: LLM for GEPA reflection (default: gpt-4o-mini)
41
+ max_iterations: Max generations (default: 5)
42
+ min_improvement: Min score gain to continue (default: 0.05)
43
+ reflection_lm: LLM for GEPA reflection (overrides model if set)
44
+
45
+ Example:
46
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy gepa_instructions
47
+ """
48
+
49
+ config: dict[str, Any] = field(default_factory=dict)
50
+
51
+ async def generate(
52
+ self,
53
+ base: Agent,
54
+ budget: int,
55
+ *,
56
+ tasks: list[Task] | None = None,
57
+ runner: ExperimentRunner | None = None,
58
+ ) -> list[Candidate]:
59
+ """Generate optimized instructions using GEPA's evolutionary loop.
60
+
61
+ Args:
62
+ base: Base agent with instructions to optimize
63
+ budget: Max candidates to evaluate
64
+ tasks: Tasks to evaluate on (required)
65
+ runner: ExperimentRunner for evaluation (required)
66
+
67
+ Returns:
68
+ List with the best candidate found
69
+ """
70
+ if runner is None:
71
+ raise ValueError(
72
+ "GEPAInstructionOptimizer requires a runner. "
73
+ "Use FlowOptimizer.optimize_with_strategy() to provide one."
74
+ )
75
+ if not tasks:
76
+ raise ValueError(
77
+ "GEPAInstructionOptimizer requires tasks to evaluate against."
78
+ )
79
+
80
+ try:
81
+ import gepa
82
+ from gepa.core.adapter import EvaluationBatch
83
+ except ImportError as e:
84
+ raise ImportError(
85
+ "GEPA is not installed. Install with: pip install gepa"
86
+ ) from e
87
+
88
+ model = self.config.get("model", "gpt-4o-mini")
89
+ max_iterations = self.config.get("max_iterations", 5)
90
+ min_improvement = self.config.get("min_improvement", 0.05)
91
+
92
+ base_instructions = base.instructions or "You are a helpful assistant."
93
+
94
+ # Track optimization history
95
+ history: list[StrategyIteration] = []
96
+ best_instructions = base_instructions
97
+ best_score = 0.0
98
+ generation = 0
99
+
100
+ # ── Build GEPA adapter that uses runner.evaluate() ──
101
+
102
+ strategy_self = self
103
+ _runner = runner
104
+ _tasks = tasks
105
+ _base = base
106
+
107
+ def _run_async(coro: Any) -> Any:
108
+ """Run an async coroutine from synchronous GEPA context."""
109
+ import asyncio
110
+ import concurrent.futures
111
+
112
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
113
+ return pool.submit(asyncio.run, coro).result()
114
+
115
+ class FlowRunnerAdapter(gepa.GEPAAdapter):
116
+ """Bridges GEPA's adapter interface to Flow's ExperimentRunner."""
117
+
118
+ def evaluate(
119
+ self,
120
+ batch: list[Any],
121
+ candidate: dict[str, str],
122
+ capture_traces: bool = False,
123
+ ) -> EvaluationBatch:
124
+ """Evaluate a candidate one task at a time using Flow's runner.
125
+
126
+ GEPA tracks scores by dataset item index, so we must evaluate
127
+ each batch item individually to preserve the 1:1 mapping.
128
+ """
129
+ instructions_text = candidate.get("instructions", base_instructions)
130
+
131
+ # Build agent + candidate
132
+ agent = Agent(
133
+ name=f"{_base.name}_gepa_iter",
134
+ framework=_base.framework,
135
+ instructions=instructions_text,
136
+ llm_config=_base.llm_config,
137
+ compaction=_base.compaction,
138
+ tools=_base.tools,
139
+ )
140
+ flow_candidate = Candidate(
141
+ agent=agent,
142
+ mutations={"instructions": instructions_text},
143
+ )
144
+
145
+ if not batch:
146
+ return EvaluationBatch(
147
+ outputs=[], scores=[],
148
+ trajectories=[] if capture_traces else None,
149
+ objective_scores=None,
150
+ )
151
+
152
+ # Evaluate each task individually to preserve GEPA's index mapping
153
+ scores: list[float] = []
154
+ outputs: list[dict[str, Any]] = []
155
+ trajectories: list[dict[str, Any]] = []
156
+ passed_count = 0
157
+
158
+ for item in batch:
159
+ if not isinstance(item, Task):
160
+ scores.append(0.0)
161
+ outputs.append({})
162
+ continue
163
+
164
+ # Evaluate single task via runner
165
+ summary = _run_async(_runner.evaluate(flow_candidate, [item]))
166
+
167
+ # Extract result for this task
168
+ if summary.task_results:
169
+ tr = summary.task_results[0]
170
+ task_score = float(getattr(tr, "eval_score", 0.0))
171
+ eval_passed = getattr(tr, "eval_passed", False)
172
+ eval_reasoning = getattr(tr, "eval_reasoning", "")
173
+ agent_output = str(getattr(tr.run_result, "output", "")) if tr.run_result else ""
174
+ else:
175
+ task_score = 0.0
176
+ eval_passed = False
177
+ eval_reasoning = "No result"
178
+ agent_output = ""
179
+
180
+ if eval_passed:
181
+ passed_count += 1
182
+
183
+ scores.append(task_score)
184
+ traj = {
185
+ "task_name": getattr(item, "name", "unknown"),
186
+ "task_prompt": getattr(item, "prompt", ""),
187
+ "agent_output": agent_output[:1000],
188
+ "eval_reasoning": eval_reasoning,
189
+ "eval_score": task_score,
190
+ "eval_passed": eval_passed,
191
+ "instructions_used": instructions_text,
192
+ }
193
+ outputs.append(traj)
194
+ if capture_traces:
195
+ trajectories.append(traj)
196
+
197
+ # Record iteration in history
198
+ avg_score = sum(scores) / len(scores) if scores else 0.0
199
+ pass_rate = passed_count / len(batch) if batch else 0.0
200
+ failures_count = len(batch) - passed_count
201
+
202
+ nonlocal generation, best_score, best_instructions
203
+ generation += 1
204
+
205
+ task_lines = [
206
+ f" [{'PASS' if o.get('eval_passed') else 'FAIL'}] "
207
+ f"{o.get('task_name', '?')}: {o.get('eval_reasoning', '')[:150]}"
208
+ for o in outputs if isinstance(o, dict) and o
209
+ ]
210
+
211
+ history.append(
212
+ StrategyIteration(
213
+ iteration=generation - 1,
214
+ instructions_preview=instructions_text[:200],
215
+ full_instructions=instructions_text,
216
+ avg_score=avg_score,
217
+ pass_rate=pass_rate,
218
+ failures_count=failures_count,
219
+ change_description=f"GEPA generation {generation}",
220
+ change_rationale="\n".join(task_lines),
221
+ )
222
+ )
223
+
224
+ if avg_score > best_score:
225
+ best_score = avg_score
226
+ best_instructions = instructions_text
227
+
228
+ logger.info(
229
+ f"GEPA gen {generation}: score={avg_score:.3f}, "
230
+ f"pass_rate={pass_rate:.1%}, failures={failures_count}"
231
+ )
232
+
233
+ return EvaluationBatch(
234
+ outputs=outputs,
235
+ scores=scores,
236
+ trajectories=trajectories if capture_traces else None,
237
+ objective_scores=None,
238
+ )
239
+
240
+ def make_reflective_dataset(
241
+ self,
242
+ candidate: dict[str, str],
243
+ eval_batch: EvaluationBatch,
244
+ components_to_update: list[str],
245
+ ) -> dict[str, list[dict[str, Any]]]:
246
+ """Create reflection dataset from evaluation results.
247
+
248
+ GEPA uses this to generate improved candidates via LLM reflection.
249
+ """
250
+ trajectories = eval_batch.trajectories or eval_batch.outputs or []
251
+ scores = eval_batch.scores or []
252
+
253
+ reflection_data: dict[str, list[dict[str, Any]]] = {}
254
+
255
+ for component in components_to_update:
256
+ examples: list[dict[str, Any]] = []
257
+
258
+ for traj, score in zip(trajectories, scores):
259
+ if not isinstance(traj, dict):
260
+ continue
261
+
262
+ example = {
263
+ "Inputs": {
264
+ "task": traj.get("task_prompt", ""),
265
+ "instructions": traj.get("instructions_used", "")[:500],
266
+ },
267
+ "Generated Outputs": {
268
+ "agent_response": traj.get("agent_output", "")[:1000],
269
+ },
270
+ "Feedback": (
271
+ f"Score: {score:.2f}/1.0. "
272
+ f"Passed: {traj.get('eval_passed', False)}. "
273
+ f"{traj.get('eval_reasoning', '')}"
274
+ ),
275
+ "_score": score,
276
+ }
277
+ examples.append(example)
278
+
279
+ # Sort by score ascending — GEPA learns more from failures
280
+ examples.sort(key=lambda x: x.get("_score", 0))
281
+ for ex in examples:
282
+ ex.pop("_score", None)
283
+
284
+ reflection_data[component] = examples
285
+
286
+ return reflection_data
287
+
288
+ # ── Set up Azure env vars for GEPA's LiteLLM usage ──
289
+
290
+ if os.environ.get("AZURE_OPENAI_API_KEY"):
291
+ os.environ.setdefault("AZURE_API_KEY", os.environ["AZURE_OPENAI_API_KEY"])
292
+ if os.environ.get("AZURE_OPENAI_ENDPOINT"):
293
+ os.environ.setdefault("AZURE_API_BASE", os.environ["AZURE_OPENAI_ENDPOINT"])
294
+
295
+ # ── Build GEPA config ──
296
+
297
+ gepa_config: dict[str, Any] = {}
298
+
299
+ # Resolve reflection LM: explicit config > Azure deployment > default model
300
+ reflection_lm = self.config.get("reflection_lm")
301
+ if not reflection_lm:
302
+ azure_deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
303
+ reflection_lm = azure_deployment if azure_deployment else model
304
+
305
+ # Add azure/ prefix for LiteLLM if using Azure
306
+ if not reflection_lm.startswith("azure/") and os.environ.get("AZURE_OPENAI_ENDPOINT"):
307
+ reflection_lm = f"azure/{reflection_lm}"
308
+
309
+ gepa_config["reflection_lm"] = reflection_lm
310
+
311
+ # Pass through valid GEPA params from config
312
+ VALID_GEPA_PARAMS = {
313
+ "reflection_lm", "candidate_selection_strategy", "frontier_type",
314
+ "skip_perfect_score", "batch_sampler", "reflection_minibatch_size",
315
+ "perfect_score", "reflection_prompt_template", "module_selector",
316
+ "use_merge", "max_merge_invocations", "merge_val_overlap_floor",
317
+ "stop_callbacks", "display_progress_bar", "seed",
318
+ "cache_evaluation", "raise_on_exception",
319
+ }
320
+ for key, value in self.config.items():
321
+ if key in VALID_GEPA_PARAMS:
322
+ gepa_config[key] = value
323
+
324
+ # ── Run GEPA ──
325
+
326
+ seed_candidate = {"instructions": base_instructions}
327
+
328
+ # GEPA needs Task objects as dataset
329
+ dataset = list(tasks)
330
+
331
+ logger.info(
332
+ f"GEPAInstructionOptimizer: budget={budget}, tasks={len(dataset)}, "
333
+ f"reflection_lm={reflection_lm}"
334
+ )
335
+
336
+ gepa_result = gepa.optimize(
337
+ seed_candidate=seed_candidate,
338
+ adapter=FlowRunnerAdapter(),
339
+ trainset=dataset,
340
+ valset=dataset,
341
+ max_metric_calls=budget,
342
+ display_progress_bar=True,
343
+ skip_perfect_score=False,
344
+ perfect_score=2.0, # Impossible score to disable early stopping
345
+ **gepa_config,
346
+ )
347
+
348
+ # ── Extract best result ──
349
+
350
+ best_prompts = gepa_result.best_candidate
351
+ final_instructions = best_prompts.get("instructions", best_instructions)
352
+ gepa_best_score = gepa_result.val_aggregate_scores[gepa_result.best_idx]
353
+
354
+ # Use GEPA's best if it's better, otherwise use our tracked best
355
+ if gepa_best_score >= best_score:
356
+ best_instructions = final_instructions
357
+ best_score = gepa_best_score
358
+
359
+ logger.info(
360
+ f"GEPAInstructionOptimizer complete: {generation} generations, "
361
+ f"best_score={best_score:.3f}"
362
+ )
363
+
364
+ # Build candidates for all unique instruction variants tried
365
+ candidates: list[Candidate] = []
366
+ seen_instructions: set[str] = set()
367
+
368
+ for h in history:
369
+ instr = h.full_instructions or ""
370
+ if not instr or instr in seen_instructions:
371
+ continue
372
+ seen_instructions.add(instr)
373
+
374
+ is_best = instr == best_instructions
375
+ suffix = "gepa_optimized" if is_best else f"gepa_gen{h.iteration}"
376
+ agent = Agent(
377
+ name=f"{base.name}_{suffix}",
378
+ framework=base.framework,
379
+ description=base.description,
380
+ instructions=instr,
381
+ llm_config=base.llm_config,
382
+ compaction=base.compaction,
383
+ tools=base.tools,
384
+ )
385
+ candidates.append(
386
+ Candidate(
387
+ agent=agent,
388
+ mutations={"instructions": instr},
389
+ rationale=f"GEPA generation {h.iteration}: score={h.avg_score:.3f}",
390
+ optimization_history=history if is_best else [],
391
+ )
392
+ )
393
+
394
+ # Ensure best is always included
395
+ if best_instructions not in seen_instructions:
396
+ final_agent = Agent(
397
+ name=f"{base.name}_gepa_optimized",
398
+ framework=base.framework,
399
+ description=base.description,
400
+ instructions=best_instructions,
401
+ llm_config=base.llm_config,
402
+ compaction=base.compaction,
403
+ tools=base.tools,
404
+ )
405
+ score_progression = f"{history[0].avg_score:.2f} -> {best_score:.2f}" if history else f"-> {best_score:.2f}"
406
+ candidates.append(
407
+ Candidate(
408
+ agent=final_agent,
409
+ mutations={"instructions": best_instructions},
410
+ rationale=f"GEPA instruction optimization: {generation} generations, {score_progression}",
411
+ optimization_history=history,
412
+ )
413
+ )
414
+
415
+ return candidates
src/flow/experiments/strategies/{llm_rewriter.py → instruction.py} RENAMED
@@ -1,12 +1,12 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
- """LLM-based instruction rewriter strategy.
4
 
5
- This strategy always requires a runner and tasks. It:
6
- 1. Evaluates the current instructions on all tasks
7
- 2. Reflects on failures to understand what went wrong
8
- 3. Rewrites instructions to address failures
9
- 4. Re-evaluates and repeats until convergence or budget exhausted
10
  """
11
 
12
  from __future__ import annotations
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
23
 
24
 
25
  @dataclass
26
- class LLMRewriterStrategy:
27
  """Strategy that uses an LLM to iteratively improve agent instructions.
28
 
29
  Runs an evaluate-reflect-rewrite loop. Each iteration evaluates
@@ -42,7 +42,7 @@ class LLMRewriterStrategy:
42
 
43
  Example YAML:
44
  strategy:
45
- type: llm_rewriter
46
  config:
47
  model: gpt-4o-mini
48
  max_iterations: 5
@@ -75,12 +75,12 @@ class LLMRewriterStrategy:
75
  """
76
  if runner is None:
77
  raise ValueError(
78
- "LLMRewriterStrategy requires a runner. "
79
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
80
  )
81
  if not tasks:
82
  raise ValueError(
83
- "LLMRewriterStrategy requires tasks to evaluate against."
84
  )
85
 
86
  base_instructions = base.instructions or "You are a helpful assistant."
@@ -100,7 +100,7 @@ class LLMRewriterStrategy:
100
  min_improvement = self.config.get("min_improvement", 0.05)
101
 
102
  logger.info(
103
- f"LLMRewriterStrategy: active mode (max_iterations={max_iterations}, "
104
  f"min_improvement={min_improvement})"
105
  )
106
 
@@ -108,6 +108,7 @@ class LLMRewriterStrategy:
108
  best_instructions = instructions
109
  best_score = 0.0
110
  history: list[StrategyIteration] = []
 
111
 
112
  for iteration in range(max_iterations):
113
  # 1. Evaluate current instructions
@@ -169,6 +170,10 @@ class LLMRewriterStrategy:
169
  )
170
  )
171
 
 
 
 
 
172
  # Track best
173
  if avg_score > best_score:
174
  best_score = avg_score
@@ -193,29 +198,61 @@ class LLMRewriterStrategy:
193
 
194
  # 3. Reflect on failures and rewrite
195
  current_instructions = self._reflect_and_rewrite(
196
- current_instructions, failures, avg_score, model
 
 
197
  )
198
  logger.info(f" Rewrote instructions ({len(current_instructions)} chars)")
199
 
200
- # Build final candidate with optimization history
201
- final_agent = Agent(
202
- name=f"{base.name}_llm_rewriter_optimized",
203
- framework=base.framework,
204
- instructions=best_instructions,
205
- llm_config=base.llm_config,
206
- compaction=base.compaction,
207
- tools=base.tools,
208
- )
209
 
210
- score_progression = f"{history[0].avg_score:.2f} {best_score:.2f}"
211
- return [
212
- Candidate(
213
- agent=final_agent,
214
- mutations={"instructions": best_instructions},
215
- rationale=f"LLM rewriter active optimization: {len(history)} iterations, {score_progression}",
216
- optimization_history=history,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  )
218
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  def _reflect_and_rewrite(
221
  self,
@@ -223,52 +260,60 @@ class LLMRewriterStrategy:
223
  failures: list[Any],
224
  current_score: float,
225
  model: str,
 
 
 
226
  ) -> str:
227
  """Analyze failures and rewrite instructions to address them."""
228
- # Build failure analysis
229
  failure_descriptions = []
230
- for tr in failures[:5]: # Limit to 5 failures for context
231
  task_name = getattr(tr, "task_name", "unknown")
232
  reasoning = getattr(tr, "eval_reasoning", "No reasoning")
233
  score = getattr(tr, "eval_score", 0.0)
 
234
  failure_descriptions.append(
235
- f"- Task '{task_name}' (score={score:.2f}): {reasoning[:200]}"
 
236
  )
237
 
238
  failures_text = "\n".join(failure_descriptions)
239
 
240
- prompt = f"""You are a prompt engineer writing guidelines for a coding assistant.
241
-
242
- The assistant's current guidelines scored {current_score:.2f} out of 1.0 on a benchmark.
243
-
244
- Here are the tasks where performance was low:
 
 
 
 
 
 
 
 
 
 
 
 
245
  {failures_text}
246
 
247
- The current guidelines are:
248
  ---
249
  {instructions}
250
  ---
251
 
252
- Write a new, improved version of the guidelines. The new guidelines should:
253
- 1. Help the assistant succeed on a wide range of coding tasks the failures
254
- above are examples, but the guidelines must generalize beyond them
255
- 2. Include concrete strategies (e.g., always verify output, check edge cases,
256
- create and run files when asked)
257
- 3. Be general-purpose: do NOT reference specific task names, specific answers,
258
- or specific test cases from the failures above
259
- 4. Focus on transferable skills and habits (e.g., "verify output matches
260
- requirements" not "check that fibonacci returns 55")
261
- 5. Be concise
262
 
263
- Output ONLY the new guidelines text, nothing else."""
264
 
265
  try:
266
  return self._call_llm(prompt, model) or instructions
267
  except Exception as e:
268
  logger.warning(f"LLM rewrite failed: {e}")
269
- # Primary prompt failed — the original instructions may have
270
- # triggered a content filter (Azure, OpenAI, etc.) or caused
271
- # another error. Try a fallback that omits them entirely.
272
  logger.info("Retrying rewrite with fallback prompt (without original instructions)")
273
  return self._fallback_rewrite(failures_text, current_score, model)
274
 
@@ -277,29 +322,29 @@ Output ONLY the new guidelines text, nothing else."""
277
  failures_text: str,
278
  current_score: float,
279
  model: str,
 
 
280
  ) -> str:
281
- """Generate new instructions from scratch when the primary rewrite is blocked.
 
 
 
 
 
282
 
283
- This avoids including the original instructions (which may trigger
284
- content filters) and instead writes fresh guidelines based solely on
285
- the task failure descriptions.
286
- """
287
- prompt = f"""You are a prompt engineer. Write guidelines for a coding assistant.
288
 
289
  The assistant scored {current_score:.2f} out of 1.0 on these tasks:
290
  {failures_text}
291
 
292
- Write concise guidelines that would help a coding assistant succeed on
293
- a wide range of coding tasks. The failures above are examples the
294
- guidelines must generalize beyond them. The guidelines should:
295
- 1. Instruct the assistant to complete coding tasks by creating files and
296
- running code
297
- 2. Include strategies for verifying output and handling edge cases
298
- 3. Be general-purpose: do NOT reference specific task names or answers
299
- from the failures above
300
- 4. Focus on transferable habits and skills
301
 
302
- Output ONLY the guidelines text, nothing else."""
303
 
304
  try:
305
  result = self._call_llm(prompt, model)
@@ -309,22 +354,20 @@ Output ONLY the guidelines text, nothing else."""
309
  except Exception as e2:
310
  logger.warning(f"Fallback rewrite also failed: {e2}")
311
 
312
- # Last resort: return a sensible default
313
- logger.info("Using default coding assistant guidelines")
314
  return (
315
- "You are a helpful coding assistant. When given a task:\n"
316
- "1. Create the requested files with correct, working code\n"
317
- "2. Run the code and verify the output is correct\n"
318
  "3. Handle edge cases and validate results before finishing"
319
  )
320
 
321
-
322
  def _get_client(self, model: str) -> tuple[Any, str]:
323
  """Get OpenAI client and model name."""
324
  try:
325
  from openai import AzureOpenAI, OpenAI
326
  except ImportError as e:
327
- raise ImportError("openai package required for LLMRewriterStrategy") from e
328
 
329
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
330
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
@@ -354,4 +397,3 @@ Output ONLY the guidelines text, nothing else."""
354
  messages=[{"role": "user", "content": prompt}],
355
  )
356
  return response.choices[0].message.content or ""
357
-
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
+ """Instruction optimization strategy.
4
 
5
+ This strategy iteratively improves agent instructions by:
6
+ 1. Evaluating the current instructions on all tasks
7
+ 2. Reflecting on failures to understand what went wrong
8
+ 3. Rewriting instructions to address failures
9
+ 4. Re-evaluating and repeating until convergence or budget exhausted
10
  """
11
 
12
  from __future__ import annotations
 
23
 
24
 
25
  @dataclass
26
+ class InstructionOptimizer:
27
  """Strategy that uses an LLM to iteratively improve agent instructions.
28
 
29
  Runs an evaluate-reflect-rewrite loop. Each iteration evaluates
 
42
 
43
  Example YAML:
44
  strategy:
45
+ type: instruction
46
  config:
47
  model: gpt-4o-mini
48
  max_iterations: 5
 
75
  """
76
  if runner is None:
77
  raise ValueError(
78
+ "InstructionOptimizer requires a runner. "
79
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
80
  )
81
  if not tasks:
82
  raise ValueError(
83
+ "InstructionOptimizer requires tasks to evaluate against."
84
  )
85
 
86
  base_instructions = base.instructions or "You are a helpful assistant."
 
100
  min_improvement = self.config.get("min_improvement", 0.05)
101
 
102
  logger.info(
103
+ f"InstructionOptimizer: active mode (max_iterations={max_iterations}, "
104
  f"min_improvement={min_improvement})"
105
  )
106
 
 
108
  best_instructions = instructions
109
  best_score = 0.0
110
  history: list[StrategyIteration] = []
111
+ iteration_candidates: list[tuple[str, str]] = [] # (instructions, label)
112
 
113
  for iteration in range(max_iterations):
114
  # 1. Evaluate current instructions
 
170
  )
171
  )
172
 
173
+ # Collect candidate for this iteration
174
+ label = "baseline" if iteration == 0 else f"iter{iteration}"
175
+ iteration_candidates.append((current_instructions, label))
176
+
177
  # Track best
178
  if avg_score > best_score:
179
  best_score = avg_score
 
198
 
199
  # 3. Reflect on failures and rewrite
200
  current_instructions = self._reflect_and_rewrite(
201
+ current_instructions, failures, avg_score, model,
202
+ agent_name=base.name, agent_description=base.description or "",
203
+ tasks=tasks,
204
  )
205
  logger.info(f" Rewrote instructions ({len(current_instructions)} chars)")
206
 
207
+ # Build candidates for all unique instruction variants tried
208
+ candidates: list[Candidate] = []
209
+ seen_instructions: set[str] = set()
 
 
 
 
 
 
210
 
211
+ for iter_instructions, label in iteration_candidates:
212
+ if iter_instructions in seen_instructions:
213
+ continue
214
+ seen_instructions.add(iter_instructions)
215
+
216
+ is_best = iter_instructions == best_instructions
217
+ suffix = "instruction_optimized" if is_best else f"instruction_{label}"
218
+ agent = Agent(
219
+ name=f"{base.name}_{suffix}",
220
+ framework=base.framework,
221
+ instructions=iter_instructions,
222
+ llm_config=base.llm_config,
223
+ compaction=base.compaction,
224
+ tools=base.tools,
225
+ )
226
+ candidates.append(
227
+ Candidate(
228
+ agent=agent,
229
+ mutations={"instructions": iter_instructions},
230
+ rationale=f"Instructions ({label}): {len(iter_instructions)} chars",
231
+ optimization_history=history if is_best else [],
232
+ )
233
  )
234
+
235
+ # Ensure best is always included
236
+ if best_instructions not in seen_instructions:
237
+ final_agent = Agent(
238
+ name=f"{base.name}_instruction_optimized",
239
+ framework=base.framework,
240
+ instructions=best_instructions,
241
+ llm_config=base.llm_config,
242
+ compaction=base.compaction,
243
+ tools=base.tools,
244
+ )
245
+ score_progression = f"{history[0].avg_score:.2f} -> {best_score:.2f}"
246
+ candidates.append(
247
+ Candidate(
248
+ agent=final_agent,
249
+ mutations={"instructions": best_instructions},
250
+ rationale=f"Instruction optimization: {len(history)} iterations, {score_progression}",
251
+ optimization_history=history,
252
+ )
253
+ )
254
+
255
+ return candidates
256
 
257
  def _reflect_and_rewrite(
258
  self,
 
260
  failures: list[Any],
261
  current_score: float,
262
  model: str,
263
+ agent_name: str = "",
264
+ agent_description: str = "",
265
+ tasks: list[Task] | None = None,
266
  ) -> str:
267
  """Analyze failures and rewrite instructions to address them."""
 
268
  failure_descriptions = []
269
+ for tr in failures[:5]:
270
  task_name = getattr(tr, "task_name", "unknown")
271
  reasoning = getattr(tr, "eval_reasoning", "No reasoning")
272
  score = getattr(tr, "eval_score", 0.0)
273
+ task_prompt = getattr(tr, "task_prompt", "")
274
  failure_descriptions.append(
275
+ f"- Task '{task_name}' (score={score:.2f}): {reasoning[:300]}"
276
+ + (f"\n Task prompt: {task_prompt[:200]}" if task_prompt else "")
277
  )
278
 
279
  failures_text = "\n".join(failure_descriptions)
280
 
281
+ # Build agent context from name, description, and task domain
282
+ agent_context = ""
283
+ if agent_name or agent_description:
284
+ agent_context = f"\nThe agent is called '{agent_name}'"
285
+ if agent_description:
286
+ agent_context += f" — {agent_description}"
287
+ agent_context += ".\n"
288
+
289
+ # Infer domain from task prompts
290
+ domain_context = ""
291
+ if tasks:
292
+ task_summaries = [f"- {t.name}: {t.prompt[:100]}..." for t in tasks[:5]]
293
+ domain_context = f"\nThe agent is evaluated on these types of tasks:\n" + "\n".join(task_summaries) + "\n"
294
+
295
+ prompt = f"""You are a prompt engineer improving an agent's instructions to fix its performance issues.
296
+ {agent_context}{domain_context}
297
+ The agent scored {current_score:.2f} out of 1.0. Here are the tasks where it failed and what went wrong:
298
  {failures_text}
299
 
300
+ The agent's current instructions are:
301
  ---
302
  {instructions}
303
  ---
304
 
305
+ Rewrite the instructions to fix the failures above. The new instructions should:
306
+ 1. Directly address the failure patterns if the agent didn't create files, tell it to always save output to the requested file AND display the content. If it missed details, tell it to reference every constraint from the user's request.
307
+ 2. Be specific to this agent's domain not generic "coding assistant" guidelines
308
+ 3. Do NOT reference specific task names or test answers — the instructions should generalize to similar tasks
309
+ 4. Be concise
 
 
 
 
 
310
 
311
+ Output ONLY the new instructions text, nothing else."""
312
 
313
  try:
314
  return self._call_llm(prompt, model) or instructions
315
  except Exception as e:
316
  logger.warning(f"LLM rewrite failed: {e}")
 
 
 
317
  logger.info("Retrying rewrite with fallback prompt (without original instructions)")
318
  return self._fallback_rewrite(failures_text, current_score, model)
319
 
 
322
  failures_text: str,
323
  current_score: float,
324
  model: str,
325
+ agent_name: str = "",
326
+ agent_description: str = "",
327
  ) -> str:
328
+ """Generate new instructions from scratch when the primary rewrite is blocked."""
329
+ agent_role = "an AI assistant"
330
+ if agent_name or agent_description:
331
+ agent_role = f"an AI assistant called '{agent_name}'"
332
+ if agent_description:
333
+ agent_role += f" ({agent_description})"
334
 
335
+ prompt = f"""You are a prompt engineer. Write instructions for {agent_role}.
 
 
 
 
336
 
337
  The assistant scored {current_score:.2f} out of 1.0 on these tasks:
338
  {failures_text}
339
 
340
+ Write concise instructions tailored to this assistant's role. The instructions should:
341
+ 1. Be specific to the assistant's domain and purpose
342
+ 2. Address the failure patterns from the tasks above
343
+ 3. Include strategies for creating files when asked and verifying output
344
+ 4. Do NOT reference specific task names or answers from the failures above
345
+ 5. Focus on transferable habits relevant to this assistant's role
 
 
 
346
 
347
+ Output ONLY the instructions text, nothing else."""
348
 
349
  try:
350
  result = self._call_llm(prompt, model)
 
354
  except Exception as e2:
355
  logger.warning(f"Fallback rewrite also failed: {e2}")
356
 
357
+ logger.info("Using default assistant guidelines")
 
358
  return (
359
+ "You are a helpful assistant. When given a task:\n"
360
+ "1. Create the requested files with correct content\n"
361
+ "2. Verify the output matches all requirements\n"
362
  "3. Handle edge cases and validate results before finishing"
363
  )
364
 
 
365
  def _get_client(self, model: str) -> tuple[Any, str]:
366
  """Get OpenAI client and model name."""
367
  try:
368
  from openai import AzureOpenAI, OpenAI
369
  except ImportError as e:
370
+ raise ImportError("openai package required for InstructionOptimizer") from e
371
 
372
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
373
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
 
397
  messages=[{"role": "user", "content": prompt}],
398
  )
399
  return response.choices[0].message.content or ""
 
src/flow/experiments/strategies/skill.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Skill optimization strategy.
4
+
5
+ Iteratively discovers and generates skills (domain knowledge packages)
6
+ to improve agent performance. The strategy:
7
+ 1. Starts with an empty skill directory (no pre-loaded domain knowledge)
8
+ 2. Evaluates the agent on tasks to establish a baseline
9
+ 3. Analyzes failures and uses an LLM to generate SKILL.md files
10
+ 4. Writes generated skills to a managed directory the agent can discover
11
+ 5. Re-evaluates and repeats until convergence or budget exhausted
12
+
13
+ Skills differ from tools: tools are executable capabilities (read_file, bash),
14
+ while skills are domain knowledge packages (OOXML patterns, testing workflows).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import os
21
+ import shutil
22
+ import tempfile
23
+ from dataclasses import dataclass, field
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration
28
+ from ..types import Task
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
@dataclass
class SkillOptimizer:
    """Strategy that iteratively generates and refines skills for an agent.

    Runs an evaluate-analyze-generate loop. Each iteration evaluates
    the agent on tasks, analyzes failures, and generates SKILL.md files
    containing domain knowledge that would help the agent succeed.

    The optimizer manages its own skill directory. On each iteration it can:
    - Generate new skills based on failure patterns
    - Refine existing skills that didn't help enough
    - Remove skills that added context cost without benefit

    Requires both a runner (to evaluate candidates) and tasks (to test on).

    Config options:
        model: LLM for skill generation (default: gpt-4o-mini)
        max_iterations: Max optimization iterations (default: 3)
        min_improvement: Min score gain to continue (default: 0.05)
        include_builtin: Whether to include built-in skills in the catalog
            for selection (default: True)

    Example YAML:
        strategy:
          type: skill
          config:
            model: gpt-4o-mini
            max_iterations: 3
            include_builtin: true
    """

    # Free-form strategy configuration; recognized keys are listed in the
    # class docstring ("Config options").
    config: dict[str, Any] = field(default_factory=dict)

    # Managed skill directory (created during generate, cleaned up after).
    # None outside an active generate() call.
    _skill_dir: Path | None = field(default=None, init=False, repr=False)
69
+ async def generate(
70
+ self,
71
+ base: Agent,
72
+ budget: int,
73
+ *,
74
+ tasks: list[Task] | None = None,
75
+ runner: ExperimentRunner | None = None,
76
+ ) -> list[Candidate]:
77
+ """Generate candidates with optimized skill configurations.
78
+
79
+ Args:
80
+ base: Base agent configuration
81
+ budget: Max candidates to generate
82
+ tasks: Tasks to evaluate on (required)
83
+ runner: ExperimentRunner for evaluation (required)
84
+
85
+ Returns:
86
+ List of candidates with optimized skill sets
87
+
88
+ Raises:
89
+ ValueError: If tasks or runner not provided
90
+ """
91
+ if runner is None:
92
+ raise ValueError(
93
+ "SkillOptimizer requires a runner. "
94
+ "Use FlowOptimizer.optimize_with_strategy() to provide one."
95
+ )
96
+ if not tasks:
97
+ raise ValueError(
98
+ "SkillOptimizer requires tasks to evaluate against."
99
+ )
100
+
101
+ # Create a temp directory that the optimizer owns
102
+ self._skill_dir = Path(tempfile.mkdtemp(prefix="flow_skills_opt_"))
103
+ logger.info(f"SkillOptimizer: managing skills in {self._skill_dir}")
104
+
105
+ try:
106
+ return await self._generate_active(base, budget, tasks, runner)
107
+ finally:
108
+ # Clean up temp dir after optimization
109
+ if self._skill_dir and self._skill_dir.exists():
110
+ shutil.rmtree(self._skill_dir, ignore_errors=True)
111
+ self._skill_dir = None
112
+
113
+ async def _generate_active(
114
+ self,
115
+ base: Agent,
116
+ budget: int,
117
+ tasks: list[Task],
118
+ runner: ExperimentRunner,
119
+ ) -> list[Candidate]:
120
+ """Run active optimization loop with real evaluation feedback."""
121
+ model = self.config.get("model", "gpt-4o-mini")
122
+ max_iterations = self.config.get("max_iterations", 3)
123
+ min_improvement = self.config.get("min_improvement", 0.05)
124
+ include_builtin = self.config.get("include_builtin", True)
125
+
126
+ assert self._skill_dir is not None
127
+
128
+ logger.info(
129
+ f"SkillOptimizer: active mode (max_iterations={max_iterations}, "
130
+ f"min_improvement={min_improvement}, include_builtin={include_builtin})"
131
+ )
132
+
133
+ # Collect built-in skill catalog for LLM reference
134
+ builtin_catalog = self._get_builtin_catalog() if include_builtin else {}
135
+
136
+ best_score = 0.0
137
+ best_skills: dict[str, str] = {} # skill_name -> SKILL.md content
138
+ current_skills: dict[str, str] = {} # starts empty
139
+ _prev_skills: dict[str, str] = {}
140
+ history: list[StrategyIteration] = []
141
+ iteration_candidates: list[tuple[dict[str, str], str]] = []
142
+
143
+ for iteration in range(max_iterations):
144
+ # 1. Write current skills to the managed directory
145
+ self._write_skills_to_dir(current_skills)
146
+
147
+ # 2. Build agent with skills embedded in instructions
148
+ tools_config = self._build_tools_with_skills(base)
149
+ enriched_instructions = self._build_instructions_with_skills(
150
+ base.instructions, current_skills
151
+ )
152
+ agent = Agent(
153
+ name=f"{base.name}_skills_iter{iteration}",
154
+ framework=base.framework,
155
+ instructions=enriched_instructions,
156
+ llm_config=base.llm_config,
157
+ compaction=base.compaction,
158
+ tools=tools_config,
159
+ skills=dict(current_skills) if current_skills else None,
160
+ )
161
+ candidate = Candidate(
162
+ agent=agent,
163
+ mutations={"skills": sorted(current_skills.keys())},
164
+ )
165
+
166
+ summary = await runner.evaluate(candidate, tasks)
167
+
168
+ avg_score = getattr(summary, "avg_score", 0.0)
169
+ pass_rate = getattr(summary, "pass_rate", 0.0)
170
+ task_results = getattr(summary, "task_results", [])
171
+ failures = [tr for tr in task_results if not getattr(tr, "eval_passed", True)]
172
+
173
+ skills_list = sorted(current_skills.keys()) or ["(none)"]
174
+ logger.info(
175
+ f" Iteration {iteration}: avg_score={avg_score:.3f}, "
176
+ f"pass_rate={pass_rate:.1%}, failures={len(failures)}, "
177
+ f"skills={skills_list}"
178
+ )
179
+
180
+ # Build per-task summary (include full reasoning for history)
181
+ task_lines: list[str] = []
182
+ for tr in task_results:
183
+ task_name = getattr(tr, "task_name", "unknown")
184
+ passed = getattr(tr, "eval_passed", True)
185
+ reasoning = getattr(tr, "eval_reasoning", "")
186
+ status = "PASS" if passed else "FAIL"
187
+ task_lines.append(f" [{status}] {task_name}: {reasoning[:500]}")
188
+ tasks_summary = "\n".join(task_lines)
189
+
190
+ # Record iteration
191
+ skills_desc = ", ".join(skills_list)
192
+ change_desc = "Baseline evaluation (no skills)" if iteration == 0 else f"Skill adjustment iteration {iteration}"
193
+ change_rationale = f"Skills: {skills_desc}\n{tasks_summary}"
194
+ if iteration > 0:
195
+ score_delta = avg_score - history[-1].avg_score
196
+ prev_skills = set(_prev_skills.keys())
197
+ curr_skill_set = set(current_skills.keys())
198
+ added = sorted(curr_skill_set - prev_skills)
199
+ removed = sorted(prev_skills - curr_skill_set)
200
+ change_rationale = (
201
+ f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
202
+ f"Added skills: {added or 'none'}. Removed: {removed or 'none'}. "
203
+ f"{len(failures)} failures remaining.\n"
204
+ f"Skills: {skills_desc}\n{tasks_summary}"
205
+ )
206
+
207
+ history.append(
208
+ StrategyIteration(
209
+ iteration=iteration,
210
+ instructions_preview=f"[{skills_desc}]"[:200],
211
+ full_instructions=f"Skills: [{skills_desc}]",
212
+ avg_score=avg_score,
213
+ pass_rate=pass_rate,
214
+ failures_count=len(failures),
215
+ change_description=change_desc,
216
+ change_rationale=change_rationale,
217
+ )
218
+ )
219
+
220
+ label = "baseline" if iteration == 0 else f"iter{iteration}"
221
+ iteration_candidates.append((dict(current_skills), label))
222
+
223
+ # Track best (>= so that skills are preferred over no-skills on ties)
224
+ if avg_score >= best_score and (current_skills or not best_skills):
225
+ best_score = avg_score
226
+ best_skills = dict(current_skills)
227
+
228
+ # 2. Check stopping conditions
229
+ if iteration > 0:
230
+ improvement = avg_score - history[-2].avg_score
231
+ if improvement < min_improvement and avg_score <= best_score:
232
+ logger.info(
233
+ f" Stopping: improvement ({improvement:.3f}) < "
234
+ f"min_improvement ({min_improvement})"
235
+ )
236
+ break
237
+
238
+ if not failures:
239
+ logger.info(" Stopping: all tasks passed")
240
+ break
241
+
242
+ if iteration == max_iterations - 1:
243
+ break # Don't generate on last iteration
244
+
245
+ # 3. Analyze failures and generate/adjust skills
246
+ _prev_skills = dict(current_skills)
247
+ current_skills = self._analyze_and_generate(
248
+ current_skills, task_results, builtin_catalog, model, tasks
249
+ )
250
+ logger.info(f" Updated skills: {sorted(current_skills.keys())}")
251
+
252
+ # Build candidates for all unique skill configs tried
253
+ candidates: list[Candidate] = []
254
+ seen_skill_sets: set[tuple[str, ...]] = set()
255
+
256
+ for iter_skills, label in iteration_candidates:
257
+ skill_key = tuple(sorted(iter_skills.keys()))
258
+ if skill_key in seen_skill_sets:
259
+ continue
260
+ seen_skill_sets.add(skill_key)
261
+
262
+ is_best = sorted(iter_skills.keys()) == sorted(best_skills.keys())
263
+ suffix = "skills_optimized" if is_best else f"skills_{label}"
264
+
265
+ self._write_skills_to_dir(iter_skills)
266
+ tools_config = self._build_tools_with_skills(base)
267
+ enriched_instructions = self._build_instructions_with_skills(
268
+ base.instructions, iter_skills
269
+ )
270
+ skills_desc = ", ".join(sorted(iter_skills.keys())) or "(none)"
271
+ candidates.append(
272
+ Candidate(
273
+ agent=Agent(
274
+ name=f"{base.name}_{suffix}",
275
+ framework=base.framework,
276
+ instructions=enriched_instructions,
277
+ llm_config=base.llm_config,
278
+ compaction=base.compaction,
279
+ tools=tools_config,
280
+ skills=dict(iter_skills) if iter_skills else None,
281
+ ),
282
+ mutations={
283
+ "skills": sorted(iter_skills.keys()),
284
+ },
285
+ rationale=f"Skills: [{skills_desc}]",
286
+ optimization_history=history if is_best else [],
287
+ )
288
+ )
289
+
290
+ # Ensure best is always included
291
+ best_key = tuple(sorted(best_skills.keys()))
292
+ if best_key not in seen_skill_sets:
293
+ self._write_skills_to_dir(best_skills)
294
+ tools_config = self._build_tools_with_skills(base)
295
+ enriched_instructions = self._build_instructions_with_skills(
296
+ base.instructions, best_skills
297
+ )
298
+ skills_desc = ", ".join(sorted(best_skills.keys())) or "(none)"
299
+ candidates.append(
300
+ Candidate(
301
+ agent=Agent(
302
+ name=f"{base.name}_skills_optimized",
303
+ framework=base.framework,
304
+ instructions=enriched_instructions,
305
+ llm_config=base.llm_config,
306
+ compaction=base.compaction,
307
+ tools=tools_config,
308
+ skills=dict(best_skills) if best_skills else None,
309
+ ),
310
+ mutations={
311
+ "skills": sorted(best_skills.keys()),
312
+ },
313
+ rationale=f"Skills: [{skills_desc}]",
314
+ optimization_history=history,
315
+ )
316
+ )
317
+
318
+ # Restore best skills as final state on disk
319
+ self._write_skills_to_dir(best_skills)
320
+
321
+ return candidates
322
+
323
+ def _build_tools_with_skills(self, base: Agent) -> list[str] | dict[str, Any]:
324
+ """Build a tools config that includes the skills tool pointing to our managed dir.
325
+
326
+ Ensures the agent has the skills tool configured to only see our managed
327
+ skill directory (no built-in or user skills auto-loaded).
328
+ """
329
+ from ..models import resolve_tools
330
+
331
+ # Start from the base agent's tools
332
+ if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
333
+ base_tools: dict[str, Any] = {}
334
+ elif isinstance(base.tools, str):
335
+ base_tools = dict(resolve_tools(base.tools))
336
+ elif isinstance(base.tools, list):
337
+ base_tools = dict(resolve_tools(base.tools))
338
+ else:
339
+ base_tools = dict(base.tools)
340
+
341
+ # Ensure skills tool is present with our managed path
342
+ assert self._skill_dir is not None
343
+ base_tools["skills"] = {
344
+ "skills_path": str(self._skill_dir),
345
+ }
346
+
347
+ return base_tools
348
+
349
+ def _build_instructions_with_skills(
350
+ self, base_instructions: str | None, skills: dict[str, str]
351
+ ) -> str:
352
+ """Inject full skill content into the agent's instructions.
353
+
354
+ The harness layer injects skill *summaries* (name + description) into
355
+ the system prompt for normal agents. The optimizer intentionally injects
356
+ *full* skill content here because optimization requires the agent to
357
+ see and follow the complete domain knowledge, not just a summary.
358
+
359
+ Setting explicit instructions on the Agent causes the harness to skip
360
+ its own summary injection, so these two approaches don't conflict.
361
+ """
362
+ parts: list[str] = []
363
+ if base_instructions:
364
+ parts.append(base_instructions)
365
+
366
+ if skills:
367
+ parts.append("\n## Domain Knowledge (Skills)\n")
368
+ parts.append(
369
+ "The following skills provide domain-specific patterns and "
370
+ "best practices. Follow these guidelines when completing tasks.\n"
371
+ )
372
+ for name, content in sorted(skills.items()):
373
+ parts.append(f"### {name}\n{content}\n")
374
+
375
+ return "\n".join(parts) if parts else ""
376
+
377
+ def _write_skills_to_dir(self, skills: dict[str, str]) -> None:
378
+ """Write skill content to the managed directory.
379
+
380
+ Clears the directory first, then writes each skill as a folder
381
+ with a SKILL.md file.
382
+ """
383
+ assert self._skill_dir is not None
384
+
385
+ # Clear existing skills
386
+ if self._skill_dir.exists():
387
+ for item in self._skill_dir.iterdir():
388
+ if item.is_dir():
389
+ shutil.rmtree(item)
390
+
391
+ # Write each skill
392
+ for name, content in skills.items():
393
+ skill_dir = self._skill_dir / name
394
+ skill_dir.mkdir(parents=True, exist_ok=True)
395
+ (skill_dir / "SKILL.md").write_text(content)
396
+
397
+ def _get_builtin_catalog(self) -> dict[str, str]:
398
+ """Get descriptions of all built-in skills for LLM reference."""
399
+ from flow.tools.skills import _discover_skills, _get_builtin_skills_path
400
+
401
+ builtin_path = _get_builtin_skills_path()
402
+ if not builtin_path.exists():
403
+ return {}
404
+
405
+ discovered = _discover_skills([builtin_path])
406
+ catalog: dict[str, str] = {}
407
+ for skill_name, (skill_md, meta) in discovered.items():
408
+ description = meta.get("description", "No description")
409
+ catalog[skill_name] = description
410
+
411
+ return catalog
412
+
413
    def _analyze_and_generate(
        self,
        current_skills: dict[str, str],
        task_results: list[Any],
        builtin_catalog: dict[str, str],
        model: str,
        tasks: list[Task] | None = None,
    ) -> dict[str, str]:
        """Analyze failures and incrementally evolve skills.

        The LLM sees the full content of every current skill and decides
        per-skill what to do:
        - "keep": skill is helping, leave it unchanged
        - "drop": skill isn't helping, remove it
        - "refine": skill has the right idea but needs improved content
          (LLM provides the updated SKILL.md)
        - New skills can be added (LLM provides full SKILL.md content)
        - "builtin": select a built-in skill by name from the catalog

        Args:
            current_skills: Mapping of skill name -> SKILL.md content.
            task_results: Per-task evaluation results from the runner.
            builtin_catalog: name -> description of selectable built-in skills.
            model: LLM model name used for generation.
            tasks: Tasks being optimized against (for criteria context).

        Returns:
            The updated skill mapping; falls back to ``current_skills`` when
            the LLM call or response parsing fails.
        """
        # Build task->criteria lookup for enriching the prompt.
        task_criteria_map: dict[str, list[dict[str, str]]] = {}
        if tasks:
            for t in tasks:
                task_criteria_map[t.name] = [
                    {"name": c.name, "instruction": c.instruction}
                    for c in t.criteria
                ]

        # Build task results summary with full reasoning and criteria.
        task_descriptions = []
        for tr in task_results:
            # getattr with defaults: result objects may omit fields.
            task_name = getattr(tr, "task_name", "unknown")
            passed = getattr(tr, "eval_passed", True)
            reasoning = getattr(tr, "eval_reasoning", "")
            score = getattr(tr, "eval_score", 0.0)
            status = "PASS" if passed else "FAIL"

            # Include full reasoning (not truncated).
            entry = f"- [{status}] Task '{task_name}' (score={score:.2f}):\n  Reasoning: {reasoning}"

            # Include the task's evaluation criteria so the LLM knows
            # the exact rules the agent must follow (failing tasks only).
            criteria = task_criteria_map.get(task_name, [])
            if criteria and not passed:
                criteria_lines = []
                for c in criteria:
                    criteria_lines.append(f"    - {c['name']}: {c['instruction']}")
                entry += "\n  Evaluation criteria (the agent MUST satisfy ALL of these):\n"
                entry += "\n".join(criteria_lines)

            task_descriptions.append(entry)
        results_text = "\n".join(task_descriptions)

        # Build current skills section with full content so the LLM can refine.
        current_skills_section = ""
        if current_skills:
            skill_entries = []
            for name, content in sorted(current_skills.items()):
                # Show full content so LLM can refine it.
                skill_entries.append(
                    f"### Skill: {name}\n```\n{content}\n```"
                )
            current_skills_section = (
                "\n## Current Skills (full content)\n"
                + "\n\n".join(skill_entries)
                + "\n"
            )
        else:
            current_skills_section = "\n## Current Skills\nNone — this is the first iteration.\n"

        # Build catalog section of selectable built-in skills.
        catalog_section = ""
        if builtin_catalog:
            catalog_lines = []
            for name, desc in sorted(builtin_catalog.items()):
                catalog_lines.append(f"  - {name}: {desc}")
            catalog_section = (
                "\n## Available Built-in Skills (can be selected by name)\n"
                + "\n".join(catalog_lines)
                + "\n"
            )

        prompt = f"""You are optimizing the skill configuration for a coding assistant.
Skills are domain knowledge packages (SKILL.md files) that give the agent specialized
expertise, patterns, and best practices for specific domains.

## Task Results
{results_text}
{current_skills_section}{catalog_section}
## Your Job
Analyze the failing tasks above. Each failing task includes its **evaluation criteria** —
these are the exact rules the evaluator checks. Your skills MUST encode these specific
requirements so the agent follows them.

**Critical**: The agent fails because it doesn't know about specific conventions
(e.g., exact data formats, specific error types to raise, required fields in output).
Your skills must spell out these conventions as concrete, actionable rules — not
general advice.

For EACH current skill, decide:
- **"keep"** — the skill is helping (tasks it targets are passing). Leave it as-is.
- **"drop"** — the skill isn't contributing. Remove it to reduce noise.
- **Provide updated SKILL.md content** — the skill targets the right problem but
  its content should be refined to better address the failures.

You can also:
- **Add new skills** with full SKILL.md content to address uncovered failure patterns
- **Select a built-in** by setting the value to "builtin"

## What Makes a Good Skill
- **Specific, not generic**: "Always use `json.dumps(data, indent=2)`" is better than
  "Use proper JSON formatting"
- **Actionable rules**: "Define `__all__` at module top" is better than "Follow best practices"
- **Directly addresses criteria**: Each skill rule should map to a specific evaluation
  criterion that the agent is currently failing
- **Concise**: Include only the rules needed; avoid padding with obvious advice
- **Evidence-producing**: The agent is evaluated ONLY on what appears in its tool
  outputs and final response. If the agent writes a file, the evaluator does NOT
  read that file — it only sees the tool's return message (e.g., "Successfully wrote
  625 characters"). Skills MUST instruct the agent to make its work verifiable:
  * After writing a file, read it back or print its contents so the output is visible
  * After creating structured output (CSV, JSON, code), display the result
  * Run scripts and show their output rather than just writing them
  * The evaluator cannot verify what it cannot see — always produce visible evidence

## Response Format
Respond with a JSON object. Keys are skill names, values are one of:
- `"keep"` — retain this skill unchanged
- `"drop"` — remove this skill
- `"builtin"` — load from the built-in catalog
- A string containing the full SKILL.md content (for new or refined skills)

SKILL.md content MUST start with YAML frontmatter:
---
name: skill-name
description: What this skill does
---
# Content...

## Rules
- Keep skills that are working (their target tasks pass) — don't drop what works
- Refine skills whose target tasks still fail — tweak the content, don't start over
- Only add new skills for failure patterns not covered by existing skills
- Keep skills focused and concise (domain knowledge, not general advice)
- ALWAYS include a "Verification" section in every skill telling the agent to
  display/print/cat its output after creating it — this is the #1 cause of false
  failures (correct code that the evaluator can't see)

## Example Response
{{"git-log-parsing": "keep", "executable-verification": "---\\nname: executable-verification\\ndescription: Improved verification patterns\\n---\\n# Updated content here...", "new-skill": "---\\nname: new-skill\\ndescription: Addresses regex failures\\n---\\n# Content..."}}

Respond with ONLY the JSON object, nothing else."""

        try:
            result = self._call_llm(prompt, model)
            if result:
                return self._parse_skill_response(result, current_skills, builtin_catalog)
        except Exception as e:
            logger.warning(f"LLM skill generation failed: {e}")

        # Fallback: keep current skills unchanged.
        return current_skills
575
+
576
+ def _parse_skill_response(
577
+ self,
578
+ response: str,
579
+ current_skills: dict[str, str],
580
+ builtin_catalog: dict[str, str],
581
+ ) -> dict[str, str]:
582
+ """Parse LLM response into skill name -> content mapping.
583
+
584
+ Supports incremental operations:
585
+ - "keep": retain existing skill content unchanged
586
+ - "drop": remove the skill (omit from result)
587
+ - "builtin": load from built-in catalog
588
+ - string content: new or refined skill (replaces existing)
589
+ """
590
+ import json
591
+
592
+ # Try to extract JSON from the response
593
+ response = response.strip()
594
+ # Handle markdown code blocks
595
+ if response.startswith("```"):
596
+ lines = response.split("\n")
597
+ lines = [l for l in lines if not l.strip().startswith("```")]
598
+ response = "\n".join(lines)
599
+
600
+ try:
601
+ skills_dict = json.loads(response)
602
+ except json.JSONDecodeError:
603
+ logger.warning(f"Failed to parse skill response as JSON: {response[:200]}")
604
+ return current_skills
605
+
606
+ if not isinstance(skills_dict, dict):
607
+ logger.warning(f"Skill response is not a dict: {type(skills_dict)}")
608
+ return current_skills
609
+
610
+ new_skills: dict[str, str] = {}
611
+ for name, value in skills_dict.items():
612
+ if not isinstance(name, str):
613
+ continue
614
+
615
+ if value == "keep":
616
+ # Retain existing skill unchanged
617
+ if name in current_skills:
618
+ new_skills[name] = current_skills[name]
619
+ else:
620
+ logger.warning(f"Cannot keep unknown skill: {name}")
621
+ elif value == "drop":
622
+ # Explicitly remove — just don't add to new_skills
623
+ logger.info(f"Dropping skill: {name}")
624
+ elif value == "builtin" and name in builtin_catalog:
625
+ content = self._load_builtin_skill(name)
626
+ if content:
627
+ new_skills[name] = content
628
+ else:
629
+ logger.warning(f"Failed to load built-in skill: {name}")
630
+ elif isinstance(value, str) and value not in ("builtin", "keep", "drop"):
631
+ # New or refined skill content
632
+ new_skills[name] = value
633
+ else:
634
+ logger.warning(f"Skipping invalid skill entry: {name}={value!r}")
635
+
636
+ if not new_skills and current_skills:
637
+ logger.warning("LLM dropped all skills, keeping current set")
638
+ return current_skills
639
+
640
+ return new_skills
641
+
642
+ def _load_builtin_skill(self, name: str) -> str | None:
643
+ """Load the full content of a built-in skill."""
644
+ from flow.tools.skills import _discover_skills, _get_builtin_skills_path
645
+
646
+ builtin_path = _get_builtin_skills_path()
647
+ discovered = _discover_skills([builtin_path])
648
+
649
+ if name in discovered:
650
+ skill_md, _ = discovered[name]
651
+ try:
652
+ return skill_md.read_text()
653
+ except Exception as e:
654
+ logger.warning(f"Error reading built-in skill {name}: {e}")
655
+
656
+ return None
657
+
658
+ def _get_client(self, model: str) -> tuple[Any, str]:
659
+ """Get OpenAI client and model name."""
660
+ try:
661
+ from openai import AzureOpenAI, OpenAI
662
+ except ImportError as e:
663
+ raise ImportError("openai package required for SkillOptimizer") from e
664
+
665
+ azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
666
+ azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
667
+
668
+ if azure_key and azure_endpoint:
669
+ client = AzureOpenAI(
670
+ api_key=azure_key,
671
+ api_version="2024-08-01-preview",
672
+ azure_endpoint=azure_endpoint,
673
+ )
674
+ model_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", model)
675
+ else:
676
+ openai_key = os.environ.get("OPENAI_API_KEY")
677
+ if not openai_key:
678
+ raise ValueError("No OpenAI or Azure OpenAI credentials found")
679
+ client = OpenAI(api_key=openai_key)
680
+ model_name = model
681
+
682
+ return client, model_name
683
+
684
+ def _call_llm(self, prompt: str, model: str) -> str:
685
+ """Call LLM with a prompt."""
686
+ client, model_name = self._get_client(model)
687
+
688
+ response = client.chat.completions.create(
689
+ model=model_name,
690
+ messages=[{"role": "user", "content": prompt}],
691
+ )
692
+ return response.choices[0].message.content or ""
src/flow/experiments/strategies/{tool_selector.py → tool.py} RENAMED
@@ -1,6 +1,6 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
- """Active tool selector strategy.
4
 
5
  Uses the runner to evaluate tool configurations and iteratively adjust
6
  the tool set based on actual execution failures. The strategy:
@@ -17,7 +17,6 @@ import os
17
  from dataclasses import dataclass, field
18
  from typing import Any
19
 
20
- from ..metrics import extract_metrics
21
  from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration, TOOL_PRESETS
22
  from ..types import Task
23
 
@@ -30,7 +29,7 @@ ALL_AVAILABLE_TOOLS: list[str] = sorted(
30
 
31
 
32
  @dataclass
33
- class ToolSelectorStrategy:
34
  """Strategy that iteratively optimizes tool configurations via evaluation.
35
 
36
  Runs an evaluate-analyze-adjust loop. Each iteration evaluates
@@ -47,7 +46,7 @@ class ToolSelectorStrategy:
47
 
48
  Example YAML:
49
  strategy:
50
- type: tool_selector
51
  config:
52
  model: gpt-4o-mini
53
  max_iterations: 3
@@ -79,18 +78,22 @@ class ToolSelectorStrategy:
79
  """
80
  if runner is None:
81
  raise ValueError(
82
- "ToolSelectorStrategy requires a runner. "
83
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
84
  )
85
  if not tasks:
86
  raise ValueError(
87
- "ToolSelectorStrategy requires tasks to evaluate against."
88
  )
89
 
90
  # Resolve initial tools to a list
 
 
 
 
91
  from ..models import resolve_tools
92
  if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
93
- current_tools = []
94
  else:
95
  current_tools = sorted(resolve_tools(base.tools).keys())
96
 
@@ -111,13 +114,14 @@ class ToolSelectorStrategy:
111
  available_tools = self.config.get("available_tools", ALL_AVAILABLE_TOOLS)
112
 
113
  logger.info(
114
- f"ToolSelectorStrategy: active mode (max_iterations={max_iterations}, "
115
  f"available_tools={len(available_tools)})"
116
  )
117
 
118
  current_tools = tools
119
  best_tools = tools
120
  best_score = 0.0
 
121
  history: list[StrategyIteration] = []
122
  # Track all unique tool configs tried, for returning as candidates
123
  iteration_candidates: list[tuple[list[str], str]] = [] # (tools, name_suffix)
@@ -180,8 +184,8 @@ class ToolSelectorStrategy:
180
  change_rationale = f"Tools used: {used_desc}\n{tasks_summary}"
181
  if iteration > 0:
182
  score_delta = avg_score - history[-1].avg_score
183
- added = set(current_tools) - set(best_tools if iteration == 1 else _prev_tools)
184
- removed = set(_prev_tools) - set(current_tools) if iteration > 0 else set()
185
  change_rationale = (
186
  f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
187
  f"Added: {sorted(added) or 'none'}. Removed: {sorted(removed) or 'none'}. "
@@ -235,7 +239,6 @@ class ToolSelectorStrategy:
235
  logger.info(f" Adjusted tools: {current_tools}")
236
 
237
  # Build candidates for all unique tool configs tried
238
- # This gives the Pareto chart multiple data points to compare
239
  candidates: list[Candidate] = []
240
  seen_tool_sets: set[tuple[str, ...]] = set()
241
 
@@ -265,8 +268,7 @@ class ToolSelectorStrategy:
265
  )
266
  )
267
 
268
- # Ensure best is always included (may differ from any iteration if
269
- # the best score was from an earlier iteration)
270
  best_key = tuple(sorted(best_tools))
271
  if best_key not in seen_tool_sets:
272
  final_agent = Agent(
@@ -298,7 +300,6 @@ class ToolSelectorStrategy:
298
  model: str,
299
  ) -> list[str]:
300
  """Analyze failures and traces, then recommend tool changes."""
301
- # Build analysis of what happened
302
  failure_descriptions = []
303
  for tr in task_results:
304
  task_name = getattr(tr, "task_name", "unknown")
@@ -306,7 +307,6 @@ class ToolSelectorStrategy:
306
  reasoning = getattr(tr, "eval_reasoning", "")
307
  score = getattr(tr, "eval_score", 0.0)
308
 
309
- # Get per-task tool usage
310
  metrics = getattr(tr, "metrics", None)
311
  task_tools = {}
312
  if metrics and hasattr(metrics, "tool_calls_by_name"):
@@ -349,16 +349,13 @@ Example: read_file, write_file, bash, grep, edit_file"""
349
  try:
350
  result = self._call_llm(prompt, model)
351
  if result:
352
- # Parse comma-separated tool names
353
  parsed = [t.strip() for t in result.split(",") if t.strip()]
354
- # Validate against available tools
355
  valid = [t for t in parsed if t in available_tools]
356
  if valid:
357
  return sorted(valid)
358
  logger.warning(f"No valid tools in LLM response: {parsed}")
359
  except Exception as e:
360
  logger.warning(f"LLM tool adjustment failed: {e}")
361
- # Fallback: try adding commonly useful tools
362
  return self._heuristic_adjust(current_tools, tools_used, available_tools)
363
 
364
  return current_tools
@@ -372,18 +369,15 @@ Example: read_file, write_file, bash, grep, edit_file"""
372
  """Fallback heuristic when LLM is unavailable."""
373
  adjusted = set(current_tools)
374
 
375
- # If bash was used heavily but grep/glob not available, add them
376
  if "bash" in tools_used and tools_used["bash"] > 2:
377
  for tool in ["grep", "glob_files", "ls"]:
378
  if tool in available_tools:
379
  adjusted.add(tool)
380
 
381
- # If write_file was used but edit_file not available, add it
382
  if "write_file" in tools_used and "edit_file" not in adjusted:
383
  if "edit_file" in available_tools:
384
  adjusted.add("edit_file")
385
 
386
- # Add think if not present (helps with reasoning)
387
  if "think" in available_tools:
388
  adjusted.add("think")
389
 
@@ -394,7 +388,7 @@ Example: read_file, write_file, bash, grep, edit_file"""
394
  try:
395
  from openai import AzureOpenAI, OpenAI
396
  except ImportError as e:
397
- raise ImportError("openai package required for ToolSelectorStrategy") from e
398
 
399
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
400
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
+ """Tool optimization strategy.
4
 
5
  Uses the runner to evaluate tool configurations and iteratively adjust
6
  the tool set based on actual execution failures. The strategy:
 
17
  from dataclasses import dataclass, field
18
  from typing import Any
19
 
 
20
  from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration, TOOL_PRESETS
21
  from ..types import Task
22
 
 
29
 
30
 
31
  @dataclass
32
+ class ToolOptimizer:
33
  """Strategy that iteratively optimizes tool configurations via evaluation.
34
 
35
  Runs an evaluate-analyze-adjust loop. Each iteration evaluates
 
46
 
47
  Example YAML:
48
  strategy:
49
+ type: tool
50
  config:
51
  model: gpt-4o-mini
52
  max_iterations: 3
 
78
  """
79
  if runner is None:
80
  raise ValueError(
81
+ "ToolOptimizer requires a runner. "
82
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
83
  )
84
  if not tasks:
85
  raise ValueError(
86
+ "ToolOptimizer requires tasks to evaluate against."
87
  )
88
 
89
  # Resolve initial tools to a list
90
+ # When starting from no tools, seed with "standard" preset so the
91
+ # optimizer has a working baseline to iterate from. An agent with
92
+ # zero tools produces zero signal (no tool calls, no files created),
93
+ # which makes iterative improvement impossible.
94
  from ..models import resolve_tools
95
  if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
96
+ current_tools = sorted(resolve_tools("standard").keys())
97
  else:
98
  current_tools = sorted(resolve_tools(base.tools).keys())
99
 
 
114
  available_tools = self.config.get("available_tools", ALL_AVAILABLE_TOOLS)
115
 
116
  logger.info(
117
+ f"ToolOptimizer: active mode (max_iterations={max_iterations}, "
118
  f"available_tools={len(available_tools)})"
119
  )
120
 
121
  current_tools = tools
122
  best_tools = tools
123
  best_score = 0.0
124
+ _prev_tools: list[str] = []
125
  history: list[StrategyIteration] = []
126
  # Track all unique tool configs tried, for returning as candidates
127
  iteration_candidates: list[tuple[list[str], str]] = [] # (tools, name_suffix)
 
184
  change_rationale = f"Tools used: {used_desc}\n{tasks_summary}"
185
  if iteration > 0:
186
  score_delta = avg_score - history[-1].avg_score
187
+ added = set(current_tools) - set(_prev_tools)
188
+ removed = set(_prev_tools) - set(current_tools)
189
  change_rationale = (
190
  f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
191
  f"Added: {sorted(added) or 'none'}. Removed: {sorted(removed) or 'none'}. "
 
239
  logger.info(f" Adjusted tools: {current_tools}")
240
 
241
  # Build candidates for all unique tool configs tried
 
242
  candidates: list[Candidate] = []
243
  seen_tool_sets: set[tuple[str, ...]] = set()
244
 
 
268
  )
269
  )
270
 
271
+ # Ensure best is always included
 
272
  best_key = tuple(sorted(best_tools))
273
  if best_key not in seen_tool_sets:
274
  final_agent = Agent(
 
300
  model: str,
301
  ) -> list[str]:
302
  """Analyze failures and traces, then recommend tool changes."""
 
303
  failure_descriptions = []
304
  for tr in task_results:
305
  task_name = getattr(tr, "task_name", "unknown")
 
307
  reasoning = getattr(tr, "eval_reasoning", "")
308
  score = getattr(tr, "eval_score", 0.0)
309
 
 
310
  metrics = getattr(tr, "metrics", None)
311
  task_tools = {}
312
  if metrics and hasattr(metrics, "tool_calls_by_name"):
 
349
  try:
350
  result = self._call_llm(prompt, model)
351
  if result:
 
352
  parsed = [t.strip() for t in result.split(",") if t.strip()]
 
353
  valid = [t for t in parsed if t in available_tools]
354
  if valid:
355
  return sorted(valid)
356
  logger.warning(f"No valid tools in LLM response: {parsed}")
357
  except Exception as e:
358
  logger.warning(f"LLM tool adjustment failed: {e}")
 
359
  return self._heuristic_adjust(current_tools, tools_used, available_tools)
360
 
361
  return current_tools
 
369
  """Fallback heuristic when LLM is unavailable."""
370
  adjusted = set(current_tools)
371
 
 
372
  if "bash" in tools_used and tools_used["bash"] > 2:
373
  for tool in ["grep", "glob_files", "ls"]:
374
  if tool in available_tools:
375
  adjusted.add(tool)
376
 
 
377
  if "write_file" in tools_used and "edit_file" not in adjusted:
378
  if "edit_file" in available_tools:
379
  adjusted.add("edit_file")
380
 
 
381
  if "think" in available_tools:
382
  adjusted.add("think")
383
 
 
388
  try:
389
  from openai import AzureOpenAI, OpenAI
390
  except ImportError as e:
391
+ raise ImportError("openai package required for ToolOptimizer") from e
392
 
393
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
394
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
src/flow/harness/compaction/strategies.py CHANGED
@@ -10,6 +10,8 @@ from __future__ import annotations
10
  from dataclasses import dataclass, field
11
  from typing import Any, Protocol
12
 
 
 
13
  from flow.harness.compaction.tokenizer import (
14
  count_message_tokens,
15
  count_messages_tokens,
@@ -479,7 +481,8 @@ class SummarizationStrategy:
479
  if self.summarize_fn:
480
  try:
481
  summary_text = await self.summarize_fn(middle, self.summary_max_tokens)
482
- except Exception:
 
483
  summary_text = self._extract_key_info(middle)
484
  else:
485
  summary_text = self._extract_key_info(middle)
 
10
  from dataclasses import dataclass, field
11
  from typing import Any, Protocol
12
 
13
+ from loguru import logger
14
+
15
  from flow.harness.compaction.tokenizer import (
16
  count_message_tokens,
17
  count_messages_tokens,
 
481
  if self.summarize_fn:
482
  try:
483
  summary_text = await self.summarize_fn(middle, self.summary_max_tokens)
484
+ except Exception as e:
485
+ logger.warning(f"Summarization function failed, falling back to key info extraction: {e}")
486
  summary_text = self._extract_key_info(middle)
487
  else:
488
  summary_text = self._extract_key_info(middle)
src/flow/harness/maf/agent.py CHANGED
@@ -163,14 +163,26 @@ def create_agent(
163
  f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}, head_ratio={head_ratio:.2f}"
164
  )
165
 
166
- # Determine if memory is enabled for instructions
167
  enable_memory = False
 
168
  if isinstance(tools, str):
169
- enable_memory = "memory" in TOOL_PRESETS.get(tools, {})
 
 
170
  elif isinstance(tools, list):
171
  enable_memory = "memory" in tools
 
172
  elif isinstance(tools, dict):
173
  enable_memory = "memory" in tools
 
 
 
 
 
 
 
 
174
 
175
  # Create the agent
176
  agent = ChatAgent(
@@ -178,6 +190,8 @@ def create_agent(
178
  description="Autonomous coding agent",
179
  instructions=instructions or build_instructions(
180
  enable_memory=enable_memory,
 
 
181
  ),
182
  chat_client=client,
183
  tools=converted_tools,
 
163
  f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}, head_ratio={head_ratio:.2f}"
164
  )
165
 
166
+ # Determine if memory and skills are enabled for instructions
167
  enable_memory = False
168
+ enable_skills = False
169
  if isinstance(tools, str):
170
+ preset = TOOL_PRESETS.get(tools, {})
171
+ enable_memory = "memory" in preset
172
+ enable_skills = "skills" in preset
173
  elif isinstance(tools, list):
174
  enable_memory = "memory" in tools
175
+ enable_skills = "skills" in tools
176
  elif isinstance(tools, dict):
177
  enable_memory = "memory" in tools
178
+ enable_skills = "skills" in tools
179
+
180
+ # Discover skill metadata when skills are enabled and no explicit instructions
181
+ skills_metadata: dict[str, dict[str, str]] | None = None
182
+ if enable_skills and instructions is None:
183
+ from flow.tools.skills import discover_skills_from_tools_spec
184
+
185
+ skills_metadata = discover_skills_from_tools_spec(tools_spec)
186
 
187
  # Create the agent
188
  agent = ChatAgent(
 
190
  description="Autonomous coding agent",
191
  instructions=instructions or build_instructions(
192
  enable_memory=enable_memory,
193
+ enable_skills=enable_skills,
194
+ skills_metadata=skills_metadata,
195
  ),
196
  chat_client=client,
197
  tools=converted_tools,
src/flow/harness/maf/tools/__init__.py CHANGED
@@ -151,8 +151,16 @@ def build_tools(
151
  model=config.get("model"),
152
  )
153
  tools.append(to_maf_tool(custom_task))
 
 
 
 
 
 
 
 
154
  elif name == "skills" and config.get("additional_paths"):
155
- # Skills with custom paths
156
  custom_skills = create_skills_tool(project_path=Path(config["additional_paths"][0]))
157
  tools.append(to_maf_tool(custom_skills))
158
  # Web search tool
 
151
  model=config.get("model"),
152
  )
153
  tools.append(to_maf_tool(custom_task))
154
+ elif name == "skills" and config.get("skills_path"):
155
+ # Skills with explicit managed path (used by SkillOptimizer)
156
+ # Only uses the specified path — no built-in or user skills
157
+ custom_skills = create_skills_tool(
158
+ builtin_path=Path(config["skills_path"]),
159
+ exclusive=True,
160
+ )
161
+ tools.append(to_maf_tool(custom_skills))
162
  elif name == "skills" and config.get("additional_paths"):
163
+ # Skills with additional project paths (keeps built-in skills too)
164
  custom_skills = create_skills_tool(project_path=Path(config["additional_paths"][0]))
165
  tools.append(to_maf_tool(custom_skills))
166
  # Web search tool
src/flow/harness/miniagent/harness.py CHANGED
@@ -116,6 +116,8 @@ class MiniAgentHarness(BaseHarness):
116
  otel_hooks = create_otel_hooks(model=config.model)
117
 
118
  # Resolve instructions: explicit > preset > default "general"
 
 
119
  if agent.instructions:
120
  instructions = agent.instructions
121
  elif agent.instructions_preset:
@@ -123,6 +125,27 @@ class MiniAgentHarness(BaseHarness):
123
  else:
124
  instructions = get_instructions("general")
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  chat_agent = ChatAgent(
127
  client=chat_client,
128
  instructions=instructions,
@@ -410,7 +433,15 @@ class MiniAgentHarness(BaseHarness):
410
  tools: list[Tool] = []
411
 
412
  for name, config in tools_spec.items():
413
- if name in tool_map:
 
 
 
 
 
 
 
 
414
  tools.append(tool_map[name])
415
  elif name == "task" and config:
416
  # Task tool with custom config
 
116
  otel_hooks = create_otel_hooks(model=config.model)
117
 
118
  # Resolve instructions: explicit > preset > default "general"
119
+ # When using default instructions, discover skill metadata and append
120
+ # a summary so the agent knows what skills are available upfront.
121
  if agent.instructions:
122
  instructions = agent.instructions
123
  elif agent.instructions_preset:
 
125
  else:
126
  instructions = get_instructions("general")
127
 
128
+ # Inject skill metadata into instructions (unless explicit instructions were set)
129
+ if not agent.instructions and "skills" in tools_spec:
130
+ from flow.tools.skills import discover_skills_from_tools_spec
131
+
132
+ skills_metadata = discover_skills_from_tools_spec(tools_spec)
133
+ if skills_metadata:
134
+ lines = ["\n\n## AVAILABLE SKILLS\n"]
135
+ lines.append(
136
+ "The following domain-specific skills are available. "
137
+ "Use `skills(action='load', name='...')` to load full content "
138
+ "when relevant to your task.\n"
139
+ )
140
+ for skill_name, meta in sorted(skills_metadata.items()):
141
+ description = meta.get("description", "No description")
142
+ triggers = meta.get("triggers", "")
143
+ entry = f"- **{skill_name}**: {description}"
144
+ if triggers:
145
+ entry += f" _(triggers: {triggers})_"
146
+ lines.append(entry)
147
+ instructions += "\n".join(lines)
148
+
149
  chat_agent = ChatAgent(
150
  client=chat_client,
151
  instructions=instructions,
 
433
  tools: list[Tool] = []
434
 
435
  for name, config in tools_spec.items():
436
+ if name == "skills" and config.get("skills_path"):
437
+ # Skills with explicit managed path (used by SkillOptimizer)
438
+ from flow.tools.skills import create_skills_tool as _create_skills
439
+ custom_skills = _create_skills(
440
+ builtin_path=Path(config["skills_path"]),
441
+ exclusive=True,
442
+ )
443
+ tools.append(custom_skills)
444
+ elif name in tool_map:
445
  tools.append(tool_map[name])
446
  elif name == "task" and config:
447
  # Task tool with custom config
src/flow/harness/miniagent/tool.py CHANGED
@@ -8,6 +8,8 @@ from collections.abc import Callable
8
  from dataclasses import dataclass
9
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
10
 
 
 
11
 
12
  @dataclass
13
  class Tool:
@@ -115,7 +117,8 @@ def tool(func: Callable[..., Any]) -> Tool:
115
  # Get type hints (with extras for Annotated)
116
  try:
117
  hints = get_type_hints(func, include_extras=True)
118
- except Exception:
 
119
  hints = {}
120
 
121
  # Build JSON Schema for parameters
 
8
  from dataclasses import dataclass
9
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
10
 
11
+ from loguru import logger
12
+
13
 
14
  @dataclass
15
  class Tool:
 
117
  # Get type hints (with extras for Annotated)
118
  try:
119
  hints = get_type_hints(func, include_extras=True)
120
+ except Exception as e:
121
+ logger.warning(f"Failed to get type hints for function {func.__name__}: {e}")
122
  hints = {}
123
 
124
  # Build JSON Schema for parameters
src/flow/prompts.py CHANGED
@@ -464,6 +464,7 @@ def build_instructions(
464
  *,
465
  enable_memory: bool = True,
466
  enable_skills: bool = True,
 
467
  ) -> str:
468
  """Build agent instructions dynamically based on enabled tools.
469
 
@@ -474,6 +475,11 @@ def build_instructions(
474
  Args:
475
  enable_memory: Include memory tool documentation.
476
  enable_skills: Include skills tool documentation.
 
 
 
 
 
477
 
478
  Returns:
479
  Complete instruction string.
@@ -529,6 +535,23 @@ def build_instructions(
529
  if enable_skills:
530
  sections.append(_SKILLS_SECTION)
531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  sections.extend([
533
  _CORE_EXAMPLES,
534
  _CORE_RESEARCH,
 
464
  *,
465
  enable_memory: bool = True,
466
  enable_skills: bool = True,
467
+ skills_metadata: dict[str, dict[str, str]] | None = None,
468
  ) -> str:
469
  """Build agent instructions dynamically based on enabled tools.
470
 
 
475
  Args:
476
  enable_memory: Include memory tool documentation.
477
  enable_skills: Include skills tool documentation.
478
+ skills_metadata: Optional dict of discovered skill metadata
479
+ (name -> {"description": ..., "triggers": ...}).
480
+ When provided, injects a concrete listing of available skills
481
+ into the system prompt so the agent knows what's available
482
+ without needing to call ``skills(action='list')``.
483
 
484
  Returns:
485
  Complete instruction string.
 
535
  if enable_skills:
536
  sections.append(_SKILLS_SECTION)
537
 
538
+ # Inject concrete skill listing when metadata is available
539
+ if enable_skills and skills_metadata:
540
+ lines = ["\n## AVAILABLE SKILLS\n"]
541
+ lines.append(
542
+ "The following domain-specific skills are available. "
543
+ "Use `skills(action='load', name='...')` to load full content "
544
+ "when relevant to your task.\n"
545
+ )
546
+ for skill_name, meta in sorted(skills_metadata.items()):
547
+ description = meta.get("description", "No description")
548
+ triggers = meta.get("triggers", "")
549
+ entry = f"- **{skill_name}**: {description}"
550
+ if triggers:
551
+ entry += f" _(triggers: {triggers})_"
552
+ lines.append(entry)
553
+ sections.append("\n".join(lines))
554
+
555
  sections.extend([
556
  _CORE_EXAMPLES,
557
  _CORE_RESEARCH,
src/flow/tools/__init__.py CHANGED
@@ -50,10 +50,11 @@ Sub-agents:
50
 
51
  from __future__ import annotations
52
 
53
- import logging
54
  from pathlib import Path
55
  from typing import Any
56
 
 
 
57
  # Adapters for framework integration
58
  from .adapters import to_maf_tool, to_openai_tool, tools_to_maf, tools_to_openai
59
  from .base import Tool, tool
@@ -96,7 +97,7 @@ from .notebook import notebook_edit, notebook_read
96
  from .planning import think, todo_read, todo_write
97
 
98
  # Skills tools
99
- from .skills import create_skills_tool, skills
100
 
101
  # Sub-agent tools
102
  from .subagent import create_task_tool, task
@@ -148,6 +149,7 @@ __all__ = [
148
  # Skills tools
149
  "skills",
150
  "create_skills_tool",
 
151
  # Sub-agent tools
152
  "task",
153
  "create_task_tool",
@@ -175,8 +177,6 @@ __all__ = [
175
  "visual_inspector",
176
  ]
177
 
178
- logger = logging.getLogger(__name__)
179
-
180
 
181
  # =============================================================================
182
  # Tool Presets - Convenient groupings of tools
 
50
 
51
  from __future__ import annotations
52
 
 
53
  from pathlib import Path
54
  from typing import Any
55
 
56
+ from loguru import logger
57
+
58
  # Adapters for framework integration
59
  from .adapters import to_maf_tool, to_openai_tool, tools_to_maf, tools_to_openai
60
  from .base import Tool, tool
 
97
  from .planning import think, todo_read, todo_write
98
 
99
  # Skills tools
100
+ from .skills import create_skills_tool, discover_skills_from_tools_spec, skills
101
 
102
  # Sub-agent tools
103
  from .subagent import create_task_tool, task
 
149
  # Skills tools
150
  "skills",
151
  "create_skills_tool",
152
+ "discover_skills_from_tools_spec",
153
  # Sub-agent tools
154
  "task",
155
  "create_task_tool",
 
177
  "visual_inspector",
178
  ]
179
 
 
 
180
 
181
  # =============================================================================
182
  # Tool Presets - Convenient groupings of tools
src/flow/tools/adapters.py CHANGED
@@ -9,7 +9,6 @@ across different agent frameworks without code duplication.
9
 
10
  from __future__ import annotations
11
 
12
- import logging
13
  from typing import TYPE_CHECKING
14
 
15
  from .base import Tool
@@ -18,8 +17,6 @@ if TYPE_CHECKING:
18
  from collections.abc import Callable
19
  from typing import Any
20
 
21
- logger = logging.getLogger(__name__)
22
-
23
 
24
  def to_maf_tool(tool: Tool) -> Callable[..., Any]:
25
  """Convert a Flow Tool to a MAF-decorated function.
@@ -42,10 +39,7 @@ def to_maf_tool(tool: Tool) -> Callable[..., Any]:
42
  try:
43
  from agent_framework import tool as maf_tool
44
  except ImportError:
45
- raise ImportError(
46
- "Microsoft Agent Framework not installed. "
47
- "Install with: pip install agent-framework"
48
- )
49
 
50
  return maf_tool(
51
  name=tool.name,
 
9
 
10
  from __future__ import annotations
11
 
 
12
  from typing import TYPE_CHECKING
13
 
14
  from .base import Tool
 
17
  from collections.abc import Callable
18
  from typing import Any
19
 
 
 
20
 
21
  def to_maf_tool(tool: Tool) -> Callable[..., Any]:
22
  """Convert a Flow Tool to a MAF-decorated function.
 
39
  try:
40
  from agent_framework import tool as maf_tool
41
  except ImportError:
42
+ raise ImportError("Microsoft Agent Framework not installed. Install with: pip install agent-framework")
 
 
 
43
 
44
  return maf_tool(
45
  name=tool.name,
src/flow/tools/base.py CHANGED
@@ -15,6 +15,8 @@ from collections.abc import Callable
15
  from dataclasses import dataclass
16
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
17
 
 
 
18
 
19
  @dataclass
20
  class Tool:
@@ -128,7 +130,8 @@ def tool(func: Callable[..., Any]) -> Tool:
128
  # Get type hints (with extras for Annotated)
129
  try:
130
  hints = get_type_hints(func, include_extras=True)
131
- except Exception:
 
132
  hints = {}
133
 
134
  # Build JSON Schema for parameters
 
15
  from dataclasses import dataclass
16
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
17
 
18
+ from loguru import logger
19
+
20
 
21
  @dataclass
22
  class Tool:
 
130
  # Get type hints (with extras for Annotated)
131
  try:
132
  hints = get_type_hints(func, include_extras=True)
133
+ except Exception as e:
134
+ logger.warning(f"Failed to get type hints for function {func.__name__}: {e}")
135
  hints = {}
136
 
137
  # Build JSON Schema for parameters
src/flow/tools/browsing.py CHANGED
@@ -26,7 +26,10 @@ def create_smol_web_search_tool(max_results: int = 10, engine: str = "duckduckgo
26
  """
27
  logger.info("Performing web search for query: {}", query)
28
  tool = WebSearchTool(max_results=max_results, engine=engine)
29
- return tool.forward(query=query)
 
 
 
30
 
31
  return smol_web_search
32
 
@@ -39,7 +42,11 @@ def wikipedia_search(
39
  """Searches Wikipedia and returns a summary or full text of the given topic, along with the page URL."""
40
  logger.info("Performing wikipedia search for query: {}", query)
41
  tool = WikipediaSearchTool(language=language)
42
- return tool.forward(query=query)
 
 
 
 
43
 
44
  def create_visit_webpage_tool(max_output_length: int = 40000) -> Tool:
45
  """Create a tool for visiting webpages and reading their content as markdown.
@@ -59,7 +66,10 @@ def create_visit_webpage_tool(max_output_length: int = 40000) -> Tool:
59
  """Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."""
60
  logger.info("Visiting webpage at URL: {}", url)
61
  tool = VisitWebpageTool(max_output_length=max_output_length)
62
- return tool.forward(url=url)
 
 
 
63
 
64
  return visit_webpage
65
 
 
26
  """
27
  logger.info("Performing web search for query: {}", query)
28
  tool = WebSearchTool(max_results=max_results, engine=engine)
29
+ output = tool.forward(query=query)
30
+ logger.debug("Web search output length: {}", len(output))
31
+ logger.debug("Web search output first 200 chars: {}", output[:200] if len(output) > 200 else output)
32
+ return output
33
 
34
  return smol_web_search
35
 
 
42
  """Searches Wikipedia and returns a summary or full text of the given topic, along with the page URL."""
43
  logger.info("Performing wikipedia search for query: {}", query)
44
  tool = WikipediaSearchTool(language=language)
45
+ output = tool.forward(query=query)
46
+ logger.debug("Wikipedia search output length: {}", len(output))
47
+ logger.debug("Wikipedia search output first 200 chars: {}", output[:200] if len(output) > 200 else output)
48
+ return output
49
+
50
 
51
  def create_visit_webpage_tool(max_output_length: int = 40000) -> Tool:
52
  """Create a tool for visiting webpages and reading their content as markdown.
 
66
  """Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."""
67
  logger.info("Visiting webpage at URL: {}", url)
68
  tool = VisitWebpageTool(max_output_length=max_output_length)
69
+ output = tool.forward(url=url)
70
+ logger.debug("Visit webpage output length: {}", len(output))
71
+ logger.debug("Visit webpage output first 200 chars: {}", output[:200] if len(output) > 200 else output)
72
+ return output
73
 
74
  return visit_webpage
75
 
src/flow/tools/coding.py CHANGED
@@ -8,6 +8,8 @@ import re
8
  from pathlib import Path
9
  from typing import Annotated
10
 
 
 
11
  from .base import tool
12
  from .workspace import get_workspace
13
 
@@ -35,13 +37,16 @@ def read_file(
35
 
36
  Returns the file content with line numbers for easy reference.
37
  """
 
38
  try:
39
  path_obj = _resolve_path(path)
40
 
41
  if not path_obj.exists():
 
42
  return f"Error: File not found: {path}"
43
 
44
  if not path_obj.is_file():
 
45
  return f"Error: Not a file: {path}"
46
 
47
  with open(path_obj, encoding="utf-8", errors="replace") as f:
@@ -66,9 +71,11 @@ def read_file(
66
  if start > 0 or end < total_lines:
67
  result += f"\n\n[Showing lines {start + 1}-{end} of {total_lines}]"
68
 
 
69
  return result
70
 
71
  except Exception as e:
 
72
  return f"Error reading file: {e!s}"
73
 
74
 
@@ -83,6 +90,7 @@ def write_file(
83
  Use this to create new files or completely replace file contents.
84
  For partial edits, use edit_file instead.
85
  """
 
86
  try:
87
  path_obj = _resolve_path(path)
88
 
@@ -95,9 +103,11 @@ def write_file(
95
  # Count lines for feedback
96
  line_count = content.count("\n") + (1 if content and not content.endswith("\n") else 0)
97
 
 
98
  return f"Successfully wrote {len(content)} characters ({line_count} lines) to {path}"
99
 
100
  except Exception as e:
 
101
  return f"Error writing file: {e!s}"
102
 
103
 
@@ -112,10 +122,12 @@ def edit_file(
112
  The old_string must appear exactly once in the file.
113
  For multiple replacements, call this tool multiple times.
114
  """
 
115
  try:
116
  path_obj = _resolve_path(path)
117
 
118
  if not path_obj.exists():
 
119
  return f"Error: File not found: {path}"
120
 
121
  with open(path_obj, encoding="utf-8") as f:
@@ -125,9 +137,11 @@ def edit_file(
125
  count = content.count(old_string)
126
 
127
  if count == 0:
 
128
  return f"Error: Could not find the specified text in {path}"
129
 
130
  if count > 1:
 
131
  return f"Error: Found {count} occurrences of the text. Please provide more context to make it unique."
132
 
133
  # Perform replacement
@@ -136,9 +150,11 @@ def edit_file(
136
  with open(path_obj, "w", encoding="utf-8") as f:
137
  f.write(new_content)
138
 
 
139
  return f"Successfully edited {path}"
140
 
141
  except Exception as e:
 
142
  return f"Error editing file: {e!s}"
143
 
144
 
@@ -152,13 +168,16 @@ def glob_files(
152
 
153
  Returns a list of matching file paths, sorted by modification time (newest first).
154
  """
 
155
  try:
156
  base_path = _resolve_path(path)
157
 
158
  if not base_path.exists():
 
159
  return f"Error: Directory not found: {path}"
160
 
161
  if not base_path.is_dir():
 
162
  return f"Error: Not a directory: {path}"
163
 
164
  # Find matching files
@@ -188,9 +207,11 @@ def glob_files(
188
  if len(matches) > limit:
189
  result += f"\n\n[Showing {limit} of {len(matches)} matches]"
190
 
 
191
  return result
192
 
193
  except Exception as e:
 
194
  return f"Error searching files: {e!s}"
195
 
196
 
@@ -206,6 +227,7 @@ def grep(
206
 
207
  Returns matching lines with file paths and line numbers.
208
  """
 
209
  try:
210
  base_path = _resolve_path(path)
211
  regex = re.compile(pattern)
@@ -216,10 +238,7 @@ def grep(
216
  files = [base_path]
217
  else:
218
  # Find files matching include pattern
219
- files = [
220
- p for p in base_path.rglob("*")
221
- if p.is_file() and fnmatch.fnmatch(p.name, include)
222
- ]
223
 
224
  for file_path in files:
225
  try:
@@ -247,13 +266,15 @@ def grep(
247
  if len(matches) >= limit:
248
  break
249
 
250
- except Exception:
 
251
  continue # Skip files that can't be read
252
 
253
  if len(matches) >= limit:
254
  break
255
 
256
  if not matches:
 
257
  return f"No matches found for pattern: {pattern}"
258
 
259
  result = "\n\n".join(matches)
@@ -261,11 +282,14 @@ def grep(
261
  if len(matches) >= limit:
262
  result += f"\n\n[Results limited to {limit} matches]"
263
 
 
264
  return result
265
 
266
  except re.error as e:
 
267
  return f"Error: Invalid regex pattern: {e!s}"
268
  except Exception as e:
 
269
  return f"Error searching: {e!s}"
270
 
271
 
@@ -279,13 +303,16 @@ def ls(
279
 
280
  Returns a formatted listing of directory contents.
281
  """
 
282
  try:
283
  dir_path = _resolve_path(path)
284
 
285
  if not dir_path.exists():
 
286
  return f"Error: Path not found: {path}"
287
 
288
  if not dir_path.is_dir():
 
289
  return f"Error: Not a directory: {path}"
290
 
291
  entries = list(dir_path.iterdir())
@@ -309,27 +336,31 @@ def ls(
309
  if size < 1024:
310
  size_str = f"{size:>6}B"
311
  elif size < 1024 * 1024:
312
- size_str = f"{size/1024:>6.1f}K"
313
  else:
314
- size_str = f"{size/(1024*1024):>6.1f}M"
315
 
316
  # Format time
317
  from datetime import datetime
 
318
  mtime = datetime.fromtimestamp(stat.st_mtime)
319
  time_str = mtime.strftime("%Y-%m-%d %H:%M")
320
 
321
  type_char = "d" if entry.is_dir() else "-"
322
  name = entry.name + ("/" if entry.is_dir() else "")
323
  output_lines.append(f"{type_char} {size_str} {time_str} {name}")
324
- except Exception:
 
325
  output_lines.append(entry.name)
326
  else:
327
  name = entry.name + ("/" if entry.is_dir() else "")
328
  output_lines.append(name)
329
 
 
330
  return "\n".join(output_lines)
331
 
332
  except Exception as e:
 
333
  return f"Error listing directory: {e!s}"
334
 
335
 
@@ -346,10 +377,12 @@ def multi_edit(
346
 
347
  Edits are applied sequentially, so later edits see the result of earlier ones.
348
  """
 
349
  try:
350
  path_obj = _resolve_path(path)
351
 
352
  if not path_obj.exists():
 
353
  return f"Error: File not found: {path}"
354
 
355
  with open(path_obj, encoding="utf-8") as f:
@@ -358,7 +391,7 @@ def multi_edit(
358
  # Validate all edits first
359
  for i, edit in enumerate(edits):
360
  if "old_string" not in edit or "new_string" not in edit:
361
- return f"Error: Edit {i+1} missing 'old_string' or 'new_string'"
362
 
363
  # Apply edits sequentially
364
  applied: list[str] = []
@@ -371,26 +404,28 @@ def multi_edit(
371
  if count == 0:
372
  # Rollback - restore original
373
  return (
374
- f"Error: Edit {i+1} failed - could not find text.\n"
375
  f"Applied {len(applied)} edit(s) before failure.\n"
376
  f"File unchanged (atomic rollback)."
377
  )
378
 
379
  if count > 1:
380
  return (
381
- f"Error: Edit {i+1} failed - found {count} occurrences.\n"
382
  f"Applied {len(applied)} edit(s) before failure.\n"
383
  f"File unchanged (atomic rollback)."
384
  )
385
 
386
  content = content.replace(old_str, new_str, 1)
387
- applied.append(f"Edit {i+1}: replaced {len(old_str)} chars with {len(new_str)} chars")
388
 
389
  # All edits succeeded - write the file
390
  with open(path_obj, "w", encoding="utf-8") as f:
391
  f.write(content)
392
 
 
393
  return f"Successfully applied {len(edits)} edit(s) to {path}:\n" + "\n".join(applied)
394
 
395
  except Exception as e:
 
396
  return f"Error editing file: {e!s}"
 
8
  from pathlib import Path
9
  from typing import Annotated
10
 
11
+ from loguru import logger
12
+
13
  from .base import tool
14
  from .workspace import get_workspace
15
 
 
37
 
38
  Returns the file content with line numbers for easy reference.
39
  """
40
+ logger.debug(f"read_file: path={path}, offset={offset}, limit={limit}")
41
  try:
42
  path_obj = _resolve_path(path)
43
 
44
  if not path_obj.exists():
45
+ logger.warning(f"read_file: file not found: {path}")
46
  return f"Error: File not found: {path}"
47
 
48
  if not path_obj.is_file():
49
+ logger.warning(f"read_file: not a file: {path}")
50
  return f"Error: Not a file: {path}"
51
 
52
  with open(path_obj, encoding="utf-8", errors="replace") as f:
 
71
  if start > 0 or end < total_lines:
72
  result += f"\n\n[Showing lines {start + 1}-{end} of {total_lines}]"
73
 
74
+ logger.debug(f"read_file: read {end - start} lines from {path}")
75
  return result
76
 
77
  except Exception as e:
78
+ logger.warning(f"read_file: error reading {path}: {e}")
79
  return f"Error reading file: {e!s}"
80
 
81
 
 
90
  Use this to create new files or completely replace file contents.
91
  For partial edits, use edit_file instead.
92
  """
93
+ logger.info(f"write_file: writing to {path}")
94
  try:
95
  path_obj = _resolve_path(path)
96
 
 
103
  # Count lines for feedback
104
  line_count = content.count("\n") + (1 if content and not content.endswith("\n") else 0)
105
 
106
+ logger.debug(f"write_file: wrote {len(content)} chars ({line_count} lines) to {path}")
107
  return f"Successfully wrote {len(content)} characters ({line_count} lines) to {path}"
108
 
109
  except Exception as e:
110
+ logger.warning(f"write_file: error writing to {path}: {e}")
111
  return f"Error writing file: {e!s}"
112
 
113
 
 
122
  The old_string must appear exactly once in the file.
123
  For multiple replacements, call this tool multiple times.
124
  """
125
+ logger.info(f"edit_file: editing {path}")
126
  try:
127
  path_obj = _resolve_path(path)
128
 
129
  if not path_obj.exists():
130
+ logger.warning(f"edit_file: file not found: {path}")
131
  return f"Error: File not found: {path}"
132
 
133
  with open(path_obj, encoding="utf-8") as f:
 
137
  count = content.count(old_string)
138
 
139
  if count == 0:
140
+ logger.warning(f"edit_file: text not found in {path}")
141
  return f"Error: Could not find the specified text in {path}"
142
 
143
  if count > 1:
144
+ logger.warning(f"edit_file: found {count} occurrences in {path}, expected 1")
145
  return f"Error: Found {count} occurrences of the text. Please provide more context to make it unique."
146
 
147
  # Perform replacement
 
150
  with open(path_obj, "w", encoding="utf-8") as f:
151
  f.write(new_content)
152
 
153
+ logger.debug(f"edit_file: successfully edited {path}")
154
  return f"Successfully edited {path}"
155
 
156
  except Exception as e:
157
+ logger.warning(f"edit_file: error editing {path}: {e}")
158
  return f"Error editing file: {e!s}"
159
 
160
 
 
168
 
169
  Returns a list of matching file paths, sorted by modification time (newest first).
170
  """
171
+ logger.debug(f"glob_files: pattern={pattern}, path={path}, limit={limit}")
172
  try:
173
  base_path = _resolve_path(path)
174
 
175
  if not base_path.exists():
176
+ logger.warning(f"glob_files: directory not found: {path}")
177
  return f"Error: Directory not found: {path}"
178
 
179
  if not base_path.is_dir():
180
+ logger.warning(f"glob_files: not a directory: {path}")
181
  return f"Error: Not a directory: {path}"
182
 
183
  # Find matching files
 
207
  if len(matches) > limit:
208
  result += f"\n\n[Showing {limit} of {len(matches)} matches]"
209
 
210
+ logger.debug(f"glob_files: found {len(files)} files matching '{pattern}'")
211
  return result
212
 
213
  except Exception as e:
214
+ logger.warning(f"glob_files: error searching: {e}")
215
  return f"Error searching files: {e!s}"
216
 
217
 
 
227
 
228
  Returns matching lines with file paths and line numbers.
229
  """
230
+ logger.debug(f"grep: pattern='{pattern}', path={path}, include={include}")
231
  try:
232
  base_path = _resolve_path(path)
233
  regex = re.compile(pattern)
 
238
  files = [base_path]
239
  else:
240
  # Find files matching include pattern
241
+ files = [p for p in base_path.rglob("*") if p.is_file() and fnmatch.fnmatch(p.name, include)]
 
 
 
242
 
243
  for file_path in files:
244
  try:
 
266
  if len(matches) >= limit:
267
  break
268
 
269
+ except Exception as e:
270
+ logger.debug(f"grep: skipping unreadable file {file_path}: {e}")
271
  continue # Skip files that can't be read
272
 
273
  if len(matches) >= limit:
274
  break
275
 
276
  if not matches:
277
+ logger.debug(f"grep: no matches found for pattern '{pattern}'")
278
  return f"No matches found for pattern: {pattern}"
279
 
280
  result = "\n\n".join(matches)
 
282
  if len(matches) >= limit:
283
  result += f"\n\n[Results limited to {limit} matches]"
284
 
285
+ logger.debug(f"grep: found {len(matches)} matches for pattern '{pattern}'")
286
  return result
287
 
288
  except re.error as e:
289
+ logger.warning(f"grep: invalid regex '{pattern}': {e}")
290
  return f"Error: Invalid regex pattern: {e!s}"
291
  except Exception as e:
292
+ logger.warning(f"grep: error searching: {e}")
293
  return f"Error searching: {e!s}"
294
 
295
 
 
303
 
304
  Returns a formatted listing of directory contents.
305
  """
306
+ logger.debug(f"ls: path={path}, show_hidden={show_hidden}, long_format={long_format}")
307
  try:
308
  dir_path = _resolve_path(path)
309
 
310
  if not dir_path.exists():
311
+ logger.warning(f"ls: path not found: {path}")
312
  return f"Error: Path not found: {path}"
313
 
314
  if not dir_path.is_dir():
315
+ logger.warning(f"ls: not a directory: {path}")
316
  return f"Error: Not a directory: {path}"
317
 
318
  entries = list(dir_path.iterdir())
 
336
  if size < 1024:
337
  size_str = f"{size:>6}B"
338
  elif size < 1024 * 1024:
339
+ size_str = f"{size / 1024:>6.1f}K"
340
  else:
341
+ size_str = f"{size / (1024 * 1024):>6.1f}M"
342
 
343
  # Format time
344
  from datetime import datetime
345
+
346
  mtime = datetime.fromtimestamp(stat.st_mtime)
347
  time_str = mtime.strftime("%Y-%m-%d %H:%M")
348
 
349
  type_char = "d" if entry.is_dir() else "-"
350
  name = entry.name + ("/" if entry.is_dir() else "")
351
  output_lines.append(f"{type_char} {size_str} {time_str} {name}")
352
+ except Exception as e:
353
+ logger.debug(f"ls: failed to stat entry {entry.name}: {e}")
354
  output_lines.append(entry.name)
355
  else:
356
  name = entry.name + ("/" if entry.is_dir() else "")
357
  output_lines.append(name)
358
 
359
+ logger.debug(f"ls: listed {len(output_lines)} entries in {path}")
360
  return "\n".join(output_lines)
361
 
362
  except Exception as e:
363
+ logger.warning(f"ls: error listing {path}: {e}")
364
  return f"Error listing directory: {e!s}"
365
 
366
 
 
377
 
378
  Edits are applied sequentially, so later edits see the result of earlier ones.
379
  """
380
+ logger.info(f"multi_edit: applying {len(edits)} edits to {path}")
381
  try:
382
  path_obj = _resolve_path(path)
383
 
384
  if not path_obj.exists():
385
+ logger.warning(f"multi_edit: file not found: {path}")
386
  return f"Error: File not found: {path}"
387
 
388
  with open(path_obj, encoding="utf-8") as f:
 
391
  # Validate all edits first
392
  for i, edit in enumerate(edits):
393
  if "old_string" not in edit or "new_string" not in edit:
394
+ return f"Error: Edit {i + 1} missing 'old_string' or 'new_string'"
395
 
396
  # Apply edits sequentially
397
  applied: list[str] = []
 
404
  if count == 0:
405
  # Rollback - restore original
406
  return (
407
+ f"Error: Edit {i + 1} failed - could not find text.\n"
408
  f"Applied {len(applied)} edit(s) before failure.\n"
409
  f"File unchanged (atomic rollback)."
410
  )
411
 
412
  if count > 1:
413
  return (
414
+ f"Error: Edit {i + 1} failed - found {count} occurrences.\n"
415
  f"Applied {len(applied)} edit(s) before failure.\n"
416
  f"File unchanged (atomic rollback)."
417
  )
418
 
419
  content = content.replace(old_str, new_str, 1)
420
+ applied.append(f"Edit {i + 1}: replaced {len(old_str)} chars with {len(new_str)} chars")
421
 
422
  # All edits succeeded - write the file
423
  with open(path_obj, "w", encoding="utf-8") as f:
424
  f.write(content)
425
 
426
+ logger.debug(f"multi_edit: successfully applied {len(edits)} edits to {path}")
427
  return f"Successfully applied {len(edits)} edit(s) to {path}:\n" + "\n".join(applied)
428
 
429
  except Exception as e:
430
+ logger.warning(f"multi_edit: error editing {path}: {e}")
431
  return f"Error editing file: {e!s}"
src/flow/tools/execution.py CHANGED
@@ -6,6 +6,8 @@ Execute shell commands and manage processes.
6
  import subprocess
7
  from typing import Annotated
8
 
 
 
9
  from .base import tool
10
  from .workspace import get_workspace
11
 
@@ -21,6 +23,8 @@ def bash(
21
  Use this to run shell commands, scripts, or system utilities.
22
  Be careful with destructive commands.
23
  """
 
 
24
  try:
25
  # Default to workspace root so concurrent tasks don't share process cwd
26
  effective_cwd = cwd if cwd is not None else str(get_workspace().root)
@@ -40,13 +44,18 @@ def bash(
40
  output += result.stderr
41
 
42
  if result.returncode != 0:
 
43
  output += f"\n[Exit code: {result.returncode}]"
 
 
44
 
45
  return output.strip() if output else "(No output)"
46
 
47
  except subprocess.TimeoutExpired:
 
48
  return f"Error: Command timed out after {timeout} seconds"
49
  except Exception as e:
 
50
  return f"Error executing command: {e!s}"
51
 
52
 
@@ -64,6 +73,8 @@ def check_processes(
64
  import os
65
  import signal
66
 
 
 
67
  if action == "list":
68
  try:
69
  # Use ps to list processes
@@ -73,8 +84,10 @@ def check_processes(
73
  text=True,
74
  timeout=10,
75
  )
 
76
  return result.stdout if result.stdout else "No processes found"
77
  except Exception as e:
 
78
  return f"Error listing processes: {e!s}"
79
 
80
  elif action == "kill":
@@ -82,14 +95,17 @@ def check_processes(
82
  return "Error: PID required for 'kill' action"
83
  try:
84
  os.kill(pid, signal.SIGTERM)
 
85
  return f"Sent SIGTERM to process {pid}"
86
  except ProcessLookupError:
 
87
  return f"Error: Process {pid} not found"
88
  except PermissionError:
 
89
  return f"Error: Permission denied to kill process {pid}"
90
  except Exception as e:
 
91
  return f"Error killing process: {e!s}"
92
 
93
  else:
94
  return f"Unknown action: {action}. Use 'list' or 'kill'."
95
-
 
6
  import subprocess
7
  from typing import Annotated
8
 
9
+ from loguru import logger
10
+
11
  from .base import tool
12
  from .workspace import get_workspace
13
 
 
23
  Use this to run shell commands, scripts, or system utilities.
24
  Be careful with destructive commands.
25
  """
26
+ logger.info(f"bash: executing command (timeout={timeout}s, cwd={cwd})")
27
+ logger.debug(f"bash: command={command[:200] if len(command) > 200 else command}")
28
  try:
29
  # Default to workspace root so concurrent tasks don't share process cwd
30
  effective_cwd = cwd if cwd is not None else str(get_workspace().root)
 
44
  output += result.stderr
45
 
46
  if result.returncode != 0:
47
+ logger.debug(f"bash: command exited with code {result.returncode}")
48
  output += f"\n[Exit code: {result.returncode}]"
49
+ else:
50
+ logger.debug("bash: command completed successfully")
51
 
52
  return output.strip() if output else "(No output)"
53
 
54
  except subprocess.TimeoutExpired:
55
+ logger.warning(f"bash: command timed out after {timeout}s")
56
  return f"Error: Command timed out after {timeout} seconds"
57
  except Exception as e:
58
+ logger.warning(f"bash: error executing command: {e}")
59
  return f"Error executing command: {e!s}"
60
 
61
 
 
73
  import os
74
  import signal
75
 
76
+ logger.debug(f"check_processes: action={action}, pid={pid}")
77
+
78
  if action == "list":
79
  try:
80
  # Use ps to list processes
 
84
  text=True,
85
  timeout=10,
86
  )
87
+ logger.debug("check_processes: listed processes")
88
  return result.stdout if result.stdout else "No processes found"
89
  except Exception as e:
90
+ logger.warning(f"check_processes: error listing: {e}")
91
  return f"Error listing processes: {e!s}"
92
 
93
  elif action == "kill":
 
95
  return "Error: PID required for 'kill' action"
96
  try:
97
  os.kill(pid, signal.SIGTERM)
98
+ logger.info(f"check_processes: sent SIGTERM to pid {pid}")
99
  return f"Sent SIGTERM to process {pid}"
100
  except ProcessLookupError:
101
+ logger.warning(f"check_processes: process {pid} not found")
102
  return f"Error: Process {pid} not found"
103
  except PermissionError:
104
+ logger.warning(f"check_processes: permission denied for pid {pid}")
105
  return f"Error: Permission denied to kill process {pid}"
106
  except Exception as e:
107
+ logger.warning(f"check_processes: error killing {pid}: {e}")
108
  return f"Error killing process: {e!s}"
109
 
110
  else:
111
  return f"Unknown action: {action}. Use 'list' or 'kill'."
 
src/flow/tools/memory.py CHANGED
@@ -24,6 +24,8 @@ import uuid
24
  from datetime import datetime
25
  from typing import Annotated, Any, Literal
26
 
 
 
27
  from .base import Tool, tool
28
  from .workspace import Workspace, get_workspace
29
 
@@ -76,6 +78,7 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
76
  Returns:
77
  A Tool instance for memory operations.
78
  """
 
79
  def get_ws() -> Workspace:
80
  if workspace is not None:
81
  return workspace
@@ -85,20 +88,13 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
85
  def memory(
86
  action: Annotated[
87
  Literal["store", "recall", "list", "forget"],
88
- "Action: store (save), recall (search), list (show all), forget (delete)"
89
  ],
90
  content: Annotated[
91
- str,
92
- "For 'store': info to remember. For 'recall': search query. For 'forget': memory ID."
93
- ] = "",
94
- key: Annotated[
95
- str,
96
- "Optional short identifier (e.g., 'user_preferences', 'project_structure')"
97
  ] = "",
98
- tags: Annotated[
99
- list[str],
100
- "Optional tags for categorization (e.g., ['important', 'user-info'])"
101
- ] = [],
102
  ) -> str:
103
  """Store and retrieve information across sessions.
104
 
@@ -132,38 +128,44 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
132
  }
133
  ws.save_memory(memory_id, data)
134
 
 
135
  return f"Stored memory '{data['key']}' (id: {memory_id}) in {ws.memory_dir}"
136
 
137
  elif action == "recall":
138
  if not content:
139
  return "Error: content (search query) is required for 'recall' action"
140
 
 
141
  results = _search_memories(ws, content)
142
 
143
  if not results:
 
144
  return f"No memories found matching '{content}'"
145
 
146
  output = [f"Found {len(results)} memory(ies) matching '{content}':\n"]
147
  for mem in results:
148
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
149
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {mem['content'][:200]}")
150
- if len(mem['content']) > 200:
151
  output[-1] += "..."
152
 
 
153
  return "\n".join(output)
154
 
155
  elif action == "list":
156
  memories = ws.list_memories()
157
 
158
  if not memories:
 
159
  return f"No memories stored in {ws.memory_dir}"
160
 
161
  output = [f"Stored memories ({len(memories)} total):\n"]
162
  for mem in sorted(memories, key=lambda m: m.get("created_at", ""), reverse=True):
163
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
164
- preview = mem['content'][:100] + "..." if len(mem['content']) > 100 else mem['content']
165
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {preview}")
166
 
 
167
  return "\n".join(output)
168
 
169
  elif action == "forget":
@@ -171,8 +173,10 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
171
  return "Error: content (memory ID) is required for 'forget' action"
172
 
173
  if ws.delete_memory(content):
 
174
  return f"Deleted memory with id: {content}"
175
  else:
 
176
  return f"No memory found with id: {content}"
177
 
178
  else:
 
24
  from datetime import datetime
25
  from typing import Annotated, Any, Literal
26
 
27
+ from loguru import logger
28
+
29
  from .base import Tool, tool
30
  from .workspace import Workspace, get_workspace
31
 
 
78
  Returns:
79
  A Tool instance for memory operations.
80
  """
81
+
82
  def get_ws() -> Workspace:
83
  if workspace is not None:
84
  return workspace
 
88
  def memory(
89
  action: Annotated[
90
  Literal["store", "recall", "list", "forget"],
91
+ "Action: store (save), recall (search), list (show all), forget (delete)",
92
  ],
93
  content: Annotated[
94
+ str, "For 'store': info to remember. For 'recall': search query. For 'forget': memory ID."
 
 
 
 
 
95
  ] = "",
96
+ key: Annotated[str, "Optional short identifier (e.g., 'user_preferences', 'project_structure')"] = "",
97
+ tags: Annotated[list[str], "Optional tags for categorization (e.g., ['important', 'user-info'])"] = [],
 
 
98
  ) -> str:
99
  """Store and retrieve information across sessions.
100
 
 
128
  }
129
  ws.save_memory(memory_id, data)
130
 
131
+ logger.info(f"memory: stored '{data['key']}' (id: {memory_id}) with {len(content)} chars")
132
  return f"Stored memory '{data['key']}' (id: {memory_id}) in {ws.memory_dir}"
133
 
134
  elif action == "recall":
135
  if not content:
136
  return "Error: content (search query) is required for 'recall' action"
137
 
138
+ logger.debug(f"memory: recalling with query '{content}'")
139
  results = _search_memories(ws, content)
140
 
141
  if not results:
142
+ logger.debug(f"memory: no matches found for '{content}'")
143
  return f"No memories found matching '{content}'"
144
 
145
  output = [f"Found {len(results)} memory(ies) matching '{content}':\n"]
146
  for mem in results:
147
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
148
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {mem['content'][:200]}")
149
+ if len(mem["content"]) > 200:
150
  output[-1] += "..."
151
 
152
+ logger.debug(f"memory: found {len(results)} memories matching '{content}'")
153
  return "\n".join(output)
154
 
155
  elif action == "list":
156
  memories = ws.list_memories()
157
 
158
  if not memories:
159
+ logger.debug("memory: no memories stored")
160
  return f"No memories stored in {ws.memory_dir}"
161
 
162
  output = [f"Stored memories ({len(memories)} total):\n"]
163
  for mem in sorted(memories, key=lambda m: m.get("created_at", ""), reverse=True):
164
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
165
+ preview = mem["content"][:100] + "..." if len(mem["content"]) > 100 else mem["content"]
166
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {preview}")
167
 
168
+ logger.debug(f"memory: listed {len(memories)} memories")
169
  return "\n".join(output)
170
 
171
  elif action == "forget":
 
173
  return "Error: content (memory ID) is required for 'forget' action"
174
 
175
  if ws.delete_memory(content):
176
+ logger.info(f"memory: deleted memory with id '{content}'")
177
  return f"Deleted memory with id: {content}"
178
  else:
179
+ logger.warning(f"memory: no memory found with id '{content}'")
180
  return f"No memory found with id: {content}"
181
 
182
  else:
src/flow/tools/notebook.py CHANGED
@@ -7,6 +7,8 @@ import json
7
  from pathlib import Path
8
  from typing import Annotated, Any, Literal
9
 
 
 
10
  from .base import tool
11
 
12
 
@@ -27,13 +29,16 @@ def notebook_edit(
27
 
28
  For insert mode, cell_type is required.
29
  """
 
30
  try:
31
  path_obj = Path(path).expanduser().resolve()
32
 
33
  if not path_obj.exists():
 
34
  return f"Error: Notebook not found: {path}"
35
 
36
  if not path_obj.suffix == ".ipynb":
 
37
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
38
 
39
  # Read notebook
@@ -53,6 +58,7 @@ def notebook_edit(
53
  with open(path_obj, "w", encoding="utf-8") as f:
54
  json.dump(notebook, f, indent=1)
55
 
 
56
  return f"Successfully deleted {deleted_type} cell at index {cell_index}"
57
 
58
  elif edit_mode == "insert":
@@ -79,6 +85,7 @@ def notebook_edit(
79
  with open(path_obj, "w", encoding="utf-8") as f:
80
  json.dump(notebook, f, indent=1)
81
 
 
82
  return f"Successfully inserted {cell_type} cell at index {cell_index}"
83
 
84
  else: # replace
@@ -107,11 +114,14 @@ def notebook_edit(
107
  with open(path_obj, "w", encoding="utf-8") as f:
108
  json.dump(notebook, f, indent=1)
109
 
 
110
  return f"Successfully replaced cell at index {cell_index}"
111
 
112
  except json.JSONDecodeError as e:
 
113
  return f"Error: Invalid notebook JSON: {e!s}"
114
  except Exception as e:
 
115
  return f"Error editing notebook: {e!s}"
116
 
117
 
@@ -125,13 +135,16 @@ def notebook_read(
125
 
126
  Returns formatted cell contents with indices for easy reference.
127
  """
 
128
  try:
129
  path_obj = Path(path).expanduser().resolve()
130
 
131
  if not path_obj.exists():
 
132
  return f"Error: Notebook not found: {path}"
133
 
134
  if not path_obj.suffix == ".ipynb":
 
135
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
136
 
137
  with open(path_obj, encoding="utf-8") as f:
@@ -179,7 +192,11 @@ def notebook_read(
179
  data = output.get("data", {})
180
  if isinstance(data, dict) and "text/plain" in data:
181
  plain_data = data["text/plain"]
182
- text = "".join(str(t) for t in plain_data) if isinstance(plain_data, list) else str(plain_data)
 
 
 
 
183
  output_texts.append(f"[result]\n{text}")
184
  elif output.get("output_type") == "error":
185
  ename = str(output.get("ename", "Error"))
@@ -195,9 +212,12 @@ def notebook_read(
195
  if cell_index is None:
196
  result = f"Notebook: {path} ({len(notebook.get('cells', []))} cells)\n\n" + result
197
 
 
198
  return result
199
 
200
  except json.JSONDecodeError as e:
 
201
  return f"Error: Invalid notebook JSON: {e!s}"
202
  except Exception as e:
 
203
  return f"Error reading notebook: {e!s}"
 
7
  from pathlib import Path
8
  from typing import Annotated, Any, Literal
9
 
10
+ from loguru import logger
11
+
12
  from .base import tool
13
 
14
 
 
29
 
30
  For insert mode, cell_type is required.
31
  """
32
+ logger.info(f"notebook_edit: {edit_mode} cell at index {cell_index} in {path}")
33
  try:
34
  path_obj = Path(path).expanduser().resolve()
35
 
36
  if not path_obj.exists():
37
+ logger.warning(f"notebook_edit: notebook not found: {path}")
38
  return f"Error: Notebook not found: {path}"
39
 
40
  if not path_obj.suffix == ".ipynb":
41
+ logger.warning(f"notebook_edit: not a notebook file: {path}")
42
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
43
 
44
  # Read notebook
 
58
  with open(path_obj, "w", encoding="utf-8") as f:
59
  json.dump(notebook, f, indent=1)
60
 
61
+ logger.debug(f"notebook_edit: deleted {deleted_type} cell at index {cell_index}")
62
  return f"Successfully deleted {deleted_type} cell at index {cell_index}"
63
 
64
  elif edit_mode == "insert":
 
85
  with open(path_obj, "w", encoding="utf-8") as f:
86
  json.dump(notebook, f, indent=1)
87
 
88
+ logger.debug(f"notebook_edit: inserted {cell_type} cell at index {cell_index}")
89
  return f"Successfully inserted {cell_type} cell at index {cell_index}"
90
 
91
  else: # replace
 
114
  with open(path_obj, "w", encoding="utf-8") as f:
115
  json.dump(notebook, f, indent=1)
116
 
117
+ logger.debug(f"notebook_edit: replaced cell at index {cell_index}")
118
  return f"Successfully replaced cell at index {cell_index}"
119
 
120
  except json.JSONDecodeError as e:
121
+ logger.warning(f"notebook_edit: invalid JSON in {path}: {e}")
122
  return f"Error: Invalid notebook JSON: {e!s}"
123
  except Exception as e:
124
+ logger.warning(f"notebook_edit: error editing {path}: {e}")
125
  return f"Error editing notebook: {e!s}"
126
 
127
 
 
135
 
136
  Returns formatted cell contents with indices for easy reference.
137
  """
138
+ logger.debug(f"notebook_read: path={path}, cell_index={cell_index}, include_outputs={include_outputs}")
139
  try:
140
  path_obj = Path(path).expanduser().resolve()
141
 
142
  if not path_obj.exists():
143
+ logger.warning(f"notebook_read: notebook not found: {path}")
144
  return f"Error: Notebook not found: {path}"
145
 
146
  if not path_obj.suffix == ".ipynb":
147
+ logger.warning(f"notebook_read: not a notebook file: {path}")
148
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
149
 
150
  with open(path_obj, encoding="utf-8") as f:
 
192
  data = output.get("data", {})
193
  if isinstance(data, dict) and "text/plain" in data:
194
  plain_data = data["text/plain"]
195
+ text = (
196
+ "".join(str(t) for t in plain_data)
197
+ if isinstance(plain_data, list)
198
+ else str(plain_data)
199
+ )
200
  output_texts.append(f"[result]\n{text}")
201
  elif output.get("output_type") == "error":
202
  ename = str(output.get("ename", "Error"))
 
212
  if cell_index is None:
213
  result = f"Notebook: {path} ({len(notebook.get('cells', []))} cells)\n\n" + result
214
 
215
+ logger.debug(f"notebook_read: read {len(cells_list)} cells from {path}")
216
  return result
217
 
218
  except json.JSONDecodeError as e:
219
+ logger.warning(f"notebook_read: invalid JSON in {path}: {e}")
220
  return f"Error: Invalid notebook JSON: {e!s}"
221
  except Exception as e:
222
+ logger.warning(f"notebook_read: error reading {path}: {e}")
223
  return f"Error reading notebook: {e!s}"
src/flow/tools/planning.py CHANGED
@@ -6,6 +6,8 @@ Todos are persisted to the workspace's .flow/todos.json file.
6
 
7
  from typing import Annotated, Any
8
 
 
 
9
  from .base import tool
10
  from .workspace import Workspace, get_workspace
11
 
@@ -34,6 +36,7 @@ def think(
34
 
35
  Your thought is recorded in conversation history for reference.
36
  """
 
37
  # The thought is recorded in the tool result, becoming part of context
38
  return "Thought recorded."
39
 
@@ -42,7 +45,7 @@ def think(
42
  def todo_write(
43
  todos: Annotated[
44
  list[dict[str, Any]],
45
- "List of todo items. Each item needs: content (str), status ('pending'|'in_progress'|'completed'), activeForm (str describing current action)"
46
  ],
47
  ) -> str:
48
  """Create or update the task list for this session.
@@ -75,17 +78,18 @@ def todo_write(
75
  valid_statuses = {"pending", "in_progress", "completed"}
76
  for i, todo in enumerate(todos):
77
  if "content" not in todo:
78
- return f"Error: Todo {i+1} missing 'content'"
79
  if "status" not in todo:
80
- return f"Error: Todo {i+1} missing 'status'"
81
  if todo["status"] not in valid_statuses:
82
- return f"Error: Todo {i+1} has invalid status '{todo['status']}'. Must be: {valid_statuses}"
83
  if "activeForm" not in todo:
84
- return f"Error: Todo {i+1} missing 'activeForm'"
85
 
86
  # Check only one in_progress
87
  in_progress_count = sum(1 for t in todos if t["status"] == "in_progress")
88
  if in_progress_count > 1:
 
89
  return f"Error: {in_progress_count} tasks marked 'in_progress'. Only one task should be in progress at a time."
90
 
91
  # Save to workspace
@@ -100,6 +104,7 @@ def todo_write(
100
  current = next((t for t in todos if t["status"] == "in_progress"), None)
101
  current_msg = f"Current: {current['activeForm']}" if current else "No task in progress"
102
 
 
103
  return f"Todo list updated: {completed} completed, {in_progress} in progress, {pending} pending. {current_msg}"
104
 
105
 
@@ -110,10 +115,12 @@ def todo_read() -> str:
110
  Returns the current state of all tasks with their status.
111
  Todos are loaded from {workspace}/.flow/todos.json
112
  """
 
113
  ws = _get_workspace()
114
  todos = ws.load_todos()
115
 
116
  if not todos:
 
117
  return "No todos. Use todo_write to create a task list."
118
 
119
  lines: list[str] = []
@@ -133,6 +140,7 @@ def todo_read() -> str:
133
  completed = sum(1 for t in todos if t["status"] == "completed")
134
  total = len(todos)
135
 
 
136
  return f"Progress: {completed}/{total}\n\n" + "\n".join(lines)
137
 
138
 
 
6
 
7
  from typing import Annotated, Any
8
 
9
+ from loguru import logger
10
+
11
  from .base import tool
12
  from .workspace import Workspace, get_workspace
13
 
 
36
 
37
  Your thought is recorded in conversation history for reference.
38
  """
39
+ logger.debug(f"think: recording thought ({len(thought)} chars)")
40
  # The thought is recorded in the tool result, becoming part of context
41
  return "Thought recorded."
42
 
 
45
  def todo_write(
46
  todos: Annotated[
47
  list[dict[str, Any]],
48
+ "List of todo items. Each item needs: content (str), status ('pending'|'in_progress'|'completed'), activeForm (str describing current action)",
49
  ],
50
  ) -> str:
51
  """Create or update the task list for this session.
 
78
  valid_statuses = {"pending", "in_progress", "completed"}
79
  for i, todo in enumerate(todos):
80
  if "content" not in todo:
81
+ return f"Error: Todo {i + 1} missing 'content'"
82
  if "status" not in todo:
83
+ return f"Error: Todo {i + 1} missing 'status'"
84
  if todo["status"] not in valid_statuses:
85
+ return f"Error: Todo {i + 1} has invalid status '{todo['status']}'. Must be: {valid_statuses}"
86
  if "activeForm" not in todo:
87
+ return f"Error: Todo {i + 1} missing 'activeForm'"
88
 
89
  # Check only one in_progress
90
  in_progress_count = sum(1 for t in todos if t["status"] == "in_progress")
91
  if in_progress_count > 1:
92
+ logger.warning(f"todo_write: {in_progress_count} tasks marked in_progress, should be 1")
93
  return f"Error: {in_progress_count} tasks marked 'in_progress'. Only one task should be in progress at a time."
94
 
95
  # Save to workspace
 
104
  current = next((t for t in todos if t["status"] == "in_progress"), None)
105
  current_msg = f"Current: {current['activeForm']}" if current else "No task in progress"
106
 
107
+ logger.info(f"todo_write: updated ({completed} completed, {in_progress} in progress, {pending} pending)")
108
  return f"Todo list updated: {completed} completed, {in_progress} in progress, {pending} pending. {current_msg}"
109
 
110
 
 
115
  Returns the current state of all tasks with their status.
116
  Todos are loaded from {workspace}/.flow/todos.json
117
  """
118
+ logger.debug("todo_read: loading todos")
119
  ws = _get_workspace()
120
  todos = ws.load_todos()
121
 
122
  if not todos:
123
+ logger.debug("todo_read: no todos found")
124
  return "No todos. Use todo_write to create a task list."
125
 
126
  lines: list[str] = []
 
140
  completed = sum(1 for t in todos if t["status"] == "completed")
141
  total = len(todos)
142
 
143
+ logger.debug(f"todo_read: loaded {total} todos ({completed} completed)")
144
  return f"Progress: {completed}/{total}\n\n" + "\n".join(lines)
145
 
146
 
src/flow/tools/skills.py CHANGED
@@ -26,7 +26,9 @@ Usage:
26
 
27
  import re
28
  from pathlib import Path
29
- from typing import Annotated, Literal
 
 
30
 
31
  from .base import Tool, tool
32
 
@@ -88,8 +90,9 @@ def _discover_skills(skills_paths: list[Path]) -> dict[str, tuple[Path, dict[str
88
  meta = _parse_frontmatter(content)
89
  skill_name = meta.get("name", item.name)
90
  skills[skill_name] = (skill_md, meta)
91
- except Exception:
92
- # Skip broken skills
 
93
  skills[item.name] = (
94
  skill_md,
95
  {"name": item.name, "description": "Error reading skill"},
@@ -113,6 +116,7 @@ def create_skills_tool(
113
  builtin_path: Path | None = None,
114
  user_path: Path | None = None,
115
  project_path: Path | None = None,
 
116
  ) -> Tool:
117
  """Create a skills tool for discovering and loading domain expertise.
118
 
@@ -120,6 +124,8 @@ def create_skills_tool(
120
  builtin_path: Path to built-in skills (shipped with package)
121
  user_path: Path to user skills (defaults to ~/.flow/skills/)
122
  project_path: Path to project-local skills (highest priority)
 
 
123
 
124
  Returns:
125
  A Tool that can be added to an agent's tool list
@@ -138,7 +144,7 @@ def create_skills_tool(
138
  # Built-in skills
139
  if builtin_path:
140
  all_paths.append(builtin_path)
141
- else:
142
  default_builtin = _get_builtin_skills_path()
143
  if default_builtin.exists():
144
  all_paths.append(default_builtin)
@@ -146,7 +152,7 @@ def create_skills_tool(
146
  # User skills
147
  if user_path:
148
  all_paths.append(user_path)
149
- else:
150
  default_user = _get_user_skills_path()
151
  if default_user.exists():
152
  all_paths.append(default_user)
@@ -177,6 +183,7 @@ def create_skills_tool(
177
  discovered = _discover_skills(all_paths)
178
 
179
  if action == "list":
 
180
  if not discovered:
181
  paths_str = "\n".join(f" - {p}" for p in all_paths) if all_paths else " (no paths configured)"
182
  return (
@@ -210,6 +217,7 @@ def create_skills_tool(
210
  return "Error: 'name' parameter is required for 'load' action."
211
 
212
  if name not in discovered:
 
213
  available = sorted(discovered.keys())
214
  msg = f"Skill '{name}' not found."
215
  if available:
@@ -223,8 +231,10 @@ def create_skills_tool(
223
 
224
  # Return full content (body only, frontmatter already parsed)
225
  body = _get_skill_body(content)
 
226
  return f"# Skill: {skill_name}\n\n{body}"
227
  except Exception as e:
 
228
  return f"Error loading skill '{name}': {e}"
229
 
230
  else:
@@ -233,5 +243,61 @@ def create_skills_tool(
233
  return skills
234
 
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  # Default skills tool (includes built-in skills from Flow)
237
  skills = create_skills_tool()
 
26
 
27
  import re
28
  from pathlib import Path
29
+ from typing import Annotated, Any, Literal
30
+
31
+ from loguru import logger
32
 
33
  from .base import Tool, tool
34
 
 
90
  meta = _parse_frontmatter(content)
91
  skill_name = meta.get("name", item.name)
92
  skills[skill_name] = (skill_md, meta)
93
+ except Exception as e:
94
+ # Skip broken skills but log the error
95
+ logger.debug(f"Failed to load skill '{item.name}': {e}")
96
  skills[item.name] = (
97
  skill_md,
98
  {"name": item.name, "description": "Error reading skill"},
 
116
  builtin_path: Path | None = None,
117
  user_path: Path | None = None,
118
  project_path: Path | None = None,
119
+ exclusive: bool = False,
120
  ) -> Tool:
121
  """Create a skills tool for discovering and loading domain expertise.
122
 
 
124
  builtin_path: Path to built-in skills (shipped with package)
125
  user_path: Path to user skills (defaults to ~/.flow/skills/)
126
  project_path: Path to project-local skills (highest priority)
127
+ exclusive: If True, only use explicitly provided paths (no defaults).
128
+ Useful for optimization where you want a clean slate.
129
 
130
  Returns:
131
  A Tool that can be added to an agent's tool list
 
144
  # Built-in skills
145
  if builtin_path:
146
  all_paths.append(builtin_path)
147
+ elif not exclusive:
148
  default_builtin = _get_builtin_skills_path()
149
  if default_builtin.exists():
150
  all_paths.append(default_builtin)
 
152
  # User skills
153
  if user_path:
154
  all_paths.append(user_path)
155
+ elif not exclusive:
156
  default_user = _get_user_skills_path()
157
  if default_user.exists():
158
  all_paths.append(default_user)
 
183
  discovered = _discover_skills(all_paths)
184
 
185
  if action == "list":
186
+ logger.debug(f"skills: listing {len(discovered)} discovered skills")
187
  if not discovered:
188
  paths_str = "\n".join(f" - {p}" for p in all_paths) if all_paths else " (no paths configured)"
189
  return (
 
217
  return "Error: 'name' parameter is required for 'load' action."
218
 
219
  if name not in discovered:
220
+ logger.warning(f"skills: skill '{name}' not found")
221
  available = sorted(discovered.keys())
222
  msg = f"Skill '{name}' not found."
223
  if available:
 
231
 
232
  # Return full content (body only, frontmatter already parsed)
233
  body = _get_skill_body(content)
234
+ logger.info(f"skills: loaded skill '{skill_name}' ({len(body)} chars)")
235
  return f"# Skill: {skill_name}\n\n{body}"
236
  except Exception as e:
237
+ logger.warning(f"skills: error loading skill '{name}': {e}")
238
  return f"Error loading skill '{name}': {e}"
239
 
240
  else:
 
243
  return skills
244
 
245
 
246
+ def discover_skills_from_tools_spec(
247
+ tools_spec: dict[str, dict[str, Any]],
248
+ ) -> dict[str, dict[str, str]]:
249
+ """Discover available skills based on a resolved tools specification.
250
+
251
+ Extracts skill metadata (name, description, triggers) from the same
252
+ paths that the skills tool would use. Returns a lightweight dict
253
+ suitable for injecting into the system prompt.
254
+
255
+ Args:
256
+ tools_spec: Resolved tools specification dict (from resolve_tools()).
257
+ Looks for a ``"skills"`` key with optional ``skills_path``,
258
+ ``additional_paths``, or ``exclusive`` config.
259
+
260
+ Returns:
261
+ Dict mapping skill_name -> frontmatter metadata dict
262
+ (keys: ``name``, ``description``, ``triggers``).
263
+ Returns empty dict if no skills tool is configured.
264
+ """
265
+ if "skills" not in tools_spec:
266
+ return {}
267
+
268
+ config = tools_spec.get("skills", {})
269
+ exclusive = bool(config.get("skills_path")) # skills_path implies exclusive
270
+
271
+ # Build the same paths that create_skills_tool would use
272
+ all_paths: list[Path] = []
273
+
274
+ if config.get("skills_path"):
275
+ all_paths.append(Path(config["skills_path"]))
276
+ elif not exclusive:
277
+ default_builtin = _get_builtin_skills_path()
278
+ if default_builtin.exists():
279
+ all_paths.append(default_builtin)
280
+
281
+ if not exclusive:
282
+ default_user = _get_user_skills_path()
283
+ if default_user.exists():
284
+ all_paths.append(default_user)
285
+
286
+ if config.get("additional_paths"):
287
+ for p in config["additional_paths"]:
288
+ all_paths.append(Path(p))
289
+
290
+ if config.get("project_path"):
291
+ all_paths.append(Path(config["project_path"]))
292
+
293
+ discovered = _discover_skills(all_paths)
294
+
295
+ # Return metadata only (strip file paths)
296
+ return {
297
+ skill_name: dict(meta)
298
+ for skill_name, (_, meta) in discovered.items()
299
+ }
300
+
301
+
302
  # Default skills tool (includes built-in skills from Flow)
303
  skills = create_skills_tool()
src/flow/tools/subagent.py CHANGED
@@ -16,6 +16,8 @@ from __future__ import annotations
16
 
17
  from typing import TYPE_CHECKING, Annotated, Literal
18
 
 
 
19
  from .base import Tool, tool
20
 
21
  if TYPE_CHECKING:
@@ -118,7 +120,7 @@ def create_task_tool(
118
  description: Annotated[str, "Short 3-5 word summary of what the sub-agent will do"],
119
  agent_type: Annotated[
120
  Literal["explore", "research", "general"],
121
- "Type of sub-agent: 'explore' for codebase search, 'research' for web research, 'general' for other tasks"
122
  ] = "general",
123
  ) -> str:
124
  """Launch a sub-agent to handle a complex task in isolated context.
@@ -139,12 +141,16 @@ def create_task_tool(
139
  - research: Web research (web_search, web_fetch)
140
  - general: All tools available to you
141
  """
 
 
 
142
  # Lazy imports to avoid circular dependencies
143
  try:
144
  from flow.harness.miniagent.agent import ChatAgent
145
  from flow.harness.miniagent.client import ChatClient, ClientConfig
146
  from flow.harness.miniagent.context import HeadTailStrategy
147
  except ImportError:
 
148
  return "Error: MiniAgent harness not available. Install flow with miniagent extras."
149
 
150
  # Get agent type config
@@ -222,9 +228,13 @@ def create_task_tool(
222
  f"{response.usage.tool_calls} tool calls]"
223
  )
224
 
 
 
 
225
  return result + usage_info
226
 
227
  except Exception as e:
 
228
  return f"Sub-agent failed: {e!s}"
229
 
230
  return task
 
16
 
17
  from typing import TYPE_CHECKING, Annotated, Literal
18
 
19
+ from loguru import logger
20
+
21
  from .base import Tool, tool
22
 
23
  if TYPE_CHECKING:
 
120
  description: Annotated[str, "Short 3-5 word summary of what the sub-agent will do"],
121
  agent_type: Annotated[
122
  Literal["explore", "research", "general"],
123
+ "Type of sub-agent: 'explore' for codebase search, 'research' for web research, 'general' for other tasks",
124
  ] = "general",
125
  ) -> str:
126
  """Launch a sub-agent to handle a complex task in isolated context.
 
141
  - research: Web research (web_search, web_fetch)
142
  - general: All tools available to you
143
  """
144
+ logger.info(f"task: spawning {agent_type} sub-agent for '{description}'")
145
+ logger.debug(f"task: prompt preview={prompt[:200] if len(prompt) > 200 else prompt}")
146
+
147
  # Lazy imports to avoid circular dependencies
148
  try:
149
  from flow.harness.miniagent.agent import ChatAgent
150
  from flow.harness.miniagent.client import ChatClient, ClientConfig
151
  from flow.harness.miniagent.context import HeadTailStrategy
152
  except ImportError:
153
+ logger.error("task: MiniAgent harness not available")
154
  return "Error: MiniAgent harness not available. Install flow with miniagent extras."
155
 
156
  # Get agent type config
 
228
  f"{response.usage.tool_calls} tool calls]"
229
  )
230
 
231
+ logger.debug(
232
+ f"task: sub-agent completed - {response.iterations} iterations, {response.usage.tool_calls} tool calls"
233
+ )
234
  return result + usage_info
235
 
236
  except Exception as e:
237
+ logger.warning(f"task: sub-agent failed: {e}")
238
  return f"Sub-agent failed: {e!s}"
239
 
240
  return task
src/flow/tools/text_inspector_qa.py CHANGED
@@ -191,4 +191,7 @@ def text_inspector(
191
  """
192
  logger.info("Inspecting file at path: {}", file_path)
193
  ti_tool = TextInspectorTool()
194
- return ti_tool.forward(file_path=file_path, question=question)
 
 
 
 
191
  """
192
  logger.info("Inspecting file at path: {}", file_path)
193
  ti_tool = TextInspectorTool()
194
+ output = ti_tool.forward(file_path=file_path, question=question)
195
+ logger.debug("Text inspector output length: {}", len(output))
196
+ logger.debug("Text inspector output first 200 chars: {}", output[:200] if len(output) > 200 else output)
197
+ return output
src/flow/tools/web.py CHANGED
@@ -7,6 +7,8 @@ import os
7
  from typing import Annotated
8
  from urllib.parse import urlparse
9
 
 
 
10
  from .base import tool
11
 
12
 
@@ -20,10 +22,12 @@ def web_search(
20
  Requires GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables.
21
  Returns a list of search results with titles, URLs, and snippets.
22
  """
 
23
  api_key = os.environ.get("GOOGLE_API_KEY")
24
  cse_id = os.environ.get("GOOGLE_CSE_ID")
25
 
26
  if not api_key or not cse_id:
 
27
  return (
28
  "Error: Web search requires GOOGLE_API_KEY and GOOGLE_CSE_ID "
29
  "environment variables to be set."
@@ -32,6 +36,7 @@ def web_search(
32
  try:
33
  import httpx
34
  except ImportError:
 
35
  return "Error: httpx package required. Install with: pip install httpx"
36
 
37
  try:
@@ -51,6 +56,7 @@ def web_search(
51
  items = data.get("items", [])
52
 
53
  if not items:
 
54
  return f"No results found for: {query}"
55
 
56
  results: list[str] = []
@@ -60,9 +66,11 @@ def web_search(
60
  snippet = item.get("snippet", "No description")
61
  results.append(f"{i}. {title}\n {link}\n {snippet}")
62
 
 
63
  return "\n\n".join(results)
64
 
65
  except Exception as e:
 
66
  return f"Error performing search: {e!s}"
67
 
68
 
@@ -77,6 +85,7 @@ def web_fetch(
77
  Returns the page content in the specified format.
78
  Useful for reading documentation, articles, and web pages.
79
  """
 
80
  # Validate URL
81
  try:
82
  parsed = urlparse(url)
@@ -84,13 +93,16 @@ def web_fetch(
84
  url = "https://" + url
85
  parsed = urlparse(url)
86
  if not parsed.netloc:
 
87
  return f"Error: Invalid URL: {url}"
88
- except Exception:
 
89
  return f"Error: Invalid URL format: {url}"
90
 
91
  try:
92
  import httpx
93
  except ImportError:
 
94
  return "Error: httpx package required. Install with: pip install httpx"
95
 
96
  try:
@@ -139,7 +151,9 @@ def web_fetch(
139
  if len(content) > max_length:
140
  content = content[:max_length] + "\n\n[Content truncated...]"
141
 
 
142
  return content
143
 
144
  except Exception as e:
 
145
  return f"Error fetching URL: {e!s}"
 
7
  from typing import Annotated
8
  from urllib.parse import urlparse
9
 
10
+ from loguru import logger
11
+
12
  from .base import tool
13
 
14
 
 
22
  Requires GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables.
23
  Returns a list of search results with titles, URLs, and snippets.
24
  """
25
+ logger.info(f"web_search: searching for '{query}' (num_results={num_results})")
26
  api_key = os.environ.get("GOOGLE_API_KEY")
27
  cse_id = os.environ.get("GOOGLE_CSE_ID")
28
 
29
  if not api_key or not cse_id:
30
+ logger.warning("web_search: missing API credentials")
31
  return (
32
  "Error: Web search requires GOOGLE_API_KEY and GOOGLE_CSE_ID "
33
  "environment variables to be set."
 
36
  try:
37
  import httpx
38
  except ImportError:
39
+ logger.warning("web_search: httpx package not installed")
40
  return "Error: httpx package required. Install with: pip install httpx"
41
 
42
  try:
 
56
  items = data.get("items", [])
57
 
58
  if not items:
59
+ logger.debug(f"web_search: no results for '{query}'")
60
  return f"No results found for: {query}"
61
 
62
  results: list[str] = []
 
66
  snippet = item.get("snippet", "No description")
67
  results.append(f"{i}. {title}\n {link}\n {snippet}")
68
 
69
+ logger.debug(f"web_search: found {len(items)} results for '{query}'")
70
  return "\n\n".join(results)
71
 
72
  except Exception as e:
73
+ logger.warning(f"web_search: error searching: {e}")
74
  return f"Error performing search: {e!s}"
75
 
76
 
 
85
  Returns the page content in the specified format.
86
  Useful for reading documentation, articles, and web pages.
87
  """
88
+ logger.info(f"web_fetch: fetching {url}")
89
  # Validate URL
90
  try:
91
  parsed = urlparse(url)
 
93
  url = "https://" + url
94
  parsed = urlparse(url)
95
  if not parsed.netloc:
96
+ logger.warning(f"web_fetch: invalid URL: {url}")
97
  return f"Error: Invalid URL: {url}"
98
+ except Exception as e:
99
+ logger.warning(f"web_fetch: invalid URL format: {url}: {e}")
100
  return f"Error: Invalid URL format: {url}"
101
 
102
  try:
103
  import httpx
104
  except ImportError:
105
+ logger.warning("web_fetch: httpx package not installed")
106
  return "Error: httpx package required. Install with: pip install httpx"
107
 
108
  try:
 
151
  if len(content) > max_length:
152
  content = content[:max_length] + "\n\n[Content truncated...]"
153
 
154
+ logger.debug(f"web_fetch: fetched {len(content)} chars from {url}")
155
  return content
156
 
157
  except Exception as e:
158
+ logger.warning(f"web_fetch: error fetching {url}: {e}")
159
  return f"Error fetching URL: {e!s}"
src/flow/tools/workspace.py CHANGED
@@ -36,6 +36,8 @@ import json
36
  from pathlib import Path
37
  from typing import Any
38
 
 
 
39
 
40
  class Workspace:
41
  """Manages workspace paths and agent data storage.
@@ -53,6 +55,7 @@ class Workspace:
53
  if root is None:
54
  root = Path.cwd()
55
  self._root = Path(root).resolve()
 
56
 
57
  @property
58
  def root(self) -> Path:
@@ -97,8 +100,11 @@ class Workspace:
97
  return []
98
  try:
99
  with open(self.todos_file) as f:
100
- return json.load(f) # type: ignore[no-any-return]
101
- except (OSError, json.JSONDecodeError):
 
 
 
102
  return []
103
 
104
  def save_todos(self, todos: list[dict[str, Any]]) -> None:
@@ -106,6 +112,7 @@ class Workspace:
106
  self.ensure_data_dir()
107
  with open(self.todos_file, "w") as f:
108
  json.dump(todos, f, indent=2)
 
109
 
110
  # --- Memory ---
111
 
@@ -119,8 +126,10 @@ class Workspace:
119
  try:
120
  with open(filepath) as f:
121
  memories.append(json.load(f))
122
- except (OSError, json.JSONDecodeError):
 
123
  continue
 
124
  return memories
125
 
126
  def load_memory(self, memory_id: str) -> dict[str, Any] | None:
@@ -131,7 +140,8 @@ class Workspace:
131
  try:
132
  with open(filepath) as f:
133
  return json.load(f) # type: ignore[no-any-return]
134
- except (OSError, json.JSONDecodeError):
 
135
  return None
136
 
137
  def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
@@ -140,12 +150,14 @@ class Workspace:
140
  filepath = self.memory_dir / f"{memory_id}.json"
141
  with open(filepath, "w") as f:
142
  json.dump(data, f, indent=2, default=str)
 
143
 
144
  def delete_memory(self, memory_id: str) -> bool:
145
  """Delete a memory entry. Returns True if deleted."""
146
  filepath = self.memory_dir / f"{memory_id}.json"
147
  if filepath.exists():
148
  filepath.unlink()
 
149
  return True
150
  return False
151
 
@@ -158,7 +170,8 @@ class Workspace:
158
  try:
159
  with open(self.config_file) as f:
160
  return json.load(f)
161
- except (OSError, json.JSONDecodeError):
 
162
  return {}
163
 
164
  def save_config(self, config: dict[str, Any]) -> None:
 
36
  from pathlib import Path
37
  from typing import Any
38
 
39
+ from loguru import logger
40
+
41
 
42
  class Workspace:
43
  """Manages workspace paths and agent data storage.
 
55
  if root is None:
56
  root = Path.cwd()
57
  self._root = Path(root).resolve()
58
+ logger.debug(f"Workspace initialized at {self._root}")
59
 
60
  @property
61
  def root(self) -> Path:
 
100
  return []
101
  try:
102
  with open(self.todos_file) as f:
103
+ todos = json.load(f)
104
+ logger.debug(f"Loaded {len(todos)} todos from {self.todos_file}")
105
+ return todos # type: ignore[no-any-return]
106
+ except (OSError, json.JSONDecodeError) as e:
107
+ logger.warning(f"Failed to load todos: {e}")
108
  return []
109
 
110
  def save_todos(self, todos: list[dict[str, Any]]) -> None:
 
112
  self.ensure_data_dir()
113
  with open(self.todos_file, "w") as f:
114
  json.dump(todos, f, indent=2)
115
+ logger.debug(f"Saved {len(todos)} todos to {self.todos_file}")
116
 
117
  # --- Memory ---
118
 
 
126
  try:
127
  with open(filepath) as f:
128
  memories.append(json.load(f))
129
+ except (OSError, json.JSONDecodeError) as e:
130
+ logger.debug(f"Failed to load memory {filepath}: {e}")
131
  continue
132
+ logger.debug(f"Listed {len(memories)} memories from {self.memory_dir}")
133
  return memories
134
 
135
  def load_memory(self, memory_id: str) -> dict[str, Any] | None:
 
140
  try:
141
  with open(filepath) as f:
142
  return json.load(f) # type: ignore[no-any-return]
143
+ except (OSError, json.JSONDecodeError) as e:
144
+ logger.debug(f"Failed to load memory '{memory_id}': {e}")
145
  return None
146
 
147
  def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
 
150
  filepath = self.memory_dir / f"{memory_id}.json"
151
  with open(filepath, "w") as f:
152
  json.dump(data, f, indent=2, default=str)
153
+ logger.debug(f"Saved memory '{memory_id}' to {filepath}")
154
 
155
  def delete_memory(self, memory_id: str) -> bool:
156
  """Delete a memory entry. Returns True if deleted."""
157
  filepath = self.memory_dir / f"{memory_id}.json"
158
  if filepath.exists():
159
  filepath.unlink()
160
+ logger.debug(f"Deleted memory '{memory_id}'")
161
  return True
162
  return False
163
 
 
170
  try:
171
  with open(self.config_file) as f:
172
  return json.load(f)
173
+ except (OSError, json.JSONDecodeError) as e:
174
+ logger.debug(f"Failed to load config: {e}")
175
  return {}
176
 
177
  def save_config(self, config: dict[str, Any]) -> None:
src/flow/ui/api/__init__.py CHANGED
@@ -2,6 +2,7 @@
2
  """API routes package."""
3
 
4
  from .configs import router as configs_router
 
5
  from .evaluate import router as evaluate_router
6
  from .experiment import router as experiment_router
7
  from .jobs import router as jobs_router
@@ -14,6 +15,7 @@ from .tools import router as tools_router
14
 
15
  __all__ = [
16
  "configs_router",
 
17
  "evaluate_router",
18
  "experiment_router",
19
  "jobs_router",
 
2
  """API routes package."""
3
 
4
  from .configs import router as configs_router
5
+ from .deployments import router as deployments_router
6
  from .evaluate import router as evaluate_router
7
  from .experiment import router as experiment_router
8
  from .jobs import router as jobs_router
 
15
 
16
  __all__ = [
17
  "configs_router",
18
+ "deployments_router",
19
  "evaluate_router",
20
  "experiment_router",
21
  "jobs_router",
src/flow/ui/api/deployments.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Deployment API routes."""
3
+
4
+ from typing import Annotated
5
+ from uuid import UUID
6
+
7
+ from fastapi import APIRouter, Depends, HTTPException
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+ from sqlmodel import desc, or_, select
10
+
11
+ from ..auth import TokenData, get_current_user, get_effective_user_id, should_filter_by_user
12
+ from ..database import get_session
13
+ from ..models.deployment import Deployment, DeploymentVersion
14
+ from ..schemas.deployment import DeploymentDetailResponse, DeploymentResponse, DeploymentVersionResponse
15
+
16
+ router = APIRouter(prefix="/deployments", tags=["deployments"])
17
+
18
+
19
+ def _parse_uuid(id_str: str) -> UUID:
20
+ """Parse a string to UUID, raising 400 if invalid."""
21
+ try:
22
+ return UUID(id_str)
23
+ except ValueError as e:
24
+ raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from e
25
+
26
+
27
+ @router.get("", response_model=list[DeploymentResponse])
28
+ async def list_deployments(
29
+ session: AsyncSession = Depends(get_session),
30
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
31
+ ) -> list[Deployment]:
32
+ """List all deployments."""
33
+ query = select(Deployment)
34
+
35
+ if should_filter_by_user():
36
+ effective_user_id = get_effective_user_id(user)
37
+ query = query.where(
38
+ or_(
39
+ Deployment.user_id == effective_user_id,
40
+ Deployment.is_public == True, # noqa: E712
41
+ )
42
+ )
43
+
44
+ query = query.order_by(desc(Deployment.updated_at))
45
+ result = await session.execute(query)
46
+ return list(result.scalars().all())
47
+
48
+
49
+ @router.get("/{deployment_id}", response_model=DeploymentDetailResponse)
50
+ async def get_deployment(
51
+ deployment_id: str,
52
+ session: AsyncSession = Depends(get_session),
53
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
54
+ ) -> dict:
55
+ """Get a deployment with its version history."""
56
+ uuid_id = _parse_uuid(deployment_id)
57
+ query = select(Deployment).where(Deployment.id == uuid_id)
58
+
59
+ if should_filter_by_user():
60
+ effective_user_id = get_effective_user_id(user)
61
+ query = query.where(
62
+ or_(
63
+ Deployment.is_public == True, # noqa: E712
64
+ Deployment.user_id == effective_user_id,
65
+ )
66
+ )
67
+
68
+ result = await session.execute(query)
69
+ deployment = result.scalar_one_or_none()
70
+ if not deployment:
71
+ raise HTTPException(status_code=404, detail="Deployment not found")
72
+
73
+ # Fetch versions
74
+ versions_result = await session.execute(
75
+ select(DeploymentVersion)
76
+ .where(DeploymentVersion.deployment_id == uuid_id)
77
+ .order_by(desc(DeploymentVersion.version))
78
+ )
79
+ versions = list(versions_result.scalars().all())
80
+
81
+ return {
82
+ **deployment.__dict__,
83
+ "versions": versions,
84
+ }
85
+
86
+
87
+ @router.get("/{deployment_id}/versions", response_model=list[DeploymentVersionResponse])
88
+ async def list_versions(
89
+ deployment_id: str,
90
+ session: AsyncSession = Depends(get_session),
91
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
92
+ ) -> list[DeploymentVersion]:
93
+ """List all versions of a deployment."""
94
+ uuid_id = _parse_uuid(deployment_id)
95
+
96
+ # Verify deployment exists and user has access
97
+ dep_query = select(Deployment).where(Deployment.id == uuid_id)
98
+ if should_filter_by_user():
99
+ effective_user_id = get_effective_user_id(user)
100
+ dep_query = dep_query.where(
101
+ or_(
102
+ Deployment.is_public == True, # noqa: E712
103
+ Deployment.user_id == effective_user_id,
104
+ )
105
+ )
106
+ dep_result = await session.execute(dep_query)
107
+ if not dep_result.scalar_one_or_none():
108
+ raise HTTPException(status_code=404, detail="Deployment not found")
109
+
110
+ result = await session.execute(
111
+ select(DeploymentVersion)
112
+ .where(DeploymentVersion.deployment_id == uuid_id)
113
+ .order_by(desc(DeploymentVersion.version))
114
+ )
115
+ return list(result.scalars().all())
116
+
117
+
118
+ @router.delete("/{deployment_id}", status_code=204)
119
+ async def delete_deployment(
120
+ deployment_id: str,
121
+ session: AsyncSession = Depends(get_session),
122
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
123
+ ) -> None:
124
+ """Delete a deployment and all its versions."""
125
+ uuid_id = _parse_uuid(deployment_id)
126
+ query = select(Deployment).where(Deployment.id == uuid_id)
127
+
128
+ if should_filter_by_user():
129
+ effective_user_id = get_effective_user_id(user)
130
+ query = query.where(Deployment.user_id == effective_user_id)
131
+
132
+ result = await session.execute(query)
133
+ deployment = result.scalar_one_or_none()
134
+ if not deployment:
135
+ raise HTTPException(status_code=404, detail="Deployment not found")
136
+
137
+ # Delete versions first
138
+ versions_result = await session.execute(
139
+ select(DeploymentVersion).where(DeploymentVersion.deployment_id == uuid_id)
140
+ )
141
+ for version in versions_result.scalars().all():
142
+ await session.delete(version)
143
+
144
+ await session.delete(deployment)
145
+ await session.commit()
src/flow/ui/api/experiment.py CHANGED
@@ -94,7 +94,6 @@ async def design_experiment(
94
  Returns the YAML content and candidate count for preview.
95
  """
96
  # Look up base agent to get its path/name
97
- from uuid import UUID
98
  try:
99
  agent_uuid = UUID(data.base_agent_id)
100
  except ValueError as e:
 
94
  Returns the YAML content and candidate count for preview.
95
  """
96
  # Look up base agent to get its path/name
 
97
  try:
98
  agent_uuid = UUID(data.base_agent_id)
99
  except ValueError as e:
src/flow/ui/api/jobs.py CHANGED
@@ -7,7 +7,7 @@ from collections.abc import AsyncGenerator
7
  from typing import Annotated, Any
8
  from uuid import UUID
9
 
10
- from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
11
  from fastapi.responses import StreamingResponse
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
  from sqlmodel import desc, or_, select
@@ -80,15 +80,44 @@ async def create_job(
80
  """Create a new optimization job."""
81
  effective_user_id = get_effective_user_id(user)
82
 
83
- # Validate candidate_ids exist AND belong to user
84
- for candidate_id in data.candidate_ids:
85
- uuid_id = parse_uuid(candidate_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  query = select(AgentConfig).where(AgentConfig.id == uuid_id)
87
  if should_filter_by_user():
88
  query = query.where(AgentConfig.user_id == effective_user_id)
89
  result = await session.execute(query)
90
  if not result.scalar_one_or_none():
91
- raise HTTPException(status_code=400, detail=f"Candidate {candidate_id} not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Validate task_ids exist AND are accessible (shared or user's own)
94
  for task_id in data.task_ids:
@@ -114,9 +143,14 @@ async def create_job(
114
  name=data.name,
115
  candidate_ids=data.candidate_ids,
116
  task_ids=data.task_ids,
 
 
 
117
  parallel=data.parallel,
118
  use_llm_eval=data.use_llm_eval,
119
- total_experiments=len(data.candidate_ids) * len(data.task_ids),
 
 
120
  user_id=effective_user_id,
121
  created_by_name=created_by_name,
122
  )
@@ -223,7 +257,6 @@ async def _run_job_background(job_id: str) -> None:
223
  @router.post("/{job_id}/start")
224
  async def start_job(
225
  job_id: str,
226
- background_tasks: BackgroundTasks,
227
  session: AsyncSession = Depends(get_session),
228
  user: Annotated[TokenData | None, Depends(get_current_user)] = None,
229
  ) -> StreamingResponse:
 
7
  from typing import Annotated, Any
8
  from uuid import UUID
9
 
10
+ from fastapi import APIRouter, Depends, HTTPException
11
  from fastapi.responses import StreamingResponse
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
  from sqlmodel import desc, or_, select
 
80
  """Create a new optimization job."""
81
  effective_user_id = get_effective_user_id(user)
82
 
83
+ is_strategy_mode = len(data.strategies) > 0
84
+
85
+ if is_strategy_mode:
86
+ # Strategy mode: validate strategy names and base_agent_id
87
+ if not data.base_agent_id:
88
+ raise HTTPException(status_code=400, detail="base_agent_id is required for strategy mode")
89
+
90
+ from flow.experiments.strategies import get_registered_strategies
91
+
92
+ available = list(get_registered_strategies().keys())
93
+ for sname in data.strategies:
94
+ if sname not in available:
95
+ raise HTTPException(
96
+ status_code=400,
97
+ detail=f"Unknown strategy: {sname}. Available: {available}",
98
+ )
99
+
100
+ # Validate base agent exists and belongs to user
101
+ uuid_id = parse_uuid(data.base_agent_id)
102
  query = select(AgentConfig).where(AgentConfig.id == uuid_id)
103
  if should_filter_by_user():
104
  query = query.where(AgentConfig.user_id == effective_user_id)
105
  result = await session.execute(query)
106
  if not result.scalar_one_or_none():
107
+ raise HTTPException(status_code=400, detail=f"Base agent {data.base_agent_id} not found")
108
+ else:
109
+ # Grid mode: validate candidate_ids exist AND belong to user
110
+ if not data.candidate_ids:
111
+ raise HTTPException(status_code=400, detail="candidate_ids required for grid mode (or use strategy mode)")
112
+
113
+ for candidate_id in data.candidate_ids:
114
+ uuid_id = parse_uuid(candidate_id)
115
+ query = select(AgentConfig).where(AgentConfig.id == uuid_id)
116
+ if should_filter_by_user():
117
+ query = query.where(AgentConfig.user_id == effective_user_id)
118
+ result = await session.execute(query)
119
+ if not result.scalar_one_or_none():
120
+ raise HTTPException(status_code=400, detail=f"Candidate {candidate_id} not found")
121
 
122
  # Validate task_ids exist AND are accessible (shared or user's own)
123
  for task_id in data.task_ids:
 
143
  name=data.name,
144
  candidate_ids=data.candidate_ids,
145
  task_ids=data.task_ids,
146
+ strategies=data.strategies,
147
+ strategy_config=data.strategy_config,
148
+ base_agent_id=data.base_agent_id,
149
  parallel=data.parallel,
150
  use_llm_eval=data.use_llm_eval,
151
+ # Strategy mode: estimate based on tasks (will be updated by progress callback)
152
+ # Grid mode: exact count = candidates × tasks
153
+ total_experiments=len(data.task_ids) if is_strategy_mode else len(data.candidate_ids) * len(data.task_ids),
154
  user_id=effective_user_id,
155
  created_by_name=created_by_name,
156
  )
 
257
  @router.post("/{job_id}/start")
258
  async def start_job(
259
  job_id: str,
 
260
  session: AsyncSession = Depends(get_session),
261
  user: Annotated[TokenData | None, Depends(get_current_user)] = None,
262
  ) -> StreamingResponse:
src/flow/ui/api/schema.py CHANGED
@@ -74,7 +74,7 @@ class LLMProviderSchema(BaseModel):
74
 
75
 
76
  class OptimizationStrategySchema(BaseModel):
77
- """Schema for an optimization strategy (GEPA, llm_rewriter, etc.)."""
78
 
79
  name: str = Field(description="Strategy identifier")
80
  description: str = Field(description="What this strategy does")
@@ -238,14 +238,18 @@ async def get_agent_schema() -> AgentSchema:
238
  "description": "GEPA: Reflective prompt evolution using LLM feedback",
239
  "applicable_dimensions": ["instructions"],
240
  },
241
- "llm_rewriter": {
242
- "description": "LLM-based instruction rewriting with variations",
243
  "applicable_dimensions": ["instructions"],
244
  },
245
- "tool_selector": {
246
- "description": "Intelligent tool set selection based on task",
247
  "applicable_dimensions": ["tools"],
248
  },
 
 
 
 
249
  }
250
  optimization_strategies = [
251
  OptimizationStrategySchema(
 
74
 
75
 
76
  class OptimizationStrategySchema(BaseModel):
77
+ """Schema for an optimization strategy (GEPA, instruction, tool, skill)."""
78
 
79
  name: str = Field(description="Strategy identifier")
80
  description: str = Field(description="What this strategy does")
 
238
  "description": "GEPA: Reflective prompt evolution using LLM feedback",
239
  "applicable_dimensions": ["instructions"],
240
  },
241
+ "instruction": {
242
+ "description": "LLM-based instruction optimization via evaluate-reflect-rewrite",
243
  "applicable_dimensions": ["instructions"],
244
  },
245
+ "tool": {
246
+ "description": "Intelligent tool set optimization based on task failures",
247
  "applicable_dimensions": ["tools"],
248
  },
249
+ "skill": {
250
+ "description": "Skill generation and selection to provide domain knowledge",
251
+ "applicable_dimensions": ["skills"],
252
+ },
253
  }
254
  optimization_strategies = [
255
  OptimizationStrategySchema(
src/flow/ui/api/tests.py CHANGED
@@ -1,7 +1,6 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
  """Test run API routes for interactive agent testing."""
3
 
4
- import asyncio
5
  import logging
6
  from collections.abc import AsyncGenerator
7
  from typing import Annotated, Any
@@ -13,7 +12,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
13
  from sqlmodel import desc, select
14
 
15
  from ..auth import TokenData, get_current_user, get_effective_user_id, should_filter_by_user
16
- from ..database import async_session, get_session
17
  from ..models.config import AgentConfig
18
  from ..models.test_run import TestRun, TestRunStatus
19
  from ..schemas.test import TestRunCreate, TestRunDetailResponse, TestRunResponse
@@ -22,10 +21,6 @@ from ..services.test_service import TestService
22
  router = APIRouter(prefix="/tests", tags=["tests"])
23
  logger = logging.getLogger(__name__)
24
 
25
- # Store running tests for cancellation
26
- _running_tests: dict[str, asyncio.Task[Any]] = {}
27
-
28
-
29
  def parse_uuid(id_str: str) -> UUID:
30
  """Parse a string to UUID, raising 400 if invalid."""
31
  try:
@@ -150,30 +145,6 @@ async def get_test(
150
  }
151
 
152
 
153
- async def _run_test_background(test_id: str) -> None:
154
- """Run test in background, updating DB with progress."""
155
- service = TestService()
156
- try:
157
- async for progress in service.run_test(test_id):
158
- logger.debug(f"Test {test_id[:8]} progress: {progress.event} - {progress.message}")
159
- except Exception as e:
160
- logger.error(f"Background test {test_id[:8]} failed: {e}")
161
- # Ensure test is marked as failed
162
- async with async_session() as session:
163
- from datetime import datetime, timezone
164
- result = await session.execute(
165
- select(TestRun).where(TestRun.id == UUID(test_id))
166
- )
167
- test_run = result.scalar_one_or_none()
168
- if test_run and test_run.status == TestRunStatus.RUNNING:
169
- test_run.status = TestRunStatus.FAILED
170
- test_run.error = f"Background execution failed: {e}"
171
- test_run.completed_at = datetime.now(timezone.utc)
172
- await session.commit()
173
- finally:
174
- _running_tests.pop(test_id, None)
175
-
176
-
177
  @router.post("/{test_id}/start")
178
  async def start_test(
179
  test_id: str,
@@ -246,11 +217,6 @@ async def cancel_test(
246
  if test_run.status != TestRunStatus.RUNNING:
247
  raise HTTPException(status_code=400, detail=f"Test is not running (status: {test_run.status})")
248
 
249
- # Cancel the running task if it exists
250
- if test_id in _running_tests:
251
- _running_tests[test_id].cancel()
252
- del _running_tests[test_id]
253
-
254
  test_run.status = TestRunStatus.CANCELLED
255
  await session.commit()
256
  await session.refresh(test_run)
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
  """Test run API routes for interactive agent testing."""
3
 
 
4
  import logging
5
  from collections.abc import AsyncGenerator
6
  from typing import Annotated, Any
 
12
  from sqlmodel import desc, select
13
 
14
  from ..auth import TokenData, get_current_user, get_effective_user_id, should_filter_by_user
15
+ from ..database import get_session
16
  from ..models.config import AgentConfig
17
  from ..models.test_run import TestRun, TestRunStatus
18
  from ..schemas.test import TestRunCreate, TestRunDetailResponse, TestRunResponse
 
21
  router = APIRouter(prefix="/tests", tags=["tests"])
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
24
  def parse_uuid(id_str: str) -> UUID:
25
  """Parse a string to UUID, raising 400 if invalid."""
26
  try:
 
145
  }
146
 
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  @router.post("/{test_id}/start")
149
  async def start_test(
150
  test_id: str,
 
217
  if test_run.status != TestRunStatus.RUNNING:
218
  raise HTTPException(status_code=400, detail=f"Test is not running (status: {test_run.status})")
219
 
 
 
 
 
 
220
  test_run.status = TestRunStatus.CANCELLED
221
  await session.commit()
222
  await session.refresh(test_run)
src/flow/ui/auth/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
  """Authentication module for Flow UI."""
3
 
4
  from .config import AuthMode, AuthSettings, get_auth_settings, init_auth_settings
5
- from .middleware import get_current_user, require_auth
6
  from .router import router as auth_router
7
  from .tokens import TokenData, create_access_token, verify_access_token
8
  from .user_context import ANONYMOUS_USER_ID, get_effective_user_id, should_filter_by_user
@@ -18,7 +18,6 @@ __all__ = [
18
  "get_current_user",
19
  "get_effective_user_id",
20
  "init_auth_settings",
21
- "require_auth",
22
  "should_filter_by_user",
23
  "verify_access_token",
24
  ]
 
2
  """Authentication module for Flow UI."""
3
 
4
  from .config import AuthMode, AuthSettings, get_auth_settings, init_auth_settings
5
+ from .middleware import get_current_user
6
  from .router import router as auth_router
7
  from .tokens import TokenData, create_access_token, verify_access_token
8
  from .user_context import ANONYMOUS_USER_ID, get_effective_user_id, should_filter_by_user
 
18
  "get_current_user",
19
  "get_effective_user_id",
20
  "init_auth_settings",
 
21
  "should_filter_by_user",
22
  "verify_access_token",
23
  ]
src/flow/ui/auth/config.py CHANGED
@@ -5,14 +5,10 @@ from __future__ import annotations
5
 
6
  import secrets
7
  from enum import Enum
8
- from typing import TYPE_CHECKING
9
 
10
  from pydantic import Field, field_validator
11
  from pydantic_settings import BaseSettings, SettingsConfigDict
12
 
13
- if TYPE_CHECKING:
14
- pass
15
-
16
 
17
  class AuthMode(str, Enum):
18
  """Authentication mode."""
 
5
 
6
  import secrets
7
  from enum import Enum
 
8
 
9
  from pydantic import Field, field_validator
10
  from pydantic_settings import BaseSettings, SettingsConfigDict
11
 
 
 
 
12
 
13
  class AuthMode(str, Enum):
14
  """Authentication mode."""
src/flow/ui/auth/middleware.py CHANGED
@@ -60,48 +60,6 @@ async def get_current_user(
60
  ) from e
61
 
62
 
63
- async def require_auth(
64
- user: Annotated[TokenData | None, Depends(get_current_user)],
65
- ) -> TokenData | None:
66
- """Require authentication if enabled.
67
-
68
- Use this as a dependency on routes that should be protected when auth is enabled.
69
- This is essentially an alias for get_current_user that makes intent clearer.
70
-
71
- Args:
72
- user: The current user from get_current_user
73
-
74
- Returns:
75
- TokenData if authenticated, None if auth is disabled
76
- """
77
- return user
78
-
79
-
80
- def get_optional_user(
81
- credentials: Annotated[HTTPAuthorizationCredentials | None, Depends(bearer_scheme)],
82
- ) -> TokenData | None:
83
- """Get the current user if a valid token is provided, otherwise None.
84
-
85
- Unlike get_current_user, this never raises an error - it's for routes
86
- that work differently based on whether the user is authenticated.
87
-
88
- Args:
89
- credentials: The bearer token credentials
90
-
91
- Returns:
92
- TokenData if valid token provided, None otherwise
93
- """
94
- settings = get_auth_settings()
95
-
96
- if credentials is None:
97
- return None
98
-
99
- try:
100
- return verify_access_token(credentials.credentials, settings.secret)
101
- except TokenError:
102
- return None
103
-
104
-
105
  class AuthMiddleware:
106
  """Middleware to check authentication on all /api/* routes except /api/auth/*.
107
 
 
60
  ) from e
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  class AuthMiddleware:
64
  """Middleware to check authentication on all /api/* routes except /api/auth/*.
65