victordibia commited on
Commit
cbd95af
·
1 Parent(s): c1ec9a0

Deploy 2026-01-28 10:56:31

Browse files
Files changed (40) hide show
  1. .env.example +29 -0
  2. README.md +10 -6
  3. src/flow/cli/app.py +6 -6
  4. src/flow/cli/optimize.py +99 -135
  5. src/flow/experiments/__init__.py +52 -96
  6. src/flow/experiments/ablation.py +76 -248
  7. src/flow/experiments/config_export.py +0 -184
  8. src/flow/experiments/models.py +517 -0
  9. src/flow/experiments/optimizer.py +81 -147
  10. src/flow/experiments/types.py +2 -2
  11. src/flow/harness/maf/agent.py +36 -26
  12. src/flow/harness/maf/tools/__init__.py +157 -0
  13. src/flow/{tools → harness/maf/tools}/coding.py +0 -0
  14. src/flow/{tools → harness/maf/tools}/core.py +0 -0
  15. src/flow/{tools → harness/maf/tools}/execution.py +0 -0
  16. src/flow/{tools → harness/maf/tools}/memory.py +0 -0
  17. src/flow/{tools → harness/maf/tools}/sub_agent.py +14 -6
  18. src/flow/prompts.py +234 -97
  19. src/flow/tools/__init__.py +0 -172
  20. src/flow/ui/api/configs.py +81 -111
  21. src/flow/ui/api/jobs.py +6 -6
  22. src/flow/ui/api/runs.py +13 -13
  23. src/flow/ui/database.py +1 -57
  24. src/flow/ui/models/config.py +4 -4
  25. src/flow/ui/models/job.py +1 -1
  26. src/flow/ui/models/run.py +1 -1
  27. src/flow/ui/models/task.py +1 -1
  28. src/flow/ui/schemas/__init__.py +4 -4
  29. src/flow/ui/schemas/config.py +33 -29
  30. src/flow/ui/schemas/job.py +3 -3
  31. src/flow/ui/schemas/run.py +2 -2
  32. src/flow/ui/services/optimizer_service.py +38 -48
  33. src/flow/ui/tests/test_e2e_user_journey.py +6 -6
  34. src/flow/ui/ui/assets/index-2zMAgGgo.js +0 -0
  35. src/flow/ui/ui/assets/index-BG9n9RHB.js +0 -0
  36. src/flow/ui/ui/assets/index-BHAF8mLj.css +1 -0
  37. src/flow/ui/ui/assets/index-Bx-_JS_6.js +0 -0
  38. src/flow/ui/ui/assets/index-VFZIS3uv.js +0 -0
  39. src/flow/ui/ui/assets/index-_IRgS-wR.css +1 -0
  40. src/flow/ui/ui/index.html +2 -2
.env.example ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Flow UI Deployment Environment
2
+ # Copy this to deploy/.env and fill in values
3
+ # This file is gitignored - secrets stay local
4
+
5
+ # --- Azure OpenAI ---
6
+ AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
7
+ AZURE_OPENAI_API_KEY=your-key
8
+ AZURE_OPENAI_DEPLOYMENT=gpt-4o
9
+
10
+ # --- Authentication ---
11
+ AUTH_ENABLED=true
12
+ AUTH_MODE=github
13
+ AUTH_SECRET=change-me-to-a-random-string
14
+
15
+ # For GitHub OAuth (create app at https://github.com/settings/developers):
16
+ # Homepage URL: https://victordibia-flow.hf.space
17
+ # Callback URL: https://victordibia-flow.hf.space/api/auth/github/callback
18
+ AUTH_GITHUB_CLIENT_ID=your-client-id
19
+ AUTH_GITHUB_CLIENT_SECRET=your-client-secret
20
+ AUTH_GITHUB_ALLOWED_USERS=victordibia,teammate1,teammate2
21
+
22
+ # For basic auth (simpler, no GitHub app needed):
23
+ # AUTH_MODE=basic
24
+ # AUTH_BASIC_USERNAME=admin
25
+ # AUTH_BASIC_PASSWORD=your-password
26
+
27
+ # --- Optional ---
28
+ # AUTH_SESSION_HOURS=24
29
+ # UVICORN_WORKERS=2
README.md CHANGED
@@ -83,13 +83,17 @@ Flow tests different **context engineering strategies**:
83
  Example configurations:
84
 
85
  ```python
86
- from flow.experiments.ablation import AblationConfig
87
 
88
- configs = [
89
- AblationConfig(name="baseline", enable_message_compaction=False),
90
- AblationConfig(name="compaction", enable_message_compaction=True, compaction_head_size=10),
91
- AblationConfig(name="full", enable_message_compaction=True, enable_memory_tool=True),
92
- ]
 
 
 
 
93
  ```
94
 
95
  ## Task Format
 
83
  Example configurations:
84
 
85
  ```python
86
+ from flow.experiments.models import Agent, CompactionConfig, GridSearchStrategy
87
 
88
+ # Define a base agent
89
+ base = Agent(name="my_agent", enable_memory=True)
90
+
91
+ # Generate candidates via grid search
92
+ strategy = GridSearchStrategy(variations={
93
+ "enable_memory": [True, False],
94
+ "compaction": [CompactionConfig.head_tail(10, 40), CompactionConfig.none()],
95
+ })
96
+ candidates = strategy.generate(base, budget=10)
97
  ```
98
 
99
  ## Task Format
src/flow/cli/app.py CHANGED
@@ -107,13 +107,13 @@ async def _run_single_task(
107
  from flow.harness.maf import MAFHarness
108
 
109
  if config_path:
110
- # Load config from optimization result
111
- from flow.experiments.config_export import load_config
112
- from flow.experiments.ablation import create_harness_from_config
113
 
114
- ablation_config = load_config(config_path)
115
- console.print(f"[dim]Using config: {ablation_config.name}[/]")
116
- harness = create_harness_from_config(ablation_config, workspace)
117
  else:
118
  harness = MAFHarness(workspace=workspace, memory_path=memory_path)
119
 
 
107
  from flow.harness.maf import MAFHarness
108
 
109
  if config_path:
110
+ # Load agent config from optimization result
111
+ from flow.experiments.models import load_agent
112
+ from flow.experiments.ablation import create_harness_from_agent
113
 
114
+ agent_config = load_agent(config_path)
115
+ console.print(f"[dim]Using agent config: {agent_config.name}[/]")
116
+ harness = create_harness_from_agent(agent_config, workspace)
117
  else:
118
  harness = MAFHarness(workspace=workspace, memory_path=memory_path)
119
 
src/flow/cli/optimize.py CHANGED
@@ -13,13 +13,9 @@ from typing import Annotated, Any
13
  import typer
14
  from rich.console import Console
15
 
16
- from flow.experiments.ablation import AblationConfig, CONTEXT_ENGINEERING_CONFIGS
17
- from flow.experiments.optimizer import (
18
- FlowOptimizer,
19
- generate_grid_configs,
20
- load_tasks_from_jsonl,
21
- )
22
- from flow.experiments.types import EvalCriterion, Task
23
 
24
  console = Console()
25
 
@@ -36,21 +32,21 @@ def optimize(
36
  Path | None,
37
  typer.Option(
38
  "--config", "-c",
39
- help="Path to Python config file with CONFIGS or VARIATIONS",
40
  ),
41
  ] = None,
42
  agent: Annotated[
43
  Path | None,
44
  typer.Option(
45
  "--agent", "-a",
46
- help="Path to base agent Python file (for optimization)",
47
  ),
48
  ] = None,
49
  suite: Annotated[
50
  str | None,
51
  typer.Option(
52
  "--suite", "-s",
53
- help="Built-in task suite: coding, research",
54
  ),
55
  ] = None,
56
  parallel: Annotated[
@@ -60,18 +56,11 @@ def optimize(
60
  help="Max concurrent experiments",
61
  ),
62
  ] = 4,
63
- mode: Annotated[
64
- str,
65
- typer.Option(
66
- "--mode", "-m",
67
- help="Config mode: named (use CONFIGS), grid (use VARIATIONS)",
68
- ),
69
- ] = "named",
70
  vary: Annotated[
71
  str | None,
72
  typer.Option(
73
  "--vary", "-v",
74
- help="Comma-separated params to vary: compaction,memory,model",
75
  ),
76
  ] = None,
77
  output: Annotated[
@@ -88,28 +77,35 @@ def optimize(
88
  help="Disable LLM-as-Judge evaluation (faster, less accurate)",
89
  ),
90
  ] = False,
 
 
 
 
 
 
 
91
  ) -> None:
92
  """Find the best agent configuration through experimentation.
93
 
94
  Runs experiments in parallel, evaluates with LLM-as-Judge,
95
- ranks via Pareto analysis, and exports winning configs.
96
 
97
  Examples:
98
 
99
- # Run with task file and default configs
100
  flow optimize --tasks tasks.jsonl
101
 
102
- # Use custom configs from Python file
103
  flow optimize --config my_configs.py --tasks tasks.jsonl
104
 
105
- # Grid search over variations
106
- flow optimize --config my_configs.py --tasks tasks.jsonl --mode grid
107
 
108
  # Use built-in task suite
109
  flow optimize --suite coding --parallel 2
110
 
111
- # Vary specific parameters
112
- flow optimize --vary compaction,memory --tasks tasks.jsonl
113
  """
114
  asyncio.run(_run_optimize(
115
  tasks_path=tasks,
@@ -117,10 +113,10 @@ def optimize(
117
  agent_path=agent,
118
  suite=suite,
119
  parallel=parallel,
120
- mode=mode,
121
  vary=vary,
122
  output_dir=output,
123
  use_llm_eval=not no_llm_eval,
 
124
  ))
125
 
126
 
@@ -130,10 +126,10 @@ async def _run_optimize(
130
  agent_path: Path | None,
131
  suite: str | None,
132
  parallel: int,
133
- mode: str,
134
  vary: str | None,
135
  output_dir: Path | None,
136
  use_llm_eval: bool,
 
137
  ) -> None:
138
  """Run the optimization."""
139
  # Load tasks
@@ -142,19 +138,23 @@ async def _run_optimize(
142
  console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
143
  raise typer.Exit(1)
144
 
145
- # Load configs
146
- configs = _load_configs(config_path, mode, vary)
147
- if not configs:
148
- console.print("[red]Error:[/] No configs to test. Use --config or --vary")
 
 
 
149
  raise typer.Exit(1)
150
 
 
151
  console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
152
  for t in tasks:
153
  console.print(f" - {t.name}")
154
 
155
- console.print(f"\n[bold]Configs:[/] {len(configs)}")
156
- for c in configs:
157
- console.print(f" - {c.name}")
158
 
159
  # Run optimizer
160
  optimizer = FlowOptimizer(
@@ -164,12 +164,12 @@ async def _run_optimize(
164
  )
165
 
166
  try:
167
- result = await optimizer.optimize(configs, tasks)
168
 
169
  console.print("\n[bold green]Optimization complete![/]")
170
- console.print(f"\nBest configs exported to: [cyan]{result.output_dir / 'configs'}[/]")
171
- console.print("\nTo use a config:")
172
- console.print(f" [dim]flow run --config {result.output_dir / 'configs' / 'best_score.yaml'} \"your task\"[/]")
173
 
174
  except KeyboardInterrupt:
175
  console.print("\n[yellow]Optimization cancelled.[/]")
@@ -185,116 +185,73 @@ def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
185
  return load_tasks_from_jsonl(tasks_path)
186
 
187
  if suite:
188
- return _get_builtin_suite(suite)
189
-
190
- # Default: simple test suite
191
- return _get_builtin_suite("quick")
192
-
193
-
194
- def _get_builtin_suite(name: str) -> list[Task]:
195
- """Get a built-in task suite."""
196
- suites = {
197
- "quick": [
198
- Task(
199
- name="hello_world",
200
- prompt="Create a Python script 'hello.py' that prints 'Hello, World!' and run it.",
201
- criteria=[
202
- EvalCriterion(name="file_created", instruction="hello.py should be created"),
203
- EvalCriterion(name="correct_output", instruction="Output should include 'Hello, World!'"),
204
- ],
205
- ),
206
- ],
207
- "coding": [
208
- Task(
209
- name="fizzbuzz",
210
- prompt="Create fizzbuzz.py that prints 1-30 with Fizz/Buzz/FizzBuzz rules. Run it.",
211
- criteria=[
212
- EvalCriterion(name="file_created", instruction="fizzbuzz.py should be created"),
213
- EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
214
- ],
215
- metadata={"category": "short"},
216
- ),
217
- Task(
218
- name="rest_api",
219
- prompt="Create a FastAPI app with a /health endpoint that returns JSON {'status': 'ok'}. Save as api.py.",
220
- criteria=[
221
- EvalCriterion(name="file_created", instruction="api.py should be created"),
222
- EvalCriterion(name="fastapi_used", instruction="Should use FastAPI"),
223
- EvalCriterion(name="endpoint_defined", instruction="Should have /health endpoint"),
224
- ],
225
- metadata={"category": "medium"},
226
- ),
227
- Task(
228
- name="data_pipeline",
229
- prompt="""Create a data processing pipeline:
230
- 1. data_types.py - DataRecord dataclass (id, name, value)
231
- 2. validators.py - validate_id, validate_name functions
232
- 3. pipeline.py - chain validators together
233
- 4. test_pipeline.py - tests for the pipeline
234
- Run the tests.""",
235
- criteria=[
236
- EvalCriterion(name="modules_created", instruction="All 4 Python files created"),
237
- EvalCriterion(name="tests_run", instruction="Tests should be executed"),
238
- ],
239
- metadata={"category": "long"},
240
- ),
241
- ],
242
- "research": [
243
- Task(
244
- name="codebase_analysis",
245
- prompt="""Analyze this workspace:
246
- 1. Explore the directory structure
247
- 2. Identify Python files and their purposes
248
- 3. Create analysis_report.md with findings""",
249
- criteria=[
250
- EvalCriterion(name="exploration", instruction="Should explore directory"),
251
- EvalCriterion(name="report_created", instruction="analysis_report.md created"),
252
- ],
253
- metadata={"category": "research"},
254
- ),
255
- ],
256
- }
257
 
258
- if name not in suites:
259
- console.print(f"[red]Error:[/] Unknown suite '{name}'. Available: {list(suites.keys())}")
 
 
 
260
  raise typer.Exit(1)
261
 
262
- return suites[name]
 
 
 
 
 
 
 
 
 
 
263
 
264
 
265
- def _load_configs(
266
  config_path: Path | None,
267
- mode: str,
268
  vary: str | None,
269
- ) -> list[AblationConfig]:
270
- """Load configs from file or generate from variations."""
271
- # Load from Python file
 
272
  if config_path:
273
  if not config_path.exists():
274
  console.print(f"[red]Error:[/] Config file not found: {config_path}")
275
  raise typer.Exit(1)
276
 
277
- configs, variations = _load_python_config(config_path)
278
 
279
- if mode == "grid" and variations:
280
- return generate_grid_configs("grid", variations)
281
- elif configs:
282
- return configs
 
283
  else:
284
- console.print("[red]Error:[/] Config file has no CONFIGS or VARIATIONS")
285
  raise typer.Exit(1)
286
 
287
- # Generate from --vary flag
288
  if vary:
289
  variations = _parse_vary_flag(vary)
290
- return generate_grid_configs("vary", variations)
291
-
292
- # Default: use context engineering configs
293
- return CONTEXT_ENGINEERING_CONFIGS
 
 
 
 
 
 
 
 
294
 
295
 
296
- def _load_python_config(path: Path) -> tuple[list[AblationConfig], dict[str, Any]]:
297
- """Load CONFIGS and VARIATIONS from a Python file."""
298
  spec = importlib.util.spec_from_file_location("config_module", path)
299
  if spec is None or spec.loader is None:
300
  raise ValueError(f"Cannot load {path}")
@@ -303,29 +260,36 @@ def _load_python_config(path: Path) -> tuple[list[AblationConfig], dict[str, Any
303
  sys.modules["config_module"] = module
304
  spec.loader.exec_module(module)
305
 
306
- configs = getattr(module, "CONFIGS", [])
307
  variations = getattr(module, "VARIATIONS", {})
308
 
309
- return configs, variations
310
 
311
 
312
  def _parse_vary_flag(vary: str) -> dict[str, Any]:
313
  """Parse --vary flag into variations dict."""
314
- variations = {}
315
 
316
  for param in vary.split(","):
317
  param = param.strip().lower()
318
 
319
  if param in ("compaction", "compact"):
320
- variations["enable_message_compaction"] = [True, False]
 
 
 
321
  elif param in ("memory", "mem"):
322
- variations["enable_memory_tool"] = [True, False]
323
  elif param in ("subagent", "sub"):
324
  variations["enable_sub_agent"] = [True, False]
325
  elif param in ("head", "head_size"):
326
- variations["compaction_head_size"] = [5, 10, 20]
 
 
327
  elif param in ("tail", "tail_size"):
328
- variations["compaction_tail_size"] = [20, 40, 60]
 
 
329
  else:
330
  console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
331
 
 
13
  import typer
14
  from rich.console import Console
15
 
16
+ from flow.experiments.models import Agent, Candidate, CompactionConfig, GridSearchStrategy
17
+ from flow.experiments.optimizer import FlowOptimizer, load_tasks_from_jsonl
18
+ from flow.experiments.types import Task, get_task_suite
 
 
 
 
19
 
20
  console = Console()
21
 
 
32
  Path | None,
33
  typer.Option(
34
  "--config", "-c",
35
+ help="Path to Python config file with CANDIDATES or VARIATIONS",
36
  ),
37
  ] = None,
38
  agent: Annotated[
39
  Path | None,
40
  typer.Option(
41
  "--agent", "-a",
42
+ help="Path to base agent YAML file (for optimization)",
43
  ),
44
  ] = None,
45
  suite: Annotated[
46
  str | None,
47
  typer.Option(
48
  "--suite", "-s",
49
+ help="Built-in task suite: quick, core, coding",
50
  ),
51
  ] = None,
52
  parallel: Annotated[
 
56
  help="Max concurrent experiments",
57
  ),
58
  ] = 4,
 
 
 
 
 
 
 
59
  vary: Annotated[
60
  str | None,
61
  typer.Option(
62
  "--vary", "-v",
63
+ help="Comma-separated params to vary: compaction,memory,subagent",
64
  ),
65
  ] = None,
66
  output: Annotated[
 
77
  help="Disable LLM-as-Judge evaluation (faster, less accurate)",
78
  ),
79
  ] = False,
80
+ budget: Annotated[
81
+ int,
82
+ typer.Option(
83
+ "--budget", "-b",
84
+ help="Maximum number of candidates to generate",
85
+ ),
86
+ ] = 100,
87
  ) -> None:
88
  """Find the best agent configuration through experimentation.
89
 
90
  Runs experiments in parallel, evaluates with LLM-as-Judge,
91
+ ranks via Pareto analysis, and exports winning agent configs.
92
 
93
  Examples:
94
 
95
+ # Run with task file and default candidates
96
  flow optimize --tasks tasks.jsonl
97
 
98
+ # Use custom candidates from Python file
99
  flow optimize --config my_configs.py --tasks tasks.jsonl
100
 
101
+ # Vary specific parameters
102
+ flow optimize --vary compaction,memory --tasks tasks.jsonl
103
 
104
  # Use built-in task suite
105
  flow optimize --suite coding --parallel 2
106
 
107
+ # Start from a base agent definition
108
+ flow optimize --agent base_agent.yaml --vary compaction,memory --tasks tasks.jsonl
109
  """
110
  asyncio.run(_run_optimize(
111
  tasks_path=tasks,
 
113
  agent_path=agent,
114
  suite=suite,
115
  parallel=parallel,
 
116
  vary=vary,
117
  output_dir=output,
118
  use_llm_eval=not no_llm_eval,
119
+ budget=budget,
120
  ))
121
 
122
 
 
126
  agent_path: Path | None,
127
  suite: str | None,
128
  parallel: int,
 
129
  vary: str | None,
130
  output_dir: Path | None,
131
  use_llm_eval: bool,
132
+ budget: int,
133
  ) -> None:
134
  """Run the optimization."""
135
  # Load tasks
 
138
  console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
139
  raise typer.Exit(1)
140
 
141
+ # Load base agent
142
+ base = _load_base_agent(agent_path)
143
+
144
+ # Load/generate candidates
145
+ candidates = _load_candidates(config_path, vary, base, budget)
146
+ if not candidates:
147
+ console.print("[red]Error:[/] No candidates to test. Use --config or --vary")
148
  raise typer.Exit(1)
149
 
150
+ console.print(f"\n[bold]Base Agent:[/] {base.name}")
151
  console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
152
  for t in tasks:
153
  console.print(f" - {t.name}")
154
 
155
+ console.print(f"\n[bold]Candidates:[/] {len(candidates)}")
156
+ for c in candidates:
157
+ console.print(f" - {c.agent.name}")
158
 
159
  # Run optimizer
160
  optimizer = FlowOptimizer(
 
164
  )
165
 
166
  try:
167
+ result = await optimizer.optimize(candidates, tasks)
168
 
169
  console.print("\n[bold green]Optimization complete![/]")
170
+ console.print(f"\nBest agents exported to: [cyan]{result.output_dir / 'agents'}[/]")
171
+ console.print("\nTo use an agent config:")
172
+ console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")
173
 
174
  except KeyboardInterrupt:
175
  console.print("\n[yellow]Optimization cancelled.[/]")
 
185
  return load_tasks_from_jsonl(tasks_path)
186
 
187
  if suite:
188
+ try:
189
+ return get_task_suite(suite)
190
+ except ValueError as e:
191
+ console.print(f"[red]Error:[/] {e}")
192
+ raise typer.Exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ # Default: quick suite
195
+ try:
196
+ return get_task_suite("quick")
197
+ except ValueError:
198
+ console.print("[red]Error:[/] No built-in suites available. Use --tasks to specify a JSONL file.")
199
  raise typer.Exit(1)
200
 
201
+
202
+ def _load_base_agent(agent_path: Path | None) -> Agent:
203
+ """Load base agent from YAML or use defaults."""
204
+ if agent_path:
205
+ if not agent_path.exists():
206
+ console.print(f"[red]Error:[/] Agent file not found: {agent_path}")
207
+ raise typer.Exit(1)
208
+ from flow.experiments.models import load_agent
209
+ return load_agent(agent_path)
210
+
211
+ return Agent(name="flow_agent")
212
 
213
 
214
+ def _load_candidates(
215
  config_path: Path | None,
 
216
  vary: str | None,
217
+ base: Agent,
218
+ budget: int,
219
+ ) -> list[Candidate]:
220
+ """Load candidates from file or generate from variations."""
221
  if config_path:
222
  if not config_path.exists():
223
  console.print(f"[red]Error:[/] Config file not found: {config_path}")
224
  raise typer.Exit(1)
225
 
226
+ candidates, variations = _load_python_config(config_path)
227
 
228
+ if variations:
229
+ strategy = GridSearchStrategy(variations)
230
+ return strategy.generate(base, budget)
231
+ elif candidates:
232
+ return candidates
233
  else:
234
+ console.print("[red]Error:[/] Config file has no CANDIDATES or VARIATIONS")
235
  raise typer.Exit(1)
236
 
 
237
  if vary:
238
  variations = _parse_vary_flag(vary)
239
+ strategy = GridSearchStrategy(variations)
240
+ return strategy.generate(base, budget)
241
+
242
+ # Default: explore context engineering dimensions
243
+ strategy = GridSearchStrategy(variations={
244
+ "enable_memory": [True, False],
245
+ "compaction": [
246
+ CompactionConfig.head_tail(10, 40),
247
+ CompactionConfig.none(),
248
+ ],
249
+ })
250
+ return strategy.generate(base, budget)
251
 
252
 
253
+ def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any]]:
254
+ """Load CANDIDATES and VARIATIONS from a Python file."""
255
  spec = importlib.util.spec_from_file_location("config_module", path)
256
  if spec is None or spec.loader is None:
257
  raise ValueError(f"Cannot load {path}")
 
260
  sys.modules["config_module"] = module
261
  spec.loader.exec_module(module)
262
 
263
+ candidates = getattr(module, "CANDIDATES", [])
264
  variations = getattr(module, "VARIATIONS", {})
265
 
266
+ return candidates, variations
267
 
268
 
269
  def _parse_vary_flag(vary: str) -> dict[str, Any]:
270
  """Parse --vary flag into variations dict."""
271
+ variations: dict[str, Any] = {}
272
 
273
  for param in vary.split(","):
274
  param = param.strip().lower()
275
 
276
  if param in ("compaction", "compact"):
277
+ variations["compaction"] = [
278
+ CompactionConfig.head_tail(10, 40),
279
+ CompactionConfig.none(),
280
+ ]
281
  elif param in ("memory", "mem"):
282
+ variations["enable_memory"] = [True, False]
283
  elif param in ("subagent", "sub"):
284
  variations["enable_sub_agent"] = [True, False]
285
  elif param in ("head", "head_size"):
286
+ variations["compaction"] = [
287
+ CompactionConfig.head_tail(h, 40) for h in [5, 10, 20]
288
+ ]
289
  elif param in ("tail", "tail_size"):
290
+ variations["compaction"] = [
291
+ CompactionConfig.head_tail(10, t) for t in [20, 40, 60]
292
+ ]
293
  else:
294
  console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
295
 
src/flow/experiments/__init__.py CHANGED
@@ -3,96 +3,59 @@
3
  """Experiments framework for running and evaluating Flow agent tasks.
4
 
5
  This package provides a structured way to:
6
- - Define tasks with evaluation criteria
 
7
  - Run agents on tasks and collect OpenTelemetry traces
8
  - Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
9
  - Extract metrics from execution traces
10
- - Run ablation studies comparing different configurations
11
 
12
  Example usage:
13
- from flow.harness.maf import MAFHarness
14
  from flow.experiments import (
15
- FlowExperimentRunner,
 
 
 
16
  Task,
17
  EvalCriterion,
18
- TraceEvaluator,
19
- HeuristicEvaluator,
20
- extract_metrics,
21
- format_metrics_summary,
22
- setup_tracing,
23
  )
24
 
25
- # Setup tracing (call once at startup)
26
- setup_tracing("my-experiment")
27
 
28
- # Define a task
29
- task = Task(
30
- name="hello_world",
31
- prompt="Write a Python function that prints 'Hello, World!'",
32
- criteria=[
33
- EvalCriterion(
34
- name="correctness",
35
- instruction="The function should print exactly 'Hello, World!'",
36
- ),
37
- ],
38
- )
39
-
40
- # Run the experiment
41
- harness = MAFHarness()
42
- runner = FlowExperimentRunner(keep_workspace=True)
43
- result = await runner.run(harness, task)
44
-
45
- # Extract metrics
46
- metrics = extract_metrics(result.trace)
47
- print(format_metrics_summary(metrics))
48
-
49
- # Evaluate the result
50
- evaluator = HeuristicEvaluator()
51
- eval_result = await evaluator.evaluate(result)
52
- print(f"Score: {eval_result.score}, Passed: {eval_result.passed}")
53
-
54
- await harness.close()
55
-
56
- Ablation studies:
57
- from flow.experiments import run_ablations, AblationConfig
58
 
59
- configs = [
60
- AblationConfig(name="baseline", enable_message_compaction=False),
61
- AblationConfig(name="with_compaction", enable_message_compaction=True),
62
- ]
63
-
64
- results = await run_ablations(
65
- configs,
66
- task_prompt="Create a simple HTTP server",
67
- )
68
  """
69
 
70
- # Types
71
- # Ablation
 
 
 
 
 
 
 
 
 
 
 
 
72
  from .ablation import (
73
- AGENT_MEMORY_ONLY,
74
- ALL_CONTEXT_ENGINEERING,
75
- COMPACTION_ONLY,
76
- # Context engineering configs
77
- CONTEXT_ENG_BASELINE,
78
- CONTEXT_ENGINEERING_CONFIGS,
79
- ISOLATION_ONLY,
80
- AblationConfig,
81
- AblationResult,
82
- # Shared utilities
83
  compute_pareto_frontier,
84
- create_harness_from_config,
85
  generate_recommendation,
86
- run_ablations,
87
- run_context_engineering_comparison,
88
- run_single_ablation,
89
- )
90
-
91
- # Config export
92
- from .config_export import (
93
- export_config,
94
- export_optimization_configs,
95
- load_config,
96
  )
97
 
98
  # Evaluators
@@ -116,11 +79,10 @@ from .metrics import (
116
 
117
  # Optimizer
118
  from .optimizer import (
119
- ConfigSummary,
120
  FlowOptimizer,
121
  OptimizationResult,
122
  TaskResult,
123
- generate_grid_configs,
124
  load_tasks_from_jsonl,
125
  )
126
 
@@ -142,6 +104,16 @@ from .trace_collector import FlowTraceCollector
142
  from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
143
 
144
  __all__ = [ # noqa: RUF022 # Intentionally grouped by category
 
 
 
 
 
 
 
 
 
 
145
  # Types
146
  "Task",
147
  "EvalCriterion",
@@ -173,32 +145,16 @@ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
173
  "print_metrics_summary",
174
  "print_comparison_table",
175
  "print_eval_result",
176
- # Ablation
177
- "AblationConfig",
178
- "AblationResult",
179
- "run_ablations",
180
- "run_single_ablation",
181
- "create_harness_from_config",
182
- # Context engineering configs
183
- "CONTEXT_ENG_BASELINE",
184
- "COMPACTION_ONLY",
185
- "AGENT_MEMORY_ONLY",
186
- "ISOLATION_ONLY",
187
- "ALL_CONTEXT_ENGINEERING",
188
- "CONTEXT_ENGINEERING_CONFIGS",
189
- "run_context_engineering_comparison",
190
- # Shared utilities
191
  "compute_pareto_frontier",
192
  "generate_recommendation",
193
  # Optimizer
194
  "FlowOptimizer",
195
  "OptimizationResult",
196
- "ConfigSummary",
197
  "TaskResult",
198
- "generate_grid_configs",
199
  "load_tasks_from_jsonl",
200
- # Config export
201
- "export_config",
202
- "load_config",
203
- "export_optimization_configs",
204
  ]
 
3
  """Experiments framework for running and evaluating Flow agent tasks.
4
 
5
  This package provides a structured way to:
6
+ - Define agents with the Agent dataclass
7
+ - Generate candidate variants via CandidateStrategy implementations
8
  - Run agents on tasks and collect OpenTelemetry traces
9
  - Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
10
  - Extract metrics from execution traces
11
+ - Run optimization studies comparing different candidates
12
 
13
  Example usage:
 
14
  from flow.experiments import (
15
+ Agent,
16
+ Candidate,
17
+ GridSearchStrategy,
18
+ FlowOptimizer,
19
  Task,
20
  EvalCriterion,
 
 
 
 
 
21
  )
22
 
23
+ # Define a base agent
24
+ base = Agent(name="my_agent", enable_memory=True)
25
 
26
+ # Generate candidates
27
+ strategy = GridSearchStrategy(variations={
28
+ "enable_memory": [True, False],
29
+ })
30
+ candidates = strategy.generate(base, budget=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ # Run optimization
33
+ optimizer = FlowOptimizer(parallel=4)
34
+ tasks = [Task(name="test", prompt="Create hello world")]
35
+ result = await optimizer.optimize(candidates, tasks)
36
+ print(f"Best: {result.rank_by_score[0]}")
 
 
 
 
37
  """
38
 
39
+ # Core models
40
+ from .models import (
41
+ Agent,
42
+ Candidate,
43
+ CandidateStrategy,
44
+ CompactionConfig,
45
+ ExperimentResult,
46
+ GridSearchStrategy,
47
+ export_agent,
48
+ export_optimization_results,
49
+ load_agent,
50
+ )
51
+
52
+ # Experiment runner + Pareto analysis
53
  from .ablation import (
 
 
 
 
 
 
 
 
 
 
54
  compute_pareto_frontier,
55
+ create_harness_from_agent,
56
  generate_recommendation,
57
+ run_experiments,
58
+ run_single_experiment,
 
 
 
 
 
 
 
 
59
  )
60
 
61
  # Evaluators
 
79
 
80
  # Optimizer
81
  from .optimizer import (
82
+ CandidateSummary,
83
  FlowOptimizer,
84
  OptimizationResult,
85
  TaskResult,
 
86
  load_tasks_from_jsonl,
87
  )
88
 
 
104
  from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
105
 
106
  __all__ = [ # noqa: RUF022 # Intentionally grouped by category
107
+ # Core models
108
+ "Agent",
109
+ "Candidate",
110
+ "CandidateStrategy",
111
+ "CompactionConfig",
112
+ "ExperimentResult",
113
+ "GridSearchStrategy",
114
+ "export_agent",
115
+ "load_agent",
116
+ "export_optimization_results",
117
  # Types
118
  "Task",
119
  "EvalCriterion",
 
145
  "print_metrics_summary",
146
  "print_comparison_table",
147
  "print_eval_result",
148
+ # Experiment runner
149
+ "create_harness_from_agent",
150
+ "run_experiments",
151
+ "run_single_experiment",
 
 
 
 
 
 
 
 
 
 
 
152
  "compute_pareto_frontier",
153
  "generate_recommendation",
154
  # Optimizer
155
  "FlowOptimizer",
156
  "OptimizationResult",
157
+ "CandidateSummary",
158
  "TaskResult",
 
159
  "load_tasks_from_jsonl",
 
 
 
 
160
  ]
src/flow/experiments/ablation.py CHANGED
@@ -1,137 +1,91 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
- """Ablation runner for comparing Flow agent configurations.
4
 
5
  This module provides:
6
- - AblationConfig: Dataclass for agent configuration parameters
7
  - Pareto analysis utilities for multi-objective optimization
8
- - Pre-defined configurations for context engineering strategies
9
- - Convenience functions for running ablation studies
10
  """
11
 
12
  from __future__ import annotations
13
 
14
  import json
15
  import logging
16
- from dataclasses import asdict, dataclass
17
  from datetime import datetime
18
  from pathlib import Path
19
- from typing import TYPE_CHECKING
20
 
21
  from .evaluators import HeuristicEvaluator
22
- from .metrics import TraceMetrics, extract_metrics, metrics_to_dict
 
23
  from .reporters import print_comparison_table, save_run_result
24
  from .runner import FlowExperimentRunner, setup_tracing
25
- from .types import EvalCriterion, RunResult, Task
26
 
27
  if TYPE_CHECKING:
28
  from flow.harness.maf import MAFHarness
29
 
30
- from .optimizer import ConfigSummary
31
 
32
  logger = logging.getLogger(__name__)
33
 
34
 
35
- @dataclass
36
- class AblationConfig:
37
- """Configuration for a single ablation run.
38
-
39
- Each config represents a different agent configuration to test.
40
- The name is used as an identifier in comparison results.
41
-
42
- Attributes:
43
- name: Unique identifier for this configuration
44
- enable_message_compaction: Whether to enable message compaction
45
- enable_memory_tool: Whether to enable agent-managed memory
46
- enable_sub_agent: Whether to enable sub-agent for isolated research
47
- compaction_head_size: Number of initial messages to keep
48
- compaction_tail_size: Number of recent messages to keep
49
- bash_timeout: Timeout for bash commands in seconds
50
- """
51
-
52
- name: str
53
- enable_message_compaction: bool = True
54
- enable_memory_tool: bool = True
55
- enable_sub_agent: bool = False
56
- compaction_head_size: int = 10
57
- compaction_tail_size: int = 40
58
- bash_timeout: int = 120
59
-
60
-
61
- @dataclass
62
- class AblationResult:
63
- """Result of a single ablation run.
64
-
65
- Contains all data from the run including raw results,
66
- extracted metrics, and evaluation scores.
67
- """
68
-
69
- config: AblationConfig
70
- run_result: RunResult
71
- metrics: TraceMetrics
72
- eval_score: float
73
- eval_passed: bool
74
- eval_reasoning: str
75
-
76
-
77
- def create_harness_from_config(config: AblationConfig, workspace: Path) -> MAFHarness:
78
- """Create a MAFHarness from an ablation config.
79
 
80
  Args:
81
- config: The ablation configuration
82
  workspace: Working directory
83
 
84
  Returns:
85
  A configured MAFHarness
86
  """
 
87
  from flow.harness.maf import MAFHarness
88
 
 
 
 
89
  return MAFHarness(
90
  workspace=workspace,
91
  memory_path=workspace / "memory",
92
- enable_compaction=config.enable_message_compaction,
93
- enable_memory_tool=config.enable_memory_tool,
94
- enable_sub_agent=config.enable_sub_agent,
95
- compaction_head_size=config.compaction_head_size,
96
- compaction_tail_size=config.compaction_tail_size,
97
- bash_timeout=config.bash_timeout,
98
  )
99
 
100
 
101
- async def run_single_ablation(
102
- config: AblationConfig,
103
  task: Task,
104
  workspace: Path,
105
- ) -> AblationResult:
106
- """Run a single ablation with trace capture and evaluation.
107
 
108
  Args:
109
- config: The ablation configuration
110
  task: The task to run
111
  workspace: Working directory
112
 
113
  Returns:
114
- AblationResult with metrics and evaluation
115
  """
116
- # Create harness from config
117
- harness = create_harness_from_config(config, workspace)
118
 
119
  try:
120
- # Create runner
121
  runner = FlowExperimentRunner(keep_workspace=True)
122
-
123
- # Run the experiment
124
  run_result = await runner.run(harness, task, workspace=workspace)
125
-
126
- # Extract metrics
127
  metrics = extract_metrics(run_result.trace)
128
 
129
- # Evaluate the result
130
  evaluator = HeuristicEvaluator()
131
  eval_result = await evaluator.evaluate(run_result)
132
 
133
- return AblationResult(
134
- config=config,
135
  run_result=run_result,
136
  metrics=metrics,
137
  eval_score=eval_result.score,
@@ -142,26 +96,20 @@ async def run_single_ablation(
142
  await harness.close()
143
 
144
 
145
- def save_ablation_result(result: AblationResult, output_dir: Path) -> None:
146
- """Save ablation result to files.
147
-
148
- Creates a subdirectory for the config with all result files.
149
-
150
- Args:
151
- result: The ablation result to save
152
- output_dir: Base directory for output
153
- """
154
- config_dir = output_dir / result.config.name
155
  save_run_result(
156
  result.run_result,
157
  config_dir,
158
  metrics=result.metrics,
159
  )
160
 
161
- # Save ablation-specific data
162
- with open(config_dir / "ablation.json", "w") as f:
163
  json.dump({
164
- "config": asdict(result.config),
 
 
165
  "evaluation": {
166
  "score": result.eval_score,
167
  "passed": result.eval_passed,
@@ -170,37 +118,29 @@ def save_ablation_result(result: AblationResult, output_dir: Path) -> None:
170
  }, f, indent=2)
171
 
172
 
173
- async def run_ablations(
174
- configs: list[AblationConfig],
175
  task_prompt: str,
176
  output_dir: Path | None = None,
177
- task_name: str = "ablation_task",
178
- ) -> list[AblationResult]:
179
- """Run multiple ablation configurations and compare.
180
-
181
- This function:
182
- 1. Sets up tracing
183
- 2. Runs each configuration on the same task
184
- 3. Collects metrics and evaluation scores
185
- 4. Saves results and prints comparison
186
 
187
  Args:
188
- configs: List of configurations to test
189
  task_prompt: The task prompt to run
190
- output_dir: Base directory for output (default: ~/.flow/ablations)
191
- task_name: Name for the task (used in file paths)
192
 
193
  Returns:
194
- List of ablation results
195
  """
196
- # Setup output directory
197
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
198
  if output_dir is None:
199
- output_dir = Path.home() / ".flow" / "ablations"
200
  output_dir = output_dir / timestamp
201
  output_dir.mkdir(parents=True, exist_ok=True)
202
 
203
- # Create task
204
  task = Task(
205
  name=task_name,
206
  prompt=task_prompt,
@@ -212,52 +152,47 @@ async def run_ablations(
212
  ],
213
  )
214
 
215
- # Save configs
216
  with open(output_dir / "config.json", "w") as f: # noqa: ASYNC230
217
  json.dump({
218
  "task": task_prompt,
219
  "timestamp": timestamp,
220
- "configs": [asdict(c) for c in configs],
221
  }, f, indent=2)
222
 
223
  print("=" * 80)
224
- print(" FLOW ABLATION RUNNER")
225
  print("=" * 80)
226
- print(f" Task: {task_prompt[:60]}{'...' if len(task_prompt) > 60 else ''}")
227
- print(f" Configs: {len(configs)}")
228
- print(f" Output: {output_dir}")
229
  print("=" * 80)
230
 
231
- # Setup tracing once
232
- setup_tracing("flow-ablation")
233
 
234
- results = []
235
- for i, config in enumerate(configs, 1):
236
- print(f"\n[{i}/{len(configs)}] Running: {config.name}")
237
  print("-" * 40)
238
 
239
- # Each config gets its own workspace
240
- workspace = output_dir / config.name / "workspace"
241
  workspace.mkdir(parents=True, exist_ok=True)
242
 
243
- result = await run_single_ablation(
244
- config=config,
245
  task=task,
246
  workspace=workspace,
247
  )
248
 
249
  results.append(result)
250
- save_ablation_result(result, output_dir)
251
 
252
- # Quick status
253
  status = "OK" if result.run_result.success else "FAIL"
254
  print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
255
  f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")
256
 
257
- # Save comparison
258
  comparison_data = [
259
  {
260
- "name": r.config.name,
261
  "success": r.run_result.success,
262
  "duration_seconds": r.run_result.duration_seconds,
263
  "metrics": metrics_to_dict(r.metrics),
@@ -272,152 +207,48 @@ async def run_ablations(
272
  with open(output_dir / "comparison.json", "w") as f: # noqa: ASYNC230
273
  json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)
274
 
275
- # Print comparison
276
- print_comparison_table(comparison_data, "Ablation Comparison")
277
-
278
  print(f"\nResults saved to: {output_dir}")
279
 
280
  return results
281
 
282
 
283
  # =============================================================================
284
- # Context Engineering Baseline Configurations
285
- # =============================================================================
286
- # These configurations demonstrate the three main context engineering strategies:
287
- # 1. Compaction - Reactive trimming via message stores
288
- # 2. Agent-Managed Memory - Agent controls when to write/read/delete
289
- # 3. Isolation - Sub-agent architecture prevents context pollution
290
-
291
-
292
- # Baseline: No context engineering (for comparison)
293
- CONTEXT_ENG_BASELINE = AblationConfig(
294
- name="no_context_engineering",
295
- enable_message_compaction=False,
296
- enable_memory_tool=False,
297
- enable_sub_agent=False,
298
- )
299
-
300
- # Strategy 1: Compaction via Message Stores
301
- # Uses HeadTailCompactingMessageStore to keep first N + last M messages
302
- # Good for: Long-running sessions where middle context is less important
303
- COMPACTION_ONLY = AblationConfig(
304
- name="compaction_only",
305
- enable_message_compaction=True,
306
- enable_memory_tool=False,
307
- enable_sub_agent=False,
308
- compaction_head_size=10, # Keep task context
309
- compaction_tail_size=40, # Keep recent work
310
- )
311
-
312
- # Strategy 2: Agent-Managed Memory
313
- # Agent decides when to save/retrieve information from persistent storage
314
- # Good for: Cross-session memory, learning patterns, storing decisions
315
- AGENT_MEMORY_ONLY = AblationConfig(
316
- name="agent_memory_only",
317
- enable_message_compaction=False,
318
- enable_memory_tool=True,
319
- enable_sub_agent=False,
320
- )
321
-
322
- # Strategy 3: Isolation via Sub-Agent
323
- # Delegate heavy research to sub-agent with isolated context
324
- # Good for: Complex research tasks that would pollute main context
325
- ISOLATION_ONLY = AblationConfig(
326
- name="isolation_only",
327
- enable_message_compaction=False,
328
- enable_memory_tool=False,
329
- enable_sub_agent=True,
330
- )
331
-
332
- # Combined: All context engineering strategies
333
- # Uses compaction + memory + isolation together
334
- # Good for: Production systems with long-running, complex tasks
335
- ALL_CONTEXT_ENGINEERING = AblationConfig(
336
- name="all_context_engineering",
337
- enable_message_compaction=True,
338
- enable_memory_tool=True,
339
- enable_sub_agent=True,
340
- compaction_head_size=10,
341
- compaction_tail_size=40,
342
- )
343
-
344
- # Predefined list for running context engineering comparison
345
- CONTEXT_ENGINEERING_CONFIGS = [
346
- CONTEXT_ENG_BASELINE,
347
- COMPACTION_ONLY,
348
- AGENT_MEMORY_ONLY,
349
- ISOLATION_ONLY,
350
- ALL_CONTEXT_ENGINEERING,
351
- ]
352
-
353
-
354
- async def run_context_engineering_comparison(
355
- task_prompt: str,
356
- output_dir: Path | None = None,
357
- ) -> list[AblationResult]:
358
- """Run a comparison of all context engineering strategies.
359
-
360
- This is a convenience function that runs all context engineering
361
- baseline configurations against a single task for comparison.
362
-
363
- Args:
364
- task_prompt: The task to run (should benefit from context management)
365
- output_dir: Optional output directory for results
366
-
367
- Returns:
368
- List of AblationResult for each strategy
369
-
370
- Example:
371
- >>> results = await run_context_engineering_comparison(
372
- ... "Research the authentication patterns in this codebase and "
373
- ... "create a summary document with recommendations."
374
- ... )
375
- """
376
- return await run_ablations(
377
- configs=CONTEXT_ENGINEERING_CONFIGS,
378
- task_prompt=task_prompt,
379
- output_dir=output_dir,
380
- task_name="context_engineering_comparison",
381
- )
382
-
383
-
384
- # =============================================================================
385
- # Shared Utilities for Pareto Analysis
386
  # =============================================================================
387
 
388
 
389
  def compute_pareto_frontier(
390
- summaries: list[ConfigSummary],
391
  score_key: str = "avg_score",
392
  cost_key: str = "avg_tokens",
393
  ) -> list[str]:
394
  """Compute Pareto frontier for multi-objective optimization.
395
 
396
- Identifies configurations that are not dominated by any other configuration.
397
- A config is dominated if another config has better score AND lower tokens.
398
 
399
  Args:
400
- summaries: List of ConfigSummary objects (or dicts with score/token keys)
401
  score_key: Attribute name for the score metric (higher is better)
402
  cost_key: Attribute name for the cost metric (lower is better)
403
 
404
  Returns:
405
  List of names of Pareto-optimal configurations
406
  """
407
- # Sort by cost (ascending)
408
- def get_val(s: object, key: str) -> float:
409
  if isinstance(s, dict):
410
  return float(s.get(key, 0))
411
  return float(getattr(s, key, 0))
412
 
413
- def get_name(s: object) -> str:
414
  if isinstance(s, dict):
415
  return str(s.get("name", ""))
416
  return str(getattr(s, "name", ""))
417
 
418
  sorted_summaries = sorted(summaries, key=lambda s: get_val(s, cost_key))
419
 
420
- pareto_names = []
421
  best_score = -1.0
422
 
423
  for summary in sorted_summaries:
@@ -430,40 +261,37 @@ def compute_pareto_frontier(
430
 
431
 
432
  def generate_recommendation(
433
- summaries: list[ConfigSummary],
434
  pareto_names: list[str],
435
  min_score: float = 0.7,
436
  ) -> tuple[str | None, str]:
437
  """Generate a recommendation based on Pareto analysis.
438
 
439
  Args:
440
- summaries: List of ConfigSummary objects
441
- pareto_names: Names of Pareto-optimal configs
442
  min_score: Minimum acceptable score threshold
443
 
444
  Returns:
445
- Tuple of (recommended_config_name, recommendation_text)
446
  """
447
- def get_val(s: object, key: str) -> float:
448
  if isinstance(s, dict):
449
  return float(s.get(key, 0))
450
  return float(getattr(s, key, 0))
451
 
452
- def get_name(s: object) -> str:
453
  if isinstance(s, dict):
454
  return str(s.get("name", ""))
455
  return str(getattr(s, "name", ""))
456
 
457
- # Filter to acceptable configs
458
  acceptable = [s for s in summaries if get_val(s, "avg_score") >= min_score]
459
  if not acceptable:
460
  return None, "No configuration met the minimum score threshold."
461
 
462
- # Prefer Pareto-optimal configs
463
  pareto_acceptable = [s for s in acceptable if get_name(s) in pareto_names]
464
  candidates = pareto_acceptable if pareto_acceptable else acceptable
465
 
466
- # Pick the one with lowest tokens among candidates
467
  best = min(candidates, key=lambda s: get_val(s, "avg_tokens"))
468
  name = get_name(best)
469
  tokens = get_val(best, "avg_tokens")
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
+ """Experiment runner for comparing agent configurations.
4
 
5
  This module provides:
6
+ - Functions for running experiments with Agent/Candidate models
7
  - Pareto analysis utilities for multi-objective optimization
8
+ - Convenience functions for running optimization studies
 
9
  """
10
 
11
  from __future__ import annotations
12
 
13
  import json
14
  import logging
15
+ from dataclasses import asdict
16
  from datetime import datetime
17
  from pathlib import Path
18
+ from typing import TYPE_CHECKING, Any
19
 
20
  from .evaluators import HeuristicEvaluator
21
+ from .metrics import extract_metrics, metrics_to_dict
22
+ from .models import Agent, Candidate, ExperimentResult
23
  from .reporters import print_comparison_table, save_run_result
24
  from .runner import FlowExperimentRunner, setup_tracing
25
+ from .types import EvalCriterion, Task
26
 
27
  if TYPE_CHECKING:
28
  from flow.harness.maf import MAFHarness
29
 
30
+ from .optimizer import CandidateSummary
31
 
32
  logger = logging.getLogger(__name__)
33
 
34
 
35
+ def create_harness_from_agent(agent: Agent, workspace: Path) -> MAFHarness:
36
+ """Create a MAFHarness from an Agent definition.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  Args:
39
+ agent: The agent definition
40
  workspace: Working directory
41
 
42
  Returns:
43
  A configured MAFHarness
44
  """
45
+ from flow.experiments.models import resolve_tools
46
  from flow.harness.maf import MAFHarness
47
 
48
+ # Resolve tools to dict form
49
+ tools_spec = resolve_tools(agent.tools)
50
+
51
  return MAFHarness(
52
  workspace=workspace,
53
  memory_path=workspace / "memory",
54
+ enable_compaction=agent.compaction.enabled,
55
+ compaction_head_size=agent.compaction.head_size,
56
+ compaction_tail_size=agent.compaction.tail_size,
57
+ tools=tools_spec,
58
+ instructions=agent.instructions,
 
59
  )
60
 
61
 
62
+ async def run_single_experiment(
63
+ candidate: Candidate,
64
  task: Task,
65
  workspace: Path,
66
+ ) -> ExperimentResult:
67
+ """Run a single experiment with trace capture and evaluation.
68
 
69
  Args:
70
+ candidate: The candidate to test
71
  task: The task to run
72
  workspace: Working directory
73
 
74
  Returns:
75
+ ExperimentResult with metrics and evaluation
76
  """
77
+ harness = create_harness_from_agent(candidate.agent, workspace)
 
78
 
79
  try:
 
80
  runner = FlowExperimentRunner(keep_workspace=True)
 
 
81
  run_result = await runner.run(harness, task, workspace=workspace)
 
 
82
  metrics = extract_metrics(run_result.trace)
83
 
 
84
  evaluator = HeuristicEvaluator()
85
  eval_result = await evaluator.evaluate(run_result)
86
 
87
+ return ExperimentResult(
88
+ candidate=candidate,
89
  run_result=run_result,
90
  metrics=metrics,
91
  eval_score=eval_result.score,
 
96
  await harness.close()
97
 
98
 
99
+ def save_experiment_result(result: ExperimentResult, output_dir: Path) -> None:
100
+ """Save experiment result to files."""
101
+ config_dir = output_dir / result.candidate.agent.name
 
 
 
 
 
 
 
102
  save_run_result(
103
  result.run_result,
104
  config_dir,
105
  metrics=result.metrics,
106
  )
107
 
108
+ with open(config_dir / "experiment.json", "w") as f:
 
109
  json.dump({
110
+ "agent": asdict(result.candidate.agent),
111
+ "mutations": result.candidate.mutations,
112
+ "rationale": result.candidate.rationale,
113
  "evaluation": {
114
  "score": result.eval_score,
115
  "passed": result.eval_passed,
 
118
  }, f, indent=2)
119
 
120
 
121
+ async def run_experiments(
122
+ candidates: list[Candidate],
123
  task_prompt: str,
124
  output_dir: Path | None = None,
125
+ task_name: str = "experiment_task",
126
+ ) -> list[ExperimentResult]:
127
+ """Run multiple candidates and compare.
 
 
 
 
 
 
128
 
129
  Args:
130
+ candidates: List of candidates to test
131
  task_prompt: The task prompt to run
132
+ output_dir: Base directory for output (default: ~/.flow/experiments)
133
+ task_name: Name for the task
134
 
135
  Returns:
136
+ List of experiment results
137
  """
 
138
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
139
  if output_dir is None:
140
+ output_dir = Path.home() / ".flow" / "experiments"
141
  output_dir = output_dir / timestamp
142
  output_dir.mkdir(parents=True, exist_ok=True)
143
 
 
144
  task = Task(
145
  name=task_name,
146
  prompt=task_prompt,
 
152
  ],
153
  )
154
 
 
155
  with open(output_dir / "config.json", "w") as f: # noqa: ASYNC230
156
  json.dump({
157
  "task": task_prompt,
158
  "timestamp": timestamp,
159
+ "candidates": [asdict(c) for c in candidates],
160
  }, f, indent=2)
161
 
162
  print("=" * 80)
163
+ print(" FLOW EXPERIMENT RUNNER")
164
  print("=" * 80)
165
+ print(f" Task: {task_prompt[:60]}{'...' if len(task_prompt) > 60 else ''}")
166
+ print(f" Candidates: {len(candidates)}")
167
+ print(f" Output: {output_dir}")
168
  print("=" * 80)
169
 
170
+ setup_tracing("flow-experiment")
 
171
 
172
+ results: list[ExperimentResult] = []
173
+ for i, candidate in enumerate(candidates, 1):
174
+ print(f"\n[{i}/{len(candidates)}] Running: {candidate.agent.name}")
175
  print("-" * 40)
176
 
177
+ workspace = output_dir / candidate.agent.name / "workspace"
 
178
  workspace.mkdir(parents=True, exist_ok=True)
179
 
180
+ result = await run_single_experiment(
181
+ candidate=candidate,
182
  task=task,
183
  workspace=workspace,
184
  )
185
 
186
  results.append(result)
187
+ save_experiment_result(result, output_dir)
188
 
 
189
  status = "OK" if result.run_result.success else "FAIL"
190
  print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
191
  f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")
192
 
 
193
  comparison_data = [
194
  {
195
+ "name": r.candidate.agent.name,
196
  "success": r.run_result.success,
197
  "duration_seconds": r.run_result.duration_seconds,
198
  "metrics": metrics_to_dict(r.metrics),
 
207
  with open(output_dir / "comparison.json", "w") as f: # noqa: ASYNC230
208
  json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)
209
 
210
+ print_comparison_table(comparison_data, "Experiment Comparison")
 
 
211
  print(f"\nResults saved to: {output_dir}")
212
 
213
  return results
214
 
215
 
216
  # =============================================================================
217
+ # Pareto Analysis Utilities
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  # =============================================================================
219
 
220
 
221
  def compute_pareto_frontier(
222
+ summaries: list[CandidateSummary],
223
  score_key: str = "avg_score",
224
  cost_key: str = "avg_tokens",
225
  ) -> list[str]:
226
  """Compute Pareto frontier for multi-objective optimization.
227
 
228
+ Identifies configurations that are not dominated by any other.
229
+ A config is dominated if another has better score AND lower tokens.
230
 
231
  Args:
232
+ summaries: List of CandidateSummary objects (or dicts)
233
  score_key: Attribute name for the score metric (higher is better)
234
  cost_key: Attribute name for the cost metric (lower is better)
235
 
236
  Returns:
237
  List of names of Pareto-optimal configurations
238
  """
239
+ def get_val(s: CandidateSummary | dict[str, Any], key: str) -> float:
 
240
  if isinstance(s, dict):
241
  return float(s.get(key, 0))
242
  return float(getattr(s, key, 0))
243
 
244
+ def get_name(s: CandidateSummary | dict[str, Any]) -> str:
245
  if isinstance(s, dict):
246
  return str(s.get("name", ""))
247
  return str(getattr(s, "name", ""))
248
 
249
  sorted_summaries = sorted(summaries, key=lambda s: get_val(s, cost_key))
250
 
251
+ pareto_names: list[str] = []
252
  best_score = -1.0
253
 
254
  for summary in sorted_summaries:
 
261
 
262
 
263
  def generate_recommendation(
264
+ summaries: list[CandidateSummary],
265
  pareto_names: list[str],
266
  min_score: float = 0.7,
267
  ) -> tuple[str | None, str]:
268
  """Generate a recommendation based on Pareto analysis.
269
 
270
  Args:
271
+ summaries: List of CandidateSummary objects
272
+ pareto_names: Names of Pareto-optimal candidates
273
  min_score: Minimum acceptable score threshold
274
 
275
  Returns:
276
+ Tuple of (recommended_name, recommendation_text)
277
  """
278
+ def get_val(s: CandidateSummary | dict[str, Any], key: str) -> float:
279
  if isinstance(s, dict):
280
  return float(s.get(key, 0))
281
  return float(getattr(s, key, 0))
282
 
283
+ def get_name(s: CandidateSummary | dict[str, Any]) -> str:
284
  if isinstance(s, dict):
285
  return str(s.get("name", ""))
286
  return str(getattr(s, "name", ""))
287
 
 
288
  acceptable = [s for s in summaries if get_val(s, "avg_score") >= min_score]
289
  if not acceptable:
290
  return None, "No configuration met the minimum score threshold."
291
 
 
292
  pareto_acceptable = [s for s in acceptable if get_name(s) in pareto_names]
293
  candidates = pareto_acceptable if pareto_acceptable else acceptable
294
 
 
295
  best = min(candidates, key=lambda s: get_val(s, "avg_tokens"))
296
  name = get_name(best)
297
  tokens = get_val(best, "avg_tokens")
src/flow/experiments/config_export.py DELETED
@@ -1,184 +0,0 @@
1
- # Copyright (c) Microsoft. All rights reserved.
2
-
3
- """Config export/import utilities for optimizer results.
4
-
5
- Exports winning configurations as YAML files that can be loaded
6
- and used directly with `flow run --config <path>`.
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- from dataclasses import asdict
12
- from pathlib import Path
13
- from typing import Any
14
-
15
- import yaml
16
-
17
- from .ablation import AblationConfig
18
-
19
-
20
- def export_config(
21
- config: AblationConfig,
22
- metrics: dict[str, Any],
23
- path: Path,
24
- ) -> None:
25
- """Export an AblationConfig as a reusable YAML file.
26
-
27
- The exported YAML includes:
28
- - All config parameters (directly loadable)
29
- - Optimization metadata prefixed with _ (ignored when loading)
30
-
31
- Args:
32
- config: The AblationConfig to export
33
- metrics: Optimization metrics (score, tokens, etc.)
34
- path: Path to write the YAML file
35
-
36
- Example output:
37
- name: compaction_head10_tail40
38
- enable_message_compaction: true
39
- compaction_head_size: 10
40
- ...
41
- _optimization:
42
- timestamp: "2026-01-26T14:30:22"
43
- avg_score: 0.89
44
- avg_tokens: 12400
45
- """
46
- data = asdict(config)
47
- data["_optimization"] = metrics
48
- path.parent.mkdir(parents=True, exist_ok=True)
49
- path.write_text(yaml.dump(data, default_flow_style=False, sort_keys=False))
50
-
51
-
52
- def load_config(path: Path) -> AblationConfig:
53
- """Load an AblationConfig from a YAML file.
54
-
55
- Ignores any keys prefixed with _ (optimization metadata).
56
-
57
- Args:
58
- path: Path to the YAML config file
59
-
60
- Returns:
61
- AblationConfig instance
62
-
63
- Raises:
64
- FileNotFoundError: If the config file doesn't exist
65
- ValueError: If the config is invalid
66
- """
67
- if not path.exists():
68
- raise FileNotFoundError(f"Config file not found: {path}")
69
-
70
- data = yaml.safe_load(path.read_text())
71
-
72
- # Filter out metadata keys (prefixed with _)
73
- config_data = {k: v for k, v in data.items() if not k.startswith("_")}
74
-
75
- try:
76
- return AblationConfig(**config_data)
77
- except TypeError as e:
78
- raise ValueError(f"Invalid config file {path}: {e}") from e
79
-
80
-
81
- def export_optimization_configs(
82
- summaries: list[dict[str, Any]],
83
- pareto_names: list[str],
84
- output_dir: Path,
85
- timestamp: str,
86
- ) -> dict[str, Path]:
87
- """Export all notable configs from an optimization run.
88
-
89
- Exports:
90
- - best_score.yaml: Highest quality config
91
- - best_cost.yaml: Lowest token usage config
92
- - best_efficiency.yaml: Best score/token ratio
93
- - pareto/<name>.yaml: All Pareto-optimal configs
94
-
95
- Args:
96
- summaries: List of ConfigSummary dicts with metrics
97
- pareto_names: Names of Pareto-optimal configs
98
- output_dir: Directory to write configs
99
- timestamp: Optimization timestamp for metadata
100
-
101
- Returns:
102
- Dict mapping config type to file path
103
- """
104
- configs_dir = output_dir / "configs"
105
- configs_dir.mkdir(parents=True, exist_ok=True)
106
-
107
- exported: dict[str, Path] = {}
108
-
109
- if not summaries:
110
- return exported
111
-
112
- # Find best by different criteria
113
- best_score = max(summaries, key=lambda s: s.get("avg_score", 0))
114
- best_cost = min(summaries, key=lambda s: s.get("avg_tokens", float("inf")))
115
- best_efficiency = max(
116
- summaries,
117
- key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
118
- )
119
-
120
- # Export best configs
121
- for label, summary in [
122
- ("best_score", best_score),
123
- ("best_cost", best_cost),
124
- ("best_efficiency", best_efficiency),
125
- ]:
126
- config = _summary_to_config(summary)
127
- metrics = _extract_metrics(summary, timestamp, label)
128
- path = configs_dir / f"{label}.yaml"
129
- export_config(config, metrics, path)
130
- exported[label] = path
131
-
132
- # Export Pareto-optimal configs
133
- pareto_dir = configs_dir / "pareto"
134
- pareto_dir.mkdir(exist_ok=True)
135
-
136
- for summary in summaries:
137
- name = summary.get("name", "unknown")
138
- if name in pareto_names:
139
- config = _summary_to_config(summary)
140
- metrics = _extract_metrics(summary, timestamp, "pareto")
141
- metrics["is_pareto_optimal"] = True
142
- path = pareto_dir / f"{name}.yaml"
143
- export_config(config, metrics, path)
144
- exported[f"pareto/{name}"] = path
145
-
146
- return exported
147
-
148
-
149
- def _summary_to_config(summary: dict[str, Any]) -> AblationConfig:
150
- """Convert a summary dict back to an AblationConfig."""
151
- # Extract config fields from summary
152
- config_fields = {
153
- "name": summary.get("name", "unknown"),
154
- "enable_message_compaction": summary.get("enable_message_compaction", True),
155
- "enable_memory_tool": summary.get("enable_memory_tool", True),
156
- "enable_sub_agent": summary.get("enable_sub_agent", False),
157
- "compaction_head_size": summary.get("compaction_head_size", 10),
158
- "compaction_tail_size": summary.get("compaction_tail_size", 40),
159
- "bash_timeout": summary.get("bash_timeout", 120),
160
- }
161
-
162
- # Also check nested config if present
163
- if "config" in summary:
164
- config_fields.update(summary["config"])
165
-
166
- return AblationConfig(**config_fields)
167
-
168
-
169
- def _extract_metrics(
170
- summary: dict[str, Any],
171
- timestamp: str,
172
- selection_reason: str,
173
- ) -> dict[str, Any]:
174
- """Extract optimization metrics from a summary."""
175
- return {
176
- "timestamp": timestamp,
177
- "selection_reason": selection_reason,
178
- "avg_score": summary.get("avg_score", 0),
179
- "avg_tokens": summary.get("avg_tokens", 0),
180
- "avg_duration": summary.get("avg_duration", 0),
181
- "pass_rate": summary.get("pass_rate", 0),
182
- "pareto_rank": summary.get("pareto_rank"),
183
- "is_pareto_optimal": summary.get("is_pareto_optimal", False),
184
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/experiments/models.py ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Core data models for the optimization framework.
4
+
5
+ Defines:
6
+ - CompactionConfig: Extensible compaction strategy configuration
7
+ - Agent: Framework-agnostic agent definition (what the customer brings)
8
+ - Candidate: A mutated agent variant produced by optimization
9
+ - CandidateStrategy: Protocol for generating candidates from a base agent
10
+ - GridSearchStrategy: Brute-force grid search over parameter combinations
11
+ - TOOL_PRESETS: Standard tool configurations for agents
12
+ - resolve_tools: Normalize tool specification to dict form
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import asdict, dataclass, field
18
+ from itertools import product as itertools_product
19
+ from pathlib import Path
20
+ from typing import Any, Protocol, runtime_checkable
21
+
22
+ import yaml
23
+
24
+
25
+ # =============================================================================
26
+ # Tool Configuration
27
+ # =============================================================================
28
+
29
# Tool presets define common tool configurations.
# Each preset maps tool names to their configuration dicts
# (an empty dict means "use the tool's defaults").
TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
    # "full": the standard toolset plus a delegating sub-agent.
    "full": {
        "read_file": {},
        "write_file": {},
        "list_directory": {},
        "grep_search": {},
        "bash_execute": {"timeout": 120},
        "check_processes": {},
        "python_repl": {},
        "think": {},
        "task_done": {},
        "memory": {},
        "sub_agent": {"model": "gpt-4o-mini"},
    },
    # "standard": file access, search, execution, reflection, and memory.
    "standard": {
        "read_file": {},
        "write_file": {},
        "list_directory": {},
        "grep_search": {},
        "bash_execute": {"timeout": 120},
        "check_processes": {},
        "python_repl": {},
        "think": {},
        "task_done": {},
        "memory": {},
    },
    # "minimal": smallest toolset that can still modify the workspace.
    "minimal": {
        "read_file": {},
        "write_file": {},
        "bash_execute": {"timeout": 120},
        "task_done": {},
    },
    # "readonly": inspection only — no writes, no command execution.
    "readonly": {
        "read_file": {},
        "list_directory": {},
        "grep_search": {},
        "think": {},
        "task_done": {},
    },
}
71
+
72
+
73
+ def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]]) -> dict[str, dict[str, Any]]:
74
+ """Normalize tool specification to dict form.
75
+
76
+ Accepts three input formats:
77
+ - str: Preset name (e.g., "standard", "minimal", "full", "readonly")
78
+ - list[str]: List of tool names with default configs
79
+ - dict[str, dict]: Full specification with per-tool configs
80
+
81
+ Args:
82
+ tools: Tool specification in any supported format
83
+
84
+ Returns:
85
+ Dict mapping tool names to their configuration dicts
86
+
87
+ Raises:
88
+ ValueError: If preset name is unknown
89
+
90
+ Example:
91
+ >>> resolve_tools("standard")
92
+ {"read_file": {}, "write_file": {}, ...}
93
+
94
+ >>> resolve_tools(["read_file", "bash_execute"])
95
+ {"read_file": {}, "bash_execute": {}}
96
+
97
+ >>> resolve_tools({"bash_execute": {"timeout": 60}})
98
+ {"bash_execute": {"timeout": 60}}
99
+ """
100
+ if isinstance(tools, str):
101
+ if tools not in TOOL_PRESETS:
102
+ raise ValueError(f"Unknown tool preset: {tools}. Available: {list(TOOL_PRESETS.keys())}")
103
+ # Return a copy to prevent mutation of the preset
104
+ return {k: dict(v) for k, v in TOOL_PRESETS[tools].items()}
105
+ elif isinstance(tools, list):
106
+ return {name: {} for name in tools}
107
+ else:
108
+ # Already a dict, return a copy
109
+ return {k: dict(v) for k, v in tools.items()}
110
+
111
+
112
@dataclass
class CompactionConfig:
    """Extensible compaction strategy configuration.

    Uses a tagged-union pattern: ``strategy`` names the algorithm and
    ``params`` carries its strategy-specific settings. Known strategies:

    - "head_tail": keep the first N and the last M messages (default)
    - "last_n":    keep only the last N messages
    - "none":      disable compaction entirely

    New strategies (e.g. "summarize") are just new (strategy, params)
    combinations and require no changes to existing code.

    Attributes:
        strategy: The compaction strategy name
        params: Strategy-specific parameters
    """

    strategy: str = "head_tail"
    params: dict[str, Any] = field(default_factory=lambda: {"head_size": 10, "tail_size": 40})

    @staticmethod
    def head_tail(head_size: int = 10, tail_size: int = 40) -> CompactionConfig:
        """Build a config that keeps the first/last slices of the history."""
        return CompactionConfig(
            strategy="head_tail",
            params={"head_size": head_size, "tail_size": tail_size},
        )

    @staticmethod
    def last_n(n: int = 50) -> CompactionConfig:
        """Build a config that keeps only the trailing N messages."""
        return CompactionConfig(strategy="last_n", params={"n": n})

    @staticmethod
    def none() -> CompactionConfig:
        """Build a config with compaction switched off."""
        return CompactionConfig(strategy="none", params={})

    @property
    def enabled(self) -> bool:
        """True unless the strategy is "none"."""
        return self.strategy != "none"

    @property
    def head_size(self) -> int:
        """Head size for the head_tail strategy; 0 for any other strategy."""
        return self.params.get("head_size", 0)

    @property
    def tail_size(self) -> int:
        """Tail size for the head_tail strategy; 0 for any other strategy."""
        return self.params.get("tail_size", 0)
161
+
162
+
163
@dataclass
class Agent:
    """Framework-agnostic agent definition.

    This is what the customer brings to the optimization service.
    It describes the agent's identity, model, tools, and context
    engineering settings — everything needed to instantiate and
    run the agent on any supported framework harness.

    Attributes:
        name: Unique identifier for this agent
        description: Human-readable description
        instructions: System prompt / instructions (optional, uses framework default if None)
        model: Model deployment name (e.g., "gpt-4o")
        compaction: Compaction strategy configuration
        tools: Tool configuration - can be:
            - str: Preset name ("standard", "minimal", "full", "readonly")
            - list[str]: List of tool names with default configs
            - dict[str, dict]: Full specification with per-tool configs
    """

    name: str
    description: str = ""
    instructions: str | None = None  # None -> framework-default system prompt
    model: str | None = None  # None presumably falls back to a harness default — TODO confirm
    compaction: CompactionConfig = field(default_factory=CompactionConfig)  # head_tail(10, 40) by default
    tools: str | list[str] | dict[str, dict[str, Any]] = "standard"  # normalized via resolve_tools()
190
+
191
+
192
@dataclass
class Candidate:
    """A mutated agent variant produced by the optimization process.

    Each candidate is derived from a base Agent with specific mutations
    applied. The mutations dict records what was changed, and the
    rationale explains why.

    Attributes:
        agent: The mutated agent configuration
        mutations: Dict describing what was changed from the base
        rationale: Human-readable explanation of why this candidate exists
    """

    agent: Agent
    mutations: dict[str, Any] = field(default_factory=dict)  # e.g. {"tools": "minimal"}; empty for baseline
    rationale: str = ""  # e.g. "Grid search: tools=minimal"
209
+
210
+
211
@dataclass
class ExperimentResult:
    """Result of running a single experiment (one candidate on one task).

    Attributes:
        candidate: The candidate configuration that was executed.
        run_result: RunResult from types.py — typed as Any here,
            presumably to avoid importing the runner types in this
            module; confirm against callers.
        metrics: TraceMetrics from metrics.py (same Any-typing caveat).
        eval_score: Evaluator score; 0.0 until an evaluation runs.
        eval_passed: Whether the evaluator judged the run as passing.
        eval_reasoning: Evaluator's textual justification.
    """

    candidate: Candidate
    run_result: Any  # RunResult from types.py
    metrics: Any  # TraceMetrics from metrics.py
    eval_score: float = 0.0
    eval_passed: bool = False
    eval_reasoning: str = ""
221
+
222
+
223
@runtime_checkable
class CandidateStrategy(Protocol):
    """Protocol for generating candidate variants from a base agent.

    Decorated with @runtime_checkable so implementations can be
    verified with isinstance() at runtime.

    Implementations explore different regions of the optimization space:
    - GridSearchStrategy: Exhaustive grid over parameter combinations
    - (Future) HeuristicStrategy: Rule-based mutations from telemetry
    - (Future) BayesianStrategy: Bayesian optimization over parameters
    """

    def generate(self, base: Agent, budget: int) -> list[Candidate]:
        """Generate candidate variants from a base agent.

        Args:
            base: The base agent to mutate
            budget: Maximum number of candidates to generate

        Returns:
            List of Candidate objects (at most `budget` items)
        """
        ...
244
+
245
+
246
class GridSearchStrategy:
    """Brute-force grid search over parameter combinations.

    Generates candidates by taking the Cartesian product of all
    specified parameter variations.

    Example:
        strategy = GridSearchStrategy(variations={
            "tools": ["standard", "minimal", "full"],
            "compaction": [
                CompactionConfig.head_tail(10, 40),
                CompactionConfig.head_tail(5, 20),
                CompactionConfig.none(),
            ],
        })
        candidates = strategy.generate(base_agent, budget=20)
    """

    def __init__(self, variations: dict[str, list[Any]]) -> None:
        """Initialize with parameter variations.

        Args:
            variations: Dict mapping Agent field names to lists of values to try.
                Special keys:
                - "compaction": Accepts CompactionConfig objects
                - "tools": Accepts preset strings, lists, or dicts
        """
        self.variations = variations

    def generate(self, base: Agent, budget: int) -> list[Candidate]:
        """Generate all grid combinations up to budget.

        With no variations, returns the base agent as a single
        "baseline" candidate. Otherwise iterates the Cartesian product
        of all variation values and stops once `budget` candidates
        have been produced.
        """
        if not self.variations:
            return [Candidate(agent=base, mutations={}, rationale="baseline")]

        param_names = list(self.variations.keys())
        param_values = list(self.variations.values())

        candidates = []
        for values in itertools_product(*param_values):
            # Budget cap: product size can explode combinatorially.
            if len(candidates) >= budget:
                break

            # strict=True guards against a names/values length mismatch.
            mutations = dict(zip(param_names, values, strict=True))

            # Build mutated agent: work on a deep dict copy of the base
            # (asdict also converts the nested CompactionConfig to a dict),
            # then overlay the mutation values. Keys that are not Agent
            # fields are silently skipped here but still recorded in
            # `mutations` below.
            agent_dict = asdict(base)
            for key, value in mutations.items():
                if key == "compaction" and isinstance(value, CompactionConfig):
                    agent_dict["compaction"] = asdict(value)
                elif key in agent_dict:
                    agent_dict[key] = value

            # Reconstruct CompactionConfig from dict
            comp_data = agent_dict.pop("compaction")
            if isinstance(comp_data, dict):
                compaction = CompactionConfig(**comp_data)
            else:
                compaction = comp_data

            # Handle tools field - keep as-is (str, list, or dict)
            tools = agent_dict.pop("tools", "standard")

            mutated = Agent(
                **{k: v for k, v in agent_dict.items() if k not in ("compaction", "tools")},
                compaction=compaction,
                tools=tools,
            )

            # Build a human-readable name from the mutations; insertion
            # order of `mutations` (and hence `variations`) determines
            # the segment order.
            name_parts = []
            for k, v in mutations.items():
                if isinstance(v, CompactionConfig):
                    name_parts.append(f"{v.strategy}")
                    if v.strategy == "head_tail":
                        name_parts.append(f"h{v.head_size}_t{v.tail_size}")
                elif k == "tools":
                    # Format tools for name
                    if isinstance(v, str):
                        name_parts.append(f"tools={v}")
                    elif isinstance(v, list):
                        name_parts.append(f"tools=[{len(v)}]")
                    else:
                        # NOTE(review): dict specs get the same "[len]" label
                        # as lists, so two different specs of equal size are
                        # indistinguishable by name — confirm this is intended.
                        name_parts.append(f"tools=[{len(v)}]")
                elif isinstance(v, bool):
                    name_parts.append(f"{k}={'on' if v else 'off'}")
                else:
                    name_parts.append(f"{k}={v}")

            mutated.name = f"{base.name}_{'_'.join(name_parts)}"

            # Serialize mutations for storage (convert non-serializable types)
            serializable_mutations = {}
            for k, v in mutations.items():
                if isinstance(v, CompactionConfig):
                    serializable_mutations[k] = asdict(v)
                else:
                    serializable_mutations[k] = v

            candidates.append(Candidate(
                agent=mutated,
                mutations=serializable_mutations,
                rationale=f"Grid search: {', '.join(name_parts)}",
            ))

        return candidates
351
+
352
+
353
+ # =============================================================================
354
+ # Agent YAML Export / Import
355
+ # =============================================================================
356
+
357
+
358
def export_agent(
    agent: Agent,
    path: Path,
    metrics: dict[str, Any] | None = None,
) -> None:
    """Export an Agent as a reusable YAML file.

    Args:
        agent: The Agent to export
        path: Path to write the YAML file
        metrics: Optional optimization metrics. Stored under the
            "_optimization" key, which load_agent ignores on re-import.
    """
    payload = asdict(agent)
    if metrics:
        payload["_optimization"] = metrics
    # Make sure the destination directory exists before writing.
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = yaml.dump(payload, default_flow_style=False, sort_keys=False)
    path.write_text(serialized)
375
+
376
+
377
def load_agent(path: Path) -> Agent:
    """Load an Agent from a YAML file.

    Keys prefixed with _ (optimization metadata written by export_agent)
    are ignored.

    Args:
        path: Path to the YAML config file

    Returns:
        Agent instance

    Raises:
        FileNotFoundError: If the file doesn't exist
        ValueError: If the config is invalid
    """
    if not path.exists():
        raise FileNotFoundError(f"Agent config file not found: {path}")

    raw = yaml.safe_load(path.read_text())
    cfg = {key: value for key, value in raw.items() if not key.startswith("_")}

    # compaction is serialized as a plain mapping; rebuild the dataclass.
    if isinstance(cfg.get("compaction"), dict):
        cfg["compaction"] = CompactionConfig(**cfg["compaction"])

    try:
        return Agent(**cfg)
    except TypeError as e:
        # Unknown/missing fields surface as TypeError from the constructor.
        raise ValueError(f"Invalid agent config file {path}: {e}") from e
406
+
407
+
408
def export_optimization_results(
    summaries: list[dict[str, Any]],
    pareto_names: list[str],
    output_dir: Path,
    timestamp: str,
) -> dict[str, Path]:
    """Export notable agents from an optimization run as YAML files.

    Exports:
    - best_score.yaml: Highest quality agent
    - best_cost.yaml: Lowest token usage agent
    - best_efficiency.yaml: Best score/token ratio
    - pareto/<name>.yaml: All Pareto-optimal agents

    Args:
        summaries: List of summary dicts with metrics
        pareto_names: Names of Pareto-optimal agents
        output_dir: Directory to write agent files
        timestamp: Optimization timestamp for metadata

    Returns:
        Dict mapping label to file path (empty if `summaries` is empty)
    """
    configs_dir = output_dir / "agents"
    configs_dir.mkdir(parents=True, exist_ok=True)

    exported: dict[str, Path] = {}

    if not summaries:
        return exported

    best_score = max(summaries, key=lambda s: s.get("avg_score", 0))
    best_cost = min(summaries, key=lambda s: s.get("avg_tokens", float("inf")))
    # Efficiency = quality per token; max(..., 1) guards against a
    # zero or missing token count.
    best_efficiency = max(
        summaries,
        key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
    )

    for label, summary in [
        ("best_score", best_score),
        ("best_cost", best_cost),
        ("best_efficiency", best_efficiency),
    ]:
        agent = _summary_to_agent(summary)
        metrics = _extract_metrics(summary, timestamp, label)
        path = configs_dir / f"{label}.yaml"
        export_agent(agent, path, metrics)
        exported[label] = path

    # Export Pareto-optimal agents
    pareto_dir = configs_dir / "pareto"
    pareto_dir.mkdir(exist_ok=True)

    # Set membership is O(1); the original tested against the list per
    # summary, which is O(len(summaries) * len(pareto_names)).
    pareto_set = set(pareto_names)

    for summary in summaries:
        name = summary.get("name", "unknown")
        if name in pareto_set:
            agent = _summary_to_agent(summary)
            metrics = _extract_metrics(summary, timestamp, "pareto")
            metrics["is_pareto_optimal"] = True
            path = pareto_dir / f"{name}.yaml"
            export_agent(agent, path, metrics)
            exported[f"pareto/{name}"] = path

    return exported
472
+
473
+
474
def _summary_to_agent(summary: dict[str, Any]) -> Agent:
    """Convert a summary dict back to an Agent.

    Prefers the nested "agent" dict; falls back to legacy flat summary
    fields. Works on a copy of the nested dict so the caller's summary
    is never mutated (the original replaced summary["agent"]["compaction"]
    in place with a CompactionConfig instance).

    Args:
        summary: Summary dict, optionally containing a nested "agent" dict.

    Returns:
        Agent reconstructed from the summary.
    """
    agent_data = summary.get("agent", {})
    if agent_data:
        # Copy before rehydrating so the caller's dict keeps plain values.
        agent_data = dict(agent_data)
        if isinstance(agent_data.get("compaction"), dict):
            agent_data["compaction"] = CompactionConfig(**agent_data["compaction"])
        # tools field can be str, list, or dict - all are valid, keep as-is
        return Agent(**agent_data)

    # Fallback: build from flat summary fields (legacy format)
    if summary.get("enable_message_compaction", True):
        compaction = CompactionConfig.head_tail(
            head_size=summary.get("compaction_head_size", 10),
            tail_size=summary.get("compaction_tail_size", 40),
        )
    else:
        compaction = CompactionConfig.none()

    # Determine tools from legacy fields if present
    tools: str | list[str] | dict[str, dict[str, Any]] = summary.get("tools", "standard")

    return Agent(
        name=summary.get("name", "unknown"),
        compaction=compaction,
        tools=tools,
    )
500
+
501
+
502
+ def _extract_metrics(
503
+ summary: dict[str, Any],
504
+ timestamp: str,
505
+ selection_reason: str,
506
+ ) -> dict[str, Any]:
507
+ """Extract optimization metrics from a summary."""
508
+ return {
509
+ "timestamp": timestamp,
510
+ "selection_reason": selection_reason,
511
+ "avg_score": summary.get("avg_score", 0),
512
+ "avg_tokens": summary.get("avg_tokens", 0),
513
+ "avg_duration": summary.get("avg_duration", 0),
514
+ "pass_rate": summary.get("pass_rate", 0),
515
+ "pareto_rank": summary.get("pareto_rank"),
516
+ "is_pareto_optimal": summary.get("is_pareto_optimal", False),
517
+ }
src/flow/experiments/optimizer.py CHANGED
@@ -3,7 +3,7 @@
3
  """Optimizer service for finding best agent configurations.
4
 
5
  Runs experiments in parallel, evaluates with LLM-as-Judge,
6
- ranks via Pareto analysis, and exports reusable configs.
7
  """
8
 
9
  from __future__ import annotations
@@ -15,31 +15,32 @@ import os
15
  from collections.abc import Callable
16
  from dataclasses import asdict, dataclass, field
17
  from datetime import datetime
18
- from itertools import product
19
  from pathlib import Path
20
  from typing import Any
21
 
22
  from openai import AsyncAzureOpenAI
23
 
24
  from .ablation import (
25
- AblationConfig,
26
  compute_pareto_frontier,
27
- create_harness_from_config,
28
  )
29
- from .config_export import export_optimization_configs
30
  from .evaluators import LLMEvaluator
31
  from .metrics import TraceMetrics, extract_metrics
 
 
 
 
32
  from .runner import FlowExperimentRunner, setup_tracing
33
- from .types import EvalCriterion, RunResult, Task
34
 
35
  logger = logging.getLogger(__name__)
36
 
37
 
38
  @dataclass
39
  class TaskResult:
40
- """Result for a single config-task pair."""
41
 
42
- config_name: str
43
  task_name: str
44
  run_result: RunResult
45
  metrics: TraceMetrics
@@ -49,12 +50,12 @@ class TaskResult:
49
 
50
 
51
  @dataclass
52
- class ConfigSummary:
53
- """Aggregated summary for a configuration across all tasks."""
54
 
55
  name: str
56
- config: AblationConfig
57
- task_results: list[TaskResult] = field(default_factory=list)
58
 
59
  # Aggregated metrics
60
  avg_score: float = 0.0
@@ -72,7 +73,9 @@ class ConfigSummary:
72
  """Convert to dictionary for serialization."""
73
  return {
74
  "name": self.name,
75
- "config": asdict(self.config),
 
 
76
  "avg_score": self.avg_score,
77
  "avg_tokens": self.avg_tokens,
78
  "avg_duration": self.avg_duration,
@@ -90,21 +93,21 @@ class OptimizationResult:
90
 
91
  timestamp: str
92
  output_dir: Path
93
- summaries: list[ConfigSummary]
94
  pareto_frontier: list[str]
95
- exported_configs: dict[str, Path]
96
 
97
  # Rankings
98
- rank_by_score: list[str] = field(default_factory=list)
99
- rank_by_tokens: list[str] = field(default_factory=list)
100
- rank_by_efficiency: list[str] = field(default_factory=list)
101
 
102
  # Stats
103
  total_experiments: int = 0
104
  total_duration_seconds: float = 0.0
105
 
106
- def get_best_config(self, criterion: str = "score") -> ConfigSummary | None:
107
- """Get the best config by a criterion."""
108
  if criterion == "score":
109
  names = self.rank_by_score
110
  elif criterion == "tokens":
@@ -126,17 +129,18 @@ class OptimizationResult:
126
  class FlowOptimizer:
127
  """Optimizer for finding best agent configurations.
128
 
129
- Runs experiments in parallel, evaluates results, performs
130
- Pareto analysis, and exports winning configs.
 
131
 
132
  Example:
 
 
 
133
  optimizer = FlowOptimizer(parallel=4)
134
- configs = [
135
- AblationConfig(name="baseline", enable_message_compaction=False),
136
- AblationConfig(name="compaction", enable_message_compaction=True),
137
- ]
138
- tasks = [Task(name="test", prompt="Create hello world")]
139
- result = await optimizer.optimize(configs, tasks)
140
  print(f"Best: {result.rank_by_score[0]}")
141
  """
142
 
@@ -146,69 +150,55 @@ class FlowOptimizer:
146
  use_llm_evaluator: bool = True,
147
  output_dir: Path | None = None,
148
  ) -> None:
149
- """Initialize the optimizer.
150
-
151
- Args:
152
- parallel: Max concurrent experiments
153
- use_llm_evaluator: Whether to use LLM for evaluation
154
- output_dir: Base directory for results
155
- """
156
  self.parallel = parallel
157
  self.use_llm_evaluator = use_llm_evaluator
158
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
159
 
160
  async def optimize(
161
  self,
162
- configs: list[AblationConfig],
163
  tasks: list[Task],
164
  progress_callback: Callable[[int, int, str, str], None] | None = None,
165
  ) -> OptimizationResult:
166
- """Run optimization across all configs and tasks.
167
 
168
  Args:
169
- configs: Configurations to test
170
- tasks: Tasks to run each config on
171
- progress_callback: Optional callback(completed, total, config, task)
172
 
173
  Returns:
174
- OptimizationResult with rankings and exported configs
175
  """
176
  start_time = datetime.now()
177
  timestamp = start_time.strftime("%Y%m%d_%H%M%S")
178
  run_dir = self.output_dir / timestamp
179
  run_dir.mkdir(parents=True, exist_ok=True)
180
 
181
- # Setup
182
  setup_tracing("flow-optimizer")
183
- self._save_config(configs, tasks, run_dir)
184
 
185
  print("=" * 70)
186
  print(" FLOW OPTIMIZER")
187
  print("=" * 70)
188
- print(f" Configs: {len(configs)}")
189
- print(f" Tasks: {len(tasks)}")
190
- print(f" Total: {len(configs) * len(tasks)} experiments")
191
- print(f" Parallel: {self.parallel}")
192
- print(f" Output: {run_dir}")
193
  print("=" * 70)
194
 
195
- # Create LLM evaluator if needed
196
  evaluator = None
197
  if self.use_llm_evaluator:
198
  evaluator = self._create_evaluator()
199
 
200
- # Run all experiments in parallel
201
  task_results = await self._run_parallel(
202
- configs, tasks, run_dir, evaluator, progress_callback
203
  )
204
 
205
- # Aggregate by config
206
- summaries = self._aggregate_results(task_results, configs)
207
-
208
- # Pareto analysis
209
  pareto_names = self._compute_pareto(summaries)
210
 
211
- # Compute rankings
212
  rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
213
  rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
214
  rank_by_efficiency = sorted(
@@ -217,9 +207,8 @@ class FlowOptimizer:
217
  reverse=True,
218
  )
219
 
220
- # Export configs
221
  summary_dicts = [s.to_dict() for s in summaries]
222
- exported = export_optimization_configs(
223
  summary_dicts, pareto_names, run_dir, timestamp
224
  )
225
 
@@ -230,7 +219,7 @@ class FlowOptimizer:
230
  output_dir=run_dir,
231
  summaries=summaries,
232
  pareto_frontier=pareto_names,
233
- exported_configs=exported,
234
  rank_by_score=[s.name for s in rank_by_score],
235
  rank_by_tokens=[s.name for s in rank_by_tokens],
236
  rank_by_efficiency=[s.name for s in rank_by_efficiency],
@@ -238,56 +227,49 @@ class FlowOptimizer:
238
  total_duration_seconds=(end_time - start_time).total_seconds(),
239
  )
240
 
241
- # Save results
242
  self._save_results(result, run_dir)
243
-
244
- # Print summary
245
  self._print_summary(result)
246
 
247
  return result
248
 
249
  async def _run_parallel(
250
  self,
251
- configs: list[AblationConfig],
252
  tasks: list[Task],
253
  run_dir: Path,
254
  evaluator: LLMEvaluator | None,
255
  progress_callback: Callable[[int, int, str, str], None] | None,
256
  ) -> list[TaskResult]:
257
- """Run all config-task pairs in parallel with semaphore control."""
258
  semaphore = asyncio.Semaphore(self.parallel)
259
- total = len(configs) * len(tasks)
260
  completed = 0
261
  lock = asyncio.Lock()
262
 
263
- async def run_one(config: AblationConfig, task: Task) -> TaskResult:
264
  nonlocal completed
265
  async with semaphore:
266
- workspace = run_dir / "workspaces" / config.name / task.name
267
  workspace.mkdir(parents=True, exist_ok=True)
268
 
269
- result = await self._run_single(config, task, workspace, evaluator)
270
 
271
  async with lock:
272
  completed += 1
273
  status = "✓" if result.eval_passed else "✗"
274
  print(
275
- f" [{completed}/{total}] {config.name}/{task.name}: "
276
  f"{status} score={result.eval_score:.2f} "
277
  f"tokens={result.metrics.total_tokens:,}"
278
  )
279
  if progress_callback:
280
- progress_callback(completed, total, config.name, task.name)
281
 
282
  return result
283
 
284
- # Create all tasks
285
- coroutines = [run_one(config, task) for config in configs for task in tasks]
286
-
287
- # Run with gather
288
  gather_results = await asyncio.gather(*coroutines, return_exceptions=True)
289
 
290
- # Filter out exceptions
291
  valid_results: list[TaskResult] = []
292
  for r in gather_results:
293
  if isinstance(r, BaseException):
@@ -299,33 +281,31 @@ class FlowOptimizer:
299
 
300
  async def _run_single(
301
  self,
302
- config: AblationConfig,
303
  task: Task,
304
  workspace: Path,
305
  evaluator: LLMEvaluator | None,
306
  ) -> TaskResult:
307
- """Run a single config-task experiment."""
308
- harness = create_harness_from_config(config, workspace)
309
 
310
  try:
311
  runner = FlowExperimentRunner(keep_workspace=True)
312
  run_result = await runner.run(harness, task, workspace=workspace)
313
  metrics = extract_metrics(run_result.trace)
314
 
315
- # Evaluate
316
  if evaluator:
317
  eval_result = await evaluator.evaluate(run_result)
318
  eval_score = eval_result.score
319
  eval_passed = eval_result.passed
320
  eval_reasoning = eval_result.reasoning
321
  else:
322
- # Simple heuristic: passed if no error
323
  eval_score = 1.0 if run_result.success else 0.0
324
  eval_passed = run_result.success
325
  eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"
326
 
327
  return TaskResult(
328
- config_name=config.name,
329
  task_name=task.name,
330
  run_result=run_result,
331
  metrics=metrics,
@@ -339,25 +319,25 @@ class FlowOptimizer:
339
  def _aggregate_results(
340
  self,
341
  task_results: list[TaskResult],
342
- configs: list[AblationConfig],
343
- ) -> list[ConfigSummary]:
344
- """Aggregate task results into config summaries."""
345
- config_map = {c.name: c for c in configs}
346
- results_by_config: dict[str, list[TaskResult]] = {c.name: [] for c in configs}
347
 
348
  for result in task_results:
349
- if result.config_name in results_by_config:
350
- results_by_config[result.config_name].append(result)
351
 
352
  summaries = []
353
- for name, results in results_by_config.items():
354
  if not results:
355
  continue
356
 
357
- config = config_map[name]
358
- summary = ConfigSummary(
359
  name=name,
360
- config=config,
361
  task_results=results,
362
  avg_score=sum(r.eval_score for r in results) / len(results),
363
  avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
@@ -370,19 +350,17 @@ class FlowOptimizer:
370
 
371
  return summaries
372
 
373
- def _compute_pareto(self, summaries: list[ConfigSummary]) -> list[str]:
374
  """Compute Pareto frontier (maximize score, minimize tokens)."""
375
- # Use shared utility
376
  pareto_names = compute_pareto_frontier(summaries)
377
 
378
- # Mark summaries with Pareto status
379
  for summary in summaries:
380
  if summary.name in pareto_names:
381
  summary.is_pareto_optimal = True
382
  summary.pareto_rank = 0
383
  else:
384
  summary.is_pareto_optimal = False
385
- summary.pareto_rank = 1 # Simplified: all non-Pareto get rank 1
386
 
387
  return pareto_names
388
 
@@ -410,7 +388,7 @@ class FlowOptimizer:
410
 
411
  def _save_config(
412
  self,
413
- configs: list[AblationConfig],
414
  tasks: list[Task],
415
  run_dir: Path,
416
  ) -> None:
@@ -418,7 +396,7 @@ class FlowOptimizer:
418
  with open(run_dir / "optimization_config.json", "w") as f:
419
  json.dump(
420
  {
421
- "configs": [asdict(c) for c in configs],
422
  "tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
423
  "parallel": self.parallel,
424
  "use_llm_evaluator": self.use_llm_evaluator,
@@ -437,7 +415,7 @@ class FlowOptimizer:
437
  "rank_by_score": result.rank_by_score,
438
  "rank_by_tokens": result.rank_by_tokens,
439
  "rank_by_efficiency": result.rank_by_efficiency,
440
- "exported_configs": {k: str(v) for k, v in result.exported_configs.items()},
441
  "summaries": [s.to_dict() for s in result.summaries],
442
  }
443
 
@@ -450,8 +428,7 @@ class FlowOptimizer:
450
  print(" OPTIMIZATION RESULTS")
451
  print("=" * 70)
452
 
453
- # Rankings table
454
- print(f"\n{'Config':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
455
  print("-" * 65)
456
 
457
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
@@ -465,62 +442,19 @@ class FlowOptimizer:
465
  print(f"Pareto frontier: {result.pareto_frontier}")
466
  print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
467
  print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
468
- print("\nExported configs:")
469
- for name, path in result.exported_configs.items():
470
  print(f" {name}: {path}")
471
  print(f"\nResults saved to: {result.output_dir}")
472
 
473
 
474
- def generate_grid_configs(
475
- base_name: str,
476
- variations: dict[str, list[Any]],
477
- ) -> list[AblationConfig]:
478
- """Generate configs from a variation grid.
479
-
480
- Args:
481
- base_name: Base name for generated configs
482
- variations: Dict of param_name -> list of values
483
-
484
- Returns:
485
- List of AblationConfig for each combination
486
-
487
- Example:
488
- configs = generate_grid_configs("grid", {
489
- "enable_message_compaction": [True, False],
490
- "compaction_head_size": [5, 10, 20],
491
- })
492
- """
493
- if not variations:
494
- return [AblationConfig(name=base_name)]
495
-
496
- param_names = list(variations.keys())
497
- param_values = list(variations.values())
498
-
499
- configs = []
500
- for values in product(*param_values):
501
- kwargs = dict(zip(param_names, values, strict=True))
502
- name = f"{base_name}_" + "_".join(f"{k}={v}" for k, v in kwargs.items())
503
- configs.append(AblationConfig(name=name, **kwargs))
504
-
505
- return configs
506
-
507
-
508
  def load_tasks_from_jsonl(path: Path) -> list[Task]:
509
  """Load tasks from a JSONL file.
510
 
511
- Each line should be a JSON object with:
512
- - name: Task name
513
- - prompt: Task prompt
514
- - criteria: Optional list of evaluation criteria
515
- - category: Optional category string
516
- - metadata: Optional additional metadata dict
517
-
518
  Args:
519
  path: Path to JSONL file
520
 
521
  Returns:
522
  List of Task objects
523
  """
524
- from flow.experiments.types import _load_tasks_from_jsonl
525
-
526
- return _load_tasks_from_jsonl(path)
 
3
  """Optimizer service for finding best agent configurations.
4
 
5
  Runs experiments in parallel, evaluates with LLM-as-Judge,
6
+ ranks via Pareto analysis, and exports reusable agent configs.
7
  """
8
 
9
  from __future__ import annotations
 
15
  from collections.abc import Callable
16
  from dataclasses import asdict, dataclass, field
17
  from datetime import datetime
 
18
  from pathlib import Path
19
  from typing import Any
20
 
21
  from openai import AsyncAzureOpenAI
22
 
23
  from .ablation import (
 
24
  compute_pareto_frontier,
25
+ create_harness_from_agent,
26
  )
 
27
  from .evaluators import LLMEvaluator
28
  from .metrics import TraceMetrics, extract_metrics
29
+ from .models import (
30
+ Candidate,
31
+ export_optimization_results,
32
+ )
33
  from .runner import FlowExperimentRunner, setup_tracing
34
+ from .types import RunResult, Task, load_tasks_from_jsonl as _load_tasks_impl
35
 
36
  logger = logging.getLogger(__name__)
37
 
38
 
39
  @dataclass
40
  class TaskResult:
41
+ """Result for a single candidate-task pair."""
42
 
43
+ candidate_name: str
44
  task_name: str
45
  run_result: RunResult
46
  metrics: TraceMetrics
 
50
 
51
 
52
  @dataclass
53
+ class CandidateSummary:
54
+ """Aggregated summary for a candidate across all tasks."""
55
 
56
  name: str
57
+ candidate: Candidate
58
+ task_results: list[TaskResult] = field(default_factory=lambda: [])
59
 
60
  # Aggregated metrics
61
  avg_score: float = 0.0
 
73
  """Convert to dictionary for serialization."""
74
  return {
75
  "name": self.name,
76
+ "agent": asdict(self.candidate.agent),
77
+ "mutations": self.candidate.mutations,
78
+ "rationale": self.candidate.rationale,
79
  "avg_score": self.avg_score,
80
  "avg_tokens": self.avg_tokens,
81
  "avg_duration": self.avg_duration,
 
93
 
94
  timestamp: str
95
  output_dir: Path
96
+ summaries: list[CandidateSummary]
97
  pareto_frontier: list[str]
98
+ exported_agents: dict[str, Path]
99
 
100
  # Rankings
101
+ rank_by_score: list[str] = field(default_factory=lambda: [])
102
+ rank_by_tokens: list[str] = field(default_factory=lambda: [])
103
+ rank_by_efficiency: list[str] = field(default_factory=lambda: [])
104
 
105
  # Stats
106
  total_experiments: int = 0
107
  total_duration_seconds: float = 0.0
108
 
109
+ def get_best_candidate(self, criterion: str = "score") -> CandidateSummary | None:
110
+ """Get the best candidate by a criterion."""
111
  if criterion == "score":
112
  names = self.rank_by_score
113
  elif criterion == "tokens":
 
129
  class FlowOptimizer:
130
  """Optimizer for finding best agent configurations.
131
 
132
+ Takes a base Agent and a CandidateStrategy, generates candidates,
133
+ runs experiments in parallel, evaluates results, performs Pareto
134
+ analysis, and exports winning agent configs.
135
 
136
  Example:
137
+ strategy = GridSearchStrategy(variations={
138
+ "enable_memory": [True, False],
139
+ })
140
  optimizer = FlowOptimizer(parallel=4)
141
+ base = Agent(name="my_agent")
142
+ candidates = strategy.generate(base, budget=10)
143
+ result = await optimizer.optimize(candidates, tasks)
 
 
 
144
  print(f"Best: {result.rank_by_score[0]}")
145
  """
146
 
 
150
  use_llm_evaluator: bool = True,
151
  output_dir: Path | None = None,
152
  ) -> None:
 
 
 
 
 
 
 
153
  self.parallel = parallel
154
  self.use_llm_evaluator = use_llm_evaluator
155
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
156
 
157
  async def optimize(
158
  self,
159
+ candidates: list[Candidate],
160
  tasks: list[Task],
161
  progress_callback: Callable[[int, int, str, str], None] | None = None,
162
  ) -> OptimizationResult:
163
+ """Run optimization across all candidates and tasks.
164
 
165
  Args:
166
+ candidates: Candidates to test
167
+ tasks: Tasks to run each candidate on
168
+ progress_callback: Optional callback(completed, total, candidate_name, task_name)
169
 
170
  Returns:
171
+ OptimizationResult with rankings and exported agents
172
  """
173
  start_time = datetime.now()
174
  timestamp = start_time.strftime("%Y%m%d_%H%M%S")
175
  run_dir = self.output_dir / timestamp
176
  run_dir.mkdir(parents=True, exist_ok=True)
177
 
 
178
  setup_tracing("flow-optimizer")
179
+ self._save_config(candidates, tasks, run_dir)
180
 
181
  print("=" * 70)
182
  print(" FLOW OPTIMIZER")
183
  print("=" * 70)
184
+ print(f" Candidates: {len(candidates)}")
185
+ print(f" Tasks: {len(tasks)}")
186
+ print(f" Total: {len(candidates) * len(tasks)} experiments")
187
+ print(f" Parallel: {self.parallel}")
188
+ print(f" Output: {run_dir}")
189
  print("=" * 70)
190
 
 
191
  evaluator = None
192
  if self.use_llm_evaluator:
193
  evaluator = self._create_evaluator()
194
 
 
195
  task_results = await self._run_parallel(
196
+ candidates, tasks, run_dir, evaluator, progress_callback
197
  )
198
 
199
+ summaries = self._aggregate_results(task_results, candidates)
 
 
 
200
  pareto_names = self._compute_pareto(summaries)
201
 
 
202
  rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
203
  rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
204
  rank_by_efficiency = sorted(
 
207
  reverse=True,
208
  )
209
 
 
210
  summary_dicts = [s.to_dict() for s in summaries]
211
+ exported = export_optimization_results(
212
  summary_dicts, pareto_names, run_dir, timestamp
213
  )
214
 
 
219
  output_dir=run_dir,
220
  summaries=summaries,
221
  pareto_frontier=pareto_names,
222
+ exported_agents=exported,
223
  rank_by_score=[s.name for s in rank_by_score],
224
  rank_by_tokens=[s.name for s in rank_by_tokens],
225
  rank_by_efficiency=[s.name for s in rank_by_efficiency],
 
227
  total_duration_seconds=(end_time - start_time).total_seconds(),
228
  )
229
 
 
230
  self._save_results(result, run_dir)
 
 
231
  self._print_summary(result)
232
 
233
  return result
234
 
235
  async def _run_parallel(
236
  self,
237
+ candidates: list[Candidate],
238
  tasks: list[Task],
239
  run_dir: Path,
240
  evaluator: LLMEvaluator | None,
241
  progress_callback: Callable[[int, int, str, str], None] | None,
242
  ) -> list[TaskResult]:
243
+ """Run all candidate-task pairs in parallel with semaphore control."""
244
  semaphore = asyncio.Semaphore(self.parallel)
245
+ total = len(candidates) * len(tasks)
246
  completed = 0
247
  lock = asyncio.Lock()
248
 
249
+ async def run_one(candidate: Candidate, task: Task) -> TaskResult:
250
  nonlocal completed
251
  async with semaphore:
252
+ workspace = run_dir / "workspaces" / candidate.agent.name / task.name
253
  workspace.mkdir(parents=True, exist_ok=True)
254
 
255
+ result = await self._run_single(candidate, task, workspace, evaluator)
256
 
257
  async with lock:
258
  completed += 1
259
  status = "✓" if result.eval_passed else "✗"
260
  print(
261
+ f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
262
  f"{status} score={result.eval_score:.2f} "
263
  f"tokens={result.metrics.total_tokens:,}"
264
  )
265
  if progress_callback:
266
+ progress_callback(completed, total, candidate.agent.name, task.name)
267
 
268
  return result
269
 
270
+ coroutines = [run_one(c, t) for c in candidates for t in tasks]
 
 
 
271
  gather_results = await asyncio.gather(*coroutines, return_exceptions=True)
272
 
 
273
  valid_results: list[TaskResult] = []
274
  for r in gather_results:
275
  if isinstance(r, BaseException):
 
281
 
282
  async def _run_single(
283
  self,
284
+ candidate: Candidate,
285
  task: Task,
286
  workspace: Path,
287
  evaluator: LLMEvaluator | None,
288
  ) -> TaskResult:
289
+ """Run a single candidate-task experiment."""
290
+ harness = create_harness_from_agent(candidate.agent, workspace)
291
 
292
  try:
293
  runner = FlowExperimentRunner(keep_workspace=True)
294
  run_result = await runner.run(harness, task, workspace=workspace)
295
  metrics = extract_metrics(run_result.trace)
296
 
 
297
  if evaluator:
298
  eval_result = await evaluator.evaluate(run_result)
299
  eval_score = eval_result.score
300
  eval_passed = eval_result.passed
301
  eval_reasoning = eval_result.reasoning
302
  else:
 
303
  eval_score = 1.0 if run_result.success else 0.0
304
  eval_passed = run_result.success
305
  eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"
306
 
307
  return TaskResult(
308
+ candidate_name=candidate.agent.name,
309
  task_name=task.name,
310
  run_result=run_result,
311
  metrics=metrics,
 
319
  def _aggregate_results(
320
  self,
321
  task_results: list[TaskResult],
322
+ candidates: list[Candidate],
323
+ ) -> list[CandidateSummary]:
324
+ """Aggregate task results into candidate summaries."""
325
+ candidate_map = {c.agent.name: c for c in candidates}
326
+ results_by_name: dict[str, list[TaskResult]] = {c.agent.name: [] for c in candidates}
327
 
328
  for result in task_results:
329
+ if result.candidate_name in results_by_name:
330
+ results_by_name[result.candidate_name].append(result)
331
 
332
  summaries = []
333
+ for name, results in results_by_name.items():
334
  if not results:
335
  continue
336
 
337
+ candidate = candidate_map[name]
338
+ summary = CandidateSummary(
339
  name=name,
340
+ candidate=candidate,
341
  task_results=results,
342
  avg_score=sum(r.eval_score for r in results) / len(results),
343
  avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
 
350
 
351
  return summaries
352
 
353
+ def _compute_pareto(self, summaries: list[CandidateSummary]) -> list[str]:
354
  """Compute Pareto frontier (maximize score, minimize tokens)."""
 
355
  pareto_names = compute_pareto_frontier(summaries)
356
 
 
357
  for summary in summaries:
358
  if summary.name in pareto_names:
359
  summary.is_pareto_optimal = True
360
  summary.pareto_rank = 0
361
  else:
362
  summary.is_pareto_optimal = False
363
+ summary.pareto_rank = 1
364
 
365
  return pareto_names
366
 
 
388
 
389
  def _save_config(
390
  self,
391
+ candidates: list[Candidate],
392
  tasks: list[Task],
393
  run_dir: Path,
394
  ) -> None:
 
396
  with open(run_dir / "optimization_config.json", "w") as f:
397
  json.dump(
398
  {
399
+ "candidates": [asdict(c) for c in candidates],
400
  "tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
401
  "parallel": self.parallel,
402
  "use_llm_evaluator": self.use_llm_evaluator,
 
415
  "rank_by_score": result.rank_by_score,
416
  "rank_by_tokens": result.rank_by_tokens,
417
  "rank_by_efficiency": result.rank_by_efficiency,
418
+ "exported_agents": {k: str(v) for k, v in result.exported_agents.items()},
419
  "summaries": [s.to_dict() for s in result.summaries],
420
  }
421
 
 
428
  print(" OPTIMIZATION RESULTS")
429
  print("=" * 70)
430
 
431
+ print(f"\n{'Candidate':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
 
432
  print("-" * 65)
433
 
434
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
 
442
  print(f"Pareto frontier: {result.pareto_frontier}")
443
  print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
444
  print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
445
+ print("\nExported agents:")
446
+ for name, path in result.exported_agents.items():
447
  print(f" {name}: {path}")
448
  print(f"\nResults saved to: {result.output_dir}")
449
 
450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  def load_tasks_from_jsonl(path: Path) -> list[Task]:
452
  """Load tasks from a JSONL file.
453
 
 
 
 
 
 
 
 
454
  Args:
455
  path: Path to JSONL file
456
 
457
  Returns:
458
  List of Task objects
459
  """
460
+ return _load_tasks_impl(path)
 
 
src/flow/experiments/types.py CHANGED
@@ -109,7 +109,7 @@ class EvalResult:
109
  _DATA_DIR = Path(__file__).parent / "data" / "tasks"
110
 
111
 
112
- def _load_tasks_from_jsonl(path: Path) -> list[Task]:
113
  """Load tasks from a JSONL file.
114
 
115
  Each line should be a JSON object with:
@@ -186,4 +186,4 @@ def get_task_suite(suite_name: str) -> list[Task]:
186
  if not path.exists():
187
  available = ", ".join(get_available_suites())
188
  raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}")
189
- return _load_tasks_from_jsonl(path)
 
109
  _DATA_DIR = Path(__file__).parent / "data" / "tasks"
110
 
111
 
112
+ def load_tasks_from_jsonl(path: Path) -> list[Task]:
113
  """Load tasks from a JSONL file.
114
 
115
  Each line should be a JSON object with:
 
186
  if not path.exists():
187
  available = ", ".join(get_available_suites())
188
  raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}")
189
+ return load_tasks_from_jsonl(path)
src/flow/harness/maf/agent.py CHANGED
@@ -9,9 +9,10 @@ from collections.abc import Callable, Coroutine, Sequence
9
  from pathlib import Path
10
  from typing import TYPE_CHECKING, Any
11
 
 
12
  from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
13
- from flow.prompts import FLOW_AGENT_INSTRUCTIONS
14
- from flow.tools import create_all_tools
15
 
16
  if TYPE_CHECKING:
17
  from agent_framework import ChatAgent
@@ -37,10 +38,7 @@ def create_agent(
37
  workspace: Path | None = None,
38
  memory_path: Path | None = None,
39
  # Tool configuration
40
- tools: Sequence[Callable[..., Coroutine[Any, Any, str]]] | None = None,
41
- enable_memory_tool: bool = True,
42
- enable_sub_agent: bool = False,
43
- bash_timeout: int = 120,
44
  # Context engineering
45
  enable_compaction: bool = True,
46
  compaction_head_size: int = 10,
@@ -52,8 +50,6 @@ def create_agent(
52
  - Azure OpenAI as the backend
53
  - Flow's standard tools (coding, execution, memory)
54
  - Optional message compaction for long conversations
55
- - Optional agent-managed memory tool
56
- - Optional sub-agent for isolated research
57
 
58
  Args:
59
  endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
@@ -64,10 +60,11 @@ def create_agent(
64
  instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
65
  workspace: Directory for file operations. Defaults to ~/.flow/workspace.
66
  memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
67
- tools: Custom tools to use. If None, creates standard Flow tools.
68
- enable_memory_tool: Whether to include the memory tool (default: True).
69
- enable_sub_agent: Whether to include the sub-agent tool (default: False).
70
- bash_timeout: Timeout for bash commands in seconds.
 
71
  enable_compaction: Whether to enable head+tail message compaction.
72
  compaction_head_size: Number of initial messages to keep.
73
  compaction_tail_size: Number of recent messages to keep.
@@ -81,9 +78,12 @@ def create_agent(
81
 
82
  Example:
83
  >>> from flow.harness.maf import create_agent
84
- >>> agent = create_agent()
85
- >>> thread = agent.get_new_thread()
86
- >>> response = await agent.run("Create a hello world script", thread=thread)
 
 
 
87
  """
88
  try:
89
  from agent_framework import ChatAgent, ai_function
@@ -123,19 +123,18 @@ def create_agent(
123
  workspace.mkdir(parents=True, exist_ok=True)
124
  memory_path.mkdir(parents=True, exist_ok=True)
125
 
126
- # Create or use provided tools
127
- if tools is None:
128
- tools = create_all_tools(
129
- workspace=workspace,
130
- memory_path=memory_path,
131
- bash_timeout=bash_timeout,
132
- enable_memory_tool=enable_memory_tool,
133
- enable_sub_agent=enable_sub_agent,
134
- )
135
 
136
  # Wrap tools with ai_function decorator for Agent Framework
137
  converted_tools = []
138
- for tool_func in tools:
139
  tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
140
  tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
141
  wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
@@ -163,11 +162,22 @@ def create_agent(
163
  f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
164
  )
165
 
 
 
 
 
 
 
 
 
 
166
  # Create the agent
167
  agent = ChatAgent(
168
  name=name,
169
  description="Autonomous coding agent",
170
- instructions=instructions or FLOW_AGENT_INSTRUCTIONS,
 
 
171
  chat_client=client,
172
  tools=converted_tools,
173
  chat_message_store_factory=message_store_factory,
 
9
  from pathlib import Path
10
  from typing import TYPE_CHECKING, Any
11
 
12
+ from flow.experiments.models import TOOL_PRESETS, resolve_tools
13
  from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
14
+ from flow.harness.maf.tools import build_tools
15
+ from flow.prompts import build_instructions
16
 
17
  if TYPE_CHECKING:
18
  from agent_framework import ChatAgent
 
38
  workspace: Path | None = None,
39
  memory_path: Path | None = None,
40
  # Tool configuration
41
+ tools: str | list[str] | dict[str, dict[str, Any]] | Sequence[Callable[..., Coroutine[Any, Any, str]]] = "standard",
 
 
 
42
  # Context engineering
43
  enable_compaction: bool = True,
44
  compaction_head_size: int = 10,
 
50
  - Azure OpenAI as the backend
51
  - Flow's standard tools (coding, execution, memory)
52
  - Optional message compaction for long conversations
 
 
53
 
54
  Args:
55
  endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
 
60
  instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
61
  workspace: Directory for file operations. Defaults to ~/.flow/workspace.
62
  memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
63
+ tools: Tool configuration - can be:
64
+ - str: Preset name ("standard", "minimal", "full", "readonly")
65
+ - list[str]: List of tool names
66
+ - dict[str, dict]: Full specification with per-tool configs
67
+ - Sequence[Callable]: Pre-built tool functions (advanced)
68
  enable_compaction: Whether to enable head+tail message compaction.
69
  compaction_head_size: Number of initial messages to keep.
70
  compaction_tail_size: Number of recent messages to keep.
 
78
 
79
  Example:
80
  >>> from flow.harness.maf import create_agent
81
+ >>> # Using preset
82
+ >>> agent = create_agent(tools="standard")
83
+ >>> # Using explicit list
84
+ >>> agent = create_agent(tools=["read_file", "write_file", "bash_execute"])
85
+ >>> # Using full config
86
+ >>> agent = create_agent(tools={"bash_execute": {"timeout": 60}, "memory": {}})
87
  """
88
  try:
89
  from agent_framework import ChatAgent, ai_function
 
123
  workspace.mkdir(parents=True, exist_ok=True)
124
  memory_path.mkdir(parents=True, exist_ok=True)
125
 
126
+ # Create tools from specification or use provided functions
127
+ if isinstance(tools, (str, list, dict)):
128
+ # Resolve to dict form and build tools
129
+ tools_spec = resolve_tools(tools)
130
+ tool_functions = build_tools(tools_spec, workspace, memory_path)
131
+ else:
132
+ # Already a sequence of callable tools
133
+ tool_functions = tools
 
134
 
135
  # Wrap tools with ai_function decorator for Agent Framework
136
  converted_tools = []
137
+ for tool_func in tool_functions:
138
  tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
139
  tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
140
  wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
 
162
  f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
163
  )
164
 
165
+ # Determine if memory is enabled for instructions
166
+ enable_memory = False
167
+ if isinstance(tools, str):
168
+ enable_memory = "memory" in TOOL_PRESETS.get(tools, {})
169
+ elif isinstance(tools, list):
170
+ enable_memory = "memory" in tools
171
+ elif isinstance(tools, dict):
172
+ enable_memory = "memory" in tools
173
+
174
  # Create the agent
175
  agent = ChatAgent(
176
  name=name,
177
  description="Autonomous coding agent",
178
+ instructions=instructions or build_instructions(
179
+ enable_memory=enable_memory,
180
+ ),
181
  chat_client=client,
182
  tools=converted_tools,
183
  chat_message_store_factory=message_store_factory,
src/flow/harness/maf/tools/__init__.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MAF-specific tools for the Flow agent.
2
+
3
+ This module provides tools that work with the Microsoft Agent Framework harness.
4
+ Tools are created based on a specification dict that maps tool names to their configs.
5
+
6
+ Available tools:
7
+ - read_file: Read file contents
8
+ - write_file: Write/edit file content
9
+ - list_directory: List directory contents
10
+ - grep_search: Search for text patterns
11
+ - bash_execute: Execute bash commands (config: timeout)
12
+ - check_processes: Manage background processes
13
+ - python_repl: Execute Python code
14
+ - think: Explicit reasoning tool
15
+ - task_done: Task completion marker
16
+ - memory: Persistent memory storage
17
+ - sub_agent: Isolated research sub-agent (config: model)
18
+ """
19
+
20
+ from collections.abc import Callable, Coroutine, Sequence
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ from flow.harness.maf.tools.coding import (
25
+ create_grep_search_tool,
26
+ create_list_directory_tool,
27
+ create_read_file_tool,
28
+ create_write_file_tool,
29
+ )
30
+ from flow.harness.maf.tools.core import task_done, think
31
+ from flow.harness.maf.tools.execution import (
32
+ create_bash_execute_tool,
33
+ create_check_processes_tool,
34
+ create_python_repl_tool,
35
+ )
36
+ from flow.harness.maf.tools.memory import create_memory_tool
37
+ from flow.harness.maf.tools.sub_agent import create_sub_agent_tool
38
+
39
+ __all__ = [
40
+ "build_tools",
41
+ "create_bash_execute_tool",
42
+ "create_check_processes_tool",
43
+ "create_grep_search_tool",
44
+ "create_list_directory_tool",
45
+ "create_memory_tool",
46
+ "create_python_repl_tool",
47
+ "create_read_file_tool",
48
+ "create_sub_agent_tool",
49
+ "create_write_file_tool",
50
+ "task_done",
51
+ "think",
52
+ ]
53
+
54
+
55
+ # Registry of tool factories that don't require config
56
+ # Maps tool name -> factory function(workspace, memory_path) -> tool
57
+ _SIMPLE_TOOL_FACTORIES: dict[str, Callable[..., Any]] = {}
58
+
59
+ # Registry of tools that are standalone (no factory needed)
60
+ _STANDALONE_TOOLS: dict[str, Callable[..., Coroutine[Any, Any, str]]] = {
61
+ "think": think,
62
+ "task_done": task_done,
63
+ }
64
+
65
+
66
+ def build_tools(
67
+ tools_spec: dict[str, dict[str, Any]],
68
+ workspace: Path,
69
+ memory_path: Path,
70
+ ) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
71
+ """Build tool functions from a specification dict.
72
+
73
+ This is the main entry point for creating tools based on a resolved
74
+ tool specification (from resolve_tools()).
75
+
76
+ Args:
77
+ tools_spec: Dict mapping tool names to their config dicts.
78
+ e.g., {"bash_execute": {"timeout": 60}, "read_file": {}}
79
+ workspace: Root directory for file operations
80
+ memory_path: Directory for persistent memory
81
+
82
+ Returns:
83
+ List of tool functions ready to use with MAF
84
+
85
+ Example:
86
+ >>> from flow.experiments.models import resolve_tools
87
+ >>> tools_spec = resolve_tools("standard")
88
+ >>> tools = build_tools(tools_spec, workspace, memory_path)
89
+ """
90
+ workspace = Path(workspace).resolve()
91
+ memory_path = Path(memory_path).resolve()
92
+
93
+ tools: list[Callable[..., Coroutine[Any, Any, str]]] = []
94
+
95
+ for tool_name, config in tools_spec.items():
96
+ tool = _create_tool(tool_name, config, workspace, memory_path)
97
+ if tool is not None:
98
+ tools.append(tool)
99
+
100
+ return tools
101
+
102
+
103
+ def _create_tool(
104
+ name: str,
105
+ config: dict[str, Any],
106
+ workspace: Path,
107
+ memory_path: Path,
108
+ ) -> Callable[..., Coroutine[Any, Any, str]] | None:
109
+ """Create a single tool by name with the given config.
110
+
111
+ Args:
112
+ name: Tool name (e.g., "read_file", "bash_execute")
113
+ config: Tool-specific configuration dict
114
+ workspace: Root directory for file operations
115
+ memory_path: Directory for persistent memory
116
+
117
+ Returns:
118
+ Tool function or None if unknown tool name
119
+ """
120
+ # Standalone tools (no config needed)
121
+ if name in _STANDALONE_TOOLS:
122
+ return _STANDALONE_TOOLS[name]
123
+
124
+ # Coding tools
125
+ if name == "read_file":
126
+ return create_read_file_tool(workspace)
127
+ if name == "write_file":
128
+ return create_write_file_tool(workspace)
129
+ if name == "list_directory":
130
+ return create_list_directory_tool(workspace)
131
+ if name == "grep_search":
132
+ return create_grep_search_tool(workspace)
133
+
134
+ # Execution tools
135
+ if name == "bash_execute":
136
+ timeout = config.get("timeout", 120)
137
+ return create_bash_execute_tool(workspace, memory_path, timeout)
138
+ if name == "check_processes":
139
+ return create_check_processes_tool(workspace, memory_path)
140
+ if name == "python_repl":
141
+ return create_python_repl_tool(workspace)
142
+
143
+ # Memory tool
144
+ if name == "memory":
145
+ return create_memory_tool(memory_path)
146
+
147
+ # Sub-agent tool
148
+ if name == "sub_agent":
149
+ model = config.get("model", "gpt-4o-mini")
150
+ return create_sub_agent_tool(workspace, model=model)
151
+
152
+ # Unknown tool - log warning and skip
153
+ import logging
154
+
155
+ logger = logging.getLogger(__name__)
156
+ logger.warning(f"Unknown tool name: {name}. Skipping.")
157
+ return None
src/flow/{tools → harness/maf/tools}/coding.py RENAMED
File without changes
src/flow/{tools → harness/maf/tools}/core.py RENAMED
File without changes
src/flow/{tools → harness/maf/tools}/execution.py RENAMED
File without changes
src/flow/{tools → harness/maf/tools}/memory.py RENAMED
File without changes
src/flow/{tools → harness/maf/tools}/sub_agent.py RENAMED
@@ -100,12 +100,20 @@ def create_sub_agent_tool(
100
 
101
  # Create basic tools for the sub-agent
102
  # Keep it minimal - just what's needed for research
103
- from flow.tools.coding import create_coding_tools
104
- from flow.tools.core import create_core_tools
105
-
106
- sub_tools: list[Callable[..., Any]] = []
107
- sub_tools.extend(create_coding_tools(workspace))
108
- sub_tools.extend(create_core_tools())
 
 
 
 
 
 
 
 
109
 
110
  # Convert tools to agent_framework format
111
  from agent_framework import ai_function
 
100
 
101
  # Create basic tools for the sub-agent
102
  # Keep it minimal - just what's needed for research
103
+ from flow.harness.maf.tools.coding import (
104
+ create_grep_search_tool,
105
+ create_list_directory_tool,
106
+ create_read_file_tool,
107
+ )
108
+ from flow.harness.maf.tools.core import task_done, think
109
+
110
+ sub_tools: list[Callable[..., Any]] = [
111
+ create_read_file_tool(workspace),
112
+ create_list_directory_tool(workspace),
113
+ create_grep_search_tool(workspace),
114
+ think,
115
+ task_done,
116
+ ]
117
 
118
  # Convert tools to agent_framework format
119
  from agent_framework import ai_function
src/flow/prompts.py CHANGED
@@ -1,9 +1,14 @@
1
  """System prompts for the Flow agent.
2
 
3
  Defines the structured workflow for software engineering tasks.
 
4
  """
5
 
6
- FLOW_AGENT_INSTRUCTIONS = """
 
 
 
 
7
  You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
8
 
9
  ## CORE PRINCIPLE: BE AUTONOMOUS
@@ -22,7 +27,9 @@ When asked to solve a task:
22
  **Example - GOOD (autonomous):**
23
  > *writes code* → *executes code* → *sees output* → *fixes any errors*
24
  > → "Done! The script ran successfully and output X."
 
25
 
 
26
  ---
27
 
28
  ## YOUR CAPABILITIES
@@ -41,35 +48,23 @@ When asked to solve a task:
41
  - `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
42
  - `web_fetch`: Fetch and read content from URLs
43
 
44
- **Memory Tools:**
45
- - `memory`: Persistent storage that survives across conversations
46
- - view: See directory or file contents
47
- - create: Create new files
48
- - str_replace: Edit existing files
49
- - append: Add to files
50
- - search: Find text across memory
51
- - delete: Remove files
52
-
53
  **Thinking Tools:**
54
  - `think`: Pause to reason through complex problems
55
  - `task_done`: Report when task is complete or blocked
 
56
 
57
- **Skills Tool (if available):**
58
- - `skills`: Discover and load domain-specific expertise
59
- - `skills(action='list')`: See available skills with descriptions
60
- - `skills(action='load', name='skill-name')`: Load full skill content
61
-
62
  ---
63
 
64
  ## WORKFLOW
65
 
66
  ### 1. UNDERSTAND
67
  - Read the user's request carefully
68
- - **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
69
  - Use `list_directory` to understand the workspace structure
70
  - Use `grep_search` to find relevant existing code
71
- - Check memory for relevant patterns: `memory(command="view", path="/memory")`
72
 
 
73
  ### 2. PLAN
74
  - Use `think` tool to plan your approach for complex tasks
75
  - Break down into small, testable steps
@@ -120,7 +115,9 @@ bash_execute("cd project && npm run build") # Production build must succeed
120
  - Clean up any background processes you started
121
  - Call `task_done` with status and summary
122
  - Include files created and suggested next steps
 
123
 
 
124
  ---
125
 
126
  ## WORKSPACE
@@ -139,50 +136,9 @@ Your workspace is at `~/.flow/workspace/`
139
  - Each `bash_execute` runs from workspace root in a fresh shell
140
  - Use `cd project && command` for commands in subdirectories
141
  - Multiple commands: `cd project && cmd1 && cmd2`
 
142
 
143
- ---
144
-
145
- ## MEMORY
146
-
147
- Your memory persists at `~/.flow/memory/`
148
-
149
- **Recommended structure:**
150
- - `/memory/patterns/` - Reusable solutions and code patterns
151
- - `/memory/projects/` - Per-project context and notes
152
- - `/memory/decisions/` - Why you made certain choices
153
-
154
- **Best practices:**
155
- When storing information, include context:
156
- - **Date**: When was this created/learned?
157
- - **Project**: What project did this come from?
158
- - **Context**: Why was this approach chosen?
159
-
160
- **Example pattern file** (`/memory/patterns/fastapi_cors.md`):
161
- ```markdown
162
- # FastAPI CORS Setup
163
- Created: 2025-01-15
164
- Source: sleep_tracker project
165
-
166
- ## Pattern
167
- from fastapi.middleware.cors import CORSMiddleware
168
- app.add_middleware(
169
- CORSMiddleware,
170
- allow_origins=["*"],
171
- allow_methods=["*"],
172
- allow_headers=["*"],
173
- )
174
-
175
- ## When to use
176
- - Full-stack apps with separate frontend/backend
177
- - Frontend on different port than backend
178
-
179
- ## Notes
180
- - Must add before routes
181
- - Restrict origins in production
182
- ```
183
-
184
- **Check memory first** - you may have solved similar problems before!
185
-
186
  ---
187
 
188
  ## CLI TOOLS
@@ -210,7 +166,9 @@ npm install @shadcn/ui
210
  npx shadcn@latest init --defaults --yes
211
  npx shadcn@latest add button card --yes
212
  ```
 
213
 
 
214
  ---
215
 
216
  ## FULL-STACK APPS
@@ -235,7 +193,9 @@ app.add_middleware(
235
  cd backend && python -c "from main import app; print('Backend OK')"
236
  cd frontend && npm run build && echo "Frontend OK"
237
  ```
 
238
 
 
239
  ---
240
 
241
  ## BACKGROUND PROCESSES
@@ -257,9 +217,6 @@ check_processes(action="list")
257
  check_processes(action="kill", pid=12345)
258
  ```
259
 
260
- **Process registry** is at `/memory/processes.md` - view it with:
261
- `memory(command='view', path='/memory/processes.md')`
262
-
263
  **IMPORTANT:**
264
  - NEVER start servers without `background=True` - they will timeout after 120s
265
  - ALWAYS clean up background processes when done testing
@@ -276,48 +233,19 @@ check_processes(action="cleanup") # Kill all when done
276
  # Bad - will timeout!
277
  bash_execute("uvicorn main:app --port 8000") # Blocks forever
278
  ```
 
279
 
 
280
  ---
281
 
282
  ## ERROR HANDLING
283
 
284
  - If a command fails, analyze the error and try alternatives
285
- - Log failures and solutions to memory for future reference
286
  - Don't give up after first failure - iterate
287
  - If truly blocked, call `task_done` with status="incomplete" and explain why
 
288
 
289
- ---
290
-
291
- ## SKILLS
292
-
293
- **If the `skills` tool is available**, use it to access domain-specific expertise:
294
-
295
- ```python
296
- # At the start of complex tasks, discover what expertise is available
297
- skills(action='list')
298
-
299
- # Output shows available skills with descriptions:
300
- # - fastapi-patterns: Build REST APIs with FastAPI...
301
- # - react-components: Build React components with hooks...
302
- # - testing-strategies: Write comprehensive tests...
303
-
304
- # Load relevant skills before implementation
305
- skills(action='load', name='fastapi-patterns')
306
- ```
307
-
308
- **Skills provide:**
309
- - Domain-specific patterns and best practices
310
- - Code examples and templates
311
- - Common pitfalls to avoid
312
-
313
- **When to load skills:**
314
- - Before starting a new project type (API, frontend, CLI)
315
- - When working with unfamiliar frameworks
316
- - For complex tasks requiring specialized knowledge
317
-
318
- **Skills location:** `~/.flow/skills/`
319
- Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
320
-
321
  ---
322
 
323
  ## COMPOSING TOOLS FOR COMPLEX TASKS
@@ -358,7 +286,9 @@ Each skill is a folder with a `SKILL.md` file following the Anthropic Skills sta
358
  4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
359
  5. Analyze error → Fix code → Test again → Iterate until fixed
360
  ```
 
361
 
 
362
  ---
363
 
364
  ## RESEARCH WORKFLOW
@@ -388,7 +318,9 @@ async def fetch_data(url):
388
  # 4. Test it
389
  python_repl("import httpx; print(httpx.__version__)")
390
  ```
 
391
 
 
392
  ---
393
 
394
  ## REMEMBER
@@ -401,7 +333,212 @@ python_repl("import httpx; print(httpx.__version__)")
401
  6. **TEST EVERYTHING** - Never assume code works
402
  7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
403
  8. **CLEAN UP** - Kill background processes when done
404
- 9. **STORE LEARNINGS** - Save patterns to memory for future use
405
 
406
  **Your goal is to deliver RESULTS, not instructions.**
407
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """System prompts for the Flow agent.
2
 
3
  Defines the structured workflow for software engineering tasks.
4
+ Instructions are composed dynamically based on which tools are enabled.
5
  """
6
 
7
+ # =============================================================================
8
+ # Core instructions - always included
9
+ # =============================================================================
10
+
11
+ _CORE_INTRO = """
12
  You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
13
 
14
  ## CORE PRINCIPLE: BE AUTONOMOUS
 
27
  **Example - GOOD (autonomous):**
28
  > *writes code* → *executes code* → *sees output* → *fixes any errors*
29
  > → "Done! The script ran successfully and output X."
30
+ """
31
 
32
+ _CORE_CAPABILITIES = """
33
  ---
34
 
35
  ## YOUR CAPABILITIES
 
48
  - `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
49
  - `web_fetch`: Fetch and read content from URLs
50
 
 
 
 
 
 
 
 
 
 
51
  **Thinking Tools:**
52
  - `think`: Pause to reason through complex problems
53
  - `task_done`: Report when task is complete or blocked
54
+ """
55
 
56
+ _CORE_WORKFLOW_UNDERSTAND = """
 
 
 
 
57
  ---
58
 
59
  ## WORKFLOW
60
 
61
  ### 1. UNDERSTAND
62
  - Read the user's request carefully
 
63
  - Use `list_directory` to understand the workspace structure
64
  - Use `grep_search` to find relevant existing code
65
+ """
66
 
67
+ _CORE_WORKFLOW_PLAN_EXECUTE_VERIFY = """
68
  ### 2. PLAN
69
  - Use `think` tool to plan your approach for complex tasks
70
  - Break down into small, testable steps
 
115
  - Clean up any background processes you started
116
  - Call `task_done` with status and summary
117
  - Include files created and suggested next steps
118
+ """
119
 
120
+ _CORE_WORKSPACE = """
121
  ---
122
 
123
  ## WORKSPACE
 
136
  - Each `bash_execute` runs from workspace root in a fresh shell
137
  - Use `cd project && command` for commands in subdirectories
138
  - Multiple commands: `cd project && cmd1 && cmd2`
139
+ """
140
 
141
+ _CORE_CLI_TOOLS = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  ---
143
 
144
  ## CLI TOOLS
 
166
  npx shadcn@latest init --defaults --yes
167
  npx shadcn@latest add button card --yes
168
  ```
169
+ """
170
 
171
+ _CORE_FULLSTACK = """
172
  ---
173
 
174
  ## FULL-STACK APPS
 
193
  cd backend && python -c "from main import app; print('Backend OK')"
194
  cd frontend && npm run build && echo "Frontend OK"
195
  ```
196
+ """
197
 
198
+ _CORE_BACKGROUND = """
199
  ---
200
 
201
  ## BACKGROUND PROCESSES
 
217
  check_processes(action="kill", pid=12345)
218
  ```
219
 
 
 
 
220
  **IMPORTANT:**
221
  - NEVER start servers without `background=True` - they will timeout after 120s
222
  - ALWAYS clean up background processes when done testing
 
233
  # Bad - will timeout!
234
  bash_execute("uvicorn main:app --port 8000") # Blocks forever
235
  ```
236
+ """
237
 
238
+ _CORE_ERROR_HANDLING = """
239
  ---
240
 
241
  ## ERROR HANDLING
242
 
243
  - If a command fails, analyze the error and try alternatives
 
244
  - Don't give up after first failure - iterate
245
  - If truly blocked, call `task_done` with status="incomplete" and explain why
246
+ """
247
 
248
+ _CORE_EXAMPLES = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  ---
250
 
251
  ## COMPOSING TOOLS FOR COMPLEX TASKS
 
286
  4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
287
  5. Analyze error → Fix code → Test again → Iterate until fixed
288
  ```
289
+ """
290
 
291
+ _CORE_RESEARCH = """
292
  ---
293
 
294
  ## RESEARCH WORKFLOW
 
318
  # 4. Test it
319
  python_repl("import httpx; print(httpx.__version__)")
320
  ```
321
+ """
322
 
323
+ _CORE_REMEMBER = """
324
  ---
325
 
326
  ## REMEMBER
 
333
  6. **TEST EVERYTHING** - Never assume code works
334
  7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
335
  8. **CLEAN UP** - Kill background processes when done
 
336
 
337
  **Your goal is to deliver RESULTS, not instructions.**
338
  """
339
+
340
+ # =============================================================================
341
+ # Optional sections - included only when corresponding tools are enabled
342
+ # =============================================================================
343
+
344
+ _MEMORY_CAPABILITIES = """
345
+ **Memory Tools:**
346
+ - `memory`: Persistent storage that survives across conversations
347
+ - view: See directory or file contents
348
+ - create: Create new files
349
+ - str_replace: Edit existing files
350
+ - append: Add to files
351
+ - search: Find text across memory
352
+ - delete: Remove files
353
+ """
354
+
355
+ _MEMORY_WORKFLOW_UNDERSTAND = """- Check memory for relevant patterns: `memory(command="view", path="/memory")`
356
+ """
357
+
358
+ _MEMORY_SECTION = """
359
+ ---
360
+
361
+ ## MEMORY
362
+
363
+ Your memory persists at `~/.flow/memory/`
364
+
365
+ **Recommended structure:**
366
+ - `/memory/patterns/` - Reusable solutions and code patterns
367
+ - `/memory/projects/` - Per-project context and notes
368
+ - `/memory/decisions/` - Why you made certain choices
369
+
370
+ **Best practices:**
371
+ When storing information, include context:
372
+ - **Date**: When was this created/learned?
373
+ - **Project**: What project did this come from?
374
+ - **Context**: Why was this approach chosen?
375
+
376
+ **Example pattern file** (`/memory/patterns/fastapi_cors.md`):
377
+ ```markdown
378
+ # FastAPI CORS Setup
379
+ Created: 2025-01-15
380
+ Source: sleep_tracker project
381
+
382
+ ## Pattern
383
+ from fastapi.middleware.cors import CORSMiddleware
384
+ app.add_middleware(
385
+ CORSMiddleware,
386
+ allow_origins=["*"],
387
+ allow_methods=["*"],
388
+ allow_headers=["*"],
389
+ )
390
+
391
+ ## When to use
392
+ - Full-stack apps with separate frontend/backend
393
+ - Frontend on different port than backend
394
+
395
+ ## Notes
396
+ - Must add before routes
397
+ - Restrict origins in production
398
+ ```
399
+
400
+ **Check memory first** - you may have solved similar problems before!
401
+ """
402
+
403
+ _MEMORY_ERROR_HANDLING = """- Log failures and solutions to memory for future reference
404
+ """
405
+
406
+ _MEMORY_REMEMBER = """9. **STORE LEARNINGS** - Save patterns to memory for future use
407
+ """
408
+
409
+ _MEMORY_BACKGROUND_PROCESS_REGISTRY = """
410
+ **Process registry** is at `/memory/processes.md` - view it with:
411
+ `memory(command='view', path='/memory/processes.md')`
412
+ """
413
+
414
+ _SKILLS_CAPABILITIES = """
415
+ **Skills Tool (if available):**
416
+ - `skills`: Discover and load domain-specific expertise
417
+ - `skills(action='list')`: See available skills with descriptions
418
+ - `skills(action='load', name='skill-name')`: Load full skill content
419
+ """
420
+
421
+ _SKILLS_WORKFLOW_UNDERSTAND = """- **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
422
+ """
423
+
424
+ _SKILLS_SECTION = """
425
+ ---
426
+
427
+ ## SKILLS
428
+
429
+ **If the `skills` tool is available**, use it to access domain-specific expertise:
430
+
431
+ ```python
432
+ # At the start of complex tasks, discover what expertise is available
433
+ skills(action='list')
434
+
435
+ # Output shows available skills with descriptions:
436
+ # - fastapi-patterns: Build REST APIs with FastAPI...
437
+ # - react-components: Build React components with hooks...
438
+ # - testing-strategies: Write comprehensive tests...
439
+
440
+ # Load relevant skills before implementation
441
+ skills(action='load', name='fastapi-patterns')
442
+ ```
443
+
444
+ **Skills provide:**
445
+ - Domain-specific patterns and best practices
446
+ - Code examples and templates
447
+ - Common pitfalls to avoid
448
+
449
+ **When to load skills:**
450
+ - Before starting a new project type (API, frontend, CLI)
451
+ - When working with unfamiliar frameworks
452
+ - For complex tasks requiring specialized knowledge
453
+
454
+ **Skills location:** `~/.flow/skills/`
455
+ Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
456
+ """
457
+
458
+
459
+ # =============================================================================
460
+ # Instruction builder
461
+ # =============================================================================
462
+
463
+
464
+ def build_instructions(
465
+ *,
466
+ enable_memory: bool = True,
467
+ enable_skills: bool = False,
468
+ ) -> str:
469
+ """Build agent instructions dynamically based on enabled tools.
470
+
471
+ Composes the instruction prompt from core sections plus optional sections
472
+ for memory and skills, so the agent only sees documentation for tools
473
+ it actually has.
474
+
475
+ Args:
476
+ enable_memory: Include memory tool documentation.
477
+ enable_skills: Include skills tool documentation.
478
+
479
+ Returns:
480
+ Complete instruction string.
481
+ """
482
+ # -- Capabilities section --
483
+ capabilities = _CORE_CAPABILITIES
484
+ if enable_memory:
485
+ capabilities += "\n" + _MEMORY_CAPABILITIES
486
+ if enable_skills:
487
+ capabilities += "\n" + _SKILLS_CAPABILITIES
488
+
489
+ # -- Workflow > Understand section --
490
+ understand = _CORE_WORKFLOW_UNDERSTAND
491
+ if enable_skills:
492
+ understand += _SKILLS_WORKFLOW_UNDERSTAND
493
+ if enable_memory:
494
+ understand += _MEMORY_WORKFLOW_UNDERSTAND
495
+
496
+ # -- Error handling section --
497
+ error_handling = _CORE_ERROR_HANDLING
498
+ if enable_memory:
499
+ error_handling += _MEMORY_ERROR_HANDLING
500
+
501
+ # -- Background processes section --
502
+ background = _CORE_BACKGROUND
503
+ if enable_memory:
504
+ background += _MEMORY_BACKGROUND_PROCESS_REGISTRY
505
+
506
+ # -- Remember section --
507
+ remember = _CORE_REMEMBER
508
+ if enable_memory:
509
+ remember += _MEMORY_REMEMBER
510
+
511
+ # -- Assemble --
512
+ sections = [
513
+ _CORE_INTRO,
514
+ capabilities,
515
+ understand,
516
+ _CORE_WORKFLOW_PLAN_EXECUTE_VERIFY,
517
+ _CORE_WORKSPACE,
518
+ ]
519
+
520
+ if enable_memory:
521
+ sections.append(_MEMORY_SECTION)
522
+
523
+ sections.extend([
524
+ _CORE_CLI_TOOLS,
525
+ _CORE_FULLSTACK,
526
+ background,
527
+ error_handling,
528
+ ])
529
+
530
+ if enable_skills:
531
+ sections.append(_SKILLS_SECTION)
532
+
533
+ sections.extend([
534
+ _CORE_EXAMPLES,
535
+ _CORE_RESEARCH,
536
+ remember,
537
+ ])
538
+
539
+ return "\n".join(sections)
540
+
541
+
542
+ # Legacy constant for backwards compatibility.
543
+ # Equivalent to build_instructions(enable_memory=True, enable_skills=True).
544
+ FLOW_AGENT_INSTRUCTIONS = build_instructions(enable_memory=True, enable_skills=True)
src/flow/tools/__init__.py DELETED
@@ -1,172 +0,0 @@
1
- """Flow agent tools.
2
-
3
- Provides coding, execution, memory, and core tools for software engineering tasks.
4
- Tools are harness-agnostic - they return plain data that harnesses adapt.
5
- """
6
-
7
- import inspect
8
- from collections.abc import Callable, Sequence
9
- from functools import wraps
10
- from pathlib import Path
11
- from typing import Any, get_type_hints
12
-
13
- from flow.tools.coding import create_coding_tools
14
- from flow.tools.core import create_core_tools
15
- from flow.tools.execution import create_execution_tools
16
- from flow.tools.memory import create_memory_tool
17
- from flow.tools.sub_agent import create_sub_agent_tool
18
-
19
- __all__ = [
20
- "create_all_tools",
21
- "create_coding_tools",
22
- "create_core_tools",
23
- "create_execution_tools",
24
- "create_memory_tool",
25
- "create_sub_agent_tool",
26
- "get_tool_schema",
27
- "tool",
28
- ]
29
-
30
-
31
- def tool(
32
- name: str | None = None,
33
- description: str | None = None,
34
- ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
35
- """Decorator to mark a function as an agent tool.
36
-
37
- This decorator adds metadata to functions that allows harnesses
38
- to discover and use them as agent tools.
39
-
40
- Args:
41
- name: Tool name (defaults to function name)
42
- description: Tool description (defaults to docstring)
43
-
44
- Returns:
45
- Decorated function with tool metadata
46
-
47
- Example:
48
- @tool(name="read_file", description="Read file contents")
49
- async def read_file(path: str) -> str:
50
- ...
51
- """
52
-
53
- def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
54
- @wraps(func)
55
- def wrapper(*args: Any, **kwargs: Any) -> Any:
56
- return func(*args, **kwargs)
57
-
58
- # Store tool metadata
59
- wrapper._tool_name = name or func.__name__ # type: ignore[attr-defined]
60
- wrapper._tool_description = description or func.__doc__ or "" # type: ignore[attr-defined]
61
- wrapper._is_tool = True # type: ignore[attr-defined]
62
-
63
- return wrapper
64
-
65
- return decorator
66
-
67
-
68
- def get_tool_schema(func: Callable[..., Any]) -> dict[str, Any]:
69
- """Extract JSON schema from a tool function.
70
-
71
- Uses type hints and Annotated metadata to build the schema.
72
-
73
- Args:
74
- func: Tool function to extract schema from
75
-
76
- Returns:
77
- JSON schema dict for the tool's parameters
78
- """
79
- hints = get_type_hints(func, include_extras=True)
80
- sig = inspect.signature(func)
81
-
82
- properties: dict[str, Any] = {}
83
- required: list[str] = []
84
-
85
- for param_name, param in sig.parameters.items():
86
- if param_name in ("self", "cls"):
87
- continue
88
-
89
- param_schema: dict[str, Any] = {}
90
- hint = hints.get(param_name, Any)
91
-
92
- # Handle Annotated types
93
- origin = getattr(hint, "__origin__", None)
94
- if origin is not None:
95
- # Check if it's Annotated
96
- if hasattr(hint, "__metadata__"):
97
- # Extract description from Annotated metadata
98
- for meta in hint.__metadata__:
99
- if isinstance(meta, str):
100
- param_schema["description"] = meta
101
- break
102
- # Get the actual type
103
- hint = hint.__args__[0]
104
- origin = getattr(hint, "__origin__", None)
105
-
106
- # Map Python types to JSON schema types
107
- if hint is str:
108
- param_schema["type"] = "string"
109
- elif hint is int:
110
- param_schema["type"] = "integer"
111
- elif hint is float:
112
- param_schema["type"] = "number"
113
- elif hint is bool:
114
- param_schema["type"] = "boolean"
115
- elif origin is list:
116
- param_schema["type"] = "array"
117
- elif origin is dict:
118
- param_schema["type"] = "object"
119
- else:
120
- param_schema["type"] = "string" # Default fallback
121
-
122
- properties[param_name] = param_schema
123
-
124
- # Check if parameter is required (no default value)
125
- if param.default is inspect.Parameter.empty:
126
- required.append(param_name)
127
-
128
- return {
129
- "type": "object",
130
- "properties": properties,
131
- "required": required,
132
- }
133
-
134
-
135
- def create_all_tools(
136
- workspace: Path,
137
- memory_path: Path,
138
- bash_timeout: int = 120,
139
- *,
140
- enable_memory_tool: bool = True,
141
- enable_sub_agent: bool = False,
142
- sub_agent_model: str = "gpt-4o-mini",
143
- ) -> Sequence[Callable[..., Any]]:
144
- """Create all standard tools for the Flow agent.
145
-
146
- Args:
147
- workspace: Root directory for file operations
148
- memory_path: Directory for persistent memory
149
- bash_timeout: Timeout for bash commands in seconds
150
- enable_memory_tool: Whether to include the memory tool
151
- enable_sub_agent: Whether to include the sub-agent research tool
152
- sub_agent_model: Model to use for sub-agent (default: gpt-4o-mini)
153
-
154
- Returns:
155
- List of all tool functions
156
- """
157
- tools: list[Callable[..., Any]] = []
158
-
159
- # Core tools always included
160
- tools.extend(create_coding_tools(workspace))
161
- tools.extend(create_execution_tools(workspace, memory_path, bash_timeout))
162
- tools.extend(create_core_tools())
163
-
164
- # Optional: Agent-managed memory tool
165
- if enable_memory_tool:
166
- tools.append(create_memory_tool(memory_path))
167
-
168
- # Optional: Sub-agent for isolated research
169
- if enable_sub_agent:
170
- tools.append(create_sub_agent_tool(workspace, model=sub_agent_model))
171
-
172
- return tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/flow/ui/api/configs.py CHANGED
@@ -1,7 +1,6 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
- """Config API routes."""
3
 
4
- from itertools import product
5
  from uuid import UUID
6
 
7
  from fastapi import APIRouter, Depends, HTTPException
@@ -9,32 +8,37 @@ from pydantic import BaseModel
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from sqlmodel import select, desc
11
 
 
 
12
  from ..database import get_session
13
  from ..models.config import AgentConfig
14
- from ..schemas import ConfigCreate, ConfigUpdate, ConfigResponse
15
 
16
  router = APIRouter(prefix="/configs", tags=["configs"])
17
 
18
 
19
- class VariationRequest(BaseModel):
20
- """Request schema for generating config variations."""
21
 
22
  base_name: str = "experiment"
23
 
24
- # Which features to vary (on/off)
25
  vary_compaction: bool = False
26
- vary_memory: bool = False
27
- vary_sub_agent: bool = False
28
-
29
- # Which numeric parameters to vary
30
  vary_compaction_head: bool = False
31
  vary_compaction_tail: bool = False
32
 
33
- # Values to use for numeric variations
 
 
 
34
  compaction_head_values: list[int] = [5, 10, 20]
35
  compaction_tail_values: list[int] = [20, 40, 60]
36
 
37
- # Optional job ID to associate configs with
 
 
 
38
  job_id: str | None = None
39
 
40
 
@@ -46,17 +50,12 @@ def parse_uuid(id_str: str) -> UUID:
46
  raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from e
47
 
48
 
49
- @router.get("", response_model=list[ConfigResponse])
50
  async def list_configs(
51
  include_auto_generated: bool = False,
52
  session: AsyncSession = Depends(get_session),
53
  ) -> list[AgentConfig]:
54
- """List agent configurations.
55
-
56
- Args:
57
- include_auto_generated: If False (default), only show user-created configs.
58
- If True, include auto-generated configs from jobs.
59
- """
60
  query = select(AgentConfig)
61
  if not include_auto_generated:
62
  query = query.where(AgentConfig.is_auto_generated == False) # noqa: E712
@@ -65,9 +64,9 @@ async def list_configs(
65
  return list(result.scalars().all())
66
 
67
 
68
- @router.post("", response_model=ConfigResponse, status_code=201)
69
  async def create_config(
70
- data: ConfigCreate,
71
  session: AsyncSession = Depends(get_session),
72
  ) -> AgentConfig:
73
  """Create a new agent configuration."""
@@ -82,7 +81,7 @@ async def create_config(
82
  return config
83
 
84
 
85
- @router.get("/{config_id}", response_model=ConfigResponse)
86
  async def get_config(
87
  config_id: str,
88
  session: AsyncSession = Depends(get_session),
@@ -96,10 +95,10 @@ async def get_config(
96
  return config
97
 
98
 
99
- @router.put("/{config_id}", response_model=ConfigResponse)
100
  async def update_config(
101
  config_id: str,
102
- data: ConfigUpdate,
103
  session: AsyncSession = Depends(get_session),
104
  ) -> AgentConfig:
105
  """Update an agent configuration."""
@@ -109,25 +108,23 @@ async def update_config(
109
  if not config:
110
  raise HTTPException(status_code=404, detail="Config not found")
111
 
112
- # Update fields that were provided
113
  update_data = data.model_dump(exclude_unset=True)
114
 
115
- # Handle config_json fields separately
116
  config_fields = [
117
- "enable_message_compaction",
118
- "enable_memory_tool",
119
- "enable_sub_agent",
120
- "compaction_head_size",
121
- "compaction_tail_size",
122
- "bash_timeout",
123
  ]
124
 
125
  config_json = dict(config.config_json)
126
- for field in config_fields:
127
- if field in update_data:
128
- config_json[field] = update_data.pop(field)
 
 
 
129
 
130
- # Update top-level fields
131
  for key, value in update_data.items():
132
  setattr(config, key, value)
133
 
@@ -157,57 +154,44 @@ async def delete_config(
157
  await session.commit()
158
 
159
 
160
- @router.post("/generate-variations", response_model=list[ConfigResponse], status_code=201)
161
- async def generate_variations(
162
- data: VariationRequest,
163
  session: AsyncSession = Depends(get_session),
164
  ) -> list[AgentConfig]:
165
- """Generate config variations for ablation testing.
166
 
167
- This creates multiple configs by combining variation options.
168
- Each variation is named based on the features enabled.
169
  """
170
-
171
- # Build variation dimensions
172
- dimensions: list[list[tuple[str, str, bool | int]]] = []
173
- dimension_names: list[str] = []
174
 
175
  if data.vary_compaction:
176
- dimensions.append([
177
- ("compaction", "enable_message_compaction", True),
178
- ("no_compact", "enable_message_compaction", False),
179
- ])
180
- dimension_names.append("compaction")
181
-
182
- if data.vary_memory:
183
- dimensions.append([
184
- ("memory", "enable_memory_tool", True),
185
- ("no_mem", "enable_memory_tool", False),
186
- ])
187
- dimension_names.append("memory")
188
-
189
- if data.vary_sub_agent:
190
- dimensions.append([
191
- ("subagent", "enable_sub_agent", True),
192
- ("no_sub", "enable_sub_agent", False),
193
- ])
194
- dimension_names.append("sub_agent")
195
 
196
  if data.vary_compaction_head:
197
- dimensions.append([
198
- (f"head{size}", "compaction_head_size", size)
199
- for size in data.compaction_head_values
200
- ])
201
- dimension_names.append("head_size")
202
 
203
  if data.vary_compaction_tail:
204
- dimensions.append([
205
- (f"tail{size}", "compaction_tail_size", size)
206
- for size in data.compaction_tail_values
207
- ])
208
- dimension_names.append("tail_size")
 
 
 
 
 
209
 
210
- # Parse job_id if provided
211
  job_uuid = None
212
  if data.job_id:
213
  try:
@@ -215,19 +199,16 @@ async def generate_variations(
215
  except ValueError:
216
  pass
217
 
218
- # If no variations selected, create a single baseline config
219
- if not dimensions:
 
220
  config = AgentConfig(
221
  name=f"{data.base_name}_baseline",
222
- description=f"Baseline config from {data.base_name}",
223
  config_json={
224
  "name": f"{data.base_name}_baseline",
225
- "enable_message_compaction": True,
226
- "enable_memory_tool": True,
227
- "enable_sub_agent": False,
228
- "compaction_head_size": 10,
229
- "compaction_tail_size": 40,
230
- "bash_timeout": 120,
231
  },
232
  is_auto_generated=True,
233
  job_id=job_uuid,
@@ -237,41 +218,30 @@ async def generate_variations(
237
  await session.refresh(config)
238
  return [config]
239
 
240
- # Generate all combinations
 
 
241
  configs = []
242
- for combo in product(*dimensions):
243
- # Build name from variation labels
244
- name_parts = [label for label, _, _ in combo]
245
- config_name = f"{data.base_name}_{'_'.join(name_parts)}"
246
-
247
- # Build config JSON from defaults + variations
248
- config_json = {
249
- "name": config_name,
250
- "enable_message_compaction": True,
251
- "enable_memory_tool": True,
252
- "enable_sub_agent": False,
253
- "compaction_head_size": 10,
254
- "compaction_tail_size": 40,
255
- "bash_timeout": 120,
256
- }
257
-
258
- # Apply variations
259
- for _, key, value in combo:
260
- config_json[key] = value
261
-
262
- # Check if config with this name already exists
263
  existing = await session.execute(
264
- select(AgentConfig).where(AgentConfig.name == config_name).limit(1)
265
  )
266
  existing_config = existing.scalar_one_or_none()
267
 
268
  if existing_config:
269
  configs.append(existing_config)
270
  else:
 
271
  config = AgentConfig(
272
- name=config_name,
273
- description=f"Auto-generated variation: {', '.join(name_parts)}",
274
- config_json=config_json,
 
 
 
 
275
  is_auto_generated=True,
276
  job_id=job_uuid,
277
  )
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
+ """Agent config API routes."""
3
 
 
4
  from uuid import UUID
5
 
6
  from fastapi import APIRouter, Depends, HTTPException
 
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from sqlmodel import select, desc
10
 
11
+ from flow.experiments.models import Agent, CompactionConfig, GridSearchStrategy
12
+
13
  from ..database import get_session
14
  from ..models.config import AgentConfig
15
+ from ..schemas import AgentCreate, AgentUpdate, AgentResponse
16
 
17
  router = APIRouter(prefix="/configs", tags=["configs"])
18
 
19
 
20
+ class CandidateRequest(BaseModel):
21
+ """Request schema for generating candidate agents."""
22
 
23
  base_name: str = "experiment"
24
 
25
+ # Which dimensions to vary
26
  vary_compaction: bool = False
27
+ vary_tools: bool = False
 
 
 
28
  vary_compaction_head: bool = False
29
  vary_compaction_tail: bool = False
30
 
31
+ # Values for tool variations (preset names)
32
+ tool_presets: list[str] = ["standard", "minimal", "full"]
33
+
34
+ # Values for numeric variations
35
  compaction_head_values: list[int] = [5, 10, 20]
36
  compaction_tail_values: list[int] = [20, 40, 60]
37
 
38
+ # Budget limit
39
+ budget: int = 100
40
+
41
+ # Optional job ID to associate candidates with
42
  job_id: str | None = None
43
 
44
 
 
50
  raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from e
51
 
52
 
53
+ @router.get("", response_model=list[AgentResponse])
54
  async def list_configs(
55
  include_auto_generated: bool = False,
56
  session: AsyncSession = Depends(get_session),
57
  ) -> list[AgentConfig]:
58
+ """List agent configurations."""
 
 
 
 
 
59
  query = select(AgentConfig)
60
  if not include_auto_generated:
61
  query = query.where(AgentConfig.is_auto_generated == False) # noqa: E712
 
64
  return list(result.scalars().all())
65
 
66
 
67
+ @router.post("", response_model=AgentResponse, status_code=201)
68
  async def create_config(
69
+ data: AgentCreate,
70
  session: AsyncSession = Depends(get_session),
71
  ) -> AgentConfig:
72
  """Create a new agent configuration."""
 
81
  return config
82
 
83
 
84
+ @router.get("/{config_id}", response_model=AgentResponse)
85
  async def get_config(
86
  config_id: str,
87
  session: AsyncSession = Depends(get_session),
 
95
  return config
96
 
97
 
98
+ @router.put("/{config_id}", response_model=AgentResponse)
99
  async def update_config(
100
  config_id: str,
101
+ data: AgentUpdate,
102
  session: AsyncSession = Depends(get_session),
103
  ) -> AgentConfig:
104
  """Update an agent configuration."""
 
108
  if not config:
109
  raise HTTPException(status_code=404, detail="Config not found")
110
 
 
111
  update_data = data.model_dump(exclude_unset=True)
112
 
 
113
  config_fields = [
114
+ "instructions",
115
+ "model",
116
+ "compaction",
117
+ "tools",
 
 
118
  ]
119
 
120
  config_json = dict(config.config_json)
121
+ for field_name in config_fields:
122
+ if field_name in update_data:
123
+ value = update_data.pop(field_name)
124
+ if field_name == "compaction" and hasattr(value, "model_dump"):
125
+ value = value.model_dump()
126
+ config_json[field_name] = value
127
 
 
128
  for key, value in update_data.items():
129
  setattr(config, key, value)
130
 
 
154
  await session.commit()
155
 
156
 
157
+ @router.post("/generate-candidates", response_model=list[AgentResponse], status_code=201)
158
+ async def generate_candidates(
159
+ data: CandidateRequest,
160
  session: AsyncSession = Depends(get_session),
161
  ) -> list[AgentConfig]:
162
+ """Generate candidate agents for optimization.
163
 
164
+ Uses GridSearchStrategy to generate candidate variants from a base agent.
165
+ Each candidate is stored as an AgentConfig in the database.
166
  """
167
+ variations: dict[str, list] = {}
 
 
 
168
 
169
  if data.vary_compaction:
170
+ variations["compaction"] = [
171
+ CompactionConfig.head_tail(10, 40),
172
+ CompactionConfig.none(),
173
+ ]
174
+
175
+ if data.vary_tools:
176
+ variations["tools"] = data.tool_presets
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  if data.vary_compaction_head:
179
+ variations["compaction"] = [
180
+ CompactionConfig.head_tail(h, 40) for h in data.compaction_head_values
181
+ ]
 
 
182
 
183
  if data.vary_compaction_tail:
184
+ if data.vary_compaction_head:
185
+ variations["compaction"] = [
186
+ CompactionConfig.head_tail(h, t)
187
+ for h in data.compaction_head_values
188
+ for t in data.compaction_tail_values
189
+ ]
190
+ else:
191
+ variations["compaction"] = [
192
+ CompactionConfig.head_tail(10, t) for t in data.compaction_tail_values
193
+ ]
194
 
 
195
  job_uuid = None
196
  if data.job_id:
197
  try:
 
199
  except ValueError:
200
  pass
201
 
202
+ base = Agent(name=data.base_name)
203
+
204
+ if not variations:
205
  config = AgentConfig(
206
  name=f"{data.base_name}_baseline",
207
+ description=f"Baseline agent from {data.base_name}",
208
  config_json={
209
  "name": f"{data.base_name}_baseline",
210
+ "compaction": {"strategy": "head_tail", "params": {"head_size": 10, "tail_size": 40}},
211
+ "tools": "standard",
 
 
 
 
212
  },
213
  is_auto_generated=True,
214
  job_id=job_uuid,
 
218
  await session.refresh(config)
219
  return [config]
220
 
221
+ strategy = GridSearchStrategy(variations)
222
+ candidates = strategy.generate(base, data.budget)
223
+
224
  configs = []
225
+ for candidate in candidates:
226
+ candidate_name = candidate.agent.name
227
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  existing = await session.execute(
229
+ select(AgentConfig).where(AgentConfig.name == candidate_name).limit(1)
230
  )
231
  existing_config = existing.scalar_one_or_none()
232
 
233
  if existing_config:
234
  configs.append(existing_config)
235
  else:
236
+ from dataclasses import asdict
237
  config = AgentConfig(
238
+ name=candidate_name,
239
+ description=candidate.rationale,
240
+ config_json={
241
+ "name": candidate_name,
242
+ "compaction": asdict(candidate.agent.compaction),
243
+ "tools": candidate.agent.tools,
244
+ },
245
  is_auto_generated=True,
246
  job_id=job_uuid,
247
  )
src/flow/ui/api/jobs.py CHANGED
@@ -53,12 +53,12 @@ async def create_job(
53
  session: AsyncSession = Depends(get_session),
54
  ) -> OptimizationJob:
55
  """Create a new optimization job."""
56
- # Validate config_ids exist
57
- for config_id in data.config_ids:
58
- uuid_id = parse_uuid(config_id)
59
  result = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
60
  if not result.scalar_one_or_none():
61
- raise HTTPException(status_code=400, detail=f"Config {config_id} not found")
62
 
63
  # Validate task_ids exist
64
  for task_id in data.task_ids:
@@ -69,11 +69,11 @@ async def create_job(
69
 
70
  job = OptimizationJob(
71
  name=data.name,
72
- config_ids=data.config_ids,
73
  task_ids=data.task_ids,
74
  parallel=data.parallel,
75
  use_llm_eval=data.use_llm_eval,
76
- total_experiments=len(data.config_ids) * len(data.task_ids),
77
  )
78
  session.add(job)
79
  await session.commit()
 
53
  session: AsyncSession = Depends(get_session),
54
  ) -> OptimizationJob:
55
  """Create a new optimization job."""
56
+ # Validate candidate_ids exist
57
+ for candidate_id in data.candidate_ids:
58
+ uuid_id = parse_uuid(candidate_id)
59
  result = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
60
  if not result.scalar_one_or_none():
61
+ raise HTTPException(status_code=400, detail=f"Candidate {candidate_id} not found")
62
 
63
  # Validate task_ids exist
64
  for task_id in data.task_ids:
 
69
 
70
  job = OptimizationJob(
71
  name=data.name,
72
+ candidate_ids=data.candidate_ids,
73
  task_ids=data.task_ids,
74
  parallel=data.parallel,
75
  use_llm_eval=data.use_llm_eval,
76
+ total_experiments=len(data.candidate_ids) * len(data.task_ids),
77
  )
78
  session.add(job)
79
  await session.commit()
src/flow/ui/api/runs.py CHANGED
@@ -26,7 +26,7 @@ def parse_uuid(id_str: str) -> UUID:
26
  @router.get("", response_model=list[RunResponse])
27
  async def list_runs(
28
  job_id: str | None = None,
29
- config_name: str | None = None,
30
  task_name: str | None = None,
31
  is_pareto: bool | None = None,
32
  session: AsyncSession = Depends(get_session),
@@ -37,8 +37,8 @@ async def list_runs(
37
  if job_id:
38
  uuid_id = parse_uuid(job_id)
39
  query = query.where(ExperimentRun.job_id == uuid_id)
40
- if config_name:
41
- query = query.where(ExperimentRun.config_name == config_name)
42
  if task_name:
43
  query = query.where(ExperimentRun.task_name == task_name)
44
  if is_pareto is not None:
@@ -75,7 +75,7 @@ async def get_run(
75
  return {
76
  "id": str(run.id),
77
  "job_id": str(run.job_id),
78
- "config_name": run.config_name,
79
  "task_name": run.task_name,
80
  "status": run.status,
81
  "tokens_total": run.tokens_total,
@@ -111,11 +111,11 @@ async def get_job_summary(
111
  raise HTTPException(status_code=404, detail="No runs found for job")
112
 
113
  # Aggregate by config
114
- config_summaries: dict[str, dict[str, Any]] = {}
115
  for run in runs:
116
- if run.config_name not in config_summaries:
117
- config_summaries[run.config_name] = {
118
- "config_name": run.config_name,
119
  "total_runs": 0,
120
  "passed_runs": 0,
121
  "avg_score": 0.0,
@@ -125,7 +125,7 @@ async def get_job_summary(
125
  "pareto_rank": 999,
126
  }
127
 
128
- summary = config_summaries[run.config_name]
129
  summary["total_runs"] += 1
130
  if run.passed:
131
  summary["passed_runs"] += 1
@@ -137,7 +137,7 @@ async def get_job_summary(
137
  summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)
138
 
139
  # Calculate averages
140
- for summary in config_summaries.values():
141
  n = summary["total_runs"]
142
  summary["avg_score"] /= n
143
  summary["avg_tokens"] /= n
@@ -145,13 +145,13 @@ async def get_job_summary(
145
 
146
  # Sort by score descending
147
  sorted_summaries = sorted(
148
- config_summaries.values(),
149
  key=lambda x: (-x["avg_score"], x["avg_tokens"]),
150
  )
151
 
152
  return {
153
  "job_id": job_id,
154
  "total_runs": len(runs),
155
- "config_summaries": sorted_summaries,
156
- "pareto_configs": [s["config_name"] for s in sorted_summaries if s["is_pareto"]],
157
  }
 
26
  @router.get("", response_model=list[RunResponse])
27
  async def list_runs(
28
  job_id: str | None = None,
29
+ candidate_name: str | None = None,
30
  task_name: str | None = None,
31
  is_pareto: bool | None = None,
32
  session: AsyncSession = Depends(get_session),
 
37
  if job_id:
38
  uuid_id = parse_uuid(job_id)
39
  query = query.where(ExperimentRun.job_id == uuid_id)
40
+ if candidate_name:
41
+ query = query.where(ExperimentRun.candidate_name == candidate_name)
42
  if task_name:
43
  query = query.where(ExperimentRun.task_name == task_name)
44
  if is_pareto is not None:
 
75
  return {
76
  "id": str(run.id),
77
  "job_id": str(run.job_id),
78
+ "candidate_name": run.candidate_name,
79
  "task_name": run.task_name,
80
  "status": run.status,
81
  "tokens_total": run.tokens_total,
 
111
  raise HTTPException(status_code=404, detail="No runs found for job")
112
 
113
  # Aggregate by config
114
+ candidate_summaries: dict[str, dict[str, Any]] = {}
115
  for run in runs:
116
+ if run.candidate_name not in candidate_summaries:
117
+ candidate_summaries[run.candidate_name] = {
118
+ "candidate_name": run.candidate_name,
119
  "total_runs": 0,
120
  "passed_runs": 0,
121
  "avg_score": 0.0,
 
125
  "pareto_rank": 999,
126
  }
127
 
128
+ summary = candidate_summaries[run.candidate_name]
129
  summary["total_runs"] += 1
130
  if run.passed:
131
  summary["passed_runs"] += 1
 
137
  summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)
138
 
139
  # Calculate averages
140
+ for summary in candidate_summaries.values():
141
  n = summary["total_runs"]
142
  summary["avg_score"] /= n
143
  summary["avg_tokens"] /= n
 
145
 
146
  # Sort by score descending
147
  sorted_summaries = sorted(
148
+ candidate_summaries.values(),
149
  key=lambda x: (-x["avg_score"], x["avg_tokens"]),
150
  )
151
 
152
  return {
153
  "job_id": job_id,
154
  "total_runs": len(runs),
155
+ "candidate_summaries": sorted_summaries,
156
+ "pareto_candidates": [s["candidate_name"] for s in sorted_summaries if s["is_pareto"]],
157
  }
src/flow/ui/database.py CHANGED
@@ -21,70 +21,14 @@ engine = create_async_engine(DATABASE_URL, echo=False, future=True)
21
  async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
22
 
23
 
24
- async def _migrate_schema(conn) -> None:
25
- """Apply schema migrations for new columns.
26
-
27
- SQLModel's create_all only creates missing tables, not columns.
28
- This adds any missing columns to existing tables.
29
- """
30
- from sqlalchemy import text, inspect
31
-
32
- def _sync_migrate(sync_conn):
33
- inspector = inspect(sync_conn)
34
-
35
- # Check agent_configs table
36
- if inspector.has_table("agent_configs"):
37
- columns = {c["name"] for c in inspector.get_columns("agent_configs")}
38
-
39
- # Add is_auto_generated column if missing
40
- if "is_auto_generated" not in columns:
41
- logger.info("Adding is_auto_generated column to agent_configs")
42
- sync_conn.execute(
43
- text("ALTER TABLE agent_configs ADD COLUMN is_auto_generated BOOLEAN DEFAULT 0")
44
- )
45
-
46
- # Add job_id column if missing
47
- if "job_id" not in columns:
48
- logger.info("Adding job_id column to agent_configs")
49
- sync_conn.execute(
50
- text("ALTER TABLE agent_configs ADD COLUMN job_id VARCHAR(36)")
51
- )
52
-
53
- # Retroactively mark configs with "Auto-generated variation:" in description
54
- logger.info("Marking auto-generated configs based on description pattern")
55
- sync_conn.execute(
56
- text(
57
- "UPDATE agent_configs SET is_auto_generated = 1 "
58
- "WHERE description LIKE 'Auto-generated variation:%' "
59
- "AND (is_auto_generated IS NULL OR is_auto_generated = 0)"
60
- )
61
- )
62
-
63
- await conn.run_sync(_sync_migrate)
64
-
65
-
66
  async def init_db() -> None:
67
- """Initialize database tables.
68
-
69
- With multiple uvicorn workers, each worker calls this on startup.
70
- SQLite + create_all can race: worker A checks table doesn't exist,
71
- worker B creates it, worker A tries to create and fails.
72
-
73
- Solution: Catch the OperationalError and continue - if the table
74
- already exists, that's fine.
75
-
76
- See: https://github.com/sqlalchemy/sqlalchemy/issues/4936
77
- """
78
- # Import models to ensure they're registered with SQLModel.metadata
79
  from flow.ui.models import AgentConfig, TaskModel, OptimizationJob, ExperimentRun # noqa: F401
80
 
81
  try:
82
  async with engine.begin() as conn:
83
  await conn.run_sync(SQLModel.metadata.create_all)
84
- # Apply migrations for new columns
85
- await _migrate_schema(conn)
86
  except Exception as e:
87
- # Handle race condition: "table already exists" is fine
88
  if "already exists" in str(e).lower():
89
  logger.debug("Tables already exist (race condition handled)")
90
  else:
 
21
  async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  async def init_db() -> None:
25
+ """Initialize database tables."""
 
 
 
 
 
 
 
 
 
 
 
26
  from flow.ui.models import AgentConfig, TaskModel, OptimizationJob, ExperimentRun # noqa: F401
27
 
28
  try:
29
  async with engine.begin() as conn:
30
  await conn.run_sync(SQLModel.metadata.create_all)
 
 
31
  except Exception as e:
 
32
  if "already exists" in str(e).lower():
33
  logger.debug("Tables already exist (race condition handled)")
34
  else:
src/flow/ui/models/config.py CHANGED
@@ -17,12 +17,12 @@ class AgentConfig(SQLModel, table=True):
17
  name: str = Field(index=True)
18
  description: str = ""
19
 
20
- # Store AblationConfig as JSON
21
  config_json: dict[str, Any] = Field(default_factory=dict, sa_column=Column(JSON))
22
 
23
- # Track auto-generated configs (created by variation endpoint)
24
  is_auto_generated: bool = Field(default=False, index=True)
25
- # Link to the job that created this config (if auto-generated)
26
  job_id: UUID | None = Field(default=None, index=True)
27
 
28
  created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
@@ -30,5 +30,5 @@ class AgentConfig(SQLModel, table=True):
30
 
31
  @property
32
  def config(self) -> dict[str, Any]:
33
- """Alias for config_json for API compatibility."""
34
  return self.config_json
 
17
  name: str = Field(index=True)
18
  description: str = ""
19
 
20
+ # Store Agent config as JSON
21
  config_json: dict[str, Any] = Field(default_factory=dict, sa_column=Column(JSON))
22
 
23
+ # Track auto-generated candidates (created by generate-candidates endpoint)
24
  is_auto_generated: bool = Field(default=False, index=True)
25
+ # Link to the job that created this candidate (if auto-generated)
26
  job_id: UUID | None = Field(default=None, index=True)
27
 
28
  created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
 
30
 
31
  @property
32
  def config(self) -> dict[str, Any]:
33
+ """Alias for config_json used by API response serialization."""
34
  return self.config_json
src/flow/ui/models/job.py CHANGED
@@ -33,7 +33,7 @@ class OptimizationJob(SQLModel, table=True):
33
  use_llm_eval: bool = Field(default=False)
34
 
35
  # Selected configs and tasks (stored as IDs)
36
- config_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
37
  task_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
38
 
39
  # Results
 
33
  use_llm_eval: bool = Field(default=False)
34
 
35
  # Selected configs and tasks (stored as IDs)
36
+ candidate_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
37
  task_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
38
 
39
  # Results
src/flow/ui/models/run.py CHANGED
@@ -16,7 +16,7 @@ class ExperimentRun(SQLModel, table=True):
16
  id: UUID = Field(default_factory=uuid4, primary_key=True)
17
  job_id: UUID = Field(foreign_key="optimization_jobs.id", index=True)
18
 
19
- config_name: str
20
  task_name: str
21
 
22
  # Status
 
16
  id: UUID = Field(default_factory=uuid4, primary_key=True)
17
  job_id: UUID = Field(foreign_key="optimization_jobs.id", index=True)
18
 
19
+ candidate_name: str
20
  task_name: str
21
 
22
  # Status
src/flow/ui/models/task.py CHANGED
@@ -28,5 +28,5 @@ class TaskModel(SQLModel, table=True):
28
 
29
  @property
30
  def criteria(self) -> list[dict[str, Any]]:
31
- """Alias for criteria_json for API compatibility."""
32
  return self.criteria_json
 
28
 
29
  @property
30
  def criteria(self) -> list[dict[str, Any]]:
31
+ """Alias for criteria_json used by API response serialization."""
32
  return self.criteria_json
src/flow/ui/schemas/__init__.py CHANGED
@@ -1,15 +1,15 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
  """Pydantic schemas for API requests/responses."""
3
 
4
- from .config import ConfigCreate, ConfigUpdate, ConfigResponse
5
  from .task import TaskCreate, TaskResponse, CriterionSchema
6
  from .job import JobCreate, JobResponse, JobProgress
7
  from .run import RunResponse, RunDetailResponse, CriterionResultSchema
8
 
9
  __all__ = [
10
- "ConfigCreate",
11
- "ConfigUpdate",
12
- "ConfigResponse",
13
  "TaskCreate",
14
  "TaskResponse",
15
  "CriterionSchema",
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
  """Pydantic schemas for API requests/responses."""
3
 
4
+ from .config import AgentCreate, AgentUpdate, AgentResponse
5
  from .task import TaskCreate, TaskResponse, CriterionSchema
6
  from .job import JobCreate, JobResponse, JobProgress
7
  from .run import RunResponse, RunDetailResponse, CriterionResultSchema
8
 
9
  __all__ = [
10
+ "AgentCreate",
11
+ "AgentUpdate",
12
+ "AgentResponse",
13
  "TaskCreate",
14
  "TaskResponse",
15
  "CriterionSchema",
src/flow/ui/schemas/config.py CHANGED
@@ -1,5 +1,5 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
- """Config schemas."""
3
 
4
  from datetime import datetime
5
  from typing import Any
@@ -8,46 +8,52 @@ from uuid import UUID
8
  from pydantic import BaseModel, ConfigDict, field_validator
9
 
10
 
11
- class ConfigCreate(BaseModel):
12
- """Request schema for creating a config."""
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  name: str
15
  description: str = ""
16
- enable_message_compaction: bool = True
17
- enable_memory_tool: bool = True
18
- enable_sub_agent: bool = False
19
- compaction_head_size: int = 10
20
- compaction_tail_size: int = 40
21
- bash_timeout: int = 120
22
 
23
  def to_config_json(self) -> dict[str, Any]:
24
- """Convert to config JSON for storage."""
25
  return {
26
- "name": self.name,
27
- "enable_message_compaction": self.enable_message_compaction,
28
- "enable_memory_tool": self.enable_memory_tool,
29
- "enable_sub_agent": self.enable_sub_agent,
30
- "compaction_head_size": self.compaction_head_size,
31
- "compaction_tail_size": self.compaction_tail_size,
32
- "bash_timeout": self.bash_timeout,
33
  }
34
 
35
 
36
- class ConfigUpdate(BaseModel):
37
- """Request schema for updating a config."""
38
 
39
  name: str | None = None
40
  description: str | None = None
41
- enable_message_compaction: bool | None = None
42
- enable_memory_tool: bool | None = None
43
- enable_sub_agent: bool | None = None
44
- compaction_head_size: int | None = None
45
- compaction_tail_size: int | None = None
46
- bash_timeout: int | None = None
47
 
48
 
49
- class ConfigResponse(BaseModel):
50
- """Response schema for a config."""
51
 
52
  model_config = ConfigDict(from_attributes=True)
53
 
@@ -63,7 +69,6 @@ class ConfigResponse(BaseModel):
63
  @field_validator("id", mode="before")
64
  @classmethod
65
  def convert_uuid(cls, v: UUID | str) -> str:
66
- """Convert UUID to string."""
67
  if isinstance(v, UUID):
68
  return str(v)
69
  return v
@@ -71,7 +76,6 @@ class ConfigResponse(BaseModel):
71
  @field_validator("job_id", mode="before")
72
  @classmethod
73
  def convert_job_uuid(cls, v: UUID | str | None) -> str | None:
74
- """Convert job UUID to string."""
75
  if v is None:
76
  return None
77
  if isinstance(v, UUID):
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
+ """Agent config schemas."""
3
 
4
  from datetime import datetime
5
  from typing import Any
 
8
  from pydantic import BaseModel, ConfigDict, field_validator
9
 
10
 
11
+ class CompactionConfigSchema(BaseModel):
12
+ """Compaction strategy configuration."""
13
+
14
+ strategy: str = "head_tail"
15
+ params: dict[str, Any] = {"head_size": 10, "tail_size": 40}
16
+
17
+
18
+ class AgentCreate(BaseModel):
19
+ """Request schema for creating an agent.
20
+
21
+ Tools can be specified as:
22
+ - str: Preset name ("standard", "minimal", "full", "readonly")
23
+ - list[str]: List of tool names
24
+ - dict[str, dict]: Full specification with per-tool configs
25
+ """
26
 
27
  name: str
28
  description: str = ""
29
+ instructions: str | None = None
30
+ model: str | None = None
31
+ compaction: CompactionConfigSchema = CompactionConfigSchema()
32
+ tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
 
 
33
 
34
  def to_config_json(self) -> dict[str, Any]:
35
+ """Convert to config JSON for storage (runtime settings only)."""
36
  return {
37
+ "instructions": self.instructions,
38
+ "model": self.model,
39
+ "compaction": self.compaction.model_dump(),
40
+ "tools": self.tools,
 
 
 
41
  }
42
 
43
 
44
+ class AgentUpdate(BaseModel):
45
+ """Request schema for updating an agent."""
46
 
47
  name: str | None = None
48
  description: str | None = None
49
+ instructions: str | None = None
50
+ model: str | None = None
51
+ compaction: CompactionConfigSchema | None = None
52
+ tools: str | list[str] | dict[str, dict[str, Any]] | None = None
 
 
53
 
54
 
55
+ class AgentResponse(BaseModel):
56
+ """Response schema for an agent."""
57
 
58
  model_config = ConfigDict(from_attributes=True)
59
 
 
69
  @field_validator("id", mode="before")
70
  @classmethod
71
  def convert_uuid(cls, v: UUID | str) -> str:
 
72
  if isinstance(v, UUID):
73
  return str(v)
74
  return v
 
76
  @field_validator("job_id", mode="before")
77
  @classmethod
78
  def convert_job_uuid(cls, v: UUID | str | None) -> str | None:
 
79
  if v is None:
80
  return None
81
  if isinstance(v, UUID):
src/flow/ui/schemas/job.py CHANGED
@@ -13,7 +13,7 @@ class JobCreate(BaseModel):
13
  """Request schema for creating a job."""
14
 
15
  name: str = ""
16
- config_ids: list[str]
17
  task_ids: list[str]
18
  parallel: int = 4
19
  use_llm_eval: bool = False
@@ -29,7 +29,7 @@ class JobResponse(BaseModel):
29
  status: JobStatus
30
  parallel: int
31
  use_llm_eval: bool
32
- config_ids: list[str]
33
  task_ids: list[str]
34
  pareto_frontier: list[str]
35
  output_dir: str | None
@@ -56,6 +56,6 @@ class JobProgress(BaseModel):
56
  job_id: str
57
  completed: int = 0
58
  total: int = 0
59
- current_config: str = ""
60
  current_task: str = ""
61
  message: str = ""
 
13
  """Request schema for creating a job."""
14
 
15
  name: str = ""
16
+ candidate_ids: list[str]
17
  task_ids: list[str]
18
  parallel: int = 4
19
  use_llm_eval: bool = False
 
29
  status: JobStatus
30
  parallel: int
31
  use_llm_eval: bool
32
+ candidate_ids: list[str]
33
  task_ids: list[str]
34
  pareto_frontier: list[str]
35
  output_dir: str | None
 
56
  job_id: str
57
  completed: int = 0
58
  total: int = 0
59
+ current_candidate: str = ""
60
  current_task: str = ""
61
  message: str = ""
src/flow/ui/schemas/run.py CHANGED
@@ -15,7 +15,7 @@ class RunResponse(BaseModel):
15
 
16
  id: str
17
  job_id: str
18
- config_name: str
19
  task_name: str
20
  status: str
21
  tokens_total: int
@@ -51,7 +51,7 @@ class RunDetailResponse(BaseModel):
51
 
52
  id: str
53
  job_id: str
54
- config_name: str
55
  task_name: str
56
  status: str
57
 
 
15
 
16
  id: str
17
  job_id: str
18
+ candidate_name: str
19
  task_name: str
20
  status: str
21
  tokens_total: int
 
51
 
52
  id: str
53
  job_id: str
54
+ candidate_name: str
55
  task_name: str
56
  status: str
57
 
src/flow/ui/services/optimizer_service.py CHANGED
@@ -9,7 +9,7 @@ from uuid import UUID
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from sqlmodel import select
11
 
12
- from flow.experiments.ablation import AblationConfig
13
  from flow.experiments.optimizer import FlowOptimizer
14
  from flow.experiments.types import EvalCriterion, Task
15
 
@@ -26,12 +26,10 @@ class OptimizerService:
26
 
27
  async def run_job(self, job_id: str | UUID) -> AsyncGenerator[JobProgress, None]:
28
  """Run an optimization job and yield progress updates."""
29
- # Convert to UUID if string
30
  if isinstance(job_id, str):
31
  job_id = UUID(job_id)
32
 
33
  async with async_session() as session:
34
- # Load job
35
  result = await session.execute(
36
  select(OptimizationJob).where(OptimizationJob.id == job_id)
37
  )
@@ -44,7 +42,6 @@ class OptimizerService:
44
  )
45
  return
46
 
47
- # Update job status
48
  job.status = JobStatus.RUNNING
49
  job.started_at = datetime.now(timezone.utc)
50
  await session.commit()
@@ -58,48 +55,39 @@ class OptimizerService:
58
  )
59
 
60
  try:
61
- # Load configs
62
- configs = await self._load_configs(session, job.config_ids)
63
- if not configs:
64
- raise ValueError("No valid configs found")
65
 
66
- # Load tasks
67
  tasks = await self._load_tasks(session, job.task_ids)
68
  if not tasks:
69
  raise ValueError("No valid tasks found")
70
 
71
- # Create optimizer
72
  optimizer = FlowOptimizer(
73
  parallel=job.parallel,
74
  use_llm_evaluator=job.use_llm_eval,
75
  )
76
 
77
- # Track progress via callback
78
  progress_queue: asyncio.Queue[tuple[int, int, str, str]] = asyncio.Queue()
79
 
80
  def progress_callback(completed: int, total: int, config: str, task: str) -> None:
81
- """Callback invoked by FlowOptimizer on each completion."""
82
  try:
83
  progress_queue.put_nowait((completed, total, config, task))
84
  except asyncio.QueueFull:
85
  pass
86
 
87
- # Run optimization in background task
88
  async def run_optimization():
89
  return await optimizer.optimize(
90
- configs=configs,
91
  tasks=tasks,
92
  progress_callback=progress_callback,
93
  )
94
 
95
- # Start optimization
96
  opt_task = asyncio.create_task(run_optimization())
97
 
98
- # Yield progress updates while optimization runs
99
  while not opt_task.done():
100
  try:
101
- # Wait for progress with timeout
102
- completed, total, config_name, task_name = await asyncio.wait_for(
103
  progress_queue.get(),
104
  timeout=1.0,
105
  )
@@ -108,32 +96,26 @@ class OptimizerService:
108
  job_id=str(job_id),
109
  completed=completed,
110
  total=total,
111
- current_config=config_name,
112
  current_task=task_name,
113
- message=f"Running {config_name}/{task_name}...",
114
  )
115
 
116
- # Update job progress in DB
117
  job.completed_experiments = completed
118
  await session.commit()
119
 
120
  except asyncio.TimeoutError:
121
- # No progress update, check if task failed
122
  if opt_task.done():
123
- # Check for exception before breaking
124
  exc = opt_task.exception()
125
  if exc:
126
  raise exc
127
  continue
128
 
129
- # Get final result - this will re-raise any exception from the task
130
  opt_result = await opt_task
131
 
132
- # Check if all experiments failed
133
  if opt_result.total_experiments == 0 or len(opt_result.summaries) == 0:
134
- # No successful experiments - this is a failure
135
  job.status = JobStatus.FAILED
136
- job.error = "All experiments failed. Check server logs for details. Common causes: missing API keys (AZURE_OPENAI_ENDPOINT, OPENAI_API_KEY), invalid configuration."
137
  job.completed_at = datetime.now(timezone.utc)
138
  await session.commit()
139
 
@@ -144,12 +126,11 @@ class OptimizerService:
144
  )
145
  return
146
 
147
- # Save runs to database
148
  for summary in opt_result.summaries:
149
  for task_result in summary.task_results:
150
  run = ExperimentRun(
151
  job_id=job.id,
152
- config_name=task_result.config_name,
153
  task_name=task_result.task_name,
154
  status="completed",
155
  tokens_total=task_result.metrics.total_tokens,
@@ -171,7 +152,6 @@ class OptimizerService:
171
  )
172
  session.add(run)
173
 
174
- # Update job
175
  job.status = JobStatus.COMPLETED
176
  job.completed_experiments = opt_result.total_experiments
177
  job.pareto_frontier = opt_result.pareto_frontier
@@ -184,7 +164,7 @@ class OptimizerService:
184
  job_id=str(job_id),
185
  completed=opt_result.total_experiments,
186
  total=job.total_experiments,
187
- message=f"Optimization complete. Pareto configs: {', '.join(opt_result.pareto_frontier)}",
188
  )
189
 
190
  except Exception as e:
@@ -199,37 +179,47 @@ class OptimizerService:
199
  message=f"Optimization failed: {e}",
200
  )
201
 
202
- async def _load_configs(
203
  self,
204
  session: AsyncSession,
205
- config_ids: list[str],
206
- ) -> list[AblationConfig]:
207
- """Load configs from database and convert to AblationConfig."""
208
- configs = []
209
- for config_id in config_ids:
210
  result = await session.execute(
211
- select(AgentConfig).where(AgentConfig.id == UUID(config_id))
212
  )
213
  db_config = result.scalar_one_or_none()
214
  if db_config:
215
  cfg = db_config.config_json
216
- configs.append(AblationConfig(
 
 
 
 
 
 
 
 
 
 
 
217
  name=db_config.name,
218
- enable_message_compaction=cfg.get("enable_message_compaction", True),
219
- enable_memory_tool=cfg.get("enable_memory_tool", True),
220
- enable_sub_agent=cfg.get("enable_sub_agent", False),
221
- compaction_head_size=cfg.get("compaction_head_size", 10),
222
- compaction_tail_size=cfg.get("compaction_tail_size", 40),
223
- bash_timeout=cfg.get("bash_timeout", 120),
224
- ))
225
- return configs
226
 
227
  async def _load_tasks(
228
  self,
229
  session: AsyncSession,
230
  task_ids: list[str],
231
  ) -> list[Task]:
232
- """Load tasks from database and convert to Task."""
233
  tasks = []
234
  for task_id in task_ids:
235
  result = await session.execute(
 
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from sqlmodel import select
11
 
12
+ from flow.experiments.models import Agent, Candidate, CompactionConfig
13
  from flow.experiments.optimizer import FlowOptimizer
14
  from flow.experiments.types import EvalCriterion, Task
15
 
 
26
 
27
  async def run_job(self, job_id: str | UUID) -> AsyncGenerator[JobProgress, None]:
28
  """Run an optimization job and yield progress updates."""
 
29
  if isinstance(job_id, str):
30
  job_id = UUID(job_id)
31
 
32
  async with async_session() as session:
 
33
  result = await session.execute(
34
  select(OptimizationJob).where(OptimizationJob.id == job_id)
35
  )
 
42
  )
43
  return
44
 
 
45
  job.status = JobStatus.RUNNING
46
  job.started_at = datetime.now(timezone.utc)
47
  await session.commit()
 
55
  )
56
 
57
  try:
58
+ candidates = await self._load_candidates(session, job.candidate_ids)
59
+ if not candidates:
60
+ raise ValueError("No valid candidates found")
 
61
 
 
62
  tasks = await self._load_tasks(session, job.task_ids)
63
  if not tasks:
64
  raise ValueError("No valid tasks found")
65
 
 
66
  optimizer = FlowOptimizer(
67
  parallel=job.parallel,
68
  use_llm_evaluator=job.use_llm_eval,
69
  )
70
 
 
71
  progress_queue: asyncio.Queue[tuple[int, int, str, str]] = asyncio.Queue()
72
 
73
  def progress_callback(completed: int, total: int, config: str, task: str) -> None:
 
74
  try:
75
  progress_queue.put_nowait((completed, total, config, task))
76
  except asyncio.QueueFull:
77
  pass
78
 
 
79
  async def run_optimization():
80
  return await optimizer.optimize(
81
+ candidates=candidates,
82
  tasks=tasks,
83
  progress_callback=progress_callback,
84
  )
85
 
 
86
  opt_task = asyncio.create_task(run_optimization())
87
 
 
88
  while not opt_task.done():
89
  try:
90
+ completed, total, candidate_name, task_name = await asyncio.wait_for(
 
91
  progress_queue.get(),
92
  timeout=1.0,
93
  )
 
96
  job_id=str(job_id),
97
  completed=completed,
98
  total=total,
99
+ current_candidate=candidate_name,
100
  current_task=task_name,
101
+ message=f"Running {candidate_name}/{task_name}...",
102
  )
103
 
 
104
  job.completed_experiments = completed
105
  await session.commit()
106
 
107
  except asyncio.TimeoutError:
 
108
  if opt_task.done():
 
109
  exc = opt_task.exception()
110
  if exc:
111
  raise exc
112
  continue
113
 
 
114
  opt_result = await opt_task
115
 
 
116
  if opt_result.total_experiments == 0 or len(opt_result.summaries) == 0:
 
117
  job.status = JobStatus.FAILED
118
+ job.error = "All experiments failed. Check server logs for details."
119
  job.completed_at = datetime.now(timezone.utc)
120
  await session.commit()
121
 
 
126
  )
127
  return
128
 
 
129
  for summary in opt_result.summaries:
130
  for task_result in summary.task_results:
131
  run = ExperimentRun(
132
  job_id=job.id,
133
+ candidate_name=task_result.candidate_name,
134
  task_name=task_result.task_name,
135
  status="completed",
136
  tokens_total=task_result.metrics.total_tokens,
 
152
  )
153
  session.add(run)
154
 
 
155
  job.status = JobStatus.COMPLETED
156
  job.completed_experiments = opt_result.total_experiments
157
  job.pareto_frontier = opt_result.pareto_frontier
 
164
  job_id=str(job_id),
165
  completed=opt_result.total_experiments,
166
  total=job.total_experiments,
167
+ message=f"Optimization complete. Pareto candidates: {', '.join(opt_result.pareto_frontier)}",
168
  )
169
 
170
  except Exception as e:
 
179
  message=f"Optimization failed: {e}",
180
  )
181
 
182
+ async def _load_candidates(
183
  self,
184
  session: AsyncSession,
185
+ candidate_ids: list[str],
186
+ ) -> list[Candidate]:
187
+ """Load configs from database and convert to Candidate objects."""
188
+ candidates = []
189
+ for candidate_id in candidate_ids:
190
  result = await session.execute(
191
+ select(AgentConfig).where(AgentConfig.id == UUID(candidate_id))
192
  )
193
  db_config = result.scalar_one_or_none()
194
  if db_config:
195
  cfg = db_config.config_json
196
+
197
+ # Build CompactionConfig from stored JSON
198
+ compaction_data = cfg.get("compaction", {})
199
+ compaction = CompactionConfig(
200
+ strategy=compaction_data.get("strategy", "head_tail"),
201
+ params=compaction_data.get("params", {"head_size": 10, "tail_size": 40}),
202
+ )
203
+
204
+ # Get tools configuration (can be str, list, or dict)
205
+ tools = cfg.get("tools", "standard")
206
+
207
+ agent = Agent(
208
  name=db_config.name,
209
+ instructions=cfg.get("instructions"),
210
+ model=cfg.get("model"),
211
+ compaction=compaction,
212
+ tools=tools,
213
+ )
214
+ candidates.append(Candidate(agent=agent))
215
+ return candidates
 
216
 
217
  async def _load_tasks(
218
  self,
219
  session: AsyncSession,
220
  task_ids: list[str],
221
  ) -> list[Task]:
222
+ """Load tasks from database and convert to Task objects."""
223
  tasks = []
224
  for task_id in task_ids:
225
  result = await session.execute(
src/flow/ui/tests/test_e2e_user_journey.py CHANGED
@@ -138,7 +138,7 @@ class TestE2EUserJourney:
138
 
139
  job_data = {
140
  "name": "E2E Test Optimization",
141
- "config_ids": created_agent_ids,
142
  "task_ids": created_task_ids[:2], # Use first 2 tasks
143
  "parallel": 2,
144
  "use_llm_eval": False,
@@ -150,7 +150,7 @@ class TestE2EUserJourney:
150
  print(f" ✓ Created job: {job['name']} (id: {job['id'][:8]}...)")
151
  print(f" - Status: {job['status']}")
152
  print(f" - Total experiments: {job['total_experiments']}")
153
- print(f" - Configs: {len(job['config_ids'])}, Tasks: {len(job['task_ids'])}")
154
 
155
  # ========================================
156
  # STEP 5: Get Job Details
@@ -284,7 +284,7 @@ class TestE2EUserJourney:
284
  # Test creating job with non-existent config
285
  job_data = {
286
  "name": "Invalid Job",
287
- "config_ids": ["00000000-0000-0000-0000-000000000000"],
288
  "task_ids": ["00000000-0000-0000-0000-000000000001"],
289
  }
290
  resp = await client.post("/api/jobs", json=job_data)
@@ -403,7 +403,7 @@ class TestAPIEndpoints:
403
  "/api/jobs",
404
  json={
405
  "name": "test-job",
406
- "config_ids": [config["id"]],
407
  "task_ids": [task["id"]],
408
  },
409
  )
@@ -481,7 +481,7 @@ class TestAPIEndpoints:
481
  "/api/jobs",
482
  json={
483
  "name": "start-test-job",
484
- "config_ids": [config["id"]],
485
  "task_ids": [task["id"]],
486
  "parallel": 1,
487
  },
@@ -593,7 +593,7 @@ class TestAPIEndpoints:
593
  "/api/jobs",
594
  json={
595
  "name": "reset-test-job",
596
- "config_ids": [config["id"]],
597
  "task_ids": [task["id"]],
598
  },
599
  )
 
138
 
139
  job_data = {
140
  "name": "E2E Test Optimization",
141
+ "candidate_ids": created_agent_ids,
142
  "task_ids": created_task_ids[:2], # Use first 2 tasks
143
  "parallel": 2,
144
  "use_llm_eval": False,
 
150
  print(f" ✓ Created job: {job['name']} (id: {job['id'][:8]}...)")
151
  print(f" - Status: {job['status']}")
152
  print(f" - Total experiments: {job['total_experiments']}")
153
+ print(f" - Candidates: {len(job['candidate_ids'])}, Tasks: {len(job['task_ids'])}")
154
 
155
  # ========================================
156
  # STEP 5: Get Job Details
 
284
  # Test creating job with non-existent config
285
  job_data = {
286
  "name": "Invalid Job",
287
+ "candidate_ids": ["00000000-0000-0000-0000-000000000000"],
288
  "task_ids": ["00000000-0000-0000-0000-000000000001"],
289
  }
290
  resp = await client.post("/api/jobs", json=job_data)
 
403
  "/api/jobs",
404
  json={
405
  "name": "test-job",
406
+ "candidate_ids": [config["id"]],
407
  "task_ids": [task["id"]],
408
  },
409
  )
 
481
  "/api/jobs",
482
  json={
483
  "name": "start-test-job",
484
+ "candidate_ids": [config["id"]],
485
  "task_ids": [task["id"]],
486
  "parallel": 1,
487
  },
 
593
  "/api/jobs",
594
  json={
595
  "name": "reset-test-job",
596
+ "candidate_ids": [config["id"]],
597
  "task_ids": [task["id"]],
598
  },
599
  )
src/flow/ui/ui/assets/index-2zMAgGgo.js ADDED
The diff for this file is too large to render. See raw diff
 
src/flow/ui/ui/assets/index-BG9n9RHB.js ADDED
The diff for this file is too large to render. See raw diff
 
src/flow/ui/ui/assets/index-BHAF8mLj.css ADDED
@@ -0,0 +1 @@
 
 
1
+ *,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: 
;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:JetBrains 
Mono,ui-monospace,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}.pointer-events-none{pointer-events:none}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.sticky{position:sticky}.inset-0{top:0;right:0;bottom:0;left:0}.bottom-0{bottom:0}.left-0{left:0}.left-3{left:.75rem}.top-0{top:0}.top-1\/2{top:50%}.z-10{z-index:10}.z-50{z-index:50}.mx-0\.5{margin-left:.125rem;m
argin-right:.125rem}.mx-4{margin-left:1rem;margin-right:1rem}.mx-auto{margin-left:auto;margin-right:auto}.-mt-1{margin-top:-.25rem}.mb-1{margin-bottom:.25rem}.mb-2{margin-bottom:.5rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.mb-8{margin-bottom:2rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.ml-6{margin-left:1.5rem}.mr-1{margin-right:.25rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.line-clamp-2{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:2}.line-clamp-3{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:3}.block{display:block}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-1\.5{height:.375rem}.h-12{height:3rem}.h-2{height:.5rem}.h-3{height:.75rem}.h-32{height:8rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-8{height:2rem}.h-full{height:100%}.max-h-32{max-height:8rem}.max-h-40{max-height:10rem}.max-h-48{max-height:12rem}.max-h-96{max-height:24rem}.max-h-\[80vh\]{max-height:80vh}.min-h-\[100px\]{min-height:100px}.min-h-screen{min-height:100vh}.w-12{width:3rem}.w-2{width:.5rem}.w-20{width:5rem}.w-24{width:6rem}.w-3{width:.75rem}.w-32{width:8rem}.w-4{width:1rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-8{width:2rem}.w-full{width:100%}.min-w-0{min-width:0px}.min-w-\[90px\]{min-width:90px}.max-w-7xl{max-width:80rem}.max-w-lg{max-width:32rem}.max-w-md{max-width:28rem}.flex-1{flex:1 1 0%}.flex-shrink-0{flex-shrink:0}.-translate-y-1\/2{--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.rotate-180{--tw-rotate: 
180deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear infinite}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-y{resize:vertical}.resize{resize:both}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-1{row-gap:.25rem}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1rem * 
var(--tw-space-y-reverse))}.space-y-6>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1.5rem * var(--tw-space-y-reverse))}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.break-all{word-break:break-all}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-l-2{border-left-width:2px}.border-t{border-top-width:1px}.border-dashed{border-style:dashed}.border-\[var\(--accent\)\]{border-color:var(--accent)}.border-\[var\(--border\)\]{border-color:var(--border)}.border-blue-500\/30{border-color:#3b82f64d}.border-green-500\/30{border-color:#22c55e4d}.border-red-500\/30{border-color:#ef44444d}.border-red-500\/50{border-color:#ef444480}.bg-\[var\(--accent\)\]{background-color:var(--accent)}.bg-\[var\(--bg-primary\)\]{background-color:var(--bg-primary)}.bg-\[var\(--bg-secondary\)\]{background-color:var(--bg-secondary)}.bg-\[var\(--bg-tertiary\)\]{background-color:var(--bg-tertiary)}.bg-\[var\(--error\)\]{background-color:var(--error)}.bg-black\/80{background-color:#000c}.bg-blue-100{--tw-bg-opacity: 1;background-color:rgb(219 234 254 / var(--tw-bg-opacity, 1))}.bg-blue-400{--tw-bg-opacity: 1;background-color:rgb(96 165 250 / var(--tw-bg-opacity, 1))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity, 1))}.bg-blue-500\/10{background-color:#3b82f61a}.bg-blue-600{--tw-bg-opacity: 1;background-color:rgb(37 99 235 / var(--tw-bg-opacity, 1))}.bg-emerald-500{--tw-bg-opacity: 1;background-color:rgb(16 185 129 / var(--tw-bg-opacity, 1))}.bg-green-100{--tw-bg-opacity: 1;background-color:rgb(220 252 231 / var(--tw-bg-opacity, 
1))}.bg-green-400{--tw-bg-opacity: 1;background-color:rgb(74 222 128 / var(--tw-bg-opacity, 1))}.bg-green-500{--tw-bg-opacity: 1;background-color:rgb(34 197 94 / var(--tw-bg-opacity, 1))}.bg-green-500\/10{background-color:#22c55e1a}.bg-green-500\/20{background-color:#22c55e33}.bg-green-600{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 1))}.bg-orange-100{--tw-bg-opacity: 1;background-color:rgb(255 237 213 / var(--tw-bg-opacity, 1))}.bg-purple-100{--tw-bg-opacity: 1;background-color:rgb(243 232 255 / var(--tw-bg-opacity, 1))}.bg-red-100{--tw-bg-opacity: 1;background-color:rgb(254 226 226 / var(--tw-bg-opacity, 1))}.bg-red-500{--tw-bg-opacity: 1;background-color:rgb(239 68 68 / var(--tw-bg-opacity, 1))}.bg-red-500\/10{background-color:#ef44441a}.bg-red-600{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.bg-yellow-500{--tw-bg-opacity: 1;background-color:rgb(234 179 8 / var(--tw-bg-opacity, 1))}.p-1{padding:.25rem}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-1\.5{padding-left:.375rem;padding-right:.375rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-1\.5{padding-top:.375rem;padding-bottom:.375rem}.py-12{padding-top:3rem;padding-bottom:3rem}.py-16{padding-top:4rem;padding-bottom:4rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-4{padding-top:1rem;padding-bottom:1rem}.py-8{padding-top:2rem;padding-bottom:2rem}.pb-1{padding-bottom:.25rem}.pb-2{padding-bottom:.5rem}.pl-10{padding-left:2.5rem}.pr-3{padding-right:.75rem}.pr-4{padding-right:1rem}.pt-2{padding-top:.5rem}.pt-3{padding-top:.75rem}.pt-4{padding-top:1rem}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right
}.font-mono{font-family:JetBrains Mono,ui-monospace,monospace}.text-2xl{font-size:1.5rem;line-height:2rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.tracking-wide{letter-spacing:.025em}.tracking-wider{letter-spacing:.05em}.text-\[var\(--accent\)\]{color:var(--accent)}.text-\[var\(--error\)\]{color:var(--error)}.text-\[var\(--text-primary\)\]{color:var(--text-primary)}.text-\[var\(--text-secondary\)\]{color:var(--text-secondary)}.text-\[var\(--text-tertiary\)\]{color:var(--text-tertiary)}.text-black{--tw-text-opacity: 1;color:rgb(0 0 0 / var(--tw-text-opacity, 1))}.text-blue-400{--tw-text-opacity: 1;color:rgb(96 165 250 / var(--tw-text-opacity, 1))}.text-blue-800{--tw-text-opacity: 1;color:rgb(30 64 175 / var(--tw-text-opacity, 1))}.text-emerald-400{--tw-text-opacity: 1;color:rgb(52 211 153 / var(--tw-text-opacity, 1))}.text-green-400{--tw-text-opacity: 1;color:rgb(74 222 128 / var(--tw-text-opacity, 1))}.text-green-500{--tw-text-opacity: 1;color:rgb(34 197 94 / var(--tw-text-opacity, 1))}.text-green-800{--tw-text-opacity: 1;color:rgb(22 101 52 / var(--tw-text-opacity, 1))}.text-orange-800{--tw-text-opacity: 1;color:rgb(154 52 18 / var(--tw-text-opacity, 1))}.text-purple-400{--tw-text-opacity: 1;color:rgb(192 132 252 / var(--tw-text-opacity, 1))}.text-purple-800{--tw-text-opacity: 1;color:rgb(107 33 168 / var(--tw-text-opacity, 1))}.text-red-400{--tw-text-opacity: 1;color:rgb(248 113 113 / var(--tw-text-opacity, 1))}.text-red-800{--tw-text-opacity: 1;color:rgb(153 27 27 / var(--tw-text-opacity, 1))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.accent-\[var\(--accent\)\]{accent-color:var(--accent)}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px 
-4px rgb(0 0 0 / .1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-300{transition-duration:.3s}:root{--bg-primary: #0a0a0a;--bg-secondary: #141414;--bg-tertiary: #1a1a1a;--text-primary: #f5f5f5;--text-secondary: #a3a3a3;--accent: #22c55e;--accent-dim: #166534;--border: #262626;--error: #ef4444}[data-theme=light]{--bg-primary: #ffffff;--bg-secondary: #f7f8f9;--bg-tertiary: #eef0f2;--text-primary: #1a1a1a;--text-secondary: #4a4a4a;--accent: #16a34a;--accent-dim: #dcfce7;--border: #d1d5db;--error: #dc2626}*{box-sizing:border-box}body{margin:0;background-color:var(--bg-primary);color:var(--text-primary);font-family:JetBrains Mono,ui-monospace,monospace;font-size:14px;line-height:1.6}::-webkit-scrollbar{width:8px;height:8px}::-webkit-scrollbar-track{background:var(--bg-secondary)}::-webkit-scrollbar-thumb{background:var(--border);border-radius:4px}::-webkit-scrollbar-thumb:hover{background:#404040}[data-theme=light] ::-webkit-scrollbar-thumb:hover{background:silver}.last\:border-0:last-child{border-width:0px}.hover\:border-\[var\(--accent-dim\)\]:hover{border-color:var(--accent-dim)}.hover\:bg-\[\#16a34a\]:hover{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 
1))}.hover\:bg-\[var\(--bg-primary\)\]:hover{background-color:var(--bg-primary)}.hover\:bg-\[var\(--bg-tertiary\)\]:hover{background-color:var(--bg-tertiary)}.hover\:bg-\[var\(--border\)\]:hover{background-color:var(--border)}.hover\:bg-red-600:hover{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.hover\:text-\[var\(--accent\)\]:hover{color:var(--accent)}.hover\:text-\[var\(--text-primary\)\]:hover{color:var(--text-primary)}.hover\:opacity-80:hover{opacity:.8}.focus\:border-\[var\(--accent\)\]:focus{border-color:var(--accent)}.focus\:outline-none:focus{outline:2px solid transparent;outline-offset:2px}.focus\:ring-2:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-\[var\(--accent\)\]:focus{--tw-ring-color: var(--accent)}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}@media (min-width: 768px){.md\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}}@media (min-width: 1024px){.lg\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (min-width: 1280px){.xl\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (prefers-color-scheme: dark){.dark\:bg-blue-900{--tw-bg-opacity: 1;background-color:rgb(30 58 138 / var(--tw-bg-opacity, 1))}.dark\:bg-green-900{--tw-bg-opacity: 1;background-color:rgb(20 83 45 / var(--tw-bg-opacity, 1))}.dark\:bg-orange-900{--tw-bg-opacity: 1;background-color:rgb(124 45 18 / var(--tw-bg-opacity, 1))}.dark\:bg-purple-900{--tw-bg-opacity: 1;background-color:rgb(88 28 135 / var(--tw-bg-opacity, 1))}.dark\:bg-red-900{--tw-bg-opacity: 1;background-color:rgb(127 29 29 / var(--tw-bg-opacity, 1))}.dark\:text-blue-200{--tw-text-opacity: 1;color:rgb(191 219 254 / 
var(--tw-text-opacity, 1))}.dark\:text-green-200{--tw-text-opacity: 1;color:rgb(187 247 208 / var(--tw-text-opacity, 1))}.dark\:text-orange-200{--tw-text-opacity: 1;color:rgb(254 215 170 / var(--tw-text-opacity, 1))}.dark\:text-purple-200{--tw-text-opacity: 1;color:rgb(233 213 255 / var(--tw-text-opacity, 1))}.dark\:text-red-200{--tw-text-opacity: 1;color:rgb(254 202 202 / var(--tw-text-opacity, 1))}}
src/flow/ui/ui/assets/index-Bx-_JS_6.js ADDED
The diff for this file is too large to render. See raw diff
 
src/flow/ui/ui/assets/index-VFZIS3uv.js ADDED
The diff for this file is too large to render. See raw diff
 
src/flow/ui/ui/assets/index-_IRgS-wR.css ADDED
@@ -0,0 +1 @@
 
 
1
+ *,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: 
;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:JetBrains 
Mono,ui-monospace,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}.pointer-events-none{pointer-events:none}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.sticky{position:sticky}.inset-0{top:0;right:0;bottom:0;left:0}.bottom-0{bottom:0}.left-0{left:0}.left-3{left:.75rem}.top-0{top:0}.top-1\/2{top:50%}.z-10{z-index:10}.z-50{z-index:50}.mx-0\.5{margin-left:.125rem;m
argin-right:.125rem}.mx-4{margin-left:1rem;margin-right:1rem}.mx-auto{margin-left:auto;margin-right:auto}.-mt-1{margin-top:-.25rem}.mb-1{margin-bottom:.25rem}.mb-2{margin-bottom:.5rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.mb-8{margin-bottom:2rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.ml-6{margin-left:1.5rem}.mr-1{margin-right:.25rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.line-clamp-2{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:2}.line-clamp-3{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:3}.block{display:block}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-1\.5{height:.375rem}.h-12{height:3rem}.h-2{height:.5rem}.h-3{height:.75rem}.h-32{height:8rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-8{height:2rem}.h-full{height:100%}.max-h-32{max-height:8rem}.max-h-40{max-height:10rem}.max-h-48{max-height:12rem}.max-h-96{max-height:24rem}.max-h-\[80vh\]{max-height:80vh}.min-h-\[100px\]{min-height:100px}.min-h-screen{min-height:100vh}.w-12{width:3rem}.w-2{width:.5rem}.w-20{width:5rem}.w-24{width:6rem}.w-3{width:.75rem}.w-32{width:8rem}.w-4{width:1rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-8{width:2rem}.w-full{width:100%}.min-w-0{min-width:0px}.min-w-\[90px\]{min-width:90px}.max-w-7xl{max-width:80rem}.max-w-lg{max-width:32rem}.max-w-md{max-width:28rem}.flex-1{flex:1 1 0%}.flex-shrink-0{flex-shrink:0}.-translate-y-1\/2{--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.rotate-180{--tw-rotate: 
180deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear infinite}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-y{resize:vertical}.resize{resize:both}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-1{row-gap:.25rem}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1rem * 
var(--tw-space-y-reverse))}.space-y-6>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1.5rem * var(--tw-space-y-reverse))}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.break-all{word-break:break-all}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-l-2{border-left-width:2px}.border-t{border-top-width:1px}.border-dashed{border-style:dashed}.border-\[var\(--accent\)\]{border-color:var(--accent)}.border-\[var\(--border\)\]{border-color:var(--border)}.border-blue-500\/30{border-color:#3b82f64d}.border-green-500\/30{border-color:#22c55e4d}.border-red-500\/30{border-color:#ef44444d}.border-red-500\/50{border-color:#ef444480}.bg-\[var\(--accent\)\]{background-color:var(--accent)}.bg-\[var\(--bg-primary\)\]{background-color:var(--bg-primary)}.bg-\[var\(--bg-secondary\)\]{background-color:var(--bg-secondary)}.bg-\[var\(--bg-tertiary\)\]{background-color:var(--bg-tertiary)}.bg-\[var\(--error\)\]{background-color:var(--error)}.bg-black\/80{background-color:#000c}.bg-blue-100{--tw-bg-opacity: 1;background-color:rgb(219 234 254 / var(--tw-bg-opacity, 1))}.bg-blue-400{--tw-bg-opacity: 1;background-color:rgb(96 165 250 / var(--tw-bg-opacity, 1))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity, 1))}.bg-blue-500\/10{background-color:#3b82f61a}.bg-blue-600{--tw-bg-opacity: 1;background-color:rgb(37 99 235 / var(--tw-bg-opacity, 1))}.bg-emerald-500{--tw-bg-opacity: 1;background-color:rgb(16 185 129 / var(--tw-bg-opacity, 1))}.bg-green-100{--tw-bg-opacity: 1;background-color:rgb(220 252 231 / var(--tw-bg-opacity, 
1))}.bg-green-400{--tw-bg-opacity: 1;background-color:rgb(74 222 128 / var(--tw-bg-opacity, 1))}.bg-green-500{--tw-bg-opacity: 1;background-color:rgb(34 197 94 / var(--tw-bg-opacity, 1))}.bg-green-500\/10{background-color:#22c55e1a}.bg-green-500\/20{background-color:#22c55e33}.bg-green-600{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 1))}.bg-orange-100{--tw-bg-opacity: 1;background-color:rgb(255 237 213 / var(--tw-bg-opacity, 1))}.bg-purple-100{--tw-bg-opacity: 1;background-color:rgb(243 232 255 / var(--tw-bg-opacity, 1))}.bg-red-100{--tw-bg-opacity: 1;background-color:rgb(254 226 226 / var(--tw-bg-opacity, 1))}.bg-red-500{--tw-bg-opacity: 1;background-color:rgb(239 68 68 / var(--tw-bg-opacity, 1))}.bg-red-500\/10{background-color:#ef44441a}.bg-red-600{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.bg-yellow-500{--tw-bg-opacity: 1;background-color:rgb(234 179 8 / var(--tw-bg-opacity, 1))}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-1\.5{padding-left:.375rem;padding-right:.375rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-1\.5{padding-top:.375rem;padding-bottom:.375rem}.py-12{padding-top:3rem;padding-bottom:3rem}.py-16{padding-top:4rem;padding-bottom:4rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-4{padding-top:1rem;padding-bottom:1rem}.py-8{padding-top:2rem;padding-bottom:2rem}.pb-1{padding-bottom:.25rem}.pb-2{padding-bottom:.5rem}.pl-10{padding-left:2.5rem}.pr-3{padding-right:.75rem}.pr-4{padding-right:1rem}.pt-2{padding-top:.5rem}.pt-3{padding-top:.75rem}.pt-4{padding-top:1rem}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right}.font-mono{font-fam
ily:JetBrains Mono,ui-monospace,monospace}.text-2xl{font-size:1.5rem;line-height:2rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.tracking-wide{letter-spacing:.025em}.tracking-wider{letter-spacing:.05em}.text-\[var\(--accent\)\]{color:var(--accent)}.text-\[var\(--error\)\]{color:var(--error)}.text-\[var\(--text-primary\)\]{color:var(--text-primary)}.text-\[var\(--text-secondary\)\]{color:var(--text-secondary)}.text-\[var\(--text-tertiary\)\]{color:var(--text-tertiary)}.text-black{--tw-text-opacity: 1;color:rgb(0 0 0 / var(--tw-text-opacity, 1))}.text-blue-400{--tw-text-opacity: 1;color:rgb(96 165 250 / var(--tw-text-opacity, 1))}.text-blue-800{--tw-text-opacity: 1;color:rgb(30 64 175 / var(--tw-text-opacity, 1))}.text-emerald-400{--tw-text-opacity: 1;color:rgb(52 211 153 / var(--tw-text-opacity, 1))}.text-green-400{--tw-text-opacity: 1;color:rgb(74 222 128 / var(--tw-text-opacity, 1))}.text-green-500{--tw-text-opacity: 1;color:rgb(34 197 94 / var(--tw-text-opacity, 1))}.text-green-800{--tw-text-opacity: 1;color:rgb(22 101 52 / var(--tw-text-opacity, 1))}.text-orange-800{--tw-text-opacity: 1;color:rgb(154 52 18 / var(--tw-text-opacity, 1))}.text-purple-400{--tw-text-opacity: 1;color:rgb(192 132 252 / var(--tw-text-opacity, 1))}.text-purple-800{--tw-text-opacity: 1;color:rgb(107 33 168 / var(--tw-text-opacity, 1))}.text-red-400{--tw-text-opacity: 1;color:rgb(248 113 113 / var(--tw-text-opacity, 1))}.text-red-800{--tw-text-opacity: 1;color:rgb(153 27 27 / var(--tw-text-opacity, 1))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.accent-\[var\(--accent\)\]{accent-color:var(--accent)}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px -4px rgb(0 0 0 / 
.1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-300{transition-duration:.3s}:root{--bg-primary: #0a0a0a;--bg-secondary: #141414;--bg-tertiary: #1a1a1a;--text-primary: #f5f5f5;--text-secondary: #a3a3a3;--accent: #22c55e;--accent-dim: #166534;--border: #262626;--error: #ef4444}[data-theme=light]{--bg-primary: #ffffff;--bg-secondary: #f7f8f9;--bg-tertiary: #eef0f2;--text-primary: #1a1a1a;--text-secondary: #4a4a4a;--accent: #16a34a;--accent-dim: #dcfce7;--border: #d1d5db;--error: #dc2626}*{box-sizing:border-box}body{margin:0;background-color:var(--bg-primary);color:var(--text-primary);font-family:JetBrains Mono,ui-monospace,monospace;font-size:14px;line-height:1.6}::-webkit-scrollbar{width:8px;height:8px}::-webkit-scrollbar-track{background:var(--bg-secondary)}::-webkit-scrollbar-thumb{background:var(--border);border-radius:4px}::-webkit-scrollbar-thumb:hover{background:#404040}[data-theme=light] ::-webkit-scrollbar-thumb:hover{background:silver}.last\:border-0:last-child{border-width:0px}.hover\:border-\[var\(--accent-dim\)\]:hover{border-color:var(--accent-dim)}.hover\:bg-\[\#16a34a\]:hover{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 
1))}.hover\:bg-\[var\(--bg-primary\)\]:hover{background-color:var(--bg-primary)}.hover\:bg-\[var\(--bg-tertiary\)\]:hover{background-color:var(--bg-tertiary)}.hover\:bg-\[var\(--border\)\]:hover{background-color:var(--border)}.hover\:bg-red-600:hover{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.hover\:text-\[var\(--accent\)\]:hover{color:var(--accent)}.hover\:text-\[var\(--text-primary\)\]:hover{color:var(--text-primary)}.hover\:opacity-80:hover{opacity:.8}.focus\:border-\[var\(--accent\)\]:focus{border-color:var(--accent)}.focus\:outline-none:focus{outline:2px solid transparent;outline-offset:2px}.focus\:ring-2:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-\[var\(--accent\)\]:focus{--tw-ring-color: var(--accent)}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}@media (min-width: 768px){.md\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}}@media (min-width: 1024px){.lg\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (min-width: 1280px){.xl\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (prefers-color-scheme: dark){.dark\:bg-blue-900{--tw-bg-opacity: 1;background-color:rgb(30 58 138 / var(--tw-bg-opacity, 1))}.dark\:bg-green-900{--tw-bg-opacity: 1;background-color:rgb(20 83 45 / var(--tw-bg-opacity, 1))}.dark\:bg-orange-900{--tw-bg-opacity: 1;background-color:rgb(124 45 18 / var(--tw-bg-opacity, 1))}.dark\:bg-purple-900{--tw-bg-opacity: 1;background-color:rgb(88 28 135 / var(--tw-bg-opacity, 1))}.dark\:bg-red-900{--tw-bg-opacity: 1;background-color:rgb(127 29 29 / var(--tw-bg-opacity, 1))}.dark\:text-blue-200{--tw-text-opacity: 1;color:rgb(191 219 254 / 
var(--tw-text-opacity, 1))}.dark\:text-green-200{--tw-text-opacity: 1;color:rgb(187 247 208 / var(--tw-text-opacity, 1))}.dark\:text-orange-200{--tw-text-opacity: 1;color:rgb(254 215 170 / var(--tw-text-opacity, 1))}.dark\:text-purple-200{--tw-text-opacity: 1;color:rgb(233 213 255 / var(--tw-text-opacity, 1))}.dark\:text-red-200{--tw-text-opacity: 1;color:rgb(254 202 202 / var(--tw-text-opacity, 1))}}
src/flow/ui/ui/index.html CHANGED
@@ -8,8 +8,8 @@
8
  <link rel="preconnect" href="https://fonts.googleapis.com">
9
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700&display=swap" rel="stylesheet">
11
- <script type="module" crossorigin src="/assets/index-BFk_2IKX.js"></script>
12
- <link rel="stylesheet" crossorigin href="/assets/index-DlCyCyh_.css">
13
  </head>
14
  <body>
15
  <div id="root"></div>
 
8
  <link rel="preconnect" href="https://fonts.googleapis.com">
9
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700&display=swap" rel="stylesheet">
11
+ <script type="module" crossorigin src="/assets/index-2zMAgGgo.js"></script>
12
+ <link rel="stylesheet" crossorigin href="/assets/index-BHAF8mLj.css">
13
  </head>
14
  <body>
15
  <div id="root"></div>