Spaces:
Sleeping
Sleeping
Commit ·
cbd95af
1
Parent(s): c1ec9a0
Deploy 2026-01-28 10:56:31
Browse files- .env.example +29 -0
- README.md +10 -6
- src/flow/cli/app.py +6 -6
- src/flow/cli/optimize.py +99 -135
- src/flow/experiments/__init__.py +52 -96
- src/flow/experiments/ablation.py +76 -248
- src/flow/experiments/config_export.py +0 -184
- src/flow/experiments/models.py +517 -0
- src/flow/experiments/optimizer.py +81 -147
- src/flow/experiments/types.py +2 -2
- src/flow/harness/maf/agent.py +36 -26
- src/flow/harness/maf/tools/__init__.py +157 -0
- src/flow/{tools → harness/maf/tools}/coding.py +0 -0
- src/flow/{tools → harness/maf/tools}/core.py +0 -0
- src/flow/{tools → harness/maf/tools}/execution.py +0 -0
- src/flow/{tools → harness/maf/tools}/memory.py +0 -0
- src/flow/{tools → harness/maf/tools}/sub_agent.py +14 -6
- src/flow/prompts.py +234 -97
- src/flow/tools/__init__.py +0 -172
- src/flow/ui/api/configs.py +81 -111
- src/flow/ui/api/jobs.py +6 -6
- src/flow/ui/api/runs.py +13 -13
- src/flow/ui/database.py +1 -57
- src/flow/ui/models/config.py +4 -4
- src/flow/ui/models/job.py +1 -1
- src/flow/ui/models/run.py +1 -1
- src/flow/ui/models/task.py +1 -1
- src/flow/ui/schemas/__init__.py +4 -4
- src/flow/ui/schemas/config.py +33 -29
- src/flow/ui/schemas/job.py +3 -3
- src/flow/ui/schemas/run.py +2 -2
- src/flow/ui/services/optimizer_service.py +38 -48
- src/flow/ui/tests/test_e2e_user_journey.py +6 -6
- src/flow/ui/ui/assets/index-2zMAgGgo.js +0 -0
- src/flow/ui/ui/assets/index-BG9n9RHB.js +0 -0
- src/flow/ui/ui/assets/index-BHAF8mLj.css +1 -0
- src/flow/ui/ui/assets/index-Bx-_JS_6.js +0 -0
- src/flow/ui/ui/assets/index-VFZIS3uv.js +0 -0
- src/flow/ui/ui/assets/index-_IRgS-wR.css +1 -0
- src/flow/ui/ui/index.html +2 -2
.env.example
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Flow UI Deployment Environment
|
| 2 |
+
# Copy this to deploy/.env and fill in values
|
| 3 |
+
# This file is gitignored - secrets stay local
|
| 4 |
+
|
| 5 |
+
# --- Azure OpenAI ---
|
| 6 |
+
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
|
| 7 |
+
AZURE_OPENAI_API_KEY=your-key
|
| 8 |
+
AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
| 9 |
+
|
| 10 |
+
# --- Authentication ---
|
| 11 |
+
AUTH_ENABLED=true
|
| 12 |
+
AUTH_MODE=github
|
| 13 |
+
AUTH_SECRET=change-me-to-a-random-string
|
| 14 |
+
|
| 15 |
+
# For GitHub OAuth (create app at https://github.com/settings/developers):
|
| 16 |
+
# Homepage URL: https://victordibia-flow.hf.space
|
| 17 |
+
# Callback URL: https://victordibia-flow.hf.space/api/auth/github/callback
|
| 18 |
+
AUTH_GITHUB_CLIENT_ID=your-client-id
|
| 19 |
+
AUTH_GITHUB_CLIENT_SECRET=your-client-secret
|
| 20 |
+
AUTH_GITHUB_ALLOWED_USERS=victordibia,teammate1,teammate2
|
| 21 |
+
|
| 22 |
+
# For basic auth (simpler, no GitHub app needed):
|
| 23 |
+
# AUTH_MODE=basic
|
| 24 |
+
# AUTH_BASIC_USERNAME=admin
|
| 25 |
+
# AUTH_BASIC_PASSWORD=your-password
|
| 26 |
+
|
| 27 |
+
# --- Optional ---
|
| 28 |
+
# AUTH_SESSION_HOURS=24
|
| 29 |
+
# UVICORN_WORKERS=2
|
README.md
CHANGED
|
@@ -83,13 +83,17 @@ Flow tests different **context engineering strategies**:
|
|
| 83 |
Example configurations:
|
| 84 |
|
| 85 |
```python
|
| 86 |
-
from flow.experiments.
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
```
|
| 94 |
|
| 95 |
## Task Format
|
|
|
|
| 83 |
Example configurations:
|
| 84 |
|
| 85 |
```python
|
| 86 |
+
from flow.experiments.models import Agent, CompactionConfig, GridSearchStrategy
|
| 87 |
|
| 88 |
+
# Define a base agent
|
| 89 |
+
base = Agent(name="my_agent", enable_memory=True)
|
| 90 |
+
|
| 91 |
+
# Generate candidates via grid search
|
| 92 |
+
strategy = GridSearchStrategy(variations={
|
| 93 |
+
"enable_memory": [True, False],
|
| 94 |
+
"compaction": [CompactionConfig.head_tail(10, 40), CompactionConfig.none()],
|
| 95 |
+
})
|
| 96 |
+
candidates = strategy.generate(base, budget=10)
|
| 97 |
```
|
| 98 |
|
| 99 |
## Task Format
|
src/flow/cli/app.py
CHANGED
|
@@ -107,13 +107,13 @@ async def _run_single_task(
|
|
| 107 |
from flow.harness.maf import MAFHarness
|
| 108 |
|
| 109 |
if config_path:
|
| 110 |
-
# Load config from optimization result
|
| 111 |
-
from flow.experiments.
|
| 112 |
-
from flow.experiments.ablation import
|
| 113 |
|
| 114 |
-
|
| 115 |
-
console.print(f"[dim]Using config: {
|
| 116 |
-
harness =
|
| 117 |
else:
|
| 118 |
harness = MAFHarness(workspace=workspace, memory_path=memory_path)
|
| 119 |
|
|
|
|
| 107 |
from flow.harness.maf import MAFHarness
|
| 108 |
|
| 109 |
if config_path:
|
| 110 |
+
# Load agent config from optimization result
|
| 111 |
+
from flow.experiments.models import load_agent
|
| 112 |
+
from flow.experiments.ablation import create_harness_from_agent
|
| 113 |
|
| 114 |
+
agent_config = load_agent(config_path)
|
| 115 |
+
console.print(f"[dim]Using agent config: {agent_config.name}[/]")
|
| 116 |
+
harness = create_harness_from_agent(agent_config, workspace)
|
| 117 |
else:
|
| 118 |
harness = MAFHarness(workspace=workspace, memory_path=memory_path)
|
| 119 |
|
src/flow/cli/optimize.py
CHANGED
|
@@ -13,13 +13,9 @@ from typing import Annotated, Any
|
|
| 13 |
import typer
|
| 14 |
from rich.console import Console
|
| 15 |
|
| 16 |
-
from flow.experiments.
|
| 17 |
-
from flow.experiments.optimizer import
|
| 18 |
-
|
| 19 |
-
generate_grid_configs,
|
| 20 |
-
load_tasks_from_jsonl,
|
| 21 |
-
)
|
| 22 |
-
from flow.experiments.types import EvalCriterion, Task
|
| 23 |
|
| 24 |
console = Console()
|
| 25 |
|
|
@@ -36,21 +32,21 @@ def optimize(
|
|
| 36 |
Path | None,
|
| 37 |
typer.Option(
|
| 38 |
"--config", "-c",
|
| 39 |
-
help="Path to Python config file with
|
| 40 |
),
|
| 41 |
] = None,
|
| 42 |
agent: Annotated[
|
| 43 |
Path | None,
|
| 44 |
typer.Option(
|
| 45 |
"--agent", "-a",
|
| 46 |
-
help="Path to base agent
|
| 47 |
),
|
| 48 |
] = None,
|
| 49 |
suite: Annotated[
|
| 50 |
str | None,
|
| 51 |
typer.Option(
|
| 52 |
"--suite", "-s",
|
| 53 |
-
help="Built-in task suite:
|
| 54 |
),
|
| 55 |
] = None,
|
| 56 |
parallel: Annotated[
|
|
@@ -60,18 +56,11 @@ def optimize(
|
|
| 60 |
help="Max concurrent experiments",
|
| 61 |
),
|
| 62 |
] = 4,
|
| 63 |
-
mode: Annotated[
|
| 64 |
-
str,
|
| 65 |
-
typer.Option(
|
| 66 |
-
"--mode", "-m",
|
| 67 |
-
help="Config mode: named (use CONFIGS), grid (use VARIATIONS)",
|
| 68 |
-
),
|
| 69 |
-
] = "named",
|
| 70 |
vary: Annotated[
|
| 71 |
str | None,
|
| 72 |
typer.Option(
|
| 73 |
"--vary", "-v",
|
| 74 |
-
help="Comma-separated params to vary: compaction,memory,
|
| 75 |
),
|
| 76 |
] = None,
|
| 77 |
output: Annotated[
|
|
@@ -88,28 +77,35 @@ def optimize(
|
|
| 88 |
help="Disable LLM-as-Judge evaluation (faster, less accurate)",
|
| 89 |
),
|
| 90 |
] = False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
) -> None:
|
| 92 |
"""Find the best agent configuration through experimentation.
|
| 93 |
|
| 94 |
Runs experiments in parallel, evaluates with LLM-as-Judge,
|
| 95 |
-
ranks via Pareto analysis, and exports winning configs.
|
| 96 |
|
| 97 |
Examples:
|
| 98 |
|
| 99 |
-
# Run with task file and default
|
| 100 |
flow optimize --tasks tasks.jsonl
|
| 101 |
|
| 102 |
-
# Use custom
|
| 103 |
flow optimize --config my_configs.py --tasks tasks.jsonl
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
flow optimize --
|
| 107 |
|
| 108 |
# Use built-in task suite
|
| 109 |
flow optimize --suite coding --parallel 2
|
| 110 |
|
| 111 |
-
#
|
| 112 |
-
flow optimize --vary compaction,memory --tasks tasks.jsonl
|
| 113 |
"""
|
| 114 |
asyncio.run(_run_optimize(
|
| 115 |
tasks_path=tasks,
|
|
@@ -117,10 +113,10 @@ def optimize(
|
|
| 117 |
agent_path=agent,
|
| 118 |
suite=suite,
|
| 119 |
parallel=parallel,
|
| 120 |
-
mode=mode,
|
| 121 |
vary=vary,
|
| 122 |
output_dir=output,
|
| 123 |
use_llm_eval=not no_llm_eval,
|
|
|
|
| 124 |
))
|
| 125 |
|
| 126 |
|
|
@@ -130,10 +126,10 @@ async def _run_optimize(
|
|
| 130 |
agent_path: Path | None,
|
| 131 |
suite: str | None,
|
| 132 |
parallel: int,
|
| 133 |
-
mode: str,
|
| 134 |
vary: str | None,
|
| 135 |
output_dir: Path | None,
|
| 136 |
use_llm_eval: bool,
|
|
|
|
| 137 |
) -> None:
|
| 138 |
"""Run the optimization."""
|
| 139 |
# Load tasks
|
|
@@ -142,19 +138,23 @@ async def _run_optimize(
|
|
| 142 |
console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
|
| 143 |
raise typer.Exit(1)
|
| 144 |
|
| 145 |
-
# Load
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
| 149 |
raise typer.Exit(1)
|
| 150 |
|
|
|
|
| 151 |
console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
|
| 152 |
for t in tasks:
|
| 153 |
console.print(f" - {t.name}")
|
| 154 |
|
| 155 |
-
console.print(f"\n[bold]
|
| 156 |
-
for c in
|
| 157 |
-
console.print(f" - {c.name}")
|
| 158 |
|
| 159 |
# Run optimizer
|
| 160 |
optimizer = FlowOptimizer(
|
|
@@ -164,12 +164,12 @@ async def _run_optimize(
|
|
| 164 |
)
|
| 165 |
|
| 166 |
try:
|
| 167 |
-
result = await optimizer.optimize(
|
| 168 |
|
| 169 |
console.print("\n[bold green]Optimization complete![/]")
|
| 170 |
-
console.print(f"\nBest
|
| 171 |
-
console.print("\nTo use
|
| 172 |
-
console.print(f" [dim]flow run --config {result.output_dir / '
|
| 173 |
|
| 174 |
except KeyboardInterrupt:
|
| 175 |
console.print("\n[yellow]Optimization cancelled.[/]")
|
|
@@ -185,116 +185,73 @@ def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
|
|
| 185 |
return load_tasks_from_jsonl(tasks_path)
|
| 186 |
|
| 187 |
if suite:
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
def _get_builtin_suite(name: str) -> list[Task]:
|
| 195 |
-
"""Get a built-in task suite."""
|
| 196 |
-
suites = {
|
| 197 |
-
"quick": [
|
| 198 |
-
Task(
|
| 199 |
-
name="hello_world",
|
| 200 |
-
prompt="Create a Python script 'hello.py' that prints 'Hello, World!' and run it.",
|
| 201 |
-
criteria=[
|
| 202 |
-
EvalCriterion(name="file_created", instruction="hello.py should be created"),
|
| 203 |
-
EvalCriterion(name="correct_output", instruction="Output should include 'Hello, World!'"),
|
| 204 |
-
],
|
| 205 |
-
),
|
| 206 |
-
],
|
| 207 |
-
"coding": [
|
| 208 |
-
Task(
|
| 209 |
-
name="fizzbuzz",
|
| 210 |
-
prompt="Create fizzbuzz.py that prints 1-30 with Fizz/Buzz/FizzBuzz rules. Run it.",
|
| 211 |
-
criteria=[
|
| 212 |
-
EvalCriterion(name="file_created", instruction="fizzbuzz.py should be created"),
|
| 213 |
-
EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
|
| 214 |
-
],
|
| 215 |
-
metadata={"category": "short"},
|
| 216 |
-
),
|
| 217 |
-
Task(
|
| 218 |
-
name="rest_api",
|
| 219 |
-
prompt="Create a FastAPI app with a /health endpoint that returns JSON {'status': 'ok'}. Save as api.py.",
|
| 220 |
-
criteria=[
|
| 221 |
-
EvalCriterion(name="file_created", instruction="api.py should be created"),
|
| 222 |
-
EvalCriterion(name="fastapi_used", instruction="Should use FastAPI"),
|
| 223 |
-
EvalCriterion(name="endpoint_defined", instruction="Should have /health endpoint"),
|
| 224 |
-
],
|
| 225 |
-
metadata={"category": "medium"},
|
| 226 |
-
),
|
| 227 |
-
Task(
|
| 228 |
-
name="data_pipeline",
|
| 229 |
-
prompt="""Create a data processing pipeline:
|
| 230 |
-
1. data_types.py - DataRecord dataclass (id, name, value)
|
| 231 |
-
2. validators.py - validate_id, validate_name functions
|
| 232 |
-
3. pipeline.py - chain validators together
|
| 233 |
-
4. test_pipeline.py - tests for the pipeline
|
| 234 |
-
Run the tests.""",
|
| 235 |
-
criteria=[
|
| 236 |
-
EvalCriterion(name="modules_created", instruction="All 4 Python files created"),
|
| 237 |
-
EvalCriterion(name="tests_run", instruction="Tests should be executed"),
|
| 238 |
-
],
|
| 239 |
-
metadata={"category": "long"},
|
| 240 |
-
),
|
| 241 |
-
],
|
| 242 |
-
"research": [
|
| 243 |
-
Task(
|
| 244 |
-
name="codebase_analysis",
|
| 245 |
-
prompt="""Analyze this workspace:
|
| 246 |
-
1. Explore the directory structure
|
| 247 |
-
2. Identify Python files and their purposes
|
| 248 |
-
3. Create analysis_report.md with findings""",
|
| 249 |
-
criteria=[
|
| 250 |
-
EvalCriterion(name="exploration", instruction="Should explore directory"),
|
| 251 |
-
EvalCriterion(name="report_created", instruction="analysis_report.md created"),
|
| 252 |
-
],
|
| 253 |
-
metadata={"category": "research"},
|
| 254 |
-
),
|
| 255 |
-
],
|
| 256 |
-
}
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
| 260 |
raise typer.Exit(1)
|
| 261 |
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
|
| 265 |
-
def
|
| 266 |
config_path: Path | None,
|
| 267 |
-
mode: str,
|
| 268 |
vary: str | None,
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
|
|
|
| 272 |
if config_path:
|
| 273 |
if not config_path.exists():
|
| 274 |
console.print(f"[red]Error:[/] Config file not found: {config_path}")
|
| 275 |
raise typer.Exit(1)
|
| 276 |
|
| 277 |
-
|
| 278 |
|
| 279 |
-
if
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
|
|
|
| 283 |
else:
|
| 284 |
-
console.print("[red]Error:[/] Config file has no
|
| 285 |
raise typer.Exit(1)
|
| 286 |
|
| 287 |
-
# Generate from --vary flag
|
| 288 |
if vary:
|
| 289 |
variations = _parse_vary_flag(vary)
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
|
| 296 |
-
def _load_python_config(path: Path) -> tuple[list[
|
| 297 |
-
"""Load
|
| 298 |
spec = importlib.util.spec_from_file_location("config_module", path)
|
| 299 |
if spec is None or spec.loader is None:
|
| 300 |
raise ValueError(f"Cannot load {path}")
|
|
@@ -303,29 +260,36 @@ def _load_python_config(path: Path) -> tuple[list[AblationConfig], dict[str, Any
|
|
| 303 |
sys.modules["config_module"] = module
|
| 304 |
spec.loader.exec_module(module)
|
| 305 |
|
| 306 |
-
|
| 307 |
variations = getattr(module, "VARIATIONS", {})
|
| 308 |
|
| 309 |
-
return
|
| 310 |
|
| 311 |
|
| 312 |
def _parse_vary_flag(vary: str) -> dict[str, Any]:
|
| 313 |
"""Parse --vary flag into variations dict."""
|
| 314 |
-
variations = {}
|
| 315 |
|
| 316 |
for param in vary.split(","):
|
| 317 |
param = param.strip().lower()
|
| 318 |
|
| 319 |
if param in ("compaction", "compact"):
|
| 320 |
-
variations["
|
|
|
|
|
|
|
|
|
|
| 321 |
elif param in ("memory", "mem"):
|
| 322 |
-
variations["
|
| 323 |
elif param in ("subagent", "sub"):
|
| 324 |
variations["enable_sub_agent"] = [True, False]
|
| 325 |
elif param in ("head", "head_size"):
|
| 326 |
-
variations["
|
|
|
|
|
|
|
| 327 |
elif param in ("tail", "tail_size"):
|
| 328 |
-
variations["
|
|
|
|
|
|
|
| 329 |
else:
|
| 330 |
console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
|
| 331 |
|
|
|
|
| 13 |
import typer
|
| 14 |
from rich.console import Console
|
| 15 |
|
| 16 |
+
from flow.experiments.models import Agent, Candidate, CompactionConfig, GridSearchStrategy
|
| 17 |
+
from flow.experiments.optimizer import FlowOptimizer, load_tasks_from_jsonl
|
| 18 |
+
from flow.experiments.types import Task, get_task_suite
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
console = Console()
|
| 21 |
|
|
|
|
| 32 |
Path | None,
|
| 33 |
typer.Option(
|
| 34 |
"--config", "-c",
|
| 35 |
+
help="Path to Python config file with CANDIDATES or VARIATIONS",
|
| 36 |
),
|
| 37 |
] = None,
|
| 38 |
agent: Annotated[
|
| 39 |
Path | None,
|
| 40 |
typer.Option(
|
| 41 |
"--agent", "-a",
|
| 42 |
+
help="Path to base agent YAML file (for optimization)",
|
| 43 |
),
|
| 44 |
] = None,
|
| 45 |
suite: Annotated[
|
| 46 |
str | None,
|
| 47 |
typer.Option(
|
| 48 |
"--suite", "-s",
|
| 49 |
+
help="Built-in task suite: quick, core, coding",
|
| 50 |
),
|
| 51 |
] = None,
|
| 52 |
parallel: Annotated[
|
|
|
|
| 56 |
help="Max concurrent experiments",
|
| 57 |
),
|
| 58 |
] = 4,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
vary: Annotated[
|
| 60 |
str | None,
|
| 61 |
typer.Option(
|
| 62 |
"--vary", "-v",
|
| 63 |
+
help="Comma-separated params to vary: compaction,memory,subagent",
|
| 64 |
),
|
| 65 |
] = None,
|
| 66 |
output: Annotated[
|
|
|
|
| 77 |
help="Disable LLM-as-Judge evaluation (faster, less accurate)",
|
| 78 |
),
|
| 79 |
] = False,
|
| 80 |
+
budget: Annotated[
|
| 81 |
+
int,
|
| 82 |
+
typer.Option(
|
| 83 |
+
"--budget", "-b",
|
| 84 |
+
help="Maximum number of candidates to generate",
|
| 85 |
+
),
|
| 86 |
+
] = 100,
|
| 87 |
) -> None:
|
| 88 |
"""Find the best agent configuration through experimentation.
|
| 89 |
|
| 90 |
Runs experiments in parallel, evaluates with LLM-as-Judge,
|
| 91 |
+
ranks via Pareto analysis, and exports winning agent configs.
|
| 92 |
|
| 93 |
Examples:
|
| 94 |
|
| 95 |
+
# Run with task file and default candidates
|
| 96 |
flow optimize --tasks tasks.jsonl
|
| 97 |
|
| 98 |
+
# Use custom candidates from Python file
|
| 99 |
flow optimize --config my_configs.py --tasks tasks.jsonl
|
| 100 |
|
| 101 |
+
# Vary specific parameters
|
| 102 |
+
flow optimize --vary compaction,memory --tasks tasks.jsonl
|
| 103 |
|
| 104 |
# Use built-in task suite
|
| 105 |
flow optimize --suite coding --parallel 2
|
| 106 |
|
| 107 |
+
# Start from a base agent definition
|
| 108 |
+
flow optimize --agent base_agent.yaml --vary compaction,memory --tasks tasks.jsonl
|
| 109 |
"""
|
| 110 |
asyncio.run(_run_optimize(
|
| 111 |
tasks_path=tasks,
|
|
|
|
| 113 |
agent_path=agent,
|
| 114 |
suite=suite,
|
| 115 |
parallel=parallel,
|
|
|
|
| 116 |
vary=vary,
|
| 117 |
output_dir=output,
|
| 118 |
use_llm_eval=not no_llm_eval,
|
| 119 |
+
budget=budget,
|
| 120 |
))
|
| 121 |
|
| 122 |
|
|
|
|
| 126 |
agent_path: Path | None,
|
| 127 |
suite: str | None,
|
| 128 |
parallel: int,
|
|
|
|
| 129 |
vary: str | None,
|
| 130 |
output_dir: Path | None,
|
| 131 |
use_llm_eval: bool,
|
| 132 |
+
budget: int,
|
| 133 |
) -> None:
|
| 134 |
"""Run the optimization."""
|
| 135 |
# Load tasks
|
|
|
|
| 138 |
console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
|
| 139 |
raise typer.Exit(1)
|
| 140 |
|
| 141 |
+
# Load base agent
|
| 142 |
+
base = _load_base_agent(agent_path)
|
| 143 |
+
|
| 144 |
+
# Load/generate candidates
|
| 145 |
+
candidates = _load_candidates(config_path, vary, base, budget)
|
| 146 |
+
if not candidates:
|
| 147 |
+
console.print("[red]Error:[/] No candidates to test. Use --config or --vary")
|
| 148 |
raise typer.Exit(1)
|
| 149 |
|
| 150 |
+
console.print(f"\n[bold]Base Agent:[/] {base.name}")
|
| 151 |
console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
|
| 152 |
for t in tasks:
|
| 153 |
console.print(f" - {t.name}")
|
| 154 |
|
| 155 |
+
console.print(f"\n[bold]Candidates:[/] {len(candidates)}")
|
| 156 |
+
for c in candidates:
|
| 157 |
+
console.print(f" - {c.agent.name}")
|
| 158 |
|
| 159 |
# Run optimizer
|
| 160 |
optimizer = FlowOptimizer(
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
try:
|
| 167 |
+
result = await optimizer.optimize(candidates, tasks)
|
| 168 |
|
| 169 |
console.print("\n[bold green]Optimization complete![/]")
|
| 170 |
+
console.print(f"\nBest agents exported to: [cyan]{result.output_dir / 'agents'}[/]")
|
| 171 |
+
console.print("\nTo use an agent config:")
|
| 172 |
+
console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")
|
| 173 |
|
| 174 |
except KeyboardInterrupt:
|
| 175 |
console.print("\n[yellow]Optimization cancelled.[/]")
|
|
|
|
| 185 |
return load_tasks_from_jsonl(tasks_path)
|
| 186 |
|
| 187 |
if suite:
|
| 188 |
+
try:
|
| 189 |
+
return get_task_suite(suite)
|
| 190 |
+
except ValueError as e:
|
| 191 |
+
console.print(f"[red]Error:[/] {e}")
|
| 192 |
+
raise typer.Exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
# Default: quick suite
|
| 195 |
+
try:
|
| 196 |
+
return get_task_suite("quick")
|
| 197 |
+
except ValueError:
|
| 198 |
+
console.print("[red]Error:[/] No built-in suites available. Use --tasks to specify a JSONL file.")
|
| 199 |
raise typer.Exit(1)
|
| 200 |
|
| 201 |
+
|
| 202 |
+
def _load_base_agent(agent_path: Path | None) -> Agent:
|
| 203 |
+
"""Load base agent from YAML or use defaults."""
|
| 204 |
+
if agent_path:
|
| 205 |
+
if not agent_path.exists():
|
| 206 |
+
console.print(f"[red]Error:[/] Agent file not found: {agent_path}")
|
| 207 |
+
raise typer.Exit(1)
|
| 208 |
+
from flow.experiments.models import load_agent
|
| 209 |
+
return load_agent(agent_path)
|
| 210 |
+
|
| 211 |
+
return Agent(name="flow_agent")
|
| 212 |
|
| 213 |
|
| 214 |
+
def _load_candidates(
|
| 215 |
config_path: Path | None,
|
|
|
|
| 216 |
vary: str | None,
|
| 217 |
+
base: Agent,
|
| 218 |
+
budget: int,
|
| 219 |
+
) -> list[Candidate]:
|
| 220 |
+
"""Load candidates from file or generate from variations."""
|
| 221 |
if config_path:
|
| 222 |
if not config_path.exists():
|
| 223 |
console.print(f"[red]Error:[/] Config file not found: {config_path}")
|
| 224 |
raise typer.Exit(1)
|
| 225 |
|
| 226 |
+
candidates, variations = _load_python_config(config_path)
|
| 227 |
|
| 228 |
+
if variations:
|
| 229 |
+
strategy = GridSearchStrategy(variations)
|
| 230 |
+
return strategy.generate(base, budget)
|
| 231 |
+
elif candidates:
|
| 232 |
+
return candidates
|
| 233 |
else:
|
| 234 |
+
console.print("[red]Error:[/] Config file has no CANDIDATES or VARIATIONS")
|
| 235 |
raise typer.Exit(1)
|
| 236 |
|
|
|
|
| 237 |
if vary:
|
| 238 |
variations = _parse_vary_flag(vary)
|
| 239 |
+
strategy = GridSearchStrategy(variations)
|
| 240 |
+
return strategy.generate(base, budget)
|
| 241 |
+
|
| 242 |
+
# Default: explore context engineering dimensions
|
| 243 |
+
strategy = GridSearchStrategy(variations={
|
| 244 |
+
"enable_memory": [True, False],
|
| 245 |
+
"compaction": [
|
| 246 |
+
CompactionConfig.head_tail(10, 40),
|
| 247 |
+
CompactionConfig.none(),
|
| 248 |
+
],
|
| 249 |
+
})
|
| 250 |
+
return strategy.generate(base, budget)
|
| 251 |
|
| 252 |
|
| 253 |
+
def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any]]:
|
| 254 |
+
"""Load CANDIDATES and VARIATIONS from a Python file."""
|
| 255 |
spec = importlib.util.spec_from_file_location("config_module", path)
|
| 256 |
if spec is None or spec.loader is None:
|
| 257 |
raise ValueError(f"Cannot load {path}")
|
|
|
|
| 260 |
sys.modules["config_module"] = module
|
| 261 |
spec.loader.exec_module(module)
|
| 262 |
|
| 263 |
+
candidates = getattr(module, "CANDIDATES", [])
|
| 264 |
variations = getattr(module, "VARIATIONS", {})
|
| 265 |
|
| 266 |
+
return candidates, variations
|
| 267 |
|
| 268 |
|
| 269 |
def _parse_vary_flag(vary: str) -> dict[str, Any]:
|
| 270 |
"""Parse --vary flag into variations dict."""
|
| 271 |
+
variations: dict[str, Any] = {}
|
| 272 |
|
| 273 |
for param in vary.split(","):
|
| 274 |
param = param.strip().lower()
|
| 275 |
|
| 276 |
if param in ("compaction", "compact"):
|
| 277 |
+
variations["compaction"] = [
|
| 278 |
+
CompactionConfig.head_tail(10, 40),
|
| 279 |
+
CompactionConfig.none(),
|
| 280 |
+
]
|
| 281 |
elif param in ("memory", "mem"):
|
| 282 |
+
variations["enable_memory"] = [True, False]
|
| 283 |
elif param in ("subagent", "sub"):
|
| 284 |
variations["enable_sub_agent"] = [True, False]
|
| 285 |
elif param in ("head", "head_size"):
|
| 286 |
+
variations["compaction"] = [
|
| 287 |
+
CompactionConfig.head_tail(h, 40) for h in [5, 10, 20]
|
| 288 |
+
]
|
| 289 |
elif param in ("tail", "tail_size"):
|
| 290 |
+
variations["compaction"] = [
|
| 291 |
+
CompactionConfig.head_tail(10, t) for t in [20, 40, 60]
|
| 292 |
+
]
|
| 293 |
else:
|
| 294 |
console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
|
| 295 |
|
src/flow/experiments/__init__.py
CHANGED
|
@@ -3,96 +3,59 @@
|
|
| 3 |
"""Experiments framework for running and evaluating Flow agent tasks.
|
| 4 |
|
| 5 |
This package provides a structured way to:
|
| 6 |
-
- Define
|
|
|
|
| 7 |
- Run agents on tasks and collect OpenTelemetry traces
|
| 8 |
- Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
|
| 9 |
- Extract metrics from execution traces
|
| 10 |
-
- Run
|
| 11 |
|
| 12 |
Example usage:
|
| 13 |
-
from flow.harness.maf import MAFHarness
|
| 14 |
from flow.experiments import (
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
| 16 |
Task,
|
| 17 |
EvalCriterion,
|
| 18 |
-
TraceEvaluator,
|
| 19 |
-
HeuristicEvaluator,
|
| 20 |
-
extract_metrics,
|
| 21 |
-
format_metrics_summary,
|
| 22 |
-
setup_tracing,
|
| 23 |
)
|
| 24 |
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
EvalCriterion(
|
| 34 |
-
name="correctness",
|
| 35 |
-
instruction="The function should print exactly 'Hello, World!'",
|
| 36 |
-
),
|
| 37 |
-
],
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
-
# Run the experiment
|
| 41 |
-
harness = MAFHarness()
|
| 42 |
-
runner = FlowExperimentRunner(keep_workspace=True)
|
| 43 |
-
result = await runner.run(harness, task)
|
| 44 |
-
|
| 45 |
-
# Extract metrics
|
| 46 |
-
metrics = extract_metrics(result.trace)
|
| 47 |
-
print(format_metrics_summary(metrics))
|
| 48 |
-
|
| 49 |
-
# Evaluate the result
|
| 50 |
-
evaluator = HeuristicEvaluator()
|
| 51 |
-
eval_result = await evaluator.evaluate(result)
|
| 52 |
-
print(f"Score: {eval_result.score}, Passed: {eval_result.passed}")
|
| 53 |
-
|
| 54 |
-
await harness.close()
|
| 55 |
-
|
| 56 |
-
Ablation studies:
|
| 57 |
-
from flow.experiments import run_ablations, AblationConfig
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
results = await run_ablations(
|
| 65 |
-
configs,
|
| 66 |
-
task_prompt="Create a simple HTTP server",
|
| 67 |
-
)
|
| 68 |
"""
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
from .ablation import (
|
| 73 |
-
AGENT_MEMORY_ONLY,
|
| 74 |
-
ALL_CONTEXT_ENGINEERING,
|
| 75 |
-
COMPACTION_ONLY,
|
| 76 |
-
# Context engineering configs
|
| 77 |
-
CONTEXT_ENG_BASELINE,
|
| 78 |
-
CONTEXT_ENGINEERING_CONFIGS,
|
| 79 |
-
ISOLATION_ONLY,
|
| 80 |
-
AblationConfig,
|
| 81 |
-
AblationResult,
|
| 82 |
-
# Shared utilities
|
| 83 |
compute_pareto_frontier,
|
| 84 |
-
|
| 85 |
generate_recommendation,
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
run_single_ablation,
|
| 89 |
-
)
|
| 90 |
-
|
| 91 |
-
# Config export
|
| 92 |
-
from .config_export import (
|
| 93 |
-
export_config,
|
| 94 |
-
export_optimization_configs,
|
| 95 |
-
load_config,
|
| 96 |
)
|
| 97 |
|
| 98 |
# Evaluators
|
|
@@ -116,11 +79,10 @@ from .metrics import (
|
|
| 116 |
|
| 117 |
# Optimizer
|
| 118 |
from .optimizer import (
|
| 119 |
-
|
| 120 |
FlowOptimizer,
|
| 121 |
OptimizationResult,
|
| 122 |
TaskResult,
|
| 123 |
-
generate_grid_configs,
|
| 124 |
load_tasks_from_jsonl,
|
| 125 |
)
|
| 126 |
|
|
@@ -142,6 +104,16 @@ from .trace_collector import FlowTraceCollector
|
|
| 142 |
from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
|
| 143 |
|
| 144 |
__all__ = [ # noqa: RUF022 # Intentionally grouped by category
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
# Types
|
| 146 |
"Task",
|
| 147 |
"EvalCriterion",
|
|
@@ -173,32 +145,16 @@ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
|
|
| 173 |
"print_metrics_summary",
|
| 174 |
"print_comparison_table",
|
| 175 |
"print_eval_result",
|
| 176 |
-
#
|
| 177 |
-
"
|
| 178 |
-
"
|
| 179 |
-
"
|
| 180 |
-
"run_single_ablation",
|
| 181 |
-
"create_harness_from_config",
|
| 182 |
-
# Context engineering configs
|
| 183 |
-
"CONTEXT_ENG_BASELINE",
|
| 184 |
-
"COMPACTION_ONLY",
|
| 185 |
-
"AGENT_MEMORY_ONLY",
|
| 186 |
-
"ISOLATION_ONLY",
|
| 187 |
-
"ALL_CONTEXT_ENGINEERING",
|
| 188 |
-
"CONTEXT_ENGINEERING_CONFIGS",
|
| 189 |
-
"run_context_engineering_comparison",
|
| 190 |
-
# Shared utilities
|
| 191 |
"compute_pareto_frontier",
|
| 192 |
"generate_recommendation",
|
| 193 |
# Optimizer
|
| 194 |
"FlowOptimizer",
|
| 195 |
"OptimizationResult",
|
| 196 |
-
"
|
| 197 |
"TaskResult",
|
| 198 |
-
"generate_grid_configs",
|
| 199 |
"load_tasks_from_jsonl",
|
| 200 |
-
# Config export
|
| 201 |
-
"export_config",
|
| 202 |
-
"load_config",
|
| 203 |
-
"export_optimization_configs",
|
| 204 |
]
|
|
|
|
| 3 |
"""Experiments framework for running and evaluating Flow agent tasks.
|
| 4 |
|
| 5 |
This package provides a structured way to:
|
| 6 |
+
- Define agents with the Agent dataclass
|
| 7 |
+
- Generate candidate variants via CandidateStrategy implementations
|
| 8 |
- Run agents on tasks and collect OpenTelemetry traces
|
| 9 |
- Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
|
| 10 |
- Extract metrics from execution traces
|
| 11 |
+
- Run optimization studies comparing different candidates
|
| 12 |
|
| 13 |
Example usage:
|
|
|
|
| 14 |
from flow.experiments import (
|
| 15 |
+
Agent,
|
| 16 |
+
Candidate,
|
| 17 |
+
GridSearchStrategy,
|
| 18 |
+
FlowOptimizer,
|
| 19 |
Task,
|
| 20 |
EvalCriterion,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
)
|
| 22 |
|
| 23 |
+
# Define a base agent
|
| 24 |
+
base = Agent(name="my_agent", enable_memory=True)
|
| 25 |
|
| 26 |
+
# Generate candidates
|
| 27 |
+
strategy = GridSearchStrategy(variations={
|
| 28 |
+
"enable_memory": [True, False],
|
| 29 |
+
})
|
| 30 |
+
candidates = strategy.generate(base, budget=10)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
# Run optimization
|
| 33 |
+
optimizer = FlowOptimizer(parallel=4)
|
| 34 |
+
tasks = [Task(name="test", prompt="Create hello world")]
|
| 35 |
+
result = await optimizer.optimize(candidates, tasks)
|
| 36 |
+
print(f"Best: {result.rank_by_score[0]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
| 38 |
|
| 39 |
+
# Core models
|
| 40 |
+
from .models import (
|
| 41 |
+
Agent,
|
| 42 |
+
Candidate,
|
| 43 |
+
CandidateStrategy,
|
| 44 |
+
CompactionConfig,
|
| 45 |
+
ExperimentResult,
|
| 46 |
+
GridSearchStrategy,
|
| 47 |
+
export_agent,
|
| 48 |
+
export_optimization_results,
|
| 49 |
+
load_agent,
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
# Experiment runner + Pareto analysis
|
| 53 |
from .ablation import (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
compute_pareto_frontier,
|
| 55 |
+
create_harness_from_agent,
|
| 56 |
generate_recommendation,
|
| 57 |
+
run_experiments,
|
| 58 |
+
run_single_experiment,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
)
|
| 60 |
|
| 61 |
# Evaluators
|
|
|
|
| 79 |
|
| 80 |
# Optimizer
|
| 81 |
from .optimizer import (
|
| 82 |
+
CandidateSummary,
|
| 83 |
FlowOptimizer,
|
| 84 |
OptimizationResult,
|
| 85 |
TaskResult,
|
|
|
|
| 86 |
load_tasks_from_jsonl,
|
| 87 |
)
|
| 88 |
|
|
|
|
| 104 |
from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
|
| 105 |
|
| 106 |
__all__ = [ # noqa: RUF022 # Intentionally grouped by category
|
| 107 |
+
# Core models
|
| 108 |
+
"Agent",
|
| 109 |
+
"Candidate",
|
| 110 |
+
"CandidateStrategy",
|
| 111 |
+
"CompactionConfig",
|
| 112 |
+
"ExperimentResult",
|
| 113 |
+
"GridSearchStrategy",
|
| 114 |
+
"export_agent",
|
| 115 |
+
"load_agent",
|
| 116 |
+
"export_optimization_results",
|
| 117 |
# Types
|
| 118 |
"Task",
|
| 119 |
"EvalCriterion",
|
|
|
|
| 145 |
"print_metrics_summary",
|
| 146 |
"print_comparison_table",
|
| 147 |
"print_eval_result",
|
| 148 |
+
# Experiment runner
|
| 149 |
+
"create_harness_from_agent",
|
| 150 |
+
"run_experiments",
|
| 151 |
+
"run_single_experiment",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"compute_pareto_frontier",
|
| 153 |
"generate_recommendation",
|
| 154 |
# Optimizer
|
| 155 |
"FlowOptimizer",
|
| 156 |
"OptimizationResult",
|
| 157 |
+
"CandidateSummary",
|
| 158 |
"TaskResult",
|
|
|
|
| 159 |
"load_tasks_from_jsonl",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
]
|
src/flow/experiments/ablation.py
CHANGED
|
@@ -1,137 +1,91 @@
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
|
| 3 |
-
"""
|
| 4 |
|
| 5 |
This module provides:
|
| 6 |
-
-
|
| 7 |
- Pareto analysis utilities for multi-objective optimization
|
| 8 |
-
-
|
| 9 |
-
- Convenience functions for running ablation studies
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
import json
|
| 15 |
import logging
|
| 16 |
-
from dataclasses import asdict
|
| 17 |
from datetime import datetime
|
| 18 |
from pathlib import Path
|
| 19 |
-
from typing import TYPE_CHECKING
|
| 20 |
|
| 21 |
from .evaluators import HeuristicEvaluator
|
| 22 |
-
from .metrics import
|
|
|
|
| 23 |
from .reporters import print_comparison_table, save_run_result
|
| 24 |
from .runner import FlowExperimentRunner, setup_tracing
|
| 25 |
-
from .types import EvalCriterion,
|
| 26 |
|
| 27 |
if TYPE_CHECKING:
|
| 28 |
from flow.harness.maf import MAFHarness
|
| 29 |
|
| 30 |
-
from .optimizer import
|
| 31 |
|
| 32 |
logger = logging.getLogger(__name__)
|
| 33 |
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
"""Configuration for a single ablation run.
|
| 38 |
-
|
| 39 |
-
Each config represents a different agent configuration to test.
|
| 40 |
-
The name is used as an identifier in comparison results.
|
| 41 |
-
|
| 42 |
-
Attributes:
|
| 43 |
-
name: Unique identifier for this configuration
|
| 44 |
-
enable_message_compaction: Whether to enable message compaction
|
| 45 |
-
enable_memory_tool: Whether to enable agent-managed memory
|
| 46 |
-
enable_sub_agent: Whether to enable sub-agent for isolated research
|
| 47 |
-
compaction_head_size: Number of initial messages to keep
|
| 48 |
-
compaction_tail_size: Number of recent messages to keep
|
| 49 |
-
bash_timeout: Timeout for bash commands in seconds
|
| 50 |
-
"""
|
| 51 |
-
|
| 52 |
-
name: str
|
| 53 |
-
enable_message_compaction: bool = True
|
| 54 |
-
enable_memory_tool: bool = True
|
| 55 |
-
enable_sub_agent: bool = False
|
| 56 |
-
compaction_head_size: int = 10
|
| 57 |
-
compaction_tail_size: int = 40
|
| 58 |
-
bash_timeout: int = 120
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
@dataclass
|
| 62 |
-
class AblationResult:
|
| 63 |
-
"""Result of a single ablation run.
|
| 64 |
-
|
| 65 |
-
Contains all data from the run including raw results,
|
| 66 |
-
extracted metrics, and evaluation scores.
|
| 67 |
-
"""
|
| 68 |
-
|
| 69 |
-
config: AblationConfig
|
| 70 |
-
run_result: RunResult
|
| 71 |
-
metrics: TraceMetrics
|
| 72 |
-
eval_score: float
|
| 73 |
-
eval_passed: bool
|
| 74 |
-
eval_reasoning: str
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def create_harness_from_config(config: AblationConfig, workspace: Path) -> MAFHarness:
|
| 78 |
-
"""Create a MAFHarness from an ablation config.
|
| 79 |
|
| 80 |
Args:
|
| 81 |
-
|
| 82 |
workspace: Working directory
|
| 83 |
|
| 84 |
Returns:
|
| 85 |
A configured MAFHarness
|
| 86 |
"""
|
|
|
|
| 87 |
from flow.harness.maf import MAFHarness
|
| 88 |
|
|
|
|
|
|
|
|
|
|
| 89 |
return MAFHarness(
|
| 90 |
workspace=workspace,
|
| 91 |
memory_path=workspace / "memory",
|
| 92 |
-
enable_compaction=
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
bash_timeout=config.bash_timeout,
|
| 98 |
)
|
| 99 |
|
| 100 |
|
| 101 |
-
async def
|
| 102 |
-
|
| 103 |
task: Task,
|
| 104 |
workspace: Path,
|
| 105 |
-
) ->
|
| 106 |
-
"""Run a single
|
| 107 |
|
| 108 |
Args:
|
| 109 |
-
|
| 110 |
task: The task to run
|
| 111 |
workspace: Working directory
|
| 112 |
|
| 113 |
Returns:
|
| 114 |
-
|
| 115 |
"""
|
| 116 |
-
|
| 117 |
-
harness = create_harness_from_config(config, workspace)
|
| 118 |
|
| 119 |
try:
|
| 120 |
-
# Create runner
|
| 121 |
runner = FlowExperimentRunner(keep_workspace=True)
|
| 122 |
-
|
| 123 |
-
# Run the experiment
|
| 124 |
run_result = await runner.run(harness, task, workspace=workspace)
|
| 125 |
-
|
| 126 |
-
# Extract metrics
|
| 127 |
metrics = extract_metrics(run_result.trace)
|
| 128 |
|
| 129 |
-
# Evaluate the result
|
| 130 |
evaluator = HeuristicEvaluator()
|
| 131 |
eval_result = await evaluator.evaluate(run_result)
|
| 132 |
|
| 133 |
-
return
|
| 134 |
-
|
| 135 |
run_result=run_result,
|
| 136 |
metrics=metrics,
|
| 137 |
eval_score=eval_result.score,
|
|
@@ -142,26 +96,20 @@ async def run_single_ablation(
|
|
| 142 |
await harness.close()
|
| 143 |
|
| 144 |
|
| 145 |
-
def
|
| 146 |
-
"""Save
|
| 147 |
-
|
| 148 |
-
Creates a subdirectory for the config with all result files.
|
| 149 |
-
|
| 150 |
-
Args:
|
| 151 |
-
result: The ablation result to save
|
| 152 |
-
output_dir: Base directory for output
|
| 153 |
-
"""
|
| 154 |
-
config_dir = output_dir / result.config.name
|
| 155 |
save_run_result(
|
| 156 |
result.run_result,
|
| 157 |
config_dir,
|
| 158 |
metrics=result.metrics,
|
| 159 |
)
|
| 160 |
|
| 161 |
-
|
| 162 |
-
with open(config_dir / "ablation.json", "w") as f:
|
| 163 |
json.dump({
|
| 164 |
-
"
|
|
|
|
|
|
|
| 165 |
"evaluation": {
|
| 166 |
"score": result.eval_score,
|
| 167 |
"passed": result.eval_passed,
|
|
@@ -170,37 +118,29 @@ def save_ablation_result(result: AblationResult, output_dir: Path) -> None:
|
|
| 170 |
}, f, indent=2)
|
| 171 |
|
| 172 |
|
| 173 |
-
async def
|
| 174 |
-
|
| 175 |
task_prompt: str,
|
| 176 |
output_dir: Path | None = None,
|
| 177 |
-
task_name: str = "
|
| 178 |
-
) -> list[
|
| 179 |
-
"""Run multiple
|
| 180 |
-
|
| 181 |
-
This function:
|
| 182 |
-
1. Sets up tracing
|
| 183 |
-
2. Runs each configuration on the same task
|
| 184 |
-
3. Collects metrics and evaluation scores
|
| 185 |
-
4. Saves results and prints comparison
|
| 186 |
|
| 187 |
Args:
|
| 188 |
-
|
| 189 |
task_prompt: The task prompt to run
|
| 190 |
-
output_dir: Base directory for output (default: ~/.flow/
|
| 191 |
-
task_name: Name for the task
|
| 192 |
|
| 193 |
Returns:
|
| 194 |
-
List of
|
| 195 |
"""
|
| 196 |
-
# Setup output directory
|
| 197 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 198 |
if output_dir is None:
|
| 199 |
-
output_dir = Path.home() / ".flow" / "
|
| 200 |
output_dir = output_dir / timestamp
|
| 201 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 202 |
|
| 203 |
-
# Create task
|
| 204 |
task = Task(
|
| 205 |
name=task_name,
|
| 206 |
prompt=task_prompt,
|
|
@@ -212,52 +152,47 @@ async def run_ablations(
|
|
| 212 |
],
|
| 213 |
)
|
| 214 |
|
| 215 |
-
# Save configs
|
| 216 |
with open(output_dir / "config.json", "w") as f: # noqa: ASYNC230
|
| 217 |
json.dump({
|
| 218 |
"task": task_prompt,
|
| 219 |
"timestamp": timestamp,
|
| 220 |
-
"
|
| 221 |
}, f, indent=2)
|
| 222 |
|
| 223 |
print("=" * 80)
|
| 224 |
-
print(" FLOW
|
| 225 |
print("=" * 80)
|
| 226 |
-
print(f" Task:
|
| 227 |
-
print(f"
|
| 228 |
-
print(f" Output:
|
| 229 |
print("=" * 80)
|
| 230 |
|
| 231 |
-
|
| 232 |
-
setup_tracing("flow-ablation")
|
| 233 |
|
| 234 |
-
results = []
|
| 235 |
-
for i,
|
| 236 |
-
print(f"\n[{i}/{len(
|
| 237 |
print("-" * 40)
|
| 238 |
|
| 239 |
-
|
| 240 |
-
workspace = output_dir / config.name / "workspace"
|
| 241 |
workspace.mkdir(parents=True, exist_ok=True)
|
| 242 |
|
| 243 |
-
result = await
|
| 244 |
-
|
| 245 |
task=task,
|
| 246 |
workspace=workspace,
|
| 247 |
)
|
| 248 |
|
| 249 |
results.append(result)
|
| 250 |
-
|
| 251 |
|
| 252 |
-
# Quick status
|
| 253 |
status = "OK" if result.run_result.success else "FAIL"
|
| 254 |
print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
|
| 255 |
f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")
|
| 256 |
|
| 257 |
-
# Save comparison
|
| 258 |
comparison_data = [
|
| 259 |
{
|
| 260 |
-
"name": r.
|
| 261 |
"success": r.run_result.success,
|
| 262 |
"duration_seconds": r.run_result.duration_seconds,
|
| 263 |
"metrics": metrics_to_dict(r.metrics),
|
|
@@ -272,152 +207,48 @@ async def run_ablations(
|
|
| 272 |
with open(output_dir / "comparison.json", "w") as f: # noqa: ASYNC230
|
| 273 |
json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)
|
| 274 |
|
| 275 |
-
|
| 276 |
-
print_comparison_table(comparison_data, "Ablation Comparison")
|
| 277 |
-
|
| 278 |
print(f"\nResults saved to: {output_dir}")
|
| 279 |
|
| 280 |
return results
|
| 281 |
|
| 282 |
|
| 283 |
# =============================================================================
|
| 284 |
-
#
|
| 285 |
-
# =============================================================================
|
| 286 |
-
# These configurations demonstrate the three main context engineering strategies:
|
| 287 |
-
# 1. Compaction - Reactive trimming via message stores
|
| 288 |
-
# 2. Agent-Managed Memory - Agent controls when to write/read/delete
|
| 289 |
-
# 3. Isolation - Sub-agent architecture prevents context pollution
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
# Baseline: No context engineering (for comparison)
|
| 293 |
-
CONTEXT_ENG_BASELINE = AblationConfig(
|
| 294 |
-
name="no_context_engineering",
|
| 295 |
-
enable_message_compaction=False,
|
| 296 |
-
enable_memory_tool=False,
|
| 297 |
-
enable_sub_agent=False,
|
| 298 |
-
)
|
| 299 |
-
|
| 300 |
-
# Strategy 1: Compaction via Message Stores
|
| 301 |
-
# Uses HeadTailCompactingMessageStore to keep first N + last M messages
|
| 302 |
-
# Good for: Long-running sessions where middle context is less important
|
| 303 |
-
COMPACTION_ONLY = AblationConfig(
|
| 304 |
-
name="compaction_only",
|
| 305 |
-
enable_message_compaction=True,
|
| 306 |
-
enable_memory_tool=False,
|
| 307 |
-
enable_sub_agent=False,
|
| 308 |
-
compaction_head_size=10, # Keep task context
|
| 309 |
-
compaction_tail_size=40, # Keep recent work
|
| 310 |
-
)
|
| 311 |
-
|
| 312 |
-
# Strategy 2: Agent-Managed Memory
|
| 313 |
-
# Agent decides when to save/retrieve information from persistent storage
|
| 314 |
-
# Good for: Cross-session memory, learning patterns, storing decisions
|
| 315 |
-
AGENT_MEMORY_ONLY = AblationConfig(
|
| 316 |
-
name="agent_memory_only",
|
| 317 |
-
enable_message_compaction=False,
|
| 318 |
-
enable_memory_tool=True,
|
| 319 |
-
enable_sub_agent=False,
|
| 320 |
-
)
|
| 321 |
-
|
| 322 |
-
# Strategy 3: Isolation via Sub-Agent
|
| 323 |
-
# Delegate heavy research to sub-agent with isolated context
|
| 324 |
-
# Good for: Complex research tasks that would pollute main context
|
| 325 |
-
ISOLATION_ONLY = AblationConfig(
|
| 326 |
-
name="isolation_only",
|
| 327 |
-
enable_message_compaction=False,
|
| 328 |
-
enable_memory_tool=False,
|
| 329 |
-
enable_sub_agent=True,
|
| 330 |
-
)
|
| 331 |
-
|
| 332 |
-
# Combined: All context engineering strategies
|
| 333 |
-
# Uses compaction + memory + isolation together
|
| 334 |
-
# Good for: Production systems with long-running, complex tasks
|
| 335 |
-
ALL_CONTEXT_ENGINEERING = AblationConfig(
|
| 336 |
-
name="all_context_engineering",
|
| 337 |
-
enable_message_compaction=True,
|
| 338 |
-
enable_memory_tool=True,
|
| 339 |
-
enable_sub_agent=True,
|
| 340 |
-
compaction_head_size=10,
|
| 341 |
-
compaction_tail_size=40,
|
| 342 |
-
)
|
| 343 |
-
|
| 344 |
-
# Predefined list for running context engineering comparison
|
| 345 |
-
CONTEXT_ENGINEERING_CONFIGS = [
|
| 346 |
-
CONTEXT_ENG_BASELINE,
|
| 347 |
-
COMPACTION_ONLY,
|
| 348 |
-
AGENT_MEMORY_ONLY,
|
| 349 |
-
ISOLATION_ONLY,
|
| 350 |
-
ALL_CONTEXT_ENGINEERING,
|
| 351 |
-
]
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
async def run_context_engineering_comparison(
|
| 355 |
-
task_prompt: str,
|
| 356 |
-
output_dir: Path | None = None,
|
| 357 |
-
) -> list[AblationResult]:
|
| 358 |
-
"""Run a comparison of all context engineering strategies.
|
| 359 |
-
|
| 360 |
-
This is a convenience function that runs all context engineering
|
| 361 |
-
baseline configurations against a single task for comparison.
|
| 362 |
-
|
| 363 |
-
Args:
|
| 364 |
-
task_prompt: The task to run (should benefit from context management)
|
| 365 |
-
output_dir: Optional output directory for results
|
| 366 |
-
|
| 367 |
-
Returns:
|
| 368 |
-
List of AblationResult for each strategy
|
| 369 |
-
|
| 370 |
-
Example:
|
| 371 |
-
>>> results = await run_context_engineering_comparison(
|
| 372 |
-
... "Research the authentication patterns in this codebase and "
|
| 373 |
-
... "create a summary document with recommendations."
|
| 374 |
-
... )
|
| 375 |
-
"""
|
| 376 |
-
return await run_ablations(
|
| 377 |
-
configs=CONTEXT_ENGINEERING_CONFIGS,
|
| 378 |
-
task_prompt=task_prompt,
|
| 379 |
-
output_dir=output_dir,
|
| 380 |
-
task_name="context_engineering_comparison",
|
| 381 |
-
)
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
# =============================================================================
|
| 385 |
-
# Shared Utilities for Pareto Analysis
|
| 386 |
# =============================================================================
|
| 387 |
|
| 388 |
|
| 389 |
def compute_pareto_frontier(
|
| 390 |
-
summaries: list[
|
| 391 |
score_key: str = "avg_score",
|
| 392 |
cost_key: str = "avg_tokens",
|
| 393 |
) -> list[str]:
|
| 394 |
"""Compute Pareto frontier for multi-objective optimization.
|
| 395 |
|
| 396 |
-
Identifies configurations that are not dominated by any other
|
| 397 |
-
A config is dominated if another
|
| 398 |
|
| 399 |
Args:
|
| 400 |
-
summaries: List of
|
| 401 |
score_key: Attribute name for the score metric (higher is better)
|
| 402 |
cost_key: Attribute name for the cost metric (lower is better)
|
| 403 |
|
| 404 |
Returns:
|
| 405 |
List of names of Pareto-optimal configurations
|
| 406 |
"""
|
| 407 |
-
|
| 408 |
-
def get_val(s: object, key: str) -> float:
|
| 409 |
if isinstance(s, dict):
|
| 410 |
return float(s.get(key, 0))
|
| 411 |
return float(getattr(s, key, 0))
|
| 412 |
|
| 413 |
-
def get_name(s:
|
| 414 |
if isinstance(s, dict):
|
| 415 |
return str(s.get("name", ""))
|
| 416 |
return str(getattr(s, "name", ""))
|
| 417 |
|
| 418 |
sorted_summaries = sorted(summaries, key=lambda s: get_val(s, cost_key))
|
| 419 |
|
| 420 |
-
pareto_names = []
|
| 421 |
best_score = -1.0
|
| 422 |
|
| 423 |
for summary in sorted_summaries:
|
|
@@ -430,40 +261,37 @@ def compute_pareto_frontier(
|
|
| 430 |
|
| 431 |
|
| 432 |
def generate_recommendation(
|
| 433 |
-
summaries: list[
|
| 434 |
pareto_names: list[str],
|
| 435 |
min_score: float = 0.7,
|
| 436 |
) -> tuple[str | None, str]:
|
| 437 |
"""Generate a recommendation based on Pareto analysis.
|
| 438 |
|
| 439 |
Args:
|
| 440 |
-
summaries: List of
|
| 441 |
-
pareto_names: Names of Pareto-optimal
|
| 442 |
min_score: Minimum acceptable score threshold
|
| 443 |
|
| 444 |
Returns:
|
| 445 |
-
Tuple of (
|
| 446 |
"""
|
| 447 |
-
def get_val(s:
|
| 448 |
if isinstance(s, dict):
|
| 449 |
return float(s.get(key, 0))
|
| 450 |
return float(getattr(s, key, 0))
|
| 451 |
|
| 452 |
-
def get_name(s:
|
| 453 |
if isinstance(s, dict):
|
| 454 |
return str(s.get("name", ""))
|
| 455 |
return str(getattr(s, "name", ""))
|
| 456 |
|
| 457 |
-
# Filter to acceptable configs
|
| 458 |
acceptable = [s for s in summaries if get_val(s, "avg_score") >= min_score]
|
| 459 |
if not acceptable:
|
| 460 |
return None, "No configuration met the minimum score threshold."
|
| 461 |
|
| 462 |
-
# Prefer Pareto-optimal configs
|
| 463 |
pareto_acceptable = [s for s in acceptable if get_name(s) in pareto_names]
|
| 464 |
candidates = pareto_acceptable if pareto_acceptable else acceptable
|
| 465 |
|
| 466 |
-
# Pick the one with lowest tokens among candidates
|
| 467 |
best = min(candidates, key=lambda s: get_val(s, "avg_tokens"))
|
| 468 |
name = get_name(best)
|
| 469 |
tokens = get_val(best, "avg_tokens")
|
|
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
|
| 3 |
+
"""Experiment runner for comparing agent configurations.
|
| 4 |
|
| 5 |
This module provides:
|
| 6 |
+
- Functions for running experiments with Agent/Candidate models
|
| 7 |
- Pareto analysis utilities for multi-objective optimization
|
| 8 |
+
- Convenience functions for running optimization studies
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
from __future__ import annotations
|
| 12 |
|
| 13 |
import json
|
| 14 |
import logging
|
| 15 |
+
from dataclasses import asdict
|
| 16 |
from datetime import datetime
|
| 17 |
from pathlib import Path
|
| 18 |
+
from typing import TYPE_CHECKING, Any
|
| 19 |
|
| 20 |
from .evaluators import HeuristicEvaluator
|
| 21 |
+
from .metrics import extract_metrics, metrics_to_dict
|
| 22 |
+
from .models import Agent, Candidate, ExperimentResult
|
| 23 |
from .reporters import print_comparison_table, save_run_result
|
| 24 |
from .runner import FlowExperimentRunner, setup_tracing
|
| 25 |
+
from .types import EvalCriterion, Task
|
| 26 |
|
| 27 |
if TYPE_CHECKING:
|
| 28 |
from flow.harness.maf import MAFHarness
|
| 29 |
|
| 30 |
+
from .optimizer import CandidateSummary
|
| 31 |
|
| 32 |
logger = logging.getLogger(__name__)
|
| 33 |
|
| 34 |
|
| 35 |
+
def create_harness_from_agent(agent: Agent, workspace: Path) -> MAFHarness:
|
| 36 |
+
"""Create a MAFHarness from an Agent definition.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
Args:
|
| 39 |
+
agent: The agent definition
|
| 40 |
workspace: Working directory
|
| 41 |
|
| 42 |
Returns:
|
| 43 |
A configured MAFHarness
|
| 44 |
"""
|
| 45 |
+
from flow.experiments.models import resolve_tools
|
| 46 |
from flow.harness.maf import MAFHarness
|
| 47 |
|
| 48 |
+
# Resolve tools to dict form
|
| 49 |
+
tools_spec = resolve_tools(agent.tools)
|
| 50 |
+
|
| 51 |
return MAFHarness(
|
| 52 |
workspace=workspace,
|
| 53 |
memory_path=workspace / "memory",
|
| 54 |
+
enable_compaction=agent.compaction.enabled,
|
| 55 |
+
compaction_head_size=agent.compaction.head_size,
|
| 56 |
+
compaction_tail_size=agent.compaction.tail_size,
|
| 57 |
+
tools=tools_spec,
|
| 58 |
+
instructions=agent.instructions,
|
|
|
|
| 59 |
)
|
| 60 |
|
| 61 |
|
| 62 |
+
async def run_single_experiment(
|
| 63 |
+
candidate: Candidate,
|
| 64 |
task: Task,
|
| 65 |
workspace: Path,
|
| 66 |
+
) -> ExperimentResult:
|
| 67 |
+
"""Run a single experiment with trace capture and evaluation.
|
| 68 |
|
| 69 |
Args:
|
| 70 |
+
candidate: The candidate to test
|
| 71 |
task: The task to run
|
| 72 |
workspace: Working directory
|
| 73 |
|
| 74 |
Returns:
|
| 75 |
+
ExperimentResult with metrics and evaluation
|
| 76 |
"""
|
| 77 |
+
harness = create_harness_from_agent(candidate.agent, workspace)
|
|
|
|
| 78 |
|
| 79 |
try:
|
|
|
|
| 80 |
runner = FlowExperimentRunner(keep_workspace=True)
|
|
|
|
|
|
|
| 81 |
run_result = await runner.run(harness, task, workspace=workspace)
|
|
|
|
|
|
|
| 82 |
metrics = extract_metrics(run_result.trace)
|
| 83 |
|
|
|
|
| 84 |
evaluator = HeuristicEvaluator()
|
| 85 |
eval_result = await evaluator.evaluate(run_result)
|
| 86 |
|
| 87 |
+
return ExperimentResult(
|
| 88 |
+
candidate=candidate,
|
| 89 |
run_result=run_result,
|
| 90 |
metrics=metrics,
|
| 91 |
eval_score=eval_result.score,
|
|
|
|
| 96 |
await harness.close()
|
| 97 |
|
| 98 |
|
| 99 |
+
def save_experiment_result(result: ExperimentResult, output_dir: Path) -> None:
|
| 100 |
+
"""Save experiment result to files."""
|
| 101 |
+
config_dir = output_dir / result.candidate.agent.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
save_run_result(
|
| 103 |
result.run_result,
|
| 104 |
config_dir,
|
| 105 |
metrics=result.metrics,
|
| 106 |
)
|
| 107 |
|
| 108 |
+
with open(config_dir / "experiment.json", "w") as f:
|
|
|
|
| 109 |
json.dump({
|
| 110 |
+
"agent": asdict(result.candidate.agent),
|
| 111 |
+
"mutations": result.candidate.mutations,
|
| 112 |
+
"rationale": result.candidate.rationale,
|
| 113 |
"evaluation": {
|
| 114 |
"score": result.eval_score,
|
| 115 |
"passed": result.eval_passed,
|
|
|
|
| 118 |
}, f, indent=2)
|
| 119 |
|
| 120 |
|
| 121 |
+
async def run_experiments(
|
| 122 |
+
candidates: list[Candidate],
|
| 123 |
task_prompt: str,
|
| 124 |
output_dir: Path | None = None,
|
| 125 |
+
task_name: str = "experiment_task",
|
| 126 |
+
) -> list[ExperimentResult]:
|
| 127 |
+
"""Run multiple candidates and compare.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
Args:
|
| 130 |
+
candidates: List of candidates to test
|
| 131 |
task_prompt: The task prompt to run
|
| 132 |
+
output_dir: Base directory for output (default: ~/.flow/experiments)
|
| 133 |
+
task_name: Name for the task
|
| 134 |
|
| 135 |
Returns:
|
| 136 |
+
List of experiment results
|
| 137 |
"""
|
|
|
|
| 138 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 139 |
if output_dir is None:
|
| 140 |
+
output_dir = Path.home() / ".flow" / "experiments"
|
| 141 |
output_dir = output_dir / timestamp
|
| 142 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 143 |
|
|
|
|
| 144 |
task = Task(
|
| 145 |
name=task_name,
|
| 146 |
prompt=task_prompt,
|
|
|
|
| 152 |
],
|
| 153 |
)
|
| 154 |
|
|
|
|
| 155 |
with open(output_dir / "config.json", "w") as f: # noqa: ASYNC230
|
| 156 |
json.dump({
|
| 157 |
"task": task_prompt,
|
| 158 |
"timestamp": timestamp,
|
| 159 |
+
"candidates": [asdict(c) for c in candidates],
|
| 160 |
}, f, indent=2)
|
| 161 |
|
| 162 |
print("=" * 80)
|
| 163 |
+
print(" FLOW EXPERIMENT RUNNER")
|
| 164 |
print("=" * 80)
|
| 165 |
+
print(f" Task: {task_prompt[:60]}{'...' if len(task_prompt) > 60 else ''}")
|
| 166 |
+
print(f" Candidates: {len(candidates)}")
|
| 167 |
+
print(f" Output: {output_dir}")
|
| 168 |
print("=" * 80)
|
| 169 |
|
| 170 |
+
setup_tracing("flow-experiment")
|
|
|
|
| 171 |
|
| 172 |
+
results: list[ExperimentResult] = []
|
| 173 |
+
for i, candidate in enumerate(candidates, 1):
|
| 174 |
+
print(f"\n[{i}/{len(candidates)}] Running: {candidate.agent.name}")
|
| 175 |
print("-" * 40)
|
| 176 |
|
| 177 |
+
workspace = output_dir / candidate.agent.name / "workspace"
|
|
|
|
| 178 |
workspace.mkdir(parents=True, exist_ok=True)
|
| 179 |
|
| 180 |
+
result = await run_single_experiment(
|
| 181 |
+
candidate=candidate,
|
| 182 |
task=task,
|
| 183 |
workspace=workspace,
|
| 184 |
)
|
| 185 |
|
| 186 |
results.append(result)
|
| 187 |
+
save_experiment_result(result, output_dir)
|
| 188 |
|
|
|
|
| 189 |
status = "OK" if result.run_result.success else "FAIL"
|
| 190 |
print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
|
| 191 |
f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")
|
| 192 |
|
|
|
|
| 193 |
comparison_data = [
|
| 194 |
{
|
| 195 |
+
"name": r.candidate.agent.name,
|
| 196 |
"success": r.run_result.success,
|
| 197 |
"duration_seconds": r.run_result.duration_seconds,
|
| 198 |
"metrics": metrics_to_dict(r.metrics),
|
|
|
|
| 207 |
with open(output_dir / "comparison.json", "w") as f: # noqa: ASYNC230
|
| 208 |
json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)
|
| 209 |
|
| 210 |
+
print_comparison_table(comparison_data, "Experiment Comparison")
|
|
|
|
|
|
|
| 211 |
print(f"\nResults saved to: {output_dir}")
|
| 212 |
|
| 213 |
return results
|
| 214 |
|
| 215 |
|
| 216 |
# =============================================================================
|
| 217 |
+
# Pareto Analysis Utilities
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
# =============================================================================
|
| 219 |
|
| 220 |
|
| 221 |
def compute_pareto_frontier(
|
| 222 |
+
summaries: list[CandidateSummary],
|
| 223 |
score_key: str = "avg_score",
|
| 224 |
cost_key: str = "avg_tokens",
|
| 225 |
) -> list[str]:
|
| 226 |
"""Compute Pareto frontier for multi-objective optimization.
|
| 227 |
|
| 228 |
+
Identifies configurations that are not dominated by any other.
|
| 229 |
+
A config is dominated if another has better score AND lower tokens.
|
| 230 |
|
| 231 |
Args:
|
| 232 |
+
summaries: List of CandidateSummary objects (or dicts)
|
| 233 |
score_key: Attribute name for the score metric (higher is better)
|
| 234 |
cost_key: Attribute name for the cost metric (lower is better)
|
| 235 |
|
| 236 |
Returns:
|
| 237 |
List of names of Pareto-optimal configurations
|
| 238 |
"""
|
| 239 |
+
def get_val(s: CandidateSummary | dict[str, Any], key: str) -> float:
|
|
|
|
| 240 |
if isinstance(s, dict):
|
| 241 |
return float(s.get(key, 0))
|
| 242 |
return float(getattr(s, key, 0))
|
| 243 |
|
| 244 |
+
def get_name(s: CandidateSummary | dict[str, Any]) -> str:
|
| 245 |
if isinstance(s, dict):
|
| 246 |
return str(s.get("name", ""))
|
| 247 |
return str(getattr(s, "name", ""))
|
| 248 |
|
| 249 |
sorted_summaries = sorted(summaries, key=lambda s: get_val(s, cost_key))
|
| 250 |
|
| 251 |
+
pareto_names: list[str] = []
|
| 252 |
best_score = -1.0
|
| 253 |
|
| 254 |
for summary in sorted_summaries:
|
|
|
|
| 261 |
|
| 262 |
|
| 263 |
def generate_recommendation(
|
| 264 |
+
summaries: list[CandidateSummary],
|
| 265 |
pareto_names: list[str],
|
| 266 |
min_score: float = 0.7,
|
| 267 |
) -> tuple[str | None, str]:
|
| 268 |
"""Generate a recommendation based on Pareto analysis.
|
| 269 |
|
| 270 |
Args:
|
| 271 |
+
summaries: List of CandidateSummary objects
|
| 272 |
+
pareto_names: Names of Pareto-optimal candidates
|
| 273 |
min_score: Minimum acceptable score threshold
|
| 274 |
|
| 275 |
Returns:
|
| 276 |
+
Tuple of (recommended_name, recommendation_text)
|
| 277 |
"""
|
| 278 |
+
def get_val(s: CandidateSummary | dict[str, Any], key: str) -> float:
|
| 279 |
if isinstance(s, dict):
|
| 280 |
return float(s.get(key, 0))
|
| 281 |
return float(getattr(s, key, 0))
|
| 282 |
|
| 283 |
+
def get_name(s: CandidateSummary | dict[str, Any]) -> str:
|
| 284 |
if isinstance(s, dict):
|
| 285 |
return str(s.get("name", ""))
|
| 286 |
return str(getattr(s, "name", ""))
|
| 287 |
|
|
|
|
| 288 |
acceptable = [s for s in summaries if get_val(s, "avg_score") >= min_score]
|
| 289 |
if not acceptable:
|
| 290 |
return None, "No configuration met the minimum score threshold."
|
| 291 |
|
|
|
|
| 292 |
pareto_acceptable = [s for s in acceptable if get_name(s) in pareto_names]
|
| 293 |
candidates = pareto_acceptable if pareto_acceptable else acceptable
|
| 294 |
|
|
|
|
| 295 |
best = min(candidates, key=lambda s: get_val(s, "avg_tokens"))
|
| 296 |
name = get_name(best)
|
| 297 |
tokens = get_val(best, "avg_tokens")
|
src/flow/experiments/config_export.py
DELETED
|
@@ -1,184 +0,0 @@
|
|
| 1 |
-
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
-
|
| 3 |
-
"""Config export/import utilities for optimizer results.
|
| 4 |
-
|
| 5 |
-
Exports winning configurations as YAML files that can be loaded
|
| 6 |
-
and used directly with `flow run --config <path>`.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
from __future__ import annotations
|
| 10 |
-
|
| 11 |
-
from dataclasses import asdict
|
| 12 |
-
from pathlib import Path
|
| 13 |
-
from typing import Any
|
| 14 |
-
|
| 15 |
-
import yaml
|
| 16 |
-
|
| 17 |
-
from .ablation import AblationConfig
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def export_config(
|
| 21 |
-
config: AblationConfig,
|
| 22 |
-
metrics: dict[str, Any],
|
| 23 |
-
path: Path,
|
| 24 |
-
) -> None:
|
| 25 |
-
"""Export an AblationConfig as a reusable YAML file.
|
| 26 |
-
|
| 27 |
-
The exported YAML includes:
|
| 28 |
-
- All config parameters (directly loadable)
|
| 29 |
-
- Optimization metadata prefixed with _ (ignored when loading)
|
| 30 |
-
|
| 31 |
-
Args:
|
| 32 |
-
config: The AblationConfig to export
|
| 33 |
-
metrics: Optimization metrics (score, tokens, etc.)
|
| 34 |
-
path: Path to write the YAML file
|
| 35 |
-
|
| 36 |
-
Example output:
|
| 37 |
-
name: compaction_head10_tail40
|
| 38 |
-
enable_message_compaction: true
|
| 39 |
-
compaction_head_size: 10
|
| 40 |
-
...
|
| 41 |
-
_optimization:
|
| 42 |
-
timestamp: "2026-01-26T14:30:22"
|
| 43 |
-
avg_score: 0.89
|
| 44 |
-
avg_tokens: 12400
|
| 45 |
-
"""
|
| 46 |
-
data = asdict(config)
|
| 47 |
-
data["_optimization"] = metrics
|
| 48 |
-
path.parent.mkdir(parents=True, exist_ok=True)
|
| 49 |
-
path.write_text(yaml.dump(data, default_flow_style=False, sort_keys=False))
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
def load_config(path: Path) -> AblationConfig:
|
| 53 |
-
"""Load an AblationConfig from a YAML file.
|
| 54 |
-
|
| 55 |
-
Ignores any keys prefixed with _ (optimization metadata).
|
| 56 |
-
|
| 57 |
-
Args:
|
| 58 |
-
path: Path to the YAML config file
|
| 59 |
-
|
| 60 |
-
Returns:
|
| 61 |
-
AblationConfig instance
|
| 62 |
-
|
| 63 |
-
Raises:
|
| 64 |
-
FileNotFoundError: If the config file doesn't exist
|
| 65 |
-
ValueError: If the config is invalid
|
| 66 |
-
"""
|
| 67 |
-
if not path.exists():
|
| 68 |
-
raise FileNotFoundError(f"Config file not found: {path}")
|
| 69 |
-
|
| 70 |
-
data = yaml.safe_load(path.read_text())
|
| 71 |
-
|
| 72 |
-
# Filter out metadata keys (prefixed with _)
|
| 73 |
-
config_data = {k: v for k, v in data.items() if not k.startswith("_")}
|
| 74 |
-
|
| 75 |
-
try:
|
| 76 |
-
return AblationConfig(**config_data)
|
| 77 |
-
except TypeError as e:
|
| 78 |
-
raise ValueError(f"Invalid config file {path}: {e}") from e
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
def export_optimization_configs(
|
| 82 |
-
summaries: list[dict[str, Any]],
|
| 83 |
-
pareto_names: list[str],
|
| 84 |
-
output_dir: Path,
|
| 85 |
-
timestamp: str,
|
| 86 |
-
) -> dict[str, Path]:
|
| 87 |
-
"""Export all notable configs from an optimization run.
|
| 88 |
-
|
| 89 |
-
Exports:
|
| 90 |
-
- best_score.yaml: Highest quality config
|
| 91 |
-
- best_cost.yaml: Lowest token usage config
|
| 92 |
-
- best_efficiency.yaml: Best score/token ratio
|
| 93 |
-
- pareto/<name>.yaml: All Pareto-optimal configs
|
| 94 |
-
|
| 95 |
-
Args:
|
| 96 |
-
summaries: List of ConfigSummary dicts with metrics
|
| 97 |
-
pareto_names: Names of Pareto-optimal configs
|
| 98 |
-
output_dir: Directory to write configs
|
| 99 |
-
timestamp: Optimization timestamp for metadata
|
| 100 |
-
|
| 101 |
-
Returns:
|
| 102 |
-
Dict mapping config type to file path
|
| 103 |
-
"""
|
| 104 |
-
configs_dir = output_dir / "configs"
|
| 105 |
-
configs_dir.mkdir(parents=True, exist_ok=True)
|
| 106 |
-
|
| 107 |
-
exported: dict[str, Path] = {}
|
| 108 |
-
|
| 109 |
-
if not summaries:
|
| 110 |
-
return exported
|
| 111 |
-
|
| 112 |
-
# Find best by different criteria
|
| 113 |
-
best_score = max(summaries, key=lambda s: s.get("avg_score", 0))
|
| 114 |
-
best_cost = min(summaries, key=lambda s: s.get("avg_tokens", float("inf")))
|
| 115 |
-
best_efficiency = max(
|
| 116 |
-
summaries,
|
| 117 |
-
key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
# Export best configs
|
| 121 |
-
for label, summary in [
|
| 122 |
-
("best_score", best_score),
|
| 123 |
-
("best_cost", best_cost),
|
| 124 |
-
("best_efficiency", best_efficiency),
|
| 125 |
-
]:
|
| 126 |
-
config = _summary_to_config(summary)
|
| 127 |
-
metrics = _extract_metrics(summary, timestamp, label)
|
| 128 |
-
path = configs_dir / f"{label}.yaml"
|
| 129 |
-
export_config(config, metrics, path)
|
| 130 |
-
exported[label] = path
|
| 131 |
-
|
| 132 |
-
# Export Pareto-optimal configs
|
| 133 |
-
pareto_dir = configs_dir / "pareto"
|
| 134 |
-
pareto_dir.mkdir(exist_ok=True)
|
| 135 |
-
|
| 136 |
-
for summary in summaries:
|
| 137 |
-
name = summary.get("name", "unknown")
|
| 138 |
-
if name in pareto_names:
|
| 139 |
-
config = _summary_to_config(summary)
|
| 140 |
-
metrics = _extract_metrics(summary, timestamp, "pareto")
|
| 141 |
-
metrics["is_pareto_optimal"] = True
|
| 142 |
-
path = pareto_dir / f"{name}.yaml"
|
| 143 |
-
export_config(config, metrics, path)
|
| 144 |
-
exported[f"pareto/{name}"] = path
|
| 145 |
-
|
| 146 |
-
return exported
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
def _summary_to_config(summary: dict[str, Any]) -> AblationConfig:
|
| 150 |
-
"""Convert a summary dict back to an AblationConfig."""
|
| 151 |
-
# Extract config fields from summary
|
| 152 |
-
config_fields = {
|
| 153 |
-
"name": summary.get("name", "unknown"),
|
| 154 |
-
"enable_message_compaction": summary.get("enable_message_compaction", True),
|
| 155 |
-
"enable_memory_tool": summary.get("enable_memory_tool", True),
|
| 156 |
-
"enable_sub_agent": summary.get("enable_sub_agent", False),
|
| 157 |
-
"compaction_head_size": summary.get("compaction_head_size", 10),
|
| 158 |
-
"compaction_tail_size": summary.get("compaction_tail_size", 40),
|
| 159 |
-
"bash_timeout": summary.get("bash_timeout", 120),
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
# Also check nested config if present
|
| 163 |
-
if "config" in summary:
|
| 164 |
-
config_fields.update(summary["config"])
|
| 165 |
-
|
| 166 |
-
return AblationConfig(**config_fields)
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
def _extract_metrics(
|
| 170 |
-
summary: dict[str, Any],
|
| 171 |
-
timestamp: str,
|
| 172 |
-
selection_reason: str,
|
| 173 |
-
) -> dict[str, Any]:
|
| 174 |
-
"""Extract optimization metrics from a summary."""
|
| 175 |
-
return {
|
| 176 |
-
"timestamp": timestamp,
|
| 177 |
-
"selection_reason": selection_reason,
|
| 178 |
-
"avg_score": summary.get("avg_score", 0),
|
| 179 |
-
"avg_tokens": summary.get("avg_tokens", 0),
|
| 180 |
-
"avg_duration": summary.get("avg_duration", 0),
|
| 181 |
-
"pass_rate": summary.get("pass_rate", 0),
|
| 182 |
-
"pareto_rank": summary.get("pareto_rank"),
|
| 183 |
-
"is_pareto_optimal": summary.get("is_pareto_optimal", False),
|
| 184 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/flow/experiments/models.py
ADDED
|
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Core data models for the optimization framework.
|
| 4 |
+
|
| 5 |
+
Defines:
|
| 6 |
+
- CompactionConfig: Extensible compaction strategy configuration
|
| 7 |
+
- Agent: Framework-agnostic agent definition (what the customer brings)
|
| 8 |
+
- Candidate: A mutated agent variant produced by optimization
|
| 9 |
+
- CandidateStrategy: Protocol for generating candidates from a base agent
|
| 10 |
+
- GridSearchStrategy: Brute-force grid search over parameter combinations
|
| 11 |
+
- TOOL_PRESETS: Standard tool configurations for agents
|
| 12 |
+
- resolve_tools: Normalize tool specification to dict form
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
from dataclasses import asdict, dataclass, field
|
| 18 |
+
from itertools import product as itertools_product
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Any, Protocol, runtime_checkable
|
| 21 |
+
|
| 22 |
+
import yaml
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# =============================================================================
|
| 26 |
+
# Tool Configuration
|
| 27 |
+
# =============================================================================
|
| 28 |
+
|
| 29 |
+
# Tool presets define common tool configurations.
|
| 30 |
+
# Each preset maps tool names to their configuration dicts.
|
| 31 |
+
TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
|
| 32 |
+
"full": {
|
| 33 |
+
"read_file": {},
|
| 34 |
+
"write_file": {},
|
| 35 |
+
"list_directory": {},
|
| 36 |
+
"grep_search": {},
|
| 37 |
+
"bash_execute": {"timeout": 120},
|
| 38 |
+
"check_processes": {},
|
| 39 |
+
"python_repl": {},
|
| 40 |
+
"think": {},
|
| 41 |
+
"task_done": {},
|
| 42 |
+
"memory": {},
|
| 43 |
+
"sub_agent": {"model": "gpt-4o-mini"},
|
| 44 |
+
},
|
| 45 |
+
"standard": {
|
| 46 |
+
"read_file": {},
|
| 47 |
+
"write_file": {},
|
| 48 |
+
"list_directory": {},
|
| 49 |
+
"grep_search": {},
|
| 50 |
+
"bash_execute": {"timeout": 120},
|
| 51 |
+
"check_processes": {},
|
| 52 |
+
"python_repl": {},
|
| 53 |
+
"think": {},
|
| 54 |
+
"task_done": {},
|
| 55 |
+
"memory": {},
|
| 56 |
+
},
|
| 57 |
+
"minimal": {
|
| 58 |
+
"read_file": {},
|
| 59 |
+
"write_file": {},
|
| 60 |
+
"bash_execute": {"timeout": 120},
|
| 61 |
+
"task_done": {},
|
| 62 |
+
},
|
| 63 |
+
"readonly": {
|
| 64 |
+
"read_file": {},
|
| 65 |
+
"list_directory": {},
|
| 66 |
+
"grep_search": {},
|
| 67 |
+
"think": {},
|
| 68 |
+
"task_done": {},
|
| 69 |
+
},
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]]) -> dict[str, dict[str, Any]]:
|
| 74 |
+
"""Normalize tool specification to dict form.
|
| 75 |
+
|
| 76 |
+
Accepts three input formats:
|
| 77 |
+
- str: Preset name (e.g., "standard", "minimal", "full", "readonly")
|
| 78 |
+
- list[str]: List of tool names with default configs
|
| 79 |
+
- dict[str, dict]: Full specification with per-tool configs
|
| 80 |
+
|
| 81 |
+
Args:
|
| 82 |
+
tools: Tool specification in any supported format
|
| 83 |
+
|
| 84 |
+
Returns:
|
| 85 |
+
Dict mapping tool names to their configuration dicts
|
| 86 |
+
|
| 87 |
+
Raises:
|
| 88 |
+
ValueError: If preset name is unknown
|
| 89 |
+
|
| 90 |
+
Example:
|
| 91 |
+
>>> resolve_tools("standard")
|
| 92 |
+
{"read_file": {}, "write_file": {}, ...}
|
| 93 |
+
|
| 94 |
+
>>> resolve_tools(["read_file", "bash_execute"])
|
| 95 |
+
{"read_file": {}, "bash_execute": {}}
|
| 96 |
+
|
| 97 |
+
>>> resolve_tools({"bash_execute": {"timeout": 60}})
|
| 98 |
+
{"bash_execute": {"timeout": 60}}
|
| 99 |
+
"""
|
| 100 |
+
if isinstance(tools, str):
|
| 101 |
+
if tools not in TOOL_PRESETS:
|
| 102 |
+
raise ValueError(f"Unknown tool preset: {tools}. Available: {list(TOOL_PRESETS.keys())}")
|
| 103 |
+
# Return a copy to prevent mutation of the preset
|
| 104 |
+
return {k: dict(v) for k, v in TOOL_PRESETS[tools].items()}
|
| 105 |
+
elif isinstance(tools, list):
|
| 106 |
+
return {name: {} for name in tools}
|
| 107 |
+
else:
|
| 108 |
+
# Already a dict, return a copy
|
| 109 |
+
return {k: dict(v) for k, v in tools.items()}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@dataclass
|
| 113 |
+
class CompactionConfig:
|
| 114 |
+
"""Extensible compaction strategy configuration.
|
| 115 |
+
|
| 116 |
+
Supports multiple strategies via a tagged-union pattern:
|
| 117 |
+
- "head_tail": Keep first N + last M messages (default)
|
| 118 |
+
- "last_n": Keep only the last N messages
|
| 119 |
+
- "none": No compaction
|
| 120 |
+
|
| 121 |
+
Future strategies (e.g., "summarize") can be added without
|
| 122 |
+
changing existing code.
|
| 123 |
+
|
| 124 |
+
Attributes:
|
| 125 |
+
strategy: The compaction strategy name
|
| 126 |
+
params: Strategy-specific parameters
|
| 127 |
+
"""
|
| 128 |
+
|
| 129 |
+
strategy: str = "head_tail"
|
| 130 |
+
params: dict[str, Any] = field(default_factory=lambda: {"head_size": 10, "tail_size": 40})
|
| 131 |
+
|
| 132 |
+
@staticmethod
|
| 133 |
+
def head_tail(head_size: int = 10, tail_size: int = 40) -> CompactionConfig:
|
| 134 |
+
"""Create a head+tail compaction config."""
|
| 135 |
+
return CompactionConfig(strategy="head_tail", params={"head_size": head_size, "tail_size": tail_size})
|
| 136 |
+
|
| 137 |
+
@staticmethod
|
| 138 |
+
def last_n(n: int = 50) -> CompactionConfig:
|
| 139 |
+
"""Create a last-N compaction config."""
|
| 140 |
+
return CompactionConfig(strategy="last_n", params={"n": n})
|
| 141 |
+
|
| 142 |
+
@staticmethod
|
| 143 |
+
def none() -> CompactionConfig:
|
| 144 |
+
"""Create a no-compaction config."""
|
| 145 |
+
return CompactionConfig(strategy="none", params={})
|
| 146 |
+
|
| 147 |
+
@property
|
| 148 |
+
def enabled(self) -> bool:
|
| 149 |
+
"""Whether compaction is enabled."""
|
| 150 |
+
return self.strategy != "none"
|
| 151 |
+
|
| 152 |
+
@property
|
| 153 |
+
def head_size(self) -> int:
|
| 154 |
+
"""Head size for head_tail strategy. Returns 0 for other strategies."""
|
| 155 |
+
return self.params.get("head_size", 0)
|
| 156 |
+
|
| 157 |
+
@property
|
| 158 |
+
def tail_size(self) -> int:
|
| 159 |
+
"""Tail size for head_tail strategy. Returns 0 for other strategies."""
|
| 160 |
+
return self.params.get("tail_size", 0)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
@dataclass
|
| 164 |
+
class Agent:
|
| 165 |
+
"""Framework-agnostic agent definition.
|
| 166 |
+
|
| 167 |
+
This is what the customer brings to the optimization service.
|
| 168 |
+
It describes the agent's identity, model, tools, and context
|
| 169 |
+
engineering settings — everything needed to instantiate and
|
| 170 |
+
run the agent on any supported framework harness.
|
| 171 |
+
|
| 172 |
+
Attributes:
|
| 173 |
+
name: Unique identifier for this agent
|
| 174 |
+
description: Human-readable description
|
| 175 |
+
instructions: System prompt / instructions (optional, uses framework default if None)
|
| 176 |
+
model: Model deployment name (e.g., "gpt-4o")
|
| 177 |
+
compaction: Compaction strategy configuration
|
| 178 |
+
tools: Tool configuration - can be:
|
| 179 |
+
- str: Preset name ("standard", "minimal", "full", "readonly")
|
| 180 |
+
- list[str]: List of tool names with default configs
|
| 181 |
+
- dict[str, dict]: Full specification with per-tool configs
|
| 182 |
+
"""
|
| 183 |
+
|
| 184 |
+
name: str
|
| 185 |
+
description: str = ""
|
| 186 |
+
instructions: str | None = None
|
| 187 |
+
model: str | None = None
|
| 188 |
+
compaction: CompactionConfig = field(default_factory=CompactionConfig)
|
| 189 |
+
tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
@dataclass
|
| 193 |
+
class Candidate:
|
| 194 |
+
"""A mutated agent variant produced by the optimization process.
|
| 195 |
+
|
| 196 |
+
Each candidate is derived from a base Agent with specific mutations
|
| 197 |
+
applied. The mutations dict records what was changed, and the
|
| 198 |
+
rationale explains why.
|
| 199 |
+
|
| 200 |
+
Attributes:
|
| 201 |
+
agent: The mutated agent configuration
|
| 202 |
+
mutations: Dict describing what was changed from the base
|
| 203 |
+
rationale: Human-readable explanation of why this candidate exists
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
agent: Agent
|
| 207 |
+
mutations: dict[str, Any] = field(default_factory=dict)
|
| 208 |
+
rationale: str = ""
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
@dataclass
|
| 212 |
+
class ExperimentResult:
|
| 213 |
+
"""Result of running a single experiment (one candidate on one task)."""
|
| 214 |
+
|
| 215 |
+
candidate: Candidate
|
| 216 |
+
run_result: Any # RunResult from types.py
|
| 217 |
+
metrics: Any # TraceMetrics from metrics.py
|
| 218 |
+
eval_score: float = 0.0
|
| 219 |
+
eval_passed: bool = False
|
| 220 |
+
eval_reasoning: str = ""
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
@runtime_checkable
|
| 224 |
+
class CandidateStrategy(Protocol):
|
| 225 |
+
"""Protocol for generating candidate variants from a base agent.
|
| 226 |
+
|
| 227 |
+
Implementations explore different regions of the optimization space:
|
| 228 |
+
- GridSearchStrategy: Exhaustive grid over parameter combinations
|
| 229 |
+
- (Future) HeuristicStrategy: Rule-based mutations from telemetry
|
| 230 |
+
- (Future) BayesianStrategy: Bayesian optimization over parameters
|
| 231 |
+
"""
|
| 232 |
+
|
| 233 |
+
def generate(self, base: Agent, budget: int) -> list[Candidate]:
|
| 234 |
+
"""Generate candidate variants from a base agent.
|
| 235 |
+
|
| 236 |
+
Args:
|
| 237 |
+
base: The base agent to mutate
|
| 238 |
+
budget: Maximum number of candidates to generate
|
| 239 |
+
|
| 240 |
+
Returns:
|
| 241 |
+
List of Candidate objects (at most `budget` items)
|
| 242 |
+
"""
|
| 243 |
+
...
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
class GridSearchStrategy:
|
| 247 |
+
"""Brute-force grid search over parameter combinations.
|
| 248 |
+
|
| 249 |
+
Generates candidates by taking the Cartesian product of all
|
| 250 |
+
specified parameter variations.
|
| 251 |
+
|
| 252 |
+
Example:
|
| 253 |
+
strategy = GridSearchStrategy(variations={
|
| 254 |
+
"tools": ["standard", "minimal", "full"],
|
| 255 |
+
"compaction": [
|
| 256 |
+
CompactionConfig.head_tail(10, 40),
|
| 257 |
+
CompactionConfig.head_tail(5, 20),
|
| 258 |
+
CompactionConfig.none(),
|
| 259 |
+
],
|
| 260 |
+
})
|
| 261 |
+
candidates = strategy.generate(base_agent, budget=20)
|
| 262 |
+
"""
|
| 263 |
+
|
| 264 |
+
def __init__(self, variations: dict[str, list[Any]]) -> None:
|
| 265 |
+
"""Initialize with parameter variations.
|
| 266 |
+
|
| 267 |
+
Args:
|
| 268 |
+
variations: Dict mapping Agent field names to lists of values to try.
|
| 269 |
+
Special keys:
|
| 270 |
+
- "compaction": Accepts CompactionConfig objects
|
| 271 |
+
- "tools": Accepts preset strings, lists, or dicts
|
| 272 |
+
"""
|
| 273 |
+
self.variations = variations
|
| 274 |
+
|
| 275 |
+
def generate(self, base: Agent, budget: int) -> list[Candidate]:
|
| 276 |
+
"""Generate all grid combinations up to budget."""
|
| 277 |
+
if not self.variations:
|
| 278 |
+
return [Candidate(agent=base, mutations={}, rationale="baseline")]
|
| 279 |
+
|
| 280 |
+
param_names = list(self.variations.keys())
|
| 281 |
+
param_values = list(self.variations.values())
|
| 282 |
+
|
| 283 |
+
candidates = []
|
| 284 |
+
for values in itertools_product(*param_values):
|
| 285 |
+
if len(candidates) >= budget:
|
| 286 |
+
break
|
| 287 |
+
|
| 288 |
+
mutations = dict(zip(param_names, values, strict=True))
|
| 289 |
+
|
| 290 |
+
# Build mutated agent
|
| 291 |
+
agent_dict = asdict(base)
|
| 292 |
+
for key, value in mutations.items():
|
| 293 |
+
if key == "compaction" and isinstance(value, CompactionConfig):
|
| 294 |
+
agent_dict["compaction"] = asdict(value)
|
| 295 |
+
elif key in agent_dict:
|
| 296 |
+
agent_dict[key] = value
|
| 297 |
+
|
| 298 |
+
# Reconstruct CompactionConfig from dict
|
| 299 |
+
comp_data = agent_dict.pop("compaction")
|
| 300 |
+
if isinstance(comp_data, dict):
|
| 301 |
+
compaction = CompactionConfig(**comp_data)
|
| 302 |
+
else:
|
| 303 |
+
compaction = comp_data
|
| 304 |
+
|
| 305 |
+
# Handle tools field - keep as-is (str, list, or dict)
|
| 306 |
+
tools = agent_dict.pop("tools", "standard")
|
| 307 |
+
|
| 308 |
+
mutated = Agent(
|
| 309 |
+
**{k: v for k, v in agent_dict.items() if k not in ("compaction", "tools")},
|
| 310 |
+
compaction=compaction,
|
| 311 |
+
tools=tools,
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
# Build name from mutations
|
| 315 |
+
name_parts = []
|
| 316 |
+
for k, v in mutations.items():
|
| 317 |
+
if isinstance(v, CompactionConfig):
|
| 318 |
+
name_parts.append(f"{v.strategy}")
|
| 319 |
+
if v.strategy == "head_tail":
|
| 320 |
+
name_parts.append(f"h{v.head_size}_t{v.tail_size}")
|
| 321 |
+
elif k == "tools":
|
| 322 |
+
# Format tools for name
|
| 323 |
+
if isinstance(v, str):
|
| 324 |
+
name_parts.append(f"tools={v}")
|
| 325 |
+
elif isinstance(v, list):
|
| 326 |
+
name_parts.append(f"tools=[{len(v)}]")
|
| 327 |
+
else:
|
| 328 |
+
name_parts.append(f"tools=[{len(v)}]")
|
| 329 |
+
elif isinstance(v, bool):
|
| 330 |
+
name_parts.append(f"{k}={'on' if v else 'off'}")
|
| 331 |
+
else:
|
| 332 |
+
name_parts.append(f"{k}={v}")
|
| 333 |
+
|
| 334 |
+
mutated.name = f"{base.name}_{'_'.join(name_parts)}"
|
| 335 |
+
|
| 336 |
+
# Serialize mutations for storage (convert non-serializable types)
|
| 337 |
+
serializable_mutations = {}
|
| 338 |
+
for k, v in mutations.items():
|
| 339 |
+
if isinstance(v, CompactionConfig):
|
| 340 |
+
serializable_mutations[k] = asdict(v)
|
| 341 |
+
else:
|
| 342 |
+
serializable_mutations[k] = v
|
| 343 |
+
|
| 344 |
+
candidates.append(Candidate(
|
| 345 |
+
agent=mutated,
|
| 346 |
+
mutations=serializable_mutations,
|
| 347 |
+
rationale=f"Grid search: {', '.join(name_parts)}",
|
| 348 |
+
))
|
| 349 |
+
|
| 350 |
+
return candidates
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# =============================================================================
|
| 354 |
+
# Agent YAML Export / Import
|
| 355 |
+
# =============================================================================
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def export_agent(
|
| 359 |
+
agent: Agent,
|
| 360 |
+
path: Path,
|
| 361 |
+
metrics: dict[str, Any] | None = None,
|
| 362 |
+
) -> None:
|
| 363 |
+
"""Export an Agent as a reusable YAML file.
|
| 364 |
+
|
| 365 |
+
Args:
|
| 366 |
+
agent: The Agent to export
|
| 367 |
+
path: Path to write the YAML file
|
| 368 |
+
metrics: Optional optimization metrics (stored under _optimization key)
|
| 369 |
+
"""
|
| 370 |
+
data = asdict(agent)
|
| 371 |
+
if metrics:
|
| 372 |
+
data["_optimization"] = metrics
|
| 373 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 374 |
+
path.write_text(yaml.dump(data, default_flow_style=False, sort_keys=False))
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def load_agent(path: Path) -> Agent:
|
| 378 |
+
"""Load an Agent from a YAML file.
|
| 379 |
+
|
| 380 |
+
Ignores any keys prefixed with _ (optimization metadata).
|
| 381 |
+
|
| 382 |
+
Args:
|
| 383 |
+
path: Path to the YAML config file
|
| 384 |
+
|
| 385 |
+
Returns:
|
| 386 |
+
Agent instance
|
| 387 |
+
|
| 388 |
+
Raises:
|
| 389 |
+
FileNotFoundError: If the file doesn't exist
|
| 390 |
+
ValueError: If the config is invalid
|
| 391 |
+
"""
|
| 392 |
+
if not path.exists():
|
| 393 |
+
raise FileNotFoundError(f"Agent config file not found: {path}")
|
| 394 |
+
|
| 395 |
+
data = yaml.safe_load(path.read_text())
|
| 396 |
+
config_data = {k: v for k, v in data.items() if not k.startswith("_")}
|
| 397 |
+
|
| 398 |
+
# Reconstruct CompactionConfig from nested dict
|
| 399 |
+
if "compaction" in config_data and isinstance(config_data["compaction"], dict):
|
| 400 |
+
config_data["compaction"] = CompactionConfig(**config_data["compaction"])
|
| 401 |
+
|
| 402 |
+
try:
|
| 403 |
+
return Agent(**config_data)
|
| 404 |
+
except TypeError as e:
|
| 405 |
+
raise ValueError(f"Invalid agent config file {path}: {e}") from e
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
def export_optimization_results(
|
| 409 |
+
summaries: list[dict[str, Any]],
|
| 410 |
+
pareto_names: list[str],
|
| 411 |
+
output_dir: Path,
|
| 412 |
+
timestamp: str,
|
| 413 |
+
) -> dict[str, Path]:
|
| 414 |
+
"""Export notable agents from an optimization run as YAML files.
|
| 415 |
+
|
| 416 |
+
Exports:
|
| 417 |
+
- best_score.yaml: Highest quality agent
|
| 418 |
+
- best_cost.yaml: Lowest token usage agent
|
| 419 |
+
- best_efficiency.yaml: Best score/token ratio
|
| 420 |
+
- pareto/<name>.yaml: All Pareto-optimal agents
|
| 421 |
+
|
| 422 |
+
Args:
|
| 423 |
+
summaries: List of summary dicts with metrics
|
| 424 |
+
pareto_names: Names of Pareto-optimal agents
|
| 425 |
+
output_dir: Directory to write agent files
|
| 426 |
+
timestamp: Optimization timestamp for metadata
|
| 427 |
+
|
| 428 |
+
Returns:
|
| 429 |
+
Dict mapping label to file path
|
| 430 |
+
"""
|
| 431 |
+
configs_dir = output_dir / "agents"
|
| 432 |
+
configs_dir.mkdir(parents=True, exist_ok=True)
|
| 433 |
+
|
| 434 |
+
exported: dict[str, Path] = {}
|
| 435 |
+
|
| 436 |
+
if not summaries:
|
| 437 |
+
return exported
|
| 438 |
+
|
| 439 |
+
best_score = max(summaries, key=lambda s: s.get("avg_score", 0))
|
| 440 |
+
best_cost = min(summaries, key=lambda s: s.get("avg_tokens", float("inf")))
|
| 441 |
+
best_efficiency = max(
|
| 442 |
+
summaries,
|
| 443 |
+
key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
|
| 444 |
+
)
|
| 445 |
+
|
| 446 |
+
for label, summary in [
|
| 447 |
+
("best_score", best_score),
|
| 448 |
+
("best_cost", best_cost),
|
| 449 |
+
("best_efficiency", best_efficiency),
|
| 450 |
+
]:
|
| 451 |
+
agent = _summary_to_agent(summary)
|
| 452 |
+
metrics = _extract_metrics(summary, timestamp, label)
|
| 453 |
+
path = configs_dir / f"{label}.yaml"
|
| 454 |
+
export_agent(agent, path, metrics)
|
| 455 |
+
exported[label] = path
|
| 456 |
+
|
| 457 |
+
# Export Pareto-optimal agents
|
| 458 |
+
pareto_dir = configs_dir / "pareto"
|
| 459 |
+
pareto_dir.mkdir(exist_ok=True)
|
| 460 |
+
|
| 461 |
+
for summary in summaries:
|
| 462 |
+
name = summary.get("name", "unknown")
|
| 463 |
+
if name in pareto_names:
|
| 464 |
+
agent = _summary_to_agent(summary)
|
| 465 |
+
metrics = _extract_metrics(summary, timestamp, "pareto")
|
| 466 |
+
metrics["is_pareto_optimal"] = True
|
| 467 |
+
path = pareto_dir / f"{name}.yaml"
|
| 468 |
+
export_agent(agent, path, metrics)
|
| 469 |
+
exported[f"pareto/{name}"] = path
|
| 470 |
+
|
| 471 |
+
return exported
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def _summary_to_agent(summary: dict[str, Any]) -> Agent:
|
| 475 |
+
"""Convert a summary dict back to an Agent."""
|
| 476 |
+
agent_data = summary.get("agent", {})
|
| 477 |
+
if agent_data:
|
| 478 |
+
# Reconstruct from nested agent dict
|
| 479 |
+
if "compaction" in agent_data and isinstance(agent_data["compaction"], dict):
|
| 480 |
+
agent_data["compaction"] = CompactionConfig(**agent_data["compaction"])
|
| 481 |
+
# tools field can be str, list, or dict - all are valid, keep as-is
|
| 482 |
+
return Agent(**agent_data)
|
| 483 |
+
|
| 484 |
+
# Fallback: build from flat summary fields (legacy format)
|
| 485 |
+
compaction = CompactionConfig.head_tail(
|
| 486 |
+
head_size=summary.get("compaction_head_size", 10),
|
| 487 |
+
tail_size=summary.get("compaction_tail_size", 40),
|
| 488 |
+
) if summary.get("enable_message_compaction", True) else CompactionConfig.none()
|
| 489 |
+
|
| 490 |
+
# Determine tools from legacy fields if present
|
| 491 |
+
tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
|
| 492 |
+
if "tools" in summary:
|
| 493 |
+
tools = summary["tools"]
|
| 494 |
+
|
| 495 |
+
return Agent(
|
| 496 |
+
name=summary.get("name", "unknown"),
|
| 497 |
+
compaction=compaction,
|
| 498 |
+
tools=tools,
|
| 499 |
+
)
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def _extract_metrics(
|
| 503 |
+
summary: dict[str, Any],
|
| 504 |
+
timestamp: str,
|
| 505 |
+
selection_reason: str,
|
| 506 |
+
) -> dict[str, Any]:
|
| 507 |
+
"""Extract optimization metrics from a summary."""
|
| 508 |
+
return {
|
| 509 |
+
"timestamp": timestamp,
|
| 510 |
+
"selection_reason": selection_reason,
|
| 511 |
+
"avg_score": summary.get("avg_score", 0),
|
| 512 |
+
"avg_tokens": summary.get("avg_tokens", 0),
|
| 513 |
+
"avg_duration": summary.get("avg_duration", 0),
|
| 514 |
+
"pass_rate": summary.get("pass_rate", 0),
|
| 515 |
+
"pareto_rank": summary.get("pareto_rank"),
|
| 516 |
+
"is_pareto_optimal": summary.get("is_pareto_optimal", False),
|
| 517 |
+
}
|
src/flow/experiments/optimizer.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
"""Optimizer service for finding best agent configurations.
|
| 4 |
|
| 5 |
Runs experiments in parallel, evaluates with LLM-as-Judge,
|
| 6 |
-
ranks via Pareto analysis, and exports reusable configs.
|
| 7 |
"""
|
| 8 |
|
| 9 |
from __future__ import annotations
|
|
@@ -15,31 +15,32 @@ import os
|
|
| 15 |
from collections.abc import Callable
|
| 16 |
from dataclasses import asdict, dataclass, field
|
| 17 |
from datetime import datetime
|
| 18 |
-
from itertools import product
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Any
|
| 21 |
|
| 22 |
from openai import AsyncAzureOpenAI
|
| 23 |
|
| 24 |
from .ablation import (
|
| 25 |
-
AblationConfig,
|
| 26 |
compute_pareto_frontier,
|
| 27 |
-
|
| 28 |
)
|
| 29 |
-
from .config_export import export_optimization_configs
|
| 30 |
from .evaluators import LLMEvaluator
|
| 31 |
from .metrics import TraceMetrics, extract_metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
from .runner import FlowExperimentRunner, setup_tracing
|
| 33 |
-
from .types import
|
| 34 |
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
| 38 |
@dataclass
|
| 39 |
class TaskResult:
|
| 40 |
-
"""Result for a single
|
| 41 |
|
| 42 |
-
|
| 43 |
task_name: str
|
| 44 |
run_result: RunResult
|
| 45 |
metrics: TraceMetrics
|
|
@@ -49,12 +50,12 @@ class TaskResult:
|
|
| 49 |
|
| 50 |
|
| 51 |
@dataclass
|
| 52 |
-
class
|
| 53 |
-
"""Aggregated summary for a
|
| 54 |
|
| 55 |
name: str
|
| 56 |
-
|
| 57 |
-
task_results: list[TaskResult] = field(default_factory=
|
| 58 |
|
| 59 |
# Aggregated metrics
|
| 60 |
avg_score: float = 0.0
|
|
@@ -72,7 +73,9 @@ class ConfigSummary:
|
|
| 72 |
"""Convert to dictionary for serialization."""
|
| 73 |
return {
|
| 74 |
"name": self.name,
|
| 75 |
-
"
|
|
|
|
|
|
|
| 76 |
"avg_score": self.avg_score,
|
| 77 |
"avg_tokens": self.avg_tokens,
|
| 78 |
"avg_duration": self.avg_duration,
|
|
@@ -90,21 +93,21 @@ class OptimizationResult:
|
|
| 90 |
|
| 91 |
timestamp: str
|
| 92 |
output_dir: Path
|
| 93 |
-
summaries: list[
|
| 94 |
pareto_frontier: list[str]
|
| 95 |
-
|
| 96 |
|
| 97 |
# Rankings
|
| 98 |
-
rank_by_score: list[str] = field(default_factory=
|
| 99 |
-
rank_by_tokens: list[str] = field(default_factory=
|
| 100 |
-
rank_by_efficiency: list[str] = field(default_factory=
|
| 101 |
|
| 102 |
# Stats
|
| 103 |
total_experiments: int = 0
|
| 104 |
total_duration_seconds: float = 0.0
|
| 105 |
|
| 106 |
-
def
|
| 107 |
-
"""Get the best
|
| 108 |
if criterion == "score":
|
| 109 |
names = self.rank_by_score
|
| 110 |
elif criterion == "tokens":
|
|
@@ -126,17 +129,18 @@ class OptimizationResult:
|
|
| 126 |
class FlowOptimizer:
|
| 127 |
"""Optimizer for finding best agent configurations.
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
|
|
|
| 131 |
|
| 132 |
Example:
|
|
|
|
|
|
|
|
|
|
| 133 |
optimizer = FlowOptimizer(parallel=4)
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
]
|
| 138 |
-
tasks = [Task(name="test", prompt="Create hello world")]
|
| 139 |
-
result = await optimizer.optimize(configs, tasks)
|
| 140 |
print(f"Best: {result.rank_by_score[0]}")
|
| 141 |
"""
|
| 142 |
|
|
@@ -146,69 +150,55 @@ class FlowOptimizer:
|
|
| 146 |
use_llm_evaluator: bool = True,
|
| 147 |
output_dir: Path | None = None,
|
| 148 |
) -> None:
|
| 149 |
-
"""Initialize the optimizer.
|
| 150 |
-
|
| 151 |
-
Args:
|
| 152 |
-
parallel: Max concurrent experiments
|
| 153 |
-
use_llm_evaluator: Whether to use LLM for evaluation
|
| 154 |
-
output_dir: Base directory for results
|
| 155 |
-
"""
|
| 156 |
self.parallel = parallel
|
| 157 |
self.use_llm_evaluator = use_llm_evaluator
|
| 158 |
self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
|
| 159 |
|
| 160 |
async def optimize(
|
| 161 |
self,
|
| 162 |
-
|
| 163 |
tasks: list[Task],
|
| 164 |
progress_callback: Callable[[int, int, str, str], None] | None = None,
|
| 165 |
) -> OptimizationResult:
|
| 166 |
-
"""Run optimization across all
|
| 167 |
|
| 168 |
Args:
|
| 169 |
-
|
| 170 |
-
tasks: Tasks to run each
|
| 171 |
-
progress_callback: Optional callback(completed, total,
|
| 172 |
|
| 173 |
Returns:
|
| 174 |
-
OptimizationResult with rankings and exported
|
| 175 |
"""
|
| 176 |
start_time = datetime.now()
|
| 177 |
timestamp = start_time.strftime("%Y%m%d_%H%M%S")
|
| 178 |
run_dir = self.output_dir / timestamp
|
| 179 |
run_dir.mkdir(parents=True, exist_ok=True)
|
| 180 |
|
| 181 |
-
# Setup
|
| 182 |
setup_tracing("flow-optimizer")
|
| 183 |
-
self._save_config(
|
| 184 |
|
| 185 |
print("=" * 70)
|
| 186 |
print(" FLOW OPTIMIZER")
|
| 187 |
print("=" * 70)
|
| 188 |
-
print(f"
|
| 189 |
-
print(f" Tasks:
|
| 190 |
-
print(f" Total:
|
| 191 |
-
print(f" Parallel:
|
| 192 |
-
print(f" Output:
|
| 193 |
print("=" * 70)
|
| 194 |
|
| 195 |
-
# Create LLM evaluator if needed
|
| 196 |
evaluator = None
|
| 197 |
if self.use_llm_evaluator:
|
| 198 |
evaluator = self._create_evaluator()
|
| 199 |
|
| 200 |
-
# Run all experiments in parallel
|
| 201 |
task_results = await self._run_parallel(
|
| 202 |
-
|
| 203 |
)
|
| 204 |
|
| 205 |
-
|
| 206 |
-
summaries = self._aggregate_results(task_results, configs)
|
| 207 |
-
|
| 208 |
-
# Pareto analysis
|
| 209 |
pareto_names = self._compute_pareto(summaries)
|
| 210 |
|
| 211 |
-
# Compute rankings
|
| 212 |
rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
|
| 213 |
rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
|
| 214 |
rank_by_efficiency = sorted(
|
|
@@ -217,9 +207,8 @@ class FlowOptimizer:
|
|
| 217 |
reverse=True,
|
| 218 |
)
|
| 219 |
|
| 220 |
-
# Export configs
|
| 221 |
summary_dicts = [s.to_dict() for s in summaries]
|
| 222 |
-
exported =
|
| 223 |
summary_dicts, pareto_names, run_dir, timestamp
|
| 224 |
)
|
| 225 |
|
|
@@ -230,7 +219,7 @@ class FlowOptimizer:
|
|
| 230 |
output_dir=run_dir,
|
| 231 |
summaries=summaries,
|
| 232 |
pareto_frontier=pareto_names,
|
| 233 |
-
|
| 234 |
rank_by_score=[s.name for s in rank_by_score],
|
| 235 |
rank_by_tokens=[s.name for s in rank_by_tokens],
|
| 236 |
rank_by_efficiency=[s.name for s in rank_by_efficiency],
|
|
@@ -238,56 +227,49 @@ class FlowOptimizer:
|
|
| 238 |
total_duration_seconds=(end_time - start_time).total_seconds(),
|
| 239 |
)
|
| 240 |
|
| 241 |
-
# Save results
|
| 242 |
self._save_results(result, run_dir)
|
| 243 |
-
|
| 244 |
-
# Print summary
|
| 245 |
self._print_summary(result)
|
| 246 |
|
| 247 |
return result
|
| 248 |
|
| 249 |
async def _run_parallel(
|
| 250 |
self,
|
| 251 |
-
|
| 252 |
tasks: list[Task],
|
| 253 |
run_dir: Path,
|
| 254 |
evaluator: LLMEvaluator | None,
|
| 255 |
progress_callback: Callable[[int, int, str, str], None] | None,
|
| 256 |
) -> list[TaskResult]:
|
| 257 |
-
"""Run all
|
| 258 |
semaphore = asyncio.Semaphore(self.parallel)
|
| 259 |
-
total = len(
|
| 260 |
completed = 0
|
| 261 |
lock = asyncio.Lock()
|
| 262 |
|
| 263 |
-
async def run_one(
|
| 264 |
nonlocal completed
|
| 265 |
async with semaphore:
|
| 266 |
-
workspace = run_dir / "workspaces" /
|
| 267 |
workspace.mkdir(parents=True, exist_ok=True)
|
| 268 |
|
| 269 |
-
result = await self._run_single(
|
| 270 |
|
| 271 |
async with lock:
|
| 272 |
completed += 1
|
| 273 |
status = "✓" if result.eval_passed else "✗"
|
| 274 |
print(
|
| 275 |
-
f" [{completed}/{total}] {
|
| 276 |
f"{status} score={result.eval_score:.2f} "
|
| 277 |
f"tokens={result.metrics.total_tokens:,}"
|
| 278 |
)
|
| 279 |
if progress_callback:
|
| 280 |
-
progress_callback(completed, total,
|
| 281 |
|
| 282 |
return result
|
| 283 |
|
| 284 |
-
|
| 285 |
-
coroutines = [run_one(config, task) for config in configs for task in tasks]
|
| 286 |
-
|
| 287 |
-
# Run with gather
|
| 288 |
gather_results = await asyncio.gather(*coroutines, return_exceptions=True)
|
| 289 |
|
| 290 |
-
# Filter out exceptions
|
| 291 |
valid_results: list[TaskResult] = []
|
| 292 |
for r in gather_results:
|
| 293 |
if isinstance(r, BaseException):
|
|
@@ -299,33 +281,31 @@ class FlowOptimizer:
|
|
| 299 |
|
| 300 |
async def _run_single(
|
| 301 |
self,
|
| 302 |
-
|
| 303 |
task: Task,
|
| 304 |
workspace: Path,
|
| 305 |
evaluator: LLMEvaluator | None,
|
| 306 |
) -> TaskResult:
|
| 307 |
-
"""Run a single
|
| 308 |
-
harness =
|
| 309 |
|
| 310 |
try:
|
| 311 |
runner = FlowExperimentRunner(keep_workspace=True)
|
| 312 |
run_result = await runner.run(harness, task, workspace=workspace)
|
| 313 |
metrics = extract_metrics(run_result.trace)
|
| 314 |
|
| 315 |
-
# Evaluate
|
| 316 |
if evaluator:
|
| 317 |
eval_result = await evaluator.evaluate(run_result)
|
| 318 |
eval_score = eval_result.score
|
| 319 |
eval_passed = eval_result.passed
|
| 320 |
eval_reasoning = eval_result.reasoning
|
| 321 |
else:
|
| 322 |
-
# Simple heuristic: passed if no error
|
| 323 |
eval_score = 1.0 if run_result.success else 0.0
|
| 324 |
eval_passed = run_result.success
|
| 325 |
eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"
|
| 326 |
|
| 327 |
return TaskResult(
|
| 328 |
-
|
| 329 |
task_name=task.name,
|
| 330 |
run_result=run_result,
|
| 331 |
metrics=metrics,
|
|
@@ -339,25 +319,25 @@ class FlowOptimizer:
|
|
| 339 |
def _aggregate_results(
|
| 340 |
self,
|
| 341 |
task_results: list[TaskResult],
|
| 342 |
-
|
| 343 |
-
) -> list[
|
| 344 |
-
"""Aggregate task results into
|
| 345 |
-
|
| 346 |
-
|
| 347 |
|
| 348 |
for result in task_results:
|
| 349 |
-
if result.
|
| 350 |
-
|
| 351 |
|
| 352 |
summaries = []
|
| 353 |
-
for name, results in
|
| 354 |
if not results:
|
| 355 |
continue
|
| 356 |
|
| 357 |
-
|
| 358 |
-
summary =
|
| 359 |
name=name,
|
| 360 |
-
|
| 361 |
task_results=results,
|
| 362 |
avg_score=sum(r.eval_score for r in results) / len(results),
|
| 363 |
avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
|
|
@@ -370,19 +350,17 @@ class FlowOptimizer:
|
|
| 370 |
|
| 371 |
return summaries
|
| 372 |
|
| 373 |
-
def _compute_pareto(self, summaries: list[
|
| 374 |
"""Compute Pareto frontier (maximize score, minimize tokens)."""
|
| 375 |
-
# Use shared utility
|
| 376 |
pareto_names = compute_pareto_frontier(summaries)
|
| 377 |
|
| 378 |
-
# Mark summaries with Pareto status
|
| 379 |
for summary in summaries:
|
| 380 |
if summary.name in pareto_names:
|
| 381 |
summary.is_pareto_optimal = True
|
| 382 |
summary.pareto_rank = 0
|
| 383 |
else:
|
| 384 |
summary.is_pareto_optimal = False
|
| 385 |
-
summary.pareto_rank = 1
|
| 386 |
|
| 387 |
return pareto_names
|
| 388 |
|
|
@@ -410,7 +388,7 @@ class FlowOptimizer:
|
|
| 410 |
|
| 411 |
def _save_config(
|
| 412 |
self,
|
| 413 |
-
|
| 414 |
tasks: list[Task],
|
| 415 |
run_dir: Path,
|
| 416 |
) -> None:
|
|
@@ -418,7 +396,7 @@ class FlowOptimizer:
|
|
| 418 |
with open(run_dir / "optimization_config.json", "w") as f:
|
| 419 |
json.dump(
|
| 420 |
{
|
| 421 |
-
"
|
| 422 |
"tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
|
| 423 |
"parallel": self.parallel,
|
| 424 |
"use_llm_evaluator": self.use_llm_evaluator,
|
|
@@ -437,7 +415,7 @@ class FlowOptimizer:
|
|
| 437 |
"rank_by_score": result.rank_by_score,
|
| 438 |
"rank_by_tokens": result.rank_by_tokens,
|
| 439 |
"rank_by_efficiency": result.rank_by_efficiency,
|
| 440 |
-
"
|
| 441 |
"summaries": [s.to_dict() for s in result.summaries],
|
| 442 |
}
|
| 443 |
|
|
@@ -450,8 +428,7 @@ class FlowOptimizer:
|
|
| 450 |
print(" OPTIMIZATION RESULTS")
|
| 451 |
print("=" * 70)
|
| 452 |
|
| 453 |
-
|
| 454 |
-
print(f"\n{'Config':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
|
| 455 |
print("-" * 65)
|
| 456 |
|
| 457 |
for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
|
|
@@ -465,62 +442,19 @@ class FlowOptimizer:
|
|
| 465 |
print(f"Pareto frontier: {result.pareto_frontier}")
|
| 466 |
print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
|
| 467 |
print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
|
| 468 |
-
print("\nExported
|
| 469 |
-
for name, path in result.
|
| 470 |
print(f" {name}: {path}")
|
| 471 |
print(f"\nResults saved to: {result.output_dir}")
|
| 472 |
|
| 473 |
|
| 474 |
-
def generate_grid_configs(
|
| 475 |
-
base_name: str,
|
| 476 |
-
variations: dict[str, list[Any]],
|
| 477 |
-
) -> list[AblationConfig]:
|
| 478 |
-
"""Generate configs from a variation grid.
|
| 479 |
-
|
| 480 |
-
Args:
|
| 481 |
-
base_name: Base name for generated configs
|
| 482 |
-
variations: Dict of param_name -> list of values
|
| 483 |
-
|
| 484 |
-
Returns:
|
| 485 |
-
List of AblationConfig for each combination
|
| 486 |
-
|
| 487 |
-
Example:
|
| 488 |
-
configs = generate_grid_configs("grid", {
|
| 489 |
-
"enable_message_compaction": [True, False],
|
| 490 |
-
"compaction_head_size": [5, 10, 20],
|
| 491 |
-
})
|
| 492 |
-
"""
|
| 493 |
-
if not variations:
|
| 494 |
-
return [AblationConfig(name=base_name)]
|
| 495 |
-
|
| 496 |
-
param_names = list(variations.keys())
|
| 497 |
-
param_values = list(variations.values())
|
| 498 |
-
|
| 499 |
-
configs = []
|
| 500 |
-
for values in product(*param_values):
|
| 501 |
-
kwargs = dict(zip(param_names, values, strict=True))
|
| 502 |
-
name = f"{base_name}_" + "_".join(f"{k}={v}" for k, v in kwargs.items())
|
| 503 |
-
configs.append(AblationConfig(name=name, **kwargs))
|
| 504 |
-
|
| 505 |
-
return configs
|
| 506 |
-
|
| 507 |
-
|
| 508 |
def load_tasks_from_jsonl(path: Path) -> list[Task]:
|
| 509 |
"""Load tasks from a JSONL file.
|
| 510 |
|
| 511 |
-
Each line should be a JSON object with:
|
| 512 |
-
- name: Task name
|
| 513 |
-
- prompt: Task prompt
|
| 514 |
-
- criteria: Optional list of evaluation criteria
|
| 515 |
-
- category: Optional category string
|
| 516 |
-
- metadata: Optional additional metadata dict
|
| 517 |
-
|
| 518 |
Args:
|
| 519 |
path: Path to JSONL file
|
| 520 |
|
| 521 |
Returns:
|
| 522 |
List of Task objects
|
| 523 |
"""
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
return _load_tasks_from_jsonl(path)
|
|
|
|
| 3 |
"""Optimizer service for finding best agent configurations.
|
| 4 |
|
| 5 |
Runs experiments in parallel, evaluates with LLM-as-Judge,
|
| 6 |
+
ranks via Pareto analysis, and exports reusable agent configs.
|
| 7 |
"""
|
| 8 |
|
| 9 |
from __future__ import annotations
|
|
|
|
| 15 |
from collections.abc import Callable
|
| 16 |
from dataclasses import asdict, dataclass, field
|
| 17 |
from datetime import datetime
|
|
|
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import Any
|
| 20 |
|
| 21 |
from openai import AsyncAzureOpenAI
|
| 22 |
|
| 23 |
from .ablation import (
|
|
|
|
| 24 |
compute_pareto_frontier,
|
| 25 |
+
create_harness_from_agent,
|
| 26 |
)
|
|
|
|
| 27 |
from .evaluators import LLMEvaluator
|
| 28 |
from .metrics import TraceMetrics, extract_metrics
|
| 29 |
+
from .models import (
|
| 30 |
+
Candidate,
|
| 31 |
+
export_optimization_results,
|
| 32 |
+
)
|
| 33 |
from .runner import FlowExperimentRunner, setup_tracing
|
| 34 |
+
from .types import RunResult, Task, load_tasks_from_jsonl as _load_tasks_impl
|
| 35 |
|
| 36 |
logger = logging.getLogger(__name__)
|
| 37 |
|
| 38 |
|
| 39 |
@dataclass
|
| 40 |
class TaskResult:
|
| 41 |
+
"""Result for a single candidate-task pair."""
|
| 42 |
|
| 43 |
+
candidate_name: str
|
| 44 |
task_name: str
|
| 45 |
run_result: RunResult
|
| 46 |
metrics: TraceMetrics
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
@dataclass
|
| 53 |
+
class CandidateSummary:
|
| 54 |
+
"""Aggregated summary for a candidate across all tasks."""
|
| 55 |
|
| 56 |
name: str
|
| 57 |
+
candidate: Candidate
|
| 58 |
+
task_results: list[TaskResult] = field(default_factory=lambda: [])
|
| 59 |
|
| 60 |
# Aggregated metrics
|
| 61 |
avg_score: float = 0.0
|
|
|
|
| 73 |
"""Convert to dictionary for serialization."""
|
| 74 |
return {
|
| 75 |
"name": self.name,
|
| 76 |
+
"agent": asdict(self.candidate.agent),
|
| 77 |
+
"mutations": self.candidate.mutations,
|
| 78 |
+
"rationale": self.candidate.rationale,
|
| 79 |
"avg_score": self.avg_score,
|
| 80 |
"avg_tokens": self.avg_tokens,
|
| 81 |
"avg_duration": self.avg_duration,
|
|
|
|
| 93 |
|
| 94 |
timestamp: str
|
| 95 |
output_dir: Path
|
| 96 |
+
summaries: list[CandidateSummary]
|
| 97 |
pareto_frontier: list[str]
|
| 98 |
+
exported_agents: dict[str, Path]
|
| 99 |
|
| 100 |
# Rankings
|
| 101 |
+
rank_by_score: list[str] = field(default_factory=lambda: [])
|
| 102 |
+
rank_by_tokens: list[str] = field(default_factory=lambda: [])
|
| 103 |
+
rank_by_efficiency: list[str] = field(default_factory=lambda: [])
|
| 104 |
|
| 105 |
# Stats
|
| 106 |
total_experiments: int = 0
|
| 107 |
total_duration_seconds: float = 0.0
|
| 108 |
|
| 109 |
+
def get_best_candidate(self, criterion: str = "score") -> CandidateSummary | None:
|
| 110 |
+
"""Get the best candidate by a criterion."""
|
| 111 |
if criterion == "score":
|
| 112 |
names = self.rank_by_score
|
| 113 |
elif criterion == "tokens":
|
|
|
|
| 129 |
class FlowOptimizer:
|
| 130 |
"""Optimizer for finding best agent configurations.
|
| 131 |
|
| 132 |
+
Takes a base Agent and a CandidateStrategy, generates candidates,
|
| 133 |
+
runs experiments in parallel, evaluates results, performs Pareto
|
| 134 |
+
analysis, and exports winning agent configs.
|
| 135 |
|
| 136 |
Example:
|
| 137 |
+
strategy = GridSearchStrategy(variations={
|
| 138 |
+
"enable_memory": [True, False],
|
| 139 |
+
})
|
| 140 |
optimizer = FlowOptimizer(parallel=4)
|
| 141 |
+
base = Agent(name="my_agent")
|
| 142 |
+
candidates = strategy.generate(base, budget=10)
|
| 143 |
+
result = await optimizer.optimize(candidates, tasks)
|
|
|
|
|
|
|
|
|
|
| 144 |
print(f"Best: {result.rank_by_score[0]}")
|
| 145 |
"""
|
| 146 |
|
|
|
|
| 150 |
use_llm_evaluator: bool = True,
|
| 151 |
output_dir: Path | None = None,
|
| 152 |
) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
self.parallel = parallel
|
| 154 |
self.use_llm_evaluator = use_llm_evaluator
|
| 155 |
self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
|
| 156 |
|
| 157 |
async def optimize(
|
| 158 |
self,
|
| 159 |
+
candidates: list[Candidate],
|
| 160 |
tasks: list[Task],
|
| 161 |
progress_callback: Callable[[int, int, str, str], None] | None = None,
|
| 162 |
) -> OptimizationResult:
|
| 163 |
+
"""Run optimization across all candidates and tasks.
|
| 164 |
|
| 165 |
Args:
|
| 166 |
+
candidates: Candidates to test
|
| 167 |
+
tasks: Tasks to run each candidate on
|
| 168 |
+
progress_callback: Optional callback(completed, total, candidate_name, task_name)
|
| 169 |
|
| 170 |
Returns:
|
| 171 |
+
OptimizationResult with rankings and exported agents
|
| 172 |
"""
|
| 173 |
start_time = datetime.now()
|
| 174 |
timestamp = start_time.strftime("%Y%m%d_%H%M%S")
|
| 175 |
run_dir = self.output_dir / timestamp
|
| 176 |
run_dir.mkdir(parents=True, exist_ok=True)
|
| 177 |
|
|
|
|
| 178 |
setup_tracing("flow-optimizer")
|
| 179 |
+
self._save_config(candidates, tasks, run_dir)
|
| 180 |
|
| 181 |
print("=" * 70)
|
| 182 |
print(" FLOW OPTIMIZER")
|
| 183 |
print("=" * 70)
|
| 184 |
+
print(f" Candidates: {len(candidates)}")
|
| 185 |
+
print(f" Tasks: {len(tasks)}")
|
| 186 |
+
print(f" Total: {len(candidates) * len(tasks)} experiments")
|
| 187 |
+
print(f" Parallel: {self.parallel}")
|
| 188 |
+
print(f" Output: {run_dir}")
|
| 189 |
print("=" * 70)
|
| 190 |
|
|
|
|
| 191 |
evaluator = None
|
| 192 |
if self.use_llm_evaluator:
|
| 193 |
evaluator = self._create_evaluator()
|
| 194 |
|
|
|
|
| 195 |
task_results = await self._run_parallel(
|
| 196 |
+
candidates, tasks, run_dir, evaluator, progress_callback
|
| 197 |
)
|
| 198 |
|
| 199 |
+
summaries = self._aggregate_results(task_results, candidates)
|
|
|
|
|
|
|
|
|
|
| 200 |
pareto_names = self._compute_pareto(summaries)
|
| 201 |
|
|
|
|
| 202 |
rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
|
| 203 |
rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
|
| 204 |
rank_by_efficiency = sorted(
|
|
|
|
| 207 |
reverse=True,
|
| 208 |
)
|
| 209 |
|
|
|
|
| 210 |
summary_dicts = [s.to_dict() for s in summaries]
|
| 211 |
+
exported = export_optimization_results(
|
| 212 |
summary_dicts, pareto_names, run_dir, timestamp
|
| 213 |
)
|
| 214 |
|
|
|
|
| 219 |
output_dir=run_dir,
|
| 220 |
summaries=summaries,
|
| 221 |
pareto_frontier=pareto_names,
|
| 222 |
+
exported_agents=exported,
|
| 223 |
rank_by_score=[s.name for s in rank_by_score],
|
| 224 |
rank_by_tokens=[s.name for s in rank_by_tokens],
|
| 225 |
rank_by_efficiency=[s.name for s in rank_by_efficiency],
|
|
|
|
| 227 |
total_duration_seconds=(end_time - start_time).total_seconds(),
|
| 228 |
)
|
| 229 |
|
|
|
|
| 230 |
self._save_results(result, run_dir)
|
|
|
|
|
|
|
| 231 |
self._print_summary(result)
|
| 232 |
|
| 233 |
return result
|
| 234 |
|
| 235 |
async def _run_parallel(
|
| 236 |
self,
|
| 237 |
+
candidates: list[Candidate],
|
| 238 |
tasks: list[Task],
|
| 239 |
run_dir: Path,
|
| 240 |
evaluator: LLMEvaluator | None,
|
| 241 |
progress_callback: Callable[[int, int, str, str], None] | None,
|
| 242 |
) -> list[TaskResult]:
|
| 243 |
+
"""Run all candidate-task pairs in parallel with semaphore control."""
|
| 244 |
semaphore = asyncio.Semaphore(self.parallel)
|
| 245 |
+
total = len(candidates) * len(tasks)
|
| 246 |
completed = 0
|
| 247 |
lock = asyncio.Lock()
|
| 248 |
|
| 249 |
+
async def run_one(candidate: Candidate, task: Task) -> TaskResult:
|
| 250 |
nonlocal completed
|
| 251 |
async with semaphore:
|
| 252 |
+
workspace = run_dir / "workspaces" / candidate.agent.name / task.name
|
| 253 |
workspace.mkdir(parents=True, exist_ok=True)
|
| 254 |
|
| 255 |
+
result = await self._run_single(candidate, task, workspace, evaluator)
|
| 256 |
|
| 257 |
async with lock:
|
| 258 |
completed += 1
|
| 259 |
status = "✓" if result.eval_passed else "✗"
|
| 260 |
print(
|
| 261 |
+
f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
|
| 262 |
f"{status} score={result.eval_score:.2f} "
|
| 263 |
f"tokens={result.metrics.total_tokens:,}"
|
| 264 |
)
|
| 265 |
if progress_callback:
|
| 266 |
+
progress_callback(completed, total, candidate.agent.name, task.name)
|
| 267 |
|
| 268 |
return result
|
| 269 |
|
| 270 |
+
coroutines = [run_one(c, t) for c in candidates for t in tasks]
|
|
|
|
|
|
|
|
|
|
| 271 |
gather_results = await asyncio.gather(*coroutines, return_exceptions=True)
|
| 272 |
|
|
|
|
| 273 |
valid_results: list[TaskResult] = []
|
| 274 |
for r in gather_results:
|
| 275 |
if isinstance(r, BaseException):
|
|
|
|
| 281 |
|
| 282 |
async def _run_single(
|
| 283 |
self,
|
| 284 |
+
candidate: Candidate,
|
| 285 |
task: Task,
|
| 286 |
workspace: Path,
|
| 287 |
evaluator: LLMEvaluator | None,
|
| 288 |
) -> TaskResult:
|
| 289 |
+
"""Run a single candidate-task experiment."""
|
| 290 |
+
harness = create_harness_from_agent(candidate.agent, workspace)
|
| 291 |
|
| 292 |
try:
|
| 293 |
runner = FlowExperimentRunner(keep_workspace=True)
|
| 294 |
run_result = await runner.run(harness, task, workspace=workspace)
|
| 295 |
metrics = extract_metrics(run_result.trace)
|
| 296 |
|
|
|
|
| 297 |
if evaluator:
|
| 298 |
eval_result = await evaluator.evaluate(run_result)
|
| 299 |
eval_score = eval_result.score
|
| 300 |
eval_passed = eval_result.passed
|
| 301 |
eval_reasoning = eval_result.reasoning
|
| 302 |
else:
|
|
|
|
| 303 |
eval_score = 1.0 if run_result.success else 0.0
|
| 304 |
eval_passed = run_result.success
|
| 305 |
eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"
|
| 306 |
|
| 307 |
return TaskResult(
|
| 308 |
+
candidate_name=candidate.agent.name,
|
| 309 |
task_name=task.name,
|
| 310 |
run_result=run_result,
|
| 311 |
metrics=metrics,
|
|
|
|
| 319 |
def _aggregate_results(
|
| 320 |
self,
|
| 321 |
task_results: list[TaskResult],
|
| 322 |
+
candidates: list[Candidate],
|
| 323 |
+
) -> list[CandidateSummary]:
|
| 324 |
+
"""Aggregate task results into candidate summaries."""
|
| 325 |
+
candidate_map = {c.agent.name: c for c in candidates}
|
| 326 |
+
results_by_name: dict[str, list[TaskResult]] = {c.agent.name: [] for c in candidates}
|
| 327 |
|
| 328 |
for result in task_results:
|
| 329 |
+
if result.candidate_name in results_by_name:
|
| 330 |
+
results_by_name[result.candidate_name].append(result)
|
| 331 |
|
| 332 |
summaries = []
|
| 333 |
+
for name, results in results_by_name.items():
|
| 334 |
if not results:
|
| 335 |
continue
|
| 336 |
|
| 337 |
+
candidate = candidate_map[name]
|
| 338 |
+
summary = CandidateSummary(
|
| 339 |
name=name,
|
| 340 |
+
candidate=candidate,
|
| 341 |
task_results=results,
|
| 342 |
avg_score=sum(r.eval_score for r in results) / len(results),
|
| 343 |
avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
|
|
|
|
| 350 |
|
| 351 |
return summaries
|
| 352 |
|
| 353 |
+
def _compute_pareto(self, summaries: list[CandidateSummary]) -> list[str]:
|
| 354 |
"""Compute Pareto frontier (maximize score, minimize tokens)."""
|
|
|
|
| 355 |
pareto_names = compute_pareto_frontier(summaries)
|
| 356 |
|
|
|
|
| 357 |
for summary in summaries:
|
| 358 |
if summary.name in pareto_names:
|
| 359 |
summary.is_pareto_optimal = True
|
| 360 |
summary.pareto_rank = 0
|
| 361 |
else:
|
| 362 |
summary.is_pareto_optimal = False
|
| 363 |
+
summary.pareto_rank = 1
|
| 364 |
|
| 365 |
return pareto_names
|
| 366 |
|
|
|
|
| 388 |
|
| 389 |
def _save_config(
|
| 390 |
self,
|
| 391 |
+
candidates: list[Candidate],
|
| 392 |
tasks: list[Task],
|
| 393 |
run_dir: Path,
|
| 394 |
) -> None:
|
|
|
|
| 396 |
with open(run_dir / "optimization_config.json", "w") as f:
|
| 397 |
json.dump(
|
| 398 |
{
|
| 399 |
+
"candidates": [asdict(c) for c in candidates],
|
| 400 |
"tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
|
| 401 |
"parallel": self.parallel,
|
| 402 |
"use_llm_evaluator": self.use_llm_evaluator,
|
|
|
|
| 415 |
"rank_by_score": result.rank_by_score,
|
| 416 |
"rank_by_tokens": result.rank_by_tokens,
|
| 417 |
"rank_by_efficiency": result.rank_by_efficiency,
|
| 418 |
+
"exported_agents": {k: str(v) for k, v in result.exported_agents.items()},
|
| 419 |
"summaries": [s.to_dict() for s in result.summaries],
|
| 420 |
}
|
| 421 |
|
|
|
|
| 428 |
print(" OPTIMIZATION RESULTS")
|
| 429 |
print("=" * 70)
|
| 430 |
|
| 431 |
+
print(f"\n{'Candidate':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
|
|
|
|
| 432 |
print("-" * 65)
|
| 433 |
|
| 434 |
for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
|
|
|
|
| 442 |
print(f"Pareto frontier: {result.pareto_frontier}")
|
| 443 |
print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
|
| 444 |
print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
|
| 445 |
+
print("\nExported agents:")
|
| 446 |
+
for name, path in result.exported_agents.items():
|
| 447 |
print(f" {name}: {path}")
|
| 448 |
print(f"\nResults saved to: {result.output_dir}")
|
| 449 |
|
| 450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
def load_tasks_from_jsonl(path: Path) -> list[Task]:
|
| 452 |
"""Load tasks from a JSONL file.
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
Args:
|
| 455 |
path: Path to JSONL file
|
| 456 |
|
| 457 |
Returns:
|
| 458 |
List of Task objects
|
| 459 |
"""
|
| 460 |
+
return _load_tasks_impl(path)
|
|
|
|
|
|
src/flow/experiments/types.py
CHANGED
|
@@ -109,7 +109,7 @@ class EvalResult:
|
|
| 109 |
_DATA_DIR = Path(__file__).parent / "data" / "tasks"
|
| 110 |
|
| 111 |
|
| 112 |
-
def
|
| 113 |
"""Load tasks from a JSONL file.
|
| 114 |
|
| 115 |
Each line should be a JSON object with:
|
|
@@ -186,4 +186,4 @@ def get_task_suite(suite_name: str) -> list[Task]:
|
|
| 186 |
if not path.exists():
|
| 187 |
available = ", ".join(get_available_suites())
|
| 188 |
raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}")
|
| 189 |
-
return
|
|
|
|
| 109 |
_DATA_DIR = Path(__file__).parent / "data" / "tasks"
|
| 110 |
|
| 111 |
|
| 112 |
+
def load_tasks_from_jsonl(path: Path) -> list[Task]:
|
| 113 |
"""Load tasks from a JSONL file.
|
| 114 |
|
| 115 |
Each line should be a JSON object with:
|
|
|
|
| 186 |
if not path.exists():
|
| 187 |
available = ", ".join(get_available_suites())
|
| 188 |
raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}")
|
| 189 |
+
return load_tasks_from_jsonl(path)
|
src/flow/harness/maf/agent.py
CHANGED
|
@@ -9,9 +9,10 @@ from collections.abc import Callable, Coroutine, Sequence
|
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import TYPE_CHECKING, Any
|
| 11 |
|
|
|
|
| 12 |
from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
|
| 13 |
-
from flow.
|
| 14 |
-
from flow.
|
| 15 |
|
| 16 |
if TYPE_CHECKING:
|
| 17 |
from agent_framework import ChatAgent
|
|
@@ -37,10 +38,7 @@ def create_agent(
|
|
| 37 |
workspace: Path | None = None,
|
| 38 |
memory_path: Path | None = None,
|
| 39 |
# Tool configuration
|
| 40 |
-
tools: Sequence[Callable[..., Coroutine[Any, Any, str]]]
|
| 41 |
-
enable_memory_tool: bool = True,
|
| 42 |
-
enable_sub_agent: bool = False,
|
| 43 |
-
bash_timeout: int = 120,
|
| 44 |
# Context engineering
|
| 45 |
enable_compaction: bool = True,
|
| 46 |
compaction_head_size: int = 10,
|
|
@@ -52,8 +50,6 @@ def create_agent(
|
|
| 52 |
- Azure OpenAI as the backend
|
| 53 |
- Flow's standard tools (coding, execution, memory)
|
| 54 |
- Optional message compaction for long conversations
|
| 55 |
-
- Optional agent-managed memory tool
|
| 56 |
-
- Optional sub-agent for isolated research
|
| 57 |
|
| 58 |
Args:
|
| 59 |
endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
|
|
@@ -64,10 +60,11 @@ def create_agent(
|
|
| 64 |
instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
|
| 65 |
workspace: Directory for file operations. Defaults to ~/.flow/workspace.
|
| 66 |
memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
|
| 67 |
-
tools:
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
| 71 |
enable_compaction: Whether to enable head+tail message compaction.
|
| 72 |
compaction_head_size: Number of initial messages to keep.
|
| 73 |
compaction_tail_size: Number of recent messages to keep.
|
|
@@ -81,9 +78,12 @@ def create_agent(
|
|
| 81 |
|
| 82 |
Example:
|
| 83 |
>>> from flow.harness.maf import create_agent
|
| 84 |
-
>>>
|
| 85 |
-
>>>
|
| 86 |
-
>>>
|
|
|
|
|
|
|
|
|
|
| 87 |
"""
|
| 88 |
try:
|
| 89 |
from agent_framework import ChatAgent, ai_function
|
|
@@ -123,19 +123,18 @@ def create_agent(
|
|
| 123 |
workspace.mkdir(parents=True, exist_ok=True)
|
| 124 |
memory_path.mkdir(parents=True, exist_ok=True)
|
| 125 |
|
| 126 |
-
# Create or use provided
|
| 127 |
-
if tools
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
)
|
| 135 |
|
| 136 |
# Wrap tools with ai_function decorator for Agent Framework
|
| 137 |
converted_tools = []
|
| 138 |
-
for tool_func in
|
| 139 |
tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
|
| 140 |
tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
|
| 141 |
wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
|
|
@@ -163,11 +162,22 @@ def create_agent(
|
|
| 163 |
f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
|
| 164 |
)
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
# Create the agent
|
| 167 |
agent = ChatAgent(
|
| 168 |
name=name,
|
| 169 |
description="Autonomous coding agent",
|
| 170 |
-
instructions=instructions or
|
|
|
|
|
|
|
| 171 |
chat_client=client,
|
| 172 |
tools=converted_tools,
|
| 173 |
chat_message_store_factory=message_store_factory,
|
|
|
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import TYPE_CHECKING, Any
|
| 11 |
|
| 12 |
+
from flow.experiments.models import TOOL_PRESETS, resolve_tools
|
| 13 |
from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
|
| 14 |
+
from flow.harness.maf.tools import build_tools
|
| 15 |
+
from flow.prompts import build_instructions
|
| 16 |
|
| 17 |
if TYPE_CHECKING:
|
| 18 |
from agent_framework import ChatAgent
|
|
|
|
| 38 |
workspace: Path | None = None,
|
| 39 |
memory_path: Path | None = None,
|
| 40 |
# Tool configuration
|
| 41 |
+
tools: str | list[str] | dict[str, dict[str, Any]] | Sequence[Callable[..., Coroutine[Any, Any, str]]] = "standard",
|
|
|
|
|
|
|
|
|
|
| 42 |
# Context engineering
|
| 43 |
enable_compaction: bool = True,
|
| 44 |
compaction_head_size: int = 10,
|
|
|
|
| 50 |
- Azure OpenAI as the backend
|
| 51 |
- Flow's standard tools (coding, execution, memory)
|
| 52 |
- Optional message compaction for long conversations
|
|
|
|
|
|
|
| 53 |
|
| 54 |
Args:
|
| 55 |
endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
|
|
|
|
| 60 |
instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
|
| 61 |
workspace: Directory for file operations. Defaults to ~/.flow/workspace.
|
| 62 |
memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
|
| 63 |
+
tools: Tool configuration - can be:
|
| 64 |
+
- str: Preset name ("standard", "minimal", "full", "readonly")
|
| 65 |
+
- list[str]: List of tool names
|
| 66 |
+
- dict[str, dict]: Full specification with per-tool configs
|
| 67 |
+
- Sequence[Callable]: Pre-built tool functions (advanced)
|
| 68 |
enable_compaction: Whether to enable head+tail message compaction.
|
| 69 |
compaction_head_size: Number of initial messages to keep.
|
| 70 |
compaction_tail_size: Number of recent messages to keep.
|
|
|
|
| 78 |
|
| 79 |
Example:
|
| 80 |
>>> from flow.harness.maf import create_agent
|
| 81 |
+
>>> # Using preset
|
| 82 |
+
>>> agent = create_agent(tools="standard")
|
| 83 |
+
>>> # Using explicit list
|
| 84 |
+
>>> agent = create_agent(tools=["read_file", "write_file", "bash_execute"])
|
| 85 |
+
>>> # Using full config
|
| 86 |
+
>>> agent = create_agent(tools={"bash_execute": {"timeout": 60}, "memory": {}})
|
| 87 |
"""
|
| 88 |
try:
|
| 89 |
from agent_framework import ChatAgent, ai_function
|
|
|
|
| 123 |
workspace.mkdir(parents=True, exist_ok=True)
|
| 124 |
memory_path.mkdir(parents=True, exist_ok=True)
|
| 125 |
|
| 126 |
+
# Create tools from specification or use provided functions
|
| 127 |
+
if isinstance(tools, (str, list, dict)):
|
| 128 |
+
# Resolve to dict form and build tools
|
| 129 |
+
tools_spec = resolve_tools(tools)
|
| 130 |
+
tool_functions = build_tools(tools_spec, workspace, memory_path)
|
| 131 |
+
else:
|
| 132 |
+
# Already a sequence of callable tools
|
| 133 |
+
tool_functions = tools
|
|
|
|
| 134 |
|
| 135 |
# Wrap tools with ai_function decorator for Agent Framework
|
| 136 |
converted_tools = []
|
| 137 |
+
for tool_func in tool_functions:
|
| 138 |
tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
|
| 139 |
tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
|
| 140 |
wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
|
|
|
|
| 162 |
f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
|
| 163 |
)
|
| 164 |
|
| 165 |
+
# Determine if memory is enabled for instructions
|
| 166 |
+
enable_memory = False
|
| 167 |
+
if isinstance(tools, str):
|
| 168 |
+
enable_memory = "memory" in TOOL_PRESETS.get(tools, {})
|
| 169 |
+
elif isinstance(tools, list):
|
| 170 |
+
enable_memory = "memory" in tools
|
| 171 |
+
elif isinstance(tools, dict):
|
| 172 |
+
enable_memory = "memory" in tools
|
| 173 |
+
|
| 174 |
# Create the agent
|
| 175 |
agent = ChatAgent(
|
| 176 |
name=name,
|
| 177 |
description="Autonomous coding agent",
|
| 178 |
+
instructions=instructions or build_instructions(
|
| 179 |
+
enable_memory=enable_memory,
|
| 180 |
+
),
|
| 181 |
chat_client=client,
|
| 182 |
tools=converted_tools,
|
| 183 |
chat_message_store_factory=message_store_factory,
|
src/flow/harness/maf/tools/__init__.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MAF-specific tools for the Flow agent.
|
| 2 |
+
|
| 3 |
+
This module provides tools that work with the Microsoft Agent Framework harness.
|
| 4 |
+
Tools are created based on a specification dict that maps tool names to their configs.
|
| 5 |
+
|
| 6 |
+
Available tools:
|
| 7 |
+
- read_file: Read file contents
|
| 8 |
+
- write_file: Write/edit file content
|
| 9 |
+
- list_directory: List directory contents
|
| 10 |
+
- grep_search: Search for text patterns
|
| 11 |
+
- bash_execute: Execute bash commands (config: timeout)
|
| 12 |
+
- check_processes: Manage background processes
|
| 13 |
+
- python_repl: Execute Python code
|
| 14 |
+
- think: Explicit reasoning tool
|
| 15 |
+
- task_done: Task completion marker
|
| 16 |
+
- memory: Persistent memory storage
|
| 17 |
+
- sub_agent: Isolated research sub-agent (config: model)
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from collections.abc import Callable, Coroutine, Sequence
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Any
|
| 23 |
+
|
| 24 |
+
from flow.harness.maf.tools.coding import (
|
| 25 |
+
create_grep_search_tool,
|
| 26 |
+
create_list_directory_tool,
|
| 27 |
+
create_read_file_tool,
|
| 28 |
+
create_write_file_tool,
|
| 29 |
+
)
|
| 30 |
+
from flow.harness.maf.tools.core import task_done, think
|
| 31 |
+
from flow.harness.maf.tools.execution import (
|
| 32 |
+
create_bash_execute_tool,
|
| 33 |
+
create_check_processes_tool,
|
| 34 |
+
create_python_repl_tool,
|
| 35 |
+
)
|
| 36 |
+
from flow.harness.maf.tools.memory import create_memory_tool
|
| 37 |
+
from flow.harness.maf.tools.sub_agent import create_sub_agent_tool
|
| 38 |
+
|
| 39 |
+
__all__ = [
|
| 40 |
+
"build_tools",
|
| 41 |
+
"create_bash_execute_tool",
|
| 42 |
+
"create_check_processes_tool",
|
| 43 |
+
"create_grep_search_tool",
|
| 44 |
+
"create_list_directory_tool",
|
| 45 |
+
"create_memory_tool",
|
| 46 |
+
"create_python_repl_tool",
|
| 47 |
+
"create_read_file_tool",
|
| 48 |
+
"create_sub_agent_tool",
|
| 49 |
+
"create_write_file_tool",
|
| 50 |
+
"task_done",
|
| 51 |
+
"think",
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Registry of tool factories that don't require config
# Maps tool name -> factory function(workspace, memory_path) -> tool
# NOTE(review): this registry is empty and is not consulted by _create_tool
# in this module — presumably reserved for future registration; confirm
# whether it is still needed or can be removed.
_SIMPLE_TOOL_FACTORIES: dict[str, Callable[..., Any]] = {}

# Registry of tools that are standalone (no factory needed)
# These callables are returned as-is by _create_tool: they need no binding
# to a workspace or memory path and take no per-tool configuration.
_STANDALONE_TOOLS: dict[str, Callable[..., Coroutine[Any, Any, str]]] = {
    "think": think,
    "task_done": task_done,
}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def build_tools(
    tools_spec: dict[str, dict[str, Any]],
    workspace: Path,
    memory_path: Path,
) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Build tool functions from a specification dict.

    This is the main entry point for creating tools based on a resolved
    tool specification (from resolve_tools()).

    Args:
        tools_spec: Dict mapping tool names to their config dicts.
            e.g., {"bash_execute": {"timeout": 60}, "read_file": {}}
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory

    Returns:
        List of tool functions ready to use with MAF

    Example:
        >>> from flow.experiments.models import resolve_tools
        >>> tools_spec = resolve_tools("standard")
        >>> tools = build_tools(tools_spec, workspace, memory_path)
    """
    # Normalize both roots to absolute paths before handing them to factories.
    root = Path(workspace).resolve()
    mem_root = Path(memory_path).resolve()

    # _create_tool returns None for unrecognized names; those entries are
    # dropped here so callers always receive a list of usable tools.
    return [
        fn
        for name, cfg in tools_spec.items()
        if (fn := _create_tool(name, cfg, root, mem_root)) is not None
    ]
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _create_tool(
|
| 104 |
+
name: str,
|
| 105 |
+
config: dict[str, Any],
|
| 106 |
+
workspace: Path,
|
| 107 |
+
memory_path: Path,
|
| 108 |
+
) -> Callable[..., Coroutine[Any, Any, str]] | None:
|
| 109 |
+
"""Create a single tool by name with the given config.
|
| 110 |
+
|
| 111 |
+
Args:
|
| 112 |
+
name: Tool name (e.g., "read_file", "bash_execute")
|
| 113 |
+
config: Tool-specific configuration dict
|
| 114 |
+
workspace: Root directory for file operations
|
| 115 |
+
memory_path: Directory for persistent memory
|
| 116 |
+
|
| 117 |
+
Returns:
|
| 118 |
+
Tool function or None if unknown tool name
|
| 119 |
+
"""
|
| 120 |
+
# Standalone tools (no config needed)
|
| 121 |
+
if name in _STANDALONE_TOOLS:
|
| 122 |
+
return _STANDALONE_TOOLS[name]
|
| 123 |
+
|
| 124 |
+
# Coding tools
|
| 125 |
+
if name == "read_file":
|
| 126 |
+
return create_read_file_tool(workspace)
|
| 127 |
+
if name == "write_file":
|
| 128 |
+
return create_write_file_tool(workspace)
|
| 129 |
+
if name == "list_directory":
|
| 130 |
+
return create_list_directory_tool(workspace)
|
| 131 |
+
if name == "grep_search":
|
| 132 |
+
return create_grep_search_tool(workspace)
|
| 133 |
+
|
| 134 |
+
# Execution tools
|
| 135 |
+
if name == "bash_execute":
|
| 136 |
+
timeout = config.get("timeout", 120)
|
| 137 |
+
return create_bash_execute_tool(workspace, memory_path, timeout)
|
| 138 |
+
if name == "check_processes":
|
| 139 |
+
return create_check_processes_tool(workspace, memory_path)
|
| 140 |
+
if name == "python_repl":
|
| 141 |
+
return create_python_repl_tool(workspace)
|
| 142 |
+
|
| 143 |
+
# Memory tool
|
| 144 |
+
if name == "memory":
|
| 145 |
+
return create_memory_tool(memory_path)
|
| 146 |
+
|
| 147 |
+
# Sub-agent tool
|
| 148 |
+
if name == "sub_agent":
|
| 149 |
+
model = config.get("model", "gpt-4o-mini")
|
| 150 |
+
return create_sub_agent_tool(workspace, model=model)
|
| 151 |
+
|
| 152 |
+
# Unknown tool - log warning and skip
|
| 153 |
+
import logging
|
| 154 |
+
|
| 155 |
+
logger = logging.getLogger(__name__)
|
| 156 |
+
logger.warning(f"Unknown tool name: {name}. Skipping.")
|
| 157 |
+
return None
|
src/flow/{tools → harness/maf/tools}/coding.py
RENAMED
|
File without changes
|
src/flow/{tools → harness/maf/tools}/core.py
RENAMED
|
File without changes
|
src/flow/{tools → harness/maf/tools}/execution.py
RENAMED
|
File without changes
|
src/flow/{tools → harness/maf/tools}/memory.py
RENAMED
|
File without changes
|
src/flow/{tools → harness/maf/tools}/sub_agent.py
RENAMED
|
@@ -100,12 +100,20 @@ def create_sub_agent_tool(
|
|
| 100 |
|
| 101 |
# Create basic tools for the sub-agent
|
| 102 |
# Keep it minimal - just what's needed for research
|
| 103 |
-
from flow.tools.coding import
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# Convert tools to agent_framework format
|
| 111 |
from agent_framework import ai_function
|
|
|
|
| 100 |
|
| 101 |
# Create basic tools for the sub-agent
|
| 102 |
# Keep it minimal - just what's needed for research
|
| 103 |
+
from flow.harness.maf.tools.coding import (
|
| 104 |
+
create_grep_search_tool,
|
| 105 |
+
create_list_directory_tool,
|
| 106 |
+
create_read_file_tool,
|
| 107 |
+
)
|
| 108 |
+
from flow.harness.maf.tools.core import task_done, think
|
| 109 |
+
|
| 110 |
+
sub_tools: list[Callable[..., Any]] = [
|
| 111 |
+
create_read_file_tool(workspace),
|
| 112 |
+
create_list_directory_tool(workspace),
|
| 113 |
+
create_grep_search_tool(workspace),
|
| 114 |
+
think,
|
| 115 |
+
task_done,
|
| 116 |
+
]
|
| 117 |
|
| 118 |
# Convert tools to agent_framework format
|
| 119 |
from agent_framework import ai_function
|
src/flow/prompts.py
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
"""System prompts for the Flow agent.
|
| 2 |
|
| 3 |
Defines the structured workflow for software engineering tasks.
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
|
| 8 |
|
| 9 |
## CORE PRINCIPLE: BE AUTONOMOUS
|
|
@@ -22,7 +27,9 @@ When asked to solve a task:
|
|
| 22 |
**Example - GOOD (autonomous):**
|
| 23 |
> *writes code* → *executes code* → *sees output* → *fixes any errors*
|
| 24 |
> → "Done! The script ran successfully and output X."
|
|
|
|
| 25 |
|
|
|
|
| 26 |
---
|
| 27 |
|
| 28 |
## YOUR CAPABILITIES
|
|
@@ -41,35 +48,23 @@ When asked to solve a task:
|
|
| 41 |
- `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
|
| 42 |
- `web_fetch`: Fetch and read content from URLs
|
| 43 |
|
| 44 |
-
**Memory Tools:**
|
| 45 |
-
- `memory`: Persistent storage that survives across conversations
|
| 46 |
-
- view: See directory or file contents
|
| 47 |
-
- create: Create new files
|
| 48 |
-
- str_replace: Edit existing files
|
| 49 |
-
- append: Add to files
|
| 50 |
-
- search: Find text across memory
|
| 51 |
-
- delete: Remove files
|
| 52 |
-
|
| 53 |
**Thinking Tools:**
|
| 54 |
- `think`: Pause to reason through complex problems
|
| 55 |
- `task_done`: Report when task is complete or blocked
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
- `skills`: Discover and load domain-specific expertise
|
| 59 |
-
- `skills(action='list')`: See available skills with descriptions
|
| 60 |
-
- `skills(action='load', name='skill-name')`: Load full skill content
|
| 61 |
-
|
| 62 |
---
|
| 63 |
|
| 64 |
## WORKFLOW
|
| 65 |
|
| 66 |
### 1. UNDERSTAND
|
| 67 |
- Read the user's request carefully
|
| 68 |
-
- **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
|
| 69 |
- Use `list_directory` to understand the workspace structure
|
| 70 |
- Use `grep_search` to find relevant existing code
|
| 71 |
-
|
| 72 |
|
|
|
|
| 73 |
### 2. PLAN
|
| 74 |
- Use `think` tool to plan your approach for complex tasks
|
| 75 |
- Break down into small, testable steps
|
|
@@ -120,7 +115,9 @@ bash_execute("cd project && npm run build") # Production build must succeed
|
|
| 120 |
- Clean up any background processes you started
|
| 121 |
- Call `task_done` with status and summary
|
| 122 |
- Include files created and suggested next steps
|
|
|
|
| 123 |
|
|
|
|
| 124 |
---
|
| 125 |
|
| 126 |
## WORKSPACE
|
|
@@ -139,50 +136,9 @@ Your workspace is at `~/.flow/workspace/`
|
|
| 139 |
- Each `bash_execute` runs from workspace root in a fresh shell
|
| 140 |
- Use `cd project && command` for commands in subdirectories
|
| 141 |
- Multiple commands: `cd project && cmd1 && cmd2`
|
|
|
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
## MEMORY
|
| 146 |
-
|
| 147 |
-
Your memory persists at `~/.flow/memory/`
|
| 148 |
-
|
| 149 |
-
**Recommended structure:**
|
| 150 |
-
- `/memory/patterns/` - Reusable solutions and code patterns
|
| 151 |
-
- `/memory/projects/` - Per-project context and notes
|
| 152 |
-
- `/memory/decisions/` - Why you made certain choices
|
| 153 |
-
|
| 154 |
-
**Best practices:**
|
| 155 |
-
When storing information, include context:
|
| 156 |
-
- **Date**: When was this created/learned?
|
| 157 |
-
- **Project**: What project did this come from?
|
| 158 |
-
- **Context**: Why was this approach chosen?
|
| 159 |
-
|
| 160 |
-
**Example pattern file** (`/memory/patterns/fastapi_cors.md`):
|
| 161 |
-
```markdown
|
| 162 |
-
# FastAPI CORS Setup
|
| 163 |
-
Created: 2025-01-15
|
| 164 |
-
Source: sleep_tracker project
|
| 165 |
-
|
| 166 |
-
## Pattern
|
| 167 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 168 |
-
app.add_middleware(
|
| 169 |
-
CORSMiddleware,
|
| 170 |
-
allow_origins=["*"],
|
| 171 |
-
allow_methods=["*"],
|
| 172 |
-
allow_headers=["*"],
|
| 173 |
-
)
|
| 174 |
-
|
| 175 |
-
## When to use
|
| 176 |
-
- Full-stack apps with separate frontend/backend
|
| 177 |
-
- Frontend on different port than backend
|
| 178 |
-
|
| 179 |
-
## Notes
|
| 180 |
-
- Must add before routes
|
| 181 |
-
- Restrict origins in production
|
| 182 |
-
```
|
| 183 |
-
|
| 184 |
-
**Check memory first** - you may have solved similar problems before!
|
| 185 |
-
|
| 186 |
---
|
| 187 |
|
| 188 |
## CLI TOOLS
|
|
@@ -210,7 +166,9 @@ npm install @shadcn/ui
|
|
| 210 |
npx shadcn@latest init --defaults --yes
|
| 211 |
npx shadcn@latest add button card --yes
|
| 212 |
```
|
|
|
|
| 213 |
|
|
|
|
| 214 |
---
|
| 215 |
|
| 216 |
## FULL-STACK APPS
|
|
@@ -235,7 +193,9 @@ app.add_middleware(
|
|
| 235 |
cd backend && python -c "from main import app; print('Backend OK')"
|
| 236 |
cd frontend && npm run build && echo "Frontend OK"
|
| 237 |
```
|
|
|
|
| 238 |
|
|
|
|
| 239 |
---
|
| 240 |
|
| 241 |
## BACKGROUND PROCESSES
|
|
@@ -257,9 +217,6 @@ check_processes(action="list")
|
|
| 257 |
check_processes(action="kill", pid=12345)
|
| 258 |
```
|
| 259 |
|
| 260 |
-
**Process registry** is at `/memory/processes.md` - view it with:
|
| 261 |
-
`memory(command='view', path='/memory/processes.md')`
|
| 262 |
-
|
| 263 |
**IMPORTANT:**
|
| 264 |
- NEVER start servers without `background=True` - they will timeout after 120s
|
| 265 |
- ALWAYS clean up background processes when done testing
|
|
@@ -276,48 +233,19 @@ check_processes(action="cleanup") # Kill all when done
|
|
| 276 |
# Bad - will timeout!
|
| 277 |
bash_execute("uvicorn main:app --port 8000") # Blocks forever
|
| 278 |
```
|
|
|
|
| 279 |
|
|
|
|
| 280 |
---
|
| 281 |
|
| 282 |
## ERROR HANDLING
|
| 283 |
|
| 284 |
- If a command fails, analyze the error and try alternatives
|
| 285 |
-
- Log failures and solutions to memory for future reference
|
| 286 |
- Don't give up after first failure - iterate
|
| 287 |
- If truly blocked, call `task_done` with status="incomplete" and explain why
|
|
|
|
| 288 |
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
## SKILLS
|
| 292 |
-
|
| 293 |
-
**If the `skills` tool is available**, use it to access domain-specific expertise:
|
| 294 |
-
|
| 295 |
-
```python
|
| 296 |
-
# At the start of complex tasks, discover what expertise is available
|
| 297 |
-
skills(action='list')
|
| 298 |
-
|
| 299 |
-
# Output shows available skills with descriptions:
|
| 300 |
-
# - fastapi-patterns: Build REST APIs with FastAPI...
|
| 301 |
-
# - react-components: Build React components with hooks...
|
| 302 |
-
# - testing-strategies: Write comprehensive tests...
|
| 303 |
-
|
| 304 |
-
# Load relevant skills before implementation
|
| 305 |
-
skills(action='load', name='fastapi-patterns')
|
| 306 |
-
```
|
| 307 |
-
|
| 308 |
-
**Skills provide:**
|
| 309 |
-
- Domain-specific patterns and best practices
|
| 310 |
-
- Code examples and templates
|
| 311 |
-
- Common pitfalls to avoid
|
| 312 |
-
|
| 313 |
-
**When to load skills:**
|
| 314 |
-
- Before starting a new project type (API, frontend, CLI)
|
| 315 |
-
- When working with unfamiliar frameworks
|
| 316 |
-
- For complex tasks requiring specialized knowledge
|
| 317 |
-
|
| 318 |
-
**Skills location:** `~/.flow/skills/`
|
| 319 |
-
Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
|
| 320 |
-
|
| 321 |
---
|
| 322 |
|
| 323 |
## COMPOSING TOOLS FOR COMPLEX TASKS
|
|
@@ -358,7 +286,9 @@ Each skill is a folder with a `SKILL.md` file following the Anthropic Skills sta
|
|
| 358 |
4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
|
| 359 |
5. Analyze error → Fix code → Test again → Iterate until fixed
|
| 360 |
```
|
|
|
|
| 361 |
|
|
|
|
| 362 |
---
|
| 363 |
|
| 364 |
## RESEARCH WORKFLOW
|
|
@@ -388,7 +318,9 @@ async def fetch_data(url):
|
|
| 388 |
# 4. Test it
|
| 389 |
python_repl("import httpx; print(httpx.__version__)")
|
| 390 |
```
|
|
|
|
| 391 |
|
|
|
|
| 392 |
---
|
| 393 |
|
| 394 |
## REMEMBER
|
|
@@ -401,7 +333,212 @@ python_repl("import httpx; print(httpx.__version__)")
|
|
| 401 |
6. **TEST EVERYTHING** - Never assume code works
|
| 402 |
7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
|
| 403 |
8. **CLEAN UP** - Kill background processes when done
|
| 404 |
-
9. **STORE LEARNINGS** - Save patterns to memory for future use
|
| 405 |
|
| 406 |
**Your goal is to deliver RESULTS, not instructions.**
|
| 407 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""System prompts for the Flow agent.
|
| 2 |
|
| 3 |
Defines the structured workflow for software engineering tasks.
|
| 4 |
+
Instructions are composed dynamically based on which tools are enabled.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
# =============================================================================
|
| 8 |
+
# Core instructions - always included
|
| 9 |
+
# =============================================================================
|
| 10 |
+
|
| 11 |
+
_CORE_INTRO = """
|
| 12 |
You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
|
| 13 |
|
| 14 |
## CORE PRINCIPLE: BE AUTONOMOUS
|
|
|
|
| 27 |
**Example - GOOD (autonomous):**
|
| 28 |
> *writes code* → *executes code* → *sees output* → *fixes any errors*
|
| 29 |
> → "Done! The script ran successfully and output X."
|
| 30 |
+
"""
|
| 31 |
|
| 32 |
+
_CORE_CAPABILITIES = """
|
| 33 |
---
|
| 34 |
|
| 35 |
## YOUR CAPABILITIES
|
|
|
|
| 48 |
- `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
|
| 49 |
- `web_fetch`: Fetch and read content from URLs
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
**Thinking Tools:**
|
| 52 |
- `think`: Pause to reason through complex problems
|
| 53 |
- `task_done`: Report when task is complete or blocked
|
| 54 |
+
"""
|
| 55 |
|
| 56 |
+
_CORE_WORKFLOW_UNDERSTAND = """
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
---
|
| 58 |
|
| 59 |
## WORKFLOW
|
| 60 |
|
| 61 |
### 1. UNDERSTAND
|
| 62 |
- Read the user's request carefully
|
|
|
|
| 63 |
- Use `list_directory` to understand the workspace structure
|
| 64 |
- Use `grep_search` to find relevant existing code
|
| 65 |
+
"""
|
| 66 |
|
| 67 |
+
_CORE_WORKFLOW_PLAN_EXECUTE_VERIFY = """
|
| 68 |
### 2. PLAN
|
| 69 |
- Use `think` tool to plan your approach for complex tasks
|
| 70 |
- Break down into small, testable steps
|
|
|
|
| 115 |
- Clean up any background processes you started
|
| 116 |
- Call `task_done` with status and summary
|
| 117 |
- Include files created and suggested next steps
|
| 118 |
+
"""
|
| 119 |
|
| 120 |
+
_CORE_WORKSPACE = """
|
| 121 |
---
|
| 122 |
|
| 123 |
## WORKSPACE
|
|
|
|
| 136 |
- Each `bash_execute` runs from workspace root in a fresh shell
|
| 137 |
- Use `cd project && command` for commands in subdirectories
|
| 138 |
- Multiple commands: `cd project && cmd1 && cmd2`
|
| 139 |
+
"""
|
| 140 |
|
| 141 |
+
_CORE_CLI_TOOLS = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
---
|
| 143 |
|
| 144 |
## CLI TOOLS
|
|
|
|
| 166 |
npx shadcn@latest init --defaults --yes
|
| 167 |
npx shadcn@latest add button card --yes
|
| 168 |
```
|
| 169 |
+
"""
|
| 170 |
|
| 171 |
+
_CORE_FULLSTACK = """
|
| 172 |
---
|
| 173 |
|
| 174 |
## FULL-STACK APPS
|
|
|
|
| 193 |
cd backend && python -c "from main import app; print('Backend OK')"
|
| 194 |
cd frontend && npm run build && echo "Frontend OK"
|
| 195 |
```
|
| 196 |
+
"""
|
| 197 |
|
| 198 |
+
_CORE_BACKGROUND = """
|
| 199 |
---
|
| 200 |
|
| 201 |
## BACKGROUND PROCESSES
|
|
|
|
| 217 |
check_processes(action="kill", pid=12345)
|
| 218 |
```
|
| 219 |
|
|
|
|
|
|
|
|
|
|
| 220 |
**IMPORTANT:**
|
| 221 |
- NEVER start servers without `background=True` - they will timeout after 120s
|
| 222 |
- ALWAYS clean up background processes when done testing
|
|
|
|
| 233 |
# Bad - will timeout!
|
| 234 |
bash_execute("uvicorn main:app --port 8000") # Blocks forever
|
| 235 |
```
|
| 236 |
+
"""
|
| 237 |
|
| 238 |
+
_CORE_ERROR_HANDLING = """
|
| 239 |
---
|
| 240 |
|
| 241 |
## ERROR HANDLING
|
| 242 |
|
| 243 |
- If a command fails, analyze the error and try alternatives
|
|
|
|
| 244 |
- Don't give up after first failure - iterate
|
| 245 |
- If truly blocked, call `task_done` with status="incomplete" and explain why
|
| 246 |
+
"""
|
| 247 |
|
| 248 |
+
_CORE_EXAMPLES = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
---
|
| 250 |
|
| 251 |
## COMPOSING TOOLS FOR COMPLEX TASKS
|
|
|
|
| 286 |
4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
|
| 287 |
5. Analyze error → Fix code → Test again → Iterate until fixed
|
| 288 |
```
|
| 289 |
+
"""
|
| 290 |
|
| 291 |
+
_CORE_RESEARCH = """
|
| 292 |
---
|
| 293 |
|
| 294 |
## RESEARCH WORKFLOW
|
|
|
|
| 318 |
# 4. Test it
|
| 319 |
python_repl("import httpx; print(httpx.__version__)")
|
| 320 |
```
|
| 321 |
+
"""
|
| 322 |
|
| 323 |
+
_CORE_REMEMBER = """
|
| 324 |
---
|
| 325 |
|
| 326 |
## REMEMBER
|
|
|
|
| 333 |
6. **TEST EVERYTHING** - Never assume code works
|
| 334 |
7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
|
| 335 |
8. **CLEAN UP** - Kill background processes when done
|
|
|
|
| 336 |
|
| 337 |
**Your goal is to deliver RESULTS, not instructions.**
|
| 338 |
"""
|
| 339 |
+
|
| 340 |
+
# =============================================================================
|
| 341 |
+
# Optional sections - included only when corresponding tools are enabled
|
| 342 |
+
# =============================================================================
|
| 343 |
+
|
| 344 |
+
_MEMORY_CAPABILITIES = """
|
| 345 |
+
**Memory Tools:**
|
| 346 |
+
- `memory`: Persistent storage that survives across conversations
|
| 347 |
+
- view: See directory or file contents
|
| 348 |
+
- create: Create new files
|
| 349 |
+
- str_replace: Edit existing files
|
| 350 |
+
- append: Add to files
|
| 351 |
+
- search: Find text across memory
|
| 352 |
+
- delete: Remove files
|
| 353 |
+
"""
|
| 354 |
+
|
| 355 |
+
_MEMORY_WORKFLOW_UNDERSTAND = """- Check memory for relevant patterns: `memory(command="view", path="/memory")`
|
| 356 |
+
"""
|
| 357 |
+
|
| 358 |
+
_MEMORY_SECTION = """
|
| 359 |
+
---
|
| 360 |
+
|
| 361 |
+
## MEMORY
|
| 362 |
+
|
| 363 |
+
Your memory persists at `~/.flow/memory/`
|
| 364 |
+
|
| 365 |
+
**Recommended structure:**
|
| 366 |
+
- `/memory/patterns/` - Reusable solutions and code patterns
|
| 367 |
+
- `/memory/projects/` - Per-project context and notes
|
| 368 |
+
- `/memory/decisions/` - Why you made certain choices
|
| 369 |
+
|
| 370 |
+
**Best practices:**
|
| 371 |
+
When storing information, include context:
|
| 372 |
+
- **Date**: When was this created/learned?
|
| 373 |
+
- **Project**: What project did this come from?
|
| 374 |
+
- **Context**: Why was this approach chosen?
|
| 375 |
+
|
| 376 |
+
**Example pattern file** (`/memory/patterns/fastapi_cors.md`):
|
| 377 |
+
```markdown
|
| 378 |
+
# FastAPI CORS Setup
|
| 379 |
+
Created: 2025-01-15
|
| 380 |
+
Source: sleep_tracker project
|
| 381 |
+
|
| 382 |
+
## Pattern
|
| 383 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 384 |
+
app.add_middleware(
|
| 385 |
+
CORSMiddleware,
|
| 386 |
+
allow_origins=["*"],
|
| 387 |
+
allow_methods=["*"],
|
| 388 |
+
allow_headers=["*"],
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
## When to use
|
| 392 |
+
- Full-stack apps with separate frontend/backend
|
| 393 |
+
- Frontend on different port than backend
|
| 394 |
+
|
| 395 |
+
## Notes
|
| 396 |
+
- Must add before routes
|
| 397 |
+
- Restrict origins in production
|
| 398 |
+
```
|
| 399 |
+
|
| 400 |
+
**Check memory first** - you may have solved similar problems before!
|
| 401 |
+
"""
|
| 402 |
+
|
| 403 |
+
_MEMORY_ERROR_HANDLING = """- Log failures and solutions to memory for future reference
|
| 404 |
+
"""
|
| 405 |
+
|
| 406 |
+
_MEMORY_REMEMBER = """9. **STORE LEARNINGS** - Save patterns to memory for future use
|
| 407 |
+
"""
|
| 408 |
+
|
| 409 |
+
_MEMORY_BACKGROUND_PROCESS_REGISTRY = """
|
| 410 |
+
**Process registry** is at `/memory/processes.md` - view it with:
|
| 411 |
+
`memory(command='view', path='/memory/processes.md')`
|
| 412 |
+
"""
|
| 413 |
+
|
| 414 |
+
_SKILLS_CAPABILITIES = """
|
| 415 |
+
**Skills Tool (if available):**
|
| 416 |
+
- `skills`: Discover and load domain-specific expertise
|
| 417 |
+
- `skills(action='list')`: See available skills with descriptions
|
| 418 |
+
- `skills(action='load', name='skill-name')`: Load full skill content
|
| 419 |
+
"""
|
| 420 |
+
|
| 421 |
+
_SKILLS_WORKFLOW_UNDERSTAND = """- **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
|
| 422 |
+
"""
|
| 423 |
+
|
| 424 |
+
_SKILLS_SECTION = """
|
| 425 |
+
---
|
| 426 |
+
|
| 427 |
+
## SKILLS
|
| 428 |
+
|
| 429 |
+
**If the `skills` tool is available**, use it to access domain-specific expertise:
|
| 430 |
+
|
| 431 |
+
```python
|
| 432 |
+
# At the start of complex tasks, discover what expertise is available
|
| 433 |
+
skills(action='list')
|
| 434 |
+
|
| 435 |
+
# Output shows available skills with descriptions:
|
| 436 |
+
# - fastapi-patterns: Build REST APIs with FastAPI...
|
| 437 |
+
# - react-components: Build React components with hooks...
|
| 438 |
+
# - testing-strategies: Write comprehensive tests...
|
| 439 |
+
|
| 440 |
+
# Load relevant skills before implementation
|
| 441 |
+
skills(action='load', name='fastapi-patterns')
|
| 442 |
+
```
|
| 443 |
+
|
| 444 |
+
**Skills provide:**
|
| 445 |
+
- Domain-specific patterns and best practices
|
| 446 |
+
- Code examples and templates
|
| 447 |
+
- Common pitfalls to avoid
|
| 448 |
+
|
| 449 |
+
**When to load skills:**
|
| 450 |
+
- Before starting a new project type (API, frontend, CLI)
|
| 451 |
+
- When working with unfamiliar frameworks
|
| 452 |
+
- For complex tasks requiring specialized knowledge
|
| 453 |
+
|
| 454 |
+
**Skills location:** `~/.flow/skills/`
|
| 455 |
+
Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
|
| 456 |
+
"""
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
# =============================================================================
|
| 460 |
+
# Instruction builder
|
| 461 |
+
# =============================================================================
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def build_instructions(
    *,
    enable_memory: bool = True,
    enable_skills: bool = False,
) -> str:
    """Build agent instructions dynamically based on enabled tools.

    Composes the instruction prompt from core sections plus optional sections
    for memory and skills, so the agent only sees documentation for tools
    it actually has.

    Args:
        enable_memory: Include memory tool documentation.
        enable_skills: Include skills tool documentation.

    Returns:
        Complete instruction string.
    """
    # -- Capabilities section --
    # Each optional addendum is joined onto the core list with a blank-line
    # separator ("\n" between adjacent triple-quoted blocks).
    capability_parts = [_CORE_CAPABILITIES]
    if enable_memory:
        capability_parts.append(_MEMORY_CAPABILITIES)
    if enable_skills:
        capability_parts.append(_SKILLS_CAPABILITIES)
    capabilities = "\n".join(capability_parts)

    # -- Workflow > Understand section --
    # Note the order: the skills hint precedes the memory hint, and both are
    # appended directly (no extra separator).
    skills_hint = _SKILLS_WORKFLOW_UNDERSTAND if enable_skills else ""
    memory_hint = _MEMORY_WORKFLOW_UNDERSTAND if enable_memory else ""
    understand = _CORE_WORKFLOW_UNDERSTAND + skills_hint + memory_hint

    # -- Error handling section --
    error_handling = _CORE_ERROR_HANDLING + (
        _MEMORY_ERROR_HANDLING if enable_memory else ""
    )

    # -- Background processes section --
    background = _CORE_BACKGROUND + (
        _MEMORY_BACKGROUND_PROCESS_REGISTRY if enable_memory else ""
    )

    # -- Remember section --
    remember = _CORE_REMEMBER + (_MEMORY_REMEMBER if enable_memory else "")

    # -- Assemble --
    # Section order mirrors the original hand-written prompt; the dedicated
    # MEMORY and SKILLS sections slot in only when their tools are enabled.
    sections: list[str] = [
        _CORE_INTRO,
        capabilities,
        understand,
        _CORE_WORKFLOW_PLAN_EXECUTE_VERIFY,
        _CORE_WORKSPACE,
    ]
    if enable_memory:
        sections.append(_MEMORY_SECTION)
    sections += [
        _CORE_CLI_TOOLS,
        _CORE_FULLSTACK,
        background,
        error_handling,
    ]
    if enable_skills:
        sections.append(_SKILLS_SECTION)
    sections += [
        _CORE_EXAMPLES,
        _CORE_RESEARCH,
        remember,
    ]

    return "\n".join(sections)
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
# Legacy constant for backwards compatibility.
|
| 543 |
+
# Equivalent to build_instructions(enable_memory=True, enable_skills=True).
|
| 544 |
+
FLOW_AGENT_INSTRUCTIONS = build_instructions(enable_memory=True, enable_skills=True)
|
src/flow/tools/__init__.py
DELETED
|
@@ -1,172 +0,0 @@
|
|
| 1 |
-
"""Flow agent tools.
|
| 2 |
-
|
| 3 |
-
Provides coding, execution, memory, and core tools for software engineering tasks.
|
| 4 |
-
Tools are harness-agnostic - they return plain data that harnesses adapt.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import inspect
|
| 8 |
-
from collections.abc import Callable, Sequence
|
| 9 |
-
from functools import wraps
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import Any, get_type_hints
|
| 12 |
-
|
| 13 |
-
from flow.tools.coding import create_coding_tools
|
| 14 |
-
from flow.tools.core import create_core_tools
|
| 15 |
-
from flow.tools.execution import create_execution_tools
|
| 16 |
-
from flow.tools.memory import create_memory_tool
|
| 17 |
-
from flow.tools.sub_agent import create_sub_agent_tool
|
| 18 |
-
|
| 19 |
-
__all__ = [
|
| 20 |
-
"create_all_tools",
|
| 21 |
-
"create_coding_tools",
|
| 22 |
-
"create_core_tools",
|
| 23 |
-
"create_execution_tools",
|
| 24 |
-
"create_memory_tool",
|
| 25 |
-
"create_sub_agent_tool",
|
| 26 |
-
"get_tool_schema",
|
| 27 |
-
"tool",
|
| 28 |
-
]
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def tool(
|
| 32 |
-
name: str | None = None,
|
| 33 |
-
description: str | None = None,
|
| 34 |
-
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
|
| 35 |
-
"""Decorator to mark a function as an agent tool.
|
| 36 |
-
|
| 37 |
-
This decorator adds metadata to functions that allows harnesses
|
| 38 |
-
to discover and use them as agent tools.
|
| 39 |
-
|
| 40 |
-
Args:
|
| 41 |
-
name: Tool name (defaults to function name)
|
| 42 |
-
description: Tool description (defaults to docstring)
|
| 43 |
-
|
| 44 |
-
Returns:
|
| 45 |
-
Decorated function with tool metadata
|
| 46 |
-
|
| 47 |
-
Example:
|
| 48 |
-
@tool(name="read_file", description="Read file contents")
|
| 49 |
-
async def read_file(path: str) -> str:
|
| 50 |
-
...
|
| 51 |
-
"""
|
| 52 |
-
|
| 53 |
-
def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
|
| 54 |
-
@wraps(func)
|
| 55 |
-
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
| 56 |
-
return func(*args, **kwargs)
|
| 57 |
-
|
| 58 |
-
# Store tool metadata
|
| 59 |
-
wrapper._tool_name = name or func.__name__ # type: ignore[attr-defined]
|
| 60 |
-
wrapper._tool_description = description or func.__doc__ or "" # type: ignore[attr-defined]
|
| 61 |
-
wrapper._is_tool = True # type: ignore[attr-defined]
|
| 62 |
-
|
| 63 |
-
return wrapper
|
| 64 |
-
|
| 65 |
-
return decorator
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
def get_tool_schema(func: Callable[..., Any]) -> dict[str, Any]:
|
| 69 |
-
"""Extract JSON schema from a tool function.
|
| 70 |
-
|
| 71 |
-
Uses type hints and Annotated metadata to build the schema.
|
| 72 |
-
|
| 73 |
-
Args:
|
| 74 |
-
func: Tool function to extract schema from
|
| 75 |
-
|
| 76 |
-
Returns:
|
| 77 |
-
JSON schema dict for the tool's parameters
|
| 78 |
-
"""
|
| 79 |
-
hints = get_type_hints(func, include_extras=True)
|
| 80 |
-
sig = inspect.signature(func)
|
| 81 |
-
|
| 82 |
-
properties: dict[str, Any] = {}
|
| 83 |
-
required: list[str] = []
|
| 84 |
-
|
| 85 |
-
for param_name, param in sig.parameters.items():
|
| 86 |
-
if param_name in ("self", "cls"):
|
| 87 |
-
continue
|
| 88 |
-
|
| 89 |
-
param_schema: dict[str, Any] = {}
|
| 90 |
-
hint = hints.get(param_name, Any)
|
| 91 |
-
|
| 92 |
-
# Handle Annotated types
|
| 93 |
-
origin = getattr(hint, "__origin__", None)
|
| 94 |
-
if origin is not None:
|
| 95 |
-
# Check if it's Annotated
|
| 96 |
-
if hasattr(hint, "__metadata__"):
|
| 97 |
-
# Extract description from Annotated metadata
|
| 98 |
-
for meta in hint.__metadata__:
|
| 99 |
-
if isinstance(meta, str):
|
| 100 |
-
param_schema["description"] = meta
|
| 101 |
-
break
|
| 102 |
-
# Get the actual type
|
| 103 |
-
hint = hint.__args__[0]
|
| 104 |
-
origin = getattr(hint, "__origin__", None)
|
| 105 |
-
|
| 106 |
-
# Map Python types to JSON schema types
|
| 107 |
-
if hint is str:
|
| 108 |
-
param_schema["type"] = "string"
|
| 109 |
-
elif hint is int:
|
| 110 |
-
param_schema["type"] = "integer"
|
| 111 |
-
elif hint is float:
|
| 112 |
-
param_schema["type"] = "number"
|
| 113 |
-
elif hint is bool:
|
| 114 |
-
param_schema["type"] = "boolean"
|
| 115 |
-
elif origin is list:
|
| 116 |
-
param_schema["type"] = "array"
|
| 117 |
-
elif origin is dict:
|
| 118 |
-
param_schema["type"] = "object"
|
| 119 |
-
else:
|
| 120 |
-
param_schema["type"] = "string" # Default fallback
|
| 121 |
-
|
| 122 |
-
properties[param_name] = param_schema
|
| 123 |
-
|
| 124 |
-
# Check if parameter is required (no default value)
|
| 125 |
-
if param.default is inspect.Parameter.empty:
|
| 126 |
-
required.append(param_name)
|
| 127 |
-
|
| 128 |
-
return {
|
| 129 |
-
"type": "object",
|
| 130 |
-
"properties": properties,
|
| 131 |
-
"required": required,
|
| 132 |
-
}
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
def create_all_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
    *,
    enable_memory_tool: bool = True,
    enable_sub_agent: bool = False,
    sub_agent_model: str = "gpt-4o-mini",
) -> Sequence[Callable[..., Any]]:
    """Create all standard tools for the Flow agent.

    Args:
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory
        bash_timeout: Timeout for bash commands in seconds
        enable_memory_tool: Whether to include the memory tool
        enable_sub_agent: Whether to include the sub-agent research tool
        sub_agent_model: Model to use for sub-agent (default: gpt-4o-mini)

    Returns:
        List of all tool functions
    """
    # Core tool groups are always present, in this order: coding,
    # execution, then core utilities.
    all_tools: list[Callable[..., Any]] = [
        *create_coding_tools(workspace),
        *create_execution_tools(workspace, memory_path, bash_timeout),
        *create_core_tools(),
    ]

    # Optional: agent-managed persistent memory
    if enable_memory_tool:
        all_tools.append(create_memory_tool(memory_path))

    # Optional: isolated research sub-agent
    if enable_sub_agent:
        all_tools.append(create_sub_agent_tool(workspace, model=sub_agent_model))

    return all_tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/flow/ui/api/configs.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
-
"""
|
| 3 |
|
| 4 |
-
from itertools import product
|
| 5 |
from uuid import UUID
|
| 6 |
|
| 7 |
from fastapi import APIRouter, Depends, HTTPException
|
|
@@ -9,32 +8,37 @@ from pydantic import BaseModel
|
|
| 9 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 10 |
from sqlmodel import select, desc
|
| 11 |
|
|
|
|
|
|
|
| 12 |
from ..database import get_session
|
| 13 |
from ..models.config import AgentConfig
|
| 14 |
-
from ..schemas import
|
| 15 |
|
| 16 |
router = APIRouter(prefix="/configs", tags=["configs"])
|
| 17 |
|
| 18 |
|
| 19 |
-
class
|
| 20 |
-
"""Request schema for generating
|
| 21 |
|
| 22 |
base_name: str = "experiment"
|
| 23 |
|
| 24 |
-
# Which
|
| 25 |
vary_compaction: bool = False
|
| 26 |
-
|
| 27 |
-
vary_sub_agent: bool = False
|
| 28 |
-
|
| 29 |
-
# Which numeric parameters to vary
|
| 30 |
vary_compaction_head: bool = False
|
| 31 |
vary_compaction_tail: bool = False
|
| 32 |
|
| 33 |
-
# Values
|
|
|
|
|
|
|
|
|
|
| 34 |
compaction_head_values: list[int] = [5, 10, 20]
|
| 35 |
compaction_tail_values: list[int] = [20, 40, 60]
|
| 36 |
|
| 37 |
-
#
|
|
|
|
|
|
|
|
|
|
| 38 |
job_id: str | None = None
|
| 39 |
|
| 40 |
|
|
@@ -46,17 +50,12 @@ def parse_uuid(id_str: str) -> UUID:
|
|
| 46 |
raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from e
|
| 47 |
|
| 48 |
|
| 49 |
-
@router.get("", response_model=list[
|
| 50 |
async def list_configs(
|
| 51 |
include_auto_generated: bool = False,
|
| 52 |
session: AsyncSession = Depends(get_session),
|
| 53 |
) -> list[AgentConfig]:
|
| 54 |
-
"""List agent configurations.
|
| 55 |
-
|
| 56 |
-
Args:
|
| 57 |
-
include_auto_generated: If False (default), only show user-created configs.
|
| 58 |
-
If True, include auto-generated configs from jobs.
|
| 59 |
-
"""
|
| 60 |
query = select(AgentConfig)
|
| 61 |
if not include_auto_generated:
|
| 62 |
query = query.where(AgentConfig.is_auto_generated == False) # noqa: E712
|
|
@@ -65,9 +64,9 @@ async def list_configs(
|
|
| 65 |
return list(result.scalars().all())
|
| 66 |
|
| 67 |
|
| 68 |
-
@router.post("", response_model=
|
| 69 |
async def create_config(
|
| 70 |
-
data:
|
| 71 |
session: AsyncSession = Depends(get_session),
|
| 72 |
) -> AgentConfig:
|
| 73 |
"""Create a new agent configuration."""
|
|
@@ -82,7 +81,7 @@ async def create_config(
|
|
| 82 |
return config
|
| 83 |
|
| 84 |
|
| 85 |
-
@router.get("/{config_id}", response_model=
|
| 86 |
async def get_config(
|
| 87 |
config_id: str,
|
| 88 |
session: AsyncSession = Depends(get_session),
|
|
@@ -96,10 +95,10 @@ async def get_config(
|
|
| 96 |
return config
|
| 97 |
|
| 98 |
|
| 99 |
-
@router.put("/{config_id}", response_model=
|
| 100 |
async def update_config(
|
| 101 |
config_id: str,
|
| 102 |
-
data:
|
| 103 |
session: AsyncSession = Depends(get_session),
|
| 104 |
) -> AgentConfig:
|
| 105 |
"""Update an agent configuration."""
|
|
@@ -109,25 +108,23 @@ async def update_config(
|
|
| 109 |
if not config:
|
| 110 |
raise HTTPException(status_code=404, detail="Config not found")
|
| 111 |
|
| 112 |
-
# Update fields that were provided
|
| 113 |
update_data = data.model_dump(exclude_unset=True)
|
| 114 |
|
| 115 |
-
# Handle config_json fields separately
|
| 116 |
config_fields = [
|
| 117 |
-
"
|
| 118 |
-
"
|
| 119 |
-
"
|
| 120 |
-
"
|
| 121 |
-
"compaction_tail_size",
|
| 122 |
-
"bash_timeout",
|
| 123 |
]
|
| 124 |
|
| 125 |
config_json = dict(config.config_json)
|
| 126 |
-
for
|
| 127 |
-
if
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
# Update top-level fields
|
| 131 |
for key, value in update_data.items():
|
| 132 |
setattr(config, key, value)
|
| 133 |
|
|
@@ -157,57 +154,44 @@ async def delete_config(
|
|
| 157 |
await session.commit()
|
| 158 |
|
| 159 |
|
| 160 |
-
@router.post("/generate-
|
| 161 |
-
async def
|
| 162 |
-
data:
|
| 163 |
session: AsyncSession = Depends(get_session),
|
| 164 |
) -> list[AgentConfig]:
|
| 165 |
-
"""Generate
|
| 166 |
|
| 167 |
-
|
| 168 |
-
Each
|
| 169 |
"""
|
| 170 |
-
|
| 171 |
-
# Build variation dimensions
|
| 172 |
-
dimensions: list[list[tuple[str, str, bool | int]]] = []
|
| 173 |
-
dimension_names: list[str] = []
|
| 174 |
|
| 175 |
if data.vary_compaction:
|
| 176 |
-
|
| 177 |
-
(
|
| 178 |
-
(
|
| 179 |
-
]
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
dimensions.append([
|
| 184 |
-
("memory", "enable_memory_tool", True),
|
| 185 |
-
("no_mem", "enable_memory_tool", False),
|
| 186 |
-
])
|
| 187 |
-
dimension_names.append("memory")
|
| 188 |
-
|
| 189 |
-
if data.vary_sub_agent:
|
| 190 |
-
dimensions.append([
|
| 191 |
-
("subagent", "enable_sub_agent", True),
|
| 192 |
-
("no_sub", "enable_sub_agent", False),
|
| 193 |
-
])
|
| 194 |
-
dimension_names.append("sub_agent")
|
| 195 |
|
| 196 |
if data.vary_compaction_head:
|
| 197 |
-
|
| 198 |
-
(
|
| 199 |
-
|
| 200 |
-
])
|
| 201 |
-
dimension_names.append("head_size")
|
| 202 |
|
| 203 |
if data.vary_compaction_tail:
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
-
# Parse job_id if provided
|
| 211 |
job_uuid = None
|
| 212 |
if data.job_id:
|
| 213 |
try:
|
|
@@ -215,19 +199,16 @@ async def generate_variations(
|
|
| 215 |
except ValueError:
|
| 216 |
pass
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
|
|
|
| 220 |
config = AgentConfig(
|
| 221 |
name=f"{data.base_name}_baseline",
|
| 222 |
-
description=f"Baseline
|
| 223 |
config_json={
|
| 224 |
"name": f"{data.base_name}_baseline",
|
| 225 |
-
"
|
| 226 |
-
"
|
| 227 |
-
"enable_sub_agent": False,
|
| 228 |
-
"compaction_head_size": 10,
|
| 229 |
-
"compaction_tail_size": 40,
|
| 230 |
-
"bash_timeout": 120,
|
| 231 |
},
|
| 232 |
is_auto_generated=True,
|
| 233 |
job_id=job_uuid,
|
|
@@ -237,41 +218,30 @@ async def generate_variations(
|
|
| 237 |
await session.refresh(config)
|
| 238 |
return [config]
|
| 239 |
|
| 240 |
-
|
|
|
|
|
|
|
| 241 |
configs = []
|
| 242 |
-
for
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
config_name = f"{data.base_name}_{'_'.join(name_parts)}"
|
| 246 |
-
|
| 247 |
-
# Build config JSON from defaults + variations
|
| 248 |
-
config_json = {
|
| 249 |
-
"name": config_name,
|
| 250 |
-
"enable_message_compaction": True,
|
| 251 |
-
"enable_memory_tool": True,
|
| 252 |
-
"enable_sub_agent": False,
|
| 253 |
-
"compaction_head_size": 10,
|
| 254 |
-
"compaction_tail_size": 40,
|
| 255 |
-
"bash_timeout": 120,
|
| 256 |
-
}
|
| 257 |
-
|
| 258 |
-
# Apply variations
|
| 259 |
-
for _, key, value in combo:
|
| 260 |
-
config_json[key] = value
|
| 261 |
-
|
| 262 |
-
# Check if config with this name already exists
|
| 263 |
existing = await session.execute(
|
| 264 |
-
select(AgentConfig).where(AgentConfig.name ==
|
| 265 |
)
|
| 266 |
existing_config = existing.scalar_one_or_none()
|
| 267 |
|
| 268 |
if existing_config:
|
| 269 |
configs.append(existing_config)
|
| 270 |
else:
|
|
|
|
| 271 |
config = AgentConfig(
|
| 272 |
-
name=
|
| 273 |
-
description=
|
| 274 |
-
config_json=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
is_auto_generated=True,
|
| 276 |
job_id=job_uuid,
|
| 277 |
)
|
|
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Agent config API routes."""
|
| 3 |
|
|
|
|
| 4 |
from uuid import UUID
|
| 5 |
|
| 6 |
from fastapi import APIRouter, Depends, HTTPException
|
|
|
|
| 8 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 9 |
from sqlmodel import select, desc
|
| 10 |
|
| 11 |
+
from flow.experiments.models import Agent, CompactionConfig, GridSearchStrategy
|
| 12 |
+
|
| 13 |
from ..database import get_session
|
| 14 |
from ..models.config import AgentConfig
|
| 15 |
+
from ..schemas import AgentCreate, AgentUpdate, AgentResponse
|
| 16 |
|
| 17 |
router = APIRouter(prefix="/configs", tags=["configs"])
|
| 18 |
|
| 19 |
|
| 20 |
+
class CandidateRequest(BaseModel):
|
| 21 |
+
"""Request schema for generating candidate agents."""
|
| 22 |
|
| 23 |
base_name: str = "experiment"
|
| 24 |
|
| 25 |
+
# Which dimensions to vary
|
| 26 |
vary_compaction: bool = False
|
| 27 |
+
vary_tools: bool = False
|
|
|
|
|
|
|
|
|
|
| 28 |
vary_compaction_head: bool = False
|
| 29 |
vary_compaction_tail: bool = False
|
| 30 |
|
| 31 |
+
# Values for tool variations (preset names)
|
| 32 |
+
tool_presets: list[str] = ["standard", "minimal", "full"]
|
| 33 |
+
|
| 34 |
+
# Values for numeric variations
|
| 35 |
compaction_head_values: list[int] = [5, 10, 20]
|
| 36 |
compaction_tail_values: list[int] = [20, 40, 60]
|
| 37 |
|
| 38 |
+
# Budget limit
|
| 39 |
+
budget: int = 100
|
| 40 |
+
|
| 41 |
+
# Optional job ID to associate candidates with
|
| 42 |
job_id: str | None = None
|
| 43 |
|
| 44 |
|
|
|
|
| 50 |
raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from e
|
| 51 |
|
| 52 |
|
| 53 |
+
@router.get("", response_model=list[AgentResponse])
|
| 54 |
async def list_configs(
|
| 55 |
include_auto_generated: bool = False,
|
| 56 |
session: AsyncSession = Depends(get_session),
|
| 57 |
) -> list[AgentConfig]:
|
| 58 |
+
"""List agent configurations."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
query = select(AgentConfig)
|
| 60 |
if not include_auto_generated:
|
| 61 |
query = query.where(AgentConfig.is_auto_generated == False) # noqa: E712
|
|
|
|
| 64 |
return list(result.scalars().all())
|
| 65 |
|
| 66 |
|
| 67 |
+
@router.post("", response_model=AgentResponse, status_code=201)
|
| 68 |
async def create_config(
|
| 69 |
+
data: AgentCreate,
|
| 70 |
session: AsyncSession = Depends(get_session),
|
| 71 |
) -> AgentConfig:
|
| 72 |
"""Create a new agent configuration."""
|
|
|
|
| 81 |
return config
|
| 82 |
|
| 83 |
|
| 84 |
+
@router.get("/{config_id}", response_model=AgentResponse)
|
| 85 |
async def get_config(
|
| 86 |
config_id: str,
|
| 87 |
session: AsyncSession = Depends(get_session),
|
|
|
|
| 95 |
return config
|
| 96 |
|
| 97 |
|
| 98 |
+
@router.put("/{config_id}", response_model=AgentResponse)
|
| 99 |
async def update_config(
|
| 100 |
config_id: str,
|
| 101 |
+
data: AgentUpdate,
|
| 102 |
session: AsyncSession = Depends(get_session),
|
| 103 |
) -> AgentConfig:
|
| 104 |
"""Update an agent configuration."""
|
|
|
|
| 108 |
if not config:
|
| 109 |
raise HTTPException(status_code=404, detail="Config not found")
|
| 110 |
|
|
|
|
| 111 |
update_data = data.model_dump(exclude_unset=True)
|
| 112 |
|
|
|
|
| 113 |
config_fields = [
|
| 114 |
+
"instructions",
|
| 115 |
+
"model",
|
| 116 |
+
"compaction",
|
| 117 |
+
"tools",
|
|
|
|
|
|
|
| 118 |
]
|
| 119 |
|
| 120 |
config_json = dict(config.config_json)
|
| 121 |
+
for field_name in config_fields:
|
| 122 |
+
if field_name in update_data:
|
| 123 |
+
value = update_data.pop(field_name)
|
| 124 |
+
if field_name == "compaction" and hasattr(value, "model_dump"):
|
| 125 |
+
value = value.model_dump()
|
| 126 |
+
config_json[field_name] = value
|
| 127 |
|
|
|
|
| 128 |
for key, value in update_data.items():
|
| 129 |
setattr(config, key, value)
|
| 130 |
|
|
|
|
| 154 |
await session.commit()
|
| 155 |
|
| 156 |
|
| 157 |
+
@router.post("/generate-candidates", response_model=list[AgentResponse], status_code=201)
|
| 158 |
+
async def generate_candidates(
|
| 159 |
+
data: CandidateRequest,
|
| 160 |
session: AsyncSession = Depends(get_session),
|
| 161 |
) -> list[AgentConfig]:
|
| 162 |
+
"""Generate candidate agents for optimization.
|
| 163 |
|
| 164 |
+
Uses GridSearchStrategy to generate candidate variants from a base agent.
|
| 165 |
+
Each candidate is stored as an AgentConfig in the database.
|
| 166 |
"""
|
| 167 |
+
variations: dict[str, list] = {}
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
if data.vary_compaction:
|
| 170 |
+
variations["compaction"] = [
|
| 171 |
+
CompactionConfig.head_tail(10, 40),
|
| 172 |
+
CompactionConfig.none(),
|
| 173 |
+
]
|
| 174 |
+
|
| 175 |
+
if data.vary_tools:
|
| 176 |
+
variations["tools"] = data.tool_presets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
if data.vary_compaction_head:
|
| 179 |
+
variations["compaction"] = [
|
| 180 |
+
CompactionConfig.head_tail(h, 40) for h in data.compaction_head_values
|
| 181 |
+
]
|
|
|
|
|
|
|
| 182 |
|
| 183 |
if data.vary_compaction_tail:
|
| 184 |
+
if data.vary_compaction_head:
|
| 185 |
+
variations["compaction"] = [
|
| 186 |
+
CompactionConfig.head_tail(h, t)
|
| 187 |
+
for h in data.compaction_head_values
|
| 188 |
+
for t in data.compaction_tail_values
|
| 189 |
+
]
|
| 190 |
+
else:
|
| 191 |
+
variations["compaction"] = [
|
| 192 |
+
CompactionConfig.head_tail(10, t) for t in data.compaction_tail_values
|
| 193 |
+
]
|
| 194 |
|
|
|
|
| 195 |
job_uuid = None
|
| 196 |
if data.job_id:
|
| 197 |
try:
|
|
|
|
| 199 |
except ValueError:
|
| 200 |
pass
|
| 201 |
|
| 202 |
+
base = Agent(name=data.base_name)
|
| 203 |
+
|
| 204 |
+
if not variations:
|
| 205 |
config = AgentConfig(
|
| 206 |
name=f"{data.base_name}_baseline",
|
| 207 |
+
description=f"Baseline agent from {data.base_name}",
|
| 208 |
config_json={
|
| 209 |
"name": f"{data.base_name}_baseline",
|
| 210 |
+
"compaction": {"strategy": "head_tail", "params": {"head_size": 10, "tail_size": 40}},
|
| 211 |
+
"tools": "standard",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
},
|
| 213 |
is_auto_generated=True,
|
| 214 |
job_id=job_uuid,
|
|
|
|
| 218 |
await session.refresh(config)
|
| 219 |
return [config]
|
| 220 |
|
| 221 |
+
strategy = GridSearchStrategy(variations)
|
| 222 |
+
candidates = strategy.generate(base, data.budget)
|
| 223 |
+
|
| 224 |
configs = []
|
| 225 |
+
for candidate in candidates:
|
| 226 |
+
candidate_name = candidate.agent.name
|
| 227 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
existing = await session.execute(
|
| 229 |
+
select(AgentConfig).where(AgentConfig.name == candidate_name).limit(1)
|
| 230 |
)
|
| 231 |
existing_config = existing.scalar_one_or_none()
|
| 232 |
|
| 233 |
if existing_config:
|
| 234 |
configs.append(existing_config)
|
| 235 |
else:
|
| 236 |
+
from dataclasses import asdict
|
| 237 |
config = AgentConfig(
|
| 238 |
+
name=candidate_name,
|
| 239 |
+
description=candidate.rationale,
|
| 240 |
+
config_json={
|
| 241 |
+
"name": candidate_name,
|
| 242 |
+
"compaction": asdict(candidate.agent.compaction),
|
| 243 |
+
"tools": candidate.agent.tools,
|
| 244 |
+
},
|
| 245 |
is_auto_generated=True,
|
| 246 |
job_id=job_uuid,
|
| 247 |
)
|
src/flow/ui/api/jobs.py
CHANGED
|
@@ -53,12 +53,12 @@ async def create_job(
|
|
| 53 |
session: AsyncSession = Depends(get_session),
|
| 54 |
) -> OptimizationJob:
|
| 55 |
"""Create a new optimization job."""
|
| 56 |
-
# Validate
|
| 57 |
-
for
|
| 58 |
-
uuid_id = parse_uuid(
|
| 59 |
result = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
|
| 60 |
if not result.scalar_one_or_none():
|
| 61 |
-
raise HTTPException(status_code=400, detail=f"
|
| 62 |
|
| 63 |
# Validate task_ids exist
|
| 64 |
for task_id in data.task_ids:
|
|
@@ -69,11 +69,11 @@ async def create_job(
|
|
| 69 |
|
| 70 |
job = OptimizationJob(
|
| 71 |
name=data.name,
|
| 72 |
-
|
| 73 |
task_ids=data.task_ids,
|
| 74 |
parallel=data.parallel,
|
| 75 |
use_llm_eval=data.use_llm_eval,
|
| 76 |
-
total_experiments=len(data.
|
| 77 |
)
|
| 78 |
session.add(job)
|
| 79 |
await session.commit()
|
|
|
|
| 53 |
session: AsyncSession = Depends(get_session),
|
| 54 |
) -> OptimizationJob:
|
| 55 |
"""Create a new optimization job."""
|
| 56 |
+
# Validate candidate_ids exist
|
| 57 |
+
for candidate_id in data.candidate_ids:
|
| 58 |
+
uuid_id = parse_uuid(candidate_id)
|
| 59 |
result = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
|
| 60 |
if not result.scalar_one_or_none():
|
| 61 |
+
raise HTTPException(status_code=400, detail=f"Candidate {candidate_id} not found")
|
| 62 |
|
| 63 |
# Validate task_ids exist
|
| 64 |
for task_id in data.task_ids:
|
|
|
|
| 69 |
|
| 70 |
job = OptimizationJob(
|
| 71 |
name=data.name,
|
| 72 |
+
candidate_ids=data.candidate_ids,
|
| 73 |
task_ids=data.task_ids,
|
| 74 |
parallel=data.parallel,
|
| 75 |
use_llm_eval=data.use_llm_eval,
|
| 76 |
+
total_experiments=len(data.candidate_ids) * len(data.task_ids),
|
| 77 |
)
|
| 78 |
session.add(job)
|
| 79 |
await session.commit()
|
src/flow/ui/api/runs.py
CHANGED
|
@@ -26,7 +26,7 @@ def parse_uuid(id_str: str) -> UUID:
|
|
| 26 |
@router.get("", response_model=list[RunResponse])
|
| 27 |
async def list_runs(
|
| 28 |
job_id: str | None = None,
|
| 29 |
-
|
| 30 |
task_name: str | None = None,
|
| 31 |
is_pareto: bool | None = None,
|
| 32 |
session: AsyncSession = Depends(get_session),
|
|
@@ -37,8 +37,8 @@ async def list_runs(
|
|
| 37 |
if job_id:
|
| 38 |
uuid_id = parse_uuid(job_id)
|
| 39 |
query = query.where(ExperimentRun.job_id == uuid_id)
|
| 40 |
-
if
|
| 41 |
-
query = query.where(ExperimentRun.
|
| 42 |
if task_name:
|
| 43 |
query = query.where(ExperimentRun.task_name == task_name)
|
| 44 |
if is_pareto is not None:
|
|
@@ -75,7 +75,7 @@ async def get_run(
|
|
| 75 |
return {
|
| 76 |
"id": str(run.id),
|
| 77 |
"job_id": str(run.job_id),
|
| 78 |
-
"
|
| 79 |
"task_name": run.task_name,
|
| 80 |
"status": run.status,
|
| 81 |
"tokens_total": run.tokens_total,
|
|
@@ -111,11 +111,11 @@ async def get_job_summary(
|
|
| 111 |
raise HTTPException(status_code=404, detail="No runs found for job")
|
| 112 |
|
| 113 |
# Aggregate by config
|
| 114 |
-
|
| 115 |
for run in runs:
|
| 116 |
-
if run.
|
| 117 |
-
|
| 118 |
-
"
|
| 119 |
"total_runs": 0,
|
| 120 |
"passed_runs": 0,
|
| 121 |
"avg_score": 0.0,
|
|
@@ -125,7 +125,7 @@ async def get_job_summary(
|
|
| 125 |
"pareto_rank": 999,
|
| 126 |
}
|
| 127 |
|
| 128 |
-
summary =
|
| 129 |
summary["total_runs"] += 1
|
| 130 |
if run.passed:
|
| 131 |
summary["passed_runs"] += 1
|
|
@@ -137,7 +137,7 @@ async def get_job_summary(
|
|
| 137 |
summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)
|
| 138 |
|
| 139 |
# Calculate averages
|
| 140 |
-
for summary in
|
| 141 |
n = summary["total_runs"]
|
| 142 |
summary["avg_score"] /= n
|
| 143 |
summary["avg_tokens"] /= n
|
|
@@ -145,13 +145,13 @@ async def get_job_summary(
|
|
| 145 |
|
| 146 |
# Sort by score descending
|
| 147 |
sorted_summaries = sorted(
|
| 148 |
-
|
| 149 |
key=lambda x: (-x["avg_score"], x["avg_tokens"]),
|
| 150 |
)
|
| 151 |
|
| 152 |
return {
|
| 153 |
"job_id": job_id,
|
| 154 |
"total_runs": len(runs),
|
| 155 |
-
"
|
| 156 |
-
"
|
| 157 |
}
|
|
|
|
| 26 |
@router.get("", response_model=list[RunResponse])
|
| 27 |
async def list_runs(
|
| 28 |
job_id: str | None = None,
|
| 29 |
+
candidate_name: str | None = None,
|
| 30 |
task_name: str | None = None,
|
| 31 |
is_pareto: bool | None = None,
|
| 32 |
session: AsyncSession = Depends(get_session),
|
|
|
|
| 37 |
if job_id:
|
| 38 |
uuid_id = parse_uuid(job_id)
|
| 39 |
query = query.where(ExperimentRun.job_id == uuid_id)
|
| 40 |
+
if candidate_name:
|
| 41 |
+
query = query.where(ExperimentRun.candidate_name == candidate_name)
|
| 42 |
if task_name:
|
| 43 |
query = query.where(ExperimentRun.task_name == task_name)
|
| 44 |
if is_pareto is not None:
|
|
|
|
| 75 |
return {
|
| 76 |
"id": str(run.id),
|
| 77 |
"job_id": str(run.job_id),
|
| 78 |
+
"candidate_name": run.candidate_name,
|
| 79 |
"task_name": run.task_name,
|
| 80 |
"status": run.status,
|
| 81 |
"tokens_total": run.tokens_total,
|
|
|
|
| 111 |
raise HTTPException(status_code=404, detail="No runs found for job")
|
| 112 |
|
| 113 |
# Aggregate by config
|
| 114 |
+
candidate_summaries: dict[str, dict[str, Any]] = {}
|
| 115 |
for run in runs:
|
| 116 |
+
if run.candidate_name not in candidate_summaries:
|
| 117 |
+
candidate_summaries[run.candidate_name] = {
|
| 118 |
+
"candidate_name": run.candidate_name,
|
| 119 |
"total_runs": 0,
|
| 120 |
"passed_runs": 0,
|
| 121 |
"avg_score": 0.0,
|
|
|
|
| 125 |
"pareto_rank": 999,
|
| 126 |
}
|
| 127 |
|
| 128 |
+
summary = candidate_summaries[run.candidate_name]
|
| 129 |
summary["total_runs"] += 1
|
| 130 |
if run.passed:
|
| 131 |
summary["passed_runs"] += 1
|
|
|
|
| 137 |
summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)
|
| 138 |
|
| 139 |
# Calculate averages
|
| 140 |
+
for summary in candidate_summaries.values():
|
| 141 |
n = summary["total_runs"]
|
| 142 |
summary["avg_score"] /= n
|
| 143 |
summary["avg_tokens"] /= n
|
|
|
|
| 145 |
|
| 146 |
# Sort by score descending
|
| 147 |
sorted_summaries = sorted(
|
| 148 |
+
candidate_summaries.values(),
|
| 149 |
key=lambda x: (-x["avg_score"], x["avg_tokens"]),
|
| 150 |
)
|
| 151 |
|
| 152 |
return {
|
| 153 |
"job_id": job_id,
|
| 154 |
"total_runs": len(runs),
|
| 155 |
+
"candidate_summaries": sorted_summaries,
|
| 156 |
+
"pareto_candidates": [s["candidate_name"] for s in sorted_summaries if s["is_pareto"]],
|
| 157 |
}
|
src/flow/ui/database.py
CHANGED
|
@@ -21,70 +21,14 @@ engine = create_async_engine(DATABASE_URL, echo=False, future=True)
|
|
| 21 |
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
| 22 |
|
| 23 |
|
| 24 |
-
async def _migrate_schema(conn) -> None:
|
| 25 |
-
"""Apply schema migrations for new columns.
|
| 26 |
-
|
| 27 |
-
SQLModel's create_all only creates missing tables, not columns.
|
| 28 |
-
This adds any missing columns to existing tables.
|
| 29 |
-
"""
|
| 30 |
-
from sqlalchemy import text, inspect
|
| 31 |
-
|
| 32 |
-
def _sync_migrate(sync_conn):
|
| 33 |
-
inspector = inspect(sync_conn)
|
| 34 |
-
|
| 35 |
-
# Check agent_configs table
|
| 36 |
-
if inspector.has_table("agent_configs"):
|
| 37 |
-
columns = {c["name"] for c in inspector.get_columns("agent_configs")}
|
| 38 |
-
|
| 39 |
-
# Add is_auto_generated column if missing
|
| 40 |
-
if "is_auto_generated" not in columns:
|
| 41 |
-
logger.info("Adding is_auto_generated column to agent_configs")
|
| 42 |
-
sync_conn.execute(
|
| 43 |
-
text("ALTER TABLE agent_configs ADD COLUMN is_auto_generated BOOLEAN DEFAULT 0")
|
| 44 |
-
)
|
| 45 |
-
|
| 46 |
-
# Add job_id column if missing
|
| 47 |
-
if "job_id" not in columns:
|
| 48 |
-
logger.info("Adding job_id column to agent_configs")
|
| 49 |
-
sync_conn.execute(
|
| 50 |
-
text("ALTER TABLE agent_configs ADD COLUMN job_id VARCHAR(36)")
|
| 51 |
-
)
|
| 52 |
-
|
| 53 |
-
# Retroactively mark configs with "Auto-generated variation:" in description
|
| 54 |
-
logger.info("Marking auto-generated configs based on description pattern")
|
| 55 |
-
sync_conn.execute(
|
| 56 |
-
text(
|
| 57 |
-
"UPDATE agent_configs SET is_auto_generated = 1 "
|
| 58 |
-
"WHERE description LIKE 'Auto-generated variation:%' "
|
| 59 |
-
"AND (is_auto_generated IS NULL OR is_auto_generated = 0)"
|
| 60 |
-
)
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
await conn.run_sync(_sync_migrate)
|
| 64 |
-
|
| 65 |
-
|
| 66 |
async def init_db() -> None:
|
| 67 |
-
"""Initialize database tables.
|
| 68 |
-
|
| 69 |
-
With multiple uvicorn workers, each worker calls this on startup.
|
| 70 |
-
SQLite + create_all can race: worker A checks table doesn't exist,
|
| 71 |
-
worker B creates it, worker A tries to create and fails.
|
| 72 |
-
|
| 73 |
-
Solution: Catch the OperationalError and continue - if the table
|
| 74 |
-
already exists, that's fine.
|
| 75 |
-
|
| 76 |
-
See: https://github.com/sqlalchemy/sqlalchemy/issues/4936
|
| 77 |
-
"""
|
| 78 |
-
# Import models to ensure they're registered with SQLModel.metadata
|
| 79 |
from flow.ui.models import AgentConfig, TaskModel, OptimizationJob, ExperimentRun # noqa: F401
|
| 80 |
|
| 81 |
try:
|
| 82 |
async with engine.begin() as conn:
|
| 83 |
await conn.run_sync(SQLModel.metadata.create_all)
|
| 84 |
-
# Apply migrations for new columns
|
| 85 |
-
await _migrate_schema(conn)
|
| 86 |
except Exception as e:
|
| 87 |
-
# Handle race condition: "table already exists" is fine
|
| 88 |
if "already exists" in str(e).lower():
|
| 89 |
logger.debug("Tables already exist (race condition handled)")
|
| 90 |
else:
|
|
|
|
| 21 |
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
async def init_db() -> None:
|
| 25 |
+
"""Initialize database tables."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
from flow.ui.models import AgentConfig, TaskModel, OptimizationJob, ExperimentRun # noqa: F401
|
| 27 |
|
| 28 |
try:
|
| 29 |
async with engine.begin() as conn:
|
| 30 |
await conn.run_sync(SQLModel.metadata.create_all)
|
|
|
|
|
|
|
| 31 |
except Exception as e:
|
|
|
|
| 32 |
if "already exists" in str(e).lower():
|
| 33 |
logger.debug("Tables already exist (race condition handled)")
|
| 34 |
else:
|
src/flow/ui/models/config.py
CHANGED
|
@@ -17,12 +17,12 @@ class AgentConfig(SQLModel, table=True):
|
|
| 17 |
name: str = Field(index=True)
|
| 18 |
description: str = ""
|
| 19 |
|
| 20 |
-
# Store
|
| 21 |
config_json: dict[str, Any] = Field(default_factory=dict, sa_column=Column(JSON))
|
| 22 |
|
| 23 |
-
# Track auto-generated
|
| 24 |
is_auto_generated: bool = Field(default=False, index=True)
|
| 25 |
-
# Link to the job that created this
|
| 26 |
job_id: UUID | None = Field(default=None, index=True)
|
| 27 |
|
| 28 |
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
@@ -30,5 +30,5 @@ class AgentConfig(SQLModel, table=True):
|
|
| 30 |
|
| 31 |
@property
|
| 32 |
def config(self) -> dict[str, Any]:
|
| 33 |
-
"""Alias for config_json
|
| 34 |
return self.config_json
|
|
|
|
| 17 |
name: str = Field(index=True)
|
| 18 |
description: str = ""
|
| 19 |
|
| 20 |
+
# Store Agent config as JSON
|
| 21 |
config_json: dict[str, Any] = Field(default_factory=dict, sa_column=Column(JSON))
|
| 22 |
|
| 23 |
+
# Track auto-generated candidates (created by generate-candidates endpoint)
|
| 24 |
is_auto_generated: bool = Field(default=False, index=True)
|
| 25 |
+
# Link to the job that created this candidate (if auto-generated)
|
| 26 |
job_id: UUID | None = Field(default=None, index=True)
|
| 27 |
|
| 28 |
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
| 30 |
|
| 31 |
@property
|
| 32 |
def config(self) -> dict[str, Any]:
|
| 33 |
+
"""Alias for config_json used by API response serialization."""
|
| 34 |
return self.config_json
|
src/flow/ui/models/job.py
CHANGED
|
@@ -33,7 +33,7 @@ class OptimizationJob(SQLModel, table=True):
|
|
| 33 |
use_llm_eval: bool = Field(default=False)
|
| 34 |
|
| 35 |
# Selected configs and tasks (stored as IDs)
|
| 36 |
-
|
| 37 |
task_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
|
| 38 |
|
| 39 |
# Results
|
|
|
|
| 33 |
use_llm_eval: bool = Field(default=False)
|
| 34 |
|
| 35 |
# Selected configs and tasks (stored as IDs)
|
| 36 |
+
candidate_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
|
| 37 |
task_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
|
| 38 |
|
| 39 |
# Results
|
src/flow/ui/models/run.py
CHANGED
|
@@ -16,7 +16,7 @@ class ExperimentRun(SQLModel, table=True):
|
|
| 16 |
id: UUID = Field(default_factory=uuid4, primary_key=True)
|
| 17 |
job_id: UUID = Field(foreign_key="optimization_jobs.id", index=True)
|
| 18 |
|
| 19 |
-
|
| 20 |
task_name: str
|
| 21 |
|
| 22 |
# Status
|
|
|
|
| 16 |
id: UUID = Field(default_factory=uuid4, primary_key=True)
|
| 17 |
job_id: UUID = Field(foreign_key="optimization_jobs.id", index=True)
|
| 18 |
|
| 19 |
+
candidate_name: str
|
| 20 |
task_name: str
|
| 21 |
|
| 22 |
# Status
|
src/flow/ui/models/task.py
CHANGED
|
@@ -28,5 +28,5 @@ class TaskModel(SQLModel, table=True):
|
|
| 28 |
|
| 29 |
@property
|
| 30 |
def criteria(self) -> list[dict[str, Any]]:
|
| 31 |
-
"""Alias for criteria_json
|
| 32 |
return self.criteria_json
|
|
|
|
| 28 |
|
| 29 |
@property
|
| 30 |
def criteria(self) -> list[dict[str, Any]]:
|
| 31 |
+
"""Alias for criteria_json used by API response serialization."""
|
| 32 |
return self.criteria_json
|
src/flow/ui/schemas/__init__.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
"""Pydantic schemas for API requests/responses."""
|
| 3 |
|
| 4 |
-
from .config import
|
| 5 |
from .task import TaskCreate, TaskResponse, CriterionSchema
|
| 6 |
from .job import JobCreate, JobResponse, JobProgress
|
| 7 |
from .run import RunResponse, RunDetailResponse, CriterionResultSchema
|
| 8 |
|
| 9 |
__all__ = [
|
| 10 |
-
"
|
| 11 |
-
"
|
| 12 |
-
"
|
| 13 |
"TaskCreate",
|
| 14 |
"TaskResponse",
|
| 15 |
"CriterionSchema",
|
|
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
"""Pydantic schemas for API requests/responses."""
|
| 3 |
|
| 4 |
+
from .config import AgentCreate, AgentUpdate, AgentResponse
|
| 5 |
from .task import TaskCreate, TaskResponse, CriterionSchema
|
| 6 |
from .job import JobCreate, JobResponse, JobProgress
|
| 7 |
from .run import RunResponse, RunDetailResponse, CriterionResultSchema
|
| 8 |
|
| 9 |
__all__ = [
|
| 10 |
+
"AgentCreate",
|
| 11 |
+
"AgentUpdate",
|
| 12 |
+
"AgentResponse",
|
| 13 |
"TaskCreate",
|
| 14 |
"TaskResponse",
|
| 15 |
"CriterionSchema",
|
src/flow/ui/schemas/config.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
-
"""
|
| 3 |
|
| 4 |
from datetime import datetime
|
| 5 |
from typing import Any
|
|
@@ -8,46 +8,52 @@ from uuid import UUID
|
|
| 8 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 9 |
|
| 10 |
|
| 11 |
-
class
|
| 12 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
name: str
|
| 15 |
description: str = ""
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
compaction_tail_size: int = 40
|
| 21 |
-
bash_timeout: int = 120
|
| 22 |
|
| 23 |
def to_config_json(self) -> dict[str, Any]:
|
| 24 |
-
"""Convert to config JSON for storage."""
|
| 25 |
return {
|
| 26 |
-
"
|
| 27 |
-
"
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"compaction_head_size": self.compaction_head_size,
|
| 31 |
-
"compaction_tail_size": self.compaction_tail_size,
|
| 32 |
-
"bash_timeout": self.bash_timeout,
|
| 33 |
}
|
| 34 |
|
| 35 |
|
| 36 |
-
class
|
| 37 |
-
"""Request schema for updating
|
| 38 |
|
| 39 |
name: str | None = None
|
| 40 |
description: str | None = None
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
compaction_tail_size: int | None = None
|
| 46 |
-
bash_timeout: int | None = None
|
| 47 |
|
| 48 |
|
| 49 |
-
class
|
| 50 |
-
"""Response schema for
|
| 51 |
|
| 52 |
model_config = ConfigDict(from_attributes=True)
|
| 53 |
|
|
@@ -63,7 +69,6 @@ class ConfigResponse(BaseModel):
|
|
| 63 |
@field_validator("id", mode="before")
|
| 64 |
@classmethod
|
| 65 |
def convert_uuid(cls, v: UUID | str) -> str:
|
| 66 |
-
"""Convert UUID to string."""
|
| 67 |
if isinstance(v, UUID):
|
| 68 |
return str(v)
|
| 69 |
return v
|
|
@@ -71,7 +76,6 @@ class ConfigResponse(BaseModel):
|
|
| 71 |
@field_validator("job_id", mode="before")
|
| 72 |
@classmethod
|
| 73 |
def convert_job_uuid(cls, v: UUID | str | None) -> str | None:
|
| 74 |
-
"""Convert job UUID to string."""
|
| 75 |
if v is None:
|
| 76 |
return None
|
| 77 |
if isinstance(v, UUID):
|
|
|
|
| 1 |
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Agent config schemas."""
|
| 3 |
|
| 4 |
from datetime import datetime
|
| 5 |
from typing import Any
|
|
|
|
| 8 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 9 |
|
| 10 |
|
| 11 |
+
class CompactionConfigSchema(BaseModel):
|
| 12 |
+
"""Compaction strategy configuration."""
|
| 13 |
+
|
| 14 |
+
strategy: str = "head_tail"
|
| 15 |
+
params: dict[str, Any] = {"head_size": 10, "tail_size": 40}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AgentCreate(BaseModel):
|
| 19 |
+
"""Request schema for creating an agent.
|
| 20 |
+
|
| 21 |
+
Tools can be specified as:
|
| 22 |
+
- str: Preset name ("standard", "minimal", "full", "readonly")
|
| 23 |
+
- list[str]: List of tool names
|
| 24 |
+
- dict[str, dict]: Full specification with per-tool configs
|
| 25 |
+
"""
|
| 26 |
|
| 27 |
name: str
|
| 28 |
description: str = ""
|
| 29 |
+
instructions: str | None = None
|
| 30 |
+
model: str | None = None
|
| 31 |
+
compaction: CompactionConfigSchema = CompactionConfigSchema()
|
| 32 |
+
tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def to_config_json(self) -> dict[str, Any]:
|
| 35 |
+
"""Convert to config JSON for storage (runtime settings only)."""
|
| 36 |
return {
|
| 37 |
+
"instructions": self.instructions,
|
| 38 |
+
"model": self.model,
|
| 39 |
+
"compaction": self.compaction.model_dump(),
|
| 40 |
+
"tools": self.tools,
|
|
|
|
|
|
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
|
| 44 |
+
class AgentUpdate(BaseModel):
|
| 45 |
+
"""Request schema for updating an agent."""
|
| 46 |
|
| 47 |
name: str | None = None
|
| 48 |
description: str | None = None
|
| 49 |
+
instructions: str | None = None
|
| 50 |
+
model: str | None = None
|
| 51 |
+
compaction: CompactionConfigSchema | None = None
|
| 52 |
+
tools: str | list[str] | dict[str, dict[str, Any]] | None = None
|
|
|
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
+
class AgentResponse(BaseModel):
|
| 56 |
+
"""Response schema for an agent."""
|
| 57 |
|
| 58 |
model_config = ConfigDict(from_attributes=True)
|
| 59 |
|
|
|
|
| 69 |
@field_validator("id", mode="before")
|
| 70 |
@classmethod
|
| 71 |
def convert_uuid(cls, v: UUID | str) -> str:
|
|
|
|
| 72 |
if isinstance(v, UUID):
|
| 73 |
return str(v)
|
| 74 |
return v
|
|
|
|
| 76 |
@field_validator("job_id", mode="before")
|
| 77 |
@classmethod
|
| 78 |
def convert_job_uuid(cls, v: UUID | str | None) -> str | None:
|
|
|
|
| 79 |
if v is None:
|
| 80 |
return None
|
| 81 |
if isinstance(v, UUID):
|
src/flow/ui/schemas/job.py
CHANGED
|
@@ -13,7 +13,7 @@ class JobCreate(BaseModel):
|
|
| 13 |
"""Request schema for creating a job."""
|
| 14 |
|
| 15 |
name: str = ""
|
| 16 |
-
|
| 17 |
task_ids: list[str]
|
| 18 |
parallel: int = 4
|
| 19 |
use_llm_eval: bool = False
|
|
@@ -29,7 +29,7 @@ class JobResponse(BaseModel):
|
|
| 29 |
status: JobStatus
|
| 30 |
parallel: int
|
| 31 |
use_llm_eval: bool
|
| 32 |
-
|
| 33 |
task_ids: list[str]
|
| 34 |
pareto_frontier: list[str]
|
| 35 |
output_dir: str | None
|
|
@@ -56,6 +56,6 @@ class JobProgress(BaseModel):
|
|
| 56 |
job_id: str
|
| 57 |
completed: int = 0
|
| 58 |
total: int = 0
|
| 59 |
-
|
| 60 |
current_task: str = ""
|
| 61 |
message: str = ""
|
|
|
|
| 13 |
"""Request schema for creating a job."""
|
| 14 |
|
| 15 |
name: str = ""
|
| 16 |
+
candidate_ids: list[str]
|
| 17 |
task_ids: list[str]
|
| 18 |
parallel: int = 4
|
| 19 |
use_llm_eval: bool = False
|
|
|
|
| 29 |
status: JobStatus
|
| 30 |
parallel: int
|
| 31 |
use_llm_eval: bool
|
| 32 |
+
candidate_ids: list[str]
|
| 33 |
task_ids: list[str]
|
| 34 |
pareto_frontier: list[str]
|
| 35 |
output_dir: str | None
|
|
|
|
| 56 |
job_id: str
|
| 57 |
completed: int = 0
|
| 58 |
total: int = 0
|
| 59 |
+
current_candidate: str = ""
|
| 60 |
current_task: str = ""
|
| 61 |
message: str = ""
|
src/flow/ui/schemas/run.py
CHANGED
|
@@ -15,7 +15,7 @@ class RunResponse(BaseModel):
|
|
| 15 |
|
| 16 |
id: str
|
| 17 |
job_id: str
|
| 18 |
-
|
| 19 |
task_name: str
|
| 20 |
status: str
|
| 21 |
tokens_total: int
|
|
@@ -51,7 +51,7 @@ class RunDetailResponse(BaseModel):
|
|
| 51 |
|
| 52 |
id: str
|
| 53 |
job_id: str
|
| 54 |
-
|
| 55 |
task_name: str
|
| 56 |
status: str
|
| 57 |
|
|
|
|
| 15 |
|
| 16 |
id: str
|
| 17 |
job_id: str
|
| 18 |
+
candidate_name: str
|
| 19 |
task_name: str
|
| 20 |
status: str
|
| 21 |
tokens_total: int
|
|
|
|
| 51 |
|
| 52 |
id: str
|
| 53 |
job_id: str
|
| 54 |
+
candidate_name: str
|
| 55 |
task_name: str
|
| 56 |
status: str
|
| 57 |
|
src/flow/ui/services/optimizer_service.py
CHANGED
|
@@ -9,7 +9,7 @@ from uuid import UUID
|
|
| 9 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 10 |
from sqlmodel import select
|
| 11 |
|
| 12 |
-
from flow.experiments.
|
| 13 |
from flow.experiments.optimizer import FlowOptimizer
|
| 14 |
from flow.experiments.types import EvalCriterion, Task
|
| 15 |
|
|
@@ -26,12 +26,10 @@ class OptimizerService:
|
|
| 26 |
|
| 27 |
async def run_job(self, job_id: str | UUID) -> AsyncGenerator[JobProgress, None]:
|
| 28 |
"""Run an optimization job and yield progress updates."""
|
| 29 |
-
# Convert to UUID if string
|
| 30 |
if isinstance(job_id, str):
|
| 31 |
job_id = UUID(job_id)
|
| 32 |
|
| 33 |
async with async_session() as session:
|
| 34 |
-
# Load job
|
| 35 |
result = await session.execute(
|
| 36 |
select(OptimizationJob).where(OptimizationJob.id == job_id)
|
| 37 |
)
|
|
@@ -44,7 +42,6 @@ class OptimizerService:
|
|
| 44 |
)
|
| 45 |
return
|
| 46 |
|
| 47 |
-
# Update job status
|
| 48 |
job.status = JobStatus.RUNNING
|
| 49 |
job.started_at = datetime.now(timezone.utc)
|
| 50 |
await session.commit()
|
|
@@ -58,48 +55,39 @@ class OptimizerService:
|
|
| 58 |
)
|
| 59 |
|
| 60 |
try:
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
raise ValueError("No valid configs found")
|
| 65 |
|
| 66 |
-
# Load tasks
|
| 67 |
tasks = await self._load_tasks(session, job.task_ids)
|
| 68 |
if not tasks:
|
| 69 |
raise ValueError("No valid tasks found")
|
| 70 |
|
| 71 |
-
# Create optimizer
|
| 72 |
optimizer = FlowOptimizer(
|
| 73 |
parallel=job.parallel,
|
| 74 |
use_llm_evaluator=job.use_llm_eval,
|
| 75 |
)
|
| 76 |
|
| 77 |
-
# Track progress via callback
|
| 78 |
progress_queue: asyncio.Queue[tuple[int, int, str, str]] = asyncio.Queue()
|
| 79 |
|
| 80 |
def progress_callback(completed: int, total: int, config: str, task: str) -> None:
|
| 81 |
-
"""Callback invoked by FlowOptimizer on each completion."""
|
| 82 |
try:
|
| 83 |
progress_queue.put_nowait((completed, total, config, task))
|
| 84 |
except asyncio.QueueFull:
|
| 85 |
pass
|
| 86 |
|
| 87 |
-
# Run optimization in background task
|
| 88 |
async def run_optimization():
|
| 89 |
return await optimizer.optimize(
|
| 90 |
-
|
| 91 |
tasks=tasks,
|
| 92 |
progress_callback=progress_callback,
|
| 93 |
)
|
| 94 |
|
| 95 |
-
# Start optimization
|
| 96 |
opt_task = asyncio.create_task(run_optimization())
|
| 97 |
|
| 98 |
-
# Yield progress updates while optimization runs
|
| 99 |
while not opt_task.done():
|
| 100 |
try:
|
| 101 |
-
|
| 102 |
-
completed, total, config_name, task_name = await asyncio.wait_for(
|
| 103 |
progress_queue.get(),
|
| 104 |
timeout=1.0,
|
| 105 |
)
|
|
@@ -108,32 +96,26 @@ class OptimizerService:
|
|
| 108 |
job_id=str(job_id),
|
| 109 |
completed=completed,
|
| 110 |
total=total,
|
| 111 |
-
|
| 112 |
current_task=task_name,
|
| 113 |
-
message=f"Running {
|
| 114 |
)
|
| 115 |
|
| 116 |
-
# Update job progress in DB
|
| 117 |
job.completed_experiments = completed
|
| 118 |
await session.commit()
|
| 119 |
|
| 120 |
except asyncio.TimeoutError:
|
| 121 |
-
# No progress update, check if task failed
|
| 122 |
if opt_task.done():
|
| 123 |
-
# Check for exception before breaking
|
| 124 |
exc = opt_task.exception()
|
| 125 |
if exc:
|
| 126 |
raise exc
|
| 127 |
continue
|
| 128 |
|
| 129 |
-
# Get final result - this will re-raise any exception from the task
|
| 130 |
opt_result = await opt_task
|
| 131 |
|
| 132 |
-
# Check if all experiments failed
|
| 133 |
if opt_result.total_experiments == 0 or len(opt_result.summaries) == 0:
|
| 134 |
-
# No successful experiments - this is a failure
|
| 135 |
job.status = JobStatus.FAILED
|
| 136 |
-
job.error = "All experiments failed. Check server logs for details.
|
| 137 |
job.completed_at = datetime.now(timezone.utc)
|
| 138 |
await session.commit()
|
| 139 |
|
|
@@ -144,12 +126,11 @@ class OptimizerService:
|
|
| 144 |
)
|
| 145 |
return
|
| 146 |
|
| 147 |
-
# Save runs to database
|
| 148 |
for summary in opt_result.summaries:
|
| 149 |
for task_result in summary.task_results:
|
| 150 |
run = ExperimentRun(
|
| 151 |
job_id=job.id,
|
| 152 |
-
|
| 153 |
task_name=task_result.task_name,
|
| 154 |
status="completed",
|
| 155 |
tokens_total=task_result.metrics.total_tokens,
|
|
@@ -171,7 +152,6 @@ class OptimizerService:
|
|
| 171 |
)
|
| 172 |
session.add(run)
|
| 173 |
|
| 174 |
-
# Update job
|
| 175 |
job.status = JobStatus.COMPLETED
|
| 176 |
job.completed_experiments = opt_result.total_experiments
|
| 177 |
job.pareto_frontier = opt_result.pareto_frontier
|
|
@@ -184,7 +164,7 @@ class OptimizerService:
|
|
| 184 |
job_id=str(job_id),
|
| 185 |
completed=opt_result.total_experiments,
|
| 186 |
total=job.total_experiments,
|
| 187 |
-
message=f"Optimization complete. Pareto
|
| 188 |
)
|
| 189 |
|
| 190 |
except Exception as e:
|
|
@@ -199,37 +179,47 @@ class OptimizerService:
|
|
| 199 |
message=f"Optimization failed: {e}",
|
| 200 |
)
|
| 201 |
|
| 202 |
-
async def
|
| 203 |
self,
|
| 204 |
session: AsyncSession,
|
| 205 |
-
|
| 206 |
-
) -> list[
|
| 207 |
-
"""Load configs from database and convert to
|
| 208 |
-
|
| 209 |
-
for
|
| 210 |
result = await session.execute(
|
| 211 |
-
select(AgentConfig).where(AgentConfig.id == UUID(
|
| 212 |
)
|
| 213 |
db_config = result.scalar_one_or_none()
|
| 214 |
if db_config:
|
| 215 |
cfg = db_config.config_json
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
name=db_config.name,
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
return configs
|
| 226 |
|
| 227 |
async def _load_tasks(
|
| 228 |
self,
|
| 229 |
session: AsyncSession,
|
| 230 |
task_ids: list[str],
|
| 231 |
) -> list[Task]:
|
| 232 |
-
"""Load tasks from database and convert to Task."""
|
| 233 |
tasks = []
|
| 234 |
for task_id in task_ids:
|
| 235 |
result = await session.execute(
|
|
|
|
| 9 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 10 |
from sqlmodel import select
|
| 11 |
|
| 12 |
+
from flow.experiments.models import Agent, Candidate, CompactionConfig
|
| 13 |
from flow.experiments.optimizer import FlowOptimizer
|
| 14 |
from flow.experiments.types import EvalCriterion, Task
|
| 15 |
|
|
|
|
| 26 |
|
| 27 |
async def run_job(self, job_id: str | UUID) -> AsyncGenerator[JobProgress, None]:
|
| 28 |
"""Run an optimization job and yield progress updates."""
|
|
|
|
| 29 |
if isinstance(job_id, str):
|
| 30 |
job_id = UUID(job_id)
|
| 31 |
|
| 32 |
async with async_session() as session:
|
|
|
|
| 33 |
result = await session.execute(
|
| 34 |
select(OptimizationJob).where(OptimizationJob.id == job_id)
|
| 35 |
)
|
|
|
|
| 42 |
)
|
| 43 |
return
|
| 44 |
|
|
|
|
| 45 |
job.status = JobStatus.RUNNING
|
| 46 |
job.started_at = datetime.now(timezone.utc)
|
| 47 |
await session.commit()
|
|
|
|
| 55 |
)
|
| 56 |
|
| 57 |
try:
|
| 58 |
+
candidates = await self._load_candidates(session, job.candidate_ids)
|
| 59 |
+
if not candidates:
|
| 60 |
+
raise ValueError("No valid candidates found")
|
|
|
|
| 61 |
|
|
|
|
| 62 |
tasks = await self._load_tasks(session, job.task_ids)
|
| 63 |
if not tasks:
|
| 64 |
raise ValueError("No valid tasks found")
|
| 65 |
|
|
|
|
| 66 |
optimizer = FlowOptimizer(
|
| 67 |
parallel=job.parallel,
|
| 68 |
use_llm_evaluator=job.use_llm_eval,
|
| 69 |
)
|
| 70 |
|
|
|
|
| 71 |
progress_queue: asyncio.Queue[tuple[int, int, str, str]] = asyncio.Queue()
|
| 72 |
|
| 73 |
def progress_callback(completed: int, total: int, config: str, task: str) -> None:
|
|
|
|
| 74 |
try:
|
| 75 |
progress_queue.put_nowait((completed, total, config, task))
|
| 76 |
except asyncio.QueueFull:
|
| 77 |
pass
|
| 78 |
|
|
|
|
| 79 |
async def run_optimization():
|
| 80 |
return await optimizer.optimize(
|
| 81 |
+
candidates=candidates,
|
| 82 |
tasks=tasks,
|
| 83 |
progress_callback=progress_callback,
|
| 84 |
)
|
| 85 |
|
|
|
|
| 86 |
opt_task = asyncio.create_task(run_optimization())
|
| 87 |
|
|
|
|
| 88 |
while not opt_task.done():
|
| 89 |
try:
|
| 90 |
+
completed, total, candidate_name, task_name = await asyncio.wait_for(
|
|
|
|
| 91 |
progress_queue.get(),
|
| 92 |
timeout=1.0,
|
| 93 |
)
|
|
|
|
| 96 |
job_id=str(job_id),
|
| 97 |
completed=completed,
|
| 98 |
total=total,
|
| 99 |
+
current_candidate=candidate_name,
|
| 100 |
current_task=task_name,
|
| 101 |
+
message=f"Running {candidate_name}/{task_name}...",
|
| 102 |
)
|
| 103 |
|
|
|
|
| 104 |
job.completed_experiments = completed
|
| 105 |
await session.commit()
|
| 106 |
|
| 107 |
except asyncio.TimeoutError:
|
|
|
|
| 108 |
if opt_task.done():
|
|
|
|
| 109 |
exc = opt_task.exception()
|
| 110 |
if exc:
|
| 111 |
raise exc
|
| 112 |
continue
|
| 113 |
|
|
|
|
| 114 |
opt_result = await opt_task
|
| 115 |
|
|
|
|
| 116 |
if opt_result.total_experiments == 0 or len(opt_result.summaries) == 0:
|
|
|
|
| 117 |
job.status = JobStatus.FAILED
|
| 118 |
+
job.error = "All experiments failed. Check server logs for details."
|
| 119 |
job.completed_at = datetime.now(timezone.utc)
|
| 120 |
await session.commit()
|
| 121 |
|
|
|
|
| 126 |
)
|
| 127 |
return
|
| 128 |
|
|
|
|
| 129 |
for summary in opt_result.summaries:
|
| 130 |
for task_result in summary.task_results:
|
| 131 |
run = ExperimentRun(
|
| 132 |
job_id=job.id,
|
| 133 |
+
candidate_name=task_result.candidate_name,
|
| 134 |
task_name=task_result.task_name,
|
| 135 |
status="completed",
|
| 136 |
tokens_total=task_result.metrics.total_tokens,
|
|
|
|
| 152 |
)
|
| 153 |
session.add(run)
|
| 154 |
|
|
|
|
| 155 |
job.status = JobStatus.COMPLETED
|
| 156 |
job.completed_experiments = opt_result.total_experiments
|
| 157 |
job.pareto_frontier = opt_result.pareto_frontier
|
|
|
|
| 164 |
job_id=str(job_id),
|
| 165 |
completed=opt_result.total_experiments,
|
| 166 |
total=job.total_experiments,
|
| 167 |
+
message=f"Optimization complete. Pareto candidates: {', '.join(opt_result.pareto_frontier)}",
|
| 168 |
)
|
| 169 |
|
| 170 |
except Exception as e:
|
|
|
|
| 179 |
message=f"Optimization failed: {e}",
|
| 180 |
)
|
| 181 |
|
| 182 |
+
async def _load_candidates(
|
| 183 |
self,
|
| 184 |
session: AsyncSession,
|
| 185 |
+
candidate_ids: list[str],
|
| 186 |
+
) -> list[Candidate]:
|
| 187 |
+
"""Load configs from database and convert to Candidate objects."""
|
| 188 |
+
candidates = []
|
| 189 |
+
for candidate_id in candidate_ids:
|
| 190 |
result = await session.execute(
|
| 191 |
+
select(AgentConfig).where(AgentConfig.id == UUID(candidate_id))
|
| 192 |
)
|
| 193 |
db_config = result.scalar_one_or_none()
|
| 194 |
if db_config:
|
| 195 |
cfg = db_config.config_json
|
| 196 |
+
|
| 197 |
+
# Build CompactionConfig from stored JSON
|
| 198 |
+
compaction_data = cfg.get("compaction", {})
|
| 199 |
+
compaction = CompactionConfig(
|
| 200 |
+
strategy=compaction_data.get("strategy", "head_tail"),
|
| 201 |
+
params=compaction_data.get("params", {"head_size": 10, "tail_size": 40}),
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Get tools configuration (can be str, list, or dict)
|
| 205 |
+
tools = cfg.get("tools", "standard")
|
| 206 |
+
|
| 207 |
+
agent = Agent(
|
| 208 |
name=db_config.name,
|
| 209 |
+
instructions=cfg.get("instructions"),
|
| 210 |
+
model=cfg.get("model"),
|
| 211 |
+
compaction=compaction,
|
| 212 |
+
tools=tools,
|
| 213 |
+
)
|
| 214 |
+
candidates.append(Candidate(agent=agent))
|
| 215 |
+
return candidates
|
|
|
|
| 216 |
|
| 217 |
async def _load_tasks(
|
| 218 |
self,
|
| 219 |
session: AsyncSession,
|
| 220 |
task_ids: list[str],
|
| 221 |
) -> list[Task]:
|
| 222 |
+
"""Load tasks from database and convert to Task objects."""
|
| 223 |
tasks = []
|
| 224 |
for task_id in task_ids:
|
| 225 |
result = await session.execute(
|
src/flow/ui/tests/test_e2e_user_journey.py
CHANGED
|
@@ -138,7 +138,7 @@ class TestE2EUserJourney:
|
|
| 138 |
|
| 139 |
job_data = {
|
| 140 |
"name": "E2E Test Optimization",
|
| 141 |
-
"
|
| 142 |
"task_ids": created_task_ids[:2], # Use first 2 tasks
|
| 143 |
"parallel": 2,
|
| 144 |
"use_llm_eval": False,
|
|
@@ -150,7 +150,7 @@ class TestE2EUserJourney:
|
|
| 150 |
print(f" ✓ Created job: {job['name']} (id: {job['id'][:8]}...)")
|
| 151 |
print(f" - Status: {job['status']}")
|
| 152 |
print(f" - Total experiments: {job['total_experiments']}")
|
| 153 |
-
print(f" -
|
| 154 |
|
| 155 |
# ========================================
|
| 156 |
# STEP 5: Get Job Details
|
|
@@ -284,7 +284,7 @@ class TestE2EUserJourney:
|
|
| 284 |
# Test creating job with non-existent config
|
| 285 |
job_data = {
|
| 286 |
"name": "Invalid Job",
|
| 287 |
-
"
|
| 288 |
"task_ids": ["00000000-0000-0000-0000-000000000001"],
|
| 289 |
}
|
| 290 |
resp = await client.post("/api/jobs", json=job_data)
|
|
@@ -403,7 +403,7 @@ class TestAPIEndpoints:
|
|
| 403 |
"/api/jobs",
|
| 404 |
json={
|
| 405 |
"name": "test-job",
|
| 406 |
-
"
|
| 407 |
"task_ids": [task["id"]],
|
| 408 |
},
|
| 409 |
)
|
|
@@ -481,7 +481,7 @@ class TestAPIEndpoints:
|
|
| 481 |
"/api/jobs",
|
| 482 |
json={
|
| 483 |
"name": "start-test-job",
|
| 484 |
-
"
|
| 485 |
"task_ids": [task["id"]],
|
| 486 |
"parallel": 1,
|
| 487 |
},
|
|
@@ -593,7 +593,7 @@ class TestAPIEndpoints:
|
|
| 593 |
"/api/jobs",
|
| 594 |
json={
|
| 595 |
"name": "reset-test-job",
|
| 596 |
-
"
|
| 597 |
"task_ids": [task["id"]],
|
| 598 |
},
|
| 599 |
)
|
|
|
|
| 138 |
|
| 139 |
job_data = {
|
| 140 |
"name": "E2E Test Optimization",
|
| 141 |
+
"candidate_ids": created_agent_ids,
|
| 142 |
"task_ids": created_task_ids[:2], # Use first 2 tasks
|
| 143 |
"parallel": 2,
|
| 144 |
"use_llm_eval": False,
|
|
|
|
| 150 |
print(f" ✓ Created job: {job['name']} (id: {job['id'][:8]}...)")
|
| 151 |
print(f" - Status: {job['status']}")
|
| 152 |
print(f" - Total experiments: {job['total_experiments']}")
|
| 153 |
+
print(f" - Candidates: {len(job['candidate_ids'])}, Tasks: {len(job['task_ids'])}")
|
| 154 |
|
| 155 |
# ========================================
|
| 156 |
# STEP 5: Get Job Details
|
|
|
|
| 284 |
# Test creating job with non-existent config
|
| 285 |
job_data = {
|
| 286 |
"name": "Invalid Job",
|
| 287 |
+
"candidate_ids": ["00000000-0000-0000-0000-000000000000"],
|
| 288 |
"task_ids": ["00000000-0000-0000-0000-000000000001"],
|
| 289 |
}
|
| 290 |
resp = await client.post("/api/jobs", json=job_data)
|
|
|
|
| 403 |
"/api/jobs",
|
| 404 |
json={
|
| 405 |
"name": "test-job",
|
| 406 |
+
"candidate_ids": [config["id"]],
|
| 407 |
"task_ids": [task["id"]],
|
| 408 |
},
|
| 409 |
)
|
|
|
|
| 481 |
"/api/jobs",
|
| 482 |
json={
|
| 483 |
"name": "start-test-job",
|
| 484 |
+
"candidate_ids": [config["id"]],
|
| 485 |
"task_ids": [task["id"]],
|
| 486 |
"parallel": 1,
|
| 487 |
},
|
|
|
|
| 593 |
"/api/jobs",
|
| 594 |
json={
|
| 595 |
"name": "reset-test-job",
|
| 596 |
+
"candidate_ids": [config["id"]],
|
| 597 |
"task_ids": [task["id"]],
|
| 598 |
},
|
| 599 |
)
|
src/flow/ui/ui/assets/index-2zMAgGgo.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/flow/ui/ui/assets/index-BG9n9RHB.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/flow/ui/ui/assets/index-BHAF8mLj.css
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: 
;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:JetBrains 
Mono,ui-monospace,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}.pointer-events-none{pointer-events:none}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.sticky{position:sticky}.inset-0{top:0;right:0;bottom:0;left:0}.bottom-0{bottom:0}.left-0{left:0}.left-3{left:.75rem}.top-0{top:0}.top-1\/2{top:50%}.z-10{z-index:10}.z-50{z-index:50}.mx-0\.5{margin-left:.125rem;m
argin-right:.125rem}.mx-4{margin-left:1rem;margin-right:1rem}.mx-auto{margin-left:auto;margin-right:auto}.-mt-1{margin-top:-.25rem}.mb-1{margin-bottom:.25rem}.mb-2{margin-bottom:.5rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.mb-8{margin-bottom:2rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.ml-6{margin-left:1.5rem}.mr-1{margin-right:.25rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.line-clamp-2{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:2}.line-clamp-3{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:3}.block{display:block}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-1\.5{height:.375rem}.h-12{height:3rem}.h-2{height:.5rem}.h-3{height:.75rem}.h-32{height:8rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-8{height:2rem}.h-full{height:100%}.max-h-32{max-height:8rem}.max-h-40{max-height:10rem}.max-h-48{max-height:12rem}.max-h-96{max-height:24rem}.max-h-\[80vh\]{max-height:80vh}.min-h-\[100px\]{min-height:100px}.min-h-screen{min-height:100vh}.w-12{width:3rem}.w-2{width:.5rem}.w-20{width:5rem}.w-24{width:6rem}.w-3{width:.75rem}.w-32{width:8rem}.w-4{width:1rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-8{width:2rem}.w-full{width:100%}.min-w-0{min-width:0px}.min-w-\[90px\]{min-width:90px}.max-w-7xl{max-width:80rem}.max-w-lg{max-width:32rem}.max-w-md{max-width:28rem}.flex-1{flex:1 1 0%}.flex-shrink-0{flex-shrink:0}.-translate-y-1\/2{--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.rotate-180{--tw-rotate: 
180deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear infinite}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-y{resize:vertical}.resize{resize:both}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-1{row-gap:.25rem}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1rem * 
var(--tw-space-y-reverse))}.space-y-6>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1.5rem * var(--tw-space-y-reverse))}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.break-all{word-break:break-all}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-l-2{border-left-width:2px}.border-t{border-top-width:1px}.border-dashed{border-style:dashed}.border-\[var\(--accent\)\]{border-color:var(--accent)}.border-\[var\(--border\)\]{border-color:var(--border)}.border-blue-500\/30{border-color:#3b82f64d}.border-green-500\/30{border-color:#22c55e4d}.border-red-500\/30{border-color:#ef44444d}.border-red-500\/50{border-color:#ef444480}.bg-\[var\(--accent\)\]{background-color:var(--accent)}.bg-\[var\(--bg-primary\)\]{background-color:var(--bg-primary)}.bg-\[var\(--bg-secondary\)\]{background-color:var(--bg-secondary)}.bg-\[var\(--bg-tertiary\)\]{background-color:var(--bg-tertiary)}.bg-\[var\(--error\)\]{background-color:var(--error)}.bg-black\/80{background-color:#000c}.bg-blue-100{--tw-bg-opacity: 1;background-color:rgb(219 234 254 / var(--tw-bg-opacity, 1))}.bg-blue-400{--tw-bg-opacity: 1;background-color:rgb(96 165 250 / var(--tw-bg-opacity, 1))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity, 1))}.bg-blue-500\/10{background-color:#3b82f61a}.bg-blue-600{--tw-bg-opacity: 1;background-color:rgb(37 99 235 / var(--tw-bg-opacity, 1))}.bg-emerald-500{--tw-bg-opacity: 1;background-color:rgb(16 185 129 / var(--tw-bg-opacity, 1))}.bg-green-100{--tw-bg-opacity: 1;background-color:rgb(220 252 231 / var(--tw-bg-opacity, 
1))}.bg-green-400{--tw-bg-opacity: 1;background-color:rgb(74 222 128 / var(--tw-bg-opacity, 1))}.bg-green-500{--tw-bg-opacity: 1;background-color:rgb(34 197 94 / var(--tw-bg-opacity, 1))}.bg-green-500\/10{background-color:#22c55e1a}.bg-green-500\/20{background-color:#22c55e33}.bg-green-600{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 1))}.bg-orange-100{--tw-bg-opacity: 1;background-color:rgb(255 237 213 / var(--tw-bg-opacity, 1))}.bg-purple-100{--tw-bg-opacity: 1;background-color:rgb(243 232 255 / var(--tw-bg-opacity, 1))}.bg-red-100{--tw-bg-opacity: 1;background-color:rgb(254 226 226 / var(--tw-bg-opacity, 1))}.bg-red-500{--tw-bg-opacity: 1;background-color:rgb(239 68 68 / var(--tw-bg-opacity, 1))}.bg-red-500\/10{background-color:#ef44441a}.bg-red-600{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.bg-yellow-500{--tw-bg-opacity: 1;background-color:rgb(234 179 8 / var(--tw-bg-opacity, 1))}.p-1{padding:.25rem}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-1\.5{padding-left:.375rem;padding-right:.375rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-1\.5{padding-top:.375rem;padding-bottom:.375rem}.py-12{padding-top:3rem;padding-bottom:3rem}.py-16{padding-top:4rem;padding-bottom:4rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-4{padding-top:1rem;padding-bottom:1rem}.py-8{padding-top:2rem;padding-bottom:2rem}.pb-1{padding-bottom:.25rem}.pb-2{padding-bottom:.5rem}.pl-10{padding-left:2.5rem}.pr-3{padding-right:.75rem}.pr-4{padding-right:1rem}.pt-2{padding-top:.5rem}.pt-3{padding-top:.75rem}.pt-4{padding-top:1rem}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right
}.font-mono{font-family:JetBrains Mono,ui-monospace,monospace}.text-2xl{font-size:1.5rem;line-height:2rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.tracking-wide{letter-spacing:.025em}.tracking-wider{letter-spacing:.05em}.text-\[var\(--accent\)\]{color:var(--accent)}.text-\[var\(--error\)\]{color:var(--error)}.text-\[var\(--text-primary\)\]{color:var(--text-primary)}.text-\[var\(--text-secondary\)\]{color:var(--text-secondary)}.text-\[var\(--text-tertiary\)\]{color:var(--text-tertiary)}.text-black{--tw-text-opacity: 1;color:rgb(0 0 0 / var(--tw-text-opacity, 1))}.text-blue-400{--tw-text-opacity: 1;color:rgb(96 165 250 / var(--tw-text-opacity, 1))}.text-blue-800{--tw-text-opacity: 1;color:rgb(30 64 175 / var(--tw-text-opacity, 1))}.text-emerald-400{--tw-text-opacity: 1;color:rgb(52 211 153 / var(--tw-text-opacity, 1))}.text-green-400{--tw-text-opacity: 1;color:rgb(74 222 128 / var(--tw-text-opacity, 1))}.text-green-500{--tw-text-opacity: 1;color:rgb(34 197 94 / var(--tw-text-opacity, 1))}.text-green-800{--tw-text-opacity: 1;color:rgb(22 101 52 / var(--tw-text-opacity, 1))}.text-orange-800{--tw-text-opacity: 1;color:rgb(154 52 18 / var(--tw-text-opacity, 1))}.text-purple-400{--tw-text-opacity: 1;color:rgb(192 132 252 / var(--tw-text-opacity, 1))}.text-purple-800{--tw-text-opacity: 1;color:rgb(107 33 168 / var(--tw-text-opacity, 1))}.text-red-400{--tw-text-opacity: 1;color:rgb(248 113 113 / var(--tw-text-opacity, 1))}.text-red-800{--tw-text-opacity: 1;color:rgb(153 27 27 / var(--tw-text-opacity, 1))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.accent-\[var\(--accent\)\]{accent-color:var(--accent)}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px 
-4px rgb(0 0 0 / .1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-300{transition-duration:.3s}:root{--bg-primary: #0a0a0a;--bg-secondary: #141414;--bg-tertiary: #1a1a1a;--text-primary: #f5f5f5;--text-secondary: #a3a3a3;--accent: #22c55e;--accent-dim: #166534;--border: #262626;--error: #ef4444}[data-theme=light]{--bg-primary: #ffffff;--bg-secondary: #f7f8f9;--bg-tertiary: #eef0f2;--text-primary: #1a1a1a;--text-secondary: #4a4a4a;--accent: #16a34a;--accent-dim: #dcfce7;--border: #d1d5db;--error: #dc2626}*{box-sizing:border-box}body{margin:0;background-color:var(--bg-primary);color:var(--text-primary);font-family:JetBrains Mono,ui-monospace,monospace;font-size:14px;line-height:1.6}::-webkit-scrollbar{width:8px;height:8px}::-webkit-scrollbar-track{background:var(--bg-secondary)}::-webkit-scrollbar-thumb{background:var(--border);border-radius:4px}::-webkit-scrollbar-thumb:hover{background:#404040}[data-theme=light] ::-webkit-scrollbar-thumb:hover{background:silver}.last\:border-0:last-child{border-width:0px}.hover\:border-\[var\(--accent-dim\)\]:hover{border-color:var(--accent-dim)}.hover\:bg-\[\#16a34a\]:hover{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 
1))}.hover\:bg-\[var\(--bg-primary\)\]:hover{background-color:var(--bg-primary)}.hover\:bg-\[var\(--bg-tertiary\)\]:hover{background-color:var(--bg-tertiary)}.hover\:bg-\[var\(--border\)\]:hover{background-color:var(--border)}.hover\:bg-red-600:hover{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.hover\:text-\[var\(--accent\)\]:hover{color:var(--accent)}.hover\:text-\[var\(--text-primary\)\]:hover{color:var(--text-primary)}.hover\:opacity-80:hover{opacity:.8}.focus\:border-\[var\(--accent\)\]:focus{border-color:var(--accent)}.focus\:outline-none:focus{outline:2px solid transparent;outline-offset:2px}.focus\:ring-2:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-\[var\(--accent\)\]:focus{--tw-ring-color: var(--accent)}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}@media (min-width: 768px){.md\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}}@media (min-width: 1024px){.lg\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (min-width: 1280px){.xl\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (prefers-color-scheme: dark){.dark\:bg-blue-900{--tw-bg-opacity: 1;background-color:rgb(30 58 138 / var(--tw-bg-opacity, 1))}.dark\:bg-green-900{--tw-bg-opacity: 1;background-color:rgb(20 83 45 / var(--tw-bg-opacity, 1))}.dark\:bg-orange-900{--tw-bg-opacity: 1;background-color:rgb(124 45 18 / var(--tw-bg-opacity, 1))}.dark\:bg-purple-900{--tw-bg-opacity: 1;background-color:rgb(88 28 135 / var(--tw-bg-opacity, 1))}.dark\:bg-red-900{--tw-bg-opacity: 1;background-color:rgb(127 29 29 / var(--tw-bg-opacity, 1))}.dark\:text-blue-200{--tw-text-opacity: 1;color:rgb(191 219 254 / 
var(--tw-text-opacity, 1))}.dark\:text-green-200{--tw-text-opacity: 1;color:rgb(187 247 208 / var(--tw-text-opacity, 1))}.dark\:text-orange-200{--tw-text-opacity: 1;color:rgb(254 215 170 / var(--tw-text-opacity, 1))}.dark\:text-purple-200{--tw-text-opacity: 1;color:rgb(233 213 255 / var(--tw-text-opacity, 1))}.dark\:text-red-200{--tw-text-opacity: 1;color:rgb(254 202 202 / var(--tw-text-opacity, 1))}}
|
src/flow/ui/ui/assets/index-Bx-_JS_6.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/flow/ui/ui/assets/index-VFZIS3uv.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/flow/ui/ui/assets/index-_IRgS-wR.css
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: 
;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:JetBrains 
Mono,ui-monospace,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}.pointer-events-none{pointer-events:none}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.sticky{position:sticky}.inset-0{top:0;right:0;bottom:0;left:0}.bottom-0{bottom:0}.left-0{left:0}.left-3{left:.75rem}.top-0{top:0}.top-1\/2{top:50%}.z-10{z-index:10}.z-50{z-index:50}.mx-0\.5{margin-left:.125rem;m
argin-right:.125rem}.mx-4{margin-left:1rem;margin-right:1rem}.mx-auto{margin-left:auto;margin-right:auto}.-mt-1{margin-top:-.25rem}.mb-1{margin-bottom:.25rem}.mb-2{margin-bottom:.5rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.mb-8{margin-bottom:2rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.ml-6{margin-left:1.5rem}.mr-1{margin-right:.25rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.line-clamp-2{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:2}.line-clamp-3{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:3}.block{display:block}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-1\.5{height:.375rem}.h-12{height:3rem}.h-2{height:.5rem}.h-3{height:.75rem}.h-32{height:8rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-8{height:2rem}.h-full{height:100%}.max-h-32{max-height:8rem}.max-h-40{max-height:10rem}.max-h-48{max-height:12rem}.max-h-96{max-height:24rem}.max-h-\[80vh\]{max-height:80vh}.min-h-\[100px\]{min-height:100px}.min-h-screen{min-height:100vh}.w-12{width:3rem}.w-2{width:.5rem}.w-20{width:5rem}.w-24{width:6rem}.w-3{width:.75rem}.w-32{width:8rem}.w-4{width:1rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-8{width:2rem}.w-full{width:100%}.min-w-0{min-width:0px}.min-w-\[90px\]{min-width:90px}.max-w-7xl{max-width:80rem}.max-w-lg{max-width:32rem}.max-w-md{max-width:28rem}.flex-1{flex:1 1 0%}.flex-shrink-0{flex-shrink:0}.-translate-y-1\/2{--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.rotate-180{--tw-rotate: 
180deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear infinite}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-y{resize:vertical}.resize{resize:both}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-1{row-gap:.25rem}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1rem * 
var(--tw-space-y-reverse))}.space-y-6>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1.5rem * var(--tw-space-y-reverse))}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.break-all{word-break:break-all}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-l-2{border-left-width:2px}.border-t{border-top-width:1px}.border-dashed{border-style:dashed}.border-\[var\(--accent\)\]{border-color:var(--accent)}.border-\[var\(--border\)\]{border-color:var(--border)}.border-blue-500\/30{border-color:#3b82f64d}.border-green-500\/30{border-color:#22c55e4d}.border-red-500\/30{border-color:#ef44444d}.border-red-500\/50{border-color:#ef444480}.bg-\[var\(--accent\)\]{background-color:var(--accent)}.bg-\[var\(--bg-primary\)\]{background-color:var(--bg-primary)}.bg-\[var\(--bg-secondary\)\]{background-color:var(--bg-secondary)}.bg-\[var\(--bg-tertiary\)\]{background-color:var(--bg-tertiary)}.bg-\[var\(--error\)\]{background-color:var(--error)}.bg-black\/80{background-color:#000c}.bg-blue-100{--tw-bg-opacity: 1;background-color:rgb(219 234 254 / var(--tw-bg-opacity, 1))}.bg-blue-400{--tw-bg-opacity: 1;background-color:rgb(96 165 250 / var(--tw-bg-opacity, 1))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity, 1))}.bg-blue-500\/10{background-color:#3b82f61a}.bg-blue-600{--tw-bg-opacity: 1;background-color:rgb(37 99 235 / var(--tw-bg-opacity, 1))}.bg-emerald-500{--tw-bg-opacity: 1;background-color:rgb(16 185 129 / var(--tw-bg-opacity, 1))}.bg-green-100{--tw-bg-opacity: 1;background-color:rgb(220 252 231 / var(--tw-bg-opacity, 
1))}.bg-green-400{--tw-bg-opacity: 1;background-color:rgb(74 222 128 / var(--tw-bg-opacity, 1))}.bg-green-500{--tw-bg-opacity: 1;background-color:rgb(34 197 94 / var(--tw-bg-opacity, 1))}.bg-green-500\/10{background-color:#22c55e1a}.bg-green-500\/20{background-color:#22c55e33}.bg-green-600{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 1))}.bg-orange-100{--tw-bg-opacity: 1;background-color:rgb(255 237 213 / var(--tw-bg-opacity, 1))}.bg-purple-100{--tw-bg-opacity: 1;background-color:rgb(243 232 255 / var(--tw-bg-opacity, 1))}.bg-red-100{--tw-bg-opacity: 1;background-color:rgb(254 226 226 / var(--tw-bg-opacity, 1))}.bg-red-500{--tw-bg-opacity: 1;background-color:rgb(239 68 68 / var(--tw-bg-opacity, 1))}.bg-red-500\/10{background-color:#ef44441a}.bg-red-600{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.bg-yellow-500{--tw-bg-opacity: 1;background-color:rgb(234 179 8 / var(--tw-bg-opacity, 1))}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-1\.5{padding-left:.375rem;padding-right:.375rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-1\.5{padding-top:.375rem;padding-bottom:.375rem}.py-12{padding-top:3rem;padding-bottom:3rem}.py-16{padding-top:4rem;padding-bottom:4rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-4{padding-top:1rem;padding-bottom:1rem}.py-8{padding-top:2rem;padding-bottom:2rem}.pb-1{padding-bottom:.25rem}.pb-2{padding-bottom:.5rem}.pl-10{padding-left:2.5rem}.pr-3{padding-right:.75rem}.pr-4{padding-right:1rem}.pt-2{padding-top:.5rem}.pt-3{padding-top:.75rem}.pt-4{padding-top:1rem}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right}.font-mono{font-fam
ily:JetBrains Mono,ui-monospace,monospace}.text-2xl{font-size:1.5rem;line-height:2rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.tracking-wide{letter-spacing:.025em}.tracking-wider{letter-spacing:.05em}.text-\[var\(--accent\)\]{color:var(--accent)}.text-\[var\(--error\)\]{color:var(--error)}.text-\[var\(--text-primary\)\]{color:var(--text-primary)}.text-\[var\(--text-secondary\)\]{color:var(--text-secondary)}.text-\[var\(--text-tertiary\)\]{color:var(--text-tertiary)}.text-black{--tw-text-opacity: 1;color:rgb(0 0 0 / var(--tw-text-opacity, 1))}.text-blue-400{--tw-text-opacity: 1;color:rgb(96 165 250 / var(--tw-text-opacity, 1))}.text-blue-800{--tw-text-opacity: 1;color:rgb(30 64 175 / var(--tw-text-opacity, 1))}.text-emerald-400{--tw-text-opacity: 1;color:rgb(52 211 153 / var(--tw-text-opacity, 1))}.text-green-400{--tw-text-opacity: 1;color:rgb(74 222 128 / var(--tw-text-opacity, 1))}.text-green-500{--tw-text-opacity: 1;color:rgb(34 197 94 / var(--tw-text-opacity, 1))}.text-green-800{--tw-text-opacity: 1;color:rgb(22 101 52 / var(--tw-text-opacity, 1))}.text-orange-800{--tw-text-opacity: 1;color:rgb(154 52 18 / var(--tw-text-opacity, 1))}.text-purple-400{--tw-text-opacity: 1;color:rgb(192 132 252 / var(--tw-text-opacity, 1))}.text-purple-800{--tw-text-opacity: 1;color:rgb(107 33 168 / var(--tw-text-opacity, 1))}.text-red-400{--tw-text-opacity: 1;color:rgb(248 113 113 / var(--tw-text-opacity, 1))}.text-red-800{--tw-text-opacity: 1;color:rgb(153 27 27 / var(--tw-text-opacity, 1))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.accent-\[var\(--accent\)\]{accent-color:var(--accent)}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px -4px rgb(0 0 0 / 
.1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-300{transition-duration:.3s}:root{--bg-primary: #0a0a0a;--bg-secondary: #141414;--bg-tertiary: #1a1a1a;--text-primary: #f5f5f5;--text-secondary: #a3a3a3;--accent: #22c55e;--accent-dim: #166534;--border: #262626;--error: #ef4444}[data-theme=light]{--bg-primary: #ffffff;--bg-secondary: #f7f8f9;--bg-tertiary: #eef0f2;--text-primary: #1a1a1a;--text-secondary: #4a4a4a;--accent: #16a34a;--accent-dim: #dcfce7;--border: #d1d5db;--error: #dc2626}*{box-sizing:border-box}body{margin:0;background-color:var(--bg-primary);color:var(--text-primary);font-family:JetBrains Mono,ui-monospace,monospace;font-size:14px;line-height:1.6}::-webkit-scrollbar{width:8px;height:8px}::-webkit-scrollbar-track{background:var(--bg-secondary)}::-webkit-scrollbar-thumb{background:var(--border);border-radius:4px}::-webkit-scrollbar-thumb:hover{background:#404040}[data-theme=light] ::-webkit-scrollbar-thumb:hover{background:silver}.last\:border-0:last-child{border-width:0px}.hover\:border-\[var\(--accent-dim\)\]:hover{border-color:var(--accent-dim)}.hover\:bg-\[\#16a34a\]:hover{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 
1))}.hover\:bg-\[var\(--bg-primary\)\]:hover{background-color:var(--bg-primary)}.hover\:bg-\[var\(--bg-tertiary\)\]:hover{background-color:var(--bg-tertiary)}.hover\:bg-\[var\(--border\)\]:hover{background-color:var(--border)}.hover\:bg-red-600:hover{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.hover\:text-\[var\(--accent\)\]:hover{color:var(--accent)}.hover\:text-\[var\(--text-primary\)\]:hover{color:var(--text-primary)}.hover\:opacity-80:hover{opacity:.8}.focus\:border-\[var\(--accent\)\]:focus{border-color:var(--accent)}.focus\:outline-none:focus{outline:2px solid transparent;outline-offset:2px}.focus\:ring-2:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-\[var\(--accent\)\]:focus{--tw-ring-color: var(--accent)}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}@media (min-width: 768px){.md\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}}@media (min-width: 1024px){.lg\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (min-width: 1280px){.xl\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}}@media (prefers-color-scheme: dark){.dark\:bg-blue-900{--tw-bg-opacity: 1;background-color:rgb(30 58 138 / var(--tw-bg-opacity, 1))}.dark\:bg-green-900{--tw-bg-opacity: 1;background-color:rgb(20 83 45 / var(--tw-bg-opacity, 1))}.dark\:bg-orange-900{--tw-bg-opacity: 1;background-color:rgb(124 45 18 / var(--tw-bg-opacity, 1))}.dark\:bg-purple-900{--tw-bg-opacity: 1;background-color:rgb(88 28 135 / var(--tw-bg-opacity, 1))}.dark\:bg-red-900{--tw-bg-opacity: 1;background-color:rgb(127 29 29 / var(--tw-bg-opacity, 1))}.dark\:text-blue-200{--tw-text-opacity: 1;color:rgb(191 219 254 / 
var(--tw-text-opacity, 1))}.dark\:text-green-200{--tw-text-opacity: 1;color:rgb(187 247 208 / var(--tw-text-opacity, 1))}.dark\:text-orange-200{--tw-text-opacity: 1;color:rgb(254 215 170 / var(--tw-text-opacity, 1))}.dark\:text-purple-200{--tw-text-opacity: 1;color:rgb(233 213 255 / var(--tw-text-opacity, 1))}.dark\:text-red-200{--tw-text-opacity: 1;color:rgb(254 202 202 / var(--tw-text-opacity, 1))}}
|
src/flow/ui/ui/index.html
CHANGED
|
@@ -8,8 +8,8 @@
|
|
| 8 |
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700&display=swap" rel="stylesheet">
|
| 11 |
-
<script type="module" crossorigin src="/assets/index-
|
| 12 |
-
<link rel="stylesheet" crossorigin href="/assets/index-
|
| 13 |
</head>
|
| 14 |
<body>
|
| 15 |
<div id="root"></div>
|
|
|
|
| 8 |
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700&display=swap" rel="stylesheet">
|
| 11 |
+
<script type="module" crossorigin src="/assets/index-2zMAgGgo.js"></script>
|
| 12 |
+
<link rel="stylesheet" crossorigin href="/assets/index-BHAF8mLj.css">
|
| 13 |
</head>
|
| 14 |
<body>
|
| 15 |
<div id="root"></div>
|