victordibia commited on
Commit
034c2ac
·
1 Parent(s): 34635fd

Deploy 2026-01-26 07:50:36

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +54 -0
  2. Dockerfile +69 -0
  3. README.md +195 -0
  4. pyproject.toml +180 -0
  5. src/flow/__init__.py +26 -0
  6. src/flow/cli/__init__.py +11 -0
  7. src/flow/cli/app.py +216 -0
  8. src/flow/cli/optimize.py +332 -0
  9. src/flow/cli/output.py +99 -0
  10. src/flow/cli/repl.py +153 -0
  11. src/flow/experiments/__init__.py +204 -0
  12. src/flow/experiments/ablation.py +472 -0
  13. src/flow/experiments/config_export.py +184 -0
  14. src/flow/experiments/evaluators/__init__.py +17 -0
  15. src/flow/experiments/evaluators/base.py +32 -0
  16. src/flow/experiments/evaluators/composite.py +80 -0
  17. src/flow/experiments/evaluators/heuristic.py +193 -0
  18. src/flow/experiments/evaluators/llm.py +223 -0
  19. src/flow/experiments/evaluators/trace.py +149 -0
  20. src/flow/experiments/metrics.py +267 -0
  21. src/flow/experiments/optimizer.py +547 -0
  22. src/flow/experiments/reporters/__init__.py +17 -0
  23. src/flow/experiments/reporters/console_reporter.py +135 -0
  24. src/flow/experiments/reporters/json_reporter.py +133 -0
  25. src/flow/experiments/runner.py +243 -0
  26. src/flow/experiments/trace_collector.py +104 -0
  27. src/flow/experiments/types.py +266 -0
  28. src/flow/harness/__init__.py +18 -0
  29. src/flow/harness/base.py +110 -0
  30. src/flow/harness/maf/__init__.py +14 -0
  31. src/flow/harness/maf/agent.py +176 -0
  32. src/flow/harness/maf/harness.py +258 -0
  33. src/flow/harness/maf/message_store.py +177 -0
  34. src/flow/prompts.py +407 -0
  35. src/flow/py.typed +0 -0
  36. src/flow/tools/__init__.py +172 -0
  37. src/flow/tools/coding.py +391 -0
  38. src/flow/tools/core.py +100 -0
  39. src/flow/tools/execution.py +479 -0
  40. src/flow/tools/memory.py +260 -0
  41. src/flow/tools/sub_agent.py +188 -0
  42. src/flow/ui/__init__.py +2 -0
  43. src/flow/ui/api/__init__.py +14 -0
  44. src/flow/ui/api/configs.py +121 -0
  45. src/flow/ui/api/jobs.py +169 -0
  46. src/flow/ui/api/runs.py +157 -0
  47. src/flow/ui/api/tasks.py +119 -0
  48. src/flow/ui/database.py +30 -0
  49. src/flow/ui/main.py +94 -0
  50. src/flow/ui/models/__init__.py +15 -0
.dockerignore ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is relative to the build context (repo root)
2
+
3
+ # Git
4
+ .git
5
+ .gitignore
6
+
7
+ # Python
8
+ __pycache__
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+ .Python
13
+ .venv
14
+ venv
15
+ ENV
16
+ .eggs
17
+ *.egg-info
18
+ dist
19
+ build
20
+
21
+ # Testing/Dev
22
+ .pytest_cache
23
+ .coverage
24
+ htmlcov
25
+ .mypy_cache
26
+ .ruff_cache
27
+ .pyright
28
+
29
+ # IDE
30
+ .vscode
31
+ .idea
32
+ *.swp
33
+ *.swo
34
+
35
+ # Frontend source (built files are already in src/flow/ui/ui/)
36
+ app/frontend/node_modules
37
+ app/frontend/src
38
+ app/frontend/*.json
39
+ app/frontend/*.ts
40
+ app/frontend/*.js
41
+ app/frontend/*.md
42
+ app/frontend/.vite
43
+
44
+ # Docs and deploy folder itself
45
+ docs
46
+ deploy
47
+
48
+ # Local env files (pass via docker env instead)
49
+ .env
50
+ .env.*
51
+ !.env.example
52
+
53
+ # Tests (not needed in production)
54
+ tests
Dockerfile ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Flow UI Container
2
+ # Production-ready deployment with uvicorn workers
3
+
4
+ FROM python:3.11-slim AS base
5
+
6
+ WORKDIR /app
7
+
8
+ # Install system dependencies
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ git \
11
+ curl \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Install uv for fast dependency management
15
+ RUN pip install --no-cache-dir uv
16
+
17
+ # -------------------------------------------------------------------
18
+ # Builder stage: install dependencies
19
+ # -------------------------------------------------------------------
20
+ FROM base AS builder
21
+
22
+ # Copy only dependency files first (better layer caching)
23
+ COPY pyproject.toml uv.lock ./
24
+
25
+ # Install dependencies to system (no venv needed in container)
26
+ RUN uv pip install --system .
27
+
28
+ # -------------------------------------------------------------------
29
+ # Final stage: copy app and run
30
+ # -------------------------------------------------------------------
31
+ FROM base AS final
32
+
33
+ # Copy installed packages from builder
34
+ COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
35
+ COPY --from=builder /usr/local/bin /usr/local/bin
36
+
37
+ # Copy application source (includes pre-built frontend in src/flow/ui/ui/)
38
+ COPY src/ ./src/
39
+
40
+ # Install the app itself (editable, uses already-installed deps)
41
+ RUN uv pip install --system --no-deps -e .
42
+
43
+ # Create non-root user for security
44
+ RUN useradd --create-home --shell /bin/bash flowuser
45
+ RUN mkdir -p /app/data && chown -R flowuser:flowuser /app
46
+ USER flowuser
47
+
48
+ # Configuration
49
+ ENV PORT=7860
50
+ ENV FLOW_DATA_DIR=/app/data
51
+ ENV UVICORN_WORKERS=2
52
+
53
+ # Expose the port
54
+ EXPOSE ${PORT}
55
+
56
+ # Health check - matches the actual endpoint in main.py
57
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
58
+ CMD curl -f http://localhost:${PORT}/api/health || exit 1
59
+
60
+ # Production uvicorn with multiple workers
61
+ # - workers: handle concurrent requests (CPU-bound, use 2-4 for most cases)
62
+ # - For I/O bound (which this is), uvicorn's async handles concurrency well
63
+ # - limit-concurrency prevents overload
64
+ CMD uvicorn flow.ui.main:app \
65
+ --host 0.0.0.0 \
66
+ --port ${PORT} \
67
+ --workers ${UVICORN_WORKERS} \
68
+ --limit-concurrency 100 \
69
+ --timeout-keep-alive 30
README.md ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Flow - Autonomous Coding Agent
3
+ emoji: 🔄
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # Flow
12
+
13
+ **Autonomous Coding Agent with a Polished CLI**
14
+
15
+ Flow is a standalone coding agent that can read, write, and execute code autonomously. It features a clean CLI interface similar to Claude Code, with support for multiple agent runtime harnesses.
16
+
17
+ ## Features
18
+
19
+ - **Autonomous Execution**: Flow doesn't just tell you what to do—it does it. Write code, run tests, fix errors, iterate.
20
+ - **Rich CLI**: Interactive REPL with streaming output, tool call visualization, and syntax highlighting.
21
+ - **Pluggable Harnesses**: Swap out the underlying agent runtime (Microsoft Agent Framework, OpenAI Swarm, etc.)
22
+ - **Persistent Memory**: Remember patterns, decisions, and context across sessions.
23
+ - **Workspace Isolation**: Secure file operations within a sandboxed workspace.
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ # Basic installation
29
+ pip install flow-agent
30
+
31
 + # Microsoft Agent Framework support is included in the base install
32
 + pip install flow-agent
33
+
34
+ # With all optional features
35
+ pip install flow-agent[all]
36
+
37
+ # Development installation
38
+ pip install flow-agent[dev]
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ### 1. Configure Azure OpenAI
44
+
45
+ ```bash
46
+ export AZURE_OPENAI_API_KEY="your-api-key"
47
+ export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
48
+ export AZURE_OPENAI_DEPLOYMENT="gpt-4o"
49
+ ```
50
+
51
+ ### 2. Initialize Flow
52
+
53
+ ```bash
54
+ flow init
55
+ ```
56
+
57
+ ### 3. Run a Task
58
+
59
+ ```bash
60
+ # Single task
61
+ flow run "Create a Python script that calculates fibonacci numbers"
62
+
63
+ # Interactive mode
64
+ flow run -i
65
+ ```
66
+
67
+ ## CLI Commands
68
+
69
+ ```bash
70
+ flow run [TASK] # Run a task or start interactive mode
71
+ flow config # Show current configuration
72
+ flow init # Initialize Flow directories
73
+ flow --help # Show help
74
+ ```
75
+
76
+ ## Usage as a Library
77
+
78
+ ```python
79
+ import asyncio
80
+ from flow import FlowAgent
81
+
82
+ async def main():
83
+ agent = FlowAgent()
84
+
85
+ # Run a task
86
+ response = await agent.run("Create a hello world script")
87
+ print(response)
88
+
89
+ # Stream events
90
+ async for event in agent.run_stream("List files in the workspace"):
91
+ print(event.type, event.content)
92
+
93
+ await agent.close()
94
+
95
+ asyncio.run(main())
96
+ ```
97
+
98
+ ## Configuration
99
+
100
+ Flow can be configured via environment variables or a config file.
101
+
102
+ ### Environment Variables
103
+
104
+ | Variable | Description | Default |
105
+ |----------|-------------|---------|
106
+ | `FLOW_HARNESS` | Agent harness to use | `agent-framework` |
107
+ | `FLOW_MODEL` | Model name | `gpt-4o` |
108
+ | `FLOW_WORKSPACE` | Workspace directory | `~/.flow/workspace` |
109
+ | `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | - |
110
+ | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint | - |
111
+ | `AZURE_OPENAI_DEPLOYMENT` | Azure OpenAI deployment | - |
112
+
113
+ ### Directory Structure
114
+
115
+ ```
116
+ ~/.flow/
117
+ ├── workspace/ # Agent's working directory
118
+ ├── memory/ # Persistent memory storage
119
+ │ ├── patterns/ # Reusable code patterns
120
+ │ ├── projects/ # Per-project notes
121
+ │ └── decisions/ # Architecture decisions
122
+ └── skills/ # Domain-specific expertise
123
+ ```
124
+
125
+ ## Architecture
126
+
127
+ ### Harness System
128
+
129
+ Flow uses a harness abstraction to support multiple agent runtimes:
130
+
131
+ ```
132
+ ┌─────────────────┐
133
+ │ FlowAgent │
134
+ └────────┬────────┘
135
+
136
+ ┌────────▼────────┐
137
+ │ BaseHarness │ (Abstract)
138
+ └────────┬────────┘
139
+
140
+ ┌────┴────┐
141
+ │ │
142
+ ┌───▼───┐ ┌───▼───┐
143
+ │ Agent │ │ OpenAI│
144
+ │ Frmwk │ │ Swarm │
145
+ └───────┘ └───────┘
146
+ ```
147
+
148
+ Currently supported:
149
+ - **MAFHarness**: Microsoft Agent Framework with Azure OpenAI
150
+
151
+ Planned:
152
+ - LangChain
153
+ - Claude SDK
154
+
155
+ ### Tools
156
+
157
+ Flow includes a comprehensive set of tools:
158
+
159
+ | Tool | Description |
160
+ |------|-------------|
161
+ | `read_file` | Read file contents with line numbers |
162
+ | `write_file` | Write/edit files (full write, str_replace, insert) |
163
+ | `list_directory` | List directory contents |
164
+ | `grep_search` | Search for patterns in code |
165
+ | `bash_execute` | Run shell commands |
166
+ | `python_repl` | Execute Python code snippets |
167
+ | `memory` | Persistent memory operations |
168
+ | `think` | Structured reasoning |
169
+ | `task_done` | Report task completion |
170
+
171
+ ## Development
172
+
173
+ ```bash
174
+ # Clone the repository
175
+ git clone https://github.com/victordibia/flow
176
+ cd flow
177
+
178
+ # Install development dependencies
179
+ pip install -e ".[dev]"
180
+
181
+ # Run tests
182
+ pytest tests/ -v
183
+
184
+ # Type checking
185
+ pyright src/
186
+ mypy src/
187
+
188
+ # Linting
189
+ ruff check src/
190
+ ruff format src/
191
+ ```
192
+
193
+ ## License
194
+
195
+ MIT License - see [LICENSE](LICENSE) for details.
pyproject.toml ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "flow-agent"
3
+ version = "0.1.0"
4
+ description = "Autonomous coding agent with a polished CLI"
5
+ authors = [{ name = "Victor Dibia" }]
6
+ readme = "README.md"
7
+ requires-python = ">=3.10"
8
+ license = { text = "MIT" }
9
+ classifiers = [
10
+ "Development Status :: 4 - Beta",
11
+ "Environment :: Console",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ "Typing :: Typed",
20
+ ]
21
+
22
+ dependencies = [
23
+ "pydantic>=2.0.0",
24
+ "pydantic-settings>=2.0.0",
25
+ "rich>=13.0.0",
26
+ "typer>=0.9.0",
27
+ "httpx>=0.25.0",
28
+ "python-dotenv>=1.0.0",
29
+ "agent-framework-core>=1.0.0b0",
30
+ "azure-identity>=1.15.0",
31
+ "pyyaml>=6.0.0",
32
+ # OpenTelemetry for experiments tracing
33
+ "opentelemetry-api>=1.20.0",
34
+ "opentelemetry-sdk>=1.20.0",
35
+ "opentelemetry-semantic-conventions>=0.41b0",
36
+ # Web UI dependencies
37
+ "fastapi>=0.109.0",
38
+ "uvicorn>=0.27.0",
39
+ "sqlmodel>=0.0.14",
40
+ "aiosqlite>=0.19.0",
41
+ ]
42
+
43
+ [project.optional-dependencies]
44
+ # Optional features
45
+ research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
46
+
47
+ # Bundles
48
+ all = ["flow-agent[research]"]
49
+ dev = [
50
+ "pytest>=8.0.0",
51
+ "pytest-asyncio>=0.23.0",
52
+ "pytest-cov>=4.1.0",
53
+ "mypy>=1.8.0",
54
+ "pyright>=1.1.350",
55
+ "ruff>=0.2.0",
56
+ "pre-commit>=3.6.0",
57
+ "poethepoet>=0.24.0",
58
+ ]
59
+
60
+ [project.scripts]
61
+ flow = "flow.cli:main"
62
+
63
+ [project.urls]
64
+ Homepage = "https://github.com/victordibia/flow"
65
+ Repository = "https://github.com/victordibia/flow"
66
+ Issues = "https://github.com/victordibia/flow/issues"
67
+
68
+ [build-system]
69
+ requires = ["hatchling"]
70
+ build-backend = "hatchling.build"
71
+
72
+ [tool.hatch.build.targets.wheel]
73
+ packages = ["src/flow"]
74
+
75
+ # ============================================================================
76
+ # Type Checking - Strict
77
+ # ============================================================================
78
+
79
+ [tool.pyright]
80
+ include = ["src"]
81
+ exclude = ["**/tests/**", "**/.venv/**"]
82
+ typeCheckingMode = "strict"
83
+ pythonVersion = "3.10"
84
+ reportMissingTypeStubs = false
85
+ reportUnnecessaryIsInstance = false
86
+ # agent_framework is optional - ignore type issues in harness
87
+ reportUnknownMemberType = "warning"
88
+ reportUnknownVariableType = "warning"
89
+ reportUnknownArgumentType = "warning"
90
+
91
+ [tool.mypy]
92
+ plugins = ["pydantic.mypy"]
93
+ strict = true
94
+ python_version = "3.10"
95
+ ignore_missing_imports = true
96
+ disallow_untyped_defs = true
97
+ no_implicit_optional = true
98
+ check_untyped_defs = true
99
+ warn_return_any = true
100
+ show_error_codes = true
101
+ warn_unused_ignores = false
102
+ disallow_incomplete_defs = true
103
+ disallow_untyped_decorators = true
104
+
105
+ # ============================================================================
106
+ # Linting - Ruff
107
+ # ============================================================================
108
+
109
+ [tool.ruff]
110
+ line-length = 120
111
+ target-version = "py310"
112
+ src = ["src"]
113
+ fix = true
114
+ include = ["*.py", "*.pyi", "**/pyproject.toml"]
115
+ exclude = ["docs/*"]
116
+
117
+ [tool.ruff.lint]
118
+ select = [
119
+ "E", # pycodestyle errors
120
+ "F", # pyflakes
121
+ "I", # isort
122
+ "B", # bugbear
123
+ "UP", # pyupgrade
124
+ "ANN", # annotations
125
+ "S", # bandit (security)
126
+ "RUF", # ruff-specific
127
+ "ASYNC", # async checks
128
+ "D", # pydocstyle
129
+ ]
130
+ ignore = [
131
+ "D100", # allow missing docstring in public module
132
+ "D104", # allow missing docstring in public package
133
+ "D107", # allow missing docstring in __init__
134
+ "ANN401", # allow Any type (needed for generic tool/event handling)
135
+ "S101", # allow assert statements (used in tests)
136
+ ]
137
+
138
+ [tool.ruff.lint.per-file-ignores]
139
+ "**/tests/**" = ["D", "ANN", "S"]
140
+
141
+ [tool.ruff.lint.pydocstyle]
142
+ convention = "google"
143
+
144
+ [tool.ruff.format]
145
+ docstring-code-format = true
146
+
147
+ # ============================================================================
148
+ # Testing - Pytest
149
+ # ============================================================================
150
+
151
+ [tool.pytest.ini_options]
152
+ testpaths = ["tests"]
153
+ pythonpath = ["src"]
154
+ addopts = "-ra -q -r fEX"
155
+ asyncio_mode = "auto"
156
+ asyncio_default_fixture_loop_scope = "function"
157
+ filterwarnings = []
158
+
159
+ [tool.coverage.run]
160
+ source = ["src/flow"]
161
+ omit = ["**/__init__.py"]
162
+
163
+ [tool.coverage.report]
164
+ exclude_lines = [
165
+ "pragma: no cover",
166
+ "if TYPE_CHECKING:",
167
+ "raise NotImplementedError",
168
+ ]
169
+
170
+ # ============================================================================
171
+ # Task Runner - Poe
172
+ # ============================================================================
173
+
174
+ [tool.poe.tasks]
175
+ fmt = "ruff format src tests"
176
+ lint = "ruff check src tests --fix"
177
+ pyright = "pyright src"
178
+ mypy = "mypy src"
179
+ test = "pytest tests -v --cov=flow --cov-report=term-missing"
180
+ check = ["fmt", "lint", "pyright", "mypy", "test"]
src/flow/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow - Autonomous Coding Agent.
2
+
3
+ An autonomous coding agent with a polished CLI experience.
4
+ Uses Microsoft Agent Framework as the runtime.
5
+
6
+ Usage:
7
+ from flow.harness.maf import MAFHarness
8
+
9
+ # Simple - creates agent with defaults
10
+ harness = MAFHarness()
11
+ async for event in harness.run_stream("Create a hello world script"):
12
+ print(event)
13
+
14
+ # Or with custom settings
15
+ harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
16
+ """
17
+
18
+ from flow.harness.maf import MAFHarness, create_agent
19
+
20
+ __version__ = "0.1.0"
21
+
22
+ __all__ = [
23
+ "MAFHarness",
24
+ "create_agent",
25
+ "__version__",
26
+ ]
src/flow/cli/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow CLI - Command-line interface.
2
+
3
+ Provides the `flow` command for running the autonomous coding agent.
4
+ """
5
+
6
+ from flow.cli.app import app, main
7
+
8
+ __all__ = [
9
+ "app",
10
+ "main",
11
+ ]
src/flow/cli/app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow CLI application.
2
+
3
+ Main entry point for the `flow` command.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import asyncio
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Annotated
12
+
13
+ import typer
14
+ from rich.console import Console
15
+
16
+ from flow import __version__
17
+
18
+ app = typer.Typer(
19
+ name="flow",
20
+ help="Flow - Autonomous Coding Agent",
21
+ add_completion=False,
22
+ no_args_is_help=True,
23
+ )
24
+
25
+ console = Console()
26
+
27
+ # Default paths
28
+ DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
29
+ DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
30
+
31
+
32
def version_callback(value: bool) -> None:
    """Eager `--version` handler: print the version and stop the CLI."""
    if not value:
        return
    console.print(f"Flow v{__version__}")
    raise typer.Exit()
37
+
38
+
39
+ @app.callback()
40
+ def callback(
41
+ version: Annotated[
42
+ bool | None,
43
+ typer.Option("--version", "-v", callback=version_callback, is_eager=True),
44
+ ] = None,
45
+ ) -> None:
46
+ """Flow - Autonomous Coding Agent."""
47
+ pass
48
+
49
+
50
@app.command()
def run(
    task: Annotated[
        str | None,
        typer.Argument(help="Task to execute (or enter interactive mode if not provided)"),
    ] = None,
    workspace: Annotated[
        Path | None,
        typer.Option("--workspace", "-w", help="Workspace directory for writing files"),
    ] = None,
    config: Annotated[
        Path | None,
        typer.Option("--config", "-c", help="Config file from optimization (YAML)"),
    ] = None,
    interactive: Annotated[
        bool,
        typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
    ] = True,
) -> None:
    """Run the coding agent.

    If a task is provided, execute it and exit.
    Otherwise, start an interactive REPL session.

    The agent can read files from anywhere but writes go to the workspace.

    Use --config to load a configuration from a previous optimization run.
    """
    workspace_path = workspace or DEFAULT_WORKSPACE
    memory_path = DEFAULT_MEMORY_PATH

    # Both directories must exist before the harness starts writing to them.
    for directory in (workspace_path, memory_path):
        directory.mkdir(parents=True, exist_ok=True)

    if task:
        # One-shot mode: execute the task, then return to the shell.
        asyncio.run(_run_single_task(workspace_path, memory_path, task, config))
        return

    if not interactive:
        console.print("[red]Error:[/] No task provided and interactive mode disabled.")
        raise typer.Exit(1)

    # No task given: drop into the interactive REPL (imported lazily).
    from flow.cli.repl import FlowREPL

    repl = FlowREPL(workspace=workspace_path, memory_path=memory_path)
    asyncio.run(repl.run())
96
+
97
+
98
async def _run_single_task(
    workspace: Path,
    memory_path: Path,
    task: str,
    config_path: Path | None = None,
) -> None:
    """Run a single task to completion, streaming events to the console."""
    from flow.cli.output import print_event
    from flow.harness.base import EventType
    from flow.harness.maf import MAFHarness

    if config_path is None:
        harness = MAFHarness(workspace=workspace, memory_path=memory_path)
    else:
        # A config file from a previous optimization run overrides the defaults.
        from flow.experiments.ablation import create_harness_from_config
        from flow.experiments.config_export import load_config

        ablation_config = load_config(config_path)
        console.print(f"[dim]Using config: {ablation_config.name}[/]")
        harness = create_harness_from_config(ablation_config, workspace)

    try:
        console.print("\n[bold blue]Flow[/] - Executing task...\n")
        async for event in harness.run_stream(task):
            print_event(console, event)
            # Bail out with a non-zero exit code on the first error event.
            if event.type == EventType.ERROR:
                raise typer.Exit(1)
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop a run; report it without a traceback.
        console.print("\n[yellow]Cancelled.[/]")
    finally:
        # Always release harness resources, even on error or cancellation.
        await harness.close()
133
+
134
+
135
+ # Import and register the optimize command
136
+ from flow.cli.optimize import optimize as optimize_cmd
137
+
138
+ app.command()(optimize_cmd)
139
+
140
+
141
@app.command()
def serve(
    host: Annotated[
        str,
        typer.Option("--host", "-h", help="Host to bind to"),
    ] = "0.0.0.0",  # noqa: S104
    port: Annotated[
        int,
        typer.Option("--port", "-p", help="Port to bind to"),
    ] = 8091,
    reload: Annotated[
        bool,
        typer.Option("--reload", help="Enable auto-reload for development"),
    ] = False,
) -> None:
    """Start the Flow web UI server.

    Launches a web interface for managing agent configurations,
    running optimization experiments, and viewing results.
    """
    # Imported inside the command so other CLI commands don't pay the cost.
    import uvicorn

    console.print(f"\n[bold blue]Flow UI[/] starting on [cyan]http://{host}:{port}[/]\n")

    # App is passed as an import string so --reload can re-import it.
    uvicorn.run("flow.ui.main:app", host=host, port=port, reload=reload)
171
+
172
+
173
@app.command()
def config() -> None:
    """Show current configuration."""
    from rich.table import Table

    table = Table(title="Flow Configuration")
    table.add_column("Setting", style="cyan")
    table.add_column("Value", style="green")

    # Render each setting/value pair; env-backed values fall back to a marker.
    settings = (
        ("Workspace", str(DEFAULT_WORKSPACE)),
        ("Memory Path", str(DEFAULT_MEMORY_PATH)),
        ("Azure Endpoint", os.environ.get("AZURE_OPENAI_ENDPOINT", "(not set)")),
        ("Azure Deployment", os.environ.get("AZURE_OPENAI_DEPLOYMENT", "(not set)")),
    )
    for name, value in settings:
        table.add_row(name, value)

    console.print(table)
188
+
189
+
190
@app.command()
def init() -> None:
    """Initialize Flow directories and show setup instructions."""
    # Create the default directories (no-op when they already exist).
    for directory in (DEFAULT_WORKSPACE, DEFAULT_MEMORY_PATH):
        directory.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold green]Flow initialized![/]\n")
    console.print(f" Workspace: [cyan]{DEFAULT_WORKSPACE}[/]")
    console.print(f" Memory: [cyan]{DEFAULT_MEMORY_PATH}[/]")

    # Walk the user through credential setup and a first run.
    for line in (
        "\n[bold]Next steps:[/]",
        " 1. Set your Azure OpenAI credentials:",
        " [dim]export AZURE_OPENAI_API_KEY=your-key[/]",
        " [dim]export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/[/]",
        " [dim]export AZURE_OPENAI_DEPLOYMENT=your-deployment[/]",
        "\n 2. Run Flow:",
        ' [dim]flow run "Create a hello world Python script"[/]',
        " [dim]flow run -i # Interactive mode[/]",
    ):
        console.print(line)
208
+
209
+
210
def main() -> None:
    """Console-script entry point (wired as `flow = "flow.cli:main"`)."""
    app()
213
+
214
+
215
+ if __name__ == "__main__":
216
+ main()
src/flow/cli/optimize.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Optimize command for finding best agent configurations."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import importlib.util
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Annotated, Any
12
+
13
+ import typer
14
+ from rich.console import Console
15
+
16
+ from flow.experiments.ablation import AblationConfig, CONTEXT_ENGINEERING_CONFIGS
17
+ from flow.experiments.optimizer import (
18
+ FlowOptimizer,
19
+ generate_grid_configs,
20
+ load_tasks_from_jsonl,
21
+ )
22
+ from flow.experiments.types import EvalCriterion, Task
23
+
24
+ console = Console()
25
+
26
+
27
def optimize(
    tasks: Annotated[
        Path | None,
        typer.Option(
            "--tasks", "-t",
            help="Path to tasks.jsonl file",
        ),
    ] = None,
    config: Annotated[
        Path | None,
        typer.Option(
            "--config", "-c",
            help="Path to Python config file with CONFIGS or VARIATIONS",
        ),
    ] = None,
    agent: Annotated[
        Path | None,
        typer.Option(
            "--agent", "-a",
            help="Path to base agent Python file (for optimization)",
        ),
    ] = None,
    suite: Annotated[
        str | None,
        typer.Option(
            "--suite", "-s",
            help="Built-in task suite: coding, research",
        ),
    ] = None,
    parallel: Annotated[
        int,
        typer.Option(
            "--parallel", "-p",
            help="Max concurrent experiments",
        ),
    ] = 4,
    mode: Annotated[
        str,
        typer.Option(
            "--mode", "-m",
            help="Config mode: named (use CONFIGS), grid (use VARIATIONS)",
        ),
    ] = "named",
    vary: Annotated[
        str | None,
        typer.Option(
            "--vary", "-v",
            help="Comma-separated params to vary: compaction,memory,model",
        ),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option(
            "--output", "-o",
            help="Output directory for results",
        ),
    ] = None,
    no_llm_eval: Annotated[
        bool,
        typer.Option(
            "--no-llm-eval",
            help="Disable LLM-as-Judge evaluation (faster, less accurate)",
        ),
    ] = False,
) -> None:
    """Find the best agent configuration through experimentation.

    Runs experiments in parallel, evaluates with LLM-as-Judge,
    ranks via Pareto analysis, and exports winning configs.

    Examples:

        # Run with task file and default configs
        flow optimize --tasks tasks.jsonl

        # Use custom configs from Python file
        flow optimize --config my_configs.py --tasks tasks.jsonl

        # Grid search over variations
        flow optimize --config my_configs.py --tasks tasks.jsonl --mode grid

        # Use built-in task suite
        flow optimize --suite coding --parallel 2

        # Vary specific parameters
        flow optimize --vary compaction,memory --tasks tasks.jsonl
    """
    # This command is a thin synchronous shim over the async driver; note the
    # CLI's --no-llm-eval flag is inverted into use_llm_eval here.
    coroutine = _run_optimize(
        tasks_path=tasks,
        config_path=config,
        agent_path=agent,
        suite=suite,
        parallel=parallel,
        mode=mode,
        vary=vary,
        output_dir=output,
        use_llm_eval=not no_llm_eval,
    )
    asyncio.run(coroutine)
+ ))
125
+
126
+
127
async def _run_optimize(
    tasks_path: Path | None,
    config_path: Path | None,
    agent_path: Path | None,
    suite: str | None,
    parallel: int,
    mode: str,
    vary: str | None,
    output_dir: Path | None,
    use_llm_eval: bool,
) -> None:
    """Run the optimization.

    Loads tasks and candidate configs, prints a summary, then drives
    FlowOptimizer over every (config, task) combination.

    Raises:
        typer.Exit: When no tasks/configs can be resolved, or on Ctrl-C.
    """
    # Load tasks (explicit file, named suite, or default quick suite).
    tasks = _load_tasks(tasks_path, suite)
    if not tasks:
        console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
        raise typer.Exit(1)

    # Load configs (named CONFIGS, grid VARIATIONS, or --vary shorthand).
    configs = _load_configs(config_path, mode, vary)
    if not configs:
        console.print("[red]Error:[/] No configs to test. Use --config or --vary")
        raise typer.Exit(1)

    console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
    for t in tasks:
        console.print(f" - {t.name}")

    console.print(f"\n[bold]Configs:[/] {len(configs)}")
    for c in configs:
        console.print(f" - {c.name}")

    # Run optimizer.  NOTE(review): agent_path is accepted but not used in
    # this function's visible body — confirm whether it should be forwarded.
    optimizer = FlowOptimizer(
        parallel=parallel,
        use_llm_evaluator=use_llm_eval,
        output_dir=output_dir,
    )

    try:
        result = await optimizer.optimize(configs, tasks)

        console.print("\n[bold green]Optimization complete![/]")
        console.print(f"\nBest configs exported to: [cyan]{result.output_dir / 'configs'}[/]")
        console.print("\nTo use a config:")
        console.print(f" [dim]flow run --config {result.output_dir / 'configs' / 'best_score.yaml'} \"your task\"[/]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Optimization cancelled.[/]")
        # Fix (ruff B904, enabled via "B" in this project's lint config):
        # re-raising inside an except block must use `from`; `from None`
        # suppresses the noisy implicit chain — Ctrl-C is already reported.
        raise typer.Exit(1) from None
177
+
178
+
179
def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
    """Load tasks from an explicit JSONL file or fall back to a built-in suite."""
    if tasks_path is None:
        # No file given: use the named suite, or the default quick suite.
        return _get_builtin_suite(suite or "quick")

    if not tasks_path.exists():
        console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
        raise typer.Exit(1)
    return load_tasks_from_jsonl(tasks_path)
192
+
193
+
194
+ def _get_builtin_suite(name: str) -> list[Task]:
195
+ """Get a built-in task suite."""
196
+ suites = {
197
+ "quick": [
198
+ Task(
199
+ name="hello_world",
200
+ prompt="Create a Python script 'hello.py' that prints 'Hello, World!' and run it.",
201
+ criteria=[
202
+ EvalCriterion(name="file_created", instruction="hello.py should be created"),
203
+ EvalCriterion(name="correct_output", instruction="Output should include 'Hello, World!'"),
204
+ ],
205
+ ),
206
+ ],
207
+ "coding": [
208
+ Task(
209
+ name="fizzbuzz",
210
+ prompt="Create fizzbuzz.py that prints 1-30 with Fizz/Buzz/FizzBuzz rules. Run it.",
211
+ criteria=[
212
+ EvalCriterion(name="file_created", instruction="fizzbuzz.py should be created"),
213
+ EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
214
+ ],
215
+ metadata={"category": "short"},
216
+ ),
217
+ Task(
218
+ name="rest_api",
219
+ prompt="Create a FastAPI app with a /health endpoint that returns JSON {'status': 'ok'}. Save as api.py.",
220
+ criteria=[
221
+ EvalCriterion(name="file_created", instruction="api.py should be created"),
222
+ EvalCriterion(name="fastapi_used", instruction="Should use FastAPI"),
223
+ EvalCriterion(name="endpoint_defined", instruction="Should have /health endpoint"),
224
+ ],
225
+ metadata={"category": "medium"},
226
+ ),
227
+ Task(
228
+ name="data_pipeline",
229
+ prompt="""Create a data processing pipeline:
230
+ 1. data_types.py - DataRecord dataclass (id, name, value)
231
+ 2. validators.py - validate_id, validate_name functions
232
+ 3. pipeline.py - chain validators together
233
+ 4. test_pipeline.py - tests for the pipeline
234
+ Run the tests.""",
235
+ criteria=[
236
+ EvalCriterion(name="modules_created", instruction="All 4 Python files created"),
237
+ EvalCriterion(name="tests_run", instruction="Tests should be executed"),
238
+ ],
239
+ metadata={"category": "long"},
240
+ ),
241
+ ],
242
+ "research": [
243
+ Task(
244
+ name="codebase_analysis",
245
+ prompt="""Analyze this workspace:
246
+ 1. Explore the directory structure
247
+ 2. Identify Python files and their purposes
248
+ 3. Create analysis_report.md with findings""",
249
+ criteria=[
250
+ EvalCriterion(name="exploration", instruction="Should explore directory"),
251
+ EvalCriterion(name="report_created", instruction="analysis_report.md created"),
252
+ ],
253
+ metadata={"category": "research"},
254
+ ),
255
+ ],
256
+ }
257
+
258
+ if name not in suites:
259
+ console.print(f"[red]Error:[/] Unknown suite '{name}'. Available: {list(suites.keys())}")
260
+ raise typer.Exit(1)
261
+
262
+ return suites[name]
263
+
264
+
265
+ def _load_configs(
266
+ config_path: Path | None,
267
+ mode: str,
268
+ vary: str | None,
269
+ ) -> list[AblationConfig]:
270
+ """Load configs from file or generate from variations."""
271
+ # Load from Python file
272
+ if config_path:
273
+ if not config_path.exists():
274
+ console.print(f"[red]Error:[/] Config file not found: {config_path}")
275
+ raise typer.Exit(1)
276
+
277
+ configs, variations = _load_python_config(config_path)
278
+
279
+ if mode == "grid" and variations:
280
+ return generate_grid_configs("grid", variations)
281
+ elif configs:
282
+ return configs
283
+ else:
284
+ console.print("[red]Error:[/] Config file has no CONFIGS or VARIATIONS")
285
+ raise typer.Exit(1)
286
+
287
+ # Generate from --vary flag
288
+ if vary:
289
+ variations = _parse_vary_flag(vary)
290
+ return generate_grid_configs("vary", variations)
291
+
292
+ # Default: use context engineering configs
293
+ return CONTEXT_ENGINEERING_CONFIGS
294
+
295
+
296
+ def _load_python_config(path: Path) -> tuple[list[AblationConfig], dict[str, Any]]:
297
+ """Load CONFIGS and VARIATIONS from a Python file."""
298
+ spec = importlib.util.spec_from_file_location("config_module", path)
299
+ if spec is None or spec.loader is None:
300
+ raise ValueError(f"Cannot load {path}")
301
+
302
+ module = importlib.util.module_from_spec(spec)
303
+ sys.modules["config_module"] = module
304
+ spec.loader.exec_module(module)
305
+
306
+ configs = getattr(module, "CONFIGS", [])
307
+ variations = getattr(module, "VARIATIONS", {})
308
+
309
+ return configs, variations
310
+
311
+
312
+ def _parse_vary_flag(vary: str) -> dict[str, Any]:
313
+ """Parse --vary flag into variations dict."""
314
+ variations = {}
315
+
316
+ for param in vary.split(","):
317
+ param = param.strip().lower()
318
+
319
+ if param in ("compaction", "compact"):
320
+ variations["enable_message_compaction"] = [True, False]
321
+ elif param in ("memory", "mem"):
322
+ variations["enable_memory_tool"] = [True, False]
323
+ elif param in ("subagent", "sub"):
324
+ variations["enable_sub_agent"] = [True, False]
325
+ elif param in ("head", "head_size"):
326
+ variations["compaction_head_size"] = [5, 10, 20]
327
+ elif param in ("tail", "tail_size"):
328
+ variations["compaction_tail_size"] = [20, 40, 60]
329
+ else:
330
+ console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
331
+
332
+ return variations
src/flow/cli/output.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Output formatting for Flow CLI.
2
+
3
+ Provides functions for rendering agent events to the terminal
4
+ with rich formatting.
5
+ """
6
+
7
+ from rich.console import Console
8
+ from rich.markdown import Markdown
9
+ from rich.markup import escape
10
+ from rich.panel import Panel
11
+ from rich.syntax import Syntax
12
+
13
+ from flow.harness.base import Event, EventType
14
+
15
+
16
+ def print_event(console: Console, event: Event) -> None:
17
+ """Print an agent event to the console.
18
+
19
+ Args:
20
+ console: Rich console instance
21
+ event: Event to print
22
+ """
23
+ if event.type == EventType.TEXT_DELTA:
24
+ # Stream text without newline
25
+ console.print(event.content, end="")
26
+
27
+ elif event.type == EventType.TEXT_DONE:
28
+ # Final text - print with newline
29
+ if event.content:
30
+ console.print(event.content)
31
+ console.print() # Extra newline for spacing
32
+
33
+ elif event.type == EventType.TOOL_CALL_START:
34
+ # Show tool being called
35
+ tool_name = event.tool_name or "unknown"
36
+ console.print(f"\n[dim]▶ Calling tool:[/] [cyan]{tool_name}[/]")
37
+
38
+ elif event.type == EventType.TOOL_CALL_ARGS:
39
+ # Show tool arguments (streaming) - escape to prevent Rich markup interpretation
40
+ if event.content:
41
+ console.print(f"[dim]{escape(event.content)}[/]", end="")
42
+
43
+ elif event.type == EventType.TOOL_CALL_DONE:
44
+ # Tool call complete
45
+ console.print() # Newline after args
46
+
47
+ elif event.type == EventType.TOOL_RESULT:
48
+ # Show tool result (truncated if long)
49
+ result = event.content or ""
50
+ if len(result) > 500:
51
+ result = result[:500] + "\n... (truncated)"
52
+
53
+ console.print(Panel(
54
+ escape(result),
55
+ title="[green]Tool Result[/]",
56
+ border_style="dim",
57
+ expand=False,
58
+ ))
59
+
60
+ elif event.type == EventType.THINKING:
61
+ # Show agent thinking
62
+ console.print(f"[dim italic]💭 {escape(event.content or '')}[/]")
63
+
64
+ elif event.type == EventType.ERROR:
65
+ # Show error
66
+ console.print(f"\n[bold red]Error:[/] {escape(event.content or '')}")
67
+
68
+ elif event.type == EventType.DONE:
69
+ # Execution complete
70
+ console.print("\n[dim]─── Done ───[/]\n")
71
+
72
+
73
+ def print_welcome(console: Console) -> None:
74
+ """Print welcome message for interactive mode."""
75
+ console.print("\n[bold blue]Flow[/] - Autonomous Coding Agent")
76
+ console.print("[dim]Type your task and press Enter. Type 'exit' or Ctrl+D to quit.[/]\n")
77
+
78
+
79
+ def print_code(console: Console, code: str, language: str = "python") -> None:
80
+ """Print syntax-highlighted code.
81
+
82
+ Args:
83
+ console: Rich console instance
84
+ code: Code to print
85
+ language: Programming language for syntax highlighting
86
+ """
87
+ syntax = Syntax(code, language, theme="monokai", line_numbers=True)
88
+ console.print(syntax)
89
+
90
+
91
+ def print_markdown(console: Console, text: str) -> None:
92
+ """Print markdown-formatted text.
93
+
94
+ Args:
95
+ console: Rich console instance
96
+ text: Markdown text to print
97
+ """
98
+ md = Markdown(text)
99
+ console.print(md)
src/flow/cli/repl.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interactive REPL for Flow.
2
+
3
+ Provides an interactive command-line interface for running
4
+ the Flow agent with streaming output.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ from rich.console import Console
12
+
13
+ from flow.cli.output import print_event, print_welcome
14
+ from flow.harness.base import EventType
15
+ from flow.harness.maf import MAFHarness
16
+
17
+ # Default paths
18
+ DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
19
+ DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
20
+
21
+
22
+ class FlowREPL:
23
+ """Interactive REPL for Flow agent.
24
+
25
+ Provides a command-line interface similar to Claude Code,
26
+ with streaming output and tool call visualization.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ workspace: Path | None = None,
32
+ memory_path: Path | None = None,
33
+ ) -> None:
34
+ """Initialize the REPL.
35
+
36
+ Args:
37
+ workspace: Workspace directory. Defaults to ~/.flow/workspace.
38
+ memory_path: Memory directory. Defaults to ~/.flow/memory.
39
+ """
40
+ self._workspace = workspace or DEFAULT_WORKSPACE
41
+ self._memory_path = memory_path or DEFAULT_MEMORY_PATH
42
+ self._console = Console()
43
+ self._harness: MAFHarness | None = None
44
+ self._thread_id: str | None = None
45
+
46
+ def _get_harness(self) -> MAFHarness:
47
+ """Get or create the harness instance."""
48
+ if self._harness is None:
49
+ self._harness = MAFHarness(
50
+ workspace=self._workspace,
51
+ memory_path=self._memory_path,
52
+ )
53
+ return self._harness
54
+
55
+ async def run(self) -> None:
56
+ """Run the interactive REPL loop."""
57
+ print_welcome(self._console)
58
+
59
+ harness = self._get_harness()
60
+
61
+ while True:
62
+ try:
63
+ # Get user input
64
+ user_input = self._get_input()
65
+
66
+ if user_input is None:
67
+ # EOF (Ctrl+D)
68
+ break
69
+
70
+ user_input = user_input.strip()
71
+
72
+ if not user_input:
73
+ continue
74
+
75
+ # Handle special commands
76
+ if user_input.lower() in ("exit", "quit", "q"):
77
+ break
78
+
79
+ if user_input.lower() == "clear":
80
+ self._console.clear()
81
+ print_welcome(self._console)
82
+ continue
83
+
84
+ if user_input.lower() == "help":
85
+ self._print_help()
86
+ continue
87
+
88
+ if user_input.lower() == "config":
89
+ self._print_config()
90
+ continue
91
+
92
+ # Run the task
93
+ await self._run_task(harness, user_input)
94
+
95
+ except KeyboardInterrupt:
96
+ self._console.print("\n[yellow]Interrupted. Type 'exit' to quit.[/]")
97
+ continue
98
+
99
+ # Cleanup
100
+ self._console.print("\n[dim]Goodbye![/]\n")
101
+ if self._harness:
102
+ await self._harness.close()
103
+
104
+ def _get_input(self) -> str | None:
105
+ """Get input from the user.
106
+
107
+ Returns:
108
+ User input string, or None on EOF.
109
+ """
110
+ try:
111
+ return self._console.input("[bold green]>[/] ")
112
+ except EOFError:
113
+ return None
114
+
115
+ async def _run_task(self, harness: MAFHarness, task: str) -> None:
116
+ """Run a task and stream the output.
117
+
118
+ Args:
119
+ harness: Harness instance
120
+ task: Task to execute
121
+ """
122
+ self._console.print() # Blank line before output
123
+
124
+ try:
125
+ async for event in harness.run_stream(task, self._thread_id):
126
+ print_event(self._console, event)
127
+
128
+ # Store thread ID for conversation continuity
129
+ if event.type == EventType.DONE:
130
+ self._thread_id = harness.get_thread_id()
131
+
132
+ except Exception as e:
133
+ self._console.print(f"\n[bold red]Error:[/] {e}")
134
+
135
+ def _print_help(self) -> None:
136
+ """Print help information."""
137
+ self._console.print("\n[bold]Flow Commands:[/]")
138
+ self._console.print(" [cyan]exit[/], [cyan]quit[/], [cyan]q[/] - Exit the REPL")
139
+ self._console.print(" [cyan]clear[/] - Clear the screen")
140
+ self._console.print(" [cyan]config[/] - Show current configuration")
141
+ self._console.print(" [cyan]help[/] - Show this help message")
142
+ self._console.print("\n[bold]Tips:[/]")
143
+ self._console.print(" - Type your task and press Enter to execute")
144
+ self._console.print(" - Press Ctrl+C to cancel a running task")
145
+ self._console.print(" - Press Ctrl+D to exit")
146
+ self._console.print()
147
+
148
+ def _print_config(self) -> None:
149
+ """Print current configuration."""
150
+ self._console.print("\n[bold]Configuration:[/]")
151
+ self._console.print(f" Workspace: [cyan]{self._workspace}[/]")
152
+ self._console.print(f" Memory: [cyan]{self._memory_path}[/]")
153
+ self._console.print()
src/flow/experiments/__init__.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Experiments framework for running and evaluating Flow agent tasks.
4
+
5
+ This package provides a structured way to:
6
+ - Define tasks with evaluation criteria
7
+ - Run agents on tasks and collect OpenTelemetry traces
8
+ - Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
9
+ - Extract metrics from execution traces
10
+ - Run ablation studies comparing different configurations
11
+
12
+ Example usage:
13
+ from flow.harness.maf import MAFHarness
14
+ from flow.experiments import (
15
+ FlowExperimentRunner,
16
+ Task,
17
+ EvalCriterion,
18
+ TraceEvaluator,
19
+ HeuristicEvaluator,
20
+ extract_metrics,
21
+ format_metrics_summary,
22
+ setup_tracing,
23
+ )
24
+
25
+ # Setup tracing (call once at startup)
26
+ setup_tracing("my-experiment")
27
+
28
+ # Define a task
29
+ task = Task(
30
+ name="hello_world",
31
+ prompt="Write a Python function that prints 'Hello, World!'",
32
+ criteria=[
33
+ EvalCriterion(
34
+ name="correctness",
35
+ instruction="The function should print exactly 'Hello, World!'",
36
+ ),
37
+ ],
38
+ )
39
+
40
+ # Run the experiment
41
+ harness = MAFHarness()
42
+ runner = FlowExperimentRunner(keep_workspace=True)
43
+ result = await runner.run(harness, task)
44
+
45
+ # Extract metrics
46
+ metrics = extract_metrics(result.trace)
47
+ print(format_metrics_summary(metrics))
48
+
49
+ # Evaluate the result
50
+ evaluator = HeuristicEvaluator()
51
+ eval_result = await evaluator.evaluate(result)
52
+ print(f"Score: {eval_result.score}, Passed: {eval_result.passed}")
53
+
54
+ await harness.close()
55
+
56
+ Ablation studies:
57
+ from flow.experiments import run_ablations, AblationConfig
58
+
59
+ configs = [
60
+ AblationConfig(name="baseline", enable_message_compaction=False),
61
+ AblationConfig(name="with_compaction", enable_message_compaction=True),
62
+ ]
63
+
64
+ results = await run_ablations(
65
+ configs,
66
+ task_prompt="Create a simple HTTP server",
67
+ )
68
+ """
69
+
70
+ # Types
71
+ # Ablation
72
+ from .ablation import (
73
+ AGENT_MEMORY_ONLY,
74
+ ALL_CONTEXT_ENGINEERING,
75
+ COMPACTION_ONLY,
76
+ # Context engineering configs
77
+ CONTEXT_ENG_BASELINE,
78
+ CONTEXT_ENGINEERING_CONFIGS,
79
+ ISOLATION_ONLY,
80
+ AblationConfig,
81
+ AblationResult,
82
+ # Shared utilities
83
+ compute_pareto_frontier,
84
+ create_harness_from_config,
85
+ generate_recommendation,
86
+ run_ablations,
87
+ run_context_engineering_comparison,
88
+ run_single_ablation,
89
+ )
90
+
91
+ # Config export
92
+ from .config_export import (
93
+ export_config,
94
+ export_optimization_configs,
95
+ load_config,
96
+ )
97
+
98
+ # Evaluators
99
+ from .evaluators import (
100
+ CompositeEvaluator,
101
+ Evaluator,
102
+ HeuristicEvaluator,
103
+ LLMEvaluator,
104
+ TraceEvaluator,
105
+ )
106
+
107
+ # Metrics
108
+ from .metrics import (
109
+ LLMCallInfo,
110
+ ToolCallInfo,
111
+ TraceMetrics,
112
+ extract_metrics,
113
+ format_metrics_summary,
114
+ metrics_to_dict,
115
+ )
116
+
117
+ # Optimizer
118
+ from .optimizer import (
119
+ ConfigSummary,
120
+ FlowOptimizer,
121
+ OptimizationResult,
122
+ TaskResult,
123
+ generate_grid_configs,
124
+ load_tasks_from_jsonl,
125
+ )
126
+
127
+ # Reporters
128
+ from .reporters import (
129
+ load_run_result_summary,
130
+ print_comparison_table,
131
+ print_eval_result,
132
+ print_metrics_summary,
133
+ save_comparison,
134
+ save_run_result,
135
+ )
136
+
137
+ # Runner
138
+ from .runner import FlowExperimentRunner, setup_tracing
139
+
140
+ # Trace collection
141
+ from .trace_collector import FlowTraceCollector
142
+ from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
143
+
144
+ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
145
+ # Types
146
+ "Task",
147
+ "EvalCriterion",
148
+ "RunResult",
149
+ "EvalResult",
150
+ "CriterionResult",
151
+ # Trace collection
152
+ "FlowTraceCollector",
153
+ # Metrics
154
+ "TraceMetrics",
155
+ "LLMCallInfo",
156
+ "ToolCallInfo",
157
+ "extract_metrics",
158
+ "format_metrics_summary",
159
+ "metrics_to_dict",
160
+ # Runner
161
+ "FlowExperimentRunner",
162
+ "setup_tracing",
163
+ # Evaluators
164
+ "Evaluator",
165
+ "LLMEvaluator",
166
+ "TraceEvaluator",
167
+ "HeuristicEvaluator",
168
+ "CompositeEvaluator",
169
+ # Reporters
170
+ "save_run_result",
171
+ "load_run_result_summary",
172
+ "save_comparison",
173
+ "print_metrics_summary",
174
+ "print_comparison_table",
175
+ "print_eval_result",
176
+ # Ablation
177
+ "AblationConfig",
178
+ "AblationResult",
179
+ "run_ablations",
180
+ "run_single_ablation",
181
+ "create_harness_from_config",
182
+ # Context engineering configs
183
+ "CONTEXT_ENG_BASELINE",
184
+ "COMPACTION_ONLY",
185
+ "AGENT_MEMORY_ONLY",
186
+ "ISOLATION_ONLY",
187
+ "ALL_CONTEXT_ENGINEERING",
188
+ "CONTEXT_ENGINEERING_CONFIGS",
189
+ "run_context_engineering_comparison",
190
+ # Shared utilities
191
+ "compute_pareto_frontier",
192
+ "generate_recommendation",
193
+ # Optimizer
194
+ "FlowOptimizer",
195
+ "OptimizationResult",
196
+ "ConfigSummary",
197
+ "TaskResult",
198
+ "generate_grid_configs",
199
+ "load_tasks_from_jsonl",
200
+ # Config export
201
+ "export_config",
202
+ "load_config",
203
+ "export_optimization_configs",
204
+ ]
src/flow/experiments/ablation.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Ablation runner for comparing Flow agent configurations.
4
+
5
+ This module provides:
6
+ - AblationConfig: Dataclass for agent configuration parameters
7
+ - Pareto analysis utilities for multi-objective optimization
8
+ - Pre-defined configurations for context engineering strategies
9
+ - Convenience functions for running ablation studies
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from dataclasses import asdict, dataclass
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ from typing import TYPE_CHECKING
20
+
21
+ from .evaluators import HeuristicEvaluator
22
+ from .metrics import TraceMetrics, extract_metrics, metrics_to_dict
23
+ from .reporters import print_comparison_table, save_run_result
24
+ from .runner import FlowExperimentRunner, setup_tracing
25
+ from .types import EvalCriterion, RunResult, Task
26
+
27
+ if TYPE_CHECKING:
28
+ from flow.harness.maf import MAFHarness
29
+
30
+ from .optimizer import ConfigSummary
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ @dataclass
36
+ class AblationConfig:
37
+ """Configuration for a single ablation run.
38
+
39
+ Each config represents a different agent configuration to test.
40
+ The name is used as an identifier in comparison results.
41
+
42
+ Attributes:
43
+ name: Unique identifier for this configuration
44
+ enable_message_compaction: Whether to enable message compaction
45
+ enable_memory_tool: Whether to enable agent-managed memory
46
+ enable_sub_agent: Whether to enable sub-agent for isolated research
47
+ compaction_head_size: Number of initial messages to keep
48
+ compaction_tail_size: Number of recent messages to keep
49
+ bash_timeout: Timeout for bash commands in seconds
50
+ """
51
+
52
+ name: str
53
+ enable_message_compaction: bool = True
54
+ enable_memory_tool: bool = True
55
+ enable_sub_agent: bool = False
56
+ compaction_head_size: int = 10
57
+ compaction_tail_size: int = 40
58
+ bash_timeout: int = 120
59
+
60
+
61
+ @dataclass
62
+ class AblationResult:
63
+ """Result of a single ablation run.
64
+
65
+ Contains all data from the run including raw results,
66
+ extracted metrics, and evaluation scores.
67
+ """
68
+
69
+ config: AblationConfig
70
+ run_result: RunResult
71
+ metrics: TraceMetrics
72
+ eval_score: float
73
+ eval_passed: bool
74
+ eval_reasoning: str
75
+
76
+
77
+ def create_harness_from_config(config: AblationConfig, workspace: Path) -> MAFHarness:
78
+ """Create a MAFHarness from an ablation config.
79
+
80
+ Args:
81
+ config: The ablation configuration
82
+ workspace: Working directory
83
+
84
+ Returns:
85
+ A configured MAFHarness
86
+ """
87
+ from flow.harness.maf import MAFHarness
88
+
89
+ return MAFHarness(
90
+ workspace=workspace,
91
+ memory_path=workspace / "memory",
92
+ enable_compaction=config.enable_message_compaction,
93
+ enable_memory_tool=config.enable_memory_tool,
94
+ enable_sub_agent=config.enable_sub_agent,
95
+ compaction_head_size=config.compaction_head_size,
96
+ compaction_tail_size=config.compaction_tail_size,
97
+ bash_timeout=config.bash_timeout,
98
+ )
99
+
100
+
101
+ async def run_single_ablation(
102
+ config: AblationConfig,
103
+ task: Task,
104
+ workspace: Path,
105
+ ) -> AblationResult:
106
+ """Run a single ablation with trace capture and evaluation.
107
+
108
+ Args:
109
+ config: The ablation configuration
110
+ task: The task to run
111
+ workspace: Working directory
112
+
113
+ Returns:
114
+ AblationResult with metrics and evaluation
115
+ """
116
+ # Create harness from config
117
+ harness = create_harness_from_config(config, workspace)
118
+
119
+ try:
120
+ # Create runner
121
+ runner = FlowExperimentRunner(keep_workspace=True)
122
+
123
+ # Run the experiment
124
+ run_result = await runner.run(harness, task, workspace=workspace)
125
+
126
+ # Extract metrics
127
+ metrics = extract_metrics(run_result.trace)
128
+
129
+ # Evaluate the result
130
+ evaluator = HeuristicEvaluator()
131
+ eval_result = await evaluator.evaluate(run_result)
132
+
133
+ return AblationResult(
134
+ config=config,
135
+ run_result=run_result,
136
+ metrics=metrics,
137
+ eval_score=eval_result.score,
138
+ eval_passed=eval_result.passed,
139
+ eval_reasoning=eval_result.reasoning,
140
+ )
141
+ finally:
142
+ await harness.close()
143
+
144
+
145
+ def save_ablation_result(result: AblationResult, output_dir: Path) -> None:
146
+ """Save ablation result to files.
147
+
148
+ Creates a subdirectory for the config with all result files.
149
+
150
+ Args:
151
+ result: The ablation result to save
152
+ output_dir: Base directory for output
153
+ """
154
+ config_dir = output_dir / result.config.name
155
+ save_run_result(
156
+ result.run_result,
157
+ config_dir,
158
+ metrics=result.metrics,
159
+ )
160
+
161
+ # Save ablation-specific data
162
+ with open(config_dir / "ablation.json", "w") as f:
163
+ json.dump({
164
+ "config": asdict(result.config),
165
+ "evaluation": {
166
+ "score": result.eval_score,
167
+ "passed": result.eval_passed,
168
+ "reasoning": result.eval_reasoning,
169
+ },
170
+ }, f, indent=2)
171
+
172
+
173
+ async def run_ablations(
174
+ configs: list[AblationConfig],
175
+ task_prompt: str,
176
+ output_dir: Path | None = None,
177
+ task_name: str = "ablation_task",
178
+ ) -> list[AblationResult]:
179
+ """Run multiple ablation configurations and compare.
180
+
181
+ This function:
182
+ 1. Sets up tracing
183
+ 2. Runs each configuration on the same task
184
+ 3. Collects metrics and evaluation scores
185
+ 4. Saves results and prints comparison
186
+
187
+ Args:
188
+ configs: List of configurations to test
189
+ task_prompt: The task prompt to run
190
+ output_dir: Base directory for output (default: ~/.flow/ablations)
191
+ task_name: Name for the task (used in file paths)
192
+
193
+ Returns:
194
+ List of ablation results
195
+ """
196
+ # Setup output directory
197
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
198
+ if output_dir is None:
199
+ output_dir = Path.home() / ".flow" / "ablations"
200
+ output_dir = output_dir / timestamp
201
+ output_dir.mkdir(parents=True, exist_ok=True)
202
+
203
+ # Create task
204
+ task = Task(
205
+ name=task_name,
206
+ prompt=task_prompt,
207
+ criteria=[
208
+ EvalCriterion(
209
+ name="completion",
210
+ instruction="The task should be completed successfully",
211
+ ),
212
+ ],
213
+ )
214
+
215
+ # Save configs
216
+ with open(output_dir / "config.json", "w") as f: # noqa: ASYNC230
217
+ json.dump({
218
+ "task": task_prompt,
219
+ "timestamp": timestamp,
220
+ "configs": [asdict(c) for c in configs],
221
+ }, f, indent=2)
222
+
223
+ print("=" * 80)
224
+ print(" FLOW ABLATION RUNNER")
225
+ print("=" * 80)
226
+ print(f" Task: {task_prompt[:60]}{'...' if len(task_prompt) > 60 else ''}")
227
+ print(f" Configs: {len(configs)}")
228
+ print(f" Output: {output_dir}")
229
+ print("=" * 80)
230
+
231
+ # Setup tracing once
232
+ setup_tracing("flow-ablation")
233
+
234
+ results = []
235
+ for i, config in enumerate(configs, 1):
236
+ print(f"\n[{i}/{len(configs)}] Running: {config.name}")
237
+ print("-" * 40)
238
+
239
+ # Each config gets its own workspace
240
+ workspace = output_dir / config.name / "workspace"
241
+ workspace.mkdir(parents=True, exist_ok=True)
242
+
243
+ result = await run_single_ablation(
244
+ config=config,
245
+ task=task,
246
+ workspace=workspace,
247
+ )
248
+
249
+ results.append(result)
250
+ save_ablation_result(result, output_dir)
251
+
252
+ # Quick status
253
+ status = "OK" if result.run_result.success else "FAIL"
254
+ print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
255
+ f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")
256
+
257
+ # Save comparison
258
+ comparison_data = [
259
+ {
260
+ "name": r.config.name,
261
+ "success": r.run_result.success,
262
+ "duration_seconds": r.run_result.duration_seconds,
263
+ "metrics": metrics_to_dict(r.metrics),
264
+ "evaluation": {
265
+ "score": r.eval_score,
266
+ "passed": r.eval_passed,
267
+ },
268
+ }
269
+ for r in results
270
+ ]
271
+
272
+ with open(output_dir / "comparison.json", "w") as f: # noqa: ASYNC230
273
+ json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)
274
+
275
+ # Print comparison
276
+ print_comparison_table(comparison_data, "Ablation Comparison")
277
+
278
+ print(f"\nResults saved to: {output_dir}")
279
+
280
+ return results
281
+
282
+
283
+ # =============================================================================
284
+ # Context Engineering Baseline Configurations
285
+ # =============================================================================
286
+ # These configurations demonstrate the three main context engineering strategies:
287
+ # 1. Compaction - Reactive trimming via message stores
288
+ # 2. Agent-Managed Memory - Agent controls when to write/read/delete
289
+ # 3. Isolation - Sub-agent architecture prevents context pollution
290
+
291
+
292
+ # Baseline: No context engineering (for comparison)
293
+ CONTEXT_ENG_BASELINE = AblationConfig(
294
+ name="no_context_engineering",
295
+ enable_message_compaction=False,
296
+ enable_memory_tool=False,
297
+ enable_sub_agent=False,
298
+ )
299
+
300
+ # Strategy 1: Compaction via Message Stores
301
+ # Uses HeadTailCompactingMessageStore to keep first N + last M messages
302
+ # Good for: Long-running sessions where middle context is less important
303
+ COMPACTION_ONLY = AblationConfig(
304
+ name="compaction_only",
305
+ enable_message_compaction=True,
306
+ enable_memory_tool=False,
307
+ enable_sub_agent=False,
308
+ compaction_head_size=10, # Keep task context
309
+ compaction_tail_size=40, # Keep recent work
310
+ )
311
+
312
+ # Strategy 2: Agent-Managed Memory
313
+ # Agent decides when to save/retrieve information from persistent storage
314
+ # Good for: Cross-session memory, learning patterns, storing decisions
315
+ AGENT_MEMORY_ONLY = AblationConfig(
316
+ name="agent_memory_only",
317
+ enable_message_compaction=False,
318
+ enable_memory_tool=True,
319
+ enable_sub_agent=False,
320
+ )
321
+
322
+ # Strategy 3: Isolation via Sub-Agent
323
+ # Delegate heavy research to sub-agent with isolated context
324
+ # Good for: Complex research tasks that would pollute main context
325
+ ISOLATION_ONLY = AblationConfig(
326
+ name="isolation_only",
327
+ enable_message_compaction=False,
328
+ enable_memory_tool=False,
329
+ enable_sub_agent=True,
330
+ )
331
+
332
+ # Combined: All context engineering strategies
333
+ # Uses compaction + memory + isolation together
334
+ # Good for: Production systems with long-running, complex tasks
335
+ ALL_CONTEXT_ENGINEERING = AblationConfig(
336
+ name="all_context_engineering",
337
+ enable_message_compaction=True,
338
+ enable_memory_tool=True,
339
+ enable_sub_agent=True,
340
+ compaction_head_size=10,
341
+ compaction_tail_size=40,
342
+ )
343
+
344
+ # Predefined list for running context engineering comparison
345
+ CONTEXT_ENGINEERING_CONFIGS = [
346
+ CONTEXT_ENG_BASELINE,
347
+ COMPACTION_ONLY,
348
+ AGENT_MEMORY_ONLY,
349
+ ISOLATION_ONLY,
350
+ ALL_CONTEXT_ENGINEERING,
351
+ ]
352
+
353
+
354
+ async def run_context_engineering_comparison(
355
+ task_prompt: str,
356
+ output_dir: Path | None = None,
357
+ ) -> list[AblationResult]:
358
+ """Run a comparison of all context engineering strategies.
359
+
360
+ This is a convenience function that runs all context engineering
361
+ baseline configurations against a single task for comparison.
362
+
363
+ Args:
364
+ task_prompt: The task to run (should benefit from context management)
365
+ output_dir: Optional output directory for results
366
+
367
+ Returns:
368
+ List of AblationResult for each strategy
369
+
370
+ Example:
371
+ >>> results = await run_context_engineering_comparison(
372
+ ... "Research the authentication patterns in this codebase and "
373
+ ... "create a summary document with recommendations."
374
+ ... )
375
+ """
376
+ return await run_ablations(
377
+ configs=CONTEXT_ENGINEERING_CONFIGS,
378
+ task_prompt=task_prompt,
379
+ output_dir=output_dir,
380
+ task_name="context_engineering_comparison",
381
+ )
382
+
383
+
384
+ # =============================================================================
385
+ # Shared Utilities for Pareto Analysis
386
+ # =============================================================================
387
+
388
+
389
+ def compute_pareto_frontier(
390
+ summaries: list[ConfigSummary],
391
+ score_key: str = "avg_score",
392
+ cost_key: str = "avg_tokens",
393
+ ) -> list[str]:
394
+ """Compute Pareto frontier for multi-objective optimization.
395
+
396
+ Identifies configurations that are not dominated by any other configuration.
397
+ A config is dominated if another config has better score AND lower tokens.
398
+
399
+ Args:
400
+ summaries: List of ConfigSummary objects (or dicts with score/token keys)
401
+ score_key: Attribute name for the score metric (higher is better)
402
+ cost_key: Attribute name for the cost metric (lower is better)
403
+
404
+ Returns:
405
+ List of names of Pareto-optimal configurations
406
+ """
407
+ # Sort by cost (ascending)
408
+ def get_val(s: object, key: str) -> float:
409
+ if isinstance(s, dict):
410
+ return float(s.get(key, 0))
411
+ return float(getattr(s, key, 0))
412
+
413
+ def get_name(s: object) -> str:
414
+ if isinstance(s, dict):
415
+ return str(s.get("name", ""))
416
+ return str(getattr(s, "name", ""))
417
+
418
+ sorted_summaries = sorted(summaries, key=lambda s: get_val(s, cost_key))
419
+
420
+ pareto_names = []
421
+ best_score = -1.0
422
+
423
+ for summary in sorted_summaries:
424
+ score = get_val(summary, score_key)
425
+ if score > best_score:
426
+ pareto_names.append(get_name(summary))
427
+ best_score = score
428
+
429
+ return pareto_names
430
+
431
+
432
+ def generate_recommendation(
433
+ summaries: list[ConfigSummary],
434
+ pareto_names: list[str],
435
+ min_score: float = 0.7,
436
+ ) -> tuple[str | None, str]:
437
+ """Generate a recommendation based on Pareto analysis.
438
+
439
+ Args:
440
+ summaries: List of ConfigSummary objects
441
+ pareto_names: Names of Pareto-optimal configs
442
+ min_score: Minimum acceptable score threshold
443
+
444
+ Returns:
445
+ Tuple of (recommended_config_name, recommendation_text)
446
+ """
447
+ def get_val(s: object, key: str) -> float:
448
+ if isinstance(s, dict):
449
+ return float(s.get(key, 0))
450
+ return float(getattr(s, key, 0))
451
+
452
+ def get_name(s: object) -> str:
453
+ if isinstance(s, dict):
454
+ return str(s.get("name", ""))
455
+ return str(getattr(s, "name", ""))
456
+
457
+ # Filter to acceptable configs
458
+ acceptable = [s for s in summaries if get_val(s, "avg_score") >= min_score]
459
+ if not acceptable:
460
+ return None, "No configuration met the minimum score threshold."
461
+
462
+ # Prefer Pareto-optimal configs
463
+ pareto_acceptable = [s for s in acceptable if get_name(s) in pareto_names]
464
+ candidates = pareto_acceptable if pareto_acceptable else acceptable
465
+
466
+ # Pick the one with lowest tokens among candidates
467
+ best = min(candidates, key=lambda s: get_val(s, "avg_tokens"))
468
+ name = get_name(best)
469
+ tokens = get_val(best, "avg_tokens")
470
+ score = get_val(best, "avg_score")
471
+
472
+ return name, f"Recommended: {name} (avg {tokens:.0f} tokens, {score:.2f} score)"
src/flow/experiments/config_export.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Config export/import utilities for optimizer results.
4
+
5
+ Exports winning configurations as YAML files that can be loaded
6
+ and used directly with `flow run --config <path>`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import asdict
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import yaml
16
+
17
+ from .ablation import AblationConfig
18
+
19
+
20
def export_config(
    config: AblationConfig,
    metrics: dict[str, Any],
    path: Path,
) -> None:
    """Write an AblationConfig to disk as a reusable YAML file.

    The YAML contains every config field (directly loadable via
    ``flow run --config <path>``) plus an ``_optimization`` block of
    metadata; loaders skip it because of the leading underscore.

    Args:
        config: The AblationConfig to export
        metrics: Optimization metrics (score, tokens, etc.)
        path: Path to write the YAML file

    Example output:
        name: compaction_head10_tail40
        enable_message_compaction: true
        compaction_head_size: 10
        ...
        _optimization:
          timestamp: "2026-01-26T14:30:22"
          avg_score: 0.89
          avg_tokens: 12400
    """
    payload = asdict(config)
    payload["_optimization"] = metrics
    # Ensure the target directory exists before writing.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(yaml.dump(payload, default_flow_style=False, sort_keys=False))
50
+
51
+
52
def load_config(path: Path) -> AblationConfig:
    """Load an AblationConfig from a YAML file.

    Ignores any keys prefixed with _ (optimization metadata).

    Args:
        path: Path to the YAML config file

    Returns:
        AblationConfig instance

    Raises:
        FileNotFoundError: If the config file doesn't exist
        ValueError: If the config is invalid (not a mapping, or has
            keys AblationConfig does not accept)
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    data = yaml.safe_load(path.read_text())

    # safe_load returns None for an empty file; treat that as an empty
    # mapping so AblationConfig's defaults apply instead of crashing on
    # None.items() below.
    if data is None:
        data = {}
    if not isinstance(data, dict):
        # A YAML scalar or list is not a valid config document.
        raise ValueError(
            f"Invalid config file {path}: expected a mapping, got {type(data).__name__}"
        )

    # Filter out metadata keys (prefixed with _)
    config_data = {k: v for k, v in data.items() if not k.startswith("_")}

    try:
        return AblationConfig(**config_data)
    except TypeError as e:
        # Unknown/extra keys surface as TypeError from the dataclass ctor.
        raise ValueError(f"Invalid config file {path}: {e}") from e
79
+
80
+
81
def export_optimization_configs(
    summaries: list[dict[str, Any]],
    pareto_names: list[str],
    output_dir: Path,
    timestamp: str,
) -> dict[str, Path]:
    """Export all notable configs from an optimization run.

    Exports:
    - best_score.yaml: Highest quality config
    - best_cost.yaml: Lowest token usage config
    - best_efficiency.yaml: Best score/token ratio
    - pareto/<name>.yaml: All Pareto-optimal configs

    Args:
        summaries: List of ConfigSummary dicts with metrics
        pareto_names: Names of Pareto-optimal configs
        output_dir: Directory to write configs
        timestamp: Optimization timestamp for metadata

    Returns:
        Dict mapping config type to file path
    """
    configs_dir = output_dir / "configs"
    configs_dir.mkdir(parents=True, exist_ok=True)

    exported: dict[str, Path] = {}

    # Nothing to export for an empty run.
    if not summaries:
        return exported

    # Pick the winner for each selection criterion.
    winners = {
        "best_score": max(summaries, key=lambda s: s.get("avg_score", 0)),
        "best_cost": min(summaries, key=lambda s: s.get("avg_tokens", float("inf"))),
        "best_efficiency": max(
            summaries,
            key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
        ),
    }

    for label, summary in winners.items():
        target = configs_dir / f"{label}.yaml"
        export_config(
            _summary_to_config(summary),
            _extract_metrics(summary, timestamp, label),
            target,
        )
        exported[label] = target

    # Export every Pareto-optimal config under configs/pareto/.
    pareto_dir = configs_dir / "pareto"
    pareto_dir.mkdir(exist_ok=True)

    for summary in summaries:
        name = summary.get("name", "unknown")
        if name not in pareto_names:
            continue
        metrics = _extract_metrics(summary, timestamp, "pareto")
        metrics["is_pareto_optimal"] = True
        target = pareto_dir / f"{name}.yaml"
        export_config(_summary_to_config(summary), metrics, target)
        exported[f"pareto/{name}"] = target

    return exported
147
+
148
+
149
def _summary_to_config(summary: dict[str, Any]) -> AblationConfig:
    """Rebuild an AblationConfig from a summary dict."""
    # Known config fields with their fallback defaults.
    defaults = (
        ("name", "unknown"),
        ("enable_message_compaction", True),
        ("enable_memory_tool", True),
        ("enable_sub_agent", False),
        ("compaction_head_size", 10),
        ("compaction_tail_size", 40),
        ("bash_timeout", 120),
    )
    fields = {key: summary.get(key, default) for key, default in defaults}

    # A nested "config" mapping, when present, overrides top-level keys.
    if "config" in summary:
        fields.update(summary["config"])

    return AblationConfig(**fields)
167
+
168
+
169
+ def _extract_metrics(
170
+ summary: dict[str, Any],
171
+ timestamp: str,
172
+ selection_reason: str,
173
+ ) -> dict[str, Any]:
174
+ """Extract optimization metrics from a summary."""
175
+ return {
176
+ "timestamp": timestamp,
177
+ "selection_reason": selection_reason,
178
+ "avg_score": summary.get("avg_score", 0),
179
+ "avg_tokens": summary.get("avg_tokens", 0),
180
+ "avg_duration": summary.get("avg_duration", 0),
181
+ "pass_rate": summary.get("pass_rate", 0),
182
+ "pareto_rank": summary.get("pareto_rank"),
183
+ "is_pareto_optimal": summary.get("is_pareto_optimal", False),
184
+ }
src/flow/experiments/evaluators/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Evaluators for the experiments framework."""
4
+
5
+ from .base import Evaluator
6
+ from .composite import CompositeEvaluator
7
+ from .heuristic import HeuristicEvaluator
8
+ from .llm import LLMEvaluator
9
+ from .trace import TraceEvaluator
10
+
11
+ __all__ = [
12
+ "CompositeEvaluator",
13
+ "Evaluator",
14
+ "HeuristicEvaluator",
15
+ "LLMEvaluator",
16
+ "TraceEvaluator",
17
+ ]
src/flow/experiments/evaluators/base.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Base evaluator protocol for the experiments framework."""
4
+
5
+ from typing import Protocol
6
+
7
+ from ..types import EvalResult, RunResult
8
+
9
+
10
class Evaluator(Protocol):
    """Structural (duck-typed) interface for run-result evaluators.

    Any object exposing an async ``evaluate`` method with this signature
    can be used wherever an Evaluator is expected — no subclassing needed.

    Known implementations:
    - TraceEvaluator: based on trace metrics (tokens, duration, tool calls)
    - LLMEvaluator: uses an LLM to judge output quality
    - HeuristicEvaluator: rule-based evaluation (files created, syntax, etc.)
    - CompositeEvaluator: combines multiple evaluators
    """

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Score a single agent run.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult carrying scores, pass/fail, and reasoning
        """
        ...
src/flow/experiments/evaluators/composite.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Composite evaluator that combines multiple evaluators."""
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from ..types import EvalResult, RunResult
8
+
9
+ if TYPE_CHECKING:
10
+ from .base import Evaluator
11
+
12
+
13
class CompositeEvaluator:
    """Evaluator that aggregates several child evaluators.

    Useful for combining different evaluation strategies:
    - LLM evaluation with trace-based metrics
    - Multiple heuristic checks
    - Weighted combination of evaluators

    Example:
        evaluator = CompositeEvaluator([
            TraceEvaluator(max_tokens=5000),
            HeuristicEvaluator(),
        ], weights=[0.3, 0.7])
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        evaluators: list["Evaluator"],
        weights: list[float] | None = None,
    ) -> None:
        """Initialize the composite evaluator.

        Args:
            evaluators: List of evaluators to combine
            weights: Optional weights for each evaluator (default: equal weights)

        Raises:
            ValueError: If number of weights doesn't match number of evaluators
        """
        self.evaluators = evaluators
        # Default to uniform weights when none were supplied.
        self.weights = weights or [1.0] * len(evaluators)

        if len(self.weights) != len(self.evaluators):
            raise ValueError("Number of weights must match number of evaluators")

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Run every child evaluator and merge their results.

        The overall score is the weighted average of child scores; the
        overall pass/fail requires ALL children to pass.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            Combined EvalResult
        """
        merged_criteria = []
        weighted_sum = 0.0
        weight_total = sum(self.weights)
        everything_passed = True
        reasons = []

        for child, weight in zip(self.evaluators, self.weights, strict=True):
            child_result = await child.evaluate(run_result)
            merged_criteria.extend(child_result.criteria_results)
            weighted_sum += child_result.score * weight
            everything_passed = everything_passed and child_result.passed
            if child_result.reasoning:
                reasons.append(child_result.reasoning)

        return EvalResult(
            score=weighted_sum / weight_total if weight_total > 0 else 0.0,
            passed=everything_passed,
            criteria_results=merged_criteria,
            reasoning=" | ".join(reasons),
        )
src/flow/experiments/evaluators/heuristic.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Heuristic evaluator using rule-based assessment."""
4
+
5
import logging
import re
import subprocess
7
+
8
+ from ..types import CriterionResult, EvalResult, RunResult
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class HeuristicEvaluator:
    """Evaluator that uses heuristic rules to assess agent output.

    This evaluator checks:
    1. Were files created?
    2. Do Python files have valid syntax?
    3. Did the agent report completion?
    4. Does the output match expected patterns based on the task?

    Useful for quick, deterministic evaluation without LLM calls.

    Scoring: up to 0.25 each for files created, completion reported, valid
    Python syntax, and task-relevant output (0.125 generic credit when files
    exist but no pattern matches); 0.25 penalty on execution error; capped
    at 1.0.

    Example:
        evaluator = HeuristicEvaluator(passing_threshold=0.5)
        result = await evaluator.evaluate(run_result)
        print(f"Score: {result.score}, Passed: {result.passed}")
    """

    # Completion phrases matched on word boundaries so that negated forms
    # such as "incomplete" or "unfinished" do NOT count as a completion
    # claim (the previous substring check had that false positive).
    _COMPLETION_RE = re.compile(r"\b(?:task_done|complete\w*|finished)\b")

    def __init__(self, passing_threshold: float = 0.5) -> None:
        """Initialize the heuristic evaluator.

        Args:
            passing_threshold: Minimum score to pass (0.0 to 1.0)
        """
        self.passing_threshold = passing_threshold

    @staticmethod
    def _reports_completion(output_lower: str) -> bool:
        """Return True when the (lowercased) output claims the task completed."""
        return HeuristicEvaluator._COMPLETION_RE.search(output_lower) is not None

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output using heuristic rules.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with heuristic-based scores
        """
        criteria_results = []
        notes = []
        score = 0.0

        # Check if files were created
        if run_result.files_created:
            criteria_results.append(
                CriterionResult(
                    name="files_created",
                    score=1.0,
                    passed=True,
                    reasoning=f"Created {len(run_result.files_created)} file(s)",
                )
            )
            score += 0.25
            notes.append(f"Created {len(run_result.files_created)} file(s)")
        else:
            criteria_results.append(
                CriterionResult(
                    name="files_created",
                    score=0.0,
                    passed=False,
                    reasoning="No files created",
                )
            )
            notes.append("No files created")

        # Check if agent reported task complete. Word-boundary matching is
        # used so e.g. "task is incomplete" does not earn completion credit.
        output_lower = run_result.output.lower()
        if self._reports_completion(output_lower):
            criteria_results.append(
                CriterionResult(
                    name="task_completed",
                    score=1.0,
                    passed=True,
                    reasoning="Agent reported completion",
                )
            )
            score += 0.25
            notes.append("Agent reported completion")
        else:
            criteria_results.append(
                CriterionResult(
                    name="task_completed",
                    score=0.0,
                    passed=False,
                    reasoning="Agent did not report completion",
                )
            )

        # Try to validate Python files (check syntax)
        python_files = [f for f in run_result.files_created if f.endswith(".py")]
        if python_files:
            all_valid = True
            syntax_notes = []
            for py_file in python_files[:5]:  # Check up to 5 files
                file_path = run_result.workspace / py_file
                if file_path.exists():
                    try:
                        result = subprocess.run(  # noqa: ASYNC221, S603
                            ["python3", "-m", "py_compile", str(file_path)],  # noqa: S607
                            capture_output=True,
                            timeout=5,
                        )
                        if result.returncode != 0:
                            all_valid = False
                            syntax_notes.append(f"Syntax error in {py_file}")
                    except subprocess.TimeoutExpired:
                        syntax_notes.append(f"Timeout checking {py_file}")
                    except FileNotFoundError:
                        # python3 not available, skip syntax check
                        pass
                    except Exception as e:
                        all_valid = False
                        syntax_notes.append(f"Error checking {py_file}: {e}")

            if all_valid and not syntax_notes:
                criteria_results.append(
                    CriterionResult(
                        name="code_syntax",
                        score=1.0,
                        passed=True,
                        reasoning="Python files have valid syntax",
                    )
                )
                score += 0.25
                notes.append("Python files have valid syntax")
            elif syntax_notes:
                criteria_results.append(
                    CriterionResult(
                        name="code_syntax",
                        score=0.0,
                        passed=False,
                        reasoning="; ".join(syntax_notes),
                    )
                )
                notes.extend(syntax_notes)

        # Check for expected patterns in output based on task
        task_lower = run_result.task.prompt.lower()
        output_correct = False

        if "hello" in task_lower and "hello" in output_lower:
            output_correct = True
        elif "api" in task_lower and (
            "fastapi" in output_lower or "endpoint" in output_lower or "flask" in output_lower
        ):
            output_correct = True
        elif "http" in task_lower and ("server" in output_lower or "port" in output_lower):
            output_correct = True
        elif "test" in task_lower and ("pytest" in output_lower or "test" in output_lower):
            output_correct = True
        elif run_result.files_created:
            # Generic: if files created, give partial credit
            score += 0.125

        if output_correct:
            criteria_results.append(
                CriterionResult(
                    name="output_relevance",
                    score=1.0,
                    passed=True,
                    reasoning="Output matches expected patterns for task",
                )
            )
            score += 0.25

        # Check for execution errors
        if run_result.error:
            criteria_results.append(
                CriterionResult(
                    name="execution_success",
                    score=0.0,
                    passed=False,
                    reasoning=f"Execution failed: {run_result.error}",
                )
            )
            score = max(0.0, score - 0.25)

        final_score = min(score, 1.0)

        return EvalResult(
            score=final_score,
            passed=final_score >= self.passing_threshold,
            criteria_results=criteria_results,
            reasoning="; ".join(notes) if notes else "Heuristic evaluation complete",
        )
193
+ )
src/flow/experiments/evaluators/llm.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """LLM-as-judge evaluator for quality assessment."""
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from ..metrics import extract_metrics
10
+ from ..types import CriterionResult, EvalResult, RunResult
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class LLMEvaluator:
    """Evaluator that uses an LLM to assess agent output against criteria.

    This implements the LLM-as-a-judge pattern, where a language model
    evaluates whether the agent's output meets specified criteria.

    Note: Requires a separate model client - not tied to FlowConfig.
    This allows using a different model for evaluation than for agent execution.

    Example:
        from openai import AsyncOpenAI

        client = AsyncOpenAI()
        evaluator = LLMEvaluator(
            model_client=client,
            model_name="gpt-4o",
            passing_threshold=0.7,
        )
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        model_client: Any,
        model_name: str = "gpt-4o",
        passing_threshold: float = 0.7,
    ) -> None:
        """Initialize the LLM evaluator.

        Args:
            model_client: An async client with chat.completions.create method
                (e.g., AsyncOpenAI, AsyncAzureOpenAI)
            model_name: Model name/deployment to use for evaluation
            passing_threshold: Minimum score to pass (0.0 to 1.0)
        """
        self.model_client = model_client
        self.model_name = model_name
        self.passing_threshold = passing_threshold

    def _get_evaluation_prompt(self, run_result: RunResult) -> str:
        """Build the evaluation prompt for the LLM."""
        # One bullet per task criterion; weight is surfaced so the judge can
        # reflect relative importance in its reasoning.
        criteria_text = "\n".join(
            f"- **{c.name}** (weight: {c.weight}): {c.instruction}"
            for c in run_result.task.criteria
        )

        # Extract execution trace summary for research/multi-step tasks
        trace_summary = self._get_trace_summary(run_result)

        # NOTE: output is truncated to 8000 chars to bound prompt size.
        return f"""You are an expert evaluator assessing an AI agent's output.

## Task
The agent was given this task:
```
{run_result.task.prompt}
```

## Agent Output
```
{run_result.output[:8000]}
```

## Files Created
{json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}

## Execution Trace
{trace_summary}

## Execution Status
{"Success" if run_result.success else f"Failed: {run_result.error}"}

## Evaluation Criteria
{criteria_text}

## Instructions
Evaluate the agent's output against each criterion. Consider both the final output AND the execution
trace (tools used, steps taken) when assessing correctness.

For each criterion:
1. Assess how well the output meets the criterion (0.0 to 1.0)
2. Determine if it passes (score >= 0.7)
3. Provide brief reasoning

Respond in this exact JSON format:
```json
{{
  "criteria_results": [
    {{
      "name": "criterion_name",
      "score": 0.85,
      "passed": true,
      "reasoning": "Brief explanation"
    }}
  ],
  "overall_reasoning": "Summary of the overall evaluation"
}}
```
"""

    def _get_trace_summary(self, run_result: RunResult) -> str:
        """Extract a summary of the execution trace for evaluation."""
        if not run_result.trace:
            return "No trace data available"

        metrics = extract_metrics(run_result.trace)

        # Build tool usage summary
        tool_summary = ""
        if metrics.tool_calls_by_name:
            tool_lines = [f"  - {name}: {count}x" for name, count in metrics.tool_calls_by_name.items()]
            tool_summary = "Tools used:\n" + "\n".join(tool_lines)
        else:
            tool_summary = "Tools used: None"

        return f"""Duration: {run_result.duration_seconds:.1f}s
LLM calls: {metrics.llm_call_count}
Total tool calls: {metrics.tool_call_count}
{tool_summary}
Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {metrics.output_tokens})"""

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output using an LLM.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with LLM-generated scores and reasoning
        """
        if not run_result.task.criteria:
            # No criteria to evaluate - return a default pass
            return EvalResult(
                score=1.0 if run_result.success else 0.0,
                passed=run_result.success,
                criteria_results=[],
                reasoning=(
                    "No evaluation criteria specified"
                    + ("" if run_result.success else f"; Error: {run_result.error}")
                ),
            )

        prompt = self._get_evaluation_prompt(run_result)

        try:
            response = await self.model_client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert evaluator. Respond only with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # Low temperature for consistent evaluation
            )

            # Extract the response text
            response_text = response.choices[0].message.content or ""

            # Parse JSON from response: take the outermost {...} span so that
            # markdown code fences around the JSON are tolerated.
            json_start = response_text.find("{")
            json_end = response_text.rfind("}") + 1
            if json_start >= 0 and json_end > json_start:
                eval_data = json.loads(response_text[json_start:json_end])
            else:
                raise ValueError("No JSON found in response")

            # Build criterion results
            criteria_results = []
            total_weighted_score = 0.0
            total_weight = 0.0

            for cr_data in eval_data.get("criteria_results", []):
                cr = CriterionResult(
                    name=cr_data.get("name", "unknown"),
                    score=float(cr_data.get("score", 0.0)),
                    passed=bool(cr_data.get("passed", False)),
                    reasoning=cr_data.get("reasoning", ""),
                )
                criteria_results.append(cr)

                # Find the weight for this criterion; defaults to 1.0 when the
                # judge returned a name that doesn't match any task criterion.
                weight = 1.0
                for task_criterion in run_result.task.criteria:
                    if task_criterion.name == cr.name:
                        weight = task_criterion.weight
                        break

                total_weighted_score += cr.score * weight
                total_weight += weight

            # Calculate overall score (weighted average over returned criteria)
            overall_score = total_weighted_score / total_weight if total_weight > 0 else 0.0

            return EvalResult(
                score=overall_score,
                passed=overall_score >= self.passing_threshold,
                criteria_results=criteria_results,
                reasoning=eval_data.get("overall_reasoning", ""),
            )

        except Exception as e:
            # Any failure (API error, malformed JSON, bad types) degrades to a
            # zero-score fail rather than propagating to the runner.
            logger.error(f"LLM evaluation failed: {e}")
            return EvalResult(
                score=0.0,
                passed=False,
                criteria_results=[],
                reasoning=f"Evaluation failed: {e}",
            )
src/flow/experiments/evaluators/trace.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Trace-based evaluator for objective metrics assessment."""
4
+
5
+ from ..metrics import extract_metrics
6
+ from ..types import CriterionResult, EvalResult, RunResult
7
+
8
+
9
class TraceEvaluator:
    """Evaluator that assesses agent output based on trace metrics.

    Checks objective measurements from the execution trace — token usage,
    tool calls, and timing — against optional limits. Only the limits that
    were supplied are evaluated.

    Example:
        evaluator = TraceEvaluator(
            max_tokens=5000,
            max_tool_calls=20,
            max_duration_seconds=60.0,
        )
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        max_tokens: int | None = None,
        max_tool_calls: int | None = None,
        max_duration_seconds: float | None = None,
    ) -> None:
        """Initialize the trace evaluator.

        Args:
            max_tokens: Maximum allowed total tokens (None = no limit)
            max_tool_calls: Maximum allowed tool calls (None = no limit)
            max_duration_seconds: Maximum allowed duration (None = no limit)
        """
        self.max_tokens = max_tokens
        self.max_tool_calls = max_tool_calls
        self.max_duration_seconds = max_duration_seconds

    @staticmethod
    def _limit_score(actual: float, limit: float) -> tuple[bool, float]:
        """Return (within_limit, score); score decays linearly with overage."""
        if actual <= limit:
            return True, 1.0
        return False, max(0.0, 1.0 - ((actual - limit) / limit))

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output based on trace metrics.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with metric-based scores
        """
        metrics = extract_metrics(run_result.trace)
        checks: list[CriterionResult] = []
        all_passed = True

        # Token budget.
        if self.max_tokens is not None:
            ok, limit_score = self._limit_score(metrics.total_tokens, self.max_tokens)
            all_passed = all_passed and ok
            checks.append(
                CriterionResult(
                    name="token_limit",
                    score=limit_score,
                    passed=ok,
                    reasoning=f"Used {metrics.total_tokens} tokens (limit: {self.max_tokens})",
                )
            )

        # Tool-call budget.
        if self.max_tool_calls is not None:
            ok, limit_score = self._limit_score(metrics.tool_call_count, self.max_tool_calls)
            all_passed = all_passed and ok
            checks.append(
                CriterionResult(
                    name="tool_call_limit",
                    score=limit_score,
                    passed=ok,
                    reasoning=f"Made {metrics.tool_call_count} tool calls (limit: {self.max_tool_calls})",
                )
            )

        # Wall-clock budget.
        if self.max_duration_seconds is not None:
            ok, limit_score = self._limit_score(
                run_result.duration_seconds, self.max_duration_seconds
            )
            all_passed = all_passed and ok
            checks.append(
                CriterionResult(
                    name="duration_limit",
                    score=limit_score,
                    passed=ok,
                    reasoning=f"Took {run_result.duration_seconds:.2f}s (limit: {self.max_duration_seconds}s)",
                )
            )

        # Hard failure reported by the run itself.
        if run_result.error:
            all_passed = False
            checks.append(
                CriterionResult(
                    name="execution_success",
                    score=0.0,
                    passed=False,
                    reasoning=f"Execution failed: {run_result.error}",
                )
            )

        # Errors observed inside the trace: each costs 0.2 on this criterion
        # but does not by itself fail the overall evaluation.
        if metrics.error_count > 0:
            checks.append(
                CriterionResult(
                    name="trace_errors",
                    score=max(0.0, 1.0 - (metrics.error_count * 0.2)),
                    passed=metrics.error_count == 0,
                    reasoning=f"Found {metrics.error_count} error(s) in trace",
                )
            )

        # Overall score: mean of criterion scores; with no criteria at all,
        # fall back to plain success/failure.
        if checks:
            overall = sum(c.score for c in checks) / len(checks)
        else:
            overall = 1.0 if run_result.success else 0.0

        return EvalResult(
            score=overall,
            passed=all_passed and run_result.success,
            criteria_results=checks,
            reasoning=f"Trace evaluation: {len(checks)} criteria checked",
        )
src/flow/experiments/metrics.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Metrics extraction utilities for the experiments framework."""
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class LLMCallInfo:
    """Information about a single LLM call extracted from one trace span."""

    model: str = "unknown"      # gen_ai.request.model attribute, "unknown" when absent
    input_tokens: int = 0       # prompt tokens reported for this call
    output_tokens: int = 0      # completion tokens reported for this call
    finish_reason: str = ""     # stringified gen_ai.response.finish_reasons attribute
    duration_ms: float = 0.0    # wall-clock duration of the span in milliseconds
18
+
19
+
20
@dataclass
class ToolCallInfo:
    """Information about a single tool call extracted from one trace span."""

    name: str = "unknown"       # tool name from gen_ai.tool.name (or a heuristic fallback)
    duration_ms: float = 0.0    # wall-clock duration of the span in milliseconds
    call_id: str = ""           # gen_ai.tool.call.id attribute; empty for fallback-detected calls
27
+
28
+
29
@dataclass
class TraceMetrics:
    """Objective metrics extracted from execution traces.

    These are factual measurements from the trace, not subjective assessments.

    Attributes:
        total_tokens: Total tokens used (input + output)
        input_tokens: Input/prompt tokens used
        output_tokens: Output/completion tokens used
        tool_call_count: Number of tool calls made
        tool_calls_by_name: Count of calls per tool name
        llm_call_count: Number of LLM API calls
        total_duration_ms: Total execution time in milliseconds
        llm_duration_ms: Time spent in LLM calls
        tool_duration_ms: Time spent in tool calls
        span_count: Total number of trace spans
        error_count: Number of error spans
        llm_calls: Detailed info for each LLM call
        tool_calls: Detailed info for each tool call
    """

    # Token usage, summed over all LLM spans.
    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    # Tool usage.
    tool_call_count: int = 0
    tool_calls_by_name: dict[str, int] = field(default_factory=dict)
    # LLM call count and timing aggregates.
    llm_call_count: int = 0
    total_duration_ms: float = 0.0
    llm_duration_ms: float = 0.0
    tool_duration_ms: float = 0.0
    # Span-level statistics.
    span_count: int = 0
    error_count: int = 0
    # Per-call detail records (one entry per counted call).
    llm_calls: list[LLMCallInfo] = field(default_factory=list)
    tool_calls: list[ToolCallInfo] = field(default_factory=list)
64
+
65
+
66
def extract_metrics(trace: list[dict[str, Any]]) -> TraceMetrics:
    """Extract objective metrics from a trace.

    Parses OpenTelemetry semantic conventions for GenAI:
    - gen_ai.operation.name == "chat" for LLM calls
    - gen_ai.usage.input_tokens / output_tokens for token counts
    - gen_ai.operation.name == "execute_tool" for tool calls
    - gen_ai.tool.name for tool identification

    Spans with no gen_ai.operation.name are handled by heuristic fallbacks
    (operation-name patterns for tool calls, alternate attribute keys for
    token counts) so traces from non-conforming instrumentation still yield
    counts. The invariants ``llm_call_count == len(llm_calls)`` and
    ``tool_call_count == len(tool_calls)`` hold on every path.

    Args:
        trace: List of trace span dictionaries

    Returns:
        TraceMetrics with extracted values
    """
    metrics = TraceMetrics()
    metrics.span_count = len(trace)

    for span in trace:
        data = span.get("data", {})
        attributes = data.get("attributes", {})
        operation_name = data.get("operation_name", "")
        # "or 0" guards against an explicit null duration in the span data.
        duration_ms = data.get("duration_ms", 0) or 0

        # Check for errors (status may not be a plain string, so stringify).
        status = data.get("status", "")
        if "ERROR" in str(status).upper():
            metrics.error_count += 1

        # Check for LLM operations (gen_ai.operation.name = "chat")
        if attributes.get("gen_ai.operation.name") == "chat":
            input_tokens = attributes.get("gen_ai.usage.input_tokens", 0) or 0
            output_tokens = attributes.get("gen_ai.usage.output_tokens", 0) or 0

            metrics.llm_call_count += 1
            metrics.input_tokens += int(input_tokens)
            metrics.output_tokens += int(output_tokens)
            metrics.llm_duration_ms += duration_ms

            metrics.llm_calls.append(LLMCallInfo(
                model=attributes.get("gen_ai.request.model", "unknown"),
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens),
                finish_reason=str(attributes.get("gen_ai.response.finish_reasons", "")),
                duration_ms=duration_ms,
            ))

        # Check for tool executions
        elif attributes.get("gen_ai.operation.name") == "execute_tool":
            tool_name = attributes.get("gen_ai.tool.name", operation_name)

            metrics.tool_call_count += 1
            metrics.tool_duration_ms += duration_ms
            metrics.tool_calls_by_name[tool_name] = metrics.tool_calls_by_name.get(tool_name, 0) + 1

            metrics.tool_calls.append(ToolCallInfo(
                name=tool_name,
                duration_ms=duration_ms,
                call_id=attributes.get("gen_ai.tool.call.id", ""),
            ))

        # Also check for generic tool patterns (fallback for spans that do
        # not follow the GenAI semantic conventions)
        elif not attributes.get("gen_ai.operation.name"):
            is_tool_call = (
                "tool" in operation_name.lower()
                or attributes.get("tool.name")
                or attributes.get("gen_ai.tool.name")
                or "function_call" in operation_name.lower()
            )

            if is_tool_call:
                tool_name = (
                    attributes.get("tool.name")
                    or attributes.get("gen_ai.tool.name")
                    or _extract_tool_name_from_operation(operation_name)
                    or "unknown"
                )
                metrics.tool_call_count += 1
                metrics.tool_duration_ms += duration_ms
                metrics.tool_calls_by_name[tool_name] = metrics.tool_calls_by_name.get(tool_name, 0) + 1

                metrics.tool_calls.append(ToolCallInfo(
                    name=tool_name,
                    duration_ms=duration_ms,
                    call_id="",
                ))

            # Check for token counts in non-chat spans (fallback)
            input_tokens = (
                attributes.get("gen_ai.usage.input_tokens")
                or attributes.get("llm.token_count.prompt")
                or attributes.get("input_tokens")
            )
            output_tokens = (
                attributes.get("gen_ai.usage.output_tokens")
                or attributes.get("llm.token_count.completion")
                or attributes.get("output_tokens")
            )

            if input_tokens or output_tokens:
                metrics.input_tokens += int(input_tokens or 0)
                metrics.output_tokens += int(output_tokens or 0)
                metrics.llm_call_count += 1
                metrics.llm_duration_ms += duration_ms
                # Fix: also record the per-call detail so llm_calls stays in
                # sync with llm_call_count (this fallback path previously
                # bumped the counter without appending an entry).
                metrics.llm_calls.append(LLMCallInfo(
                    model=attributes.get("gen_ai.request.model", "unknown"),
                    input_tokens=int(input_tokens or 0),
                    output_tokens=int(output_tokens or 0),
                    finish_reason="",
                    duration_ms=duration_ms,
                ))

        # Track total duration from root span (a span with no parent).
        if not data.get("parent_span_id"):
            metrics.total_duration_ms = max(metrics.total_duration_ms, duration_ms)

    # Calculate total tokens
    metrics.total_tokens = metrics.input_tokens + metrics.output_tokens

    return metrics
179
+
180
+
181
+ def _extract_tool_name_from_operation(operation_name: str) -> str | None:
182
+ """Try to extract a tool name from an operation name.
183
+
184
+ Args:
185
+ operation_name: The span operation name
186
+
187
+ Returns:
188
+ Extracted tool name or None
189
+ """
190
+ # Common patterns: "tool:read_file", "execute_tool:write_file", "function_call:search"
191
+ for prefix in ["tool:", "execute_tool:", "function_call:", "call_"]:
192
+ if operation_name.lower().startswith(prefix):
193
+ return operation_name[len(prefix):]
194
+
195
+ return None
196
+
197
+
198
def format_metrics_summary(metrics: TraceMetrics) -> str:
    """Format metrics as a human-readable summary.

    Args:
        metrics: TraceMetrics to format

    Returns:
        Formatted string summary
    """
    out = ["=== Trace Metrics ==="]
    out.append(
        f"Tokens: {metrics.total_tokens} total "
        f"({metrics.input_tokens} input, {metrics.output_tokens} output)"
    )
    out.append(f"LLM Calls: {metrics.llm_call_count} ({metrics.llm_duration_ms:.1f}ms)")
    out.append(f"Tool Calls: {metrics.tool_call_count} ({metrics.tool_duration_ms:.1f}ms)")

    # Per-tool breakdown, sorted for deterministic output.
    if metrics.tool_calls_by_name:
        out.append(" Tool breakdown:")
        out.extend(
            f" - {name}: {count}"
            for name, count in sorted(metrics.tool_calls_by_name.items())
        )

    out.append(f"Duration: {metrics.total_duration_ms:.2f}ms")
    out.append(f"Spans: {metrics.span_count}")
    out.append(f"Errors: {metrics.error_count}")

    return "\n".join(out)
226
+
227
+
228
def metrics_to_dict(metrics: TraceMetrics) -> dict[str, Any]:
    """Convert TraceMetrics to a JSON-serializable dictionary.

    Uses dataclasses.asdict, which recursively converts the nested
    LLMCallInfo/ToolCallInfo entries and emits keys in field-declaration
    order — exactly the structure the previous hand-written mapping built.
    Unlike the hand-written version, this stays in sync automatically when
    a field is added to or removed from TraceMetrics.

    Args:
        metrics: TraceMetrics to convert

    Returns:
        Dictionary representation
    """
    # Local import keeps the module's top-level imports untouched.
    from dataclasses import asdict

    return asdict(metrics)
src/flow/experiments/optimizer.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Optimizer service for finding best agent configurations.
4
+
5
+ Runs experiments in parallel, evaluates with LLM-as-Judge,
6
+ ranks via Pareto analysis, and exports reusable configs.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import json
13
+ import logging
14
+ import os
15
+ from collections.abc import Callable
16
+ from dataclasses import asdict, dataclass, field
17
+ from datetime import datetime
18
+ from itertools import product
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from openai import AsyncAzureOpenAI
23
+
24
+ from .ablation import (
25
+ AblationConfig,
26
+ compute_pareto_frontier,
27
+ create_harness_from_config,
28
+ )
29
+ from .config_export import export_optimization_configs
30
+ from .evaluators import LLMEvaluator
31
+ from .metrics import TraceMetrics, extract_metrics
32
+ from .runner import FlowExperimentRunner, setup_tracing
33
+ from .types import EvalCriterion, RunResult, Task
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
@dataclass
class TaskResult:
    """Result for a single config-task pair."""

    config_name: str       # name of the AblationConfig that was run
    task_name: str         # name of the Task it was run on
    run_result: RunResult  # raw harness run output (trace, timing, success flag)
    metrics: TraceMetrics  # objective metrics extracted from the run's trace
    eval_score: float      # evaluator score (0.0-1.0)
    eval_passed: bool      # whether the evaluator judged the run as passing
    eval_reasoning: str    # evaluator's textual justification for the verdict
49
+
50
+
51
@dataclass
class ConfigSummary:
    """Aggregated summary for a configuration across all tasks."""

    name: str
    config: AblationConfig
    task_results: list[TaskResult] = field(default_factory=list)

    # Aggregated metrics
    avg_score: float = 0.0
    avg_tokens: float = 0.0
    avg_duration: float = 0.0
    pass_rate: float = 0.0
    total_tokens: int = 0
    task_count: int = 0

    # Pareto analysis
    pareto_rank: int | None = None
    is_pareto_optimal: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Serialize the summary (without per-task results) to plain data.

        Key order is stable so downstream JSON output stays deterministic.
        """
        serialized: dict[str, Any] = {
            "name": self.name,
            "config": asdict(self.config),
        }
        for attr in (
            "avg_score",
            "avg_tokens",
            "avg_duration",
            "pass_rate",
            "total_tokens",
            "task_count",
            "pareto_rank",
            "is_pareto_optimal",
        ):
            serialized[attr] = getattr(self, attr)
        return serialized
85
+
86
+
87
@dataclass
class OptimizationResult:
    """Complete results from an optimization run."""

    timestamp: str
    output_dir: Path
    summaries: list[ConfigSummary]
    pareto_frontier: list[str]
    exported_configs: dict[str, Path]

    # Rankings
    rank_by_score: list[str] = field(default_factory=list)
    rank_by_tokens: list[str] = field(default_factory=list)
    rank_by_efficiency: list[str] = field(default_factory=list)

    # Stats
    total_experiments: int = 0
    total_duration_seconds: float = 0.0

    def get_best_config(self, criterion: str = "score") -> ConfigSummary | None:
        """Return the top-ranked config summary for the given criterion.

        Args:
            criterion: One of "score", "tokens", or "efficiency".

        Returns:
            The ConfigSummary ranked first under that criterion, or None
            when the criterion is unknown or no rankings exist.
        """
        rankings = {
            "score": self.rank_by_score,
            "tokens": self.rank_by_tokens,
            "efficiency": self.rank_by_efficiency,
        }
        ordered = rankings.get(criterion)
        if not ordered:
            return None

        winner = ordered[0]
        return next((s for s in self.summaries if s.name == winner), None)
124
+
125
+
126
class FlowOptimizer:
    """Optimizer for finding best agent configurations.

    Runs experiments in parallel, evaluates results, performs
    Pareto analysis, and exports winning configs.

    Example:
        optimizer = FlowOptimizer(parallel=4)
        configs = [
            AblationConfig(name="baseline", enable_message_compaction=False),
            AblationConfig(name="compaction", enable_message_compaction=True),
        ]
        tasks = [Task(name="test", prompt="Create hello world")]
        result = await optimizer.optimize(configs, tasks)
        print(f"Best: {result.rank_by_score[0]}")
    """

    def __init__(
        self,
        parallel: int = 4,
        use_llm_evaluator: bool = True,
        output_dir: Path | None = None,
    ) -> None:
        """Initialize the optimizer.

        Args:
            parallel: Max concurrent experiments
            use_llm_evaluator: Whether to use LLM for evaluation (falls back
                to a success/failure heuristic when Azure credentials are
                missing — see _create_evaluator)
            output_dir: Base directory for results (defaults to
                ~/.flow/optimizations)
        """
        self.parallel = parallel
        self.use_llm_evaluator = use_llm_evaluator
        self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"

    async def optimize(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        progress_callback: Callable[[int, int, str, str], None] | None = None,
    ) -> OptimizationResult:
        """Run optimization across all configs and tasks.

        Pipeline: run every config x task pair in parallel, aggregate
        per-config summaries, compute the Pareto frontier and rankings,
        export the winning configs, and persist everything under a
        timestamped directory.

        Args:
            configs: Configurations to test
            tasks: Tasks to run each config on
            progress_callback: Optional callback(completed, total, config, task)

        Returns:
            OptimizationResult with rankings and exported configs
        """
        start_time = datetime.now()
        timestamp = start_time.strftime("%Y%m%d_%H%M%S")
        # Each optimization run gets its own timestamped output directory.
        run_dir = self.output_dir / timestamp
        run_dir.mkdir(parents=True, exist_ok=True)

        # Setup tracing and persist the inputs before running anything.
        setup_tracing("flow-optimizer")
        self._save_config(configs, tasks, run_dir)

        print("=" * 70)
        print(" FLOW OPTIMIZER")
        print("=" * 70)
        print(f" Configs: {len(configs)}")
        print(f" Tasks: {len(tasks)}")
        print(f" Total: {len(configs) * len(tasks)} experiments")
        print(f" Parallel: {self.parallel}")
        print(f" Output: {run_dir}")
        print("=" * 70)

        # Create LLM evaluator if needed (may still be None when
        # credentials are missing).
        evaluator = None
        if self.use_llm_evaluator:
            evaluator = self._create_evaluator()

        # Run all experiments in parallel
        task_results = await self._run_parallel(
            configs, tasks, run_dir, evaluator, progress_callback
        )

        # Aggregate by config
        summaries = self._aggregate_results(task_results, configs)

        # Pareto analysis (also marks is_pareto_optimal on each summary)
        pareto_names = self._compute_pareto(summaries)

        # Compute rankings: higher score is better, fewer tokens is better,
        # efficiency is score per token (max(...,1) avoids division by zero).
        rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
        rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
        rank_by_efficiency = sorted(
            summaries,
            key=lambda s: s.avg_score / max(s.avg_tokens, 1),
            reverse=True,
        )

        # Export reusable config files for the winners
        summary_dicts = [s.to_dict() for s in summaries]
        exported = export_optimization_configs(
            summary_dicts, pareto_names, run_dir, timestamp
        )

        end_time = datetime.now()

        result = OptimizationResult(
            timestamp=timestamp,
            output_dir=run_dir,
            summaries=summaries,
            pareto_frontier=pareto_names,
            exported_configs=exported,
            rank_by_score=[s.name for s in rank_by_score],
            rank_by_tokens=[s.name for s in rank_by_tokens],
            rank_by_efficiency=[s.name for s in rank_by_efficiency],
            total_experiments=len(task_results),
            total_duration_seconds=(end_time - start_time).total_seconds(),
        )

        # Save results
        self._save_results(result, run_dir)

        # Print summary
        self._print_summary(result)

        return result

    async def _run_parallel(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        run_dir: Path,
        evaluator: LLMEvaluator | None,
        progress_callback: Callable[[int, int, str, str], None] | None,
    ) -> list[TaskResult]:
        """Run all config-task pairs in parallel with semaphore control."""
        # Semaphore bounds concurrency at self.parallel simultaneous runs.
        semaphore = asyncio.Semaphore(self.parallel)
        total = len(configs) * len(tasks)
        completed = 0
        # Lock serializes updates to the shared `completed` counter and
        # keeps progress output from interleaving.
        lock = asyncio.Lock()

        async def run_one(config: AblationConfig, task: Task) -> TaskResult:
            nonlocal completed
            async with semaphore:
                # Each experiment gets an isolated workspace directory.
                workspace = run_dir / "workspaces" / config.name / task.name
                workspace.mkdir(parents=True, exist_ok=True)

                result = await self._run_single(config, task, workspace, evaluator)

                async with lock:
                    completed += 1
                    status = "✓" if result.eval_passed else "✗"
                    print(
                        f" [{completed}/{total}] {config.name}/{task.name}: "
                        f"{status} score={result.eval_score:.2f} "
                        f"tokens={result.metrics.total_tokens:,}"
                    )
                    if progress_callback:
                        progress_callback(completed, total, config.name, task.name)

                return result

        # Create all tasks
        coroutines = [run_one(config, task) for config in configs for task in tasks]

        # return_exceptions=True so one failed experiment does not cancel
        # the rest of the batch.
        gather_results = await asyncio.gather(*coroutines, return_exceptions=True)

        # Filter out exceptions; failures are logged and dropped.
        valid_results: list[TaskResult] = []
        for r in gather_results:
            if isinstance(r, BaseException):
                logger.error(f"Experiment failed: {r}")
            else:
                valid_results.append(r)

        return valid_results

    async def _run_single(
        self,
        config: AblationConfig,
        task: Task,
        workspace: Path,
        evaluator: LLMEvaluator | None,
    ) -> TaskResult:
        """Run a single config-task experiment.

        Builds a harness for the config, runs the task, extracts trace
        metrics, and scores the run — with the LLM judge when available,
        otherwise a plain success/failure heuristic. The harness is always
        closed, even when the run or evaluation raises.
        """
        harness = create_harness_from_config(config, workspace)

        try:
            runner = FlowExperimentRunner(keep_workspace=True)
            run_result = await runner.run(harness, task, workspace=workspace)
            metrics = extract_metrics(run_result.trace)

            # Evaluate
            if evaluator:
                eval_result = await evaluator.evaluate(run_result)
                eval_score = eval_result.score
                eval_passed = eval_result.passed
                eval_reasoning = eval_result.reasoning
            else:
                # Simple heuristic: passed if no error
                eval_score = 1.0 if run_result.success else 0.0
                eval_passed = run_result.success
                eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"

            return TaskResult(
                config_name=config.name,
                task_name=task.name,
                run_result=run_result,
                metrics=metrics,
                eval_score=eval_score,
                eval_passed=eval_passed,
                eval_reasoning=eval_reasoning,
            )
        finally:
            await harness.close()

    def _aggregate_results(
        self,
        task_results: list[TaskResult],
        configs: list[AblationConfig],
    ) -> list[ConfigSummary]:
        """Aggregate task results into config summaries.

        Configs with no surviving results (e.g. every run raised) are
        skipped rather than reported with zeroed averages.
        """
        config_map = {c.name: c for c in configs}
        results_by_config: dict[str, list[TaskResult]] = {c.name: [] for c in configs}

        for result in task_results:
            if result.config_name in results_by_config:
                results_by_config[result.config_name].append(result)

        summaries = []
        for name, results in results_by_config.items():
            if not results:
                continue

            config = config_map[name]
            summary = ConfigSummary(
                name=name,
                config=config,
                task_results=results,
                avg_score=sum(r.eval_score for r in results) / len(results),
                avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
                avg_duration=sum(r.run_result.duration_seconds for r in results) / len(results),
                pass_rate=sum(1 for r in results if r.eval_passed) / len(results),
                total_tokens=sum(r.metrics.total_tokens for r in results),
                task_count=len(results),
            )
            summaries.append(summary)

        return summaries

    def _compute_pareto(self, summaries: list[ConfigSummary]) -> list[str]:
        """Compute Pareto frontier (maximize score, minimize tokens)."""
        # Use shared utility
        pareto_names = compute_pareto_frontier(summaries)

        # Mark summaries with Pareto status
        for summary in summaries:
            if summary.name in pareto_names:
                summary.is_pareto_optimal = True
                summary.pareto_rank = 0
            else:
                summary.is_pareto_optimal = False
                summary.pareto_rank = 1  # Simplified: all non-Pareto get rank 1

        return pareto_names

    def _create_evaluator(self) -> LLMEvaluator | None:
        """Create LLM evaluator if credentials available.

        Reads AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT /
        AZURE_OPENAI_DEPLOYMENT from the environment; returns None (which
        triggers heuristic evaluation downstream) when key or endpoint is
        missing.
        """
        api_key = os.environ.get("AZURE_OPENAI_API_KEY")
        endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
        deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")

        if not api_key or not endpoint:
            logger.warning("No Azure OpenAI credentials, using heuristic evaluation")
            return None

        client = AsyncAzureOpenAI(
            api_key=api_key,
            api_version="2024-02-15-preview",
            azure_endpoint=endpoint,
        )

        return LLMEvaluator(
            model_client=client,
            model_name=deployment,
            passing_threshold=0.7,
        )

    def _save_config(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        run_dir: Path,
    ) -> None:
        """Save optimization config (inputs) to optimization_config.json."""
        with open(run_dir / "optimization_config.json", "w") as f:
            json.dump(
                {
                    "configs": [asdict(c) for c in configs],
                    "tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
                    "parallel": self.parallel,
                    "use_llm_evaluator": self.use_llm_evaluator,
                },
                f,
                indent=2,
            )

    def _save_results(self, result: OptimizationResult, run_dir: Path) -> None:
        """Save optimization results to summary.json."""
        summary_data = {
            "timestamp": result.timestamp,
            "total_experiments": result.total_experiments,
            "total_duration_seconds": result.total_duration_seconds,
            "pareto_frontier": result.pareto_frontier,
            "rank_by_score": result.rank_by_score,
            "rank_by_tokens": result.rank_by_tokens,
            "rank_by_efficiency": result.rank_by_efficiency,
            "exported_configs": {k: str(v) for k, v in result.exported_configs.items()},
            "summaries": [s.to_dict() for s in result.summaries],
        }

        with open(run_dir / "summary.json", "w") as f:
            json.dump(summary_data, f, indent=2)

    def _print_summary(self, result: OptimizationResult) -> None:
        """Print optimization summary to the console."""
        print("\n" + "=" * 70)
        print(" OPTIMIZATION RESULTS")
        print("=" * 70)

        # Rankings table, best score first
        print(f"\n{'Config':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
        print("-" * 65)

        for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
            pareto = "★" if summary.is_pareto_optimal else ""
            print(
                f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
                f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
            )

        print("\n" + "-" * 70)
        print(f"Pareto frontier: {result.pareto_frontier}")
        print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
        print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
        print("\nExported configs:")
        for name, path in result.exported_configs.items():
            print(f" {name}: {path}")
        print(f"\nResults saved to: {result.output_dir}")
472
+
473
+
474
def generate_grid_configs(
    base_name: str,
    variations: dict[str, list[Any]],
) -> list[AblationConfig]:
    """Generate one AblationConfig per combination in a parameter grid.

    Args:
        base_name: Base name for generated configs
        variations: Dict of param_name -> list of values

    Returns:
        List of AblationConfig for each combination

    Example:
        configs = generate_grid_configs("grid", {
            "enable_message_compaction": [True, False],
            "compaction_head_size": [5, 10, 20],
        })
    """
    # An empty grid degenerates to a single default config.
    if not variations:
        return [AblationConfig(name=base_name)]

    keys = list(variations)
    grid: list[AblationConfig] = []
    for combo in product(*(variations[key] for key in keys)):
        params = dict(zip(keys, combo, strict=True))
        suffix = "_".join(f"{key}={value}" for key, value in params.items())
        grid.append(AblationConfig(name=f"{base_name}_{suffix}", **params))

    return grid
506
+
507
+
508
def load_tasks_from_jsonl(path: Path) -> list[Task]:
    """Load tasks from a JSONL file.

    Each line should be a JSON object with:
    - name: Task name
    - prompt: Task prompt
    - criteria: Optional list of evaluation criteria
    - category: Optional category string

    Blank lines are skipped. Criterion entries that are dicts become
    EvalCriterion kwargs; anything else is wrapped as a "default" criterion
    with the stringified value as its instruction.

    Args:
        path: Path to JSONL file

    Returns:
        List of Task objects
    """
    loaded: list[Task] = []
    with open(path) as source:
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                continue

            record = json.loads(stripped)
            criteria = [
                EvalCriterion(**entry)
                if isinstance(entry, dict)
                else EvalCriterion(name="default", instruction=str(entry))
                for entry in record.get("criteria", [])
            ]

            loaded.append(
                Task(
                    name=record["name"],
                    prompt=record["prompt"],
                    criteria=criteria,
                    metadata={"category": record.get("category", "default")},
                )
            )

    return loaded
src/flow/experiments/reporters/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Reporters for experiment results."""
4
+
5
+ from .console_reporter import print_comparison_table, print_eval_result, print_metrics_summary
6
+ from .json_reporter import load_run_result_summary, save_comparison, save_run_result
7
+
8
+ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
9
+ # JSON reporter
10
+ "save_run_result",
11
+ "load_run_result_summary",
12
+ "save_comparison",
13
+ # Console reporter
14
+ "print_metrics_summary",
15
+ "print_comparison_table",
16
+ "print_eval_result",
17
+ ]
src/flow/experiments/reporters/console_reporter.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Console reporter for experiment results with rich formatting."""
4
+
5
+ from typing import Any
6
+
7
+ from ..metrics import TraceMetrics
8
+
9
+
10
def print_metrics_summary(metrics: TraceMetrics, title: str = "Trace Metrics") -> None:
    """Print a formatted metrics summary to console.

    Args:
        metrics: TraceMetrics to display
        title: Title for the summary section
    """
    rule = "=" * 60
    rows = [
        "",
        rule,
        f" {title}",
        rule,
        f" Tokens: {metrics.total_tokens:,} total ({metrics.input_tokens:,} in, {metrics.output_tokens:,} out)",
        f" LLM Calls: {metrics.llm_call_count} ({metrics.llm_duration_ms:.1f}ms)",
        f" Tool Calls: {metrics.tool_call_count} ({metrics.tool_duration_ms:.1f}ms)",
    ]

    # Per-tool breakdown, sorted for deterministic output.
    if metrics.tool_calls_by_name:
        rows.append(" Tool breakdown:")
        rows.extend(
            f" - {name}: {count}"
            for name, count in sorted(metrics.tool_calls_by_name.items())
        )

    rows.append(f" Duration: {metrics.total_duration_ms:.2f}ms")
    rows.append(f" Spans: {metrics.span_count}")
    # Errors line is only shown when something actually failed.
    if metrics.error_count > 0:
        rows.append(f" Errors: {metrics.error_count}")
    rows.append(rule)

    print("\n".join(rows))
34
+
35
+
36
def print_comparison_table(
    results: list[dict[str, Any]],
    title: str = "Comparison",
) -> None:
    """Print a side-by-side comparison table of multiple results.

    Args:
        results: List of result dictionaries with 'name' and 'metrics' keys
        title: Title for the comparison
    """
    if not results:
        print("No results to compare")
        return

    labels = [entry.get("name", "unknown") for entry in results]
    # Column width adapts to the longest name, with a sensible minimum.
    width = max(15, max(len(label) for label in labels) + 2)

    print(f"\n{'=' * 80}")
    print(f" {title}")
    print("=" * 80)

    # Header row
    print(f"\n{'Metric':<30} | " + " | ".join(f"{label:>{width}}" for label in labels))
    print("-" * (32 + (width + 3) * len(labels)))

    def fmt(value: Any) -> str:
        """Right-align a single cell, using one decimal place for floats."""
        if isinstance(value, float):
            return f"{value:>{width}.1f}"
        if isinstance(value, bool):
            return f"{value!s:>{width}}"
        return f"{value:>{width}}"

    def emit(label: str, values: list[Any]) -> None:
        print(f"{label:<30} | " + " | ".join(fmt(value) for value in values))

    per_result_metrics = [entry.get("metrics", {}) for entry in results]

    emit("Duration (s)", [entry.get("duration_seconds", 0) for entry in results])
    emit("Success", [entry.get("success", False) for entry in results])

    # Eval row only appears when at least one result carries an evaluation.
    if any(entry.get("evaluation") for entry in results):
        emit("Eval Score", [entry.get("evaluation", {}).get("score", 0) for entry in results])

    for label, key in (
        ("Total Tokens", "total_tokens"),
        ("Input Tokens", "input_tokens"),
        ("Output Tokens", "output_tokens"),
        ("LLM Calls", "llm_call_count"),
        ("Tool Calls", "tool_call_count"),
        ("LLM Time (ms)", "llm_duration_ms"),
        ("Tool Time (ms)", "tool_duration_ms"),
    ):
        emit(label, [m.get(key, 0) for m in per_result_metrics])

    # Tool breakdown across the union of tools seen by any result.
    tool_names: set[str] = set()
    for m in per_result_metrics:
        tool_names.update(m.get("tool_calls_by_name", {}).keys())

    if tool_names:
        print("\n" + "-" * 80)
        print("Tool Usage Breakdown:")
        for tool in sorted(tool_names):
            emit(f" {tool}", [m.get("tool_calls_by_name", {}).get(tool, 0) for m in per_result_metrics])

    print("=" * 80)
102
+
103
+
104
def print_eval_result(
    score: float,
    passed: bool,
    reasoning: str,
    criteria_results: list[dict[str, Any]] | None = None,
) -> None:
    """Print evaluation results in a formatted way.

    Args:
        score: Overall score (0.0 to 1.0)
        passed: Whether evaluation passed
        reasoning: Overall reasoning
        criteria_results: Optional list of individual criterion results
    """
    verdict = "PASS" if passed else "FAIL"
    rule = "=" * 60

    print(f"\n{rule}")
    print(f" Evaluation Result: {verdict}")
    print(rule)
    print(f" Score: {score:.2f}")
    print(f" Passed: {passed}")
    print(f" Reason: {reasoning}")

    if criteria_results:
        print("\n Criteria:")
        for entry in criteria_results:
            mark = "PASS" if entry.get("passed") else "FAIL"
            print(f" - {entry.get('name', 'unknown')}: {mark} ({entry.get('score', 0):.2f})")
            # Per-criterion reasoning is optional; only show when present.
            if entry.get("reasoning"):
                print(f" {entry['reasoning']}")

    print(rule)
src/flow/experiments/reporters/json_reporter.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """JSON reporter for experiment results."""
4
+
5
+ import json
6
+ from dataclasses import asdict
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from ..metrics import TraceMetrics, metrics_to_dict
11
+ from ..types import EvalResult, RunResult
12
+
13
+
14
def save_run_result(
    result: RunResult,
    output_dir: Path,
    eval_result: EvalResult | None = None,
    metrics: TraceMetrics | None = None,
) -> None:
    """Save a run result to JSON files.

    Creates the following files in output_dir (all written as UTF-8 so
    non-ASCII agent output does not crash on platforms whose default
    locale encoding cannot represent it):
    - traces.json: Raw OpenTelemetry spans
    - metrics.json: Extracted metrics (if provided)
    - output.txt: Agent text output
    - result.json: Full result summary

    Args:
        result: The RunResult to save
        output_dir: Directory to save files
        eval_result: Optional evaluation result
        metrics: Optional extracted metrics
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save raw traces; default=str stringifies non-JSON-serializable span values
    with open(output_dir / "traces.json", "w", encoding="utf-8") as f:
        json.dump(result.trace, f, indent=2, default=str)

    # Save extracted metrics
    if metrics:
        with open(output_dir / "metrics.json", "w", encoding="utf-8") as f:
            json.dump(metrics_to_dict(metrics), f, indent=2)

    # Save agent output: a short human-readable header, then the raw output
    with open(output_dir / "output.txt", "w", encoding="utf-8") as f:
        f.write(f"Task: {result.task.prompt}\n")
        f.write(f"Duration: {result.duration_seconds:.1f}s\n")
        f.write(f"Success: {result.success}\n")
        if eval_result:
            f.write(f"Eval Score: {eval_result.score:.2f}\n")
        if result.error:
            f.write(f"Error: {result.error}\n")
        f.write("\n" + "=" * 60 + "\n\n")
        f.write(result.output)

    # Save full result summary (trace/output stored by size only, not inlined)
    result_dict: dict[str, Any] = {
        "task": {
            "name": result.task.name,
            "prompt": result.task.prompt,
            "criteria": [asdict(c) for c in result.task.criteria],
            "metadata": result.task.metadata,
        },
        "success": result.success,
        "error": result.error,
        "duration_seconds": result.duration_seconds,
        "files_created": result.files_created,
        "trace_count": len(result.trace),
        "output_length": len(result.output),
    }

    if metrics:
        result_dict["metrics"] = metrics_to_dict(metrics)

    if eval_result:
        result_dict["evaluation"] = {
            "score": eval_result.score,
            "passed": eval_result.passed,
            "reasoning": eval_result.reasoning,
            "criteria_results": [
                {
                    "name": cr.name,
                    "score": cr.score,
                    "passed": cr.passed,
                    "reasoning": cr.reasoning,
                }
                for cr in eval_result.criteria_results
            ],
        }

    with open(output_dir / "result.json", "w", encoding="utf-8") as f:
        json.dump(result_dict, f, indent=2)
94
+
95
+
96
def load_run_result_summary(result_path: Path) -> dict[str, Any]:
    """Read back a result summary written by save_run_result.

    Args:
        result_path: Path to result.json file

    Returns:
        Dictionary with result summary
    """
    raw_text = Path(result_path).read_text()
    return json.loads(raw_text)
107
+
108
+
109
def save_comparison(
    results: list[tuple[str, dict[str, Any]]],
    output_path: Path,
) -> None:
    """Save a comparison of multiple results.

    Args:
        results: List of (name, result_dict) tuples
        output_path: Path to save comparison JSON
    """
    entries: list[dict[str, Any]] = []
    for name, summary in results:
        # Missing keys become null in the JSON via dict.get's None default.
        entries.append({
            "name": name,
            "success": summary.get("success"),
            "duration_seconds": summary.get("duration_seconds"),
            "metrics": summary.get("metrics"),
            "evaluation": summary.get("evaluation"),
        })

    with open(output_path, "w") as f:
        json.dump({"results": entries}, f, indent=2)
src/flow/experiments/runner.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Experiment runner for executing agents on tasks with trace capture."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import os
9
+ import tempfile
10
+ import time
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING
13
+
14
+ from opentelemetry import trace
15
+ from opentelemetry.sdk.resources import Resource
16
+ from opentelemetry.sdk.trace import TracerProvider
17
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
18
+ from opentelemetry.semconv._incubating.attributes.service_attributes import SERVICE_NAME
19
+
20
+ from .trace_collector import FlowTraceCollector
21
+ from .types import RunResult, Task
22
+
23
+ if TYPE_CHECKING:
24
+ from flow.harness.maf import MAFHarness
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def setup_tracing(service_name: str = "flow-experiments") -> TracerProvider:
    """Setup OpenTelemetry tracing with in-memory collection.

    This creates a new TracerProvider configured for experiment tracing.
    Call this once at the start of your experiment session.

    Args:
        service_name: Name for the tracing service

    Returns:
        The configured TracerProvider
    """
    tracer_provider = TracerProvider(
        resource=Resource.create({SERVICE_NAME: service_name})
    )
    trace.set_tracer_provider(tracer_provider)

    # Best effort: turn on Agent Framework span emission when the package is
    # importable; experiment tracing still works without it.
    try:
        from agent_framework.observability import enable_instrumentation

        enable_instrumentation()
        logger.debug("Agent Framework instrumentation enabled")
    except ImportError:
        logger.debug("Agent Framework not available, skipping instrumentation")
    except Exception as e:
        logger.debug(f"Could not enable Agent Framework instrumentation: {e}")

    return tracer_provider
56
+
57
+
58
class FlowExperimentRunner:
    """Runner for executing experiments with Flow agents.

    The runner handles:
    - Setting up temporary workspaces
    - Collecting execution traces via OpenTelemetry
    - Measuring execution time
    - Capturing files created
    - Supporting streaming execution

    Note: ``run`` changes the process-wide working directory for the duration
    of a task (see the chdir note in ``run``), so one runner instance should
    not execute tasks concurrently within a single process.

    Example:
        from flow.harness.maf import MAFHarness
        from flow.experiments import FlowExperimentRunner, Task

        harness = MAFHarness()
        runner = FlowExperimentRunner(keep_workspace=True)

        task = Task(name="hello", prompt="Create a hello world script")
        result = await runner.run(harness, task)

        print(f"Duration: {result.duration_seconds}s")
        print(f"Files: {result.files_created}")
    """

    def __init__(
        self,
        workspace_base: Path | None = None,
        keep_workspace: bool = False,
    ) -> None:
        """Initialize the experiment runner.

        Args:
            workspace_base: Base directory for workspaces (default: system temp)
            keep_workspace: Whether to keep workspace after run (default: False)
        """
        # Base directory under which per-run temp workspaces are created.
        self.workspace_base = workspace_base or Path(tempfile.gettempdir())
        # When True, the workspace directory is left on disk after the run.
        self.keep_workspace = keep_workspace

    async def run(
        self,
        harness: MAFHarness,
        task: Task,
        workspace: Path | None = None,
    ) -> RunResult:
        """Run a harness on a task and collect results.

        This method:
        1. Creates or uses a workspace directory
        2. Sets up trace collection
        3. Executes the harness with streaming
        4. Collects output and files created
        5. Returns a RunResult with all data

        Args:
            harness: The MAFHarness to run
            task: The task to execute
            workspace: Optional workspace directory (creates temp if None)

        Returns:
            RunResult with trace, output, and metrics
        """
        # Create or use workspace directory. Only auto-delete directories
        # this runner created itself (workspace_created gates the cleanup).
        if workspace is None:
            workspace = Path(tempfile.mkdtemp(
                prefix=f"flow_experiment_{task.name}_",
                dir=self.workspace_base,
            ))
            workspace_created = True
        else:
            workspace.mkdir(parents=True, exist_ok=True)
            workspace_created = False

        logger.info(f"Running task '{task.name}' in workspace: {workspace}")

        # Snapshot files before execution so files_created can be computed
        # as a set difference afterwards.
        files_before = set(self._list_files(workspace))

        # Set up trace collection by attaching a fresh collector to the
        # process-global TracerProvider (if one was configured, e.g. via
        # setup_tracing()).
        collector = FlowTraceCollector()
        processor: SimpleSpanProcessor | None = None

        try:
            provider = trace.get_tracer_provider()
            # The default no-op provider does not accept processors; only a
            # real SDK TracerProvider does.
            if isinstance(provider, TracerProvider):
                processor = SimpleSpanProcessor(collector)
                provider.add_span_processor(processor)
                logger.debug("Trace collection enabled")
        except Exception as e:
            # Tracing is best-effort; the run proceeds without it.
            logger.debug(f"Could not set up trace collection: {e}")

        # Execute the harness
        start_time = time.time()
        output_chunks: list[str] = []
        error: str | None = None

        try:
            # os.chdir is process-global: relative paths written by the
            # agent's tools land inside the workspace. Restored in finally.
            original_cwd = os.getcwd()
            os.chdir(workspace)

            try:
                # Use streaming execution to capture all output
                async for event in harness.run_stream(task.prompt):
                    # Collect text output; hasattr guards tolerate foreign
                    # event shapes from other harness implementations.
                    if hasattr(event, "content") and event.content:
                        if hasattr(event, "type"):
                            # Local import; presumably avoids an import cycle
                            # with the harness package — TODO confirm.
                            from ..harness.base import EventType
                            if event.type in (EventType.TEXT_DELTA, EventType.TEXT_DONE):
                                output_chunks.append(event.content)
                            elif event.type == EventType.TOOL_RESULT:
                                # Optionally capture tool results
                                pass
            finally:
                os.chdir(original_cwd)

        except Exception as e:
            # Record the failure instead of raising; the RunResult carries it.
            error = str(e)
            logger.error(f"Task execution failed: {e}")

        end_time = time.time()
        duration_seconds = end_time - start_time

        # Force flush so spans ended just before this point reach the collector.
        if processor:
            try:
                processor.force_flush()
            except Exception as e:
                logger.debug(f"Error flushing processor: {e}")

        # Get collected traces (drains the collector).
        trace_data = collector.get_traces()

        # Clean up trace processor.
        # NOTE(review): the processor is never detached from the provider
        # (no public removal API); repeated runs accumulate shut-down
        # processors on the global provider — confirm acceptable.
        if processor:
            try:
                processor.shutdown()
            except Exception as e:
                logger.debug(f"Error shutting down processor: {e}")

        # Find files created during the run.
        files_after = set(self._list_files(workspace))
        files_created = sorted(files_after - files_before)

        # Clean up workspace only if we created it, the caller did not ask to
        # keep it, and the run succeeded — failed runs keep their workspace
        # for debugging.
        if not self.keep_workspace and workspace_created and not error:
            try:
                import shutil
                shutil.rmtree(workspace)
                logger.debug(f"Cleaned up workspace: {workspace}")
            except Exception as e:
                logger.warning(f"Failed to clean up workspace: {e}")

        output = "".join(output_chunks)

        return RunResult(
            task=task,
            trace=trace_data,
            output=output,
            files_created=files_created,
            duration_seconds=duration_seconds,
            workspace=workspace,
            error=error,
        )

    def _list_files(self, directory: Path) -> list[str]:
        """List all files in a directory recursively.

        Hidden files (dot-prefixed names) are skipped; note that hidden
        *directories* are still traversed — only the filename is checked.

        Args:
            directory: Directory to scan

        Returns:
            List of relative file paths
        """
        files: list[str] = []
        try:
            for root, _, filenames in os.walk(directory):
                for filename in filenames:
                    # Skip hidden files and common temp files
                    if filename.startswith("."):
                        continue
                    full_path = Path(root) / filename
                    rel_path = full_path.relative_to(directory)
                    files.append(str(rel_path))
        except Exception as e:
            # Listing is best-effort; return whatever was gathered so far.
            logger.debug(f"Error listing files: {e}")
        return files
src/flow/experiments/trace_collector.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """OpenTelemetry trace collector for experiment analysis."""
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import Any
8
+
9
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class FlowTraceCollector(SpanExporter):
    """Collects OpenTelemetry spans for experiment analysis.

    This exporter captures spans during agent execution and converts them
    to a dictionary format suitable for metrics extraction and analysis.

    Example:
        collector = FlowTraceCollector()
        # Attach to TracerProvider via SimpleSpanProcessor
        # Run agent execution
        traces = collector.get_traces()
    """

    def __init__(self) -> None:
        """Initialize the trace collector."""
        # Accumulated span records; drained by get_traces().
        self.spans: list[dict[str, Any]] = []
        # Set by shutdown(); a shut-down collector rejects further exports.
        self._shut_down: bool = False

    def export(self, spans: Any) -> SpanExportResult:
        """Collect spans from OpenTelemetry.

        Args:
            spans: Sequence of OpenTelemetry ReadableSpan objects

        Returns:
            SpanExportResult.SUCCESS, or FAILURE if the collector has
            been shut down
        """
        # The span processor may remain registered with the global
        # TracerProvider after this collector's run finished; per the OTel
        # exporter contract, export after shutdown must fail. This also
        # prevents stale collectors from accumulating spans indefinitely.
        if self._shut_down:
            return SpanExportResult.FAILURE

        for span in spans:
            try:
                # Span times are integer nanoseconds; convert to seconds.
                start_time = span.start_time / 1_000_000_000
                end_time = span.end_time / 1_000_000_000 if span.end_time else None
                duration_ms = ((end_time - start_time) * 1000) if end_time else None

                self.spans.append({
                    "type": "trace_span",
                    # NOTE(review): fromtimestamp() uses the local timezone;
                    # confirm downstream consumers do not expect UTC.
                    "timestamp": datetime.fromtimestamp(start_time).isoformat(),
                    "data": {
                        "operation_name": span.name,
                        # Hex-encode ids per the W3C trace-context convention.
                        "span_id": format(span.context.span_id, "016x"),
                        "trace_id": format(span.context.trace_id, "032x"),
                        "parent_span_id": (
                            format(span.parent.span_id, "016x") if span.parent else None
                        ),
                        "duration_ms": duration_ms,
                        "attributes": dict(span.attributes) if span.attributes else {},
                        "status": str(span.status.status_code.name) if hasattr(span, "status") else "OK",
                        "events": [
                            {
                                "name": event.name,
                                "timestamp": datetime.fromtimestamp(
                                    event.timestamp / 1_000_000_000
                                ).isoformat(),
                                "attributes": dict(event.attributes) if event.attributes else {},
                            }
                            for event in (span.events or [])
                        ],
                    },
                })
            except Exception as e:
                # One malformed span must not abort the whole batch.
                logger.debug(f"Failed to collect span: {e}")

        return SpanExportResult.SUCCESS

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Force flush spans (no-op: collection is synchronous).

        Args:
            timeout_millis: Timeout in milliseconds (unused)

        Returns:
            True always
        """
        return True

    def shutdown(self) -> None:
        """Shut down the exporter; subsequent export() calls are rejected."""
        self._shut_down = True

    def get_traces(self) -> list[dict[str, Any]]:
        """Get and clear collected traces.

        Returns:
            List of collected trace spans, clearing the internal list
        """
        traces = self.spans.copy()
        self.spans.clear()
        return traces

    def clear(self) -> None:
        """Clear collected traces without returning them."""
        self.spans.clear()
src/flow/experiments/types.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Type definitions for the experiments framework."""
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
@dataclass
class EvalCriterion:
    """A criterion for evaluating agent output.

    Attributes:
        name: Short identifier for the criterion (e.g., "correctness", "completeness")
        instruction: Detailed instruction for how to evaluate this criterion
        weight: Relative weight for scoring (default 1.0)
    """

    name: str
    instruction: str
    # Relative weight used when combining per-criterion scores into an
    # overall score; 1.0 means equal weighting.
    weight: float = 1.0
23
+
24
+
25
@dataclass
class Task:
    """A task for the agent to perform.

    Attributes:
        name: Short identifier for the task
        prompt: The prompt/instruction given to the agent
        criteria: List of evaluation criteria for assessing the output
        metadata: Additional task metadata (e.g., expected output, difficulty)
    """

    name: str
    prompt: str
    criteria: list[EvalCriterion] = field(default_factory=list)
    # Free-form extras; the built-in suites below use e.g.
    # {"category": "short", "expected_duration": 60}.
    metadata: dict[str, Any] = field(default_factory=dict)
40
+
41
+
42
@dataclass
class RunResult:
    """Result of running an agent on a task.

    Attributes:
        task: The task that was executed
        trace: OpenTelemetry trace spans collected during execution
        output: The agent's final output/response
        files_created: List of files created during execution
        duration_seconds: Total execution time
        workspace: Path to the workspace directory used
        error: Error message if execution failed, None if successful
    """

    task: Task
    trace: list[dict[str, Any]]
    output: str
    files_created: list[str]
    duration_seconds: float
    workspace: Path
    # None means the run completed; any non-None string marks a failure
    # (see `success` below, which tests `is None`, not truthiness).
    error: str | None = None

    @property
    def success(self) -> bool:
        """Whether the run completed without errors.

        Success is defined strictly as "no error was recorded"; an empty
        output still counts as a successful run.
        """
        return self.error is None
68
+
69
+
70
@dataclass
class CriterionResult:
    """Result of evaluating a single criterion.

    Attributes:
        name: Name of the criterion evaluated
        score: Numeric score (0.0 to 1.0)
        passed: Whether the criterion was met
        reasoning: Explanation of the evaluation
    """

    name: str
    # Score in [0.0, 1.0] for this single criterion.
    score: float
    passed: bool
    reasoning: str
85
+
86
+
87
@dataclass
class EvalResult:
    """Result of evaluating an agent's output.

    Attributes:
        score: Overall weighted score (0.0 to 1.0)
        passed: Whether the evaluation passed overall
        criteria_results: Results for each individual criterion
        reasoning: Overall evaluation reasoning/summary
    """

    # Overall score in [0.0, 1.0], weighted across criteria.
    score: float
    passed: bool
    criteria_results: list[CriterionResult]
    reasoning: str
102
+
103
+
104
+ # =============================================================================
105
+ # Built-in Task Suites for Optimization
106
+ # =============================================================================
107
+
108
# Built-in suites keyed by name; look them up via get_task_suite().
# "quick" = 3 short smoke tasks, "core" = 5 mixed-difficulty tasks,
# "coding" = 10 tasks ranging up to "complex".
TASK_SUITES: dict[str, list[Task]] = {
    # Fast smoke tests (~60-90s expected each) for quick iteration.
    "quick": [
        Task(
            name="fizzbuzz",
            prompt="Create a Python file fizzbuzz.py that prints FizzBuzz from 1-100. Then run it.",
            criteria=[
                EvalCriterion(name="file_created", instruction="fizzbuzz.py file was created"),
                EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
            ],
            metadata={"category": "short", "expected_duration": 60},
        ),
        Task(
            name="hello_api",
            prompt="Create a FastAPI app in api.py with a /hello endpoint that returns {'message': 'hello'}.",
            criteria=[
                EvalCriterion(name="file_created", instruction="api.py file was created"),
                EvalCriterion(name="has_endpoint", instruction="Contains a /hello GET endpoint"),
            ],
            metadata={"category": "short", "expected_duration": 90},
        ),
        Task(
            name="file_counter",
            prompt="Create a Python script count_files.py that counts .py files in current directory and prints the count.",
            criteria=[
                EvalCriterion(name="file_created", instruction="count_files.py was created"),
                EvalCriterion(name="runs_correctly", instruction="Script runs and outputs a number"),
            ],
            metadata={"category": "short", "expected_duration": 60},
        ),
    ],
    # Balanced default suite: one short warm-up plus four medium tasks.
    "core": [
        Task(
            name="fizzbuzz",
            prompt="Create a Python file fizzbuzz.py that prints FizzBuzz from 1-100. Then run it.",
            criteria=[
                EvalCriterion(name="file_created", instruction="fizzbuzz.py file was created"),
                EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
            ],
            metadata={"category": "short"},
        ),
        Task(
            name="rest_api",
            prompt="Create a FastAPI app with CRUD endpoints for a TODO list (in-memory storage). Include GET /todos, POST /todos, DELETE /todos/{id}.",
            criteria=[
                EvalCriterion(name="file_created", instruction="API file was created"),
                EvalCriterion(name="has_crud", instruction="Contains GET, POST, DELETE endpoints"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="data_analysis",
            prompt="Create a Python script that generates 100 random data points, calculates mean/median/std, and saves results to stats.json.",
            criteria=[
                EvalCriterion(name="script_created", instruction="Python script was created"),
                EvalCriterion(name="json_output", instruction="stats.json was created with results"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="cli_tool",
            prompt="Create a CLI tool using argparse that takes a filename and counts lines, words, and characters (like wc).",
            criteria=[
                EvalCriterion(name="file_created", instruction="CLI script was created"),
                EvalCriterion(name="uses_argparse", instruction="Uses argparse for argument parsing"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="unit_tests",
            prompt="Create a calculator module (calc.py) with add/subtract/multiply/divide functions, then write pytest tests for it (test_calc.py).",
            criteria=[
                EvalCriterion(name="module_created", instruction="calc.py was created"),
                EvalCriterion(name="tests_created", instruction="test_calc.py was created"),
                EvalCriterion(name="tests_pass", instruction="Tests pass when run"),
            ],
            metadata={"category": "medium"},
        ),
    ],
    # Larger coding-focused suite with terser prompts and single criteria,
    # including "complex" category tasks.
    "coding": [
        Task(
            name="fizzbuzz",
            prompt="Create fizzbuzz.py that prints FizzBuzz 1-100 and run it.",
            criteria=[EvalCriterion(name="correct", instruction="Correct FizzBuzz output")],
            metadata={"category": "short"},
        ),
        Task(
            name="rest_api",
            prompt="Create a FastAPI CRUD TODO app with GET/POST/DELETE endpoints.",
            criteria=[EvalCriterion(name="has_crud", instruction="Has working CRUD")],
            metadata={"category": "medium"},
        ),
        Task(
            name="cli_tool",
            prompt="Create an argparse CLI that counts lines/words/chars in a file.",
            criteria=[EvalCriterion(name="works", instruction="CLI works correctly")],
            metadata={"category": "medium"},
        ),
        Task(
            name="data_pipeline",
            prompt="Create a script that reads CSV data, filters rows, aggregates, and outputs JSON.",
            criteria=[EvalCriterion(name="works", instruction="Pipeline produces correct output")],
            metadata={"category": "medium"},
        ),
        Task(
            name="unit_tests",
            prompt="Create calc.py with math functions and test_calc.py with pytest tests.",
            criteria=[EvalCriterion(name="tests_pass", instruction="Tests pass")],
            metadata={"category": "medium"},
        ),
        Task(
            name="web_scraper",
            prompt="Create a script that fetches a webpage and extracts all links.",
            criteria=[EvalCriterion(name="extracts_links", instruction="Extracts links correctly")],
            metadata={"category": "medium"},
        ),
        Task(
            name="async_downloader",
            prompt="Create an async script that downloads multiple URLs concurrently using aiohttp.",
            criteria=[EvalCriterion(name="uses_async", instruction="Uses async/await correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="database_orm",
            prompt="Create a SQLAlchemy model for Users with CRUD operations.",
            criteria=[EvalCriterion(name="has_orm", instruction="Uses SQLAlchemy ORM correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="decorator_lib",
            prompt="Create a library with timing, retry, and caching decorators.",
            criteria=[EvalCriterion(name="decorators_work", instruction="Decorators function correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="config_parser",
            prompt="Create a config parser that supports YAML, JSON, and env vars with validation.",
            criteria=[EvalCriterion(name="multi_format", instruction="Supports multiple formats")],
            metadata={"category": "complex"},
        ),
    ],
}
249
+
250
+
251
def get_task_suite(suite_name: str) -> list[Task]:
    """Get a built-in task suite by name.

    Args:
        suite_name: Name of the suite ('quick', 'core', 'coding')

    Returns:
        List of Task objects

    Raises:
        ValueError: If suite_name is not found
    """
    suite = TASK_SUITES.get(suite_name)
    if suite is None:
        available = ", ".join(TASK_SUITES.keys())
        raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}")
    return suite
src/flow/harness/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Harness modules for Flow agent.
2
+
3
+ Harnesses are agent runtime adapters that convert different agent framework
4
+ events to a uniform Event format for CLI/UI consumption.
5
+
6
+ Available harnesses:
7
+ - maf: Microsoft Agent Framework harness
8
+ - (future) langchain: LangChain harness
9
+ - (future) claude: Claude SDK harness
10
+ """
11
+
12
+ from flow.harness.base import BaseHarness, Event, EventType
13
+
14
+ __all__ = [
15
+ "BaseHarness",
16
+ "Event",
17
+ "EventType",
18
+ ]
src/flow/harness/base.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base harness interface for agent runtimes.
2
+
3
+ Defines the abstract interface that all harnesses must implement,
4
+ allowing Flow to run on different agent frameworks.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from abc import ABC, abstractmethod
10
+ from collections.abc import AsyncIterator, Callable, Coroutine
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any
14
+
15
+
16
class EventType(Enum):
    """Types of events that can be streamed from an agent.

    Members carry plain string values, which keeps them readable when
    logged or serialized.
    """

    TEXT_DELTA = "text_delta"  # Streaming text chunk
    TEXT_DONE = "text_done"  # Text generation complete
    TOOL_CALL_START = "tool_call_start"  # Starting a tool call
    TOOL_CALL_ARGS = "tool_call_args"  # Tool call arguments (streaming)
    TOOL_CALL_DONE = "tool_call_done"  # Tool call complete
    TOOL_RESULT = "tool_result"  # Tool execution result
    THINKING = "thinking"  # Agent reasoning/thinking
    ERROR = "error"  # An error occurred
    DONE = "done"  # Agent run complete
28
+
29
+
30
@dataclass
class Event:
    """An event from the agent execution stream.

    Events provide real-time feedback during agent execution,
    allowing the CLI to display progress, tool calls, and results.
    """

    # What kind of activity this event reports (see EventType).
    type: EventType
    # Text payload; empty for events that carry no text.
    content: str = ""
    # Name of the tool involved, for TOOL_* events.
    tool_name: str | None = None
    # Identifier of the tool call this event belongs to (presumably used to
    # correlate start/args/done/result events — confirm in harness impls).
    tool_call_id: str | None = None
    # Extra details, restricted to scalar JSON-friendly value types.
    metadata: dict[str, str | int | float | bool | None] = field(default_factory=dict)
43
+
44
+
45
class BaseHarness(ABC):
    """Abstract base class for agent execution harnesses.

    A harness is a thin adapter that converts agent framework events
    to the uniform Flow Event format for CLI/UI consumption.

    Each harness implementation handles:
    - Taking a pre-configured agent from the framework
    - Running tasks on the agent
    - Converting framework-specific events to Flow Events
    - Managing conversation threads

    Implementations:
    - MAFHarness (flow.harness.maf): Microsoft Agent Framework
    - (Future) LangChainHarness: LangChain
    - (Future) ClaudeHarness: Claude SDK
    """

    @abstractmethod
    async def run(self, task: str, thread_id: str | None = None) -> str:
        """Run a task and return the final response.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Returns:
            The agent's final response text
        """
        ...

    @abstractmethod
    def run_stream(self, task: str, thread_id: str | None = None) -> AsyncIterator[Event]:
        """Run a task with streaming events.

        Deliberately a plain (non-async) def: implementations return an
        async iterator/generator that callers drive with ``async for``.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Yields:
            Event objects representing agent activity
        """
        ...

    @abstractmethod
    def register_tools(self, tools: list[Callable[..., Coroutine[Any, Any, str]]]) -> None:
        """Register tools with the harness.

        Args:
            tools: List of async tool functions (each returning a string result)
        """
        ...

    @abstractmethod
    def get_thread_id(self) -> str:
        """Get the current thread ID.

        Returns:
            The current conversation thread ID
        """
        ...

    @abstractmethod
    async def close(self) -> None:
        """Clean up resources used by the harness."""
        ...
src/flow/harness/maf/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Microsoft Agent Framework harness module.
2
+
3
+ Provides integration with Microsoft Agent Framework for running Flow agents.
4
+ """
5
+
6
+ from flow.harness.maf.agent import create_agent
7
+ from flow.harness.maf.harness import MAFHarness
8
+ from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
9
+
10
+ __all__ = [
11
+ "create_agent",
12
+ "HeadTailCompactingChatMessageStore",
13
+ "MAFHarness",
14
+ ]
src/flow/harness/maf/agent.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agent factory for Microsoft Agent Framework.
2
+
3
+ Provides factory functions to create configured ChatAgent instances.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ from collections.abc import Callable, Coroutine, Sequence
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
13
+ from flow.prompts import FLOW_AGENT_INSTRUCTIONS
14
+ from flow.tools import create_all_tools
15
+
16
+ if TYPE_CHECKING:
17
+ from agent_framework import ChatAgent
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Default filesystem locations used when the caller does not supply
# workspace/memory_path to create_agent(). Both live under ~/.flow/.
DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
24
+
25
+
26
+ def create_agent(
27
+ *,
28
+ # Model/API configuration
29
+ endpoint: str | None = None,
30
+ api_key: str | None = None,
31
+ deployment: str | None = None,
32
+ api_version: str = "2024-02-15-preview",
33
+ # Agent configuration
34
+ name: str = "Flow",
35
+ instructions: str | None = None,
36
+ # Workspace configuration
37
+ workspace: Path | None = None,
38
+ memory_path: Path | None = None,
39
+ # Tool configuration
40
+ tools: Sequence[Callable[..., Coroutine[Any, Any, str]]] | None = None,
41
+ enable_memory_tool: bool = True,
42
+ enable_sub_agent: bool = False,
43
+ bash_timeout: int = 120,
44
+ # Context engineering
45
+ enable_compaction: bool = True,
46
+ compaction_head_size: int = 10,
47
+ compaction_tail_size: int = 40,
48
+ ) -> "ChatAgent":
49
+ """Create a configured ChatAgent for Flow.
50
+
51
+ This factory creates a Microsoft Agent Framework ChatAgent with:
52
+ - Azure OpenAI as the backend
53
+ - Flow's standard tools (coding, execution, memory)
54
+ - Optional message compaction for long conversations
55
+ - Optional agent-managed memory tool
56
+ - Optional sub-agent for isolated research
57
+
58
+ Args:
59
+ endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
60
+ api_key: Azure OpenAI API key. Defaults to AZURE_OPENAI_API_KEY env var.
61
+ deployment: Azure OpenAI deployment name. Defaults to AZURE_OPENAI_DEPLOYMENT env var.
62
+ api_version: Azure OpenAI API version.
63
+ name: Agent name.
64
+ instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
65
+ workspace: Directory for file operations. Defaults to ~/.flow/workspace.
66
+ memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
67
+ tools: Custom tools to use. If None, creates standard Flow tools.
68
+ enable_memory_tool: Whether to include the memory tool (default: True).
69
+ enable_sub_agent: Whether to include the sub-agent tool (default: False).
70
+ bash_timeout: Timeout for bash commands in seconds.
71
+ enable_compaction: Whether to enable head+tail message compaction.
72
+ compaction_head_size: Number of initial messages to keep.
73
+ compaction_tail_size: Number of recent messages to keep.
74
+
75
+ Returns:
76
+ Configured ChatAgent instance.
77
+
78
+ Raises:
79
+ ImportError: If agent_framework is not installed.
80
+ ValueError: If required Azure OpenAI credentials are missing.
81
+
82
+ Example:
83
+ >>> from flow.harness.maf import create_agent
84
+ >>> agent = create_agent()
85
+ >>> thread = agent.get_new_thread()
86
+ >>> response = await agent.run("Create a hello world script", thread=thread)
87
+ """
88
+ try:
89
+ from agent_framework import ChatAgent, ai_function
90
+ from agent_framework.azure import AzureOpenAIChatClient
91
+ except ImportError as e:
92
+ raise ImportError(
93
+ "Microsoft Agent Framework is required. "
94
+ "Install with: pip install agent-framework-core"
95
+ ) from e
96
+
97
+ # Resolve configuration from environment if not provided
98
+ endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
99
+ api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
100
+ deployment = deployment or os.environ.get("AZURE_OPENAI_DEPLOYMENT")
101
+
102
+ if not endpoint:
103
+ raise ValueError(
104
+ "Azure OpenAI endpoint is required. "
105
+ "Set AZURE_OPENAI_ENDPOINT or pass endpoint parameter."
106
+ )
107
+ if not api_key:
108
+ raise ValueError(
109
+ "Azure OpenAI API key is required. "
110
+ "Set AZURE_OPENAI_API_KEY or pass api_key parameter."
111
+ )
112
+ if not deployment:
113
+ raise ValueError(
114
+ "Azure OpenAI deployment is required. "
115
+ "Set AZURE_OPENAI_DEPLOYMENT or pass deployment parameter."
116
+ )
117
+
118
+ # Resolve paths
119
+ workspace = workspace or DEFAULT_WORKSPACE
120
+ memory_path = memory_path or DEFAULT_MEMORY_PATH
121
+
122
+ # Ensure directories exist
123
+ workspace.mkdir(parents=True, exist_ok=True)
124
+ memory_path.mkdir(parents=True, exist_ok=True)
125
+
126
+ # Create or use provided tools
127
+ if tools is None:
128
+ tools = create_all_tools(
129
+ workspace=workspace,
130
+ memory_path=memory_path,
131
+ bash_timeout=bash_timeout,
132
+ enable_memory_tool=enable_memory_tool,
133
+ enable_sub_agent=enable_sub_agent,
134
+ )
135
+
136
+ # Wrap tools with ai_function decorator for Agent Framework
137
+ converted_tools = []
138
+ for tool_func in tools:
139
+ tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
140
+ tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
141
+ wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
142
+ converted_tools.append(wrapped)
143
+
144
+ # Create the chat client
145
+ client = AzureOpenAIChatClient(
146
+ api_key=api_key,
147
+ endpoint=endpoint,
148
+ deployment=deployment,
149
+ api_version=api_version,
150
+ )
151
+
152
+ # Create message store factory if compaction is enabled
153
+ message_store_factory = None
154
+ if enable_compaction:
155
+ def create_compacting_store() -> HeadTailCompactingChatMessageStore:
156
+ return HeadTailCompactingChatMessageStore(
157
+ head_size=compaction_head_size,
158
+ tail_size=compaction_tail_size,
159
+ )
160
+
161
+ message_store_factory = create_compacting_store
162
+ logger.debug(
163
+ f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
164
+ )
165
+
166
+ # Create the agent
167
+ agent = ChatAgent(
168
+ name=name,
169
+ description="Autonomous coding agent",
170
+ instructions=instructions or FLOW_AGENT_INSTRUCTIONS,
171
+ chat_client=client,
172
+ tools=converted_tools,
173
+ chat_message_store_factory=message_store_factory,
174
+ )
175
+
176
+ return agent
src/flow/harness/maf/harness.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Microsoft Agent Framework harness.
2
+
3
+ A thin adapter that converts Agent Framework events to the uniform Flow Event format.
4
+ """
5
+
6
+ import logging
7
+ import uuid
8
+ from collections.abc import AsyncIterator
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from flow.harness.base import BaseHarness, Event, EventType
12
+
13
+ if TYPE_CHECKING:
14
+ from agent_framework import ChatAgent
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
# Process-wide flag: instrumentation must only be enabled once per
# interpreter, even when multiple MAFHarness instances are created.
_instrumentation_enabled = False
20
+
21
+
22
def _enable_instrumentation() -> None:
    """Enable OpenTelemetry instrumentation for Agent Framework.

    Idempotent: only the first successful call per process has any effect.
    Called when a harness is constructed so that trace collection works
    for experiments. Failures are logged at DEBUG and never raised.
    """
    global _instrumentation_enabled
    if _instrumentation_enabled:
        return

    try:
        from agent_framework.observability import enable_instrumentation
        enable_instrumentation()
    except ImportError:
        # Optional dependency not installed - instrumentation is a no-op.
        logger.debug("Agent Framework observability not available")
    except Exception as exc:
        # Best-effort: never let telemetry setup break harness creation.
        logger.debug("Could not enable instrumentation: %s", exc)
    else:
        _instrumentation_enabled = True
        logger.debug("Agent Framework instrumentation enabled")
41
+
42
+
43
class MAFHarness(BaseHarness):
    """Harness adapter for Microsoft Agent Framework.

    This adapter:
    1. Takes a ChatAgent (or creates one with default settings)
    2. Runs tasks on the agent
    3. Converts Agent Framework events to uniform Flow Events

    Example:
        >>> from flow.harness.maf import MAFHarness
        >>> # Simple usage - creates agent with defaults
        >>> harness = MAFHarness()
        >>> async for event in harness.run_stream("Create a hello world script"):
        ...     print(event)

        >>> # Or with custom agent
        >>> from flow.harness.maf import create_agent
        >>> agent = create_agent(enable_compaction=False)
        >>> harness = MAFHarness(agent)
    """

    def __init__(
        self,
        agent: "ChatAgent | None" = None,
        **create_agent_kwargs: Any,
    ) -> None:
        """Initialize the harness.

        Args:
            agent: Optional ChatAgent instance. If not provided, creates one
                using create_agent() with the given kwargs.
            **create_agent_kwargs: Passed to create_agent() if agent is None.
                Common options: workspace, memory_path,
                enable_compaction, enable_memory_tool.
        """
        if agent is None:
            from flow.harness.maf.agent import create_agent
            agent = create_agent(**create_agent_kwargs)
        self._agent: ChatAgent = agent  # type: ignore[assignment]
        self._thread: Any = None  # AgentThread for conversation continuity
        self._thread_id: str | None = None
        # Tool call IDs seen during the current run; used to suppress
        # duplicate TOOL_CALL_START events for the same call_id.
        self._seen_tool_calls: set[str] = set()

        # Enable OpenTelemetry instrumentation for trace collection
        _enable_instrumentation()

    def register_tools(self, tools: list[Any]) -> None:
        """Register tools with the harness.

        Note: For MAFHarness, tools should be configured when creating the agent
        via create_agent(). This method is provided for interface compatibility
        but will log a warning if called.

        Args:
            tools: List of tool functions (ignored - configure via create_agent)
        """
        logger.warning(
            "MAFHarness.register_tools() called but tools should be configured "
            "via create_agent(). These tools will be ignored."
        )

    async def run(self, task: str, thread_id: str | None = None) -> str:
        """Run a task and return the final response.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Returns:
            The agent's final response text
        """
        if thread_id:
            self._thread_id = thread_id

        # Lazily create an AgentThread so repeated run() calls share history
        if self._thread is None:
            self._thread = self._agent.get_new_thread()

        response = await self._agent.run(task, thread=self._thread)

        # Extract text from the response; fall back to str() for response
        # types without a .content attribute.
        # NOTE(review): assumes the framework response exposes .content -
        # confirm against the installed agent_framework version.
        content = getattr(response, "content", None)
        if content is not None:
            return str(content)
        return str(response)

    async def run_stream(
        self, task: str, thread_id: str | None = None
    ) -> AsyncIterator[Event]:
        """Run a task with streaming events.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Yields:
            Event objects representing agent activity. A DONE event is
            emitted on success; an ERROR event (without DONE) on failure.
        """
        if thread_id:
            self._thread_id = thread_id

        # Get or create an AgentThread for conversation continuity
        if self._thread is None:
            self._thread = self._agent.get_new_thread()

        # Fresh run: forget tool calls from previous runs
        self._seen_tool_calls.clear()

        try:
            # Check if agent supports streaming
            if hasattr(self._agent, "run_stream"):
                async for chunk in self._agent.run_stream(task, thread=self._thread):
                    # Convert agent_framework events to Flow events
                    for event in self._convert_event(chunk):
                        yield event
            else:
                # Fallback: run non-streaming and emit a single event
                response = await self._agent.run(task, thread=self._thread)
                response_content = getattr(response, "content", None)
                content = str(response_content) if response_content is not None else str(response)
                yield Event(type=EventType.TEXT_DONE, content=content)

            yield Event(type=EventType.DONE)

        except Exception as e:
            yield Event(type=EventType.ERROR, content=str(e))

    def _convert_event(self, chunk: Any) -> list[Event]:
        """Convert an agent_framework event to Flow Events.

        Args:
            chunk: Event from agent_framework (AgentResponseUpdate)

        Returns:
            List of converted Events (may be empty)
        """
        events: list[Event] = []
        chunk_type = type(chunk).__name__

        # AgentResponseUpdate/AgentRunResponseUpdate has .contents list and .text property
        if chunk_type in ("AgentResponseUpdate", "AgentRunResponseUpdate") or hasattr(chunk, "contents"):
            contents = getattr(chunk, "contents", []) or []

            for content in contents:
                content_type = type(content).__name__

                if content_type == "TextContent":
                    text = getattr(content, "text", "")
                    if text:
                        events.append(Event(type=EventType.TEXT_DELTA, content=text))

                elif content_type == "FunctionCallContent":
                    # Streaming pattern:
                    # - First chunk has call_id and name set, arguments=''
                    # - Subsequent chunks have empty call_id/name, just argument fragments
                    call_id = getattr(content, "call_id", "") or ""
                    name = getattr(content, "name", "") or ""
                    args = getattr(content, "arguments", "") or ""

                    if call_id and name:
                        # Emit TOOL_CALL_START only once per call_id
                        # (deduplicated via _seen_tool_calls).
                        if call_id not in self._seen_tool_calls:
                            self._seen_tool_calls.add(call_id)
                            events.append(Event(
                                type=EventType.TOOL_CALL_START,
                                tool_name=name,
                                tool_call_id=call_id,
                            ))
                        # Non-streaming responses may carry the full arguments
                        # on the same content item - don't drop them.
                        if args:
                            events.append(Event(
                                type=EventType.TOOL_CALL_ARGS,
                                content=args,
                            ))
                    elif args:
                        # Argument fragment - emit as TOOL_CALL_ARGS
                        events.append(Event(
                            type=EventType.TOOL_CALL_ARGS,
                            content=args,
                        ))

                elif content_type == "FunctionResultContent":
                    result = getattr(content, "result", "")
                    call_id = getattr(content, "call_id", None)
                    events.append(Event(
                        type=EventType.TOOL_RESULT,
                        content=str(result),
                        tool_call_id=call_id,
                    ))
                    # Emit TOOL_CALL_DONE after result
                    events.append(Event(type=EventType.TOOL_CALL_DONE))

            # If no contents but has text, use that
            if not events and hasattr(chunk, "text"):
                text = chunk.text
                if text:
                    events.append(Event(type=EventType.TEXT_DELTA, content=text))

        # Fallback for other chunk types
        elif hasattr(chunk, "text"):
            text = chunk.text
            if text:
                events.append(Event(type=EventType.TEXT_DELTA, content=text))

        return events

    def get_thread_id(self) -> str:
        """Get the current thread ID, generating one lazily if needed.

        Returns:
            The current conversation thread ID
        """
        if self._thread_id is None:
            self._thread_id = str(uuid.uuid4())
        return self._thread_id

    async def close(self) -> None:
        """Clean up resources used by the harness."""
        # Agent Framework doesn't require explicit cleanup
        self._thread = None
        self._thread_id = None
src/flow/harness/maf/message_store.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Message store implementations for Microsoft Agent Framework.
2
+
3
+ Provides ChatMessageStoreProtocol implementations for context management.
4
+ """
5
+
6
+ from collections.abc import MutableMapping, Sequence
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING:
10
+ from agent_framework import ChatMessage
11
+
12
+
13
+ class HeadTailCompactingChatMessageStore:
14
+ """A compacting message store that works directly with Agent Framework ChatMessage.
15
+
16
+ This store implements ChatMessageStoreProtocol and keeps the first N messages
17
+ (head) and last M messages (tail), dropping middle messages to prevent
18
+ context overflow in long conversations.
19
+
20
+ IMPORTANT: This store preserves full ChatMessage objects including:
21
+ - FunctionCallContent (tool calls)
22
+ - FunctionResultContent (tool results)
23
+ - All other content types
24
+
25
+ This is critical because OpenAI's API requires tool results to immediately
26
+ follow their corresponding tool calls.
27
+
28
+ The compaction strategy:
29
+ - Keeps the first N messages (task context, initial instructions)
30
+ - Keeps the last M messages (recent work, current state)
31
+ - Drops middle messages to prevent context overflow
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ messages: Sequence["ChatMessage"] | None = None,
37
+ head_size: int = 10,
38
+ tail_size: int = 40,
39
+ ) -> None:
40
+ """Initialize the compacting store.
41
+
42
+ Args:
43
+ messages: Initial messages to store
44
+ head_size: Number of initial messages to keep
45
+ tail_size: Number of recent messages to keep
46
+ """
47
+ if head_size < 0:
48
+ raise ValueError("head_size must be non-negative")
49
+ if tail_size < 0:
50
+ raise ValueError("tail_size must be non-negative")
51
+
52
+ self._messages: list["ChatMessage"] = list(messages) if messages else []
53
+ self._head_size = head_size
54
+ self._tail_size = tail_size
55
+
56
+ @property
57
+ def head_size(self) -> int:
58
+ """Number of messages kept from the beginning."""
59
+ return self._head_size
60
+
61
+ @property
62
+ def tail_size(self) -> int:
63
+ """Number of messages kept from the end."""
64
+ return self._tail_size
65
+
66
+ @property
67
+ def total_messages(self) -> int:
68
+ """Total number of messages stored (before compaction)."""
69
+ return len(self._messages)
70
+
71
+ @property
72
+ def compacted_count(self) -> int:
73
+ """Number of messages that would be returned by list_messages()."""
74
+ total = len(self._messages)
75
+ max_kept = self._head_size + self._tail_size
76
+ return min(total, max_kept)
77
+
78
+ @property
79
+ def dropped_count(self) -> int:
80
+ """Number of messages dropped during compaction."""
81
+ return max(0, self.total_messages - self.compacted_count)
82
+
83
+ async def add_messages(self, messages: Sequence["ChatMessage"]) -> None:
84
+ """Add messages to the store.
85
+
86
+ Messages are stored as-is, preserving all content types.
87
+
88
+ Args:
89
+ messages: Sequence of ChatMessage objects to add
90
+ """
91
+ self._messages.extend(messages)
92
+
93
+ async def list_messages(self) -> list["ChatMessage"]:
94
+ """Get messages with head+tail compaction applied.
95
+
96
+ Returns the first head_size messages plus the last tail_size messages.
97
+ If total messages <= head_size + tail_size, returns all messages.
98
+
99
+ Returns:
100
+ List of ChatMessage objects after compaction
101
+ """
102
+ total = len(self._messages)
103
+ max_kept = self._head_size + self._tail_size
104
+
105
+ # No compaction needed
106
+ if total <= max_kept:
107
+ return list(self._messages)
108
+
109
+ # Return head + tail
110
+ head = self._messages[: self._head_size]
111
+ tail = self._messages[-self._tail_size :] if self._tail_size > 0 else []
112
+
113
+ return head + tail
114
+
115
+ @classmethod
116
+ async def deserialize(
117
+ cls,
118
+ serialized_store_state: MutableMapping[str, Any],
119
+ **kwargs: Any,
120
+ ) -> "HeadTailCompactingChatMessageStore":
121
+ """Create store from serialized state."""
122
+ from agent_framework import ChatMessage
123
+
124
+ head_size = kwargs.get("head_size", serialized_store_state.get("head_size", 10))
125
+ tail_size = kwargs.get("tail_size", serialized_store_state.get("tail_size", 40))
126
+
127
+ messages_data = serialized_store_state.get("messages", [])
128
+ messages = [
129
+ ChatMessage.from_dict(m) if isinstance(m, dict) else m
130
+ for m in messages_data
131
+ ]
132
+
133
+ return cls(messages=messages, head_size=head_size, tail_size=tail_size)
134
+
135
+ async def update_from_state(
136
+ self,
137
+ serialized_store_state: MutableMapping[str, Any],
138
+ **kwargs: Any,
139
+ ) -> None:
140
+ """Update store from serialized state."""
141
+ from agent_framework import ChatMessage
142
+
143
+ if not serialized_store_state:
144
+ return
145
+
146
+ messages_data = serialized_store_state.get("messages", [])
147
+ self._messages = [
148
+ ChatMessage.from_dict(m) if isinstance(m, dict) else m
149
+ for m in messages_data
150
+ ]
151
+
152
+ if "head_size" in serialized_store_state:
153
+ self._head_size = serialized_store_state["head_size"]
154
+ if "tail_size" in serialized_store_state:
155
+ self._tail_size = serialized_store_state["tail_size"]
156
+
157
+ async def serialize(self, **kwargs: Any) -> dict[str, Any]:
158
+ """Serialize the store state.
159
+
160
+ Serializes ALL messages (not just compacted view) plus configuration.
161
+ """
162
+ return {
163
+ "messages": [m.to_dict() for m in self._messages],
164
+ "head_size": self._head_size,
165
+ "tail_size": self._tail_size,
166
+ }
167
+
168
+ @property
169
+ def stats(self) -> dict[str, int]:
170
+ """Get compaction statistics."""
171
+ return {
172
+ "total_messages": self.total_messages,
173
+ "compacted_count": self.compacted_count,
174
+ "dropped_count": self.dropped_count,
175
+ "head_size": self._head_size,
176
+ "tail_size": self._tail_size,
177
+ }
src/flow/prompts.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompts for the Flow agent.
2
+
3
+ Defines the structured workflow for software engineering tasks.
4
+ """
5
+
6
+ FLOW_AGENT_INSTRUCTIONS = """
7
+ You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
8
+
9
+ ## CORE PRINCIPLE: BE AUTONOMOUS
10
+
11
+ **You are NOT just an assistant that tells users what to do. You ARE the one who does it.**
12
+
13
+ When asked to solve a task:
14
+ 1. **DO IT YOURSELF** - Don't tell the user to run commands. Run them yourself.
15
+ 2. **COMPLETE THE LOOP** - Write code AND execute it. Don't stop at writing.
16
+ 3. **VERIFY YOUR WORK** - Test that it actually works before reporting done.
17
+ 4. **ITERATE ON FAILURES** - If something fails, fix it and try again.
18
+
19
+ **Example - BAD (passive):**
20
+ > "Here's the code. You can run it with `python script.py`"
21
+
22
+ **Example - GOOD (autonomous):**
23
+ > *writes code* → *executes code* → *sees output* → *fixes any errors*
24
+ > → "Done! The script ran successfully and output X."
25
+
26
+ ---
27
+
28
+ ## YOUR CAPABILITIES
29
+
30
+ **Coding Tools:**
31
+ - `read_file`: Read file contents with line numbers
32
+ - `write_file`: Create/edit files (full write, str_replace, or insert_at_line)
33
+ - `list_directory`: Explore project structure
34
+ - `grep_search`: Search for patterns in code (regex supported)
35
+
36
+ **Execution Tools:**
37
+ - `bash_execute`: Run shell commands (tests, git, npm, pip, builds, etc.)
38
+ - `python_repl`: Execute Python code snippets for quick validation
39
+
40
+ **Research Tools (if available):**
41
+ - `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
42
+ - `web_fetch`: Fetch and read content from URLs
43
+
44
+ **Memory Tools:**
45
+ - `memory`: Persistent storage that survives across conversations
46
+ - view: See directory or file contents
47
+ - create: Create new files
48
+ - str_replace: Edit existing files
49
+ - append: Add to files
50
+ - search: Find text across memory
51
+ - delete: Remove files
52
+
53
+ **Thinking Tools:**
54
+ - `think`: Pause to reason through complex problems
55
+ - `task_done`: Report when task is complete or blocked
56
+
57
+ **Skills Tool (if available):**
58
+ - `skills`: Discover and load domain-specific expertise
59
+ - `skills(action='list')`: See available skills with descriptions
60
+ - `skills(action='load', name='skill-name')`: Load full skill content
61
+
62
+ ---
63
+
64
+ ## WORKFLOW
65
+
66
+ ### 1. UNDERSTAND
67
+ - Read the user's request carefully
68
+ - **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
69
+ - Use `list_directory` to understand the workspace structure
70
+ - Use `grep_search` to find relevant existing code
71
+ - Check memory for relevant patterns: `memory(command="view", path="/memory")`
72
+
73
+ ### 2. PLAN
74
+ - Use `think` tool to plan your approach for complex tasks
75
+ - Break down into small, testable steps
76
+ - Consider edge cases and error handling
77
+
78
+ ### 3. EXECUTE
79
+ - Create/edit files using `write_file`
80
+ - Test changes using `bash_execute` or `python_repl`
81
+ - Fix issues immediately when tests fail
82
+
83
+ ### 4. VERIFY (REQUIRED)
84
+ **You MUST test your work before calling `task_done`.** Never assume code works.
85
+
86
+ **For Python apps/scripts:**
87
+ ```
88
+ bash_execute("cd project && python -c 'import main'") # Check imports work
89
+ bash_execute("cd project && python main.py --help") # Test CLI if applicable
90
+ bash_execute("cd project && pytest") # Run tests if they exist
91
+ ```
92
+
93
+ **For JavaScript/TypeScript:**
94
+ ```
95
+ bash_execute("cd project && npm install && npm run build") # Must pass!
96
+ bash_execute("cd project && npx tsc --noEmit") # Type check
97
+ ```
98
+
99
+ **For Web APIs (FastAPI, Express, etc.):**
100
+ ```
101
+ # Start server in background, test with curl, then cleanup
102
+ bash_execute("cd project && uvicorn main:app --port 8000 &", background=True)
103
+ bash_execute("sleep 2 && curl http://localhost:8000/health") # Test endpoint
104
+ bash_execute("check_processes action=list") # Verify it's running
105
+ # When done testing, kill the process
106
+ ```
107
+
108
+ **For Frontend apps (React, Vue, etc.):**
109
+ ```
110
+ bash_execute("cd project && npm run build") # Production build must succeed
111
+ # If you need to test dev server, use background=True
112
+ ```
113
+
114
+ **For full-stack apps:**
115
+ 1. Test backend API with curl (start in background)
116
+ 2. Test frontend build succeeds
117
+ 3. Clean up background processes when done
118
+
119
+ ### 5. COMPLETE
120
+ - Clean up any background processes you started
121
+ - Call `task_done` with status and summary
122
+ - Include files created and suggested next steps
123
+
124
+ ---
125
+
126
+ ## WORKSPACE
127
+
128
+ Your workspace is at `~/.flow/workspace/`
129
+
130
+ **Organization:**
131
+ - Create a folder for each project (e.g., `todo_app/`, `calculator/`)
132
+ - Use `list_directory` to see existing projects before creating new ones
133
+ - Follow standard project structure conventions:
134
+ - Python: `src/`, `tests/`, `requirements.txt` or `pyproject.toml`
135
+ - JavaScript: `src/`, `package.json`, standard Node.js layout
136
+ - Full-stack: `backend/`, `frontend/` folders
137
+
138
+ **Important:**
139
+ - Each `bash_execute` runs from workspace root in a fresh shell
140
+ - Use `cd project && command` for commands in subdirectories
141
+ - Multiple commands: `cd project && cmd1 && cmd2`
142
+
143
+ ---
144
+
145
+ ## MEMORY
146
+
147
+ Your memory persists at `~/.flow/memory/`
148
+
149
+ **Recommended structure:**
150
+ - `/memory/patterns/` - Reusable solutions and code patterns
151
+ - `/memory/projects/` - Per-project context and notes
152
+ - `/memory/decisions/` - Why you made certain choices
153
+
154
+ **Best practices:**
155
+ When storing information, include context:
156
+ - **Date**: When was this created/learned?
157
+ - **Project**: What project did this come from?
158
+ - **Context**: Why was this approach chosen?
159
+
160
+ **Example pattern file** (`/memory/patterns/fastapi_cors.md`):
161
+ ```markdown
162
+ # FastAPI CORS Setup
163
+ Created: 2025-01-15
164
+ Source: sleep_tracker project
165
+
166
+ ## Pattern
167
+ from fastapi.middleware.cors import CORSMiddleware
168
+ app.add_middleware(
169
+ CORSMiddleware,
170
+ allow_origins=["*"],
171
+ allow_methods=["*"],
172
+ allow_headers=["*"],
173
+ )
174
+
175
+ ## When to use
176
+ - Full-stack apps with separate frontend/backend
177
+ - Frontend on different port than backend
178
+
179
+ ## Notes
180
+ - Must add before routes
181
+ - Restrict origins in production
182
+ ```
183
+
184
+ **Check memory first** - you may have solved similar problems before!
185
+
186
+ ---
187
+
188
+ ## CLI TOOLS
189
+
190
+ Many CLI tools have interactive prompts that will hang.
191
+ ALWAYS use non-interactive flags:
192
+
193
+ ```bash
194
+ # Good
195
+ npm create vite@latest myapp -- --template react-ts
196
+ pip install -q package
197
+ npx shadcn@latest init --defaults --yes
198
+
199
+ # Bad (will hang)
200
+ npm create vite@latest myapp # Interactive prompts
201
+ npx shadcn init # Interactive prompts
202
+ ```
203
+
204
+ **Shadcn UI** is a CLI tool, not an npm package:
205
+ ```bash
206
+ # Wrong
207
+ npm install @shadcn/ui
208
+
209
+ # Right
210
+ npx shadcn@latest init --defaults --yes
211
+ npx shadcn@latest add button card --yes
212
+ ```
213
+
214
+ ---
215
+
216
+ ## FULL-STACK APPS
217
+
218
+ When building apps with separate frontend and backend:
219
+
220
+ 1. **Always add CORS to backend:**
221
+ ```python
222
+ from fastapi.middleware.cors import CORSMiddleware
223
+ app.add_middleware(
224
+ CORSMiddleware,
225
+ allow_origins=["*"], # Restrict in production
226
+ allow_methods=["*"],
227
+ allow_headers=["*"],
228
+ )
229
+ ```
230
+
231
+ 2. **Document which ports each server uses**
232
+
233
+ 3. **Verify both sides build/run:**
234
+ ```bash
235
+ cd backend && python -c "from main import app; print('Backend OK')"
236
+ cd frontend && npm run build && echo "Frontend OK"
237
+ ```
238
+
239
+ ---
240
+
241
+ ## BACKGROUND PROCESSES
242
+
243
+ When you need to start long-running processes (servers, watchers, etc.):
244
+
245
+ **Use `background=True` parameter:**
246
+ ```python
247
+ # Start a server in background - returns immediately with PID
248
+ bash_execute("uvicorn main:app --port 8000", background=True)
249
+
250
+ # Then test it
251
+ bash_execute("curl http://localhost:8000/health")
252
+
253
+ # Check what's running
254
+ check_processes(action="list")
255
+
256
+ # Clean up when done
257
+ check_processes(action="kill", pid=12345)
258
+ ```
259
+
260
+ **Process registry** is at `/memory/processes.md` - view it with:
261
+ `memory(command='view', path='/memory/processes.md')`
262
+
263
+ **IMPORTANT:**
264
+ - NEVER start servers without `background=True` - they will timeout after 120s
265
+ - ALWAYS clean up background processes when done testing
266
+ - Check for port conflicts before starting servers
267
+
268
+ **Common patterns:**
269
+ ```bash
270
+ # Good - background server for testing
271
+ bash_execute("cd backend && uvicorn main:app --port 8000", background=True)
272
+ bash_execute("sleep 2") # Wait for startup
273
+ bash_execute("curl localhost:8000/docs") # Test
274
+ check_processes(action="cleanup") # Kill all when done
275
+
276
+ # Bad - will timeout!
277
+ bash_execute("uvicorn main:app --port 8000") # Blocks forever
278
+ ```
279
+
280
+ ---
281
+
282
+ ## ERROR HANDLING
283
+
284
+ - If a command fails, analyze the error and try alternatives
285
+ - Log failures and solutions to memory for future reference
286
+ - Don't give up after first failure - iterate
287
+ - If truly blocked, call `task_done` with status="incomplete" and explain why
288
+
289
+ ---
290
+
291
+ ## SKILLS
292
+
293
+ **If the `skills` tool is available**, use it to access domain-specific expertise:
294
+
295
+ ```python
296
+ # At the start of complex tasks, discover what expertise is available
297
+ skills(action='list')
298
+
299
+ # Output shows available skills with descriptions:
300
+ # - fastapi-patterns: Build REST APIs with FastAPI...
301
+ # - react-components: Build React components with hooks...
302
+ # - testing-strategies: Write comprehensive tests...
303
+
304
+ # Load relevant skills before implementation
305
+ skills(action='load', name='fastapi-patterns')
306
+ ```
307
+
308
+ **Skills provide:**
309
+ - Domain-specific patterns and best practices
310
+ - Code examples and templates
311
+ - Common pitfalls to avoid
312
+
313
+ **When to load skills:**
314
+ - Before starting a new project type (API, frontend, CLI)
315
+ - When working with unfamiliar frameworks
316
+ - For complex tasks requiring specialized knowledge
317
+
318
+ **Skills location:** `~/.flow/skills/`
319
+ Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
320
+
321
+ ---
322
+
323
+ ## COMPOSING TOOLS FOR COMPLEX TASKS
324
+
325
+ **You have all the tools needed to solve problems end-to-end. Compose them!**
326
+
327
+ ### Example: "What's the weather API response for Seattle?"
328
+ ```
329
+ # DON'T just tell the user how to do it. DO IT:
330
+ 1. web_search("weather API free") → Find a free weather API
331
+ 2. web_fetch(api_docs_url) → Read the API documentation
332
+ 3. write_file("weather.py", code) → Write a script to call the API
333
+ 4. bash_execute("python weather.py") → Run it and get the answer
334
+ 5. Report the actual result to the user
335
+ ```
336
+
337
+ ### Example: "Create a CLI tool that converts CSV to JSON"
338
+ ```
339
+ 1. write_file("csv_to_json.py", code) → Write the tool
340
+ 2. write_file("test.csv", sample_data) → Create test data
341
+ 3. bash_execute("python csv_to_json.py test.csv") → Test it works
342
+ 4. bash_execute("cat output.json") → Verify the output
343
+ 5. Report success with example output
344
+ ```
345
+
346
+ ### Example: "Find and summarize the latest Python 3.12 features"
347
+ ```
348
+ 1. web_search("Python 3.12 new features") → Find relevant pages
349
+ 2. web_fetch(python_docs_url) → Read the official docs
350
+ 3. Summarize findings directly OR write to a file if requested
351
+ ```
352
+
353
+ ### Example: "Debug why my FastAPI app returns 500 errors"
354
+ ```
355
+ 1. read_file("main.py") → Understand the code
356
+ 2. bash_execute("cd app && python -c 'from main import app'") → Check imports
357
+ 3. bash_execute("cd app && uvicorn main:app --port 8000", background=True) → Start server
358
+ 4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
359
+ 5. Analyze error → Fix code → Test again → Iterate until fixed
360
+ ```
361
+
362
+ ---
363
+
364
+ ## RESEARCH WORKFLOW
365
+
366
+ When you need information from the web:
367
+
368
+ 1. **Search first**: Use `web_search` to find relevant URLs
369
+ 2. **Fetch details**: Use `web_fetch` to read specific pages
370
+ 3. **Apply knowledge**: Write code, update configs, or summarize findings
371
+
372
+ **Example - Learning a new library:**
373
+ ```python
374
+ # 1. Search for docs
375
+ web_search("httpx python async http client tutorial")
376
+
377
+ # 2. Read the documentation
378
+ web_fetch("https://www.python-httpx.org/quickstart/", output_format="markdown")
379
+
380
+ # 3. Write code using what you learned
381
+ write_file("http_client.py", '''
382
+ import httpx
383
+ async def fetch_data(url):
384
+ async with httpx.AsyncClient() as client:
385
+ return await client.get(url)
386
+ ''')
387
+
388
+ # 4. Test it
389
+ python_repl("import httpx; print(httpx.__version__)")
390
+ ```
391
+
392
+ ---
393
+
394
+ ## REMEMBER
395
+
396
+ 1. **BE AUTONOMOUS** - Do the work yourself, don't instruct the user
397
+ 2. **COMPLETE THE LOOP** - Write code → Execute → Verify → Report results
398
+ 3. **COMPOSE TOOLS** - Chain multiple tools to solve complex problems
399
+ 4. **RESEARCH WHEN NEEDED** - Use web_search/web_fetch to learn new things
400
+ 5. **ITERATE ON FAILURES** - Don't give up, debug and fix issues
401
+ 6. **TEST EVERYTHING** - Never assume code works
402
+ 7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
403
+ 8. **CLEAN UP** - Kill background processes when done
404
+ 9. **STORE LEARNINGS** - Save patterns to memory for future use
405
+
406
+ **Your goal is to deliver RESULTS, not instructions.**
407
+ """
src/flow/py.typed ADDED
File without changes
src/flow/tools/__init__.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow agent tools.
2
+
3
+ Provides coding, execution, memory, and core tools for software engineering tasks.
4
+ Tools are harness-agnostic - they return plain data that harnesses adapt.
5
+ """
6
+
7
import inspect
import types
from collections.abc import Callable, Sequence
from functools import wraps
from pathlib import Path
from typing import Any, Union, get_args, get_origin, get_type_hints
12
+
13
+ from flow.tools.coding import create_coding_tools
14
+ from flow.tools.core import create_core_tools
15
+ from flow.tools.execution import create_execution_tools
16
+ from flow.tools.memory import create_memory_tool
17
+ from flow.tools.sub_agent import create_sub_agent_tool
18
+
19
+ __all__ = [
20
+ "create_all_tools",
21
+ "create_coding_tools",
22
+ "create_core_tools",
23
+ "create_execution_tools",
24
+ "create_memory_tool",
25
+ "create_sub_agent_tool",
26
+ "get_tool_schema",
27
+ "tool",
28
+ ]
29
+
30
+
31
+ def tool(
32
+ name: str | None = None,
33
+ description: str | None = None,
34
+ ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
35
+ """Decorator to mark a function as an agent tool.
36
+
37
+ This decorator adds metadata to functions that allows harnesses
38
+ to discover and use them as agent tools.
39
+
40
+ Args:
41
+ name: Tool name (defaults to function name)
42
+ description: Tool description (defaults to docstring)
43
+
44
+ Returns:
45
+ Decorated function with tool metadata
46
+
47
+ Example:
48
+ @tool(name="read_file", description="Read file contents")
49
+ async def read_file(path: str) -> str:
50
+ ...
51
+ """
52
+
53
+ def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
54
+ @wraps(func)
55
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
56
+ return func(*args, **kwargs)
57
+
58
+ # Store tool metadata
59
+ wrapper._tool_name = name or func.__name__ # type: ignore[attr-defined]
60
+ wrapper._tool_description = description or func.__doc__ or "" # type: ignore[attr-defined]
61
+ wrapper._is_tool = True # type: ignore[attr-defined]
62
+
63
+ return wrapper
64
+
65
+ return decorator
66
+
67
+
68
+ def get_tool_schema(func: Callable[..., Any]) -> dict[str, Any]:
69
+ """Extract JSON schema from a tool function.
70
+
71
+ Uses type hints and Annotated metadata to build the schema.
72
+
73
+ Args:
74
+ func: Tool function to extract schema from
75
+
76
+ Returns:
77
+ JSON schema dict for the tool's parameters
78
+ """
79
+ hints = get_type_hints(func, include_extras=True)
80
+ sig = inspect.signature(func)
81
+
82
+ properties: dict[str, Any] = {}
83
+ required: list[str] = []
84
+
85
+ for param_name, param in sig.parameters.items():
86
+ if param_name in ("self", "cls"):
87
+ continue
88
+
89
+ param_schema: dict[str, Any] = {}
90
+ hint = hints.get(param_name, Any)
91
+
92
+ # Handle Annotated types
93
+ origin = getattr(hint, "__origin__", None)
94
+ if origin is not None:
95
+ # Check if it's Annotated
96
+ if hasattr(hint, "__metadata__"):
97
+ # Extract description from Annotated metadata
98
+ for meta in hint.__metadata__:
99
+ if isinstance(meta, str):
100
+ param_schema["description"] = meta
101
+ break
102
+ # Get the actual type
103
+ hint = hint.__args__[0]
104
+ origin = getattr(hint, "__origin__", None)
105
+
106
+ # Map Python types to JSON schema types
107
+ if hint is str:
108
+ param_schema["type"] = "string"
109
+ elif hint is int:
110
+ param_schema["type"] = "integer"
111
+ elif hint is float:
112
+ param_schema["type"] = "number"
113
+ elif hint is bool:
114
+ param_schema["type"] = "boolean"
115
+ elif origin is list:
116
+ param_schema["type"] = "array"
117
+ elif origin is dict:
118
+ param_schema["type"] = "object"
119
+ else:
120
+ param_schema["type"] = "string" # Default fallback
121
+
122
+ properties[param_name] = param_schema
123
+
124
+ # Check if parameter is required (no default value)
125
+ if param.default is inspect.Parameter.empty:
126
+ required.append(param_name)
127
+
128
+ return {
129
+ "type": "object",
130
+ "properties": properties,
131
+ "required": required,
132
+ }
133
+
134
+
135
def create_all_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
    *,
    enable_memory_tool: bool = True,
    enable_sub_agent: bool = False,
    sub_agent_model: str = "gpt-4o-mini",
) -> Sequence[Callable[..., Any]]:
    """Create all standard tools for the Flow agent.

    Args:
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory
        bash_timeout: Timeout for bash commands in seconds
        enable_memory_tool: Whether to include the memory tool
        enable_sub_agent: Whether to include the sub-agent research tool
        sub_agent_model: Model to use for sub-agent (default: gpt-4o-mini)

    Returns:
        List of all tool functions
    """
    # Coding, execution, and core tools are unconditional.
    all_tools: list[Callable[..., Any]] = [
        *create_coding_tools(workspace),
        *create_execution_tools(workspace, memory_path, bash_timeout),
        *create_core_tools(),
    ]

    # Agent-managed memory tool is opt-out.
    if enable_memory_tool:
        all_tools.append(create_memory_tool(memory_path))

    # Sub-agent for isolated research is opt-in.
    if enable_sub_agent:
        all_tools.append(create_sub_agent_tool(workspace, model=sub_agent_model))

    return all_tools
src/flow/tools/coding.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Coding tools for file operations and code search.
2
+
3
+ These tools enable agents to read/write files, list directories,
4
+ and search for patterns in code.
5
+
6
+ The agent can read and write to any path the user has access to.
7
+ The workspace serves as the default working directory for relative paths.
8
+ """
9
+
10
+ import re
11
+ from collections.abc import Callable, Coroutine, Sequence
12
+ from pathlib import Path
13
+ from typing import Annotated, Any
14
+
15
+
16
def create_read_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build a read_file tool bound to *workspace*.

    Args:
        workspace: Default directory for relative paths (not a restriction)
    """

    async def read_file(
        file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
        max_lines: Annotated[int, "Maximum lines to return (default: 500)"] = 500,
    ) -> str:
        """Read the contents of a file. Can read from any path on the system."""
        try:
            # Relative paths resolve against the workspace; absolute paths as-is.
            candidate = Path(file_path)
            resolved = candidate.resolve() if candidate.is_absolute() else (workspace / file_path).resolve()

            if not resolved.exists():
                return f"Error: File not found: {file_path}"
            if not resolved.is_file():
                return f"Error: Not a file: {file_path}"

            all_lines = resolved.read_text(encoding="utf-8").splitlines()
            total_lines = len(all_lines)

            # Cap the output at max_lines and note the truncation if it happened.
            shown = all_lines[:max_lines]
            suffix = (
                f"\n... (truncated, showing first {max_lines} of {total_lines} lines)"
                if total_lines > max_lines
                else ""
            )

            # Right-aligned 1-based line numbers, like a pager.
            numbered = "\n".join(f"{num:5d}: {text}" for num, text in enumerate(shown, start=1))
            return f"File: {resolved} ({total_lines} lines)\n{'=' * 40}\n{numbered}{suffix}"

        except UnicodeDecodeError:
            return f"Error: Cannot read file (binary or non-UTF-8): {file_path}"
        except PermissionError:
            return f"Error: Permission denied: {file_path}"
        except Exception as e:
            return f"Error reading file: {e}"

    # Metadata consumed by harnesses for tool discovery.
    read_file._tool_name = "read_file"  # type: ignore[attr-defined]
    read_file._tool_description = (  # type: ignore[attr-defined]
        "Read the contents of a file. Accepts absolute paths (e.g., /path/to/file) "
        "or relative paths (relative to workspace). Returns content with line numbers."
    )
    read_file._is_tool = True  # type: ignore[attr-defined]

    return read_file
75
+
76
+
77
+ def create_write_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
78
+ """Create a write_file tool.
79
+
80
+ Args:
81
+ workspace: Default directory for relative paths
82
+ """
83
+
84
+ async def write_file(
85
+ file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
86
+ content: Annotated[str | None, "Full content to write (for complete file write)"] = None,
87
+ old_str: Annotated[str | None, "Text to replace (for str_replace operation)"] = None,
88
+ new_str: Annotated[str | None, "Replacement text (for str_replace operation)"] = None,
89
+ insert_line: Annotated[int | None, "Line number to insert at (1-indexed)"] = None,
90
+ insert_content: Annotated[str | None, "Content to insert at line"] = None,
91
+ ) -> str:
92
+ """Write or edit file content.
93
+
94
+ Supports: (1) full file write with 'content',
95
+ (2) str_replace to replace specific text,
96
+ (3) insert_at_line to add content at a specific line.
97
+ Creates parent directories if needed.
98
+ """
99
+ try:
100
+ # Support both absolute and relative paths
101
+ path = Path(file_path)
102
+ if path.is_absolute():
103
+ full_path = path.resolve()
104
+ else:
105
+ full_path = (workspace / file_path).resolve()
106
+
107
+ # Create parent directories
108
+ full_path.parent.mkdir(parents=True, exist_ok=True)
109
+
110
+ # Operation 1: Full file write
111
+ if content is not None:
112
+ full_path.write_text(content, encoding="utf-8")
113
+ return f"Successfully wrote {len(content)} characters to {file_path}"
114
+
115
+ # Operation 2: str_replace
116
+ if old_str is not None and new_str is not None:
117
+ if not full_path.exists():
118
+ return f"Error: File not found for str_replace: {file_path}"
119
+
120
+ current_content = full_path.read_text(encoding="utf-8")
121
+
122
+ if old_str not in current_content:
123
+ # Show a snippet of the file to help debug
124
+ if len(current_content) > 500:
125
+ snippet = current_content[:500] + "..."
126
+ else:
127
+ snippet = current_content
128
+ return (
129
+ f"Error: String to replace not found in file.\n"
130
+ f"Searching for: '{old_str[:100]}...'\n"
131
+ f"File content preview:\n{snippet}"
132
+ )
133
+
134
+ # Replace first occurrence only
135
+ new_content = current_content.replace(old_str, new_str, 1)
136
+ full_path.write_text(new_content, encoding="utf-8")
137
+ return f"Successfully replaced text in {file_path}"
138
+
139
+ # Operation 3: insert_at_line
140
+ if insert_line is not None and insert_content is not None:
141
+ if full_path.exists():
142
+ current_content = full_path.read_text(encoding="utf-8")
143
+ lines = current_content.splitlines(keepends=True)
144
+ else:
145
+ lines = []
146
+
147
+ # Ensure insert_content ends with newline
148
+ if not insert_content.endswith("\n"):
149
+ insert_content += "\n"
150
+
151
+ # Insert at specified line (1-indexed)
152
+ insert_index = insert_line - 1
153
+ if insert_index < 0:
154
+ return f"Error: Invalid line number: {insert_line}. Must be >= 1."
155
+
156
+ # Allow inserting at end
157
+ if insert_index > len(lines):
158
+ insert_index = len(lines)
159
+
160
+ lines.insert(insert_index, insert_content)
161
+ new_content = "".join(lines)
162
+ full_path.write_text(new_content, encoding="utf-8")
163
+ return f"Successfully inserted content at line {insert_line} in {file_path}"
164
+
165
+ return "Error: Must provide either 'content', 'old_str' + 'new_str', or 'insert_line' + 'insert_content'"
166
+
167
+ except Exception as e:
168
+ return f"Error writing file: {e}"
169
+
170
+ # Add tool metadata
171
+ write_file._tool_name = "write_file" # type: ignore[attr-defined]
172
+ write_file._tool_description = ( # type: ignore[attr-defined]
173
+ "Write or edit file content. Accepts absolute paths or relative paths (relative to workspace). "
174
+ "Supports: (1) full file write with 'content', (2) str_replace to replace specific text, "
175
+ "(3) insert_at_line to add content at a specific line. Creates parent directories if needed."
176
+ )
177
+ write_file._is_tool = True # type: ignore[attr-defined]
178
+
179
+ return write_file
180
+
181
+
182
def create_list_directory_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build a list_directory tool bound to *workspace*.

    Args:
        workspace: Default directory for relative paths (not a restriction)
    """

    async def list_directory(
        directory_path: Annotated[str, "Path to directory (absolute or relative to workspace, default: '.')"] = ".",
        recursive: Annotated[bool, "List subdirectories recursively (default: false)"] = False,
        max_entries: Annotated[int, "Maximum entries to return (default: 200)"] = 200,
    ) -> str:
        """List files and directories at a given path. Can list any directory on the system."""
        try:
            # Relative paths resolve against the workspace; absolute paths as-is.
            raw = Path(directory_path)
            target = raw.resolve() if raw.is_absolute() else (workspace / directory_path).resolve()

            if not target.exists():
                return f"Error: Directory not found: {directory_path}"
            if not target.is_dir():
                return f"Error: Not a directory: {directory_path}"

            listing: list[tuple[str, str, int]] = []

            if recursive:
                # Skip common vendored/cache directories when walking recursively.
                skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
                for child in target.rglob("*"):
                    if len(listing) >= max_entries:
                        break
                    if any(part in child.parts for part in skip_dirs):
                        continue
                    kind = "file" if child.is_file() else "dir"
                    byte_size = child.stat().st_size if kind == "file" else 0
                    listing.append((str(child.relative_to(target)), kind, byte_size))
            else:
                for child in target.iterdir():
                    if len(listing) >= max_entries:
                        break
                    kind = "file" if child.is_file() else "dir"
                    byte_size = child.stat().st_size if kind == "file" else 0
                    listing.append((child.name, kind, byte_size))

            # Directories first, then alphabetical by name.
            listing.sort(key=lambda entry: (entry[1] != "dir", entry[0]))

            out = [f"Directory: {directory_path} ({len(listing)} entries)", "=" * 50]
            for name, kind, byte_size in listing:
                if kind == "dir":
                    out.append(f"  [DIR]  {name}/")
                else:
                    size_str = f"{byte_size:,} bytes" if byte_size < 10000 else f"{byte_size / 1024:.1f} KB"
                    out.append(f"  [FILE] {name} ({size_str})")

            if len(listing) >= max_entries:
                out.append(f"\n... (truncated at {max_entries} entries)")

            return "\n".join(out)

        except Exception as e:
            return f"Error listing directory: {e}"

    # Metadata consumed by harnesses for tool discovery.
    list_directory._tool_name = "list_directory"  # type: ignore[attr-defined]
    list_directory._tool_description = (  # type: ignore[attr-defined]
        "List files and directories at a given path. Accepts absolute paths (e.g., /path/to/dir) "
        "or relative paths (relative to workspace). Returns names, types, and sizes."
    )
    list_directory._is_tool = True  # type: ignore[attr-defined]

    return list_directory
262
+
263
+
264
+ def create_grep_search_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
265
+ """Create a grep_search tool that can search any directory.
266
+
267
+ Args:
268
+ workspace: Default directory for relative paths (not a restriction)
269
+ """
270
+
271
+ async def grep_search(
272
+ pattern: Annotated[str, "Pattern to search for (regex supported)"],
273
+ path: Annotated[str, "Path to search in (absolute or relative to workspace, default: '.')"] = ".",
274
+ file_pattern: Annotated[str | None, "File pattern to filter (e.g., '*.py', '*.js')"] = None,
275
+ case_sensitive: Annotated[bool, "Case sensitive search (default: true)"] = True,
276
+ max_matches: Annotated[int, "Maximum matches to return (default: 50)"] = 50,
277
+ ) -> str:
278
+ """Search for text patterns in files. Can search any path on the system."""
279
+ try:
280
+ # Support both absolute and relative paths
281
+ search_path = Path(path)
282
+ if search_path.is_absolute():
283
+ full_path = search_path.resolve()
284
+ else:
285
+ full_path = (workspace / path).resolve()
286
+
287
+ if not full_path.exists():
288
+ return f"Error: Path not found: {path}"
289
+
290
+ # Compile regex
291
+ flags = 0 if case_sensitive else re.IGNORECASE
292
+ try:
293
+ regex = re.compile(pattern, flags)
294
+ except re.error as e:
295
+ return f"Error: Invalid regex pattern: {e}"
296
+
297
+ matches: list[dict[str, Any]] = []
298
+
299
+ # Get files to search
300
+ if full_path.is_file():
301
+ files = [full_path]
302
+ else:
303
+ if file_pattern:
304
+ files = list(full_path.rglob(file_pattern))
305
+ else:
306
+ files = [f for f in full_path.rglob("*") if f.is_file()]
307
+
308
+ # Search each file
309
+ for file_path_item in files:
310
+ if len(matches) >= max_matches:
311
+ break
312
+
313
+ # Skip common non-essential directories and binary files
314
+ skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
315
+ if any(part in file_path_item.parts for part in skip_dirs):
316
+ continue
317
+
318
+ try:
319
+ # Skip large files (> 1MB)
320
+ if file_path_item.stat().st_size > 1_000_000:
321
+ continue
322
+
323
+ file_content = file_path_item.read_text(encoding="utf-8", errors="ignore")
324
+ lines = file_content.splitlines()
325
+
326
+ for line_num, line in enumerate(lines, 1):
327
+ if len(matches) >= max_matches:
328
+ break
329
+ if regex.search(line):
330
+ # Compute relative path from search root
331
+ try:
332
+ rel_path = file_path_item.relative_to(full_path)
333
+ except ValueError:
334
+ # If file is the search path itself, use filename
335
+ rel_path = file_path_item.name
336
+ matches.append({
337
+ "file": str(rel_path),
338
+ "line": line_num,
339
+ "text": line.strip()[:200],
340
+ })
341
+ except (UnicodeDecodeError, PermissionError):
342
+ continue
343
+
344
+ # Format output
345
+ if not matches:
346
+ return f"No matches found for pattern '{pattern}' in {path}"
347
+
348
+ result_lines = [f"Found {len(matches)} match(es) for '{pattern}'"]
349
+ result_lines.append("=" * 50)
350
+
351
+ for match in matches:
352
+ result_lines.append(f"{match['file']}:{match['line']}: {match['text']}")
353
+
354
+ if len(matches) >= max_matches:
355
+ result_lines.append(f"\n... (truncated at {max_matches} matches)")
356
+
357
+ return "\n".join(result_lines)
358
+
359
+ except Exception as e:
360
+ return f"Error searching: {e}"
361
+
362
+ # Add tool metadata
363
+ grep_search._tool_name = "grep_search" # type: ignore[attr-defined]
364
+ grep_search._tool_description = ( # type: ignore[attr-defined]
365
+ "Search for text patterns in files. Accepts absolute paths (e.g., /path/to/dir) "
366
+ "or relative paths (relative to workspace). Supports regex patterns and file filtering."
367
+ )
368
+ grep_search._is_tool = True # type: ignore[attr-defined]
369
+
370
+ return grep_search
371
+
372
+
373
def create_coding_tools(workspace: Path) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Create all coding tools bound to a workspace.

    Args:
        workspace: Root directory for file operations

    Returns:
        List of coding tool functions
    """
    # Resolve once so every tool sees the same canonical workspace root.
    root = Path(workspace).resolve()
    factories = (
        create_read_file_tool,
        create_write_file_tool,
        create_list_directory_tool,
        create_grep_search_tool,
    )
    return [factory(root) for factory in factories]
390
+
391
+
src/flow/tools/core.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core metacognitive tools for agent reasoning and task management.
2
+
3
+ These tools enable agents to think explicitly, track task status,
4
+ and make structured decisions during complex software engineering tasks.
5
+ """
6
+
7
+ from collections.abc import Callable, Coroutine, Sequence
8
+ from typing import Annotated, Any, Literal
9
+
10
+
11
async def think(
    thought: Annotated[
        str,
        (
            "Your detailed reasoning about the current situation. "
            "Include: what you've learned, options you're considering, "
            "potential risks, and your planned approach."
        ),
    ],
) -> str:
    """Use this tool to pause and think through a complex problem.

    Helpful when: (1) analyzing tool results, (2) planning multi-step approaches,
    (3) making design decisions, (4) debugging issues, (5) avoiding mistakes.
    Your reasoning is recorded and helps structure your approach.
    """
    # Echo back at most the first 300 characters; the value is in giving
    # the LLM dedicated space to reason, not in the return payload.
    summary = f"{thought[:300]}..." if len(thought) > 300 else thought
    return f"Thought recorded: {summary}"
30
+
31
+
32
+ async def task_done(
33
+ status: Annotated[
34
+ Literal["complete", "incomplete"],
35
+ "'complete' if task finished successfully, 'incomplete' if blocked or needs input",
36
+ ],
37
+ summary: Annotated[
38
+ str,
39
+ (
40
+ "Summary of what was accomplished. "
41
+ "If complete: what was done and how to use/test it. "
42
+ "If incomplete: what's blocking and what's needed."
43
+ ),
44
+ ],
45
+ files_created: Annotated[
46
+ list[str] | None,
47
+ "List of files created or modified (if any)",
48
+ ] = None,
49
+ next_steps: Annotated[
50
+ list[str] | None,
51
+ "Suggested next steps for the user (if any)",
52
+ ] = None,
53
+ ) -> str:
54
+ """Call this when you have completed the user's task.
55
+
56
+ Provide a summary of what was accomplished and any relevant details.
57
+ Use 'complete' if all requirements are satisfied,
58
+ 'incomplete' if blocked or need more information.
59
+ """
60
+ result_lines = [
61
+ f"Task Status: {status.upper()}",
62
+ "",
63
+ "Summary:",
64
+ summary,
65
+ ]
66
+
67
+ if files_created:
68
+ result_lines.extend([
69
+ "",
70
+ "Files Created/Modified:",
71
+ *[f" - {f}" for f in files_created],
72
+ ])
73
+
74
+ if next_steps:
75
+ result_lines.extend([
76
+ "",
77
+ "Suggested Next Steps:",
78
+ *[f" - {step}" for step in next_steps],
79
+ ])
80
+
81
+ return "\n".join(result_lines)
82
+
83
+
84
# Attach tool metadata so harnesses can discover these module-level tools.
# Names match the function names; descriptions come from the docstrings.
for _core_tool in (think, task_done):
    _core_tool._tool_name = _core_tool.__name__  # type: ignore[attr-defined]
    _core_tool._tool_description = _core_tool.__doc__ or ""  # type: ignore[attr-defined]
    _core_tool._is_tool = True  # type: ignore[attr-defined]
92
+
93
+
94
def create_core_tools() -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Create all core metacognitive tools.

    Returns:
        List of core tool functions
    """
    # Both tools are stateless module-level coroutines; return fresh list.
    return list((think, task_done))
src/flow/tools/execution.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Execution tools for running commands and code.
2
+
3
+ These tools enable agents to execute bash commands and Python code
4
+ with safety controls (timeouts, output limits), and manage background processes.
5
+ """
6
+
7
+ import asyncio
8
+ import os
9
+ import re
10
+ import signal
11
+ import sys
12
+ from collections.abc import Callable, Coroutine, Sequence
13
+ from datetime import datetime
14
+ from io import StringIO
15
+ from pathlib import Path
16
+ from typing import Annotated, Any, Literal
17
+
18
+
19
+ def _get_process_registry_path(memory_path: Path) -> Path:
20
+ """Get the path to the process registry file in memory."""
21
+ return memory_path / "processes.md"
22
+
23
+
24
def _ensure_process_registry(memory_path: Path) -> Path:
    """Create the process registry file (with its template sections) if absent.

    Returns the registry path in either case.
    """
    registry = _get_process_registry_path(memory_path)
    registry.parent.mkdir(parents=True, exist_ok=True)

    if registry.exists():
        return registry

    # Seed the registry with the sections the other helpers expect.
    template = (
        "# Background Processes\n\n"
        "This file tracks background processes started by the Flow agent.\n"
        "You can view this file with `memory(command='view', path='/memory/processes.md')`\n\n"
        "## Running\n\n"
        "## Stopped\n\n"
    )
    registry.write_text(template)
    return registry
38
+
39
+
40
def _add_process_to_registry(
    memory_path: Path,
    pid: int,
    command: str,
    workspace: str,
    log_file: str,
    port: int | None = None,
) -> None:
    """Record a newly started background process in the registry.

    Entries use a markdown checklist format: an unchecked box means the
    process is believed to be running.

    Args:
        memory_path: Root of the agent memory directory.
        pid: OS process id of the background process.
        command: Shell command that was launched.
        workspace: Workspace directory the process was started from.
        log_file: Path of the file capturing the process output.
        port: Port the process listens on, if known; when omitted it is
            heuristically extracted from the command line.
    """
    registry_path = _ensure_process_registry(memory_path)
    content = registry_path.read_text()

    # Best-effort port detection when the caller did not supply one.
    if port is None:
        port_match = re.search(r"(?:--port|-p)\s+(\d+)", command)
        if port_match:
            port = int(port_match.group(1))
        # Fall back to spotting common dev-server defaults anywhere in the
        # command ("8000" subsumes ":8000", so a single check suffices).
        elif "8000" in command:
            port = 8000
        elif "3000" in command:
            port = 3000

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    port_str = f"Port: {port}" if port else "Port: -"
    cmd_short = command[:60] + "..." if len(command) > 60 else command
    workspace_short = workspace.split("/")[-1] if "/" in workspace else workspace

    # Checklist entry. The log path is included so the agent can inspect the
    # process output later (previously the log_file argument was ignored).
    entry = (
        f"- [ ] **PID {pid}** | `{cmd_short}` | {timestamp} | {port_str} "
        f"| {workspace_short} | log: {log_file}\n"
    )

    # Insert under "## Running"; repair the section if it is missing.
    if "## Running" in content:
        content = content.replace("## Running\n\n", f"## Running\n\n{entry}")
    else:
        content += f"\n## Running\n\n{entry}"

    registry_path.write_text(content)
78
+
79
+
80
def _mark_process_stopped(memory_path: Path, pid: int, reason: str = "killed") -> None:
    """Tick off a running registry entry for *pid* and move it to '## Stopped'.

    No-op when the registry does not exist or the PID has no unchecked entry.
    """
    registry = _get_process_registry_path(memory_path)
    if not registry.exists():
        return

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    kept: list[str] = []
    moved: str | None = None

    for entry in registry.read_text().split("\n"):
        if f"**PID {pid}**" in entry and "- [ ]" in entry:
            # Check the box and annotate why/when the process stopped; the
            # line is withheld here and re-inserted under "## Stopped" below.
            moved = entry.replace("- [ ]", "- [x]") + f" | {reason} @ {stamp}"
        else:
            kept.append(entry)

    if moved is None:
        return  # PID not found among running entries; leave the file untouched.

    body = "\n".join(kept)
    if "## Stopped" in body:
        body = body.replace("## Stopped\n\n", f"## Stopped\n\n{moved}\n")
    else:
        body += f"\n## Stopped\n\n{moved}\n"
    registry.write_text(body)
108
+
109
+
110
+ def _is_process_running(pid: int) -> bool:
111
+ """Check if a process is still running."""
112
+ try:
113
+ os.kill(pid, 0)
114
+ return True
115
+ except (OSError, ProcessLookupError):
116
+ return False
117
+
118
+
119
def _get_running_pids_from_registry(memory_path: Path) -> list[tuple[int, str]]:
    """Collect ``(pid, registry_line)`` pairs for entries still marked running."""
    registry = _get_process_registry_path(memory_path)
    if not registry.exists():
        return []

    pid_pattern = re.compile(r"\*\*PID (\d+)\*\*")
    entries: list[tuple[int, str]] = []

    for raw_line in registry.read_text().split("\n"):
        # An unchecked checkbox plus a PID marker identifies a running entry.
        if "- [ ]" not in raw_line or "**PID" not in raw_line:
            continue
        found = pid_pattern.search(raw_line)
        if found:
            entries.append((int(found.group(1)), raw_line))

    return entries
137
+
138
+
139
def create_bash_execute_tool(
    workspace: Path, memory_path: Path, default_timeout: int = 120
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a bash_execute tool bound to a specific workspace.

    Args:
        workspace: Directory used as the working directory for every command.
        memory_path: Memory root that hosts the background-process registry.
        default_timeout: Timeout (seconds) applied when the caller omits one.

    Returns:
        The async ``bash_execute`` tool function.
    """

    async def bash_execute(
        command: Annotated[str, "Bash command to execute"],
        timeout: Annotated[int, f"Command timeout in seconds (default: {default_timeout})"] = default_timeout,
        background: Annotated[
            bool, "Run in background and return immediately with PID. Use for servers/long-running processes."
        ] = False,
    ) -> str:
        """Execute bash commands in the workspace.

        Returns stdout, stderr, and return code.
        Use for running tests, git commands, package managers, builds, etc.
        IMPORTANT: Each call runs in a fresh shell from workspace root -
        use 'cd dir && command' for commands in subdirectories.
        For long-running processes (servers), use background=True to avoid timeout.
        """
        try:
            if background:
                # Detach via nohup; redirect output to a timestamped log file
                # so it can be inspected later, and report the PID via $!.
                log_file = workspace / ".background_logs" / f"bg_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
                log_file.parent.mkdir(parents=True, exist_ok=True)

                # Quote the log path: the workspace directory may contain
                # spaces, which would otherwise break the redirection.
                bg_command = f'nohup {command} > "{log_file}" 2>&1 & echo $!'

                proc = await asyncio.create_subprocess_shell(
                    bg_command,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                    cwd=str(workspace),
                )

                stdout, _ = await proc.communicate()
                pid_str = stdout.decode().strip()

                try:
                    pid = int(pid_str)
                    # Persist in /memory/processes.md so later turns can
                    # find and manage the process.
                    _add_process_to_registry(
                        memory_path=memory_path,
                        pid=pid,
                        command=command,
                        workspace=str(workspace),
                        log_file=str(log_file),
                    )

                    return (
                        f"Background process started successfully.\n"
                        f"PID: {pid}\n"
                        f"Command: {command}\n"
                        f"Log file: {log_file}\n"
                        f"\nProcess registered in /memory/processes.md\n"
                        f"Use check_processes(action='list') to see all background processes.\n"
                        f"Use check_processes(action='kill', pid={pid}) to stop this process."
                    )
                except ValueError:
                    return f"Error: Could not get PID. Output: {pid_str}"

            # Regular (blocking) execution
            proc = await asyncio.create_subprocess_shell(
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                cwd=str(workspace),
            )

            try:
                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
            except asyncio.TimeoutError:
                proc.kill()
                await proc.wait()  # reap the killed child so it is not left as a zombie
                return (
                    f"Error: Command timed out after {timeout} seconds.\n"
                    f"Command: {command}\n\n"
                    f"TIP: If this is a long-running process (like a server), "
                    f"use background=True to run it in the background."
                )

            stdout_str = stdout.decode("utf-8", errors="replace")
            stderr_str = stderr.decode("utf-8", errors="replace")
            return_code = proc.returncode

            # Format output
            result_parts = [f"Command: {command}"]
            result_parts.append(f"Return code: {return_code}")
            result_parts.append("=" * 50)

            if stdout_str.strip():
                # Cap output so a chatty command cannot flood the model context.
                if len(stdout_str) > 15000:
                    stdout_str = stdout_str[:15000] + "\n... (stdout truncated)"
                result_parts.append("STDOUT:")
                result_parts.append(stdout_str)

            if stderr_str.strip():
                if len(stderr_str) > 5000:
                    stderr_str = stderr_str[:5000] + "\n... (stderr truncated)"
                result_parts.append("STDERR:")
                result_parts.append(stderr_str)

            if not stdout_str.strip() and not stderr_str.strip():
                result_parts.append("(no output)")

            return "\n".join(result_parts)

        except Exception as e:
            # Tool results must be strings; surface failures instead of raising.
            return f"Error executing command: {e}"

    # Add tool metadata
    bash_execute._tool_name = "bash_execute"  # type: ignore[attr-defined]
    bash_execute._tool_description = (  # type: ignore[attr-defined]
        "Execute bash commands in the workspace. "
        "Returns stdout, stderr, and return code. "
        "Use for running tests, git commands, package managers, builds, etc."
    )
    bash_execute._is_tool = True  # type: ignore[attr-defined]

    return bash_execute
261
+
262
+
263
def create_check_processes_tool(
    workspace: Path, memory_path: Path
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a tool to check and manage background processes.

    The returned tool treats /memory/processes.md as the source of truth and
    reconciles it against the live process table on every 'list' call.

    Args:
        workspace: Workspace the processes were started from; used by the
            'cleanup' action to decide which registry entries belong here.
        memory_path: Memory root that hosts the process registry.

    Returns:
        The async ``check_processes`` tool function.
    """

    async def check_processes(
        action: Annotated[
            Literal["list", "kill", "cleanup"],
            "'list' to see processes, 'kill' to stop one by PID, 'cleanup' to kill all",
        ],
        pid: Annotated[int | None, "PID to kill (required for 'kill' action)"] = None,
    ) -> str:
        """Check and manage background processes.

        Use 'list' to see all background processes (also viewable at /memory/processes.md),
        'kill' to stop a specific process by PID,
        'cleanup' to kill all background processes from this workspace.
        """
        _ensure_process_registry(memory_path)
        registry_path = _get_process_registry_path(memory_path)

        if action == "list":
            # Read the registry and update status of running processes:
            # any entry whose PID is no longer alive is moved to "## Stopped".
            running_pids = _get_running_pids_from_registry(memory_path)
            active_count = 0
            dead_pids: list[int] = []

            for proc_pid, _ in running_pids:
                if _is_process_running(proc_pid):
                    active_count += 1
                else:
                    dead_pids.append(proc_pid)

            # Mark dead processes as stopped
            for dead_pid in dead_pids:
                _mark_process_stopped(memory_path, dead_pid, reason="exited")

            # Return the updated registry
            content = registry_path.read_text()
            return (
                f"Active background processes: {active_count}\n"
                f"(View full registry at /memory/processes.md)\n\n"
                f"{content}"
            )

        if action == "kill":
            if pid is None:
                return "Error: 'pid' is required for 'kill' action."

            try:
                # Graceful first: SIGTERM, then escalate to SIGKILL below.
                os.kill(pid, signal.SIGTERM)
                await asyncio.sleep(0.5)  # Give it time to terminate

                # Check if it's really dead, if not SIGKILL
                if _is_process_running(pid):
                    os.kill(pid, signal.SIGKILL)
                    await asyncio.sleep(0.2)

                _mark_process_stopped(memory_path, pid, reason="killed")

                if _is_process_running(pid):
                    return f"Warning: Process {pid} may still be running after kill attempt."
                return f"Successfully killed process {pid}. Updated /memory/processes.md"

            except ProcessLookupError:
                # Already gone; still reconcile the registry.
                _mark_process_stopped(memory_path, pid, reason="not found")
                return f"Process {pid} was not running (already terminated). Updated /memory/processes.md"
            except PermissionError:
                return f"Error: Permission denied to kill process {pid}."
            except Exception as e:
                return f"Error killing process {pid}: {e}"

        if action == "cleanup":
            # Kill all processes from this workspace
            running_pids = _get_running_pids_from_registry(memory_path)
            workspace_str = str(workspace)
            killed: list[int] = []
            failed: list[tuple[int, str]] = []

            for proc_pid, line in running_pids:
                # Check if this process is from our workspace.
                # NOTE(review): substring match on the workspace name could
                # over-match entries from similarly named workspaces — confirm
                # this heuristic is acceptable.
                workspace_short = workspace_str.split("/")[-1]
                if workspace_short in line or workspace_str in line:
                    try:
                        os.kill(proc_pid, signal.SIGTERM)
                        await asyncio.sleep(0.2)
                        if _is_process_running(proc_pid):
                            os.kill(proc_pid, signal.SIGKILL)
                        _mark_process_stopped(memory_path, proc_pid, reason="cleanup")
                        killed.append(proc_pid)
                    except (ProcessLookupError, PermissionError) as e:
                        _mark_process_stopped(memory_path, proc_pid, reason=f"cleanup failed: {e}")
                        failed.append((proc_pid, str(e)))

            result = "Cleanup complete. Updated /memory/processes.md\n"
            if killed:
                result += f"Killed processes: {killed}\n"
            if failed:
                result += f"Failed to kill: {failed}\n"
            if not killed and not failed:
                result += "No active processes found for this workspace."

            return result

        return f"Unknown action: {action}"

    # Add tool metadata
    check_processes._tool_name = "check_processes"  # type: ignore[attr-defined]
    check_processes._tool_description = (  # type: ignore[attr-defined]
        "Check and manage background processes. "
        "Use 'list' to see all background processes, "
        "'kill' to stop a specific process by PID, "
        "'cleanup' to kill all background processes from this workspace."
    )
    check_processes._is_tool = True  # type: ignore[attr-defined]

    return check_processes
380
+
381
+
382
def create_python_repl_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a python_repl tool bound to a specific workspace.

    Args:
        workspace: Directory exposed to executed code via the ``WORKSPACE``
            variable (the code itself still runs in this process).

    Returns:
        The async ``python_repl`` tool function.
    """

    async def python_repl(
        code: Annotated[str, "Python code to execute"],
    ) -> str:
        """Execute Python code in an isolated namespace.

        Returns the output (stdout) or any errors.
        Use for testing code snippets, calculations, data manipulation, or quick validation.
        The WORKSPACE variable is available with the workspace path.
        """
        # NOTE(review): sys.stdout/sys.stderr are process-global, so this swap
        # also captures prints from any other coroutine running concurrently —
        # confirm the harness never runs tools in parallel.
        old_stdout = sys.stdout
        old_stderr = sys.stderr

        try:
            # Capture stdout and stderr
            redirected_output = StringIO()
            redirected_error = StringIO()
            sys.stdout = redirected_output
            sys.stderr = redirected_error

            # Create isolated namespace with builtins. Isolation is per-call
            # name scoping only — the code shares this interpreter/process.
            namespace: dict[str, Any] = {
                "__builtins__": __builtins__,
                "__name__": "__main__",
                "WORKSPACE": workspace,
            }

            try:
                # Try to compile and exec
                compiled = compile(code, "<repl>", "exec")
                exec(compiled, namespace)  # noqa: S102

                output = redirected_output.getvalue()
                error = redirected_error.getvalue()

                result_parts = ["Python REPL Output"]
                result_parts.append("=" * 50)

                if output.strip():
                    # Cap output so a chatty snippet cannot flood the context.
                    if len(output) > 15000:
                        output = output[:15000] + "\n... (output truncated)"
                    result_parts.append(output)

                if error.strip():
                    result_parts.append("STDERR:")
                    result_parts.append(error)

                if not output.strip() and not error.strip():
                    result_parts.append("(code executed successfully, no output)")

                return "\n".join(result_parts)

            except SyntaxError as e:
                return f"SyntaxError: {e}"
            except Exception as e:
                # Tool results must be strings; report the error type and message.
                return f"Error: {type(e).__name__}: {e}"

        finally:
            # Always restore the real streams, even if execution raised.
            sys.stdout = old_stdout
            sys.stderr = old_stderr

    # Add tool metadata
    python_repl._tool_name = "python_repl"  # type: ignore[attr-defined]
    python_repl._tool_description = (  # type: ignore[attr-defined]
        "Execute Python code in an isolated namespace. "
        "Returns the output (stdout) or any errors. "
        "Use for testing code snippets, calculations, data manipulation, or quick validation."
    )
    python_repl._is_tool = True  # type: ignore[attr-defined]

    return python_repl
455
+
456
+
457
def create_execution_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Build the full set of execution tools rooted at *workspace*.

    Args:
        workspace: Root directory for command execution
        memory_path: Path to memory directory for process registry
        bash_timeout: Default timeout for bash commands in seconds

    Returns:
        List of execution tool functions
    """
    # Resolve both roots once so every tool shares the same absolute paths.
    ws = Path(workspace).resolve()
    mem = Path(memory_path).resolve()

    bash_tool = create_bash_execute_tool(ws, mem, bash_timeout)
    process_tool = create_check_processes_tool(ws, mem)
    repl_tool = create_python_repl_tool(ws)
    return [bash_tool, process_tool, repl_tool]
src/flow/tools/memory.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Memory tool for persistent storage across sessions.
2
+
3
+ Provides file-based memory storage allowing agents to store and retrieve
4
+ information, patterns, and decisions across conversations.
5
+ """
6
+
7
+ from collections.abc import Callable, Coroutine
8
+ from pathlib import Path
9
+ from typing import Annotated, Any, Literal
10
+
11
+
12
class MemoryBackend:
    """File-based memory storage backend with security controls.

    Every operation is confined to ``base_path``; any path that resolves
    outside it (e.g. via ``..``) is rejected by :meth:`_validate_path`.
    """

    def __init__(self, base_path: Path) -> None:
        """Initialize memory backend, creating ``base_path`` if needed."""
        self.base_path = Path(base_path).resolve()
        self.base_path.mkdir(parents=True, exist_ok=True)

    def _validate_path(self, path: str) -> Path:
        """Validate and resolve a memory path.

        Accepts agent-style paths such as ``/memory/patterns/x.md`` as well
        as relative paths.

        Raises:
            ValueError: If the resolved path escapes the memory directory.
        """
        # Normalize path (remove /memory prefix if present)
        if path.startswith("/memory"):
            path = path[len("/memory") :]
        path = path.lstrip("/")

        # Handle empty path
        if not path:
            return self.base_path

        # Resolve to absolute path
        full_path = (self.base_path / path).resolve()

        # Security: Ensure path is within base_path
        try:
            full_path.relative_to(self.base_path)
        except ValueError as err:
            raise ValueError(f"Access denied: path '{path}' is outside memory directory") from err

        return full_path

    def view(self, path: str, view_range: list[int] | None = None) -> str:
        """View directory contents or file contents.

        Args:
            path: Directory or file to display.
            view_range: Optional 1-indexed inclusive ``[start, end]`` line
                window for file views.
        """
        full_path = self._validate_path(path)

        if not full_path.exists():
            return f"Path not found: {path}\nUse 'create' to create new files."

        # Directory listing
        if full_path.is_dir():
            contents = [f"Directory: {path or '/memory'}"]
            # Sort key puts directories before files, each group alphabetical.
            items = sorted(full_path.iterdir(), key=lambda x: (x.is_file(), x.name))

            if not items:
                contents.append("(empty directory)")
            else:
                for item in items:
                    suffix = "/" if item.is_dir() else ""
                    contents.append(f" - {item.name}{suffix}")

            return "\n".join(contents)

        # File contents, rendered with 1-indexed line numbers
        if full_path.is_file():
            content = full_path.read_text(encoding="utf-8")
            lines = content.splitlines()

            if view_range:
                start, end = view_range
                # Clamp the window to the actual file bounds.
                start = max(1, start)
                end = min(len(lines), end)
                lines = lines[start - 1 : end]
                numbered_lines = [f"{i + start:5d}: {line}" for i, line in enumerate(lines)]
            else:
                numbered_lines = [f"{i + 1:5d}: {line}" for i, line in enumerate(lines)]

            return "\n".join(numbered_lines) if numbered_lines else "(empty file)"

        return f"Unknown path type: {path}"

    def create(self, path: str, file_text: str) -> str:
        """Create or overwrite a file, creating parent directories as needed."""
        full_path = self._validate_path(path)
        full_path.parent.mkdir(parents=True, exist_ok=True)
        full_path.write_text(file_text, encoding="utf-8")
        return f"File created successfully at {path}"

    def str_replace(self, path: str, old_str: str, new_str: str) -> str:
        """Replace the first occurrence of ``old_str`` in a file.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If ``old_str`` is not present in the file.
        """
        full_path = self._validate_path(path)

        if not full_path.is_file():
            raise FileNotFoundError(f"File not found: {path}")

        content = full_path.read_text(encoding="utf-8")

        if old_str not in content:
            raise ValueError(f"Text not found in file: '{old_str[:50]}...'")

        # Only the first occurrence is replaced.
        new_content = content.replace(old_str, new_str, 1)
        full_path.write_text(new_content, encoding="utf-8")
        return f"File {path} has been edited successfully"

    def append(self, path: str, text: str) -> str:
        """Append text to end of file, creating the file if it is missing."""
        full_path = self._validate_path(path)

        if not full_path.exists():
            full_path.parent.mkdir(parents=True, exist_ok=True)
            full_path.write_text("", encoding="utf-8")

        # Ensure text starts with newline if file isn't empty
        if full_path.stat().st_size > 0:
            existing = full_path.read_text(encoding="utf-8")
            if existing and not existing.endswith("\n"):
                text = "\n" + text

        # Ensure text ends with newline
        if not text.endswith("\n"):
            text += "\n"

        with full_path.open("a", encoding="utf-8") as f:
            f.write(text)

        return f"Text appended to {path}"

    def search(self, query: str, path: str = "") -> str:
        """Search for text across memory files (case-insensitive substring).

        Shows at most 50 matches; binary/unreadable files are skipped.
        """
        full_path = self._validate_path(path)

        if not full_path.exists():
            return f"Path not found: {path or '/memory'}"

        if not full_path.is_dir():
            # Search single file
            files = [full_path]
        else:
            files = list(full_path.rglob("*"))

        matches: list[dict[str, Any]] = []
        query_lower = query.lower()

        for file_path in files:
            if not file_path.is_file():
                continue
            try:
                content = file_path.read_text(encoding="utf-8")
                lines = content.splitlines()

                for line_num, line in enumerate(lines, 1):
                    if query_lower in line.lower():
                        rel_path = file_path.relative_to(self.base_path)
                        matches.append({
                            "file": str(rel_path),
                            "line": line_num,
                            # Matched line is trimmed and capped for display.
                            "content": line.strip()[:100],
                        })
            except (UnicodeDecodeError, PermissionError):
                # Skip unreadable files rather than failing the whole search.
                continue

        if not matches:
            return f"No matches found for '{query}' in {path or '/memory'}"

        result_lines = [f"Found {len(matches)} match(es) for '{query}':\n"]
        for match in matches[:50]:
            result_lines.append(f" {match['file']}:{match['line']} - {match['content']}")

        if len(matches) > 50:
            result_lines.append(f"\n... and {len(matches) - 50} more matches")

        return "\n".join(result_lines)

    def delete(self, path: str) -> str:
        """Delete a file or empty directory.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the directory is not empty.
        """
        full_path = self._validate_path(path)

        if not full_path.exists():
            raise FileNotFoundError(f"Path not found: {path}")

        if full_path.is_file():
            full_path.unlink()
            return f"File deleted: {path}"

        if full_path.is_dir():
            # Refuse recursive deletes; the agent must empty the dir first.
            if any(full_path.iterdir()):
                raise ValueError(f"Directory not empty: {path}. Delete contents first.")
            full_path.rmdir()
            return f"Directory deleted: {path}"

        return f"Unknown path type: {path}"
191
+
192
+
193
+ def create_memory_tool(memory_path: Path) -> Callable[..., Coroutine[Any, Any, str]]:
194
+ """Create a memory tool bound to a specific memory directory."""
195
+ backend = MemoryBackend(memory_path)
196
+
197
+ async def memory(
198
+ command: Annotated[
199
+ Literal["view", "create", "str_replace", "append", "search", "delete"],
200
+ "Operation to perform",
201
+ ],
202
+ path: Annotated[str, "Path to file or directory (e.g., '/memory/patterns/cors.md')"] = "/memory",
203
+ file_text: Annotated[str | None, "Content to write (for create)"] = None,
204
+ old_str: Annotated[str | None, "Text to find (for str_replace)"] = None,
205
+ new_str: Annotated[str | None, "Replacement text (for str_replace)"] = None,
206
+ append_text: Annotated[str | None, "Text to append (for append)"] = None,
207
+ query: Annotated[str | None, "Search query (for search)"] = None,
208
+ view_range: Annotated[list[int] | None, "Line range [start, end] (for view)"] = None,
209
+ ) -> str:
210
+ """Store and retrieve information in persistent memory.
211
+
212
+ Memory persists across conversations - use it to remember patterns,
213
+ insights, project context, and decisions.
214
+ Operations: view (show directory/file), create (new file),
215
+ str_replace (edit file), append (add to file),
216
+ search (find text), delete (remove file/dir).
217
+ Organize by: /memory/patterns/, /memory/projects/, /memory/decisions/
218
+ """
219
+ try:
220
+ if command == "view":
221
+ return backend.view(path, view_range)
222
+
223
+ if command == "create":
224
+ if file_text is None:
225
+ return "Error: 'file_text' is required for create operation"
226
+ return backend.create(path, file_text)
227
+
228
+ if command == "str_replace":
229
+ if old_str is None or new_str is None:
230
+ return "Error: 'old_str' and 'new_str' are required for str_replace"
231
+ return backend.str_replace(path, old_str, new_str)
232
+
233
+ if command == "append":
234
+ if append_text is None:
235
+ return "Error: 'append_text' is required for append operation"
236
+ return backend.append(path, append_text)
237
+
238
+ if command == "search":
239
+ if query is None:
240
+ return "Error: 'query' is required for search operation"
241
+ return backend.search(query, path)
242
+
243
+ if command == "delete":
244
+ return backend.delete(path)
245
+
246
+ return f"Error: Unknown command: {command}"
247
+
248
+ except Exception as e:
249
+ return f"Memory operation failed: {e}"
250
+
251
+ # Add tool metadata
252
+ memory._tool_name = "memory" # type: ignore[attr-defined]
253
+ memory._tool_description = ( # type: ignore[attr-defined]
254
+ "Store and retrieve information in persistent memory. "
255
+ "Memory persists across conversations - use it to remember patterns, "
256
+ "insights, project context, and decisions."
257
+ )
258
+ memory._is_tool = True # type: ignore[attr-defined]
259
+
260
+ return memory
src/flow/tools/sub_agent.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sub-agent tool for isolated research tasks.
2
+
3
+ Provides context isolation by delegating complex research tasks to a
4
+ separate agent that operates in its own context window. The sub-agent
5
+ processes the request and returns only a concise summary, preventing
6
+ context pollution in the main agent.
7
+
8
+ This implements the "Isolation" strategy for context engineering:
9
+ - Coordinator agent stays lean with minimal context
10
+ - Sub-agent can use 30K+ tokens internally for research
11
+ - Only the distilled result (200-500 tokens) returns to coordinator
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ from collections.abc import Callable, Coroutine
18
+ from pathlib import Path
19
+ from typing import Annotated, Any
20
+
21
# System prompt for the research sub-agent: optimized for thorough research
# followed by a short, self-contained summary (this string is sent to the
# model verbatim — do not reformat it).
SUB_AGENT_INSTRUCTIONS = """You are a research assistant that helps with complex information gathering tasks.

Your role:
1. Thoroughly research the given topic or question
2. Gather relevant information from available tools
3. Synthesize findings into a clear, concise summary
4. Return ONLY the essential information needed by the requesting agent

Guidelines:
- Be thorough in your research but concise in your response
- Focus on facts and actionable information
- If you can't find information, say so clearly
- Your response will be passed to another agent, so make it self-contained
- Target 200-500 tokens for your final response unless more detail is explicitly requested

Do NOT:
- Include conversational fluff or preamble
- Repeat the original question back
- Add disclaimers about your limitations
- Include information that wasn't requested
"""
43
+
44
+
45
def create_sub_agent_tool(
    workspace: Path,
    model: str = "gpt-4o-mini",
    endpoint: str | None = None,
    api_key: str | None = None,
    api_version: str = "2024-02-15-preview",
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a sub-agent tool for isolated research tasks.

    The sub-agent runs in its own isolated context, preventing context
    pollution in the main agent. This is useful for:
    - Complex research that requires many tool calls
    - Tasks that generate lots of intermediate content
    - Keeping the main agent's context lean and focused

    Args:
        workspace: Workspace directory for file operations
        model: Model to use for sub-agent (default: gpt-4o-mini for efficiency)
        endpoint: Azure OpenAI endpoint (defaults to AZURE_OPENAI_ENDPOINT env var)
        api_key: Azure OpenAI API key (defaults to AZURE_OPENAI_API_KEY env var)
        api_version: Azure OpenAI API version

    Returns:
        An async function that can be used as a tool
    """
    # Resolve credentials from environment if not provided
    _endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT", "")
    _api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY", "")

    # Lazily-built singleton agent, shared by every call to the tool.
    _sub_agent: Any = None

    async def _ensure_sub_agent() -> Any:
        """Lazily create the sub-agent on first use.

        NOTE(review): this init is not guarded by a lock — two concurrent
        first calls could each build an agent (last assignment wins).
        Confirm the harness never invokes tools in parallel.
        """
        nonlocal _sub_agent
        if _sub_agent is not None:
            return _sub_agent

        # Imports are deferred so the package works without the optional
        # agent-framework dependency until this tool is actually used.
        try:
            from agent_framework import ChatAgent
            from agent_framework.azure import AzureOpenAIChatClient
        except ImportError as e:
            raise ImportError(
                "Microsoft Agent Framework is required for sub-agent. "
                "Install with: pip install agent-framework-core"
            ) from e

        # Create a lightweight chat client for the sub-agent
        # Uses a smaller/faster model by default for efficiency
        client = AzureOpenAIChatClient(
            api_key=_api_key,
            endpoint=_endpoint,
            deployment=model,
            api_version=api_version,
        )

        # Create basic tools for the sub-agent
        # Keep it minimal - just what's needed for research
        from flow.tools.coding import create_coding_tools
        from flow.tools.core import create_core_tools

        sub_tools: list[Callable[..., Any]] = []
        sub_tools.extend(create_coding_tools(workspace))
        sub_tools.extend(create_core_tools())

        # Convert tools to agent_framework format, carrying over the
        # _tool_name/_tool_description metadata set by the factories.
        from agent_framework import ai_function

        converted_tools = []
        for tool_func in sub_tools:
            name = getattr(tool_func, "_tool_name", tool_func.__name__)
            description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
            wrapped = ai_function(name=name, description=description)(tool_func)
            converted_tools.append(wrapped)

        _sub_agent = ChatAgent(
            name="ResearchAssistant",
            description="Research assistant for complex information gathering",
            instructions=SUB_AGENT_INSTRUCTIONS,
            chat_client=client,
            tools=converted_tools,
        )

        return _sub_agent

    async def research(
        task: Annotated[
            str,
            "The research task or question to investigate. Be specific about what information you need.",
        ],
        context: Annotated[
            str | None,
            "Optional context to help the sub-agent understand the broader goal.",
        ] = None,
    ) -> str:
        """Delegate a research task to a sub-agent with isolated context.

        Use this tool when you need to:
        - Research a complex topic that may require multiple steps
        - Gather information without polluting your main context
        - Get a summarized answer to a specific question

        The sub-agent operates in its own context window, so it can
        use many tokens internally while only returning a concise summary.
        This keeps your main context lean and focused.

        Examples:
        - "Find all Python files that import the requests library and summarize their purpose"
        - "Research how authentication is implemented in this codebase"
        - "Analyze the error handling patterns used across the project"
        """
        sub_agent = await _ensure_sub_agent()

        # Build the research prompt; context (when given) leads the prompt.
        prompt_parts = [f"Research task: {task}"]
        if context:
            prompt_parts.insert(0, f"Context: {context}")
        prompt_parts.append("\nProvide a concise summary of your findings.")

        full_prompt = "\n\n".join(prompt_parts)

        try:
            # Run the sub-agent - it operates in isolated context
            response = await sub_agent.run(full_prompt)

            # Extract text content from response; fall back to str() for
            # response objects without a .content attribute.
            if hasattr(response, "content"):
                return str(response.content)
            return str(response)

        except Exception as e:
            # Tool results must be strings; surface failures instead of raising.
            return f"Research failed: {e}"

    # Add tool metadata
    research._tool_name = "research"  # type: ignore[attr-defined]
    research._tool_description = (  # type: ignore[attr-defined]
        "Delegate a research task to a sub-agent with isolated context. "
        "The sub-agent can thoroughly investigate a topic using many tool calls "
        "internally, then return only a concise summary. Use this for complex "
        "research that would otherwise pollute your main context."
    )
    research._is_tool = True  # type: ignore[attr-defined]

    return research
src/flow/ui/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Flow UI Backend - FastAPI server."""
src/flow/ui/api/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """API routes package."""
3
+
4
+ from .configs import router as configs_router
5
+ from .tasks import router as tasks_router
6
+ from .jobs import router as jobs_router
7
+ from .runs import router as runs_router
8
+
9
+ __all__ = [
10
+ "configs_router",
11
+ "tasks_router",
12
+ "jobs_router",
13
+ "runs_router",
14
+ ]
src/flow/ui/api/configs.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Config API routes."""
3
+
4
+ from uuid import UUID
5
+
6
+ from fastapi import APIRouter, Depends, HTTPException
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from sqlmodel import select, desc
9
+
10
+ from ..database import get_session
11
+ from ..models.config import AgentConfig
12
+ from ..schemas import ConfigCreate, ConfigUpdate, ConfigResponse
13
+
14
+ router = APIRouter(prefix="/configs", tags=["configs"])
15
+
16
+
17
def parse_uuid(id_str: str) -> UUID:
    """Parse *id_str* into a UUID, translating failure into an HTTP 400."""
    try:
        parsed = UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
    return parsed
23
+
24
+
25
@router.get("", response_model=list[ConfigResponse])
async def list_configs(session: AsyncSession = Depends(get_session)) -> list[AgentConfig]:
    """List all agent configurations, newest first."""
    stmt = select(AgentConfig).order_by(desc(AgentConfig.created_at))
    rows = await session.execute(stmt)
    return list(rows.scalars().all())
30
+
31
+
32
@router.post("", response_model=ConfigResponse, status_code=201)
async def create_config(
    data: ConfigCreate,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Create and persist a new agent configuration."""
    new_config = AgentConfig(
        name=data.name,
        description=data.description,
        config_json=data.to_config_json(),
    )
    session.add(new_config)
    await session.commit()
    # Refresh so DB-generated fields (id, created_at) are populated.
    await session.refresh(new_config)
    return new_config
47
+
48
+
49
@router.get("/{config_id}", response_model=ConfigResponse)
async def get_config(
    config_id: str,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Fetch a single agent configuration by id (404 if absent)."""
    uuid_id = parse_uuid(config_id)
    found = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
    config = found.scalar_one_or_none()
    if config is None:
        raise HTTPException(status_code=404, detail="Config not found")
    return config
61
+
62
+
63
@router.put("/{config_id}", response_model=ConfigResponse)
async def update_config(
    config_id: str,
    data: ConfigUpdate,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Apply a partial update to an agent configuration (404 if absent).

    Fields that live inside ``config_json`` are merged into that blob;
    remaining fields are set directly on the row.
    """
    from datetime import datetime, timezone

    uuid_id = parse_uuid(config_id)
    found = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
    config = found.scalar_one_or_none()
    if config is None:
        raise HTTPException(status_code=404, detail="Config not found")

    # Only fields the caller actually sent.
    changes = data.model_dump(exclude_unset=True)

    # These keys are stored inside config_json rather than as columns.
    nested_keys = (
        "enable_message_compaction",
        "enable_memory_tool",
        "enable_sub_agent",
        "compaction_head_size",
        "compaction_tail_size",
        "bash_timeout",
    )
    merged_json = dict(config.config_json)
    for key in nested_keys:
        if key in changes:
            merged_json[key] = changes.pop(key)

    # Whatever remains maps onto top-level columns.
    for attr, value in changes.items():
        setattr(config, attr, value)

    config.config_json = merged_json
    config.updated_at = datetime.now(timezone.utc)

    await session.commit()
    await session.refresh(config)
    return config
106
+
107
+
108
@router.delete("/{config_id}", status_code=204)
async def delete_config(
    config_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete an agent configuration (404 if absent)."""
    uuid_id = parse_uuid(config_id)
    found = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
    config = found.scalar_one_or_none()
    if config is None:
        raise HTTPException(status_code=404, detail="Config not found")

    await session.delete(config)
    await session.commit()
src/flow/ui/api/jobs.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Job API routes."""
3
+
4
+ import asyncio
5
+ from typing import Any, AsyncGenerator
6
+ from uuid import UUID
7
+
8
+ from fastapi import APIRouter, Depends, HTTPException
9
+ from fastapi.responses import StreamingResponse
10
+ from sqlalchemy.ext.asyncio import AsyncSession
11
+ from sqlmodel import select, desc
12
+
13
+ from ..database import get_session
14
+ from ..models.job import OptimizationJob, JobStatus
15
+ from ..models.config import AgentConfig
16
+ from ..models.task import TaskModel
17
+ from ..schemas import JobCreate, JobResponse
18
+ from ..services.optimizer_service import OptimizerService
19
+
20
router = APIRouter(prefix="/jobs", tags=["jobs"])

# In-flight job tasks keyed by job id, consulted by cancel_job.
# NOTE(review): nothing in this module ever inserts entries here — start_job
# runs the job inline inside its SSE generator — so cancellation may be a
# no-op unless registration happens elsewhere; confirm.
_running_jobs: dict[str, asyncio.Task[Any]] = {}
24
+
25
+
26
def parse_uuid(id_str: str) -> UUID:
    """Return *id_str* as a UUID; malformed ids abort the request with 400."""
    try:
        return UUID(id_str)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from err
32
+
33
+
34
@router.get("", response_model=list[JobResponse])
async def list_jobs(
    status: JobStatus | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[OptimizationJob]:
    """List optimization jobs, newest first, optionally filtered by status."""
    stmt = select(OptimizationJob)
    if status:
        stmt = stmt.where(OptimizationJob.status == status)
    rows = await session.execute(stmt.order_by(desc(OptimizationJob.created_at)))
    return list(rows.scalars().all())
46
+
47
+
48
@router.post("", response_model=JobResponse, status_code=201)
async def create_job(
    data: JobCreate,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Create an optimization job after verifying its configs and tasks exist.

    Raises 400 for any unknown config or task id.
    """
    # Every referenced config must exist.
    for raw_id in data.config_ids:
        found = await session.execute(
            select(AgentConfig).where(AgentConfig.id == parse_uuid(raw_id))
        )
        if found.scalar_one_or_none() is None:
            raise HTTPException(status_code=400, detail=f"Config {raw_id} not found")

    # Every referenced task must exist.
    for raw_id in data.task_ids:
        found = await session.execute(
            select(TaskModel).where(TaskModel.id == parse_uuid(raw_id))
        )
        if found.scalar_one_or_none() is None:
            raise HTTPException(status_code=400, detail=f"Task {raw_id} not found")

    job = OptimizationJob(
        name=data.name,
        config_ids=data.config_ids,
        task_ids=data.task_ids,
        parallel=data.parallel,
        use_llm_eval=data.use_llm_eval,
        # One experiment per (config, task) pair.
        total_experiments=len(data.config_ids) * len(data.task_ids),
    )
    session.add(job)
    await session.commit()
    await session.refresh(job)
    return job
80
+
81
+
82
@router.get("/{job_id}", response_model=JobResponse)
async def get_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Fetch a single optimization job by id (404 if absent)."""
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return job
94
+
95
+
96
@router.post("/{job_id}/start")
async def start_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> StreamingResponse:
    """Start a pending job and stream its progress as server-sent events.

    Rejects with 404 for unknown jobs and 400 for jobs not in PENDING state.
    """
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != JobStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Job is already {job.status}")

    async def event_stream() -> AsyncGenerator[str, None]:
        # NOTE(review): the job runs inline in this generator and is never
        # registered in _running_jobs, so cancel_job cannot stop it — confirm
        # whether OptimizerService registers the task elsewhere.
        service = OptimizerService()
        async for progress in service.run_job(job_id):
            yield f"data: {progress.model_dump_json()}\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
124
+
125
+
126
@router.post("/{job_id}/cancel", response_model=JobResponse)
async def cancel_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Cancel a running optimization job (400 if it is not RUNNING)."""
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != JobStatus.RUNNING:
        raise HTTPException(status_code=400, detail=f"Job is not running (status: {job.status})")

    # Cancel the in-flight asyncio task, if one was registered.
    task = _running_jobs.pop(job_id, None)
    if task is not None:
        task.cancel()

    job.status = JobStatus.CANCELLED
    await session.commit()
    await session.refresh(job)
    return job
150
+
151
+
152
@router.delete("/{job_id}", status_code=204)
async def delete_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a finished optimization job; its runs cascade-delete via FK."""
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status == JobStatus.RUNNING:
        raise HTTPException(status_code=400, detail="Cannot delete a running job")

    await session.delete(job)
    await session.commit()
src/flow/ui/api/runs.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Run API routes."""
3
+
4
+ from typing import Any
5
+ from uuid import UUID
6
+
7
+ from fastapi import APIRouter, Depends, HTTPException
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+ from sqlmodel import select, desc
10
+
11
+ from ..database import get_session
12
+ from ..models.run import ExperimentRun
13
+ from ..schemas import RunResponse, RunDetailResponse, CriterionResultSchema
14
+
15
+ router = APIRouter(prefix="/runs", tags=["runs"])
16
+
17
+
18
def parse_uuid(id_str: str) -> UUID:
    """Validate *id_str* as a UUID, rejecting malformed ids with HTTP 400."""
    try:
        value = UUID(id_str)
    except ValueError as bad:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from bad
    return value
24
+
25
+
26
@router.get("", response_model=list[RunResponse])
async def list_runs(
    job_id: str | None = None,
    config_name: str | None = None,
    task_name: str | None = None,
    is_pareto: bool | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ExperimentRun]:
    """List experiment runs, newest first, with optional field filters."""
    stmt = select(ExperimentRun)
    if job_id:
        stmt = stmt.where(ExperimentRun.job_id == parse_uuid(job_id))
    if config_name:
        stmt = stmt.where(ExperimentRun.config_name == config_name)
    if task_name:
        stmt = stmt.where(ExperimentRun.task_name == task_name)
    if is_pareto is not None:  # explicit None check: False is a valid filter
        stmt = stmt.where(ExperimentRun.is_pareto == is_pareto)

    rows = await session.execute(stmt.order_by(desc(ExperimentRun.created_at)))
    return list(rows.scalars().all())
50
+
51
+
52
@router.get("/{run_id}", response_model=RunDetailResponse)
async def get_run(
    run_id: str,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Return full details for one experiment run, including parsed criteria."""
    uuid_id = parse_uuid(run_id)
    found = await session.execute(select(ExperimentRun).where(ExperimentRun.id == uuid_id))
    run = found.scalar_one_or_none()
    if run is None:
        raise HTTPException(status_code=404, detail="Run not found")

    # Per-criterion evaluation results are embedded in the raw trace blob.
    criteria_results: list[CriterionResultSchema] = []
    if run.trace_json and "criteria_results" in run.trace_json:
        criteria_results = [
            CriterionResultSchema(
                name=item.get("name", ""),
                score=item.get("score", 0.0),
                passed=item.get("passed", False),
                reasoning=item.get("reasoning", ""),
            )
            for item in run.trace_json["criteria_results"]
        ]

    return {
        "id": str(run.id),
        "job_id": str(run.job_id),
        "config_name": run.config_name,
        "task_name": run.task_name,
        "status": run.status,
        "tokens_total": run.tokens_total,
        "tokens_input": run.tokens_input,
        "tokens_output": run.tokens_output,
        "duration_seconds": run.duration_seconds,
        "score": run.score,
        "passed": run.passed,
        "reasoning": run.reasoning,
        "criteria_results": criteria_results,
        "output": run.output,
        "files_created": run.files_created,
        "trace": run.trace_json,
        "is_pareto": run.is_pareto,
        "pareto_rank": run.pareto_rank,
        "created_at": run.created_at,
    }
96
+
97
+
98
@router.get("/job/{job_id}/summary")
async def get_job_summary(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Aggregate a job's runs into per-config summaries, best score first.

    Raises 404 when the job has no runs at all.
    """
    uuid_id = parse_uuid(job_id)
    found = await session.execute(
        select(ExperimentRun).where(ExperimentRun.job_id == uuid_id)
    )
    runs = list(found.scalars().all())
    if not runs:
        raise HTTPException(status_code=404, detail="No runs found for job")

    # First pass: accumulate totals per config; averages come in a second pass.
    summaries: dict[str, dict[str, Any]] = {}
    for run in runs:
        summary = summaries.setdefault(
            run.config_name,
            {
                "config_name": run.config_name,
                "total_runs": 0,
                "passed_runs": 0,
                "avg_score": 0.0,
                "avg_tokens": 0.0,
                "avg_duration": 0.0,
                "is_pareto": False,
                "pareto_rank": 999,  # sentinel: no pareto rank seen yet
            },
        )
        summary["total_runs"] += 1
        if run.passed:
            summary["passed_runs"] += 1
        # NOTE(review): assumes score/tokens_total/duration_seconds/pareto_rank
        # are always numeric (never None) on stored runs — confirm on the model.
        summary["avg_score"] += run.score
        summary["avg_tokens"] += run.tokens_total
        summary["avg_duration"] += run.duration_seconds
        if run.is_pareto:
            summary["is_pareto"] = True
        summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)

    # Second pass: turn the accumulated sums into averages.
    for summary in summaries.values():
        count = summary["total_runs"]
        summary["avg_score"] /= count
        summary["avg_tokens"] /= count
        summary["avg_duration"] /= count

    # Best score first; cheaper (fewer tokens) wins ties.
    ordered = sorted(
        summaries.values(),
        key=lambda s: (-s["avg_score"], s["avg_tokens"]),
    )

    return {
        "job_id": job_id,
        "total_runs": len(runs),
        "config_summaries": ordered,
        "pareto_configs": [s["config_name"] for s in ordered if s["is_pareto"]],
    }
src/flow/ui/api/tasks.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Task API routes."""
3
+
4
+ from uuid import UUID
5
+
6
+ from fastapi import APIRouter, Depends, HTTPException
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from sqlmodel import select, desc
9
+
10
+ from ..database import get_session
11
+ from ..models.task import TaskModel
12
+ from ..schemas import TaskCreate, TaskResponse
13
+
14
+ router = APIRouter(prefix="/tasks", tags=["tasks"])
15
+
16
+
17
def parse_uuid(id_str: str) -> UUID:
    """Convert *id_str* to a UUID; a malformed id aborts with HTTP 400."""
    try:
        return UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
23
+
24
+
25
@router.get("", response_model=list[TaskResponse])
async def list_tasks(
    category: str | None = None,
    suite: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[TaskModel]:
    """List tasks, newest first, optionally filtered by category and/or suite."""
    stmt = select(TaskModel)
    if category:
        stmt = stmt.where(TaskModel.category == category)
    if suite:
        stmt = stmt.where(TaskModel.suite == suite)
    rows = await session.execute(stmt.order_by(desc(TaskModel.created_at)))
    return list(rows.scalars().all())
40
+
41
+
42
@router.post("", response_model=TaskResponse, status_code=201)
async def create_task(
    data: TaskCreate,
    session: AsyncSession = Depends(get_session),
) -> TaskModel:
    """Create and persist a new task."""
    new_task = TaskModel(
        name=data.name,
        prompt=data.prompt,
        criteria_json=data.to_criteria_json(),
        category=data.category,
    )
    session.add(new_task)
    await session.commit()
    # Refresh so DB-generated fields (id, created_at) are populated.
    await session.refresh(new_task)
    return new_task
58
+
59
+
60
@router.get("/{task_id}", response_model=TaskResponse)
async def get_task(
    task_id: str,
    session: AsyncSession = Depends(get_session),
) -> TaskModel:
    """Fetch a single task by id (404 if absent)."""
    uuid_id = parse_uuid(task_id)
    found = await session.execute(select(TaskModel).where(TaskModel.id == uuid_id))
    task = found.scalar_one_or_none()
    if task is None:
        raise HTTPException(status_code=404, detail="Task not found")
    return task
72
+
73
+
74
@router.delete("/{task_id}", status_code=204)
async def delete_task(
    task_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a task (404 if absent)."""
    uuid_id = parse_uuid(task_id)
    found = await session.execute(select(TaskModel).where(TaskModel.id == uuid_id))
    task = found.scalar_one_or_none()
    if task is None:
        raise HTTPException(status_code=404, detail="Task not found")

    await session.delete(task)
    await session.commit()
88
+
89
+
90
@router.post("/import-suite", response_model=list[TaskResponse], status_code=201)
async def import_suite(
    suite_name: str,
    session: AsyncSession = Depends(get_session),
) -> list[TaskModel]:
    """Import tasks from a built-in suite.

    Looks up ``suite_name`` among the built-in task suites, converts each
    suite task into a persisted ``TaskModel`` tagged with the suite name,
    and returns the created rows.

    Raises:
        HTTPException: 400 when the suite name is unknown.
    """
    from flow.experiments.types import get_task_suite

    try:
        suite_tasks = get_task_suite(suite_name)
    except ValueError as e:
        # Fix (B904): chain the original error, consistent with parse_uuid above.
        raise HTTPException(status_code=400, detail=str(e)) from e

    created_tasks = []
    for t in suite_tasks:
        task = TaskModel(
            name=t.name,
            prompt=t.prompt,
            criteria_json=[
                {"name": c.name, "instruction": c.instruction, "weight": c.weight}
                for c in t.criteria
            ],
            category=t.metadata.get("category", "default"),
            suite=suite_name,
        )
        session.add(task)
        created_tasks.append(task)

    await session.commit()
    # Refresh after commit so DB-generated fields (id, created_at) are populated.
    for task in created_tasks:
        await session.refresh(task)

    return created_tasks
src/flow/ui/database.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Database setup with SQLModel and SQLite."""
3
+
4
+ from pathlib import Path
5
+ from typing import AsyncGenerator
6
+
7
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
8
+ from sqlmodel import SQLModel
9
+
10
# On-disk SQLite database; the containing directory is created on import.
DB_PATH = Path.home() / ".flow" / "flow_ui.db"
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

DATABASE_URL = f"sqlite+aiosqlite:///{DB_PATH}"

# Single process-wide async engine and session factory.
engine = create_async_engine(DATABASE_URL, echo=False, future=True)

async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
19
+
20
+
21
async def init_db() -> None:
    """Create all SQLModel tables that do not already exist."""
    async with engine.begin() as conn:
        await conn.run_sync(SQLModel.metadata.create_all)
25
+
26
+
27
async def get_session() -> AsyncGenerator[AsyncSession, None]:
    """Yield an async database session (used as a FastAPI dependency)."""
    async with async_session() as session:
        yield session
src/flow/ui/main.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """FastAPI server for Flow UI."""
3
+
4
+ from contextlib import asynccontextmanager
5
+ from pathlib import Path
6
+ from typing import Any, AsyncGenerator
7
+
8
+ from fastapi import FastAPI
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from fastapi.staticfiles import StaticFiles
11
+ from fastapi.responses import FileResponse
12
+
13
+ from .database import init_db
14
+ from .api import configs_router, tasks_router, jobs_router, runs_router
15
+
16
+
17
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """App lifespan hook: create database tables before serving requests."""
    await init_db()
    yield
22
+
23
+
24
app = FastAPI(
    title="Flow Optimization UI",
    description="Web UI for running agent configuration optimization experiments",
    version="0.1.0",
    lifespan=lifespan,
)

# Allow the Vite dev-server origins during development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount every API router under the /api prefix.
for _router in (configs_router, tasks_router, jobs_router, runs_router):
    app.include_router(_router, prefix="/api")
45
+
46
+
47
# Health check
@app.get("/api/health")
async def health_check() -> dict[str, Any]:
    """Liveness probe for the UI backend."""
    return {"status": "ok", "service": "flow-ui"}
52
+
53
+
54
# Static files and SPA fallback.
# The frontend build output lives next to this module (flow/ui/ui/) so the
# backend package is self-contained.
UI_DIR = Path(__file__).parent / "ui"
57
+
58
+
59
def setup_static_files() -> None:
    """Mount built frontend assets and register the SPA fallback route.

    No-op when the frontend has not been built (``UI_DIR`` missing).
    """
    if not UI_DIR.exists():
        return

    # Serve fingerprinted build assets directly.
    assets_dir = UI_DIR / "assets"
    if assets_dir.exists():
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    ui_root = UI_DIR.resolve()

    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str) -> FileResponse:  # pyright: ignore[reportUnusedFunction]
        """Serve a static file when it exists, otherwise the SPA index.html."""
        candidate = (UI_DIR / full_path).resolve()
        # Security fix: resolve the requested path and reject anything that
        # escapes the UI directory (e.g. "../" traversal) before serving it.
        if candidate.is_relative_to(ui_root) and candidate.is_file():
            return FileResponse(candidate)
        return FileResponse(UI_DIR / "index.html")
74
+
75
+
76
# Register static-file routes only when a frontend build is present.
if UI_DIR.exists():
    setup_static_files()
79
+
80
+
81
def run_server(host: str = "0.0.0.0", port: int = 8091) -> None:  # noqa: S104
    """Run the FastAPI app under uvicorn (blocks until shutdown)."""
    import uvicorn

    uvicorn.run(
        "flow.ui.main:app",
        host=host,
        port=port,
        reload=False,
    )


if __name__ == "__main__":
    run_server()
src/flow/ui/models/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Database models."""
3
+
4
+ from .config import AgentConfig
5
+ from .task import TaskModel
6
+ from .job import OptimizationJob, JobStatus
7
+ from .run import ExperimentRun
8
+
9
+ __all__ = [
10
+ "AgentConfig",
11
+ "TaskModel",
12
+ "OptimizationJob",
13
+ "JobStatus",
14
+ "ExperimentRun",
15
+ ]