Spaces:
Sleeping
Sleeping
Commit ·
034c2ac
1
Parent(s): 34635fd
Deploy 2026-01-26 07:50:36
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +54 -0
- Dockerfile +69 -0
- README.md +195 -0
- pyproject.toml +180 -0
- src/flow/__init__.py +26 -0
- src/flow/cli/__init__.py +11 -0
- src/flow/cli/app.py +216 -0
- src/flow/cli/optimize.py +332 -0
- src/flow/cli/output.py +99 -0
- src/flow/cli/repl.py +153 -0
- src/flow/experiments/__init__.py +204 -0
- src/flow/experiments/ablation.py +472 -0
- src/flow/experiments/config_export.py +184 -0
- src/flow/experiments/evaluators/__init__.py +17 -0
- src/flow/experiments/evaluators/base.py +32 -0
- src/flow/experiments/evaluators/composite.py +80 -0
- src/flow/experiments/evaluators/heuristic.py +193 -0
- src/flow/experiments/evaluators/llm.py +223 -0
- src/flow/experiments/evaluators/trace.py +149 -0
- src/flow/experiments/metrics.py +267 -0
- src/flow/experiments/optimizer.py +547 -0
- src/flow/experiments/reporters/__init__.py +17 -0
- src/flow/experiments/reporters/console_reporter.py +135 -0
- src/flow/experiments/reporters/json_reporter.py +133 -0
- src/flow/experiments/runner.py +243 -0
- src/flow/experiments/trace_collector.py +104 -0
- src/flow/experiments/types.py +266 -0
- src/flow/harness/__init__.py +18 -0
- src/flow/harness/base.py +110 -0
- src/flow/harness/maf/__init__.py +14 -0
- src/flow/harness/maf/agent.py +176 -0
- src/flow/harness/maf/harness.py +258 -0
- src/flow/harness/maf/message_store.py +177 -0
- src/flow/prompts.py +407 -0
- src/flow/py.typed +0 -0
- src/flow/tools/__init__.py +172 -0
- src/flow/tools/coding.py +391 -0
- src/flow/tools/core.py +100 -0
- src/flow/tools/execution.py +479 -0
- src/flow/tools/memory.py +260 -0
- src/flow/tools/sub_agent.py +188 -0
- src/flow/ui/__init__.py +2 -0
- src/flow/ui/api/__init__.py +14 -0
- src/flow/ui/api/configs.py +121 -0
- src/flow/ui/api/jobs.py +169 -0
- src/flow/ui/api/runs.py +157 -0
- src/flow/ui/api/tasks.py +119 -0
- src/flow/ui/database.py +30 -0
- src/flow/ui/main.py +94 -0
- src/flow/ui/models/__init__.py +15 -0
.dockerignore
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file is relative to the build context (repo root)
|
| 2 |
+
|
| 3 |
+
# Git
|
| 4 |
+
.git
|
| 5 |
+
.gitignore
|
| 6 |
+
|
| 7 |
+
# Python
|
| 8 |
+
__pycache__
|
| 9 |
+
*.py[cod]
|
| 10 |
+
*$py.class
|
| 11 |
+
*.so
|
| 12 |
+
.Python
|
| 13 |
+
.venv
|
| 14 |
+
venv
|
| 15 |
+
ENV
|
| 16 |
+
.eggs
|
| 17 |
+
*.egg-info
|
| 18 |
+
dist
|
| 19 |
+
build
|
| 20 |
+
|
| 21 |
+
# Testing/Dev
|
| 22 |
+
.pytest_cache
|
| 23 |
+
.coverage
|
| 24 |
+
htmlcov
|
| 25 |
+
.mypy_cache
|
| 26 |
+
.ruff_cache
|
| 27 |
+
.pyright
|
| 28 |
+
|
| 29 |
+
# IDE
|
| 30 |
+
.vscode
|
| 31 |
+
.idea
|
| 32 |
+
*.swp
|
| 33 |
+
*.swo
|
| 34 |
+
|
| 35 |
+
# Frontend source (built files are already in src/flow/ui/ui/)
|
| 36 |
+
app/frontend/node_modules
|
| 37 |
+
app/frontend/src
|
| 38 |
+
app/frontend/*.json
|
| 39 |
+
app/frontend/*.ts
|
| 40 |
+
app/frontend/*.js
|
| 41 |
+
app/frontend/*.md
|
| 42 |
+
app/frontend/.vite
|
| 43 |
+
|
| 44 |
+
# Docs and deploy folder itself
|
| 45 |
+
docs
|
| 46 |
+
deploy
|
| 47 |
+
|
| 48 |
+
# Local env files (pass via docker env instead)
|
| 49 |
+
.env
|
| 50 |
+
.env.*
|
| 51 |
+
!.env.example
|
| 52 |
+
|
| 53 |
+
# Tests (not needed in production)
|
| 54 |
+
tests
|
Dockerfile
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Flow UI Container
|
| 2 |
+
# Production-ready deployment with uvicorn workers
|
| 3 |
+
|
| 4 |
+
FROM python:3.11-slim AS base
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
# Install system dependencies
|
| 9 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
git \
|
| 11 |
+
curl \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Install uv for fast dependency management
|
| 15 |
+
RUN pip install --no-cache-dir uv
|
| 16 |
+
|
| 17 |
+
# -------------------------------------------------------------------
|
| 18 |
+
# Builder stage: install dependencies
|
| 19 |
+
# -------------------------------------------------------------------
|
| 20 |
+
FROM base AS builder
|
| 21 |
+
|
| 22 |
+
# Copy only dependency files first (better layer caching)
|
| 23 |
+
COPY pyproject.toml uv.lock ./
|
| 24 |
+
|
| 25 |
+
# Install dependencies to system (no venv needed in container)
|
| 26 |
+
RUN uv pip install --system .
|
| 27 |
+
|
| 28 |
+
# -------------------------------------------------------------------
|
| 29 |
+
# Final stage: copy app and run
|
| 30 |
+
# -------------------------------------------------------------------
|
| 31 |
+
FROM base AS final
|
| 32 |
+
|
| 33 |
+
# Copy installed packages from builder
|
| 34 |
+
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
|
| 35 |
+
COPY --from=builder /usr/local/bin /usr/local/bin
|
| 36 |
+
|
| 37 |
+
# Copy application source (includes pre-built frontend in src/flow/ui/ui/)
|
| 38 |
+
COPY src/ ./src/
|
| 39 |
+
|
| 40 |
+
# Install the app itself (editable, uses already-installed deps)
|
| 41 |
+
RUN uv pip install --system --no-deps -e .
|
| 42 |
+
|
| 43 |
+
# Create non-root user for security
|
| 44 |
+
RUN useradd --create-home --shell /bin/bash flowuser
|
| 45 |
+
RUN mkdir -p /app/data && chown -R flowuser:flowuser /app
|
| 46 |
+
USER flowuser
|
| 47 |
+
|
| 48 |
+
# Configuration
|
| 49 |
+
ENV PORT=7860
|
| 50 |
+
ENV FLOW_DATA_DIR=/app/data
|
| 51 |
+
ENV UVICORN_WORKERS=2
|
| 52 |
+
|
| 53 |
+
# Expose the port
|
| 54 |
+
EXPOSE ${PORT}
|
| 55 |
+
|
| 56 |
+
# Health check - matches the actual endpoint in main.py
|
| 57 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
|
| 58 |
+
CMD curl -f http://localhost:${PORT}/api/health || exit 1
|
| 59 |
+
|
| 60 |
+
# Production uvicorn with multiple workers
|
| 61 |
+
# - workers: handle concurrent requests (CPU-bound, use 2-4 for most cases)
|
| 62 |
+
# - For I/O bound (which this is), uvicorn's async handles concurrency well
|
| 63 |
+
# - limit-concurrency prevents overload
|
| 64 |
+
CMD uvicorn flow.ui.main:app \
|
| 65 |
+
--host 0.0.0.0 \
|
| 66 |
+
--port ${PORT} \
|
| 67 |
+
--workers ${UVICORN_WORKERS} \
|
| 68 |
+
--limit-concurrency 100 \
|
| 69 |
+
--timeout-keep-alive 30
|
README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Flow - Autonomous Coding Agent
|
| 3 |
+
emoji: 🔄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Flow
|
| 12 |
+
|
| 13 |
+
**Autonomous Coding Agent with a Polished CLI**
|
| 14 |
+
|
| 15 |
+
Flow is a standalone coding agent that can read, write, and execute code autonomously. It features a clean CLI interface similar to Claude Code, with support for multiple agent runtime harnesses.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **Autonomous Execution**: Flow doesn't just tell you what to do—it does it. Write code, run tests, fix errors, iterate.
|
| 20 |
+
- **Rich CLI**: Interactive REPL with streaming output, tool call visualization, and syntax highlighting.
|
| 21 |
+
- **Pluggable Harnesses**: Swap out the underlying agent runtime (Microsoft Agent Framework, OpenAI Swarm, etc.)
|
| 22 |
+
- **Persistent Memory**: Remember patterns, decisions, and context across sessions.
|
| 23 |
+
- **Workspace Isolation**: Secure file operations within a sandboxed workspace.
|
| 24 |
+
|
| 25 |
+
## Installation
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
# Basic installation
|
| 29 |
+
pip install flow-agent
|
| 30 |
+
|
| 31 |
+
# With Microsoft Agent Framework support (recommended)
|
| 32 |
+
pip install flow-agent[agent-framework]
|
| 33 |
+
|
| 34 |
+
# With all optional features
|
| 35 |
+
pip install flow-agent[all]
|
| 36 |
+
|
| 37 |
+
# Development installation
|
| 38 |
+
pip install flow-agent[dev]
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Quick Start
|
| 42 |
+
|
| 43 |
+
### 1. Configure Azure OpenAI
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
export AZURE_OPENAI_API_KEY="your-api-key"
|
| 47 |
+
export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
|
| 48 |
+
export AZURE_OPENAI_DEPLOYMENT="gpt-4o"
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### 2. Initialize Flow
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
flow init
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
### 3. Run a Task
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
# Single task
|
| 61 |
+
flow run "Create a Python script that calculates fibonacci numbers"
|
| 62 |
+
|
| 63 |
+
# Interactive mode
|
| 64 |
+
flow run -i
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## CLI Commands
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
flow run [TASK] # Run a task or start interactive mode
|
| 71 |
+
flow config # Show current configuration
|
| 72 |
+
flow init # Initialize Flow directories
|
| 73 |
+
flow --help # Show help
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
## Usage as a Library
|
| 77 |
+
|
| 78 |
+
```python
|
| 79 |
+
import asyncio
|
| 80 |
+
from flow import FlowAgent
|
| 81 |
+
|
| 82 |
+
async def main():
|
| 83 |
+
agent = FlowAgent()
|
| 84 |
+
|
| 85 |
+
# Run a task
|
| 86 |
+
response = await agent.run("Create a hello world script")
|
| 87 |
+
print(response)
|
| 88 |
+
|
| 89 |
+
# Stream events
|
| 90 |
+
async for event in agent.run_stream("List files in the workspace"):
|
| 91 |
+
print(event.type, event.content)
|
| 92 |
+
|
| 93 |
+
await agent.close()
|
| 94 |
+
|
| 95 |
+
asyncio.run(main())
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
## Configuration
|
| 99 |
+
|
| 100 |
+
Flow can be configured via environment variables or a config file.
|
| 101 |
+
|
| 102 |
+
### Environment Variables
|
| 103 |
+
|
| 104 |
+
| Variable | Description | Default |
|
| 105 |
+
|----------|-------------|---------|
|
| 106 |
+
| `FLOW_HARNESS` | Agent harness to use | `agent-framework` |
|
| 107 |
+
| `FLOW_MODEL` | Model name | `gpt-4o` |
|
| 108 |
+
| `FLOW_WORKSPACE` | Workspace directory | `~/.flow/workspace` |
|
| 109 |
+
| `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | - |
|
| 110 |
+
| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint | - |
|
| 111 |
+
| `AZURE_OPENAI_DEPLOYMENT` | Azure OpenAI deployment | - |
|
| 112 |
+
|
| 113 |
+
### Directory Structure
|
| 114 |
+
|
| 115 |
+
```
|
| 116 |
+
~/.flow/
|
| 117 |
+
├── workspace/ # Agent's working directory
|
| 118 |
+
├── memory/ # Persistent memory storage
|
| 119 |
+
│ ├── patterns/ # Reusable code patterns
|
| 120 |
+
│ ├── projects/ # Per-project notes
|
| 121 |
+
│ └── decisions/ # Architecture decisions
|
| 122 |
+
└── skills/ # Domain-specific expertise
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
## Architecture
|
| 126 |
+
|
| 127 |
+
### Harness System
|
| 128 |
+
|
| 129 |
+
Flow uses a harness abstraction to support multiple agent runtimes:
|
| 130 |
+
|
| 131 |
+
```
|
| 132 |
+
┌─────────────────┐
|
| 133 |
+
│ FlowAgent │
|
| 134 |
+
└────────┬────────┘
|
| 135 |
+
│
|
| 136 |
+
┌────────▼────────┐
|
| 137 |
+
│ BaseHarness │ (Abstract)
|
| 138 |
+
└────────┬────────┘
|
| 139 |
+
│
|
| 140 |
+
┌────┴────┐
|
| 141 |
+
│ │
|
| 142 |
+
┌───▼───┐ ┌───▼───┐
|
| 143 |
+
│ Agent │ │ OpenAI│
|
| 144 |
+
│ Frmwk │ │ Swarm │
|
| 145 |
+
└───────┘ └───────┘
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
Currently supported:
|
| 149 |
+
- **MAFHarness**: Microsoft Agent Framework with Azure OpenAI
|
| 150 |
+
|
| 151 |
+
Planned:
|
| 152 |
+
- LangChain
|
| 153 |
+
- Claude SDK
|
| 154 |
+
|
| 155 |
+
### Tools
|
| 156 |
+
|
| 157 |
+
Flow includes a comprehensive set of tools:
|
| 158 |
+
|
| 159 |
+
| Tool | Description |
|
| 160 |
+
|------|-------------|
|
| 161 |
+
| `read_file` | Read file contents with line numbers |
|
| 162 |
+
| `write_file` | Write/edit files (full write, str_replace, insert) |
|
| 163 |
+
| `list_directory` | List directory contents |
|
| 164 |
+
| `grep_search` | Search for patterns in code |
|
| 165 |
+
| `bash_execute` | Run shell commands |
|
| 166 |
+
| `python_repl` | Execute Python code snippets |
|
| 167 |
+
| `memory` | Persistent memory operations |
|
| 168 |
+
| `think` | Structured reasoning |
|
| 169 |
+
| `task_done` | Report task completion |
|
| 170 |
+
|
| 171 |
+
## Development
|
| 172 |
+
|
| 173 |
+
```bash
|
| 174 |
+
# Clone the repository
|
| 175 |
+
git clone https://github.com/victordibia/flow
|
| 176 |
+
cd flow
|
| 177 |
+
|
| 178 |
+
# Install development dependencies
|
| 179 |
+
pip install -e ".[dev]"
|
| 180 |
+
|
| 181 |
+
# Run tests
|
| 182 |
+
pytest tests/ -v
|
| 183 |
+
|
| 184 |
+
# Type checking
|
| 185 |
+
pyright src/
|
| 186 |
+
mypy src/
|
| 187 |
+
|
| 188 |
+
# Linting
|
| 189 |
+
ruff check src/
|
| 190 |
+
ruff format src/
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
## License
|
| 194 |
+
|
| 195 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
pyproject.toml
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "flow-agent"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Autonomous coding agent with a polished CLI"
|
| 5 |
+
authors = [{ name = "Victor Dibia" }]
|
| 6 |
+
readme = "README.md"
|
| 7 |
+
requires-python = ">=3.10"
|
| 8 |
+
license = { text = "MIT" }
|
| 9 |
+
classifiers = [
|
| 10 |
+
"Development Status :: 4 - Beta",
|
| 11 |
+
"Environment :: Console",
|
| 12 |
+
"Intended Audience :: Developers",
|
| 13 |
+
"License :: OSI Approved :: MIT License",
|
| 14 |
+
"Programming Language :: Python :: 3",
|
| 15 |
+
"Programming Language :: Python :: 3.10",
|
| 16 |
+
"Programming Language :: Python :: 3.11",
|
| 17 |
+
"Programming Language :: Python :: 3.12",
|
| 18 |
+
"Programming Language :: Python :: 3.13",
|
| 19 |
+
"Typing :: Typed",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
dependencies = [
|
| 23 |
+
"pydantic>=2.0.0",
|
| 24 |
+
"pydantic-settings>=2.0.0",
|
| 25 |
+
"rich>=13.0.0",
|
| 26 |
+
"typer>=0.9.0",
|
| 27 |
+
"httpx>=0.25.0",
|
| 28 |
+
"python-dotenv>=1.0.0",
|
| 29 |
+
"agent-framework-core>=1.0.0b0",
|
| 30 |
+
"azure-identity>=1.15.0",
|
| 31 |
+
"pyyaml>=6.0.0",
|
| 32 |
+
# OpenTelemetry for experiments tracing
|
| 33 |
+
"opentelemetry-api>=1.20.0",
|
| 34 |
+
"opentelemetry-sdk>=1.20.0",
|
| 35 |
+
"opentelemetry-semantic-conventions>=0.41b0",
|
| 36 |
+
# Web UI dependencies
|
| 37 |
+
"fastapi>=0.109.0",
|
| 38 |
+
"uvicorn>=0.27.0",
|
| 39 |
+
"sqlmodel>=0.0.14",
|
| 40 |
+
"aiosqlite>=0.19.0",
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
[project.optional-dependencies]
|
| 44 |
+
# Optional features
|
| 45 |
+
research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
|
| 46 |
+
|
| 47 |
+
# Bundles
|
| 48 |
+
all = ["flow-agent[research]"]
|
| 49 |
+
dev = [
|
| 50 |
+
"pytest>=8.0.0",
|
| 51 |
+
"pytest-asyncio>=0.23.0",
|
| 52 |
+
"pytest-cov>=4.1.0",
|
| 53 |
+
"mypy>=1.8.0",
|
| 54 |
+
"pyright>=1.1.350",
|
| 55 |
+
"ruff>=0.2.0",
|
| 56 |
+
"pre-commit>=3.6.0",
|
| 57 |
+
"poethepoet>=0.24.0",
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
[project.scripts]
|
| 61 |
+
flow = "flow.cli:main"
|
| 62 |
+
|
| 63 |
+
[project.urls]
|
| 64 |
+
Homepage = "https://github.com/victordibia/flow"
|
| 65 |
+
Repository = "https://github.com/victordibia/flow"
|
| 66 |
+
Issues = "https://github.com/victordibia/flow/issues"
|
| 67 |
+
|
| 68 |
+
[build-system]
|
| 69 |
+
requires = ["hatchling"]
|
| 70 |
+
build-backend = "hatchling.build"
|
| 71 |
+
|
| 72 |
+
[tool.hatch.build.targets.wheel]
|
| 73 |
+
packages = ["src/flow"]
|
| 74 |
+
|
| 75 |
+
# ============================================================================
|
| 76 |
+
# Type Checking - Strict
|
| 77 |
+
# ============================================================================
|
| 78 |
+
|
| 79 |
+
[tool.pyright]
|
| 80 |
+
include = ["src"]
|
| 81 |
+
exclude = ["**/tests/**", "**/.venv/**"]
|
| 82 |
+
typeCheckingMode = "strict"
|
| 83 |
+
pythonVersion = "3.10"
|
| 84 |
+
reportMissingTypeStubs = false
|
| 85 |
+
reportUnnecessaryIsInstance = false
|
| 86 |
+
# agent_framework is optional - ignore type issues in harness
|
| 87 |
+
reportUnknownMemberType = "warning"
|
| 88 |
+
reportUnknownVariableType = "warning"
|
| 89 |
+
reportUnknownArgumentType = "warning"
|
| 90 |
+
|
| 91 |
+
[tool.mypy]
|
| 92 |
+
plugins = ["pydantic.mypy"]
|
| 93 |
+
strict = true
|
| 94 |
+
python_version = "3.10"
|
| 95 |
+
ignore_missing_imports = true
|
| 96 |
+
disallow_untyped_defs = true
|
| 97 |
+
no_implicit_optional = true
|
| 98 |
+
check_untyped_defs = true
|
| 99 |
+
warn_return_any = true
|
| 100 |
+
show_error_codes = true
|
| 101 |
+
warn_unused_ignores = false
|
| 102 |
+
disallow_incomplete_defs = true
|
| 103 |
+
disallow_untyped_decorators = true
|
| 104 |
+
|
| 105 |
+
# ============================================================================
|
| 106 |
+
# Linting - Ruff
|
| 107 |
+
# ============================================================================
|
| 108 |
+
|
| 109 |
+
[tool.ruff]
|
| 110 |
+
line-length = 120
|
| 111 |
+
target-version = "py310"
|
| 112 |
+
src = ["src"]
|
| 113 |
+
fix = true
|
| 114 |
+
include = ["*.py", "*.pyi", "**/pyproject.toml"]
|
| 115 |
+
exclude = ["docs/*"]
|
| 116 |
+
|
| 117 |
+
[tool.ruff.lint]
|
| 118 |
+
select = [
|
| 119 |
+
"E", # pycodestyle errors
|
| 120 |
+
"F", # pyflakes
|
| 121 |
+
"I", # isort
|
| 122 |
+
"B", # bugbear
|
| 123 |
+
"UP", # pyupgrade
|
| 124 |
+
"ANN", # annotations
|
| 125 |
+
"S", # bandit (security)
|
| 126 |
+
"RUF", # ruff-specific
|
| 127 |
+
"ASYNC", # async checks
|
| 128 |
+
"D", # pydocstyle
|
| 129 |
+
]
|
| 130 |
+
ignore = [
|
| 131 |
+
"D100", # allow missing docstring in public module
|
| 132 |
+
"D104", # allow missing docstring in public package
|
| 133 |
+
"D107", # allow missing docstring in __init__
|
| 134 |
+
"ANN401", # allow Any type (needed for generic tool/event handling)
|
| 135 |
+
"S101", # allow assert statements (used in tests)
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
[tool.ruff.lint.per-file-ignores]
|
| 139 |
+
"**/tests/**" = ["D", "ANN", "S"]
|
| 140 |
+
|
| 141 |
+
[tool.ruff.lint.pydocstyle]
|
| 142 |
+
convention = "google"
|
| 143 |
+
|
| 144 |
+
[tool.ruff.format]
|
| 145 |
+
docstring-code-format = true
|
| 146 |
+
|
| 147 |
+
# ============================================================================
|
| 148 |
+
# Testing - Pytest
|
| 149 |
+
# ============================================================================
|
| 150 |
+
|
| 151 |
+
[tool.pytest.ini_options]
|
| 152 |
+
testpaths = ["tests"]
|
| 153 |
+
pythonpath = ["src"]
|
| 154 |
+
addopts = "-ra -q -r fEX"
|
| 155 |
+
asyncio_mode = "auto"
|
| 156 |
+
asyncio_default_fixture_loop_scope = "function"
|
| 157 |
+
filterwarnings = []
|
| 158 |
+
|
| 159 |
+
[tool.coverage.run]
|
| 160 |
+
source = ["src/flow"]
|
| 161 |
+
omit = ["**/__init__.py"]
|
| 162 |
+
|
| 163 |
+
[tool.coverage.report]
|
| 164 |
+
exclude_lines = [
|
| 165 |
+
"pragma: no cover",
|
| 166 |
+
"if TYPE_CHECKING:",
|
| 167 |
+
"raise NotImplementedError",
|
| 168 |
+
]
|
| 169 |
+
|
| 170 |
+
# ============================================================================
|
| 171 |
+
# Task Runner - Poe
|
| 172 |
+
# ============================================================================
|
| 173 |
+
|
| 174 |
+
[tool.poe.tasks]
|
| 175 |
+
fmt = "ruff format src tests"
|
| 176 |
+
lint = "ruff check src tests --fix"
|
| 177 |
+
pyright = "pyright src"
|
| 178 |
+
mypy = "mypy src"
|
| 179 |
+
test = "pytest tests -v --cov=flow --cov-report=term-missing"
|
| 180 |
+
check = ["fmt", "lint", "pyright", "mypy", "test"]
|
src/flow/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flow - Autonomous Coding Agent.
|
| 2 |
+
|
| 3 |
+
An autonomous coding agent with a polished CLI experience.
|
| 4 |
+
Uses Microsoft Agent Framework as the runtime.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
from flow.harness.maf import MAFHarness
|
| 8 |
+
|
| 9 |
+
# Simple - creates agent with defaults
|
| 10 |
+
harness = MAFHarness()
|
| 11 |
+
async for event in harness.run_stream("Create a hello world script"):
|
| 12 |
+
print(event)
|
| 13 |
+
|
| 14 |
+
# Or with custom settings
|
| 15 |
+
harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from flow.harness.maf import MAFHarness, create_agent
|
| 19 |
+
|
| 20 |
+
__version__ = "0.1.0"
|
| 21 |
+
|
| 22 |
+
__all__ = [
|
| 23 |
+
"MAFHarness",
|
| 24 |
+
"create_agent",
|
| 25 |
+
"__version__",
|
| 26 |
+
]
|
src/flow/cli/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flow CLI - Command-line interface.
|
| 2 |
+
|
| 3 |
+
Provides the `flow` command for running the autonomous coding agent.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from flow.cli.app import app, main
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"app",
|
| 10 |
+
"main",
|
| 11 |
+
]
|
src/flow/cli/app.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flow CLI application.
|
| 2 |
+
|
| 3 |
+
Main entry point for the `flow` command.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import os
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Annotated
|
| 12 |
+
|
| 13 |
+
import typer
|
| 14 |
+
from rich.console import Console
|
| 15 |
+
|
| 16 |
+
from flow import __version__
|
| 17 |
+
|
| 18 |
+
app = typer.Typer(
|
| 19 |
+
name="flow",
|
| 20 |
+
help="Flow - Autonomous Coding Agent",
|
| 21 |
+
add_completion=False,
|
| 22 |
+
no_args_is_help=True,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
console = Console()
|
| 26 |
+
|
| 27 |
+
# Default paths
|
| 28 |
+
DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
|
| 29 |
+
DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def version_callback(value: bool) -> None:
|
| 33 |
+
"""Print version and exit."""
|
| 34 |
+
if value:
|
| 35 |
+
console.print(f"Flow v{__version__}")
|
| 36 |
+
raise typer.Exit()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@app.callback()
|
| 40 |
+
def callback(
|
| 41 |
+
version: Annotated[
|
| 42 |
+
bool | None,
|
| 43 |
+
typer.Option("--version", "-v", callback=version_callback, is_eager=True),
|
| 44 |
+
] = None,
|
| 45 |
+
) -> None:
|
| 46 |
+
"""Flow - Autonomous Coding Agent."""
|
| 47 |
+
pass
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@app.command()
|
| 51 |
+
def run(
|
| 52 |
+
task: Annotated[
|
| 53 |
+
str | None,
|
| 54 |
+
typer.Argument(help="Task to execute (or enter interactive mode if not provided)"),
|
| 55 |
+
] = None,
|
| 56 |
+
workspace: Annotated[
|
| 57 |
+
Path | None,
|
| 58 |
+
typer.Option("--workspace", "-w", help="Workspace directory for writing files"),
|
| 59 |
+
] = None,
|
| 60 |
+
config: Annotated[
|
| 61 |
+
Path | None,
|
| 62 |
+
typer.Option("--config", "-c", help="Config file from optimization (YAML)"),
|
| 63 |
+
] = None,
|
| 64 |
+
interactive: Annotated[
|
| 65 |
+
bool,
|
| 66 |
+
typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
|
| 67 |
+
] = True,
|
| 68 |
+
) -> None:
|
| 69 |
+
"""Run the coding agent.
|
| 70 |
+
|
| 71 |
+
If a task is provided, execute it and exit.
|
| 72 |
+
Otherwise, start an interactive REPL session.
|
| 73 |
+
|
| 74 |
+
The agent can read files from anywhere but writes go to the workspace.
|
| 75 |
+
|
| 76 |
+
Use --config to load a configuration from a previous optimization run.
|
| 77 |
+
"""
|
| 78 |
+
workspace_path = workspace or DEFAULT_WORKSPACE
|
| 79 |
+
memory_path = DEFAULT_MEMORY_PATH
|
| 80 |
+
|
| 81 |
+
# Ensure directories exist
|
| 82 |
+
workspace_path.mkdir(parents=True, exist_ok=True)
|
| 83 |
+
memory_path.mkdir(parents=True, exist_ok=True)
|
| 84 |
+
|
| 85 |
+
if task:
|
| 86 |
+
# Single task mode
|
| 87 |
+
asyncio.run(_run_single_task(workspace_path, memory_path, task, config))
|
| 88 |
+
elif interactive:
|
| 89 |
+
# Interactive REPL mode
|
| 90 |
+
from flow.cli.repl import FlowREPL
|
| 91 |
+
repl = FlowREPL(workspace=workspace_path, memory_path=memory_path)
|
| 92 |
+
asyncio.run(repl.run())
|
| 93 |
+
else:
|
| 94 |
+
console.print("[red]Error:[/] No task provided and interactive mode disabled.")
|
| 95 |
+
raise typer.Exit(1)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
async def _run_single_task(
|
| 99 |
+
workspace: Path,
|
| 100 |
+
memory_path: Path,
|
| 101 |
+
task: str,
|
| 102 |
+
config_path: Path | None = None,
|
| 103 |
+
) -> None:
|
| 104 |
+
"""Run a single task and print the result."""
|
| 105 |
+
from flow.cli.output import print_event
|
| 106 |
+
from flow.harness.base import EventType
|
| 107 |
+
from flow.harness.maf import MAFHarness
|
| 108 |
+
|
| 109 |
+
if config_path:
|
| 110 |
+
# Load config from optimization result
|
| 111 |
+
from flow.experiments.config_export import load_config
|
| 112 |
+
from flow.experiments.ablation import create_harness_from_config
|
| 113 |
+
|
| 114 |
+
ablation_config = load_config(config_path)
|
| 115 |
+
console.print(f"[dim]Using config: {ablation_config.name}[/]")
|
| 116 |
+
harness = create_harness_from_config(ablation_config, workspace)
|
| 117 |
+
else:
|
| 118 |
+
harness = MAFHarness(workspace=workspace, memory_path=memory_path)
|
| 119 |
+
|
| 120 |
+
try:
|
| 121 |
+
console.print("\n[bold blue]Flow[/] - Executing task...\n")
|
| 122 |
+
|
| 123 |
+
async for event in harness.run_stream(task):
|
| 124 |
+
print_event(console, event)
|
| 125 |
+
|
| 126 |
+
if event.type == EventType.ERROR:
|
| 127 |
+
raise typer.Exit(1)
|
| 128 |
+
|
| 129 |
+
except KeyboardInterrupt:
|
| 130 |
+
console.print("\n[yellow]Cancelled.[/]")
|
| 131 |
+
finally:
|
| 132 |
+
await harness.close()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# Import and register the optimize command
|
| 136 |
+
from flow.cli.optimize import optimize as optimize_cmd
|
| 137 |
+
|
| 138 |
+
app.command()(optimize_cmd)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@app.command()
|
| 142 |
+
def serve(
|
| 143 |
+
host: Annotated[
|
| 144 |
+
str,
|
| 145 |
+
typer.Option("--host", "-h", help="Host to bind to"),
|
| 146 |
+
] = "0.0.0.0", # noqa: S104
|
| 147 |
+
port: Annotated[
|
| 148 |
+
int,
|
| 149 |
+
typer.Option("--port", "-p", help="Port to bind to"),
|
| 150 |
+
] = 8091,
|
| 151 |
+
reload: Annotated[
|
| 152 |
+
bool,
|
| 153 |
+
typer.Option("--reload", help="Enable auto-reload for development"),
|
| 154 |
+
] = False,
|
| 155 |
+
) -> None:
|
| 156 |
+
"""Start the Flow web UI server.
|
| 157 |
+
|
| 158 |
+
Launches a web interface for managing agent configurations,
|
| 159 |
+
running optimization experiments, and viewing results.
|
| 160 |
+
"""
|
| 161 |
+
import uvicorn
|
| 162 |
+
|
| 163 |
+
console.print(f"\n[bold blue]Flow UI[/] starting on [cyan]http://{host}:{port}[/]\n")
|
| 164 |
+
|
| 165 |
+
uvicorn.run(
|
| 166 |
+
"flow.ui.main:app",
|
| 167 |
+
host=host,
|
| 168 |
+
port=port,
|
| 169 |
+
reload=reload,
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
@app.command()
|
| 174 |
+
def config() -> None:
|
| 175 |
+
"""Show current configuration."""
|
| 176 |
+
from rich.table import Table
|
| 177 |
+
|
| 178 |
+
table = Table(title="Flow Configuration")
|
| 179 |
+
table.add_column("Setting", style="cyan")
|
| 180 |
+
table.add_column("Value", style="green")
|
| 181 |
+
|
| 182 |
+
table.add_row("Workspace", str(DEFAULT_WORKSPACE))
|
| 183 |
+
table.add_row("Memory Path", str(DEFAULT_MEMORY_PATH))
|
| 184 |
+
table.add_row("Azure Endpoint", os.environ.get("AZURE_OPENAI_ENDPOINT", "(not set)"))
|
| 185 |
+
table.add_row("Azure Deployment", os.environ.get("AZURE_OPENAI_DEPLOYMENT", "(not set)"))
|
| 186 |
+
|
| 187 |
+
console.print(table)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
@app.command()
def init() -> None:
    """Initialize Flow directories and show setup instructions."""
    # Create both standard directories idempotently.
    for directory in (DEFAULT_WORKSPACE, DEFAULT_MEMORY_PATH):
        directory.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold green]Flow initialized![/]\n")
    console.print(f" Workspace: [cyan]{DEFAULT_WORKSPACE}[/]")
    console.print(f" Memory: [cyan]{DEFAULT_MEMORY_PATH}[/]")

    # Walk the user through credential setup and a first run.
    console.print("\n[bold]Next steps:[/]")
    console.print(" 1. Set your Azure OpenAI credentials:")
    console.print(" [dim]export AZURE_OPENAI_API_KEY=your-key[/]")
    console.print(" [dim]export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/[/]")
    console.print(" [dim]export AZURE_OPENAI_DEPLOYMENT=your-deployment[/]")
    console.print("\n 2. Run Flow:")
    console.print(' [dim]flow run "Create a hello world Python script"[/]')
    console.print(" [dim]flow run -i # Interactive mode[/]")
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def main() -> None:
    """Console-script entry point: dispatch to the Typer application."""
    app()
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# Support direct execution (e.g. `python -m flow.cli.app`).
if __name__ == "__main__":
    main()
|
src/flow/cli/optimize.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Optimize command for finding best agent configurations."""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import asyncio
|
| 8 |
+
import importlib.util
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Annotated, Any
|
| 12 |
+
|
| 13 |
+
import typer
|
| 14 |
+
from rich.console import Console
|
| 15 |
+
|
| 16 |
+
from flow.experiments.ablation import AblationConfig, CONTEXT_ENGINEERING_CONFIGS
|
| 17 |
+
from flow.experiments.optimizer import (
|
| 18 |
+
FlowOptimizer,
|
| 19 |
+
generate_grid_configs,
|
| 20 |
+
load_tasks_from_jsonl,
|
| 21 |
+
)
|
| 22 |
+
from flow.experiments.types import EvalCriterion, Task
|
| 23 |
+
|
| 24 |
+
console = Console()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def optimize(
    tasks: Annotated[
        Path | None,
        typer.Option("--tasks", "-t", help="Path to tasks.jsonl file"),
    ] = None,
    config: Annotated[
        Path | None,
        typer.Option("--config", "-c", help="Path to Python config file with CONFIGS or VARIATIONS"),
    ] = None,
    agent: Annotated[
        Path | None,
        typer.Option("--agent", "-a", help="Path to base agent Python file (for optimization)"),
    ] = None,
    suite: Annotated[
        str | None,
        typer.Option("--suite", "-s", help="Built-in task suite: coding, research"),
    ] = None,
    parallel: Annotated[
        int,
        typer.Option("--parallel", "-p", help="Max concurrent experiments"),
    ] = 4,
    mode: Annotated[
        str,
        typer.Option("--mode", "-m", help="Config mode: named (use CONFIGS), grid (use VARIATIONS)"),
    ] = "named",
    vary: Annotated[
        str | None,
        typer.Option("--vary", "-v", help="Comma-separated params to vary: compaction,memory,model"),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option("--output", "-o", help="Output directory for results"),
    ] = None,
    no_llm_eval: Annotated[
        bool,
        typer.Option("--no-llm-eval", help="Disable LLM-as-Judge evaluation (faster, less accurate)"),
    ] = False,
) -> None:
    """Find the best agent configuration through experimentation.

    Runs experiments in parallel, evaluates with LLM-as-Judge,
    ranks via Pareto analysis, and exports winning configs.

    Examples:

        # Run with task file and default configs
        flow optimize --tasks tasks.jsonl

        # Use custom configs from Python file
        flow optimize --config my_configs.py --tasks tasks.jsonl

        # Grid search over variations
        flow optimize --config my_configs.py --tasks tasks.jsonl --mode grid

        # Use built-in task suite
        flow optimize --suite coding --parallel 2

        # Vary specific parameters
        flow optimize --vary compaction,memory --tasks tasks.jsonl
    """
    # This sync typer callback is only a thin shim over the async driver.
    optimization = _run_optimize(
        tasks_path=tasks,
        config_path=config,
        agent_path=agent,
        suite=suite,
        parallel=parallel,
        mode=mode,
        vary=vary,
        output_dir=output,
        use_llm_eval=not no_llm_eval,
    )
    asyncio.run(optimization)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
async def _run_optimize(
    tasks_path: Path | None,
    config_path: Path | None,
    agent_path: Path | None,
    suite: str | None,
    parallel: int,
    mode: str,
    vary: str | None,
    output_dir: Path | None,
    use_llm_eval: bool,
) -> None:
    """Run the optimization.

    Args:
        tasks_path: JSONL task file, or None to fall back to a suite.
        config_path: Python config module path, or None.
        agent_path: Accepted for CLI parity; not used by this function.
        suite: Built-in suite name, or None.
        parallel: Max concurrent experiments.
        mode: Config mode — "named" (CONFIGS) or "grid" (VARIATIONS).
        vary: Comma-separated parameter names to grid over, or None.
        output_dir: Results directory, or None for the optimizer default.
        use_llm_eval: Whether to enable LLM-as-Judge evaluation.

    Raises:
        typer.Exit: If no tasks/configs are available or the run is cancelled.
    """
    # Load tasks
    tasks = _load_tasks(tasks_path, suite)
    if not tasks:
        console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
        raise typer.Exit(1)

    # Load configs
    configs = _load_configs(config_path, mode, vary)
    if not configs:
        console.print("[red]Error:[/] No configs to test. Use --config or --vary")
        raise typer.Exit(1)

    console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
    for t in tasks:
        console.print(f" - {t.name}")

    console.print(f"\n[bold]Configs:[/] {len(configs)}")
    for c in configs:
        console.print(f" - {c.name}")

    # Run optimizer
    optimizer = FlowOptimizer(
        parallel=parallel,
        use_llm_evaluator=use_llm_eval,
        output_dir=output_dir,
    )

    try:
        result = await optimizer.optimize(configs, tasks)

        console.print("\n[bold green]Optimization complete![/]")
        console.print(f"\nBest configs exported to: [cyan]{result.output_dir / 'configs'}[/]")
        console.print("\nTo use a config:")
        console.print(f" [dim]flow run --config {result.output_dir / 'configs' / 'best_score.yaml'} \"your task\"[/]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Optimization cancelled.[/]")
        # Suppress the KeyboardInterrupt context so the user sees a clean
        # exit instead of a chained traceback (ruff B904 / PEP 3134).
        raise typer.Exit(1) from None
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
    """Load tasks from file or built-in suite."""
    # An explicit task file takes priority over any suite selection.
    if tasks_path:
        if not tasks_path.exists():
            console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
            raise typer.Exit(1)
        return load_tasks_from_jsonl(tasks_path)

    # Otherwise use the requested suite, defaulting to the quick smoke test.
    return _get_builtin_suite(suite if suite else "quick")
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _get_builtin_suite(name: str) -> list[Task]:
    """Get a built-in task suite.

    Args:
        name: Suite key — "quick" (single smoke test), "coding"
            (short/medium/long coding tasks), or "research".

    Returns:
        The list of Task definitions for the suite.

    Raises:
        typer.Exit: If ``name`` is not a known suite.
    """
    # Suites are rebuilt per call; Task/EvalCriterion instances are cheap and
    # returning fresh objects avoids cross-run mutation of shared tasks.
    suites = {
        "quick": [
            Task(
                name="hello_world",
                prompt="Create a Python script 'hello.py' that prints 'Hello, World!' and run it.",
                criteria=[
                    EvalCriterion(name="file_created", instruction="hello.py should be created"),
                    EvalCriterion(name="correct_output", instruction="Output should include 'Hello, World!'"),
                ],
            ),
        ],
        "coding": [
            Task(
                name="fizzbuzz",
                prompt="Create fizzbuzz.py that prints 1-30 with Fizz/Buzz/FizzBuzz rules. Run it.",
                criteria=[
                    EvalCriterion(name="file_created", instruction="fizzbuzz.py should be created"),
                    EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
                ],
                metadata={"category": "short"},
            ),
            Task(
                name="rest_api",
                prompt="Create a FastAPI app with a /health endpoint that returns JSON {'status': 'ok'}. Save as api.py.",
                criteria=[
                    EvalCriterion(name="file_created", instruction="api.py should be created"),
                    EvalCriterion(name="fastapi_used", instruction="Should use FastAPI"),
                    EvalCriterion(name="endpoint_defined", instruction="Should have /health endpoint"),
                ],
                metadata={"category": "medium"},
            ),
            Task(
                name="data_pipeline",
                prompt="""Create a data processing pipeline:
1. data_types.py - DataRecord dataclass (id, name, value)
2. validators.py - validate_id, validate_name functions
3. pipeline.py - chain validators together
4. test_pipeline.py - tests for the pipeline
Run the tests.""",
                criteria=[
                    EvalCriterion(name="modules_created", instruction="All 4 Python files created"),
                    EvalCriterion(name="tests_run", instruction="Tests should be executed"),
                ],
                metadata={"category": "long"},
            ),
        ],
        "research": [
            Task(
                name="codebase_analysis",
                prompt="""Analyze this workspace:
1. Explore the directory structure
2. Identify Python files and their purposes
3. Create analysis_report.md with findings""",
                criteria=[
                    EvalCriterion(name="exploration", instruction="Should explore directory"),
                    EvalCriterion(name="report_created", instruction="analysis_report.md created"),
                ],
                metadata={"category": "research"},
            ),
        ],
    }

    # Fail loudly with the valid options rather than raising KeyError.
    if name not in suites:
        console.print(f"[red]Error:[/] Unknown suite '{name}'. Available: {list(suites.keys())}")
        raise typer.Exit(1)

    return suites[name]
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _load_configs(
    config_path: Path | None,
    mode: str,
    vary: str | None,
) -> list[AblationConfig]:
    """Load configs from file or generate from variations."""
    # 1) Explicit Python config file.
    if config_path:
        if not config_path.exists():
            console.print(f"[red]Error:[/] Config file not found: {config_path}")
            raise typer.Exit(1)

        named_configs, grid_variations = _load_python_config(config_path)

        # Grid mode wins when the file actually declares VARIATIONS.
        if mode == "grid" and grid_variations:
            return generate_grid_configs("grid", grid_variations)
        if named_configs:
            return named_configs
        console.print("[red]Error:[/] Config file has no CONFIGS or VARIATIONS")
        raise typer.Exit(1)

    # 2) Grid generated from the --vary flag.
    if vary:
        return generate_grid_configs("vary", _parse_vary_flag(vary))

    # 3) Default: compare the built-in context-engineering configurations.
    return CONTEXT_ENGINEERING_CONFIGS
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _load_python_config(path: Path) -> tuple[list[AblationConfig], dict[str, Any]]:
|
| 297 |
+
"""Load CONFIGS and VARIATIONS from a Python file."""
|
| 298 |
+
spec = importlib.util.spec_from_file_location("config_module", path)
|
| 299 |
+
if spec is None or spec.loader is None:
|
| 300 |
+
raise ValueError(f"Cannot load {path}")
|
| 301 |
+
|
| 302 |
+
module = importlib.util.module_from_spec(spec)
|
| 303 |
+
sys.modules["config_module"] = module
|
| 304 |
+
spec.loader.exec_module(module)
|
| 305 |
+
|
| 306 |
+
configs = getattr(module, "CONFIGS", [])
|
| 307 |
+
variations = getattr(module, "VARIATIONS", {})
|
| 308 |
+
|
| 309 |
+
return configs, variations
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _parse_vary_flag(vary: str) -> dict[str, Any]:
|
| 313 |
+
"""Parse --vary flag into variations dict."""
|
| 314 |
+
variations = {}
|
| 315 |
+
|
| 316 |
+
for param in vary.split(","):
|
| 317 |
+
param = param.strip().lower()
|
| 318 |
+
|
| 319 |
+
if param in ("compaction", "compact"):
|
| 320 |
+
variations["enable_message_compaction"] = [True, False]
|
| 321 |
+
elif param in ("memory", "mem"):
|
| 322 |
+
variations["enable_memory_tool"] = [True, False]
|
| 323 |
+
elif param in ("subagent", "sub"):
|
| 324 |
+
variations["enable_sub_agent"] = [True, False]
|
| 325 |
+
elif param in ("head", "head_size"):
|
| 326 |
+
variations["compaction_head_size"] = [5, 10, 20]
|
| 327 |
+
elif param in ("tail", "tail_size"):
|
| 328 |
+
variations["compaction_tail_size"] = [20, 40, 60]
|
| 329 |
+
else:
|
| 330 |
+
console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
|
| 331 |
+
|
| 332 |
+
return variations
|
src/flow/cli/output.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Output formatting for Flow CLI.
|
| 2 |
+
|
| 3 |
+
Provides functions for rendering agent events to the terminal
|
| 4 |
+
with rich formatting.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from rich.console import Console
|
| 8 |
+
from rich.markdown import Markdown
|
| 9 |
+
from rich.markup import escape
|
| 10 |
+
from rich.panel import Panel
|
| 11 |
+
from rich.syntax import Syntax
|
| 12 |
+
|
| 13 |
+
from flow.harness.base import Event, EventType
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def print_event(console: Console, event: Event) -> None:
    """Print an agent event to the console.

    Args:
        console: Rich console instance
        event: Event to print
    """
    kind = event.type

    if kind == EventType.TEXT_DELTA:
        # Stream text without a newline so consecutive deltas join up.
        console.print(event.content, end="")
        return

    if kind == EventType.TEXT_DONE:
        if event.content:
            console.print(event.content)
        console.print()  # extra newline for spacing
        return

    if kind == EventType.TOOL_CALL_START:
        tool_name = event.tool_name or "unknown"
        console.print(f"\n[dim]▶ Calling tool:[/] [cyan]{tool_name}[/]")
        return

    if kind == EventType.TOOL_CALL_ARGS:
        # Escape streamed args so they can't be parsed as Rich markup.
        if event.content:
            console.print(f"[dim]{escape(event.content)}[/]", end="")
        return

    if kind == EventType.TOOL_CALL_DONE:
        console.print()  # newline after the streamed args
        return

    if kind == EventType.TOOL_RESULT:
        result = event.content or ""
        # Keep long outputs readable by truncating to 500 chars.
        if len(result) > 500:
            result = result[:500] + "\n... (truncated)"
        console.print(Panel(
            escape(result),
            title="[green]Tool Result[/]",
            border_style="dim",
            expand=False,
        ))
        return

    if kind == EventType.THINKING:
        console.print(f"[dim italic]💭 {escape(event.content or '')}[/]")
        return

    if kind == EventType.ERROR:
        console.print(f"\n[bold red]Error:[/] {escape(event.content or '')}")
        return

    if kind == EventType.DONE:
        console.print("\n[dim]─── Done ───[/]\n")
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def print_welcome(console: Console) -> None:
    """Print welcome message for interactive mode."""
    for line in (
        "\n[bold blue]Flow[/] - Autonomous Coding Agent",
        "[dim]Type your task and press Enter. Type 'exit' or Ctrl+D to quit.[/]\n",
    ):
        console.print(line)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def print_code(console: Console, code: str, language: str = "python") -> None:
    """Print syntax-highlighted code.

    Args:
        console: Rich console instance
        code: Code to print
        language: Programming language for syntax highlighting
    """
    console.print(Syntax(code, language, theme="monokai", line_numbers=True))
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def print_markdown(console: Console, text: str) -> None:
    """Print markdown-formatted text.

    Args:
        console: Rich console instance
        text: Markdown text to print
    """
    console.print(Markdown(text))
|
src/flow/cli/repl.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Interactive REPL for Flow.
|
| 2 |
+
|
| 3 |
+
Provides an interactive command-line interface for running
|
| 4 |
+
the Flow agent with streaming output.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from rich.console import Console
|
| 12 |
+
|
| 13 |
+
from flow.cli.output import print_event, print_welcome
|
| 14 |
+
from flow.harness.base import EventType
|
| 15 |
+
from flow.harness.maf import MAFHarness
|
| 16 |
+
|
| 17 |
+
# Default paths used when the caller does not pass explicit directories.
DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FlowREPL:
    """Interactive REPL for Flow agent.

    Provides a command-line interface similar to Claude Code,
    with streaming output and tool call visualization.
    """

    def __init__(
        self,
        workspace: Path | None = None,
        memory_path: Path | None = None,
    ) -> None:
        """Initialize the REPL.

        Args:
            workspace: Workspace directory. Defaults to ~/.flow/workspace.
            memory_path: Memory directory. Defaults to ~/.flow/memory.
        """
        self._workspace = workspace or DEFAULT_WORKSPACE
        self._memory_path = memory_path or DEFAULT_MEMORY_PATH
        self._console = Console()
        # Lazily created on first use; see _get_harness.
        self._harness: MAFHarness | None = None
        # Carries conversation continuity between tasks; None until first DONE.
        self._thread_id: str | None = None

    def _get_harness(self) -> MAFHarness:
        """Get or create the harness instance (lazy singleton per REPL)."""
        if self._harness is None:
            self._harness = MAFHarness(
                workspace=self._workspace,
                memory_path=self._memory_path,
            )
        return self._harness

    async def run(self) -> None:
        """Run the interactive REPL loop.

        Loops reading user input, dispatching built-in commands
        (exit/clear/help/config) and forwarding everything else to the
        agent as a task. Closes the harness on exit.
        """
        print_welcome(self._console)

        harness = self._get_harness()

        while True:
            try:
                # Get user input
                user_input = self._get_input()

                if user_input is None:
                    # EOF (Ctrl+D)
                    break

                user_input = user_input.strip()

                if not user_input:
                    continue

                # Handle special commands
                if user_input.lower() in ("exit", "quit", "q"):
                    break

                if user_input.lower() == "clear":
                    self._console.clear()
                    print_welcome(self._console)
                    continue

                if user_input.lower() == "help":
                    self._print_help()
                    continue

                if user_input.lower() == "config":
                    self._print_config()
                    continue

                # Run the task
                await self._run_task(harness, user_input)

            except KeyboardInterrupt:
                # Ctrl+C cancels the current task but keeps the REPL alive.
                self._console.print("\n[yellow]Interrupted. Type 'exit' to quit.[/]")
                continue

        # Cleanup
        self._console.print("\n[dim]Goodbye![/]\n")
        if self._harness:
            await self._harness.close()

    def _get_input(self) -> str | None:
        """Get input from the user.

        Returns:
            User input string, or None on EOF.
        """
        try:
            return self._console.input("[bold green]>[/] ")
        except EOFError:
            return None

    async def _run_task(self, harness: MAFHarness, task: str) -> None:
        """Run a task and stream the output.

        Args:
            harness: Harness instance
            task: Task to execute
        """
        self._console.print()  # Blank line before output

        try:
            async for event in harness.run_stream(task, self._thread_id):
                print_event(self._console, event)

                # Store thread ID for conversation continuity
                if event.type == EventType.DONE:
                    self._thread_id = harness.get_thread_id()

        # Broad catch is deliberate here: a failed task should report the
        # error and return to the prompt, never crash the REPL.
        except Exception as e:
            self._console.print(f"\n[bold red]Error:[/] {e}")

    def _print_help(self) -> None:
        """Print help information."""
        self._console.print("\n[bold]Flow Commands:[/]")
        self._console.print(" [cyan]exit[/], [cyan]quit[/], [cyan]q[/] - Exit the REPL")
        self._console.print(" [cyan]clear[/] - Clear the screen")
        self._console.print(" [cyan]config[/] - Show current configuration")
        self._console.print(" [cyan]help[/] - Show this help message")
        self._console.print("\n[bold]Tips:[/]")
        self._console.print(" - Type your task and press Enter to execute")
        self._console.print(" - Press Ctrl+C to cancel a running task")
        self._console.print(" - Press Ctrl+D to exit")
        self._console.print()

    def _print_config(self) -> None:
        """Print current configuration."""
        self._console.print("\n[bold]Configuration:[/]")
        self._console.print(f" Workspace: [cyan]{self._workspace}[/]")
        self._console.print(f" Memory: [cyan]{self._memory_path}[/]")
        self._console.print()
|
src/flow/experiments/__init__.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Experiments framework for running and evaluating Flow agent tasks.
|
| 4 |
+
|
| 5 |
+
This package provides a structured way to:
|
| 6 |
+
- Define tasks with evaluation criteria
|
| 7 |
+
- Run agents on tasks and collect OpenTelemetry traces
|
| 8 |
+
- Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
|
| 9 |
+
- Extract metrics from execution traces
|
| 10 |
+
- Run ablation studies comparing different configurations
|
| 11 |
+
|
| 12 |
+
Example usage:
|
| 13 |
+
from flow.harness.maf import MAFHarness
|
| 14 |
+
from flow.experiments import (
|
| 15 |
+
FlowExperimentRunner,
|
| 16 |
+
Task,
|
| 17 |
+
EvalCriterion,
|
| 18 |
+
TraceEvaluator,
|
| 19 |
+
HeuristicEvaluator,
|
| 20 |
+
extract_metrics,
|
| 21 |
+
format_metrics_summary,
|
| 22 |
+
setup_tracing,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# Setup tracing (call once at startup)
|
| 26 |
+
setup_tracing("my-experiment")
|
| 27 |
+
|
| 28 |
+
# Define a task
|
| 29 |
+
task = Task(
|
| 30 |
+
name="hello_world",
|
| 31 |
+
prompt="Write a Python function that prints 'Hello, World!'",
|
| 32 |
+
criteria=[
|
| 33 |
+
EvalCriterion(
|
| 34 |
+
name="correctness",
|
| 35 |
+
instruction="The function should print exactly 'Hello, World!'",
|
| 36 |
+
),
|
| 37 |
+
],
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# Run the experiment
|
| 41 |
+
harness = MAFHarness()
|
| 42 |
+
runner = FlowExperimentRunner(keep_workspace=True)
|
| 43 |
+
result = await runner.run(harness, task)
|
| 44 |
+
|
| 45 |
+
# Extract metrics
|
| 46 |
+
metrics = extract_metrics(result.trace)
|
| 47 |
+
print(format_metrics_summary(metrics))
|
| 48 |
+
|
| 49 |
+
# Evaluate the result
|
| 50 |
+
evaluator = HeuristicEvaluator()
|
| 51 |
+
eval_result = await evaluator.evaluate(result)
|
| 52 |
+
print(f"Score: {eval_result.score}, Passed: {eval_result.passed}")
|
| 53 |
+
|
| 54 |
+
await harness.close()
|
| 55 |
+
|
| 56 |
+
Ablation studies:
|
| 57 |
+
from flow.experiments import run_ablations, AblationConfig
|
| 58 |
+
|
| 59 |
+
configs = [
|
| 60 |
+
AblationConfig(name="baseline", enable_message_compaction=False),
|
| 61 |
+
AblationConfig(name="with_compaction", enable_message_compaction=True),
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
results = await run_ablations(
|
| 65 |
+
configs,
|
| 66 |
+
task_prompt="Create a simple HTTP server",
|
| 67 |
+
)
|
| 68 |
+
"""
|
| 69 |
+
|
| 70 |
+
# Types
|
| 71 |
+
# Ablation
|
| 72 |
+
from .ablation import (
|
| 73 |
+
AGENT_MEMORY_ONLY,
|
| 74 |
+
ALL_CONTEXT_ENGINEERING,
|
| 75 |
+
COMPACTION_ONLY,
|
| 76 |
+
# Context engineering configs
|
| 77 |
+
CONTEXT_ENG_BASELINE,
|
| 78 |
+
CONTEXT_ENGINEERING_CONFIGS,
|
| 79 |
+
ISOLATION_ONLY,
|
| 80 |
+
AblationConfig,
|
| 81 |
+
AblationResult,
|
| 82 |
+
# Shared utilities
|
| 83 |
+
compute_pareto_frontier,
|
| 84 |
+
create_harness_from_config,
|
| 85 |
+
generate_recommendation,
|
| 86 |
+
run_ablations,
|
| 87 |
+
run_context_engineering_comparison,
|
| 88 |
+
run_single_ablation,
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Config export
|
| 92 |
+
from .config_export import (
|
| 93 |
+
export_config,
|
| 94 |
+
export_optimization_configs,
|
| 95 |
+
load_config,
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# Evaluators
|
| 99 |
+
from .evaluators import (
|
| 100 |
+
CompositeEvaluator,
|
| 101 |
+
Evaluator,
|
| 102 |
+
HeuristicEvaluator,
|
| 103 |
+
LLMEvaluator,
|
| 104 |
+
TraceEvaluator,
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
# Metrics
|
| 108 |
+
from .metrics import (
|
| 109 |
+
LLMCallInfo,
|
| 110 |
+
ToolCallInfo,
|
| 111 |
+
TraceMetrics,
|
| 112 |
+
extract_metrics,
|
| 113 |
+
format_metrics_summary,
|
| 114 |
+
metrics_to_dict,
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
# Optimizer
|
| 118 |
+
from .optimizer import (
|
| 119 |
+
ConfigSummary,
|
| 120 |
+
FlowOptimizer,
|
| 121 |
+
OptimizationResult,
|
| 122 |
+
TaskResult,
|
| 123 |
+
generate_grid_configs,
|
| 124 |
+
load_tasks_from_jsonl,
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Reporters
|
| 128 |
+
from .reporters import (
|
| 129 |
+
load_run_result_summary,
|
| 130 |
+
print_comparison_table,
|
| 131 |
+
print_eval_result,
|
| 132 |
+
print_metrics_summary,
|
| 133 |
+
save_comparison,
|
| 134 |
+
save_run_result,
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
# Runner
|
| 138 |
+
from .runner import FlowExperimentRunner, setup_tracing
|
| 139 |
+
|
| 140 |
+
# Trace collection
|
| 141 |
+
from .trace_collector import FlowTraceCollector
|
| 142 |
+
from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
|
| 143 |
+
|
| 144 |
+
__all__ = [ # noqa: RUF022 # Intentionally grouped by category
|
| 145 |
+
# Types
|
| 146 |
+
"Task",
|
| 147 |
+
"EvalCriterion",
|
| 148 |
+
"RunResult",
|
| 149 |
+
"EvalResult",
|
| 150 |
+
"CriterionResult",
|
| 151 |
+
# Trace collection
|
| 152 |
+
"FlowTraceCollector",
|
| 153 |
+
# Metrics
|
| 154 |
+
"TraceMetrics",
|
| 155 |
+
"LLMCallInfo",
|
| 156 |
+
"ToolCallInfo",
|
| 157 |
+
"extract_metrics",
|
| 158 |
+
"format_metrics_summary",
|
| 159 |
+
"metrics_to_dict",
|
| 160 |
+
# Runner
|
| 161 |
+
"FlowExperimentRunner",
|
| 162 |
+
"setup_tracing",
|
| 163 |
+
# Evaluators
|
| 164 |
+
"Evaluator",
|
| 165 |
+
"LLMEvaluator",
|
| 166 |
+
"TraceEvaluator",
|
| 167 |
+
"HeuristicEvaluator",
|
| 168 |
+
"CompositeEvaluator",
|
| 169 |
+
# Reporters
|
| 170 |
+
"save_run_result",
|
| 171 |
+
"load_run_result_summary",
|
| 172 |
+
"save_comparison",
|
| 173 |
+
"print_metrics_summary",
|
| 174 |
+
"print_comparison_table",
|
| 175 |
+
"print_eval_result",
|
| 176 |
+
# Ablation
|
| 177 |
+
"AblationConfig",
|
| 178 |
+
"AblationResult",
|
| 179 |
+
"run_ablations",
|
| 180 |
+
"run_single_ablation",
|
| 181 |
+
"create_harness_from_config",
|
| 182 |
+
# Context engineering configs
|
| 183 |
+
"CONTEXT_ENG_BASELINE",
|
| 184 |
+
"COMPACTION_ONLY",
|
| 185 |
+
"AGENT_MEMORY_ONLY",
|
| 186 |
+
"ISOLATION_ONLY",
|
| 187 |
+
"ALL_CONTEXT_ENGINEERING",
|
| 188 |
+
"CONTEXT_ENGINEERING_CONFIGS",
|
| 189 |
+
"run_context_engineering_comparison",
|
| 190 |
+
# Shared utilities
|
| 191 |
+
"compute_pareto_frontier",
|
| 192 |
+
"generate_recommendation",
|
| 193 |
+
# Optimizer
|
| 194 |
+
"FlowOptimizer",
|
| 195 |
+
"OptimizationResult",
|
| 196 |
+
"ConfigSummary",
|
| 197 |
+
"TaskResult",
|
| 198 |
+
"generate_grid_configs",
|
| 199 |
+
"load_tasks_from_jsonl",
|
| 200 |
+
# Config export
|
| 201 |
+
"export_config",
|
| 202 |
+
"load_config",
|
| 203 |
+
"export_optimization_configs",
|
| 204 |
+
]
|
src/flow/experiments/ablation.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Ablation runner for comparing Flow agent configurations.
|
| 4 |
+
|
| 5 |
+
This module provides:
|
| 6 |
+
- AblationConfig: Dataclass for agent configuration parameters
|
| 7 |
+
- Pareto analysis utilities for multi-objective optimization
|
| 8 |
+
- Pre-defined configurations for context engineering strategies
|
| 9 |
+
- Convenience functions for running ablation studies
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
from dataclasses import asdict, dataclass
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import TYPE_CHECKING
|
| 20 |
+
|
| 21 |
+
from .evaluators import HeuristicEvaluator
|
| 22 |
+
from .metrics import TraceMetrics, extract_metrics, metrics_to_dict
|
| 23 |
+
from .reporters import print_comparison_table, save_run_result
|
| 24 |
+
from .runner import FlowExperimentRunner, setup_tracing
|
| 25 |
+
from .types import EvalCriterion, RunResult, Task
|
| 26 |
+
|
| 27 |
+
if TYPE_CHECKING:
|
| 28 |
+
from flow.harness.maf import MAFHarness
|
| 29 |
+
|
| 30 |
+
from .optimizer import ConfigSummary
|
| 31 |
+
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class AblationConfig:
    """Configuration for a single ablation run.

    Each config represents a different agent configuration to test.
    The name is used as an identifier in comparison results, and also as a
    subdirectory name when results are saved, so it should be filesystem-safe.

    Attributes:
        name: Unique identifier for this configuration
        enable_message_compaction: Whether to enable message compaction
        enable_memory_tool: Whether to enable agent-managed memory
        enable_sub_agent: Whether to enable sub-agent for isolated research
        compaction_head_size: Number of initial messages to keep
        compaction_tail_size: Number of recent messages to keep
        bash_timeout: Timeout for bash commands in seconds
    """

    name: str
    enable_message_compaction: bool = True
    enable_memory_tool: bool = True
    enable_sub_agent: bool = False
    # NOTE(review): head/tail sizes are always forwarded to the harness;
    # presumably they only take effect when compaction is enabled — confirm
    # against MAFHarness.
    compaction_head_size: int = 10
    compaction_tail_size: int = 40
    bash_timeout: int = 120
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@dataclass
class AblationResult:
    """Result of a single ablation run.

    Contains all data from the run including raw results,
    extracted metrics, and evaluation scores.

    Attributes:
        config: The configuration this run was executed with
        run_result: Raw run output (success flag, duration, trace, ...)
        metrics: Metrics extracted from the run's trace
        eval_score: Score assigned by the evaluator
        eval_passed: Whether the evaluator judged the run as passing
        eval_reasoning: The evaluator's explanation for the score
    """

    config: AblationConfig
    run_result: RunResult
    metrics: TraceMetrics
    eval_score: float
    eval_passed: bool
    eval_reasoning: str
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def create_harness_from_config(config: AblationConfig, workspace: Path) -> MAFHarness:
    """Build a MAFHarness configured according to an ablation config.

    Args:
        config: The ablation configuration to translate into harness options
        workspace: Working directory; agent memory lives under ``workspace/memory``

    Returns:
        A configured MAFHarness
    """
    # Imported lazily so importing this module does not require the MAF stack.
    from flow.harness.maf import MAFHarness

    harness = MAFHarness(
        workspace=workspace,
        memory_path=workspace / "memory",
        enable_compaction=config.enable_message_compaction,
        enable_memory_tool=config.enable_memory_tool,
        enable_sub_agent=config.enable_sub_agent,
        compaction_head_size=config.compaction_head_size,
        compaction_tail_size=config.compaction_tail_size,
        bash_timeout=config.bash_timeout,
    )
    return harness
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
async def run_single_ablation(
    config: AblationConfig,
    task: Task,
    workspace: Path,
) -> AblationResult:
    """Run one ablation configuration with trace capture and evaluation.

    Args:
        config: The ablation configuration
        task: The task to run
        workspace: Working directory

    Returns:
        AblationResult with metrics and evaluation
    """
    # Build the harness first so the finally-block can always close it.
    harness = create_harness_from_config(config, workspace)

    try:
        # Keep the workspace around so artifacts can be inspected afterwards.
        experiment_runner = FlowExperimentRunner(keep_workspace=True)

        run_result = await experiment_runner.run(harness, task, workspace=workspace)

        # Derive metrics from the captured trace.
        trace_metrics = extract_metrics(run_result.trace)

        # Score the run with the rule-based evaluator.
        heuristic = HeuristicEvaluator()
        evaluation = await heuristic.evaluate(run_result)

        return AblationResult(
            config=config,
            run_result=run_result,
            metrics=trace_metrics,
            eval_score=evaluation.score,
            eval_passed=evaluation.passed,
            eval_reasoning=evaluation.reasoning,
        )
    finally:
        # Release harness resources even when the run or evaluation fails.
        await harness.close()
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def save_ablation_result(result: AblationResult, output_dir: Path) -> None:
    """Save an ablation result to files.

    Creates a subdirectory named after the config (``output_dir / config.name``)
    with the standard run-result files plus an ``ablation.json`` containing the
    config parameters and evaluation scores.

    Args:
        result: The ablation result to save
        output_dir: Base directory for output
    """
    config_dir = output_dir / result.config.name
    # Ensure the directory exists before any file is written; previously this
    # relied on save_run_result creating it.
    config_dir.mkdir(parents=True, exist_ok=True)
    save_run_result(
        result.run_result,
        config_dir,
        metrics=result.metrics,
    )

    # Save ablation-specific data alongside the standard run files.
    payload = {
        "config": asdict(result.config),
        "evaluation": {
            "score": result.eval_score,
            "passed": result.eval_passed,
            "reasoning": result.eval_reasoning,
        },
    }
    with open(config_dir / "ablation.json", "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
async def run_ablations(
    configs: list[AblationConfig],
    task_prompt: str,
    output_dir: Path | None = None,
    task_name: str = "ablation_task",
) -> list[AblationResult]:
    """Run multiple ablation configurations and compare.

    This function:
    1. Sets up tracing
    2. Runs each configuration on the same task
    3. Collects metrics and evaluation scores
    4. Saves results and prints comparison

    Side effects: writes ``config.json``, per-config result directories, and
    ``comparison.json`` under a timestamped subdirectory of *output_dir*, and
    prints progress to stdout. Configurations run sequentially, each in its
    own workspace directory.

    Args:
        configs: List of configurations to test
        task_prompt: The task prompt to run
        output_dir: Base directory for output (default: ~/.flow/ablations)
        task_name: Name for the task (used in file paths)

    Returns:
        List of ablation results, in the same order as *configs*
    """
    # Setup output directory — every invocation gets its own timestamped dir.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if output_dir is None:
        output_dir = Path.home() / ".flow" / "ablations"
    output_dir = output_dir / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create task — all configs share the same task and pass/fail criterion.
    task = Task(
        name=task_name,
        prompt=task_prompt,
        criteria=[
            EvalCriterion(
                name="completion",
                instruction="The task should be completed successfully",
            ),
        ],
    )

    # Save configs (synchronous file I/O is accepted here; see noqa).
    with open(output_dir / "config.json", "w") as f:  # noqa: ASYNC230
        json.dump({
            "task": task_prompt,
            "timestamp": timestamp,
            "configs": [asdict(c) for c in configs],
        }, f, indent=2)

    print("=" * 80)
    print(" FLOW ABLATION RUNNER")
    print("=" * 80)
    print(f" Task: {task_prompt[:60]}{'...' if len(task_prompt) > 60 else ''}")
    print(f" Configs: {len(configs)}")
    print(f" Output: {output_dir}")
    print("=" * 80)

    # Setup tracing once for all runs.
    setup_tracing("flow-ablation")

    results = []
    for i, config in enumerate(configs, 1):
        print(f"\n[{i}/{len(configs)}] Running: {config.name}")
        print("-" * 40)

        # Each config gets its own workspace to avoid cross-contamination.
        workspace = output_dir / config.name / "workspace"
        workspace.mkdir(parents=True, exist_ok=True)

        result = await run_single_ablation(
            config=config,
            task=task,
            workspace=workspace,
        )

        results.append(result)
        # Persist immediately so partial results survive a later failure.
        save_ablation_result(result, output_dir)

        # Quick status
        status = "OK" if result.run_result.success else "FAIL"
        print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
              f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")

    # Save comparison — a flat summary across all configs.
    comparison_data = [
        {
            "name": r.config.name,
            "success": r.run_result.success,
            "duration_seconds": r.run_result.duration_seconds,
            "metrics": metrics_to_dict(r.metrics),
            "evaluation": {
                "score": r.eval_score,
                "passed": r.eval_passed,
            },
        }
        for r in results
    ]

    with open(output_dir / "comparison.json", "w") as f:  # noqa: ASYNC230
        json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)

    # Print comparison
    print_comparison_table(comparison_data, "Ablation Comparison")

    print(f"\nResults saved to: {output_dir}")

    return results
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# =============================================================================
|
| 284 |
+
# Context Engineering Baseline Configurations
|
| 285 |
+
# =============================================================================
|
| 286 |
+
# These configurations demonstrate the three main context engineering strategies:
|
| 287 |
+
# 1. Compaction - Reactive trimming via message stores
|
| 288 |
+
# 2. Agent-Managed Memory - Agent controls when to write/read/delete
|
| 289 |
+
# 3. Isolation - Sub-agent architecture prevents context pollution
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
# Baseline: No context engineering (for comparison)
|
| 293 |
+
CONTEXT_ENG_BASELINE = AblationConfig(
    name="no_context_engineering",
    enable_message_compaction=False,
    enable_memory_tool=False,
    enable_sub_agent=False,
)

# Strategy 1: Compaction via Message Stores
# Uses HeadTailCompactingMessageStore to keep first N + last M messages
# Good for: Long-running sessions where middle context is less important
COMPACTION_ONLY = AblationConfig(
    name="compaction_only",
    enable_message_compaction=True,
    enable_memory_tool=False,
    enable_sub_agent=False,
    compaction_head_size=10,  # Keep task context
    compaction_tail_size=40,  # Keep recent work
)

# Strategy 2: Agent-Managed Memory
# Agent decides when to save/retrieve information from persistent storage
# Good for: Cross-session memory, learning patterns, storing decisions
AGENT_MEMORY_ONLY = AblationConfig(
    name="agent_memory_only",
    enable_message_compaction=False,
    enable_memory_tool=True,
    enable_sub_agent=False,
)

# Strategy 3: Isolation via Sub-Agent
# Delegate heavy research to sub-agent with isolated context
# Good for: Complex research tasks that would pollute main context
ISOLATION_ONLY = AblationConfig(
    name="isolation_only",
    enable_message_compaction=False,
    enable_memory_tool=False,
    enable_sub_agent=True,
)

# Combined: All context engineering strategies
# Uses compaction + memory + isolation together
# Good for: Production systems with long-running, complex tasks
ALL_CONTEXT_ENGINEERING = AblationConfig(
    name="all_context_engineering",
    enable_message_compaction=True,
    enable_memory_tool=True,
    enable_sub_agent=True,
    compaction_head_size=10,
    compaction_tail_size=40,
)

# Predefined list for running context engineering comparison.
# Order matters only for presentation: baseline first, combined last.
CONTEXT_ENGINEERING_CONFIGS = [
    CONTEXT_ENG_BASELINE,
    COMPACTION_ONLY,
    AGENT_MEMORY_ONLY,
    ISOLATION_ONLY,
    ALL_CONTEXT_ENGINEERING,
]
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
async def run_context_engineering_comparison(
    task_prompt: str,
    output_dir: Path | None = None,
) -> list[AblationResult]:
    """Compare all context engineering strategies on a single task.

    Convenience wrapper around :func:`run_ablations` that runs every
    predefined configuration in ``CONTEXT_ENGINEERING_CONFIGS`` against
    the given task prompt.

    Args:
        task_prompt: The task to run (should benefit from context management)
        output_dir: Optional output directory for results

    Returns:
        List of AblationResult, one per strategy

    Example:
        >>> results = await run_context_engineering_comparison(
        ...     "Research the authentication patterns in this codebase and "
        ...     "create a summary document with recommendations."
        ... )
    """
    results = await run_ablations(
        configs=CONTEXT_ENGINEERING_CONFIGS,
        task_prompt=task_prompt,
        output_dir=output_dir,
        task_name="context_engineering_comparison",
    )
    return results
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
# =============================================================================
|
| 385 |
+
# Shared Utilities for Pareto Analysis
|
| 386 |
+
# =============================================================================
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def compute_pareto_frontier(
    summaries: list[ConfigSummary],
    score_key: str = "avg_score",
    cost_key: str = "avg_tokens",
) -> list[str]:
    """Compute the Pareto frontier for multi-objective optimization.

    Identifies configurations that are not dominated by any other
    configuration. A config is dominated if another config has a score at
    least as high and a cost no higher, with at least one strictly better.

    Args:
        summaries: List of ConfigSummary objects (or dicts with score/token keys)
        score_key: Attribute name for the score metric (higher is better)
        cost_key: Attribute name for the cost metric (lower is better)

    Returns:
        Names of Pareto-optimal configurations, in ascending cost order
    """

    def get_val(s: object, key: str) -> float:
        # Accept both plain dicts and ConfigSummary-like objects.
        if isinstance(s, dict):
            return float(s.get(key, 0))
        return float(getattr(s, key, 0))

    def get_name(s: object) -> str:
        if isinstance(s, dict):
            return str(s.get("name", ""))
        return str(getattr(s, "name", ""))

    # Sort by cost ascending and break cost ties by score descending.
    # Without the tiebreaker, a config with the same cost but a lower score
    # could be scanned first and wrongly included on the frontier.
    sorted_summaries = sorted(
        summaries,
        key=lambda s: (get_val(s, cost_key), -get_val(s, score_key)),
    )

    pareto_names = []
    best_score = float("-inf")  # Correct for any score range, not just [0, 1].

    # Scanning in ascending cost order, a config is on the frontier exactly
    # when its score strictly exceeds every cheaper (or equal-cost) config.
    for summary in sorted_summaries:
        score = get_val(summary, score_key)
        if score > best_score:
            pareto_names.append(get_name(summary))
            best_score = score

    return pareto_names
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
def generate_recommendation(
    summaries: list[ConfigSummary],
    pareto_names: list[str],
    min_score: float = 0.7,
) -> tuple[str | None, str]:
    """Pick a recommended configuration from Pareto analysis results.

    Filters out configs scoring below *min_score*, prefers Pareto-optimal
    ones when any survive the filter, and among the remaining candidates
    recommends the cheapest (lowest average token count).

    Args:
        summaries: List of ConfigSummary objects (or dicts)
        pareto_names: Names of Pareto-optimal configs
        min_score: Minimum acceptable score threshold

    Returns:
        Tuple of (recommended_config_name, recommendation_text); the name
        is None when no config clears the threshold.
    """

    def _value(item: object, key: str) -> float:
        # Support both plain dicts and ConfigSummary-like objects.
        if isinstance(item, dict):
            return float(item.get(key, 0))
        return float(getattr(item, key, 0))

    def _label(item: object) -> str:
        if isinstance(item, dict):
            return str(item.get("name", ""))
        return str(getattr(item, "name", ""))

    # Drop everything below the quality bar.
    acceptable = [s for s in summaries if _value(s, "avg_score") >= min_score]
    if not acceptable:
        return None, "No configuration met the minimum score threshold."

    # Restrict to Pareto-optimal configs when any of them are acceptable.
    on_frontier = [s for s in acceptable if _label(s) in pareto_names]
    candidates = on_frontier or acceptable

    # The cheapest remaining candidate wins.
    best = min(candidates, key=lambda s: _value(s, "avg_tokens"))
    name = _label(best)
    tokens = _value(best, "avg_tokens")
    score = _value(best, "avg_score")

    return name, f"Recommended: {name} (avg {tokens:.0f} tokens, {score:.2f} score)"
|
src/flow/experiments/config_export.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Config export/import utilities for optimizer results.
|
| 4 |
+
|
| 5 |
+
Exports winning configurations as YAML files that can be loaded
|
| 6 |
+
and used directly with `flow run --config <path>`.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from dataclasses import asdict
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
import yaml
|
| 16 |
+
|
| 17 |
+
from .ablation import AblationConfig
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def export_config(
    config: AblationConfig,
    metrics: dict[str, Any],
    path: Path,
) -> None:
    """Write an AblationConfig to a reusable YAML file.

    The exported YAML contains every config parameter (directly loadable
    by :func:`load_config`) plus an ``_optimization`` block of metadata
    that loaders ignore because of the leading underscore.

    Args:
        config: The AblationConfig to export
        metrics: Optimization metrics (score, tokens, etc.)
        path: Path to write the YAML file

    Example output:
        name: compaction_head10_tail40
        enable_message_compaction: true
        compaction_head_size: 10
        ...
        _optimization:
          timestamp: "2026-01-26T14:30:22"
          avg_score: 0.89
          avg_tokens: 12400
    """
    payload = asdict(config)
    payload["_optimization"] = metrics
    # Make sure the target directory exists before writing.
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = yaml.dump(payload, default_flow_style=False, sort_keys=False)
    path.write_text(serialized)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def load_config(path: Path) -> AblationConfig:
    """Load an AblationConfig from a YAML file.

    Ignores any keys prefixed with _ (optimization metadata).

    Args:
        path: Path to the YAML config file

    Returns:
        AblationConfig instance

    Raises:
        FileNotFoundError: If the config file doesn't exist
        ValueError: If the config is empty, not a mapping, or has invalid keys
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    data = yaml.safe_load(path.read_text())

    # safe_load returns None for an empty file and may return a scalar or
    # list for malformed input; only a mapping can become a config. Without
    # this check, data.items() below raises AttributeError instead of the
    # documented ValueError.
    if not isinstance(data, dict):
        raise ValueError(
            f"Invalid config file {path}: expected a YAML mapping, "
            f"got {type(data).__name__}"
        )

    # Filter out metadata keys (prefixed with _)
    config_data = {k: v for k, v in data.items() if not k.startswith("_")}

    try:
        return AblationConfig(**config_data)
    except TypeError as e:
        raise ValueError(f"Invalid config file {path}: {e}") from e
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def export_optimization_configs(
    summaries: list[dict[str, Any]],
    pareto_names: list[str],
    output_dir: Path,
    timestamp: str,
) -> dict[str, Path]:
    """Export all notable configs from an optimization run.

    Writes the following under ``output_dir / "configs"``:
    - best_score.yaml: Highest quality config
    - best_cost.yaml: Lowest token usage config
    - best_efficiency.yaml: Best score/token ratio
    - pareto/<name>.yaml: All Pareto-optimal configs

    Args:
        summaries: List of ConfigSummary dicts with metrics
        pareto_names: Names of Pareto-optimal configs
        output_dir: Directory to write configs
        timestamp: Optimization timestamp for metadata

    Returns:
        Dict mapping config type to file path (empty when no summaries given)
    """
    configs_dir = output_dir / "configs"
    configs_dir.mkdir(parents=True, exist_ok=True)

    exported: dict[str, Path] = {}

    if not summaries:
        return exported

    # Single-criterion winners, keyed by their export label.
    winners = {
        "best_score": max(summaries, key=lambda s: s.get("avg_score", 0)),
        "best_cost": min(summaries, key=lambda s: s.get("avg_tokens", float("inf"))),
        "best_efficiency": max(
            summaries,
            key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
        ),
    }
    for label, summary in winners.items():
        target = configs_dir / f"{label}.yaml"
        export_config(
            _summary_to_config(summary),
            _extract_metrics(summary, timestamp, label),
            target,
        )
        exported[label] = target

    # Everything on the Pareto frontier goes into its own subdirectory.
    pareto_dir = configs_dir / "pareto"
    pareto_dir.mkdir(exist_ok=True)

    for summary in summaries:
        name = summary.get("name", "unknown")
        if name not in pareto_names:
            continue
        metrics = _extract_metrics(summary, timestamp, "pareto")
        metrics["is_pareto_optimal"] = True
        target = pareto_dir / f"{name}.yaml"
        export_config(_summary_to_config(summary), metrics, target)
        exported[f"pareto/{name}"] = target

    return exported
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def _summary_to_config(summary: dict[str, Any]) -> AblationConfig:
    """Convert a summary dict back to an AblationConfig."""
    # Known config fields and the defaults to use when a field is absent
    # from the flat summary.
    defaults = {
        "enable_message_compaction": True,
        "enable_memory_tool": True,
        "enable_sub_agent": False,
        "compaction_head_size": 10,
        "compaction_tail_size": 40,
        "bash_timeout": 120,
    }
    config_fields: dict[str, Any] = {"name": summary.get("name", "unknown")}
    for field_name, default in defaults.items():
        config_fields[field_name] = summary.get(field_name, default)

    # A nested "config" dict, when present, takes precedence over the
    # flat fields above.
    if "config" in summary:
        config_fields.update(summary["config"])

    return AblationConfig(**config_fields)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def _extract_metrics(
    summary: dict[str, Any],
    timestamp: str,
    selection_reason: str,
) -> dict[str, Any]:
    """Extract optimization metrics from a summary."""
    metrics: dict[str, Any] = {
        "timestamp": timestamp,
        "selection_reason": selection_reason,
    }
    # Numeric metrics default to 0 when missing from the summary.
    for key in ("avg_score", "avg_tokens", "avg_duration", "pass_rate"):
        metrics[key] = summary.get(key, 0)
    # pareto_rank defaults to None; the optimal flag defaults to False.
    metrics["pareto_rank"] = summary.get("pareto_rank")
    metrics["is_pareto_optimal"] = summary.get("is_pareto_optimal", False)
    return metrics
|
src/flow/experiments/evaluators/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Evaluators for the experiments framework."""
|
| 4 |
+
|
| 5 |
+
from .base import Evaluator
|
| 6 |
+
from .composite import CompositeEvaluator
|
| 7 |
+
from .heuristic import HeuristicEvaluator
|
| 8 |
+
from .llm import LLMEvaluator
|
| 9 |
+
from .trace import TraceEvaluator
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"CompositeEvaluator",
|
| 13 |
+
"Evaluator",
|
| 14 |
+
"HeuristicEvaluator",
|
| 15 |
+
"LLMEvaluator",
|
| 16 |
+
"TraceEvaluator",
|
| 17 |
+
]
|
src/flow/experiments/evaluators/base.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Base evaluator protocol for the experiments framework."""
|
| 4 |
+
|
| 5 |
+
from typing import Protocol
|
| 6 |
+
|
| 7 |
+
from ..types import EvalResult, RunResult
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Evaluator(Protocol):
    """Structural interface for judging the outcome of an agent run.

    An evaluator consumes a RunResult and produces an EvalResult carrying
    scores and a pass/fail verdict.

    Known implementations:
    - TraceEvaluator: objective trace metrics (tokens, duration, tool calls)
    - LLMEvaluator: LLM-as-judge quality assessment
    - HeuristicEvaluator: rule-based checks (files created, syntax, etc.)
    - CompositeEvaluator: weighted combination of other evaluators
    """

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Score a single agent run.

        Args:
            run_result: Outcome of running an agent on one task

        Returns:
            EvalResult with the score, verdict, and reasoning
        """
        ...
|
src/flow/experiments/evaluators/composite.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Composite evaluator that combines multiple evaluators."""
|
| 4 |
+
|
| 5 |
+
from typing import TYPE_CHECKING
|
| 6 |
+
|
| 7 |
+
from ..types import EvalResult, RunResult
|
| 8 |
+
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from .base import Evaluator
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CompositeEvaluator:
    """Combine several evaluators into a single verdict.

    The combined score is the weighted mean of the member scores, and the
    combined verdict passes only when every member passes. Useful for
    mixing strategies, e.g. trace limits plus heuristics:

        evaluator = CompositeEvaluator(
            [TraceEvaluator(max_tokens=5000), HeuristicEvaluator()],
            weights=[0.3, 0.7],
        )
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        evaluators: list["Evaluator"],
        weights: list[float] | None = None,
    ) -> None:
        """Store the member evaluators and their weights.

        Args:
            evaluators: Evaluators whose results will be combined
            weights: Per-evaluator weights; defaults to equal weighting

        Raises:
            ValueError: If the number of weights differs from the number
                of evaluators
        """
        self.evaluators = evaluators
        # An omitted/empty weights argument means equal weighting.
        self.weights = weights or [1.0] * len(evaluators)

        if len(self.weights) != len(self.evaluators):
            raise ValueError("Number of weights must match number of evaluators")

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Run every member evaluator and merge their results.

        The overall score is the weighted average of member scores; the
        overall verdict passes only when ALL members pass.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            Combined EvalResult
        """
        merged_criteria = []
        reasonings = []
        weighted_sum = 0.0
        weight_total = sum(self.weights)
        combined_pass = True

        for member, weight in zip(self.evaluators, self.weights, strict=True):
            partial = await member.evaluate(run_result)
            merged_criteria.extend(partial.criteria_results)
            weighted_sum += partial.score * weight
            combined_pass = combined_pass and partial.passed
            if partial.reasoning:
                reasonings.append(partial.reasoning)

        return EvalResult(
            score=weighted_sum / weight_total if weight_total > 0 else 0.0,
            passed=combined_pass,
            criteria_results=merged_criteria,
            reasoning=" | ".join(reasonings),
        )
|
src/flow/experiments/evaluators/heuristic.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Heuristic evaluator using rule-based assessment."""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import subprocess
|
| 7 |
+
|
| 8 |
+
from ..types import CriterionResult, EvalResult, RunResult
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HeuristicEvaluator:
    """Evaluator that uses heuristic rules to assess agent output.

    This evaluator checks:
    1. Were files created?
    2. Do Python files have valid syntax?
    3. Did the agent report completion?
    4. Does the output match expected patterns based on the task?

    Useful for quick, deterministic evaluation without LLM calls.

    Scoring model: each satisfied check adds 0.25 to a cumulative score
    (the generic files-created fallback adds 0.125); an execution error
    subtracts 0.25; the final score is clamped to [0.0, 1.0].

    Example:
        evaluator = HeuristicEvaluator(passing_threshold=0.5)
        result = await evaluator.evaluate(run_result)
        print(f"Score: {result.score}, Passed: {result.passed}")
    """

    def __init__(self, passing_threshold: float = 0.5) -> None:
        """Initialize the heuristic evaluator.

        Args:
            passing_threshold: Minimum score to pass (0.0 to 1.0)
        """
        self.passing_threshold = passing_threshold

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output using heuristic rules.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with heuristic-based scores
        """
        criteria_results = []  # per-check CriterionResult entries
        notes = []  # human-readable fragments joined into the final reasoning
        score = 0.0  # cumulative score, clamped to [0, 1] at the end

        # Check 1 (+0.25): were any files created?
        if run_result.files_created:
            criteria_results.append(
                CriterionResult(
                    name="files_created",
                    score=1.0,
                    passed=True,
                    reasoning=f"Created {len(run_result.files_created)} file(s)",
                )
            )
            score += 0.25
            notes.append(f"Created {len(run_result.files_created)} file(s)")
        else:
            criteria_results.append(
                CriterionResult(
                    name="files_created",
                    score=0.0,
                    passed=False,
                    reasoning="No files created",
                )
            )
            notes.append("No files created")

        # Check 2 (+0.25): did the agent report task completion?
        # NOTE(review): substring match is loose — "complete" also matches
        # "incomplete"; confirm this is intended.
        output_lower = run_result.output.lower()
        if "task_done" in output_lower or "complete" in output_lower or "finished" in output_lower:
            criteria_results.append(
                CriterionResult(
                    name="task_completed",
                    score=1.0,
                    passed=True,
                    reasoning="Agent reported completion",
                )
            )
            score += 0.25
            notes.append("Agent reported completion")
        else:
            criteria_results.append(
                CriterionResult(
                    name="task_completed",
                    score=0.0,
                    passed=False,
                    reasoning="Agent did not report completion",
                )
            )

        # Check 3 (+0.25): do created Python files compile?
        # Only the first 5 .py files are checked, each with a 5s timeout.
        python_files = [f for f in run_result.files_created if f.endswith(".py")]
        if python_files:
            all_valid = True
            syntax_notes = []
            for py_file in python_files[:5]:  # Check up to 5 files
                file_path = run_result.workspace / py_file
                if file_path.exists():
                    try:
                        # Blocking subprocess call inside an async method
                        # (noqa ASYNC221 acknowledges this trade-off).
                        result = subprocess.run(  # noqa: ASYNC221, S603
                            ["python3", "-m", "py_compile", str(file_path)],  # noqa: S607
                            capture_output=True,
                            timeout=5,
                        )
                        if result.returncode != 0:
                            all_valid = False
                            syntax_notes.append(f"Syntax error in {py_file}")
                    except subprocess.TimeoutExpired:
                        # Timeout is noted but does NOT set all_valid=False,
                        # so the criterion is recorded as failed (score 0)
                        # via the elif branch below.
                        syntax_notes.append(f"Timeout checking {py_file}")
                    except FileNotFoundError:
                        # python3 not available, skip syntax check
                        pass
                    except Exception as e:
                        all_valid = False
                        syntax_notes.append(f"Error checking {py_file}: {e}")

            # If nothing went wrong at all, award the syntax point; if
            # anything was noted, record a failing criterion. (When the
            # only event was a FileNotFoundError skip, no criterion is
            # recorded — wait, that path leaves syntax_notes empty and
            # all_valid True, so the passing branch applies.)
            if all_valid and not syntax_notes:
                criteria_results.append(
                    CriterionResult(
                        name="code_syntax",
                        score=1.0,
                        passed=True,
                        reasoning="Python files have valid syntax",
                    )
                )
                score += 0.25
                notes.append("Python files have valid syntax")
            elif syntax_notes:
                criteria_results.append(
                    CriterionResult(
                        name="code_syntax",
                        score=0.0,
                        passed=False,
                        reasoning="; ".join(syntax_notes),
                    )
                )
                notes.extend(syntax_notes)

        # Check 4 (+0.25, or +0.125 fallback): does the output mention
        # keywords expected for this kind of task?
        task_lower = run_result.task.prompt.lower()
        output_correct = False

        if "hello" in task_lower and "hello" in output_lower:
            output_correct = True
        elif "api" in task_lower and (
            "fastapi" in output_lower or "endpoint" in output_lower or "flask" in output_lower
        ):
            output_correct = True
        elif "http" in task_lower and ("server" in output_lower or "port" in output_lower):
            output_correct = True
        elif "test" in task_lower and ("pytest" in output_lower or "test" in output_lower):
            output_correct = True
        elif run_result.files_created:
            # Generic: if files created, give partial credit
            # (no output_relevance criterion is recorded in this case)
            score += 0.125

        if output_correct:
            criteria_results.append(
                CriterionResult(
                    name="output_relevance",
                    score=1.0,
                    passed=True,
                    reasoning="Output matches expected patterns for task",
                )
            )
            score += 0.25

        # Penalty (-0.25, floored at 0): a reported execution error.
        if run_result.error:
            criteria_results.append(
                CriterionResult(
                    name="execution_success",
                    score=0.0,
                    passed=False,
                    reasoning=f"Execution failed: {run_result.error}",
                )
            )
            score = max(0.0, score - 0.25)

        final_score = min(score, 1.0)

        return EvalResult(
            score=final_score,
            passed=final_score >= self.passing_threshold,
            criteria_results=criteria_results,
            reasoning="; ".join(notes) if notes else "Heuristic evaluation complete",
        )
|
src/flow/experiments/evaluators/llm.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""LLM-as-judge evaluator for quality assessment."""
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from ..metrics import extract_metrics
|
| 10 |
+
from ..types import CriterionResult, EvalResult, RunResult
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LLMEvaluator:
    """Evaluator that uses an LLM to assess agent output against criteria.

    This implements the LLM-as-a-judge pattern, where a language model
    evaluates whether the agent's output meets specified criteria.

    Note: Requires a separate model client - not tied to FlowConfig.
    This allows using a different model for evaluation than for agent execution.

    Example:
        from openai import AsyncOpenAI

        client = AsyncOpenAI()
        evaluator = LLMEvaluator(
            model_client=client,
            model_name="gpt-4o",
            passing_threshold=0.7,
        )
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        model_client: Any,
        model_name: str = "gpt-4o",
        passing_threshold: float = 0.7,
    ) -> None:
        """Initialize the LLM evaluator.

        Args:
            model_client: An async client with chat.completions.create method
                (e.g., AsyncOpenAI, AsyncAzureOpenAI)
            model_name: Model name/deployment to use for evaluation
            passing_threshold: Minimum score to pass (0.0 to 1.0)
        """
        self.model_client = model_client
        self.model_name = model_name
        self.passing_threshold = passing_threshold

    def _get_evaluation_prompt(self, run_result: RunResult) -> str:
        """Build the evaluation prompt for the LLM."""
        # One bullet per task criterion, including its weight.
        criteria_text = "\n".join(
            f"- **{c.name}** (weight: {c.weight}): {c.instruction}"
            for c in run_result.task.criteria
        )

        # Extract execution trace summary for research/multi-step tasks
        trace_summary = self._get_trace_summary(run_result)

        # Agent output is truncated to 8000 characters to bound prompt size.
        return f"""You are an expert evaluator assessing an AI agent's output.

## Task
The agent was given this task:
```
{run_result.task.prompt}
```

## Agent Output
```
{run_result.output[:8000]}
```

## Files Created
{json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}

## Execution Trace
{trace_summary}

## Execution Status
{"Success" if run_result.success else f"Failed: {run_result.error}"}

## Evaluation Criteria
{criteria_text}

## Instructions
Evaluate the agent's output against each criterion. Consider both the final output AND the execution
trace (tools used, steps taken) when assessing correctness.

For each criterion:
1. Assess how well the output meets the criterion (0.0 to 1.0)
2. Determine if it passes (score >= 0.7)
3. Provide brief reasoning

Respond in this exact JSON format:
```json
{{
  "criteria_results": [
    {{
      "name": "criterion_name",
      "score": 0.85,
      "passed": true,
      "reasoning": "Brief explanation"
    }}
  ],
  "overall_reasoning": "Summary of the overall evaluation"
}}
```
"""

    def _get_trace_summary(self, run_result: RunResult) -> str:
        """Extract a summary of the execution trace for evaluation."""
        if not run_result.trace:
            return "No trace data available"

        metrics = extract_metrics(run_result.trace)

        # Build tool usage summary
        tool_summary = ""
        if metrics.tool_calls_by_name:
            tool_lines = [f"  - {name}: {count}x" for name, count in metrics.tool_calls_by_name.items()]
            tool_summary = "Tools used:\n" + "\n".join(tool_lines)
        else:
            tool_summary = "Tools used: None"

        return f"""Duration: {run_result.duration_seconds:.1f}s
LLM calls: {metrics.llm_call_count}
Total tool calls: {metrics.tool_call_count}
{tool_summary}
Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {metrics.output_tokens})"""

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output using an LLM.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with LLM-generated scores and reasoning; on any
            failure (API error, unparseable JSON) a zero-score failing
            result is returned rather than raising.
        """
        if not run_result.task.criteria:
            # No criteria to evaluate - return a default pass
            # (pass/fail simply mirrors the run's success flag)
            return EvalResult(
                score=1.0 if run_result.success else 0.0,
                passed=run_result.success,
                criteria_results=[],
                reasoning=(
                    "No evaluation criteria specified"
                    + ("" if run_result.success else f"; Error: {run_result.error}")
                ),
            )

        prompt = self._get_evaluation_prompt(run_result)

        try:
            response = await self.model_client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert evaluator. Respond only with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # Low temperature for consistent evaluation
            )

            # Extract the response text
            response_text = response.choices[0].message.content or ""

            # Parse JSON from response: take the outermost {...} span so a
            # ```json fence around the payload is tolerated.
            json_start = response_text.find("{")
            json_end = response_text.rfind("}") + 1
            if json_start >= 0 and json_end > json_start:
                eval_data = json.loads(response_text[json_start:json_end])
            else:
                raise ValueError("No JSON found in response")

            # Build criterion results
            criteria_results = []
            total_weighted_score = 0.0
            total_weight = 0.0

            for cr_data in eval_data.get("criteria_results", []):
                cr = CriterionResult(
                    name=cr_data.get("name", "unknown"),
                    score=float(cr_data.get("score", 0.0)),
                    passed=bool(cr_data.get("passed", False)),
                    reasoning=cr_data.get("reasoning", ""),
                )
                criteria_results.append(cr)

                # Find the weight for this criterion
                # (unmatched names fall back to weight 1.0)
                weight = 1.0
                for task_criterion in run_result.task.criteria:
                    if task_criterion.name == cr.name:
                        weight = task_criterion.weight
                        break

                total_weighted_score += cr.score * weight
                total_weight += weight

            # Calculate overall score
            overall_score = total_weighted_score / total_weight if total_weight > 0 else 0.0

            return EvalResult(
                score=overall_score,
                passed=overall_score >= self.passing_threshold,
                criteria_results=criteria_results,
                reasoning=eval_data.get("overall_reasoning", ""),
            )

        except Exception as e:
            # Broad catch by design: evaluation failures must not abort
            # the experiment run; they surface as a failing EvalResult.
            logger.error(f"LLM evaluation failed: {e}")
            return EvalResult(
                score=0.0,
                passed=False,
                criteria_results=[],
                reasoning=f"Evaluation failed: {e}",
            )
|
src/flow/experiments/evaluators/trace.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Trace-based evaluator for objective metrics assessment."""
|
| 4 |
+
|
| 5 |
+
from ..metrics import extract_metrics
|
| 6 |
+
from ..types import CriterionResult, EvalResult, RunResult
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TraceEvaluator:
    """Evaluator that assesses agent output based on trace metrics.

    This evaluator checks objective metrics from the execution trace,
    such as token usage, tool calls, and timing. All limits are optional -
    only specified limits are evaluated.

    Example:
        evaluator = TraceEvaluator(
            max_tokens=5000,
            max_tool_calls=20,
            max_duration_seconds=60.0,
        )
        result = await evaluator.evaluate(run_result)
        print(f"Passed: {result.passed}, Score: {result.score}")
    """

    def __init__(
        self,
        max_tokens: int | None = None,
        max_tool_calls: int | None = None,
        max_duration_seconds: float | None = None,
    ) -> None:
        """Initialize the trace evaluator.

        Args:
            max_tokens: Maximum allowed total tokens (None = no limit)
            max_tool_calls: Maximum allowed tool calls (None = no limit)
            max_duration_seconds: Maximum allowed duration (None = no limit)
        """
        self.max_tokens = max_tokens
        self.max_tool_calls = max_tool_calls
        self.max_duration_seconds = max_duration_seconds

    @staticmethod
    def _limit_check(actual: float, limit: float) -> tuple[bool, float]:
        """Score a single limit check.

        Returns:
            (passed, score): score is 1.0 when within the limit, and
            decreases linearly with the relative overage down to 0.0.
        """
        if actual <= limit:
            return True, 1.0
        overage = actual - limit
        return False, max(0.0, 1.0 - (overage / limit))

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output based on trace metrics.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with metric-based scores. The overall score is the
            unweighted mean of the recorded criteria (or 1.0/0.0 from the
            run's success flag when no criteria were produced); passing
            requires every configured limit to hold AND the run to succeed.
        """
        metrics = extract_metrics(run_result.trace)
        criteria_results = []
        all_passed = True

        # Token limit (only evaluated when configured)
        if self.max_tokens is not None:
            passed, score = self._limit_check(metrics.total_tokens, self.max_tokens)
            all_passed = all_passed and passed
            criteria_results.append(
                CriterionResult(
                    name="token_limit",
                    score=score,
                    passed=passed,
                    reasoning=f"Used {metrics.total_tokens} tokens (limit: {self.max_tokens})",
                )
            )

        # Tool call limit (only evaluated when configured)
        if self.max_tool_calls is not None:
            passed, score = self._limit_check(metrics.tool_call_count, self.max_tool_calls)
            all_passed = all_passed and passed
            criteria_results.append(
                CriterionResult(
                    name="tool_call_limit",
                    score=score,
                    passed=passed,
                    reasoning=f"Made {metrics.tool_call_count} tool calls (limit: {self.max_tool_calls})",
                )
            )

        # Duration limit (only evaluated when configured)
        if self.max_duration_seconds is not None:
            passed, score = self._limit_check(run_result.duration_seconds, self.max_duration_seconds)
            all_passed = all_passed and passed
            criteria_results.append(
                CriterionResult(
                    name="duration_limit",
                    score=score,
                    passed=passed,
                    reasoning=f"Took {run_result.duration_seconds:.2f}s (limit: {self.max_duration_seconds}s)",
                )
            )

        # A reported execution error always fails the evaluation.
        if run_result.error:
            all_passed = False
            criteria_results.append(
                CriterionResult(
                    name="execution_success",
                    score=0.0,
                    passed=False,
                    reasoning=f"Execution failed: {run_result.error}",
                )
            )

        # Errors recorded inside the trace lower the score (0.2 per error)
        # but do not by themselves flip all_passed.
        if metrics.error_count > 0:
            criteria_results.append(
                CriterionResult(
                    name="trace_errors",
                    score=max(0.0, 1.0 - (metrics.error_count * 0.2)),
                    passed=metrics.error_count == 0,
                    reasoning=f"Found {metrics.error_count} error(s) in trace",
                )
            )

        # Calculate overall score
        if criteria_results:
            overall_score = sum(cr.score for cr in criteria_results) / len(criteria_results)
        else:
            # No criteria specified - just check success
            overall_score = 1.0 if run_result.success else 0.0

        return EvalResult(
            score=overall_score,
            passed=all_passed and run_result.success,
            criteria_results=criteria_results,
            reasoning=f"Trace evaluation: {len(criteria_results)} criteria checked",
        )
|
src/flow/experiments/metrics.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Metrics extraction utilities for the experiments framework."""
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
class LLMCallInfo:
    """Information about a single LLM call."""

    # Model identifier reported for the call; "unknown" when absent.
    model: str = "unknown"
    # Prompt tokens consumed by this call.
    input_tokens: int = 0
    # Completion tokens produced by this call.
    output_tokens: int = 0
    # Finish reason string (empty when not reported).
    finish_reason: str = ""
    # Wall-clock duration of the call in milliseconds.
    duration_ms: float = 0.0
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class ToolCallInfo:
    """Information about a single tool call, extracted from one trace span."""

    name: str = "unknown"  # gen_ai.tool.name, or a name inferred from the operation name
    duration_ms: float = 0.0  # span duration in milliseconds
    call_id: str = ""  # gen_ai.tool.call.id; empty for spans matched by the fallback heuristic
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class TraceMetrics:
    """Objective metrics extracted from execution traces.

    These are factual measurements from the trace, not subjective assessments.

    Attributes:
        total_tokens: Total tokens used (input + output)
        input_tokens: Input/prompt tokens used
        output_tokens: Output/completion tokens used
        tool_call_count: Number of tool calls made
        tool_calls_by_name: Count of calls per tool name
        llm_call_count: Number of LLM API calls
        total_duration_ms: Total execution time in milliseconds
        llm_duration_ms: Time spent in LLM calls
        tool_duration_ms: Time spent in tool calls
        span_count: Total number of trace spans
        error_count: Number of error spans
        llm_calls: Detailed info for each LLM call
        tool_calls: Detailed info for each tool call
    """

    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    tool_call_count: int = 0
    # Mutable defaults go through field(default_factory=...) so instances do not share state.
    tool_calls_by_name: dict[str, int] = field(default_factory=dict)
    llm_call_count: int = 0
    total_duration_ms: float = 0.0
    llm_duration_ms: float = 0.0
    tool_duration_ms: float = 0.0
    span_count: int = 0
    error_count: int = 0
    llm_calls: list[LLMCallInfo] = field(default_factory=list)
    tool_calls: list[ToolCallInfo] = field(default_factory=list)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def extract_metrics(trace: list[dict[str, Any]]) -> TraceMetrics:
    """Extract objective metrics from a trace.

    Parses OpenTelemetry semantic conventions for GenAI:
    - gen_ai.operation.name == "chat" for LLM calls
    - gen_ai.usage.input_tokens / output_tokens for token counts
    - gen_ai.operation.name == "execute_tool" for tool calls
    - gen_ai.tool.name for tool identification

    Spans that carry no gen_ai.operation.name are classified by a fallback
    heuristic (operation-name substrings and legacy attribute keys).

    Args:
        trace: List of trace span dictionaries

    Returns:
        TraceMetrics with extracted values
    """
    metrics = TraceMetrics()
    metrics.span_count = len(trace)

    for span in trace:
        data = span.get("data", {})
        attributes = data.get("attributes", {})
        operation_name = data.get("operation_name", "")
        # "or 0" coerces an explicit None duration to 0.
        duration_ms = data.get("duration_ms", 0) or 0

        # Check for errors
        status = data.get("status", "")
        if "ERROR" in str(status).upper():
            metrics.error_count += 1

        # Check for LLM operations (gen_ai.operation.name = "chat")
        if attributes.get("gen_ai.operation.name") == "chat":
            # "or 0" coerces explicit None token counts to 0 before int().
            input_tokens = attributes.get("gen_ai.usage.input_tokens", 0) or 0
            output_tokens = attributes.get("gen_ai.usage.output_tokens", 0) or 0

            metrics.llm_call_count += 1
            metrics.input_tokens += int(input_tokens)
            metrics.output_tokens += int(output_tokens)
            metrics.llm_duration_ms += duration_ms

            metrics.llm_calls.append(LLMCallInfo(
                model=attributes.get("gen_ai.request.model", "unknown"),
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens),
                # finish_reasons may be a list per the OTel convention; kept as str() of whatever is present.
                finish_reason=str(attributes.get("gen_ai.response.finish_reasons", "")),
                duration_ms=duration_ms,
            ))

        # Check for tool executions
        elif attributes.get("gen_ai.operation.name") == "execute_tool":
            # Fall back to the span's operation name when gen_ai.tool.name is absent.
            tool_name = attributes.get("gen_ai.tool.name", operation_name)

            metrics.tool_call_count += 1
            metrics.tool_duration_ms += duration_ms
            metrics.tool_calls_by_name[tool_name] = metrics.tool_calls_by_name.get(tool_name, 0) + 1

            metrics.tool_calls.append(ToolCallInfo(
                name=tool_name,
                duration_ms=duration_ms,
                call_id=attributes.get("gen_ai.tool.call.id", ""),
            ))

        # Also check for generic tool patterns (fallback) — only for spans
        # with no gen_ai.operation.name at all, so conforming spans are never
        # double-classified by this branch.
        elif not attributes.get("gen_ai.operation.name"):
            is_tool_call = (
                "tool" in operation_name.lower()
                or attributes.get("tool.name")
                or attributes.get("gen_ai.tool.name")
                or "function_call" in operation_name.lower()
            )

            if is_tool_call:
                tool_name = (
                    attributes.get("tool.name")
                    or attributes.get("gen_ai.tool.name")
                    or _extract_tool_name_from_operation(operation_name)
                    or "unknown"
                )
                metrics.tool_call_count += 1
                metrics.tool_duration_ms += duration_ms
                metrics.tool_calls_by_name[tool_name] = metrics.tool_calls_by_name.get(tool_name, 0) + 1

                metrics.tool_calls.append(ToolCallInfo(
                    name=tool_name,
                    duration_ms=duration_ms,
                    call_id="",
                ))

            # Check for token counts in non-chat spans (fallback)
            input_tokens = (
                attributes.get("gen_ai.usage.input_tokens")
                or attributes.get("llm.token_count.prompt")
                or attributes.get("input_tokens")
            )
            output_tokens = (
                attributes.get("gen_ai.usage.output_tokens")
                or attributes.get("llm.token_count.completion")
                or attributes.get("output_tokens")
            )

            # NOTE(review): a span in this fallback branch that carries both
            # tool attributes and token counts is counted as BOTH a tool call
            # and an LLM call, and its duration is added to both duration
            # totals — confirm this double-counting is intended for
            # non-conforming traces.
            if input_tokens or output_tokens:
                metrics.input_tokens += int(input_tokens or 0)
                metrics.output_tokens += int(output_tokens or 0)
                metrics.llm_call_count += 1
                metrics.llm_duration_ms += duration_ms

        # Track total duration from root span (no parent). If the trace has
        # several roots, the longest one wins.
        if not data.get("parent_span_id"):
            metrics.total_duration_ms = max(metrics.total_duration_ms, duration_ms)

    # Calculate total tokens
    metrics.total_tokens = metrics.input_tokens + metrics.output_tokens

    return metrics
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def _extract_tool_name_from_operation(operation_name: str) -> str | None:
|
| 182 |
+
"""Try to extract a tool name from an operation name.
|
| 183 |
+
|
| 184 |
+
Args:
|
| 185 |
+
operation_name: The span operation name
|
| 186 |
+
|
| 187 |
+
Returns:
|
| 188 |
+
Extracted tool name or None
|
| 189 |
+
"""
|
| 190 |
+
# Common patterns: "tool:read_file", "execute_tool:write_file", "function_call:search"
|
| 191 |
+
for prefix in ["tool:", "execute_tool:", "function_call:", "call_"]:
|
| 192 |
+
if operation_name.lower().startswith(prefix):
|
| 193 |
+
return operation_name[len(prefix):]
|
| 194 |
+
|
| 195 |
+
return None
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def format_metrics_summary(metrics: TraceMetrics) -> str:
    """Format metrics as a human-readable summary.

    Args:
        metrics: TraceMetrics to format

    Returns:
        Formatted string summary
    """
    out = ["=== Trace Metrics ==="]
    out.append(
        f"Tokens: {metrics.total_tokens} total "
        f"({metrics.input_tokens} input, {metrics.output_tokens} output)"
    )
    out.append(f"LLM Calls: {metrics.llm_call_count} ({metrics.llm_duration_ms:.1f}ms)")
    out.append(f"Tool Calls: {metrics.tool_call_count} ({metrics.tool_duration_ms:.1f}ms)")

    # Per-tool breakdown, alphabetical, only when any tool calls were seen.
    if metrics.tool_calls_by_name:
        out.append("  Tool breakdown:")
        out.extend(
            f"    - {tool}: {calls}"
            for tool, calls in sorted(metrics.tool_calls_by_name.items())
        )

    out.append(f"Duration: {metrics.total_duration_ms:.2f}ms")
    out.append(f"Spans: {metrics.span_count}")
    out.append(f"Errors: {metrics.error_count}")

    return "\n".join(out)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def metrics_to_dict(metrics: TraceMetrics) -> dict[str, Any]:
    """Convert TraceMetrics to a JSON-serializable dictionary.

    Args:
        metrics: TraceMetrics to convert

    Returns:
        Dictionary representation
    """
    # Flatten the per-call dataclasses first; everything else is already
    # a JSON-friendly scalar or dict.
    llm_call_dicts = [
        {
            "model": call.model,
            "input_tokens": call.input_tokens,
            "output_tokens": call.output_tokens,
            "finish_reason": call.finish_reason,
            "duration_ms": call.duration_ms,
        }
        for call in metrics.llm_calls
    ]
    tool_call_dicts = [
        {
            "name": call.name,
            "duration_ms": call.duration_ms,
            "call_id": call.call_id,
        }
        for call in metrics.tool_calls
    ]

    return {
        "total_tokens": metrics.total_tokens,
        "input_tokens": metrics.input_tokens,
        "output_tokens": metrics.output_tokens,
        "tool_call_count": metrics.tool_call_count,
        "tool_calls_by_name": metrics.tool_calls_by_name,
        "llm_call_count": metrics.llm_call_count,
        "total_duration_ms": metrics.total_duration_ms,
        "llm_duration_ms": metrics.llm_duration_ms,
        "tool_duration_ms": metrics.tool_duration_ms,
        "span_count": metrics.span_count,
        "error_count": metrics.error_count,
        "llm_calls": llm_call_dicts,
        "tool_calls": tool_call_dicts,
    }
|
src/flow/experiments/optimizer.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Optimizer service for finding best agent configurations.
|
| 4 |
+
|
| 5 |
+
Runs experiments in parallel, evaluates with LLM-as-Judge,
|
| 6 |
+
ranks via Pareto analysis, and exports reusable configs.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import asyncio
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
import os
|
| 15 |
+
from collections.abc import Callable
|
| 16 |
+
from dataclasses import asdict, dataclass, field
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from itertools import product
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Any
|
| 21 |
+
|
| 22 |
+
from openai import AsyncAzureOpenAI
|
| 23 |
+
|
| 24 |
+
from .ablation import (
|
| 25 |
+
AblationConfig,
|
| 26 |
+
compute_pareto_frontier,
|
| 27 |
+
create_harness_from_config,
|
| 28 |
+
)
|
| 29 |
+
from .config_export import export_optimization_configs
|
| 30 |
+
from .evaluators import LLMEvaluator
|
| 31 |
+
from .metrics import TraceMetrics, extract_metrics
|
| 32 |
+
from .runner import FlowExperimentRunner, setup_tracing
|
| 33 |
+
from .types import EvalCriterion, RunResult, Task
|
| 34 |
+
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@dataclass
class TaskResult:
    """Result for a single config-task pair."""

    config_name: str  # AblationConfig.name this result belongs to
    task_name: str  # Task.name that was executed
    run_result: RunResult  # raw harness run output (trace, success flag, error)
    metrics: TraceMetrics  # objective metrics extracted from run_result.trace
    eval_score: float  # 1.0/0.0 on the heuristic path; evaluator score otherwise — presumably 0..1, confirm
    eval_passed: bool  # evaluator pass/fail verdict (heuristic path: run_result.success)
    eval_reasoning: str  # human-readable explanation of the verdict
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@dataclass
class ConfigSummary:
    """Aggregated summary for a configuration across all tasks."""

    name: str  # AblationConfig.name
    config: AblationConfig  # the configuration these results were produced with
    task_results: list[TaskResult] = field(default_factory=list)

    # Aggregated metrics (means/rates over task_results)
    avg_score: float = 0.0
    avg_tokens: float = 0.0
    avg_duration: float = 0.0  # mean run duration in seconds
    pass_rate: float = 0.0  # fraction of tasks with eval_passed
    total_tokens: int = 0
    task_count: int = 0

    # Pareto analysis (filled in by FlowOptimizer._compute_pareto)
    pareto_rank: int | None = None  # 0 for frontier members, 1 otherwise; None before analysis
    is_pareto_optimal: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        # task_results is intentionally omitted: RunResult/TraceMetrics are
        # not directly JSON-serializable here; only aggregates are exported.
        return {
            "name": self.name,
            "config": asdict(self.config),
            "avg_score": self.avg_score,
            "avg_tokens": self.avg_tokens,
            "avg_duration": self.avg_duration,
            "pass_rate": self.pass_rate,
            "total_tokens": self.total_tokens,
            "task_count": self.task_count,
            "pareto_rank": self.pareto_rank,
            "is_pareto_optimal": self.is_pareto_optimal,
        }
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
@dataclass
class OptimizationResult:
    """Complete results from an optimization run."""

    timestamp: str
    output_dir: Path
    summaries: list[ConfigSummary]
    pareto_frontier: list[str]
    exported_configs: dict[str, Path]

    # Rankings
    rank_by_score: list[str] = field(default_factory=list)
    rank_by_tokens: list[str] = field(default_factory=list)
    rank_by_efficiency: list[str] = field(default_factory=list)

    # Stats
    total_experiments: int = 0
    total_duration_seconds: float = 0.0

    def get_best_config(self, criterion: str = "score") -> ConfigSummary | None:
        """Get the best config by a criterion.

        Supported criteria: "score", "tokens", "efficiency". Returns None
        for an unknown criterion, an empty ranking, or a top-ranked name
        that has no matching summary.
        """
        rankings = {
            "score": self.rank_by_score,
            "tokens": self.rank_by_tokens,
            "efficiency": self.rank_by_efficiency,
        }
        ranked_names = rankings.get(criterion)
        if not ranked_names:
            return None

        best_name = ranked_names[0]
        return next((s for s in self.summaries if s.name == best_name), None)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class FlowOptimizer:
    """Optimizer for finding best agent configurations.

    Runs experiments in parallel, evaluates results, performs
    Pareto analysis, and exports winning configs.

    Example:
        optimizer = FlowOptimizer(parallel=4)
        configs = [
            AblationConfig(name="baseline", enable_message_compaction=False),
            AblationConfig(name="compaction", enable_message_compaction=True),
        ]
        tasks = [Task(name="test", prompt="Create hello world")]
        result = await optimizer.optimize(configs, tasks)
        print(f"Best: {result.rank_by_score[0]}")
    """

    def __init__(
        self,
        parallel: int = 4,
        use_llm_evaluator: bool = True,
        output_dir: Path | None = None,
    ) -> None:
        """Initialize the optimizer.

        Args:
            parallel: Max concurrent experiments
            use_llm_evaluator: Whether to use LLM for evaluation
            output_dir: Base directory for results
        """
        self.parallel = parallel
        self.use_llm_evaluator = use_llm_evaluator
        # Default to a per-user location so repeated runs accumulate under one root.
        self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"

    async def optimize(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        progress_callback: Callable[[int, int, str, str], None] | None = None,
    ) -> OptimizationResult:
        """Run optimization across all configs and tasks.

        Side effects: creates a timestamped run directory under output_dir,
        writes config/summary JSON there, and prints progress to stdout.

        Args:
            configs: Configurations to test
            tasks: Tasks to run each config on
            progress_callback: Optional callback(completed, total, config, task)

        Returns:
            OptimizationResult with rankings and exported configs
        """
        start_time = datetime.now()
        # Timestamp doubles as the run directory name, so it must be filesystem-safe.
        timestamp = start_time.strftime("%Y%m%d_%H%M%S")
        run_dir = self.output_dir / timestamp
        run_dir.mkdir(parents=True, exist_ok=True)

        # Setup
        setup_tracing("flow-optimizer")
        self._save_config(configs, tasks, run_dir)

        print("=" * 70)
        print(" FLOW OPTIMIZER")
        print("=" * 70)
        print(f" Configs: {len(configs)}")
        print(f" Tasks: {len(tasks)}")
        print(f" Total: {len(configs) * len(tasks)} experiments")
        print(f" Parallel: {self.parallel}")
        print(f" Output: {run_dir}")
        print("=" * 70)

        # Create LLM evaluator if needed; may still come back None when
        # credentials are missing (heuristic evaluation is used instead).
        evaluator = None
        if self.use_llm_evaluator:
            evaluator = self._create_evaluator()

        # Run all experiments in parallel
        task_results = await self._run_parallel(
            configs, tasks, run_dir, evaluator, progress_callback
        )

        # Aggregate by config
        summaries = self._aggregate_results(task_results, configs)

        # Pareto analysis
        pareto_names = self._compute_pareto(summaries)

        # Compute rankings
        rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
        rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
        # Efficiency = score per token; max(..., 1) guards a zero-token run.
        rank_by_efficiency = sorted(
            summaries,
            key=lambda s: s.avg_score / max(s.avg_tokens, 1),
            reverse=True,
        )

        # Export configs
        summary_dicts = [s.to_dict() for s in summaries]
        exported = export_optimization_configs(
            summary_dicts, pareto_names, run_dir, timestamp
        )

        end_time = datetime.now()

        result = OptimizationResult(
            timestamp=timestamp,
            output_dir=run_dir,
            summaries=summaries,
            pareto_frontier=pareto_names,
            exported_configs=exported,
            rank_by_score=[s.name for s in rank_by_score],
            rank_by_tokens=[s.name for s in rank_by_tokens],
            rank_by_efficiency=[s.name for s in rank_by_efficiency],
            # Failed experiments were dropped in _run_parallel, so this may be
            # less than len(configs) * len(tasks).
            total_experiments=len(task_results),
            total_duration_seconds=(end_time - start_time).total_seconds(),
        )

        # Save results
        self._save_results(result, run_dir)

        # Print summary
        self._print_summary(result)

        return result

    async def _run_parallel(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        run_dir: Path,
        evaluator: LLMEvaluator | None,
        progress_callback: Callable[[int, int, str, str], None] | None,
    ) -> list[TaskResult]:
        """Run all config-task pairs in parallel with semaphore control.

        Experiments that raise are logged and excluded from the returned
        list rather than aborting the whole sweep.
        """
        semaphore = asyncio.Semaphore(self.parallel)
        total = len(configs) * len(tasks)
        completed = 0
        # Lock serializes the progress counter, print, and callback so
        # interleaved experiments don't garble output or miscount.
        lock = asyncio.Lock()

        async def run_one(config: AblationConfig, task: Task) -> TaskResult:
            nonlocal completed
            async with semaphore:
                # Each pair gets its own workspace: <run>/workspaces/<config>/<task>.
                # NOTE(review): config/task names become path components — assumes
                # they contain no path separators; confirm upstream validation.
                workspace = run_dir / "workspaces" / config.name / task.name
                workspace.mkdir(parents=True, exist_ok=True)

                result = await self._run_single(config, task, workspace, evaluator)

                async with lock:
                    completed += 1
                    status = "✓" if result.eval_passed else "✗"
                    print(
                        f" [{completed}/{total}] {config.name}/{task.name}: "
                        f"{status} score={result.eval_score:.2f} "
                        f"tokens={result.metrics.total_tokens:,}"
                    )
                    if progress_callback:
                        progress_callback(completed, total, config.name, task.name)

                return result

        # Create all tasks
        coroutines = [run_one(config, task) for config in configs for task in tasks]

        # Run with gather; return_exceptions=True keeps one failure from
        # cancelling the remaining experiments.
        gather_results = await asyncio.gather(*coroutines, return_exceptions=True)

        # Filter out exceptions
        valid_results: list[TaskResult] = []
        for r in gather_results:
            if isinstance(r, BaseException):
                logger.error(f"Experiment failed: {r}")
            else:
                valid_results.append(r)

        return valid_results

    async def _run_single(
        self,
        config: AblationConfig,
        task: Task,
        workspace: Path,
        evaluator: LLMEvaluator | None,
    ) -> TaskResult:
        """Run a single config-task experiment.

        The harness is always closed, even when the run or evaluation raises.
        """
        harness = create_harness_from_config(config, workspace)

        try:
            # keep_workspace=True so artifacts survive for post-hoc inspection.
            runner = FlowExperimentRunner(keep_workspace=True)
            run_result = await runner.run(harness, task, workspace=workspace)
            metrics = extract_metrics(run_result.trace)

            # Evaluate: LLM judge when available, otherwise a pass/fail
            # heuristic based on run success.
            if evaluator:
                eval_result = await evaluator.evaluate(run_result)
                eval_score = eval_result.score
                eval_passed = eval_result.passed
                eval_reasoning = eval_result.reasoning
            else:
                # Simple heuristic: passed if no error
                eval_score = 1.0 if run_result.success else 0.0
                eval_passed = run_result.success
                eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"

            return TaskResult(
                config_name=config.name,
                task_name=task.name,
                run_result=run_result,
                metrics=metrics,
                eval_score=eval_score,
                eval_passed=eval_passed,
                eval_reasoning=eval_reasoning,
            )
        finally:
            await harness.close()

    def _aggregate_results(
        self,
        task_results: list[TaskResult],
        configs: list[AblationConfig],
    ) -> list[ConfigSummary]:
        """Aggregate task results into config summaries.

        Configs with zero surviving results (all runs failed) are omitted.
        """
        config_map = {c.name: c for c in configs}
        results_by_config: dict[str, list[TaskResult]] = {c.name: [] for c in configs}

        for result in task_results:
            if result.config_name in results_by_config:
                results_by_config[result.config_name].append(result)

        summaries = []
        for name, results in results_by_config.items():
            if not results:
                continue

            config = config_map[name]
            summary = ConfigSummary(
                name=name,
                config=config,
                task_results=results,
                avg_score=sum(r.eval_score for r in results) / len(results),
                avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
                avg_duration=sum(r.run_result.duration_seconds for r in results) / len(results),
                pass_rate=sum(1 for r in results if r.eval_passed) / len(results),
                total_tokens=sum(r.metrics.total_tokens for r in results),
                task_count=len(results),
            )
            summaries.append(summary)

        return summaries

    def _compute_pareto(self, summaries: list[ConfigSummary]) -> list[str]:
        """Compute Pareto frontier (maximize score, minimize tokens).

        Mutates each summary's is_pareto_optimal / pareto_rank in place and
        returns the frontier member names.
        """
        # Use shared utility
        pareto_names = compute_pareto_frontier(summaries)

        # Mark summaries with Pareto status
        for summary in summaries:
            if summary.name in pareto_names:
                summary.is_pareto_optimal = True
                summary.pareto_rank = 0
            else:
                summary.is_pareto_optimal = False
                summary.pareto_rank = 1  # Simplified: all non-Pareto get rank 1

        return pareto_names

    def _create_evaluator(self) -> LLMEvaluator | None:
        """Create LLM evaluator if credentials available.

        Returns None (caller falls back to heuristic evaluation) when the
        AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT env vars are unset.
        """
        api_key = os.environ.get("AZURE_OPENAI_API_KEY")
        endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
        deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")

        if not api_key or not endpoint:
            logger.warning("No Azure OpenAI credentials, using heuristic evaluation")
            return None

        client = AsyncAzureOpenAI(
            api_key=api_key,
            api_version="2024-02-15-preview",
            azure_endpoint=endpoint,
        )

        return LLMEvaluator(
            model_client=client,
            model_name=deployment,
            passing_threshold=0.7,
        )

    def _save_config(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        run_dir: Path,
    ) -> None:
        """Save optimization config (inputs + settings) for reproducibility."""
        with open(run_dir / "optimization_config.json", "w") as f:
            json.dump(
                {
                    "configs": [asdict(c) for c in configs],
                    "tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
                    "parallel": self.parallel,
                    "use_llm_evaluator": self.use_llm_evaluator,
                },
                f,
                indent=2,
            )

    def _save_results(self, result: OptimizationResult, run_dir: Path) -> None:
        """Save optimization results to <run_dir>/summary.json."""
        summary_data = {
            "timestamp": result.timestamp,
            "total_experiments": result.total_experiments,
            "total_duration_seconds": result.total_duration_seconds,
            "pareto_frontier": result.pareto_frontier,
            "rank_by_score": result.rank_by_score,
            "rank_by_tokens": result.rank_by_tokens,
            "rank_by_efficiency": result.rank_by_efficiency,
            # Paths are stringified for JSON.
            "exported_configs": {k: str(v) for k, v in result.exported_configs.items()},
            "summaries": [s.to_dict() for s in result.summaries],
        }

        with open(run_dir / "summary.json", "w") as f:
            json.dump(summary_data, f, indent=2)

    def _print_summary(self, result: OptimizationResult) -> None:
        """Print optimization summary table and rankings to stdout."""
        print("\n" + "=" * 70)
        print(" OPTIMIZATION RESULTS")
        print("=" * 70)

        # Rankings table
        print(f"\n{'Config':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
        print("-" * 65)

        for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
            pareto = "★" if summary.is_pareto_optimal else ""
            print(
                f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
                f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
            )

        print("\n" + "-" * 70)
        print(f"Pareto frontier: {result.pareto_frontier}")
        print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
        print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
        print("\nExported configs:")
        for name, path in result.exported_configs.items():
            print(f" {name}: {path}")
        print(f"\nResults saved to: {result.output_dir}")
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def generate_grid_configs(
    base_name: str,
    variations: dict[str, list[Any]],
) -> list[AblationConfig]:
    """Generate configs from a variation grid.

    Builds the Cartesian product of all parameter value lists and produces
    one AblationConfig per combination, named after the parameter settings.

    Args:
        base_name: Base name for generated configs
        variations: Dict of param_name -> list of values

    Returns:
        List of AblationConfig for each combination

    Example:
        configs = generate_grid_configs("grid", {
            "enable_message_compaction": [True, False],
            "compaction_head_size": [5, 10, 20],
        })
    """
    # No variations: a single config carrying just the base name.
    if not variations:
        return [AblationConfig(name=base_name)]

    keys = list(variations)
    value_lists = [variations[k] for k in keys]

    configs: list[AblationConfig] = []
    for combo in product(*value_lists):
        params = dict(zip(keys, combo, strict=True))
        suffix = "_".join(f"{k}={v}" for k, v in params.items())
        configs.append(AblationConfig(name=f"{base_name}_{suffix}", **params))

    return configs
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
def load_tasks_from_jsonl(path: Path) -> list[Task]:
    """Load tasks from a JSONL file.

    Each line should be a JSON object with:
    - name: Task name
    - prompt: Task prompt
    - criteria: Optional list of evaluation criteria
    - category: Optional category string

    Args:
        path: Path to JSONL file

    Returns:
        List of Task objects
    """
    tasks: list[Task] = []
    with open(path) as f:
        for raw_line in f:
            stripped = raw_line.strip()
            # Skip blank lines between records.
            if not stripped:
                continue

            record = json.loads(stripped)
            # Criteria may be full dicts (EvalCriterion kwargs) or bare
            # strings; strings become a single "default" criterion.
            criteria = [
                EvalCriterion(**c)
                if isinstance(c, dict)
                else EvalCriterion(name="default", instruction=str(c))
                for c in record.get("criteria", [])
            ]

            tasks.append(
                Task(
                    name=record["name"],
                    prompt=record["prompt"],
                    criteria=criteria,
                    metadata={"category": record.get("category", "default")},
                )
            )

    return tasks
|
src/flow/experiments/reporters/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Reporters for experiment results."""
|
| 4 |
+
|
| 5 |
+
from .console_reporter import print_comparison_table, print_eval_result, print_metrics_summary
|
| 6 |
+
from .json_reporter import load_run_result_summary, save_comparison, save_run_result
|
| 7 |
+
|
| 8 |
+
__all__ = [ # noqa: RUF022 # Intentionally grouped by category
|
| 9 |
+
# JSON reporter
|
| 10 |
+
"save_run_result",
|
| 11 |
+
"load_run_result_summary",
|
| 12 |
+
"save_comparison",
|
| 13 |
+
# Console reporter
|
| 14 |
+
"print_metrics_summary",
|
| 15 |
+
"print_comparison_table",
|
| 16 |
+
"print_eval_result",
|
| 17 |
+
]
|
src/flow/experiments/reporters/console_reporter.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Console reporter for experiment results with rich formatting."""
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from ..metrics import TraceMetrics
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def print_metrics_summary(metrics: TraceMetrics, title: str = "Trace Metrics") -> None:
    """Print a formatted metrics summary to console.

    Args:
        metrics: TraceMetrics to display
        title: Title for the summary section
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)

    # Token usage, then call counts with cumulative durations.
    print(f" Tokens: {metrics.total_tokens:,} total ({metrics.input_tokens:,} in, {metrics.output_tokens:,} out)")
    print(f" LLM Calls: {metrics.llm_call_count} ({metrics.llm_duration_ms:.1f}ms)")
    print(f" Tool Calls: {metrics.tool_call_count} ({metrics.tool_duration_ms:.1f}ms)")

    # Per-tool call counts, alphabetical by tool name.
    if metrics.tool_calls_by_name:
        print(" Tool breakdown:")
        for tool_name, call_count in sorted(metrics.tool_calls_by_name.items()):
            print(f"   - {tool_name}: {call_count}")

    print(f" Duration: {metrics.total_duration_ms:.2f}ms")
    print(f" Spans: {metrics.span_count}")
    # Error line only shown when something actually failed.
    if metrics.error_count > 0:
        print(f" Errors: {metrics.error_count}")
    print(rule)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def print_comparison_table(
    results: list[dict[str, Any]],
    title: str = "Comparison",
) -> None:
    """Print a side-by-side comparison table of multiple results.

    Args:
        results: List of result dictionaries with 'name' and 'metrics' keys.
            The 'metrics', 'evaluation', and nested 'tool_calls_by_name'
            entries may be absent or explicitly None; both are tolerated.
        title: Title for the comparison
    """
    if not results:
        print("No results to compare")
        return

    names = [r.get("name", "unknown") for r in results]
    # Columns wide enough for the longest name, with a minimum of 15.
    col_width = max(15, max(len(n) for n in names) + 2)

    print(f"\n{'=' * 80}")
    print(f" {title}")
    print("=" * 80)

    # Header row with one column per result.
    print(f"\n{'Metric':<30} | " + " | ".join(f"{n:>{col_width}}" for n in names))
    print("-" * (32 + (col_width + 3) * len(names)))

    def row(label: str, values: list[Any]) -> None:
        # Right-align each value; floats get one decimal, bools print as text.
        # (bool check must stay separate: isinstance(True, float) is False,
        # but {True:>{w}} would render as 1 without the !s conversion.)
        formatted = []
        for v in values:
            if isinstance(v, float):
                formatted.append(f"{v:>{col_width}.1f}")
            elif isinstance(v, bool):
                formatted.append(f"{v!s:>{col_width}}")
            else:
                formatted.append(f"{v:>{col_width}}")
        print(f"{label:<30} | " + " | ".join(formatted))

    # BUG FIX: `r.get("metrics", {})` still yields None when the key is
    # present with a None value, which then crashed on `.get(...)`.
    # `or {}` covers both the missing-key and explicit-None cases.
    metrics_list = [r.get("metrics") or {} for r in results]

    row("Duration (s)", [r.get("duration_seconds", 0) for r in results])
    row("Success", [r.get("success", False) for r in results])

    if any(r.get("evaluation") for r in results):
        # Same None-guard: some results may have evaluation=None.
        row("Eval Score", [(r.get("evaluation") or {}).get("score", 0) for r in results])

    row("Total Tokens", [m.get("total_tokens", 0) for m in metrics_list])
    row("Input Tokens", [m.get("input_tokens", 0) for m in metrics_list])
    row("Output Tokens", [m.get("output_tokens", 0) for m in metrics_list])
    row("LLM Calls", [m.get("llm_call_count", 0) for m in metrics_list])
    row("Tool Calls", [m.get("tool_call_count", 0) for m in metrics_list])
    row("LLM Time (ms)", [m.get("llm_duration_ms", 0) for m in metrics_list])
    row("Tool Time (ms)", [m.get("tool_duration_ms", 0) for m in metrics_list])

    # Tool breakdown: union of every tool seen across all results.
    all_tools: set[str] = set()
    for m in metrics_list:
        all_tools.update((m.get("tool_calls_by_name") or {}).keys())

    if all_tools:
        print("\n" + "-" * 80)
        print("Tool Usage Breakdown:")
        for tool in sorted(all_tools):
            values = [(m.get("tool_calls_by_name") or {}).get(tool, 0) for m in metrics_list]
            row(f"  {tool}", values)

    print("=" * 80)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def print_eval_result(
|
| 105 |
+
score: float,
|
| 106 |
+
passed: bool,
|
| 107 |
+
reasoning: str,
|
| 108 |
+
criteria_results: list[dict[str, Any]] | None = None,
|
| 109 |
+
) -> None:
|
| 110 |
+
"""Print evaluation results in a formatted way.
|
| 111 |
+
|
| 112 |
+
Args:
|
| 113 |
+
score: Overall score (0.0 to 1.0)
|
| 114 |
+
passed: Whether evaluation passed
|
| 115 |
+
reasoning: Overall reasoning
|
| 116 |
+
criteria_results: Optional list of individual criterion results
|
| 117 |
+
"""
|
| 118 |
+
status = "PASS" if passed else "FAIL"
|
| 119 |
+
|
| 120 |
+
print(f"\n{'=' * 60}")
|
| 121 |
+
print(f" Evaluation Result: {status}")
|
| 122 |
+
print("=" * 60)
|
| 123 |
+
print(f" Score: {score:.2f}")
|
| 124 |
+
print(f" Passed: {passed}")
|
| 125 |
+
print(f" Reason: {reasoning}")
|
| 126 |
+
|
| 127 |
+
if criteria_results:
|
| 128 |
+
print("\n Criteria:")
|
| 129 |
+
for cr in criteria_results:
|
| 130 |
+
cr_status = "PASS" if cr.get("passed") else "FAIL"
|
| 131 |
+
print(f" - {cr.get('name', 'unknown')}: {cr_status} ({cr.get('score', 0):.2f})")
|
| 132 |
+
if cr.get("reasoning"):
|
| 133 |
+
print(f" {cr['reasoning']}")
|
| 134 |
+
|
| 135 |
+
print("=" * 60)
|
src/flow/experiments/reporters/json_reporter.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""JSON reporter for experiment results."""
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from dataclasses import asdict
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
from ..metrics import TraceMetrics, metrics_to_dict
|
| 11 |
+
from ..types import EvalResult, RunResult
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def save_run_result(
    result: RunResult,
    output_dir: Path,
    eval_result: EvalResult | None = None,
    metrics: TraceMetrics | None = None,
) -> None:
    """Save a run result to JSON files.

    Creates the following files in output_dir:
    - traces.json: Raw OpenTelemetry spans
    - metrics.json: Extracted metrics (if provided)
    - output.txt: Agent text output
    - result.json: Full result summary

    Args:
        result: The RunResult to save
        output_dir: Directory to save files
        eval_result: Optional evaluation result
        metrics: Optional extracted metrics
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Raw spans; default=str stringifies any non-JSON-serializable values.
    with open(output_dir / "traces.json", "w") as f:
        json.dump(result.trace, f, indent=2, default=str)

    # Extracted metrics only exist when the caller supplied them.
    if metrics:
        with open(output_dir / "metrics.json", "w") as f:
            json.dump(metrics_to_dict(metrics), f, indent=2)

    # Human-readable transcript: header fields, a separator, then raw output.
    header_lines = [
        f"Task: {result.task.prompt}\n",
        f"Duration: {result.duration_seconds:.1f}s\n",
        f"Success: {result.success}\n",
    ]
    if eval_result:
        header_lines.append(f"Eval Score: {eval_result.score:.2f}\n")
    if result.error:
        header_lines.append(f"Error: {result.error}\n")
    header_lines.append("\n" + "=" * 60 + "\n\n")
    with open(output_dir / "output.txt", "w") as f:
        f.writelines(header_lines)
        f.write(result.output)

    # Machine-readable summary combining task, outcome, metrics, evaluation.
    result_dict: dict[str, Any] = {
        "task": {
            "name": result.task.name,
            "prompt": result.task.prompt,
            "criteria": [asdict(c) for c in result.task.criteria],
            "metadata": result.task.metadata,
        },
        "success": result.success,
        "error": result.error,
        "duration_seconds": result.duration_seconds,
        "files_created": result.files_created,
        "trace_count": len(result.trace),
        "output_length": len(result.output),
    }

    if metrics:
        result_dict["metrics"] = metrics_to_dict(metrics)

    if eval_result:
        result_dict["evaluation"] = {
            "score": eval_result.score,
            "passed": eval_result.passed,
            "reasoning": eval_result.reasoning,
            "criteria_results": [
                {
                    "name": cr.name,
                    "score": cr.score,
                    "passed": cr.passed,
                    "reasoning": cr.reasoning,
                }
                for cr in eval_result.criteria_results
            ],
        }

    with open(output_dir / "result.json", "w") as f:
        json.dump(result_dict, f, indent=2)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def load_run_result_summary(result_path: Path) -> dict[str, Any]:
    """Load a run result summary from a result.json file.

    Args:
        result_path: Path to result.json file

    Returns:
        Dictionary with result summary
    """
    # Read the whole file and parse; equivalent to json.load on an open handle.
    return json.loads(Path(result_path).read_text())
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def save_comparison(
    results: list[tuple[str, dict[str, Any]]],
    output_path: Path,
) -> None:
    """Save a comparison of multiple results.

    Args:
        results: List of (name, result_dict) tuples
        output_path: Path to save comparison JSON
    """
    # Keep only the comparable fields from each result; missing keys -> null.
    entries = []
    for label, data in results:
        entries.append(
            {
                "name": label,
                "success": data.get("success"),
                "duration_seconds": data.get("duration_seconds"),
                "metrics": data.get("metrics"),
                "evaluation": data.get("evaluation"),
            }
        )

    with open(output_path, "w") as f:
        json.dump({"results": entries}, f, indent=2)
|
src/flow/experiments/runner.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Experiment runner for executing agents on tasks with trace capture."""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import TYPE_CHECKING
|
| 13 |
+
|
| 14 |
+
from opentelemetry import trace
|
| 15 |
+
from opentelemetry.sdk.resources import Resource
|
| 16 |
+
from opentelemetry.sdk.trace import TracerProvider
|
| 17 |
+
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
|
| 18 |
+
from opentelemetry.semconv._incubating.attributes.service_attributes import SERVICE_NAME
|
| 19 |
+
|
| 20 |
+
from .trace_collector import FlowTraceCollector
|
| 21 |
+
from .types import RunResult, Task
|
| 22 |
+
|
| 23 |
+
if TYPE_CHECKING:
|
| 24 |
+
from flow.harness.maf import MAFHarness
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def setup_tracing(service_name: str = "flow-experiments") -> TracerProvider:
    """Setup OpenTelemetry tracing with in-memory collection.

    This creates a new TracerProvider configured for experiment tracing.
    Call this once at the start of your experiment session.

    Args:
        service_name: Name for the tracing service

    Returns:
        The configured TracerProvider
    """
    provider = TracerProvider(resource=Resource.create({SERVICE_NAME: service_name}))
    trace.set_tracer_provider(provider)

    # Agent Framework instrumentation is optional -- degrade gracefully
    # whether the package is missing or enabling it fails for another reason.
    try:
        from agent_framework.observability import enable_instrumentation

        enable_instrumentation()
        logger.debug("Agent Framework instrumentation enabled")
    except ImportError:
        logger.debug("Agent Framework not available, skipping instrumentation")
    except Exception as e:
        logger.debug(f"Could not enable Agent Framework instrumentation: {e}")

    return provider
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class FlowExperimentRunner:
    """Runner for executing experiments with Flow agents.

    The runner handles:
    - Setting up temporary workspaces
    - Collecting execution traces via OpenTelemetry
    - Measuring execution time
    - Capturing files created
    - Supporting streaming execution

    Example:
        from flow.harness.maf import MAFHarness
        from flow.experiments import FlowExperimentRunner, Task

        harness = MAFHarness()
        runner = FlowExperimentRunner(keep_workspace=True)

        task = Task(name="hello", prompt="Create a hello world script")
        result = await runner.run(harness, task)

        print(f"Duration: {result.duration_seconds}s")
        print(f"Files: {result.files_created}")
    """

    def __init__(
        self,
        workspace_base: Path | None = None,
        keep_workspace: bool = False,
    ) -> None:
        """Initialize the experiment runner.

        Args:
            workspace_base: Base directory for workspaces (default: system temp)
            keep_workspace: Whether to keep workspace after run (default: False)
        """
        self.workspace_base = workspace_base or Path(tempfile.gettempdir())
        self.keep_workspace = keep_workspace

    async def run(
        self,
        harness: MAFHarness,
        task: Task,
        workspace: Path | None = None,
    ) -> RunResult:
        """Run a harness on a task and collect results.

        This method:
        1. Creates or uses a workspace directory
        2. Sets up trace collection
        3. Executes the harness with streaming
        4. Collects output and files created
        5. Returns a RunResult with all data

        Args:
            harness: The MAFHarness to run
            task: The task to execute
            workspace: Optional workspace directory (creates temp if None)

        Returns:
            RunResult with trace, output, and metrics
        """
        # Create or use workspace directory. Only auto-created workspaces are
        # eligible for cleanup at the end (workspace_created tracks this).
        if workspace is None:
            workspace = Path(tempfile.mkdtemp(
                prefix=f"flow_experiment_{task.name}_",
                dir=self.workspace_base,
            ))
            workspace_created = True
        else:
            workspace.mkdir(parents=True, exist_ok=True)
            workspace_created = False

        logger.info(f"Running task '{task.name}' in workspace: {workspace}")

        # Snapshot files before execution so files_created is a pure diff.
        files_before = set(self._list_files(workspace))

        # Set up trace collection. If the global provider is not an SDK
        # TracerProvider (e.g. the no-op default), tracing is silently skipped.
        collector = FlowTraceCollector()
        processor: SimpleSpanProcessor | None = None

        try:
            provider = trace.get_tracer_provider()
            if isinstance(provider, TracerProvider):
                processor = SimpleSpanProcessor(collector)
                provider.add_span_processor(processor)
                logger.debug("Trace collection enabled")
        except Exception as e:
            logger.debug(f"Could not set up trace collection: {e}")

        # Execute the harness
        start_time = time.time()
        output_chunks: list[str] = []
        error: str | None = None

        try:
            # Change to workspace directory for execution.
            # NOTE(review): os.chdir is process-global, so concurrent runs in
            # the same process would interfere -- confirm single-run usage.
            original_cwd = os.getcwd()
            os.chdir(workspace)

            try:
                # Use streaming execution to capture all output
                async for event in harness.run_stream(task.prompt):
                    # Collect text output. Events without both `content` and
                    # `type` attributes are ignored.
                    if hasattr(event, "content") and event.content:
                        if hasattr(event, "type"):
                            from ..harness.base import EventType
                            if event.type in (EventType.TEXT_DELTA, EventType.TEXT_DONE):
                                output_chunks.append(event.content)
                            elif event.type == EventType.TOOL_RESULT:
                                # Optionally capture tool results
                                pass
            finally:
                # Always restore cwd, even if streaming raised.
                os.chdir(original_cwd)

        except Exception as e:
            # Record the failure but continue: traces/files are still collected.
            error = str(e)
            logger.error(f"Task execution failed: {e}")

        end_time = time.time()
        duration_seconds = end_time - start_time

        # Force flush so all spans reach the collector before we read them.
        if processor:
            try:
                processor.force_flush()
            except Exception as e:
                logger.debug(f"Error flushing processor: {e}")

        # Get collected traces
        trace_data = collector.get_traces()

        # Clean up trace processor
        if processor:
            try:
                processor.shutdown()
            except Exception as e:
                logger.debug(f"Error shutting down processor: {e}")

        # Find files created
        files_after = set(self._list_files(workspace))
        files_created = sorted(files_after - files_before)

        # Clean up workspace only when we created it, the caller didn't ask to
        # keep it, and the run succeeded (failed runs keep it for debugging).
        if not self.keep_workspace and workspace_created and not error:
            try:
                import shutil
                shutil.rmtree(workspace)
                logger.debug(f"Cleaned up workspace: {workspace}")
            except Exception as e:
                logger.warning(f"Failed to clean up workspace: {e}")

        output = "".join(output_chunks)

        return RunResult(
            task=task,
            trace=trace_data,
            output=output,
            files_created=files_created,
            duration_seconds=duration_seconds,
            workspace=workspace,
            error=error,
        )

    def _list_files(self, directory: Path) -> list[str]:
        """List all files in a directory recursively.

        Args:
            directory: Directory to scan

        Returns:
            List of relative file paths
        """
        files = []
        try:
            for root, _, filenames in os.walk(directory):
                for filename in filenames:
                    # Skip hidden files and common temp files
                    if filename.startswith("."):
                        continue
                    full_path = Path(root) / filename
                    rel_path = full_path.relative_to(directory)
                    files.append(str(rel_path))
        except Exception as e:
            # Best-effort listing: an unreadable directory yields a partial list.
            logger.debug(f"Error listing files: {e}")
        return files
|
src/flow/experiments/trace_collector.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""OpenTelemetry trace collector for experiment analysis."""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class FlowTraceCollector(SpanExporter):
    """Collects OpenTelemetry spans for experiment analysis.

    This exporter captures spans during agent execution and converts them
    to a dictionary format suitable for metrics extraction and analysis.

    Example:
        collector = FlowTraceCollector()
        # Attach to TracerProvider via SimpleSpanProcessor
        # Run agent execution
        traces = collector.get_traces()
    """

    def __init__(self) -> None:
        """Initialize the trace collector."""
        self.spans: list[dict[str, Any]] = []

    def export(self, spans: Any) -> SpanExportResult:
        """Collect spans from OpenTelemetry.

        Args:
            spans: Sequence of OpenTelemetry ReadableSpan objects

        Returns:
            SpanExportResult indicating success
        """
        for span in spans:
            try:
                self.spans.append(self._span_to_dict(span))
            except Exception as e:
                # A malformed span is dropped rather than failing the export.
                logger.debug(f"Failed to collect span: {e}")

        return SpanExportResult.SUCCESS

    def _span_to_dict(self, span: Any) -> dict[str, Any]:
        """Convert a single ReadableSpan into the collector's dict format."""
        # OpenTelemetry timestamps are in nanoseconds; convert to seconds.
        started = span.start_time / 1_000_000_000
        ended = span.end_time / 1_000_000_000 if span.end_time else None
        elapsed_ms = ((ended - started) * 1000) if ended else None

        events = [
            {
                "name": ev.name,
                "timestamp": datetime.fromtimestamp(
                    ev.timestamp / 1_000_000_000
                ).isoformat(),
                "attributes": dict(ev.attributes) if ev.attributes else {},
            }
            for ev in (span.events or [])
        ]

        return {
            "type": "trace_span",
            "timestamp": datetime.fromtimestamp(started).isoformat(),
            "data": {
                "operation_name": span.name,
                # IDs rendered as fixed-width lowercase hex per OTel convention.
                "span_id": format(span.context.span_id, "016x"),
                "trace_id": format(span.context.trace_id, "032x"),
                "parent_span_id": (
                    format(span.parent.span_id, "016x") if span.parent else None
                ),
                "duration_ms": elapsed_ms,
                "attributes": dict(span.attributes) if span.attributes else {},
                "status": str(span.status.status_code.name) if hasattr(span, "status") else "OK",
                "events": events,
            },
        }

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Force flush spans (no-op for simple collection).

        Args:
            timeout_millis: Timeout in milliseconds (unused)

        Returns:
            True always
        """
        return True

    def shutdown(self) -> None:
        """Shutdown the exporter (no-op)."""
        pass

    def get_traces(self) -> list[dict[str, Any]]:
        """Get and clear collected traces.

        Returns:
            List of collected trace spans, clearing the internal list
        """
        collected = list(self.spans)
        # Clear in place so any external aliases of self.spans also empty out.
        self.spans.clear()
        return collected

    def clear(self) -> None:
        """Clear collected traces without returning them."""
        self.spans.clear()
|
src/flow/experiments/types.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
|
| 3 |
+
"""Type definitions for the experiments framework."""
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
|
| 11 |
+
class EvalCriterion:
|
| 12 |
+
"""A criterion for evaluating agent output.
|
| 13 |
+
|
| 14 |
+
Attributes:
|
| 15 |
+
name: Short identifier for the criterion (e.g., "correctness", "completeness")
|
| 16 |
+
instruction: Detailed instruction for how to evaluate this criterion
|
| 17 |
+
weight: Relative weight for scoring (default 1.0)
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
name: str
|
| 21 |
+
instruction: str
|
| 22 |
+
weight: float = 1.0
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
|
| 26 |
+
class Task:
|
| 27 |
+
"""A task for the agent to perform.
|
| 28 |
+
|
| 29 |
+
Attributes:
|
| 30 |
+
name: Short identifier for the task
|
| 31 |
+
prompt: The prompt/instruction given to the agent
|
| 32 |
+
criteria: List of evaluation criteria for assessing the output
|
| 33 |
+
metadata: Additional task metadata (e.g., expected output, difficulty)
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
name: str
|
| 37 |
+
prompt: str
|
| 38 |
+
criteria: list[EvalCriterion] = field(default_factory=list)
|
| 39 |
+
metadata: dict[str, Any] = field(default_factory=dict)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
|
| 43 |
+
class RunResult:
|
| 44 |
+
"""Result of running an agent on a task.
|
| 45 |
+
|
| 46 |
+
Attributes:
|
| 47 |
+
task: The task that was executed
|
| 48 |
+
trace: OpenTelemetry trace spans collected during execution
|
| 49 |
+
output: The agent's final output/response
|
| 50 |
+
files_created: List of files created during execution
|
| 51 |
+
duration_seconds: Total execution time
|
| 52 |
+
workspace: Path to the workspace directory used
|
| 53 |
+
error: Error message if execution failed, None if successful
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
task: Task
|
| 57 |
+
trace: list[dict[str, Any]]
|
| 58 |
+
output: str
|
| 59 |
+
files_created: list[str]
|
| 60 |
+
duration_seconds: float
|
| 61 |
+
workspace: Path
|
| 62 |
+
error: str | None = None
|
| 63 |
+
|
| 64 |
+
@property
|
| 65 |
+
def success(self) -> bool:
|
| 66 |
+
"""Whether the run completed without errors."""
|
| 67 |
+
return self.error is None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@dataclass
|
| 71 |
+
class CriterionResult:
|
| 72 |
+
"""Result of evaluating a single criterion.
|
| 73 |
+
|
| 74 |
+
Attributes:
|
| 75 |
+
name: Name of the criterion evaluated
|
| 76 |
+
score: Numeric score (0.0 to 1.0)
|
| 77 |
+
passed: Whether the criterion was met
|
| 78 |
+
reasoning: Explanation of the evaluation
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
name: str
|
| 82 |
+
score: float
|
| 83 |
+
passed: bool
|
| 84 |
+
reasoning: str
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
@dataclass
|
| 88 |
+
class EvalResult:
|
| 89 |
+
"""Result of evaluating an agent's output.
|
| 90 |
+
|
| 91 |
+
Attributes:
|
| 92 |
+
score: Overall weighted score (0.0 to 1.0)
|
| 93 |
+
passed: Whether the evaluation passed overall
|
| 94 |
+
criteria_results: Results for each individual criterion
|
| 95 |
+
reasoning: Overall evaluation reasoning/summary
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
score: float
|
| 99 |
+
passed: bool
|
| 100 |
+
criteria_results: list[CriterionResult]
|
| 101 |
+
reasoning: str
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# =============================================================================
# Built-in Task Suites for Optimization
# =============================================================================

# Registry of named task suites used by the optimizer / experiment runner.
# Keys are suite names accepted by get_task_suite(); values are task lists.
TASK_SUITES: dict[str, list[Task]] = {
    # "quick": three small tasks for fast smoke-test runs.
    "quick": [
        Task(
            name="fizzbuzz",
            prompt="Create a Python file fizzbuzz.py that prints FizzBuzz from 1-100. Then run it.",
            criteria=[
                EvalCriterion(name="file_created", instruction="fizzbuzz.py file was created"),
                EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
            ],
            metadata={"category": "short", "expected_duration": 60},
        ),
        Task(
            name="hello_api",
            prompt="Create a FastAPI app in api.py with a /hello endpoint that returns {'message': 'hello'}.",
            criteria=[
                EvalCriterion(name="file_created", instruction="api.py file was created"),
                EvalCriterion(name="has_endpoint", instruction="Contains a /hello GET endpoint"),
            ],
            metadata={"category": "short", "expected_duration": 90},
        ),
        Task(
            name="file_counter",
            prompt="Create a Python script count_files.py that counts .py files in current directory and prints the count.",
            criteria=[
                EvalCriterion(name="file_created", instruction="count_files.py was created"),
                EvalCriterion(name="runs_correctly", instruction="Script runs and outputs a number"),
            ],
            metadata={"category": "short", "expected_duration": 60},
        ),
    ],
    # "core": five short/medium tasks covering common coding workflows.
    "core": [
        Task(
            name="fizzbuzz",
            prompt="Create a Python file fizzbuzz.py that prints FizzBuzz from 1-100. Then run it.",
            criteria=[
                EvalCriterion(name="file_created", instruction="fizzbuzz.py file was created"),
                EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
            ],
            metadata={"category": "short"},
        ),
        Task(
            name="rest_api",
            prompt="Create a FastAPI app with CRUD endpoints for a TODO list (in-memory storage). Include GET /todos, POST /todos, DELETE /todos/{id}.",
            criteria=[
                EvalCriterion(name="file_created", instruction="API file was created"),
                EvalCriterion(name="has_crud", instruction="Contains GET, POST, DELETE endpoints"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="data_analysis",
            prompt="Create a Python script that generates 100 random data points, calculates mean/median/std, and saves results to stats.json.",
            criteria=[
                EvalCriterion(name="script_created", instruction="Python script was created"),
                EvalCriterion(name="json_output", instruction="stats.json was created with results"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="cli_tool",
            prompt="Create a CLI tool using argparse that takes a filename and counts lines, words, and characters (like wc).",
            criteria=[
                EvalCriterion(name="file_created", instruction="CLI script was created"),
                EvalCriterion(name="uses_argparse", instruction="Uses argparse for argument parsing"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="unit_tests",
            prompt="Create a calculator module (calc.py) with add/subtract/multiply/divide functions, then write pytest tests for it (test_calc.py).",
            criteria=[
                EvalCriterion(name="module_created", instruction="calc.py was created"),
                EvalCriterion(name="tests_created", instruction="test_calc.py was created"),
                EvalCriterion(name="tests_pass", instruction="Tests pass when run"),
            ],
            metadata={"category": "medium"},
        ),
    ],
    # "coding": ten single-criterion tasks spanning short to complex work.
    "coding": [
        Task(
            name="fizzbuzz",
            prompt="Create fizzbuzz.py that prints FizzBuzz 1-100 and run it.",
            criteria=[EvalCriterion(name="correct", instruction="Correct FizzBuzz output")],
            metadata={"category": "short"},
        ),
        Task(
            name="rest_api",
            prompt="Create a FastAPI CRUD TODO app with GET/POST/DELETE endpoints.",
            criteria=[EvalCriterion(name="has_crud", instruction="Has working CRUD")],
            metadata={"category": "medium"},
        ),
        Task(
            name="cli_tool",
            prompt="Create an argparse CLI that counts lines/words/chars in a file.",
            criteria=[EvalCriterion(name="works", instruction="CLI works correctly")],
            metadata={"category": "medium"},
        ),
        Task(
            name="data_pipeline",
            prompt="Create a script that reads CSV data, filters rows, aggregates, and outputs JSON.",
            criteria=[EvalCriterion(name="works", instruction="Pipeline produces correct output")],
            metadata={"category": "medium"},
        ),
        Task(
            name="unit_tests",
            prompt="Create calc.py with math functions and test_calc.py with pytest tests.",
            criteria=[EvalCriterion(name="tests_pass", instruction="Tests pass")],
            metadata={"category": "medium"},
        ),
        Task(
            name="web_scraper",
            prompt="Create a script that fetches a webpage and extracts all links.",
            criteria=[EvalCriterion(name="extracts_links", instruction="Extracts links correctly")],
            metadata={"category": "medium"},
        ),
        Task(
            name="async_downloader",
            prompt="Create an async script that downloads multiple URLs concurrently using aiohttp.",
            criteria=[EvalCriterion(name="uses_async", instruction="Uses async/await correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="database_orm",
            prompt="Create a SQLAlchemy model for Users with CRUD operations.",
            criteria=[EvalCriterion(name="has_orm", instruction="Uses SQLAlchemy ORM correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="decorator_lib",
            prompt="Create a library with timing, retry, and caching decorators.",
            criteria=[EvalCriterion(name="decorators_work", instruction="Decorators function correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="config_parser",
            prompt="Create a config parser that supports YAML, JSON, and env vars with validation.",
            criteria=[EvalCriterion(name="multi_format", instruction="Supports multiple formats")],
            metadata={"category": "complex"},
        ),
    ],
}
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def get_task_suite(suite_name: str) -> list[Task]:
    """Look up a built-in task suite by name.

    Args:
        suite_name: Name of the suite ('quick', 'core', 'coding')

    Returns:
        List of Task objects

    Raises:
        ValueError: If suite_name is not found
    """
    try:
        return TASK_SUITES[suite_name]
    except KeyError:
        available = ", ".join(TASK_SUITES.keys())
        raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}") from None
|
src/flow/harness/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Harness modules for Flow agent.
|
| 2 |
+
|
| 3 |
+
Harnesses are agent runtime adapters that convert different agent framework
|
| 4 |
+
events to a uniform Event format for CLI/UI consumption.
|
| 5 |
+
|
| 6 |
+
Available harnesses:
|
| 7 |
+
- maf: Microsoft Agent Framework harness
|
| 8 |
+
- (future) langchain: LangChain harness
|
| 9 |
+
- (future) claude: Claude SDK harness
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from flow.harness.base import BaseHarness, Event, EventType
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
"BaseHarness",
|
| 16 |
+
"Event",
|
| 17 |
+
"EventType",
|
| 18 |
+
]
|
src/flow/harness/base.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base harness interface for agent runtimes.
|
| 2 |
+
|
| 3 |
+
Defines the abstract interface that all harnesses must implement,
|
| 4 |
+
allowing Flow to run on different agent frameworks.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from abc import ABC, abstractmethod
|
| 10 |
+
from collections.abc import AsyncIterator, Callable, Coroutine
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from enum import Enum
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class EventType(Enum):
    """Kinds of events an agent can emit while executing."""

    TEXT_DELTA = "text_delta"  # streaming text chunk
    TEXT_DONE = "text_done"  # text generation finished
    TOOL_CALL_START = "tool_call_start"  # a tool call is beginning
    TOOL_CALL_ARGS = "tool_call_args"  # tool-call arguments (streamed)
    TOOL_CALL_DONE = "tool_call_done"  # tool call fully specified
    TOOL_RESULT = "tool_result"  # result of executing a tool
    THINKING = "thinking"  # agent reasoning/thinking output
    ERROR = "error"  # something went wrong
    DONE = "done"  # the agent run has finished


@dataclass
class Event:
    """A single item in the agent execution event stream.

    Streamed events give the CLI real-time visibility into text
    generation, tool invocations, and execution results.
    """

    type: EventType
    content: str = ""
    tool_name: str | None = None
    tool_call_id: str | None = None
    metadata: dict[str, str | int | float | bool | None] = field(default_factory=dict)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class BaseHarness(ABC):
    """Abstract interface for agent execution harnesses.

    A harness is a thin adapter that maps a specific agent framework's
    events onto the uniform Flow Event format consumed by the CLI/UI.

    Concrete harnesses are responsible for:
    - accepting a pre-configured agent from the underlying framework
    - executing tasks on that agent
    - translating framework-native events into Flow Events
    - managing conversation threads

    Implementations:
    - MAFHarness (flow.harness.maf): Microsoft Agent Framework
    - (Future) LangChainHarness: LangChain
    - (Future) ClaudeHarness: Claude SDK
    """

    @abstractmethod
    async def run(self, task: str, thread_id: str | None = None) -> str:
        """Execute a task to completion and return the final response text.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Returns:
            The agent's final response text
        """
        ...

    @abstractmethod
    def run_stream(self, task: str, thread_id: str | None = None) -> AsyncIterator[Event]:
        """Execute a task, yielding events as the agent works.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Yields:
            Event objects describing agent activity
        """
        ...

    @abstractmethod
    def register_tools(self, tools: list[Callable[..., Coroutine[Any, Any, str]]]) -> None:
        """Make the given tool functions available to the agent.

        Args:
            tools: List of tool functions to register
        """
        ...

    @abstractmethod
    def get_thread_id(self) -> str:
        """Return the ID of the current conversation thread."""
        ...

    @abstractmethod
    async def close(self) -> None:
        """Release any resources held by the harness."""
        ...
|
src/flow/harness/maf/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Microsoft Agent Framework harness module.
|
| 2 |
+
|
| 3 |
+
Provides integration with Microsoft Agent Framework for running Flow agents.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from flow.harness.maf.agent import create_agent
|
| 7 |
+
from flow.harness.maf.harness import MAFHarness
|
| 8 |
+
from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"create_agent",
|
| 12 |
+
"HeadTailCompactingChatMessageStore",
|
| 13 |
+
"MAFHarness",
|
| 14 |
+
]
|
src/flow/harness/maf/agent.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agent factory for Microsoft Agent Framework.
|
| 2 |
+
|
| 3 |
+
Provides factory functions to create configured ChatAgent instances.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
from collections.abc import Callable, Coroutine, Sequence
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import TYPE_CHECKING, Any
|
| 11 |
+
|
| 12 |
+
from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
|
| 13 |
+
from flow.prompts import FLOW_AGENT_INSTRUCTIONS
|
| 14 |
+
from flow.tools import create_all_tools
|
| 15 |
+
|
| 16 |
+
if TYPE_CHECKING:
|
| 17 |
+
from agent_framework import ChatAgent
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# Default paths
|
| 22 |
+
DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
|
| 23 |
+
DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def create_agent(
    *,
    # Model/API configuration
    endpoint: str | None = None,
    api_key: str | None = None,
    deployment: str | None = None,
    api_version: str = "2024-02-15-preview",
    # Agent configuration
    name: str = "Flow",
    instructions: str | None = None,
    # Workspace configuration
    workspace: Path | None = None,
    memory_path: Path | None = None,
    # Tool configuration
    tools: Sequence[Callable[..., Coroutine[Any, Any, str]]] | None = None,
    enable_memory_tool: bool = True,
    enable_sub_agent: bool = False,
    bash_timeout: int = 120,
    # Context engineering
    enable_compaction: bool = True,
    compaction_head_size: int = 10,
    compaction_tail_size: int = 40,
) -> "ChatAgent":
    """Build a fully configured Microsoft Agent Framework ChatAgent for Flow.

    The resulting agent is backed by Azure OpenAI and equipped with:
    - Flow's standard tool set (coding, execution, memory)
    - optional head+tail message compaction for long conversations
    - an optional agent-managed memory tool
    - an optional sub-agent tool for isolated research

    Args:
        endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
        api_key: Azure OpenAI API key. Defaults to AZURE_OPENAI_API_KEY env var.
        deployment: Azure OpenAI deployment name. Defaults to AZURE_OPENAI_DEPLOYMENT env var.
        api_version: Azure OpenAI API version.
        name: Agent name.
        instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
        workspace: Directory for file operations. Defaults to ~/.flow/workspace.
        memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
        tools: Custom tools to use. If None, creates standard Flow tools.
        enable_memory_tool: Whether to include the memory tool (default: True).
        enable_sub_agent: Whether to include the sub-agent tool (default: False).
        bash_timeout: Timeout for bash commands in seconds.
        enable_compaction: Whether to enable head+tail message compaction.
        compaction_head_size: Number of initial messages to keep.
        compaction_tail_size: Number of recent messages to keep.

    Returns:
        Configured ChatAgent instance.

    Raises:
        ImportError: If agent_framework is not installed.
        ValueError: If required Azure OpenAI credentials are missing.

    Example:
        >>> from flow.harness.maf import create_agent
        >>> agent = create_agent()
        >>> thread = agent.get_new_thread()
        >>> response = await agent.run("Create a hello world script", thread=thread)
    """
    # Import lazily so the rest of flow works without the framework installed.
    try:
        from agent_framework import ChatAgent, ai_function
        from agent_framework.azure import AzureOpenAIChatClient
    except ImportError as e:
        raise ImportError(
            "Microsoft Agent Framework is required. "
            "Install with: pip install agent-framework-core"
        ) from e

    # Fall back to environment variables for any credential not passed in.
    endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
    api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
    deployment = deployment or os.environ.get("AZURE_OPENAI_DEPLOYMENT")

    # Validate all three credentials, naming whichever one is missing.
    for value, label, env_var, param in (
        (endpoint, "endpoint", "AZURE_OPENAI_ENDPOINT", "endpoint"),
        (api_key, "API key", "AZURE_OPENAI_API_KEY", "api_key"),
        (deployment, "deployment", "AZURE_OPENAI_DEPLOYMENT", "deployment"),
    ):
        if not value:
            raise ValueError(
                f"Azure OpenAI {label} is required. "
                f"Set {env_var} or pass {param} parameter."
            )

    # Resolve and create the working directories.
    workspace = workspace or DEFAULT_WORKSPACE
    memory_path = memory_path or DEFAULT_MEMORY_PATH
    workspace.mkdir(parents=True, exist_ok=True)
    memory_path.mkdir(parents=True, exist_ok=True)

    # Build the default tool set unless the caller supplied one.
    if tools is None:
        tools = create_all_tools(
            workspace=workspace,
            memory_path=memory_path,
            bash_timeout=bash_timeout,
            enable_memory_tool=enable_memory_tool,
            enable_sub_agent=enable_sub_agent,
        )

    # Wrap each tool with ai_function so Agent Framework can invoke it;
    # tools may carry explicit _tool_name/_tool_description attributes.
    wrapped_tools = [
        ai_function(
            name=getattr(fn, "_tool_name", fn.__name__),
            description=getattr(fn, "_tool_description", fn.__doc__ or ""),
        )(fn)
        for fn in tools
    ]

    # Azure OpenAI backend.
    chat_client = AzureOpenAIChatClient(
        api_key=api_key,
        endpoint=endpoint,
        deployment=deployment,
        api_version=api_version,
    )

    # Optional head+tail compaction: each thread gets its own store instance.
    message_store_factory = None
    if enable_compaction:
        def _make_store() -> HeadTailCompactingChatMessageStore:
            return HeadTailCompactingChatMessageStore(
                head_size=compaction_head_size,
                tail_size=compaction_tail_size,
            )

        message_store_factory = _make_store
        logger.debug(
            f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
        )

    return ChatAgent(
        name=name,
        description="Autonomous coding agent",
        instructions=instructions or FLOW_AGENT_INSTRUCTIONS,
        chat_client=chat_client,
        tools=wrapped_tools,
        chat_message_store_factory=message_store_factory,
    )
|
src/flow/harness/maf/harness.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Microsoft Agent Framework harness.
|
| 2 |
+
|
| 3 |
+
A thin adapter that converts Agent Framework events to the uniform Flow Event format.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import uuid
|
| 8 |
+
from collections.abc import AsyncIterator
|
| 9 |
+
from typing import TYPE_CHECKING, Any
|
| 10 |
+
|
| 11 |
+
from flow.harness.base import BaseHarness, Event, EventType
|
| 12 |
+
|
| 13 |
+
if TYPE_CHECKING:
|
| 14 |
+
from agent_framework import ChatAgent
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
# Module-level guard so instrumentation is only switched on once per process.
_instrumentation_enabled = False


def _enable_instrumentation() -> None:
    """Switch on OpenTelemetry instrumentation for Agent Framework.

    Invoked when the first harness is constructed; subsequent calls are
    no-ops. Instrumentation allows trace collection for experiments.
    Failures are logged at debug level and never propagate.
    """
    global _instrumentation_enabled
    if _instrumentation_enabled:
        return

    try:
        from agent_framework.observability import enable_instrumentation

        enable_instrumentation()
        _instrumentation_enabled = True
        logger.debug("Agent Framework instrumentation enabled")
    except ImportError:
        logger.debug("Agent Framework observability not available")
    except Exception as e:
        logger.debug(f"Could not enable instrumentation: {e}")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class MAFHarness(BaseHarness):
|
| 44 |
+
"""Harness adapter for Microsoft Agent Framework.
|
| 45 |
+
|
| 46 |
+
This adapter:
|
| 47 |
+
1. Takes a ChatAgent (or creates one with default settings)
|
| 48 |
+
2. Runs tasks on the agent
|
| 49 |
+
3. Converts Agent Framework events to uniform Flow Events
|
| 50 |
+
|
| 51 |
+
Example:
|
| 52 |
+
>>> from flow.harness.maf import MAFHarness
|
| 53 |
+
>>> # Simple usage - creates agent with defaults
|
| 54 |
+
>>> harness = MAFHarness()
|
| 55 |
+
>>> async for event in harness.run_stream("Create a hello world script"):
|
| 56 |
+
... print(event)
|
| 57 |
+
|
| 58 |
+
>>> # Or with custom agent
|
| 59 |
+
>>> from flow.harness.maf import create_agent
|
| 60 |
+
>>> agent = create_agent(enable_compaction=False)
|
| 61 |
+
>>> harness = MAFHarness(agent)
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
def __init__(
    self,
    agent: "ChatAgent | None" = None,
    **create_agent_kwargs: Any,
) -> None:
    """Initialize the harness.

    Args:
        agent: Optional ChatAgent instance. If not provided, creates one
            using create_agent() with the given kwargs.
        **create_agent_kwargs: Passed to create_agent() if agent is None.
            Common options: workspace, memory_path,
            enable_compaction, enable_memory_tool.
    """
    if agent is None:
        # Lazy import so constructing with an explicit agent avoids the dependency.
        from flow.harness.maf.agent import create_agent
        agent = create_agent(**create_agent_kwargs)
    self._agent: ChatAgent = agent  # type: ignore[assignment]
    self._thread: Any = None  # AgentThread for conversation continuity
    self._thread_id: str | None = None
    # Track tool calls we've seen to avoid duplicate TOOL_CALL_START events
    self._seen_tool_calls: set[str] = set()

    # Enable OpenTelemetry instrumentation for trace collection
    _enable_instrumentation()
|
| 89 |
+
|
| 90 |
+
def register_tools(self, tools: list[Any]) -> None:
    """Interface-compatibility stub; MAF tools are fixed at agent creation.

    For MAFHarness the tool set must be supplied to create_agent() when the
    underlying ChatAgent is built. Calling this afterwards has no effect
    beyond emitting a warning.

    Args:
        tools: List of tool functions (ignored - configure via create_agent)
    """
    logger.warning(
        "MAFHarness.register_tools() called but tools should be configured "
        "via create_agent(). These tools will be ignored."
    )
|
| 104 |
+
|
| 105 |
+
async def run(self, task: str, thread_id: str | None = None) -> str:
|
| 106 |
+
"""Run a task and return the final response.
|
| 107 |
+
|
| 108 |
+
Args:
|
| 109 |
+
task: The task/prompt to execute
|
| 110 |
+
thread_id: Optional thread ID for conversation continuity
|
| 111 |
+
|
| 112 |
+
Returns:
|
| 113 |
+
The agent's final response text
|
| 114 |
+
"""
|
| 115 |
+
if thread_id:
|
| 116 |
+
self._thread_id = thread_id
|
| 117 |
+
|
| 118 |
+
# Get or create an AgentThread for conversation continuity
|
| 119 |
+
if self._thread is None:
|
| 120 |
+
self._thread = self._agent.get_new_thread()
|
| 121 |
+
|
| 122 |
+
response = await self._agent.run(task, thread=self._thread)
|
| 123 |
+
|
| 124 |
+
# Extract text content from response
|
| 125 |
+
content = getattr(response, "content", None)
|
| 126 |
+
if content is not None:
|
| 127 |
+
return str(content)
|
| 128 |
+
return str(response)
|
| 129 |
+
|
| 130 |
+
    async def run_stream(
        self, task: str, thread_id: str | None = None
    ) -> AsyncIterator[Event]:
        """Run a task with streaming events.

        Emits TEXT_DELTA / tool-call events as the underlying agent streams,
        followed by a terminal DONE event. If the agent raises, a single
        ERROR event is yielded instead of DONE and the exception is not
        re-raised.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Yields:
            Event objects representing agent activity
        """
        if thread_id:
            self._thread_id = thread_id

        # Get or create an AgentThread for conversation continuity
        if self._thread is None:
            self._thread = self._agent.get_new_thread()

        # Clear seen tool calls for this run so per-run dedup state is fresh
        self._seen_tool_calls.clear()

        try:
            # Check if agent supports streaming
            if hasattr(self._agent, "run_stream"):
                async for chunk in self._agent.run_stream(task, thread=self._thread):
                    # Convert agent_framework events to Flow events
                    events = self._convert_event(chunk)
                    for event in events:
                        yield event
            else:
                # Fallback: run non-streaming and emit single event
                response = await self._agent.run(task, thread=self._thread)
                response_content = getattr(response, "content", None)
                content = str(response_content) if response_content is not None else str(response)
                yield Event(type=EventType.TEXT_DONE, content=content)

            yield Event(type=EventType.DONE)

        except Exception as e:
            # Surface the failure to consumers as an ERROR event; no DONE follows.
            yield Event(type=EventType.ERROR, content=str(e))
|
| 171 |
+
|
| 172 |
+
def _convert_event(self, chunk: Any) -> list[Event]:
|
| 173 |
+
"""Convert an agent_framework event to Flow Events.
|
| 174 |
+
|
| 175 |
+
Args:
|
| 176 |
+
chunk: Event from agent_framework (AgentResponseUpdate)
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
List of converted Events (may be empty)
|
| 180 |
+
"""
|
| 181 |
+
events: list[Event] = []
|
| 182 |
+
chunk_type = type(chunk).__name__
|
| 183 |
+
|
| 184 |
+
# AgentResponseUpdate/AgentRunResponseUpdate has .contents list and .text property
|
| 185 |
+
if chunk_type in ("AgentResponseUpdate", "AgentRunResponseUpdate") or hasattr(chunk, "contents"):
|
| 186 |
+
contents = getattr(chunk, "contents", []) or []
|
| 187 |
+
|
| 188 |
+
for content in contents:
|
| 189 |
+
content_type = type(content).__name__
|
| 190 |
+
|
| 191 |
+
if content_type == "TextContent":
|
| 192 |
+
text = getattr(content, "text", "")
|
| 193 |
+
if text:
|
| 194 |
+
events.append(Event(type=EventType.TEXT_DELTA, content=text))
|
| 195 |
+
|
| 196 |
+
elif content_type == "FunctionCallContent":
|
| 197 |
+
# Streaming pattern:
|
| 198 |
+
# - First chunk has call_id and name set, arguments=''
|
| 199 |
+
# - Subsequent chunks have empty call_id/name, just argument fragments
|
| 200 |
+
call_id = getattr(content, "call_id", "") or ""
|
| 201 |
+
name = getattr(content, "name", "") or ""
|
| 202 |
+
args = getattr(content, "arguments", "") or ""
|
| 203 |
+
|
| 204 |
+
if call_id and name:
|
| 205 |
+
# First chunk - emit TOOL_CALL_START
|
| 206 |
+
self._seen_tool_calls.add(call_id)
|
| 207 |
+
events.append(Event(
|
| 208 |
+
type=EventType.TOOL_CALL_START,
|
| 209 |
+
tool_name=name,
|
| 210 |
+
tool_call_id=call_id,
|
| 211 |
+
))
|
| 212 |
+
elif args:
|
| 213 |
+
# Argument fragment - emit as TOOL_CALL_ARGS
|
| 214 |
+
events.append(Event(
|
| 215 |
+
type=EventType.TOOL_CALL_ARGS,
|
| 216 |
+
content=args,
|
| 217 |
+
))
|
| 218 |
+
|
| 219 |
+
elif content_type == "FunctionResultContent":
|
| 220 |
+
result = getattr(content, "result", "")
|
| 221 |
+
call_id = getattr(content, "call_id", None)
|
| 222 |
+
events.append(Event(
|
| 223 |
+
type=EventType.TOOL_RESULT,
|
| 224 |
+
content=str(result),
|
| 225 |
+
tool_call_id=call_id,
|
| 226 |
+
))
|
| 227 |
+
# Emit TOOL_CALL_DONE after result
|
| 228 |
+
events.append(Event(type=EventType.TOOL_CALL_DONE))
|
| 229 |
+
|
| 230 |
+
# If no contents but has text, use that
|
| 231 |
+
if not events and hasattr(chunk, "text"):
|
| 232 |
+
text = chunk.text
|
| 233 |
+
if text:
|
| 234 |
+
events.append(Event(type=EventType.TEXT_DELTA, content=text))
|
| 235 |
+
|
| 236 |
+
# Fallback for other chunk types
|
| 237 |
+
elif hasattr(chunk, "text"):
|
| 238 |
+
text = chunk.text
|
| 239 |
+
if text:
|
| 240 |
+
events.append(Event(type=EventType.TEXT_DELTA, content=text))
|
| 241 |
+
|
| 242 |
+
return events
|
| 243 |
+
|
| 244 |
+
def get_thread_id(self) -> str:
|
| 245 |
+
"""Get the current thread ID.
|
| 246 |
+
|
| 247 |
+
Returns:
|
| 248 |
+
The current conversation thread ID
|
| 249 |
+
"""
|
| 250 |
+
if self._thread_id is None:
|
| 251 |
+
self._thread_id = str(uuid.uuid4())
|
| 252 |
+
return self._thread_id
|
| 253 |
+
|
| 254 |
+
async def close(self) -> None:
|
| 255 |
+
"""Clean up resources used by the harness."""
|
| 256 |
+
# Agent Framework doesn't require explicit cleanup
|
| 257 |
+
self._thread = None
|
| 258 |
+
self._thread_id = None
|
src/flow/harness/maf/message_store.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Message store implementations for Microsoft Agent Framework.
|
| 2 |
+
|
| 3 |
+
Provides ChatMessageStoreProtocol implementations for context management.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from collections.abc import MutableMapping, Sequence
|
| 7 |
+
from typing import TYPE_CHECKING, Any
|
| 8 |
+
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from agent_framework import ChatMessage
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HeadTailCompactingChatMessageStore:
|
| 14 |
+
"""A compacting message store that works directly with Agent Framework ChatMessage.
|
| 15 |
+
|
| 16 |
+
This store implements ChatMessageStoreProtocol and keeps the first N messages
|
| 17 |
+
(head) and last M messages (tail), dropping middle messages to prevent
|
| 18 |
+
context overflow in long conversations.
|
| 19 |
+
|
| 20 |
+
IMPORTANT: This store preserves full ChatMessage objects including:
|
| 21 |
+
- FunctionCallContent (tool calls)
|
| 22 |
+
- FunctionResultContent (tool results)
|
| 23 |
+
- All other content types
|
| 24 |
+
|
| 25 |
+
This is critical because OpenAI's API requires tool results to immediately
|
| 26 |
+
follow their corresponding tool calls.
|
| 27 |
+
|
| 28 |
+
The compaction strategy:
|
| 29 |
+
- Keeps the first N messages (task context, initial instructions)
|
| 30 |
+
- Keeps the last M messages (recent work, current state)
|
| 31 |
+
- Drops middle messages to prevent context overflow
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
def __init__(
|
| 35 |
+
self,
|
| 36 |
+
messages: Sequence["ChatMessage"] | None = None,
|
| 37 |
+
head_size: int = 10,
|
| 38 |
+
tail_size: int = 40,
|
| 39 |
+
) -> None:
|
| 40 |
+
"""Initialize the compacting store.
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
messages: Initial messages to store
|
| 44 |
+
head_size: Number of initial messages to keep
|
| 45 |
+
tail_size: Number of recent messages to keep
|
| 46 |
+
"""
|
| 47 |
+
if head_size < 0:
|
| 48 |
+
raise ValueError("head_size must be non-negative")
|
| 49 |
+
if tail_size < 0:
|
| 50 |
+
raise ValueError("tail_size must be non-negative")
|
| 51 |
+
|
| 52 |
+
self._messages: list["ChatMessage"] = list(messages) if messages else []
|
| 53 |
+
self._head_size = head_size
|
| 54 |
+
self._tail_size = tail_size
|
| 55 |
+
|
| 56 |
+
@property
|
| 57 |
+
def head_size(self) -> int:
|
| 58 |
+
"""Number of messages kept from the beginning."""
|
| 59 |
+
return self._head_size
|
| 60 |
+
|
| 61 |
+
@property
|
| 62 |
+
def tail_size(self) -> int:
|
| 63 |
+
"""Number of messages kept from the end."""
|
| 64 |
+
return self._tail_size
|
| 65 |
+
|
| 66 |
+
@property
|
| 67 |
+
def total_messages(self) -> int:
|
| 68 |
+
"""Total number of messages stored (before compaction)."""
|
| 69 |
+
return len(self._messages)
|
| 70 |
+
|
| 71 |
+
@property
|
| 72 |
+
def compacted_count(self) -> int:
|
| 73 |
+
"""Number of messages that would be returned by list_messages()."""
|
| 74 |
+
total = len(self._messages)
|
| 75 |
+
max_kept = self._head_size + self._tail_size
|
| 76 |
+
return min(total, max_kept)
|
| 77 |
+
|
| 78 |
+
@property
|
| 79 |
+
def dropped_count(self) -> int:
|
| 80 |
+
"""Number of messages dropped during compaction."""
|
| 81 |
+
return max(0, self.total_messages - self.compacted_count)
|
| 82 |
+
|
| 83 |
+
async def add_messages(self, messages: Sequence["ChatMessage"]) -> None:
|
| 84 |
+
"""Add messages to the store.
|
| 85 |
+
|
| 86 |
+
Messages are stored as-is, preserving all content types.
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
messages: Sequence of ChatMessage objects to add
|
| 90 |
+
"""
|
| 91 |
+
self._messages.extend(messages)
|
| 92 |
+
|
| 93 |
+
async def list_messages(self) -> list["ChatMessage"]:
|
| 94 |
+
"""Get messages with head+tail compaction applied.
|
| 95 |
+
|
| 96 |
+
Returns the first head_size messages plus the last tail_size messages.
|
| 97 |
+
If total messages <= head_size + tail_size, returns all messages.
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
List of ChatMessage objects after compaction
|
| 101 |
+
"""
|
| 102 |
+
total = len(self._messages)
|
| 103 |
+
max_kept = self._head_size + self._tail_size
|
| 104 |
+
|
| 105 |
+
# No compaction needed
|
| 106 |
+
if total <= max_kept:
|
| 107 |
+
return list(self._messages)
|
| 108 |
+
|
| 109 |
+
# Return head + tail
|
| 110 |
+
head = self._messages[: self._head_size]
|
| 111 |
+
tail = self._messages[-self._tail_size :] if self._tail_size > 0 else []
|
| 112 |
+
|
| 113 |
+
return head + tail
|
| 114 |
+
|
| 115 |
+
@classmethod
|
| 116 |
+
async def deserialize(
|
| 117 |
+
cls,
|
| 118 |
+
serialized_store_state: MutableMapping[str, Any],
|
| 119 |
+
**kwargs: Any,
|
| 120 |
+
) -> "HeadTailCompactingChatMessageStore":
|
| 121 |
+
"""Create store from serialized state."""
|
| 122 |
+
from agent_framework import ChatMessage
|
| 123 |
+
|
| 124 |
+
head_size = kwargs.get("head_size", serialized_store_state.get("head_size", 10))
|
| 125 |
+
tail_size = kwargs.get("tail_size", serialized_store_state.get("tail_size", 40))
|
| 126 |
+
|
| 127 |
+
messages_data = serialized_store_state.get("messages", [])
|
| 128 |
+
messages = [
|
| 129 |
+
ChatMessage.from_dict(m) if isinstance(m, dict) else m
|
| 130 |
+
for m in messages_data
|
| 131 |
+
]
|
| 132 |
+
|
| 133 |
+
return cls(messages=messages, head_size=head_size, tail_size=tail_size)
|
| 134 |
+
|
| 135 |
+
async def update_from_state(
|
| 136 |
+
self,
|
| 137 |
+
serialized_store_state: MutableMapping[str, Any],
|
| 138 |
+
**kwargs: Any,
|
| 139 |
+
) -> None:
|
| 140 |
+
"""Update store from serialized state."""
|
| 141 |
+
from agent_framework import ChatMessage
|
| 142 |
+
|
| 143 |
+
if not serialized_store_state:
|
| 144 |
+
return
|
| 145 |
+
|
| 146 |
+
messages_data = serialized_store_state.get("messages", [])
|
| 147 |
+
self._messages = [
|
| 148 |
+
ChatMessage.from_dict(m) if isinstance(m, dict) else m
|
| 149 |
+
for m in messages_data
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
if "head_size" in serialized_store_state:
|
| 153 |
+
self._head_size = serialized_store_state["head_size"]
|
| 154 |
+
if "tail_size" in serialized_store_state:
|
| 155 |
+
self._tail_size = serialized_store_state["tail_size"]
|
| 156 |
+
|
| 157 |
+
async def serialize(self, **kwargs: Any) -> dict[str, Any]:
|
| 158 |
+
"""Serialize the store state.
|
| 159 |
+
|
| 160 |
+
Serializes ALL messages (not just compacted view) plus configuration.
|
| 161 |
+
"""
|
| 162 |
+
return {
|
| 163 |
+
"messages": [m.to_dict() for m in self._messages],
|
| 164 |
+
"head_size": self._head_size,
|
| 165 |
+
"tail_size": self._tail_size,
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
@property
|
| 169 |
+
def stats(self) -> dict[str, int]:
|
| 170 |
+
"""Get compaction statistics."""
|
| 171 |
+
return {
|
| 172 |
+
"total_messages": self.total_messages,
|
| 173 |
+
"compacted_count": self.compacted_count,
|
| 174 |
+
"dropped_count": self.dropped_count,
|
| 175 |
+
"head_size": self._head_size,
|
| 176 |
+
"tail_size": self._tail_size,
|
| 177 |
+
}
|
src/flow/prompts.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System prompts for the Flow agent.
|
| 2 |
+
|
| 3 |
+
Defines the structured workflow for software engineering tasks.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
FLOW_AGENT_INSTRUCTIONS = """
|
| 7 |
+
You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
|
| 8 |
+
|
| 9 |
+
## CORE PRINCIPLE: BE AUTONOMOUS
|
| 10 |
+
|
| 11 |
+
**You are NOT just an assistant that tells users what to do. You ARE the one who does it.**
|
| 12 |
+
|
| 13 |
+
When asked to solve a task:
|
| 14 |
+
1. **DO IT YOURSELF** - Don't tell the user to run commands. Run them yourself.
|
| 15 |
+
2. **COMPLETE THE LOOP** - Write code AND execute it. Don't stop at writing.
|
| 16 |
+
3. **VERIFY YOUR WORK** - Test that it actually works before reporting done.
|
| 17 |
+
4. **ITERATE ON FAILURES** - If something fails, fix it and try again.
|
| 18 |
+
|
| 19 |
+
**Example - BAD (passive):**
|
| 20 |
+
> "Here's the code. You can run it with `python script.py`"
|
| 21 |
+
|
| 22 |
+
**Example - GOOD (autonomous):**
|
| 23 |
+
> *writes code* → *executes code* → *sees output* → *fixes any errors*
|
| 24 |
+
> → "Done! The script ran successfully and output X."
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
## YOUR CAPABILITIES
|
| 29 |
+
|
| 30 |
+
**Coding Tools:**
|
| 31 |
+
- `read_file`: Read file contents with line numbers
|
| 32 |
+
- `write_file`: Create/edit files (full write, str_replace, or insert_at_line)
|
| 33 |
+
- `list_directory`: Explore project structure
|
| 34 |
+
- `grep_search`: Search for patterns in code (regex supported)
|
| 35 |
+
|
| 36 |
+
**Execution Tools:**
|
| 37 |
+
- `bash_execute`: Run shell commands (tests, git, npm, pip, builds, etc.)
|
| 38 |
+
- `python_repl`: Execute Python code snippets for quick validation
|
| 39 |
+
|
| 40 |
+
**Research Tools (if available):**
|
| 41 |
+
- `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
|
| 42 |
+
- `web_fetch`: Fetch and read content from URLs
|
| 43 |
+
|
| 44 |
+
**Memory Tools:**
|
| 45 |
+
- `memory`: Persistent storage that survives across conversations
|
| 46 |
+
- view: See directory or file contents
|
| 47 |
+
- create: Create new files
|
| 48 |
+
- str_replace: Edit existing files
|
| 49 |
+
- append: Add to files
|
| 50 |
+
- search: Find text across memory
|
| 51 |
+
- delete: Remove files
|
| 52 |
+
|
| 53 |
+
**Thinking Tools:**
|
| 54 |
+
- `think`: Pause to reason through complex problems
|
| 55 |
+
- `task_done`: Report when task is complete or blocked
|
| 56 |
+
|
| 57 |
+
**Skills Tool (if available):**
|
| 58 |
+
- `skills`: Discover and load domain-specific expertise
|
| 59 |
+
- `skills(action='list')`: See available skills with descriptions
|
| 60 |
+
- `skills(action='load', name='skill-name')`: Load full skill content
|
| 61 |
+
|
| 62 |
+
---
|
| 63 |
+
|
| 64 |
+
## WORKFLOW
|
| 65 |
+
|
| 66 |
+
### 1. UNDERSTAND
|
| 67 |
+
- Read the user's request carefully
|
| 68 |
+
- **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
|
| 69 |
+
- Use `list_directory` to understand the workspace structure
|
| 70 |
+
- Use `grep_search` to find relevant existing code
|
| 71 |
+
- Check memory for relevant patterns: `memory(command="view", path="/memory")`
|
| 72 |
+
|
| 73 |
+
### 2. PLAN
|
| 74 |
+
- Use `think` tool to plan your approach for complex tasks
|
| 75 |
+
- Break down into small, testable steps
|
| 76 |
+
- Consider edge cases and error handling
|
| 77 |
+
|
| 78 |
+
### 3. EXECUTE
|
| 79 |
+
- Create/edit files using `write_file`
|
| 80 |
+
- Test changes using `bash_execute` or `python_repl`
|
| 81 |
+
- Fix issues immediately when tests fail
|
| 82 |
+
|
| 83 |
+
### 4. VERIFY (REQUIRED)
|
| 84 |
+
**You MUST test your work before calling `task_done`.** Never assume code works.
|
| 85 |
+
|
| 86 |
+
**For Python apps/scripts:**
|
| 87 |
+
```
|
| 88 |
+
bash_execute("cd project && python -c 'import main'") # Check imports work
|
| 89 |
+
bash_execute("cd project && python main.py --help") # Test CLI if applicable
|
| 90 |
+
bash_execute("cd project && pytest") # Run tests if they exist
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
**For JavaScript/TypeScript:**
|
| 94 |
+
```
|
| 95 |
+
bash_execute("cd project && npm install && npm run build") # Must pass!
|
| 96 |
+
bash_execute("cd project && npx tsc --noEmit") # Type check
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
**For Web APIs (FastAPI, Express, etc.):**
|
| 100 |
+
```
|
| 101 |
+
# Start server in background, test with curl, then cleanup
|
| 102 |
+
bash_execute("cd project && uvicorn main:app --port 8000 &", background=True)
|
| 103 |
+
bash_execute("sleep 2 && curl http://localhost:8000/health") # Test endpoint
|
| 104 |
+
bash_execute("check_processes action=list") # Verify it's running
|
| 105 |
+
# When done testing, kill the process
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
**For Frontend apps (React, Vue, etc.):**
|
| 109 |
+
```
|
| 110 |
+
bash_execute("cd project && npm run build") # Production build must succeed
|
| 111 |
+
# If you need to test dev server, use background=True
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
**For full-stack apps:**
|
| 115 |
+
1. Test backend API with curl (start in background)
|
| 116 |
+
2. Test frontend build succeeds
|
| 117 |
+
3. Clean up background processes when done
|
| 118 |
+
|
| 119 |
+
### 5. COMPLETE
|
| 120 |
+
- Clean up any background processes you started
|
| 121 |
+
- Call `task_done` with status and summary
|
| 122 |
+
- Include files created and suggested next steps
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## WORKSPACE
|
| 127 |
+
|
| 128 |
+
Your workspace is at `~/.flow/workspace/`
|
| 129 |
+
|
| 130 |
+
**Organization:**
|
| 131 |
+
- Create a folder for each project (e.g., `todo_app/`, `calculator/`)
|
| 132 |
+
- Use `list_directory` to see existing projects before creating new ones
|
| 133 |
+
- Follow standard project structure conventions:
|
| 134 |
+
- Python: `src/`, `tests/`, `requirements.txt` or `pyproject.toml`
|
| 135 |
+
- JavaScript: `src/`, `package.json`, standard Node.js layout
|
| 136 |
+
- Full-stack: `backend/`, `frontend/` folders
|
| 137 |
+
|
| 138 |
+
**Important:**
|
| 139 |
+
- Each `bash_execute` runs from workspace root in a fresh shell
|
| 140 |
+
- Use `cd project && command` for commands in subdirectories
|
| 141 |
+
- Multiple commands: `cd project && cmd1 && cmd2`
|
| 142 |
+
|
| 143 |
+
---
|
| 144 |
+
|
| 145 |
+
## MEMORY
|
| 146 |
+
|
| 147 |
+
Your memory persists at `~/.flow/memory/`
|
| 148 |
+
|
| 149 |
+
**Recommended structure:**
|
| 150 |
+
- `/memory/patterns/` - Reusable solutions and code patterns
|
| 151 |
+
- `/memory/projects/` - Per-project context and notes
|
| 152 |
+
- `/memory/decisions/` - Why you made certain choices
|
| 153 |
+
|
| 154 |
+
**Best practices:**
|
| 155 |
+
When storing information, include context:
|
| 156 |
+
- **Date**: When was this created/learned?
|
| 157 |
+
- **Project**: What project did this come from?
|
| 158 |
+
- **Context**: Why was this approach chosen?
|
| 159 |
+
|
| 160 |
+
**Example pattern file** (`/memory/patterns/fastapi_cors.md`):
|
| 161 |
+
```markdown
|
| 162 |
+
# FastAPI CORS Setup
|
| 163 |
+
Created: 2025-01-15
|
| 164 |
+
Source: sleep_tracker project
|
| 165 |
+
|
| 166 |
+
## Pattern
|
| 167 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 168 |
+
app.add_middleware(
|
| 169 |
+
CORSMiddleware,
|
| 170 |
+
allow_origins=["*"],
|
| 171 |
+
allow_methods=["*"],
|
| 172 |
+
allow_headers=["*"],
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
## When to use
|
| 176 |
+
- Full-stack apps with separate frontend/backend
|
| 177 |
+
- Frontend on different port than backend
|
| 178 |
+
|
| 179 |
+
## Notes
|
| 180 |
+
- Must add before routes
|
| 181 |
+
- Restrict origins in production
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
**Check memory first** - you may have solved similar problems before!
|
| 185 |
+
|
| 186 |
+
---
|
| 187 |
+
|
| 188 |
+
## CLI TOOLS
|
| 189 |
+
|
| 190 |
+
Many CLI tools have interactive prompts that will hang.
|
| 191 |
+
ALWAYS use non-interactive flags:
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
# Good
|
| 195 |
+
npm create vite@latest myapp -- --template react-ts
|
| 196 |
+
pip install -q package
|
| 197 |
+
npx shadcn@latest init --defaults --yes
|
| 198 |
+
|
| 199 |
+
# Bad (will hang)
|
| 200 |
+
npm create vite@latest myapp # Interactive prompts
|
| 201 |
+
npx shadcn init # Interactive prompts
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
**Shadcn UI** is a CLI tool, not an npm package:
|
| 205 |
+
```bash
|
| 206 |
+
# Wrong
|
| 207 |
+
npm install @shadcn/ui
|
| 208 |
+
|
| 209 |
+
# Right
|
| 210 |
+
npx shadcn@latest init --defaults --yes
|
| 211 |
+
npx shadcn@latest add button card --yes
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
---
|
| 215 |
+
|
| 216 |
+
## FULL-STACK APPS
|
| 217 |
+
|
| 218 |
+
When building apps with separate frontend and backend:
|
| 219 |
+
|
| 220 |
+
1. **Always add CORS to backend:**
|
| 221 |
+
```python
|
| 222 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 223 |
+
app.add_middleware(
|
| 224 |
+
CORSMiddleware,
|
| 225 |
+
allow_origins=["*"], # Restrict in production
|
| 226 |
+
allow_methods=["*"],
|
| 227 |
+
allow_headers=["*"],
|
| 228 |
+
)
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
2. **Document which ports each server uses**
|
| 232 |
+
|
| 233 |
+
3. **Verify both sides build/run:**
|
| 234 |
+
```bash
|
| 235 |
+
cd backend && python -c "from main import app; print('Backend OK')"
|
| 236 |
+
cd frontend && npm run build && echo "Frontend OK"
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
---
|
| 240 |
+
|
| 241 |
+
## BACKGROUND PROCESSES
|
| 242 |
+
|
| 243 |
+
When you need to start long-running processes (servers, watchers, etc.):
|
| 244 |
+
|
| 245 |
+
**Use `background=True` parameter:**
|
| 246 |
+
```python
|
| 247 |
+
# Start a server in background - returns immediately with PID
|
| 248 |
+
bash_execute("uvicorn main:app --port 8000", background=True)
|
| 249 |
+
|
| 250 |
+
# Then test it
|
| 251 |
+
bash_execute("curl http://localhost:8000/health")
|
| 252 |
+
|
| 253 |
+
# Check what's running
|
| 254 |
+
check_processes(action="list")
|
| 255 |
+
|
| 256 |
+
# Clean up when done
|
| 257 |
+
check_processes(action="kill", pid=12345)
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
**Process registry** is at `/memory/processes.md` - view it with:
|
| 261 |
+
`memory(command='view', path='/memory/processes.md')`
|
| 262 |
+
|
| 263 |
+
**IMPORTANT:**
|
| 264 |
+
- NEVER start servers without `background=True` - they will timeout after 120s
|
| 265 |
+
- ALWAYS clean up background processes when done testing
|
| 266 |
+
- Check for port conflicts before starting servers
|
| 267 |
+
|
| 268 |
+
**Common patterns:**
|
| 269 |
+
```bash
|
| 270 |
+
# Good - background server for testing
|
| 271 |
+
bash_execute("cd backend && uvicorn main:app --port 8000", background=True)
|
| 272 |
+
bash_execute("sleep 2") # Wait for startup
|
| 273 |
+
bash_execute("curl localhost:8000/docs") # Test
|
| 274 |
+
check_processes(action="cleanup") # Kill all when done
|
| 275 |
+
|
| 276 |
+
# Bad - will timeout!
|
| 277 |
+
bash_execute("uvicorn main:app --port 8000") # Blocks forever
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
---
|
| 281 |
+
|
| 282 |
+
## ERROR HANDLING
|
| 283 |
+
|
| 284 |
+
- If a command fails, analyze the error and try alternatives
|
| 285 |
+
- Log failures and solutions to memory for future reference
|
| 286 |
+
- Don't give up after first failure - iterate
|
| 287 |
+
- If truly blocked, call `task_done` with status="incomplete" and explain why
|
| 288 |
+
|
| 289 |
+
---
|
| 290 |
+
|
| 291 |
+
## SKILLS
|
| 292 |
+
|
| 293 |
+
**If the `skills` tool is available**, use it to access domain-specific expertise:
|
| 294 |
+
|
| 295 |
+
```python
|
| 296 |
+
# At the start of complex tasks, discover what expertise is available
|
| 297 |
+
skills(action='list')
|
| 298 |
+
|
| 299 |
+
# Output shows available skills with descriptions:
|
| 300 |
+
# - fastapi-patterns: Build REST APIs with FastAPI...
|
| 301 |
+
# - react-components: Build React components with hooks...
|
| 302 |
+
# - testing-strategies: Write comprehensive tests...
|
| 303 |
+
|
| 304 |
+
# Load relevant skills before implementation
|
| 305 |
+
skills(action='load', name='fastapi-patterns')
|
| 306 |
+
```
|
| 307 |
+
|
| 308 |
+
**Skills provide:**
|
| 309 |
+
- Domain-specific patterns and best practices
|
| 310 |
+
- Code examples and templates
|
| 311 |
+
- Common pitfalls to avoid
|
| 312 |
+
|
| 313 |
+
**When to load skills:**
|
| 314 |
+
- Before starting a new project type (API, frontend, CLI)
|
| 315 |
+
- When working with unfamiliar frameworks
|
| 316 |
+
- For complex tasks requiring specialized knowledge
|
| 317 |
+
|
| 318 |
+
**Skills location:** `~/.flow/skills/`
|
| 319 |
+
Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
|
| 320 |
+
|
| 321 |
+
---
|
| 322 |
+
|
| 323 |
+
## COMPOSING TOOLS FOR COMPLEX TASKS
|
| 324 |
+
|
| 325 |
+
**You have all the tools needed to solve problems end-to-end. Compose them!**
|
| 326 |
+
|
| 327 |
+
### Example: "What's the weather API response for Seattle?"
|
| 328 |
+
```
|
| 329 |
+
# DON'T just tell the user how to do it. DO IT:
|
| 330 |
+
1. web_search("weather API free") → Find a free weather API
|
| 331 |
+
2. web_fetch(api_docs_url) → Read the API documentation
|
| 332 |
+
3. write_file("weather.py", code) → Write a script to call the API
|
| 333 |
+
4. bash_execute("python weather.py") → Run it and get the answer
|
| 334 |
+
5. Report the actual result to the user
|
| 335 |
+
```
|
| 336 |
+
|
| 337 |
+
### Example: "Create a CLI tool that converts CSV to JSON"
|
| 338 |
+
```
|
| 339 |
+
1. write_file("csv_to_json.py", code) → Write the tool
|
| 340 |
+
2. write_file("test.csv", sample_data) → Create test data
|
| 341 |
+
3. bash_execute("python csv_to_json.py test.csv") → Test it works
|
| 342 |
+
4. bash_execute("cat output.json") → Verify the output
|
| 343 |
+
5. Report success with example output
|
| 344 |
+
```
|
| 345 |
+
|
| 346 |
+
### Example: "Find and summarize the latest Python 3.12 features"
|
| 347 |
+
```
|
| 348 |
+
1. web_search("Python 3.12 new features") → Find relevant pages
|
| 349 |
+
2. web_fetch(python_docs_url) → Read the official docs
|
| 350 |
+
3. Summarize findings directly OR write to a file if requested
|
| 351 |
+
```
|
| 352 |
+
|
| 353 |
+
### Example: "Debug why my FastAPI app returns 500 errors"
|
| 354 |
+
```
|
| 355 |
+
1. read_file("main.py") → Understand the code
|
| 356 |
+
2. bash_execute("cd app && python -c 'from main import app'") → Check imports
|
| 357 |
+
3. bash_execute("cd app && uvicorn main:app --port 8000", background=True) → Start server
|
| 358 |
+
4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
|
| 359 |
+
5. Analyze error → Fix code → Test again → Iterate until fixed
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
---
|
| 363 |
+
|
| 364 |
+
## RESEARCH WORKFLOW
|
| 365 |
+
|
| 366 |
+
When you need information from the web:
|
| 367 |
+
|
| 368 |
+
1. **Search first**: Use `web_search` to find relevant URLs
|
| 369 |
+
2. **Fetch details**: Use `web_fetch` to read specific pages
|
| 370 |
+
3. **Apply knowledge**: Write code, update configs, or summarize findings
|
| 371 |
+
|
| 372 |
+
**Example - Learning a new library:**
|
| 373 |
+
```python
|
| 374 |
+
# 1. Search for docs
|
| 375 |
+
web_search("httpx python async http client tutorial")
|
| 376 |
+
|
| 377 |
+
# 2. Read the documentation
|
| 378 |
+
web_fetch("https://www.python-httpx.org/quickstart/", output_format="markdown")
|
| 379 |
+
|
| 380 |
+
# 3. Write code using what you learned
|
| 381 |
+
write_file("http_client.py", '''
|
| 382 |
+
import httpx
|
| 383 |
+
async def fetch_data(url):
|
| 384 |
+
async with httpx.AsyncClient() as client:
|
| 385 |
+
return await client.get(url)
|
| 386 |
+
''')
|
| 387 |
+
|
| 388 |
+
# 4. Test it
|
| 389 |
+
python_repl("import httpx; print(httpx.__version__)")
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
+
---
|
| 393 |
+
|
| 394 |
+
## REMEMBER
|
| 395 |
+
|
| 396 |
+
1. **BE AUTONOMOUS** - Do the work yourself, don't instruct the user
|
| 397 |
+
2. **COMPLETE THE LOOP** - Write code → Execute → Verify → Report results
|
| 398 |
+
3. **COMPOSE TOOLS** - Chain multiple tools to solve complex problems
|
| 399 |
+
4. **RESEARCH WHEN NEEDED** - Use web_search/web_fetch to learn new things
|
| 400 |
+
5. **ITERATE ON FAILURES** - Don't give up, debug and fix issues
|
| 401 |
+
6. **TEST EVERYTHING** - Never assume code works
|
| 402 |
+
7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
|
| 403 |
+
8. **CLEAN UP** - Kill background processes when done
|
| 404 |
+
9. **STORE LEARNINGS** - Save patterns to memory for future use
|
| 405 |
+
|
| 406 |
+
**Your goal is to deliver RESULTS, not instructions.**
|
| 407 |
+
"""
|
src/flow/py.typed
ADDED
|
File without changes
|
src/flow/tools/__init__.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flow agent tools.
|
| 2 |
+
|
| 3 |
+
Provides coding, execution, memory, and core tools for software engineering tasks.
|
| 4 |
+
Tools are harness-agnostic - they return plain data that harnesses adapt.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import inspect
from collections.abc import Callable, Sequence
from functools import wraps
from pathlib import Path
from types import UnionType
from typing import Any, Union, get_args, get_origin, get_type_hints

from flow.tools.coding import create_coding_tools
from flow.tools.core import create_core_tools
from flow.tools.execution import create_execution_tools
from flow.tools.memory import create_memory_tool
from flow.tools.sub_agent import create_sub_agent_tool
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"create_all_tools",
|
| 21 |
+
"create_coding_tools",
|
| 22 |
+
"create_core_tools",
|
| 23 |
+
"create_execution_tools",
|
| 24 |
+
"create_memory_tool",
|
| 25 |
+
"create_sub_agent_tool",
|
| 26 |
+
"get_tool_schema",
|
| 27 |
+
"tool",
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def tool(
|
| 32 |
+
name: str | None = None,
|
| 33 |
+
description: str | None = None,
|
| 34 |
+
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
|
| 35 |
+
"""Decorator to mark a function as an agent tool.
|
| 36 |
+
|
| 37 |
+
This decorator adds metadata to functions that allows harnesses
|
| 38 |
+
to discover and use them as agent tools.
|
| 39 |
+
|
| 40 |
+
Args:
|
| 41 |
+
name: Tool name (defaults to function name)
|
| 42 |
+
description: Tool description (defaults to docstring)
|
| 43 |
+
|
| 44 |
+
Returns:
|
| 45 |
+
Decorated function with tool metadata
|
| 46 |
+
|
| 47 |
+
Example:
|
| 48 |
+
@tool(name="read_file", description="Read file contents")
|
| 49 |
+
async def read_file(path: str) -> str:
|
| 50 |
+
...
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
|
| 54 |
+
@wraps(func)
|
| 55 |
+
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
| 56 |
+
return func(*args, **kwargs)
|
| 57 |
+
|
| 58 |
+
# Store tool metadata
|
| 59 |
+
wrapper._tool_name = name or func.__name__ # type: ignore[attr-defined]
|
| 60 |
+
wrapper._tool_description = description or func.__doc__ or "" # type: ignore[attr-defined]
|
| 61 |
+
wrapper._is_tool = True # type: ignore[attr-defined]
|
| 62 |
+
|
| 63 |
+
return wrapper
|
| 64 |
+
|
| 65 |
+
return decorator
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_tool_schema(func: Callable[..., Any]) -> dict[str, Any]:
|
| 69 |
+
"""Extract JSON schema from a tool function.
|
| 70 |
+
|
| 71 |
+
Uses type hints and Annotated metadata to build the schema.
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
func: Tool function to extract schema from
|
| 75 |
+
|
| 76 |
+
Returns:
|
| 77 |
+
JSON schema dict for the tool's parameters
|
| 78 |
+
"""
|
| 79 |
+
hints = get_type_hints(func, include_extras=True)
|
| 80 |
+
sig = inspect.signature(func)
|
| 81 |
+
|
| 82 |
+
properties: dict[str, Any] = {}
|
| 83 |
+
required: list[str] = []
|
| 84 |
+
|
| 85 |
+
for param_name, param in sig.parameters.items():
|
| 86 |
+
if param_name in ("self", "cls"):
|
| 87 |
+
continue
|
| 88 |
+
|
| 89 |
+
param_schema: dict[str, Any] = {}
|
| 90 |
+
hint = hints.get(param_name, Any)
|
| 91 |
+
|
| 92 |
+
# Handle Annotated types
|
| 93 |
+
origin = getattr(hint, "__origin__", None)
|
| 94 |
+
if origin is not None:
|
| 95 |
+
# Check if it's Annotated
|
| 96 |
+
if hasattr(hint, "__metadata__"):
|
| 97 |
+
# Extract description from Annotated metadata
|
| 98 |
+
for meta in hint.__metadata__:
|
| 99 |
+
if isinstance(meta, str):
|
| 100 |
+
param_schema["description"] = meta
|
| 101 |
+
break
|
| 102 |
+
# Get the actual type
|
| 103 |
+
hint = hint.__args__[0]
|
| 104 |
+
origin = getattr(hint, "__origin__", None)
|
| 105 |
+
|
| 106 |
+
# Map Python types to JSON schema types
|
| 107 |
+
if hint is str:
|
| 108 |
+
param_schema["type"] = "string"
|
| 109 |
+
elif hint is int:
|
| 110 |
+
param_schema["type"] = "integer"
|
| 111 |
+
elif hint is float:
|
| 112 |
+
param_schema["type"] = "number"
|
| 113 |
+
elif hint is bool:
|
| 114 |
+
param_schema["type"] = "boolean"
|
| 115 |
+
elif origin is list:
|
| 116 |
+
param_schema["type"] = "array"
|
| 117 |
+
elif origin is dict:
|
| 118 |
+
param_schema["type"] = "object"
|
| 119 |
+
else:
|
| 120 |
+
param_schema["type"] = "string" # Default fallback
|
| 121 |
+
|
| 122 |
+
properties[param_name] = param_schema
|
| 123 |
+
|
| 124 |
+
# Check if parameter is required (no default value)
|
| 125 |
+
if param.default is inspect.Parameter.empty:
|
| 126 |
+
required.append(param_name)
|
| 127 |
+
|
| 128 |
+
return {
|
| 129 |
+
"type": "object",
|
| 130 |
+
"properties": properties,
|
| 131 |
+
"required": required,
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def create_all_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
    *,
    enable_memory_tool: bool = True,
    enable_sub_agent: bool = False,
    sub_agent_model: str = "gpt-4o-mini",
) -> Sequence[Callable[..., Any]]:
    """Create all standard tools for the Flow agent.

    Args:
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory
        bash_timeout: Timeout for bash commands in seconds
        enable_memory_tool: Whether to include the memory tool
        enable_sub_agent: Whether to include the sub-agent research tool
        sub_agent_model: Model to use for sub-agent (default: gpt-4o-mini)

    Returns:
        List of all tool functions
    """
    # Mandatory tool groups: coding, execution, and core metacognitive tools.
    tools: list[Callable[..., Any]] = [
        *create_coding_tools(workspace),
        *create_execution_tools(workspace, memory_path, bash_timeout),
        *create_core_tools(),
    ]

    # Agent-managed persistent memory (included by default).
    if enable_memory_tool:
        tools.append(create_memory_tool(memory_path))

    # Isolated research sub-agent (opt-in).
    if enable_sub_agent:
        tools.append(create_sub_agent_tool(workspace, model=sub_agent_model))

    return tools
|
src/flow/tools/coding.py
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Coding tools for file operations and code search.
|
| 2 |
+
|
| 3 |
+
These tools enable agents to read/write files, list directories,
|
| 4 |
+
and search for patterns in code.
|
| 5 |
+
|
| 6 |
+
The agent can read and write to any path the user has access to.
|
| 7 |
+
The workspace serves as the default working directory for relative paths.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
from collections.abc import Callable, Coroutine, Sequence
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Annotated, Any
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def create_read_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build a ``read_file`` tool bound to *workspace*.

    Args:
        workspace: Default directory for relative paths (not a restriction)
    """

    async def read_file(
        file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
        max_lines: Annotated[int, "Maximum lines to return (default: 500)"] = 500,
    ) -> str:
        """Read the contents of a file. Can read from any path on the system."""
        try:
            # Absolute paths are used as-is; relative ones anchor at the workspace.
            raw = Path(file_path)
            target = raw.resolve() if raw.is_absolute() else (workspace / file_path).resolve()

            if not target.exists():
                return f"Error: File not found: {file_path}"
            if not target.is_file():
                return f"Error: Not a file: {file_path}"

            all_lines = target.read_text(encoding="utf-8").splitlines()
            total_lines = len(all_lines)

            # Cap the output and note the truncation when the file is longer.
            shown = all_lines
            truncated_msg = ""
            if total_lines > max_lines:
                shown = all_lines[:max_lines]
                truncated_msg = f"\n... (truncated, showing first {max_lines} of {total_lines} lines)"

            # Prefix every line with a 1-based, right-aligned line number.
            body = "\n".join(f"{num:5d}: {text}" for num, text in enumerate(shown, 1))
            return f"File: {target} ({total_lines} lines)\n{'=' * 40}\n{body}{truncated_msg}"

        except UnicodeDecodeError:
            return f"Error: Cannot read file (binary or non-UTF-8): {file_path}"
        except PermissionError:
            return f"Error: Permission denied: {file_path}"
        except Exception as e:
            return f"Error reading file: {e}"

    # Attach discovery metadata for harnesses.
    read_file._tool_name = "read_file"  # type: ignore[attr-defined]
    read_file._tool_description = (  # type: ignore[attr-defined]
        "Read the contents of a file. Accepts absolute paths (e.g., /path/to/file) "
        "or relative paths (relative to workspace). Returns content with line numbers."
    )
    read_file._is_tool = True  # type: ignore[attr-defined]

    return read_file
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def create_write_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a write_file tool.

    The returned coroutine dispatches on which optional arguments are set:
    'content' (full write) takes precedence, then 'old_str'+'new_str'
    (str_replace), then 'insert_line'+'insert_content' (insert).

    Args:
        workspace: Default directory for relative paths
    """

    async def write_file(
        file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
        content: Annotated[str | None, "Full content to write (for complete file write)"] = None,
        old_str: Annotated[str | None, "Text to replace (for str_replace operation)"] = None,
        new_str: Annotated[str | None, "Replacement text (for str_replace operation)"] = None,
        insert_line: Annotated[int | None, "Line number to insert at (1-indexed)"] = None,
        insert_content: Annotated[str | None, "Content to insert at line"] = None,
    ) -> str:
        """Write or edit file content.

        Supports: (1) full file write with 'content',
        (2) str_replace to replace specific text,
        (3) insert_at_line to add content at a specific line.
        Creates parent directories if needed.
        """
        try:
            # Support both absolute and relative paths
            path = Path(file_path)
            if path.is_absolute():
                full_path = path.resolve()
            else:
                full_path = (workspace / file_path).resolve()

            # Create parent directories
            # NOTE: this side effect happens before dispatch, so directories are
            # created even when the requested operation later fails (e.g.
            # str_replace on a missing file).
            full_path.parent.mkdir(parents=True, exist_ok=True)

            # Operation 1: Full file write (takes precedence over the others)
            if content is not None:
                full_path.write_text(content, encoding="utf-8")
                return f"Successfully wrote {len(content)} characters to {file_path}"

            # Operation 2: str_replace (both old_str and new_str must be given;
            # new_str="" is allowed and deletes the matched text)
            if old_str is not None and new_str is not None:
                if not full_path.exists():
                    return f"Error: File not found for str_replace: {file_path}"

                current_content = full_path.read_text(encoding="utf-8")

                if old_str not in current_content:
                    # Show a snippet of the file to help debug
                    if len(current_content) > 500:
                        snippet = current_content[:500] + "..."
                    else:
                        snippet = current_content
                    # NOTE(review): the "..." suffix is appended even when
                    # old_str is shorter than 100 characters.
                    return (
                        f"Error: String to replace not found in file.\n"
                        f"Searching for: '{old_str[:100]}...'\n"
                        f"File content preview:\n{snippet}"
                    )

                # Replace first occurrence only
                new_content = current_content.replace(old_str, new_str, 1)
                full_path.write_text(new_content, encoding="utf-8")
                return f"Successfully replaced text in {file_path}"

            # Operation 3: insert_at_line (creates the file if it doesn't exist)
            if insert_line is not None and insert_content is not None:
                if full_path.exists():
                    current_content = full_path.read_text(encoding="utf-8")
                    # keepends=True so joining the lines reconstructs the file exactly
                    lines = current_content.splitlines(keepends=True)
                else:
                    lines = []

                # Ensure insert_content ends with newline
                if not insert_content.endswith("\n"):
                    insert_content += "\n"

                # Insert at specified line (1-indexed)
                insert_index = insert_line - 1
                if insert_index < 0:
                    return f"Error: Invalid line number: {insert_line}. Must be >= 1."

                # Allow inserting at end (line numbers past EOF clamp to append)
                if insert_index > len(lines):
                    insert_index = len(lines)

                lines.insert(insert_index, insert_content)
                new_content = "".join(lines)
                full_path.write_text(new_content, encoding="utf-8")
                return f"Successfully inserted content at line {insert_line} in {file_path}"

            # No recognized argument combination was supplied.
            return "Error: Must provide either 'content', 'old_str' + 'new_str', or 'insert_line' + 'insert_content'"

        except Exception as e:
            return f"Error writing file: {e}"

    # Add tool metadata
    write_file._tool_name = "write_file"  # type: ignore[attr-defined]
    write_file._tool_description = (  # type: ignore[attr-defined]
        "Write or edit file content. Accepts absolute paths or relative paths (relative to workspace). "
        "Supports: (1) full file write with 'content', (2) str_replace to replace specific text, "
        "(3) insert_at_line to add content at a specific line. Creates parent directories if needed."
    )
    write_file._is_tool = True  # type: ignore[attr-defined]

    return write_file
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def create_list_directory_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build a ``list_directory`` tool bound to *workspace*.

    Args:
        workspace: Default directory for relative paths (not a restriction)
    """

    async def list_directory(
        directory_path: Annotated[str, "Path to directory (absolute or relative to workspace, default: '.')"] = ".",
        recursive: Annotated[bool, "List subdirectories recursively (default: false)"] = False,
        max_entries: Annotated[int, "Maximum entries to return (default: 200)"] = 200,
    ) -> str:
        """List files and directories at a given path. Can list any directory on the system."""
        try:
            # Absolute paths are used as-is; relative ones anchor at the workspace.
            candidate = Path(directory_path)
            target = candidate.resolve() if candidate.is_absolute() else (workspace / directory_path).resolve()

            if not target.exists():
                return f"Error: Directory not found: {directory_path}"
            if not target.is_dir():
                return f"Error: Not a directory: {directory_path}"

            # Common non-essential directories skipped in recursive listings only.
            ignored = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
            entries: list[tuple[str, str, int]] = []

            walker = target.rglob("*") if recursive else target.iterdir()
            for item in walker:
                if len(entries) >= max_entries:
                    break
                if recursive and any(name in item.parts for name in ignored):
                    continue
                label = str(item.relative_to(target)) if recursive else item.name
                kind = "file" if item.is_file() else "dir"
                nbytes = item.stat().st_size if item.is_file() else 0
                entries.append((label, kind, nbytes))

            # Directories sort ahead of files; ties break alphabetically.
            entries.sort(key=lambda entry: (entry[1] != "dir", entry[0]))

            output = [f"Directory: {directory_path} ({len(entries)} entries)", "=" * 50]
            for label, kind, nbytes in entries:
                if kind == "dir":
                    output.append(f"  [DIR] {label}/")
                else:
                    size_str = f"{nbytes:,} bytes" if nbytes < 10000 else f"{nbytes / 1024:.1f} KB"
                    output.append(f"  [FILE] {label} ({size_str})")

            if len(entries) >= max_entries:
                output.append(f"\n... (truncated at {max_entries} entries)")

            return "\n".join(output)

        except Exception as e:
            return f"Error listing directory: {e}"

    # Attach discovery metadata for harnesses.
    list_directory._tool_name = "list_directory"  # type: ignore[attr-defined]
    list_directory._tool_description = (  # type: ignore[attr-defined]
        "List files and directories at a given path. Accepts absolute paths (e.g., /path/to/dir) "
        "or relative paths (relative to workspace). Returns names, types, and sizes."
    )
    list_directory._is_tool = True  # type: ignore[attr-defined]

    return list_directory
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def create_grep_search_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
|
| 265 |
+
"""Create a grep_search tool that can search any directory.
|
| 266 |
+
|
| 267 |
+
Args:
|
| 268 |
+
workspace: Default directory for relative paths (not a restriction)
|
| 269 |
+
"""
|
| 270 |
+
|
| 271 |
+
async def grep_search(
|
| 272 |
+
pattern: Annotated[str, "Pattern to search for (regex supported)"],
|
| 273 |
+
path: Annotated[str, "Path to search in (absolute or relative to workspace, default: '.')"] = ".",
|
| 274 |
+
file_pattern: Annotated[str | None, "File pattern to filter (e.g., '*.py', '*.js')"] = None,
|
| 275 |
+
case_sensitive: Annotated[bool, "Case sensitive search (default: true)"] = True,
|
| 276 |
+
max_matches: Annotated[int, "Maximum matches to return (default: 50)"] = 50,
|
| 277 |
+
) -> str:
|
| 278 |
+
"""Search for text patterns in files. Can search any path on the system."""
|
| 279 |
+
try:
|
| 280 |
+
# Support both absolute and relative paths
|
| 281 |
+
search_path = Path(path)
|
| 282 |
+
if search_path.is_absolute():
|
| 283 |
+
full_path = search_path.resolve()
|
| 284 |
+
else:
|
| 285 |
+
full_path = (workspace / path).resolve()
|
| 286 |
+
|
| 287 |
+
if not full_path.exists():
|
| 288 |
+
return f"Error: Path not found: {path}"
|
| 289 |
+
|
| 290 |
+
# Compile regex
|
| 291 |
+
flags = 0 if case_sensitive else re.IGNORECASE
|
| 292 |
+
try:
|
| 293 |
+
regex = re.compile(pattern, flags)
|
| 294 |
+
except re.error as e:
|
| 295 |
+
return f"Error: Invalid regex pattern: {e}"
|
| 296 |
+
|
| 297 |
+
matches: list[dict[str, Any]] = []
|
| 298 |
+
|
| 299 |
+
# Get files to search
|
| 300 |
+
if full_path.is_file():
|
| 301 |
+
files = [full_path]
|
| 302 |
+
else:
|
| 303 |
+
if file_pattern:
|
| 304 |
+
files = list(full_path.rglob(file_pattern))
|
| 305 |
+
else:
|
| 306 |
+
files = [f for f in full_path.rglob("*") if f.is_file()]
|
| 307 |
+
|
| 308 |
+
# Search each file
|
| 309 |
+
for file_path_item in files:
|
| 310 |
+
if len(matches) >= max_matches:
|
| 311 |
+
break
|
| 312 |
+
|
| 313 |
+
# Skip common non-essential directories and binary files
|
| 314 |
+
skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
|
| 315 |
+
if any(part in file_path_item.parts for part in skip_dirs):
|
| 316 |
+
continue
|
| 317 |
+
|
| 318 |
+
try:
|
| 319 |
+
# Skip large files (> 1MB)
|
| 320 |
+
if file_path_item.stat().st_size > 1_000_000:
|
| 321 |
+
continue
|
| 322 |
+
|
| 323 |
+
file_content = file_path_item.read_text(encoding="utf-8", errors="ignore")
|
| 324 |
+
lines = file_content.splitlines()
|
| 325 |
+
|
| 326 |
+
for line_num, line in enumerate(lines, 1):
|
| 327 |
+
if len(matches) >= max_matches:
|
| 328 |
+
break
|
| 329 |
+
if regex.search(line):
|
| 330 |
+
# Compute relative path from search root
|
| 331 |
+
try:
|
| 332 |
+
rel_path = file_path_item.relative_to(full_path)
|
| 333 |
+
except ValueError:
|
| 334 |
+
# If file is the search path itself, use filename
|
| 335 |
+
rel_path = file_path_item.name
|
| 336 |
+
matches.append({
|
| 337 |
+
"file": str(rel_path),
|
| 338 |
+
"line": line_num,
|
| 339 |
+
"text": line.strip()[:200],
|
| 340 |
+
})
|
| 341 |
+
except (UnicodeDecodeError, PermissionError):
|
| 342 |
+
continue
|
| 343 |
+
|
| 344 |
+
# Format output
|
| 345 |
+
if not matches:
|
| 346 |
+
return f"No matches found for pattern '{pattern}' in {path}"
|
| 347 |
+
|
| 348 |
+
result_lines = [f"Found {len(matches)} match(es) for '{pattern}'"]
|
| 349 |
+
result_lines.append("=" * 50)
|
| 350 |
+
|
| 351 |
+
for match in matches:
|
| 352 |
+
result_lines.append(f"{match['file']}:{match['line']}: {match['text']}")
|
| 353 |
+
|
| 354 |
+
if len(matches) >= max_matches:
|
| 355 |
+
result_lines.append(f"\n... (truncated at {max_matches} matches)")
|
| 356 |
+
|
| 357 |
+
return "\n".join(result_lines)
|
| 358 |
+
|
| 359 |
+
except Exception as e:
|
| 360 |
+
return f"Error searching: {e}"
|
| 361 |
+
|
| 362 |
+
# Add tool metadata
|
| 363 |
+
grep_search._tool_name = "grep_search" # type: ignore[attr-defined]
|
| 364 |
+
grep_search._tool_description = ( # type: ignore[attr-defined]
|
| 365 |
+
"Search for text patterns in files. Accepts absolute paths (e.g., /path/to/dir) "
|
| 366 |
+
"or relative paths (relative to workspace). Supports regex patterns and file filtering."
|
| 367 |
+
)
|
| 368 |
+
grep_search._is_tool = True # type: ignore[attr-defined]
|
| 369 |
+
|
| 370 |
+
return grep_search
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def create_coding_tools(workspace: Path) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Build the full set of coding tools rooted at *workspace*.

    Args:
        workspace: Root directory for file operations

    Returns:
        List of coding tool functions
    """
    # Normalize once so every tool closure shares the same resolved root.
    root = Path(workspace).resolve()

    factories = (
        create_read_file_tool,
        create_write_file_tool,
        create_list_directory_tool,
        create_grep_search_tool,
    )
    return [factory(root) for factory in factories]
|
| 390 |
+
|
| 391 |
+
|
src/flow/tools/core.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core metacognitive tools for agent reasoning and task management.
|
| 2 |
+
|
| 3 |
+
These tools enable agents to think explicitly, track task status,
|
| 4 |
+
and make structured decisions during complex software engineering tasks.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from collections.abc import Callable, Coroutine, Sequence
|
| 8 |
+
from typing import Annotated, Any, Literal
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
async def think(
    thought: Annotated[
        str,
        (
            "Your detailed reasoning about the current situation. "
            "Include: what you've learned, options you're considering, "
            "potential risks, and your planned approach."
        ),
    ],
) -> str:
    """Use this tool to pause and think through a complex problem.

    Helpful when: (1) analyzing tool results, (2) planning multi-step approaches,
    (3) making design decisions, (4) debugging issues, (5) avoiding mistakes.
    Your reasoning is recorded and helps structure your approach.
    """
    # No side effects: the value is in giving the LLM dedicated space to reason.
    # Echo back a preview capped at 300 characters.
    if len(thought) > 300:
        preview = thought[:300] + "..."
    else:
        preview = thought
    return f"Thought recorded: {preview}"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
async def task_done(
|
| 33 |
+
status: Annotated[
|
| 34 |
+
Literal["complete", "incomplete"],
|
| 35 |
+
"'complete' if task finished successfully, 'incomplete' if blocked or needs input",
|
| 36 |
+
],
|
| 37 |
+
summary: Annotated[
|
| 38 |
+
str,
|
| 39 |
+
(
|
| 40 |
+
"Summary of what was accomplished. "
|
| 41 |
+
"If complete: what was done and how to use/test it. "
|
| 42 |
+
"If incomplete: what's blocking and what's needed."
|
| 43 |
+
),
|
| 44 |
+
],
|
| 45 |
+
files_created: Annotated[
|
| 46 |
+
list[str] | None,
|
| 47 |
+
"List of files created or modified (if any)",
|
| 48 |
+
] = None,
|
| 49 |
+
next_steps: Annotated[
|
| 50 |
+
list[str] | None,
|
| 51 |
+
"Suggested next steps for the user (if any)",
|
| 52 |
+
] = None,
|
| 53 |
+
) -> str:
|
| 54 |
+
"""Call this when you have completed the user's task.
|
| 55 |
+
|
| 56 |
+
Provide a summary of what was accomplished and any relevant details.
|
| 57 |
+
Use 'complete' if all requirements are satisfied,
|
| 58 |
+
'incomplete' if blocked or need more information.
|
| 59 |
+
"""
|
| 60 |
+
result_lines = [
|
| 61 |
+
f"Task Status: {status.upper()}",
|
| 62 |
+
"",
|
| 63 |
+
"Summary:",
|
| 64 |
+
summary,
|
| 65 |
+
]
|
| 66 |
+
|
| 67 |
+
if files_created:
|
| 68 |
+
result_lines.extend([
|
| 69 |
+
"",
|
| 70 |
+
"Files Created/Modified:",
|
| 71 |
+
*[f" - {f}" for f in files_created],
|
| 72 |
+
])
|
| 73 |
+
|
| 74 |
+
if next_steps:
|
| 75 |
+
result_lines.extend([
|
| 76 |
+
"",
|
| 77 |
+
"Suggested Next Steps:",
|
| 78 |
+
*[f" - {step}" for step in next_steps],
|
| 79 |
+
])
|
| 80 |
+
|
| 81 |
+
return "\n".join(result_lines)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Add tool metadata
# Attach the discovery attributes (_tool_name/_tool_description/_is_tool)
# directly to the module-level functions so harnesses can find them; the
# descriptions are taken from the functions' own docstrings.
think._tool_name = "think"  # type: ignore[attr-defined]
think._tool_description = think.__doc__ or ""  # type: ignore[attr-defined]
think._is_tool = True  # type: ignore[attr-defined]

task_done._tool_name = "task_done"  # type: ignore[attr-defined]
task_done._tool_description = task_done.__doc__ or ""  # type: ignore[attr-defined]
task_done._is_tool = True  # type: ignore[attr-defined]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def create_core_tools() -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Return the core metacognitive tools available to every agent.

    Returns:
        List of core tool functions (``think`` and ``task_done``).
    """
    core_tools: list[Callable[..., Coroutine[Any, Any, str]]] = [think, task_done]
    return core_tools
|
src/flow/tools/execution.py
ADDED
|
@@ -0,0 +1,479 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Execution tools for running commands and code.
|
| 2 |
+
|
| 3 |
+
These tools enable agents to execute bash commands and Python code
|
| 4 |
+
with safety controls (timeouts, output limits), and manage background processes.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import asyncio
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import signal
|
| 11 |
+
import sys
|
| 12 |
+
from collections.abc import Callable, Coroutine, Sequence
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from io import StringIO
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Annotated, Any, Literal
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _get_process_registry_path(memory_path: Path) -> Path:
|
| 20 |
+
"""Get the path to the process registry file in memory."""
|
| 21 |
+
return memory_path / "processes.md"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _ensure_process_registry(memory_path: Path) -> Path:
    """Create the process registry file (with its header) if missing; return its path."""
    registry = _get_process_registry_path(memory_path)
    registry.parent.mkdir(parents=True, exist_ok=True)

    if registry.exists():
        return registry

    # Seed a fresh registry with the header and the two checklist sections.
    header = (
        "# Background Processes\n\n"
        "This file tracks background processes started by the Flow agent.\n"
        "You can view this file with `memory(command='view', path='/memory/processes.md')`\n\n"
        "## Running\n\n"
        "## Stopped\n\n"
    )
    registry.write_text(header)
    return registry
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _add_process_to_registry(
    memory_path: Path,
    pid: int,
    command: str,
    workspace: str,
    log_file: str,
    port: int | None = None,
) -> None:
    """Record a background process as an unchecked checklist item in the registry.

    Args:
        memory_path: Memory directory containing ``processes.md``.
        pid: Process id of the spawned background process.
        command: Shell command that was launched.
        workspace: Workspace directory the command was started from.
        log_file: File the process's stdout/stderr is redirected to.
        port: Port the process listens on; if None, a best-effort guess is
            extracted from the command line.
    """
    registry_path = _ensure_process_registry(memory_path)
    content = registry_path.read_text()

    # Best-effort port detection when the caller did not supply one.
    if port is None:
        port_match = re.search(r"(?:--port|-p)\s+(\d+)", command)
        if port_match:
            port = int(port_match.group(1))
        # Fall back to spotting common dev-server defaults anywhere in the
        # command. (The original also tested ":8000"/":3000" first, which was
        # redundant: any string containing ":8000" also contains "8000".)
        elif "8000" in command:
            port = 8000
        elif "3000" in command:
            port = 3000

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    port_str = f"Port: {port}" if port else "Port: -"
    cmd_short = command[:60] + "..." if len(command) > 60 else command
    workspace_short = workspace.split("/")[-1] if "/" in workspace else workspace

    # Checklist entry. The log file is recorded so the agent can locate the
    # process's output later (previously the parameter was accepted but unused).
    entry = (
        f"- [ ] **PID {pid}** | `{cmd_short}` | {timestamp} | {port_str} | "
        f"{workspace_short} | log: {log_file}\n"
    )

    # Insert under "## Running" (newest entry first); create the section if missing.
    if "## Running" in content:
        content = content.replace("## Running\n\n", f"## Running\n\n{entry}")
    else:
        content += f"\n## Running\n\n{entry}"

    registry_path.write_text(content)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _mark_process_stopped(memory_path: Path, pid: int, reason: str = "killed") -> None:
    """Mark a process as stopped in the registry (check the box and move to Stopped).

    Args:
        memory_path: Memory directory containing ``processes.md``.
        pid: Process id to mark as stopped.
        reason: Short label appended to the entry (e.g. "killed", "exited").

    No-op if the registry file does not exist or the PID has no unchecked entry.
    """
    registry_path = _get_process_registry_path(memory_path)
    if not registry_path.exists():
        return

    content = registry_path.read_text()
    lines = content.split("\n")
    new_lines: list[str] = []
    stopped_entry: str | None = None
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    # Scan for the still-running ("- [ ]") entry matching this PID; every other
    # line is kept in place.
    for line in lines:
        if f"**PID {pid}**" in line and "- [ ]" in line:
            # Found the running process - mark it as checked and prepare for Stopped section
            stopped_entry = line.replace("- [ ]", "- [x]") + f" | {reason} @ {timestamp}"
            # Don't add to new_lines yet (will move to Stopped section)
        else:
            new_lines.append(line)

    # Add stopped entry to Stopped section. The file is rewritten only when an
    # entry was actually found, so a missing PID leaves the registry untouched.
    if stopped_entry:
        content = "\n".join(new_lines)
        if "## Stopped" in content:
            content = content.replace("## Stopped\n\n", f"## Stopped\n\n{stopped_entry}\n")
        else:
            content += f"\n## Stopped\n\n{stopped_entry}\n"
        registry_path.write_text(content)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _is_process_running(pid: int) -> bool:
|
| 111 |
+
"""Check if a process is still running."""
|
| 112 |
+
try:
|
| 113 |
+
os.kill(pid, 0)
|
| 114 |
+
return True
|
| 115 |
+
except (OSError, ProcessLookupError):
|
| 116 |
+
return False
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _get_running_pids_from_registry(memory_path: Path) -> list[tuple[int, str]]:
    """Return ``(pid, registry_line)`` pairs for entries still marked running."""
    registry_path = _get_process_registry_path(memory_path)
    if not registry_path.exists():
        return []

    pid_pattern = re.compile(r"\*\*PID (\d+)\*\*")
    entries: list[tuple[int, str]] = []

    for entry_line in registry_path.read_text().split("\n"):
        # Unchecked checklist items ("- [ ]") are the still-running ones.
        if "- [ ]" not in entry_line or "**PID" not in entry_line:
            continue
        found = pid_pattern.search(entry_line)
        if found:
            entries.append((int(found.group(1)), entry_line))

    return entries
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def create_bash_execute_tool(
    workspace: Path, memory_path: Path, default_timeout: int = 120
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a bash_execute tool bound to a specific workspace.

    Args:
        workspace: Directory each command runs in (fresh shell per call).
        memory_path: Memory directory where background processes are registered.
        default_timeout: Default per-command timeout in seconds.

    Returns:
        The async ``bash_execute`` tool function.
    """
    import shlex  # stdlib; safely quotes the log-file path inside the shell command

    async def bash_execute(
        command: Annotated[str, "Bash command to execute"],
        timeout: Annotated[int, f"Command timeout in seconds (default: {default_timeout})"] = default_timeout,
        background: Annotated[
            bool, "Run in background and return immediately with PID. Use for servers/long-running processes."
        ] = False,
    ) -> str:
        """Execute bash commands in the workspace.

        Returns stdout, stderr, and return code.
        Use for running tests, git commands, package managers, builds, etc.
        IMPORTANT: Each call runs in a fresh shell from workspace root -
        use 'cd dir && command' for commands in subdirectories.
        For long-running processes (servers), use background=True to avoid timeout.
        """
        try:
            if background:
                # Run in background using nohup, redirecting output to a
                # timestamped log file so it can be inspected later.
                log_file = workspace / ".background_logs" / f"bg_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
                log_file.parent.mkdir(parents=True, exist_ok=True)

                # BUGFIX: quote the log path so workspaces containing spaces or
                # shell metacharacters don't break the redirection. The command
                # itself is deliberately left unquoted - it is meant to be
                # interpreted by the shell.
                bg_command = f"nohup {command} > {shlex.quote(str(log_file))} 2>&1 & echo $!"

                proc = await asyncio.create_subprocess_shell(
                    bg_command,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                    cwd=str(workspace),
                )

                stdout, _ = await proc.communicate()
                pid_str = stdout.decode().strip()

                try:
                    pid = int(pid_str)
                except ValueError:
                    return f"Error: Could not get PID. Output: {pid_str}"

                # Register the process so check_processes can list/kill it later.
                _add_process_to_registry(
                    memory_path=memory_path,
                    pid=pid,
                    command=command,
                    workspace=str(workspace),
                    log_file=str(log_file),
                )

                return (
                    f"Background process started successfully.\n"
                    f"PID: {pid}\n"
                    f"Command: {command}\n"
                    f"Log file: {log_file}\n"
                    f"\nProcess registered in /memory/processes.md\n"
                    f"Use check_processes(action='list') to see all background processes.\n"
                    f"Use check_processes(action='kill', pid={pid}) to stop this process."
                )

            # Regular (blocking) execution.
            proc = await asyncio.create_subprocess_shell(
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                cwd=str(workspace),
            )

            try:
                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
            except asyncio.TimeoutError:
                proc.kill()
                await proc.wait()
                return (
                    f"Error: Command timed out after {timeout} seconds.\n"
                    f"Command: {command}\n\n"
                    f"TIP: If this is a long-running process (like a server), "
                    f"use background=True to run it in the background."
                )

            stdout_str = stdout.decode("utf-8", errors="replace")
            stderr_str = stderr.decode("utf-8", errors="replace")
            return_code = proc.returncode

            # Format output; truncate very long streams to keep responses bounded.
            result_parts = [f"Command: {command}"]
            result_parts.append(f"Return code: {return_code}")
            result_parts.append("=" * 50)

            if stdout_str.strip():
                if len(stdout_str) > 15000:
                    stdout_str = stdout_str[:15000] + "\n... (stdout truncated)"
                result_parts.append("STDOUT:")
                result_parts.append(stdout_str)

            if stderr_str.strip():
                if len(stderr_str) > 5000:
                    stderr_str = stderr_str[:5000] + "\n... (stderr truncated)"
                result_parts.append("STDERR:")
                result_parts.append(stderr_str)

            if not stdout_str.strip() and not stderr_str.strip():
                result_parts.append("(no output)")

            return "\n".join(result_parts)

        except Exception as e:
            # Tool contract: never raise; surface the failure as text instead.
            return f"Error executing command: {e}"

    # Add tool metadata
    bash_execute._tool_name = "bash_execute"  # type: ignore[attr-defined]
    bash_execute._tool_description = (  # type: ignore[attr-defined]
        "Execute bash commands in the workspace. "
        "Returns stdout, stderr, and return code. "
        "Use for running tests, git commands, package managers, builds, etc."
    )
    bash_execute._is_tool = True  # type: ignore[attr-defined]

    return bash_execute
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def create_check_processes_tool(
    workspace: Path, memory_path: Path
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a tool to check and manage background processes.

    Args:
        workspace: Workspace whose processes 'cleanup' targets.
        memory_path: Memory directory holding the processes.md registry.

    Returns:
        The async ``check_processes`` tool function.
    """

    async def check_processes(
        action: Annotated[
            Literal["list", "kill", "cleanup"],
            "'list' to see processes, 'kill' to stop one by PID, 'cleanup' to kill all",
        ],
        pid: Annotated[int | None, "PID to kill (required for 'kill' action)"] = None,
    ) -> str:
        """Check and manage background processes.

        Use 'list' to see all background processes (also viewable at /memory/processes.md),
        'kill' to stop a specific process by PID,
        'cleanup' to kill all background processes from this workspace.
        """
        _ensure_process_registry(memory_path)
        registry_path = _get_process_registry_path(memory_path)

        if action == "list":
            # Read the registry and update status of running processes
            running_pids = _get_running_pids_from_registry(memory_path)
            active_count = 0
            dead_pids: list[int] = []

            for proc_pid, _ in running_pids:
                if _is_process_running(proc_pid):
                    active_count += 1
                else:
                    dead_pids.append(proc_pid)

            # Mark dead processes as stopped
            for dead_pid in dead_pids:
                _mark_process_stopped(memory_path, dead_pid, reason="exited")

            # Return the updated registry
            content = registry_path.read_text()
            return (
                f"Active background processes: {active_count}\n"
                f"(View full registry at /memory/processes.md)\n\n"
                f"{content}"
            )

        if action == "kill":
            if pid is None:
                return "Error: 'pid' is required for 'kill' action."

            try:
                # Graceful first: SIGTERM, then escalate to SIGKILL if needed.
                os.kill(pid, signal.SIGTERM)
                await asyncio.sleep(0.5)  # Give it time to terminate

                # Check if it's really dead, if not SIGKILL
                if _is_process_running(pid):
                    os.kill(pid, signal.SIGKILL)
                    await asyncio.sleep(0.2)

                _mark_process_stopped(memory_path, pid, reason="killed")

                if _is_process_running(pid):
                    return f"Warning: Process {pid} may still be running after kill attempt."
                return f"Successfully killed process {pid}. Updated /memory/processes.md"

            except ProcessLookupError:
                # Already gone - record that in the registry anyway.
                _mark_process_stopped(memory_path, pid, reason="not found")
                return f"Process {pid} was not running (already terminated). Updated /memory/processes.md"
            except PermissionError:
                return f"Error: Permission denied to kill process {pid}."
            except Exception as e:
                return f"Error killing process {pid}: {e}"

        if action == "cleanup":
            # Kill all processes from this workspace
            running_pids = _get_running_pids_from_registry(memory_path)
            workspace_str = str(workspace)
            killed: list[int] = []
            failed: list[tuple[int, str]] = []

            for proc_pid, line in running_pids:
                # Check if this process is from our workspace
                # (matched by substring against the registry line, which stores
                # only the workspace's last path component).
                workspace_short = workspace_str.split("/")[-1]
                if workspace_short in line or workspace_str in line:
                    try:
                        os.kill(proc_pid, signal.SIGTERM)
                        await asyncio.sleep(0.2)
                        if _is_process_running(proc_pid):
                            os.kill(proc_pid, signal.SIGKILL)
                        _mark_process_stopped(memory_path, proc_pid, reason="cleanup")
                        killed.append(proc_pid)
                    except (ProcessLookupError, PermissionError) as e:
                        _mark_process_stopped(memory_path, proc_pid, reason=f"cleanup failed: {e}")
                        failed.append((proc_pid, str(e)))

            result = "Cleanup complete. Updated /memory/processes.md\n"
            if killed:
                result += f"Killed processes: {killed}\n"
            if failed:
                result += f"Failed to kill: {failed}\n"
            if not killed and not failed:
                result += "No active processes found for this workspace."

            return result

        return f"Unknown action: {action}"

    # Add tool metadata
    check_processes._tool_name = "check_processes"  # type: ignore[attr-defined]
    check_processes._tool_description = (  # type: ignore[attr-defined]
        "Check and manage background processes. "
        "Use 'list' to see all background processes, "
        "'kill' to stop a specific process by PID, "
        "'cleanup' to kill all background processes from this workspace."
    )
    check_processes._is_tool = True  # type: ignore[attr-defined]

    return check_processes
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def create_python_repl_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a python_repl tool bound to a specific workspace.

    Args:
        workspace: Path exposed to executed code as the ``WORKSPACE`` variable.

    Returns:
        The async ``python_repl`` tool function.
    """
    import traceback  # stdlib; used to report full tracebacks on failure

    async def python_repl(
        code: Annotated[str, "Python code to execute"],
    ) -> str:
        """Execute Python code in an isolated namespace.

        Returns the output (stdout) or any errors.
        Use for testing code snippets, calculations, data manipulation, or quick validation.
        The WORKSPACE variable is available with the workspace path.
        """
        old_stdout = sys.stdout
        old_stderr = sys.stderr

        try:
            # Capture stdout and stderr produced by the executed snippet.
            redirected_output = StringIO()
            redirected_error = StringIO()
            sys.stdout = redirected_output
            sys.stderr = redirected_error

            # Fresh namespace per call; only builtins and WORKSPACE are provided.
            # NOTE: exec of agent-supplied code is intentional here; "isolation"
            # is namespace-level only, not a security sandbox.
            namespace: dict[str, Any] = {
                "__builtins__": __builtins__,
                "__name__": "__main__",
                "WORKSPACE": workspace,
            }

            try:
                compiled = compile(code, "<repl>", "exec")
                exec(compiled, namespace)  # noqa: S102

                output = redirected_output.getvalue()
                error = redirected_error.getvalue()

                result_parts = ["Python REPL Output"]
                result_parts.append("=" * 50)

                if output.strip():
                    if len(output) > 15000:
                        output = output[:15000] + "\n... (output truncated)"
                    result_parts.append(output)

                if error.strip():
                    result_parts.append("STDERR:")
                    result_parts.append(error)

                if not output.strip() and not error.strip():
                    result_parts.append("(code executed successfully, no output)")

                return "\n".join(result_parts)

            except SyntaxError as e:
                return f"SyntaxError: {e}"
            except Exception as e:
                # IMPROVEMENT: previously only the exception message was
                # returned; the traceback and any output printed before the
                # failure were discarded - both are essential for debugging.
                parts = [f"Error: {type(e).__name__}: {e}"]
                partial = redirected_output.getvalue()
                if partial.strip():
                    parts.append("Partial output before error:")
                    parts.append(partial[:2000])
                parts.append(traceback.format_exc(limit=5))
                return "\n".join(parts)

        finally:
            # Always restore the real streams, even if capture itself failed.
            sys.stdout = old_stdout
            sys.stderr = old_stderr

    # Add tool metadata
    python_repl._tool_name = "python_repl"  # type: ignore[attr-defined]
    python_repl._tool_description = (  # type: ignore[attr-defined]
        "Execute Python code in an isolated namespace. "
        "Returns the output (stdout) or any errors. "
        "Use for testing code snippets, calculations, data manipulation, or quick validation."
    )
    python_repl._is_tool = True  # type: ignore[attr-defined]

    return python_repl
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def create_execution_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Build every execution tool, each bound to the given workspace.

    Args:
        workspace: Root directory for command execution.
        memory_path: Path to memory directory for process registry.
        bash_timeout: Default timeout for bash commands in seconds.

    Returns:
        List of execution tool functions.
    """
    resolved_workspace = Path(workspace).resolve()
    resolved_memory = Path(memory_path).resolve()

    tools: list[Callable[..., Coroutine[Any, Any, str]]] = []
    tools.append(create_bash_execute_tool(resolved_workspace, resolved_memory, bash_timeout))
    tools.append(create_check_processes_tool(resolved_workspace, resolved_memory))
    tools.append(create_python_repl_tool(resolved_workspace))
    return tools
|
src/flow/tools/memory.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Memory tool for persistent storage across sessions.
|
| 2 |
+
|
| 3 |
+
Provides file-based memory storage allowing agents to store and retrieve
|
| 4 |
+
information, patterns, and decisions across conversations.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from collections.abc import Callable, Coroutine
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Annotated, Any, Literal
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class MemoryBackend:
|
| 13 |
+
"""File-based memory storage backend with security controls."""
|
| 14 |
+
|
| 15 |
+
def __init__(self, base_path: Path) -> None:
|
| 16 |
+
"""Initialize memory backend."""
|
| 17 |
+
self.base_path = Path(base_path).resolve()
|
| 18 |
+
self.base_path.mkdir(parents=True, exist_ok=True)
|
| 19 |
+
|
| 20 |
+
def _validate_path(self, path: str) -> Path:
|
| 21 |
+
"""Validate and resolve a memory path."""
|
| 22 |
+
# Normalize path (remove /memory prefix if present)
|
| 23 |
+
if path.startswith("/memory"):
|
| 24 |
+
path = path[len("/memory") :]
|
| 25 |
+
path = path.lstrip("/")
|
| 26 |
+
|
| 27 |
+
# Handle empty path
|
| 28 |
+
if not path:
|
| 29 |
+
return self.base_path
|
| 30 |
+
|
| 31 |
+
# Resolve to absolute path
|
| 32 |
+
full_path = (self.base_path / path).resolve()
|
| 33 |
+
|
| 34 |
+
# Security: Ensure path is within base_path
|
| 35 |
+
try:
|
| 36 |
+
full_path.relative_to(self.base_path)
|
| 37 |
+
except ValueError as err:
|
| 38 |
+
raise ValueError(f"Access denied: path '{path}' is outside memory directory") from err
|
| 39 |
+
|
| 40 |
+
return full_path
|
| 41 |
+
|
| 42 |
+
def view(self, path: str, view_range: list[int] | None = None) -> str:
|
| 43 |
+
"""View directory contents or file contents."""
|
| 44 |
+
full_path = self._validate_path(path)
|
| 45 |
+
|
| 46 |
+
if not full_path.exists():
|
| 47 |
+
return f"Path not found: {path}\nUse 'create' to create new files."
|
| 48 |
+
|
| 49 |
+
# Directory listing
|
| 50 |
+
if full_path.is_dir():
|
| 51 |
+
contents = [f"Directory: {path or '/memory'}"]
|
| 52 |
+
items = sorted(full_path.iterdir(), key=lambda x: (x.is_file(), x.name))
|
| 53 |
+
|
| 54 |
+
if not items:
|
| 55 |
+
contents.append("(empty directory)")
|
| 56 |
+
else:
|
| 57 |
+
for item in items:
|
| 58 |
+
suffix = "/" if item.is_dir() else ""
|
| 59 |
+
contents.append(f" - {item.name}{suffix}")
|
| 60 |
+
|
| 61 |
+
return "\n".join(contents)
|
| 62 |
+
|
| 63 |
+
# File contents
|
| 64 |
+
if full_path.is_file():
|
| 65 |
+
content = full_path.read_text(encoding="utf-8")
|
| 66 |
+
lines = content.splitlines()
|
| 67 |
+
|
| 68 |
+
if view_range:
|
| 69 |
+
start, end = view_range
|
| 70 |
+
start = max(1, start)
|
| 71 |
+
end = min(len(lines), end)
|
| 72 |
+
lines = lines[start - 1 : end]
|
| 73 |
+
numbered_lines = [f"{i + start:5d}: {line}" for i, line in enumerate(lines)]
|
| 74 |
+
else:
|
| 75 |
+
numbered_lines = [f"{i + 1:5d}: {line}" for i, line in enumerate(lines)]
|
| 76 |
+
|
| 77 |
+
return "\n".join(numbered_lines) if numbered_lines else "(empty file)"
|
| 78 |
+
|
| 79 |
+
return f"Unknown path type: {path}"
|
| 80 |
+
|
| 81 |
+
def create(self, path: str, file_text: str) -> str:
|
| 82 |
+
"""Create or overwrite a file."""
|
| 83 |
+
full_path = self._validate_path(path)
|
| 84 |
+
full_path.parent.mkdir(parents=True, exist_ok=True)
|
| 85 |
+
full_path.write_text(file_text, encoding="utf-8")
|
| 86 |
+
return f"File created successfully at {path}"
|
| 87 |
+
|
| 88 |
+
def str_replace(self, path: str, old_str: str, new_str: str) -> str:
|
| 89 |
+
"""Replace text in a file."""
|
| 90 |
+
full_path = self._validate_path(path)
|
| 91 |
+
|
| 92 |
+
if not full_path.is_file():
|
| 93 |
+
raise FileNotFoundError(f"File not found: {path}")
|
| 94 |
+
|
| 95 |
+
content = full_path.read_text(encoding="utf-8")
|
| 96 |
+
|
| 97 |
+
if old_str not in content:
|
| 98 |
+
raise ValueError(f"Text not found in file: '{old_str[:50]}...'")
|
| 99 |
+
|
| 100 |
+
new_content = content.replace(old_str, new_str, 1)
|
| 101 |
+
full_path.write_text(new_content, encoding="utf-8")
|
| 102 |
+
return f"File {path} has been edited successfully"
|
| 103 |
+
|
| 104 |
+
def append(self, path: str, text: str) -> str:
|
| 105 |
+
"""Append text to end of file."""
|
| 106 |
+
full_path = self._validate_path(path)
|
| 107 |
+
|
| 108 |
+
if not full_path.exists():
|
| 109 |
+
full_path.parent.mkdir(parents=True, exist_ok=True)
|
| 110 |
+
full_path.write_text("", encoding="utf-8")
|
| 111 |
+
|
| 112 |
+
# Ensure text starts with newline if file isn't empty
|
| 113 |
+
if full_path.stat().st_size > 0:
|
| 114 |
+
existing = full_path.read_text(encoding="utf-8")
|
| 115 |
+
if existing and not existing.endswith("\n"):
|
| 116 |
+
text = "\n" + text
|
| 117 |
+
|
| 118 |
+
# Ensure text ends with newline
|
| 119 |
+
if not text.endswith("\n"):
|
| 120 |
+
text += "\n"
|
| 121 |
+
|
| 122 |
+
with full_path.open("a", encoding="utf-8") as f:
|
| 123 |
+
f.write(text)
|
| 124 |
+
|
| 125 |
+
return f"Text appended to {path}"
|
| 126 |
+
|
| 127 |
+
def search(self, query: str, path: str = "") -> str:
|
| 128 |
+
"""Search for text across memory files."""
|
| 129 |
+
full_path = self._validate_path(path)
|
| 130 |
+
|
| 131 |
+
if not full_path.exists():
|
| 132 |
+
return f"Path not found: {path or '/memory'}"
|
| 133 |
+
|
| 134 |
+
if not full_path.is_dir():
|
| 135 |
+
# Search single file
|
| 136 |
+
files = [full_path]
|
| 137 |
+
else:
|
| 138 |
+
files = list(full_path.rglob("*"))
|
| 139 |
+
|
| 140 |
+
matches: list[dict[str, Any]] = []
|
| 141 |
+
query_lower = query.lower()
|
| 142 |
+
|
| 143 |
+
for file_path in files:
|
| 144 |
+
if not file_path.is_file():
|
| 145 |
+
continue
|
| 146 |
+
try:
|
| 147 |
+
content = file_path.read_text(encoding="utf-8")
|
| 148 |
+
lines = content.splitlines()
|
| 149 |
+
|
| 150 |
+
for line_num, line in enumerate(lines, 1):
|
| 151 |
+
if query_lower in line.lower():
|
| 152 |
+
rel_path = file_path.relative_to(self.base_path)
|
| 153 |
+
matches.append({
|
| 154 |
+
"file": str(rel_path),
|
| 155 |
+
"line": line_num,
|
| 156 |
+
"content": line.strip()[:100],
|
| 157 |
+
})
|
| 158 |
+
except (UnicodeDecodeError, PermissionError):
|
| 159 |
+
continue
|
| 160 |
+
|
| 161 |
+
if not matches:
|
| 162 |
+
return f"No matches found for '{query}' in {path or '/memory'}"
|
| 163 |
+
|
| 164 |
+
result_lines = [f"Found {len(matches)} match(es) for '{query}':\n"]
|
| 165 |
+
for match in matches[:50]:
|
| 166 |
+
result_lines.append(f" {match['file']}:{match['line']} - {match['content']}")
|
| 167 |
+
|
| 168 |
+
if len(matches) > 50:
|
| 169 |
+
result_lines.append(f"\n... and {len(matches) - 50} more matches")
|
| 170 |
+
|
| 171 |
+
return "\n".join(result_lines)
|
| 172 |
+
|
| 173 |
+
def delete(self, path: str) -> str:
|
| 174 |
+
"""Delete a file or empty directory."""
|
| 175 |
+
full_path = self._validate_path(path)
|
| 176 |
+
|
| 177 |
+
if not full_path.exists():
|
| 178 |
+
raise FileNotFoundError(f"Path not found: {path}")
|
| 179 |
+
|
| 180 |
+
if full_path.is_file():
|
| 181 |
+
full_path.unlink()
|
| 182 |
+
return f"File deleted: {path}"
|
| 183 |
+
|
| 184 |
+
if full_path.is_dir():
|
| 185 |
+
if any(full_path.iterdir()):
|
| 186 |
+
raise ValueError(f"Directory not empty: {path}. Delete contents first.")
|
| 187 |
+
full_path.rmdir()
|
| 188 |
+
return f"Directory deleted: {path}"
|
| 189 |
+
|
| 190 |
+
return f"Unknown path type: {path}"
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def create_memory_tool(memory_path: Path) -> Callable[..., Coroutine[Any, Any, str]]:
|
| 194 |
+
"""Create a memory tool bound to a specific memory directory."""
|
| 195 |
+
backend = MemoryBackend(memory_path)
|
| 196 |
+
|
| 197 |
+
async def memory(
|
| 198 |
+
command: Annotated[
|
| 199 |
+
Literal["view", "create", "str_replace", "append", "search", "delete"],
|
| 200 |
+
"Operation to perform",
|
| 201 |
+
],
|
| 202 |
+
path: Annotated[str, "Path to file or directory (e.g., '/memory/patterns/cors.md')"] = "/memory",
|
| 203 |
+
file_text: Annotated[str | None, "Content to write (for create)"] = None,
|
| 204 |
+
old_str: Annotated[str | None, "Text to find (for str_replace)"] = None,
|
| 205 |
+
new_str: Annotated[str | None, "Replacement text (for str_replace)"] = None,
|
| 206 |
+
append_text: Annotated[str | None, "Text to append (for append)"] = None,
|
| 207 |
+
query: Annotated[str | None, "Search query (for search)"] = None,
|
| 208 |
+
view_range: Annotated[list[int] | None, "Line range [start, end] (for view)"] = None,
|
| 209 |
+
) -> str:
|
| 210 |
+
"""Store and retrieve information in persistent memory.
|
| 211 |
+
|
| 212 |
+
Memory persists across conversations - use it to remember patterns,
|
| 213 |
+
insights, project context, and decisions.
|
| 214 |
+
Operations: view (show directory/file), create (new file),
|
| 215 |
+
str_replace (edit file), append (add to file),
|
| 216 |
+
search (find text), delete (remove file/dir).
|
| 217 |
+
Organize by: /memory/patterns/, /memory/projects/, /memory/decisions/
|
| 218 |
+
"""
|
| 219 |
+
try:
|
| 220 |
+
if command == "view":
|
| 221 |
+
return backend.view(path, view_range)
|
| 222 |
+
|
| 223 |
+
if command == "create":
|
| 224 |
+
if file_text is None:
|
| 225 |
+
return "Error: 'file_text' is required for create operation"
|
| 226 |
+
return backend.create(path, file_text)
|
| 227 |
+
|
| 228 |
+
if command == "str_replace":
|
| 229 |
+
if old_str is None or new_str is None:
|
| 230 |
+
return "Error: 'old_str' and 'new_str' are required for str_replace"
|
| 231 |
+
return backend.str_replace(path, old_str, new_str)
|
| 232 |
+
|
| 233 |
+
if command == "append":
|
| 234 |
+
if append_text is None:
|
| 235 |
+
return "Error: 'append_text' is required for append operation"
|
| 236 |
+
return backend.append(path, append_text)
|
| 237 |
+
|
| 238 |
+
if command == "search":
|
| 239 |
+
if query is None:
|
| 240 |
+
return "Error: 'query' is required for search operation"
|
| 241 |
+
return backend.search(query, path)
|
| 242 |
+
|
| 243 |
+
if command == "delete":
|
| 244 |
+
return backend.delete(path)
|
| 245 |
+
|
| 246 |
+
return f"Error: Unknown command: {command}"
|
| 247 |
+
|
| 248 |
+
except Exception as e:
|
| 249 |
+
return f"Memory operation failed: {e}"
|
| 250 |
+
|
| 251 |
+
# Add tool metadata
|
| 252 |
+
memory._tool_name = "memory" # type: ignore[attr-defined]
|
| 253 |
+
memory._tool_description = ( # type: ignore[attr-defined]
|
| 254 |
+
"Store and retrieve information in persistent memory. "
|
| 255 |
+
"Memory persists across conversations - use it to remember patterns, "
|
| 256 |
+
"insights, project context, and decisions."
|
| 257 |
+
)
|
| 258 |
+
memory._is_tool = True # type: ignore[attr-defined]
|
| 259 |
+
|
| 260 |
+
return memory
|
src/flow/tools/sub_agent.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Sub-agent tool for isolated research tasks.
|
| 2 |
+
|
| 3 |
+
Provides context isolation by delegating complex research tasks to a
|
| 4 |
+
separate agent that operates in its own context window. The sub-agent
|
| 5 |
+
processes the request and returns only a concise summary, preventing
|
| 6 |
+
context pollution in the main agent.
|
| 7 |
+
|
| 8 |
+
This implements the "Isolation" strategy for context engineering:
|
| 9 |
+
- Coordinator agent stays lean with minimal context
|
| 10 |
+
- Sub-agent can use 30K+ tokens internally for research
|
| 11 |
+
- Only the distilled result (200-500 tokens) returns to coordinator
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
from collections.abc import Callable, Coroutine
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Annotated, Any
|
| 20 |
+
|
| 21 |
+
# Sub-agent system prompt focused on research and summarization.
# NOTE: this text is sent verbatim as the sub-agent's instructions; the
# 200-500 token target keeps the returned summary small enough to hand back
# to the coordinator without polluting its context.
SUB_AGENT_INSTRUCTIONS = """You are a research assistant that helps with complex information gathering tasks.

Your role:
1. Thoroughly research the given topic or question
2. Gather relevant information from available tools
3. Synthesize findings into a clear, concise summary
4. Return ONLY the essential information needed by the requesting agent

Guidelines:
- Be thorough in your research but concise in your response
- Focus on facts and actionable information
- If you can't find information, say so clearly
- Your response will be passed to another agent, so make it self-contained
- Target 200-500 tokens for your final response unless more detail is explicitly requested

Do NOT:
- Include conversational fluff or preamble
- Repeat the original question back
- Add disclaimers about your limitations
- Include information that wasn't requested
"""
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def create_sub_agent_tool(
    workspace: Path,
    model: str = "gpt-4o-mini",
    endpoint: str | None = None,
    api_key: str | None = None,
    api_version: str = "2024-02-15-preview",
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build the ``research`` tool, which runs tasks in an isolated sub-agent.

    The sub-agent lives in its own context window: it may spend many tokens
    on internal tool calls, but only its distilled summary flows back to the
    caller, keeping the coordinator's context lean.

    Args:
        workspace: Workspace directory for file operations.
        model: Deployment used by the sub-agent (small/fast model by default).
        endpoint: Azure OpenAI endpoint; falls back to AZURE_OPENAI_ENDPOINT.
        api_key: Azure OpenAI key; falls back to AZURE_OPENAI_API_KEY.
        api_version: Azure OpenAI API version string.

    Returns:
        An async callable usable as a tool by the main agent.
    """
    resolved_endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT", "")
    resolved_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY", "")

    # Built lazily on first use so importing this module does not require
    # the agent framework to be installed.
    _cached_agent: Any = None

    async def _get_or_build_agent() -> Any:
        """Return the cached sub-agent, constructing it on first call."""
        nonlocal _cached_agent
        if _cached_agent is not None:
            return _cached_agent

        try:
            from agent_framework import ChatAgent
            from agent_framework.azure import AzureOpenAIChatClient
        except ImportError as e:
            raise ImportError(
                "Microsoft Agent Framework is required for sub-agent. "
                "Install with: pip install agent-framework-core"
            ) from e

        # A lightweight chat client — the default deployment favors
        # speed/cost over capability.
        client = AzureOpenAIChatClient(
            api_key=resolved_key,
            endpoint=resolved_endpoint,
            deployment=model,
            api_version=api_version,
        )

        # Give the sub-agent only the minimal toolset needed for research.
        from flow.tools.coding import create_coding_tools
        from flow.tools.core import create_core_tools

        raw_tools: list[Callable[..., Any]] = [
            *create_coding_tools(workspace),
            *create_core_tools(),
        ]

        # Wrap each tool in the agent_framework calling convention.
        from agent_framework import ai_function

        wrapped_tools = [
            ai_function(
                name=getattr(fn, "_tool_name", fn.__name__),
                description=getattr(fn, "_tool_description", fn.__doc__ or ""),
            )(fn)
            for fn in raw_tools
        ]

        _cached_agent = ChatAgent(
            name="ResearchAssistant",
            description="Research assistant for complex information gathering",
            instructions=SUB_AGENT_INSTRUCTIONS,
            chat_client=client,
            tools=wrapped_tools,
        )

        return _cached_agent

    async def research(
        task: Annotated[
            str,
            "The research task or question to investigate. Be specific about what information you need.",
        ],
        context: Annotated[
            str | None,
            "Optional context to help the sub-agent understand the broader goal.",
        ] = None,
    ) -> str:
        """Delegate a research task to a sub-agent with isolated context.

        Use this tool when you need to:
        - Research a complex topic that may require multiple steps
        - Gather information without polluting your main context
        - Get a summarized answer to a specific question

        The sub-agent operates in its own context window, so it can
        use many tokens internally while only returning a concise summary.
        This keeps your main context lean and focused.

        Examples:
        - "Find all Python files that import the requests library and summarize their purpose"
        - "Research how authentication is implemented in this codebase"
        - "Analyze the error handling patterns used across the project"
        """
        agent = await _get_or_build_agent()

        # Assemble the prompt: optional context first, then the task,
        # then the summarization directive.
        segments: list[str] = []
        if context:
            segments.append(f"Context: {context}")
        segments.append(f"Research task: {task}")
        segments.append("\nProvide a concise summary of your findings.")
        full_prompt = "\n\n".join(segments)

        try:
            # The sub-agent runs entirely in its own isolated context.
            reply = await agent.run(full_prompt)
            # Prefer the structured .content attribute when present.
            return str(reply.content) if hasattr(reply, "content") else str(reply)
        except Exception as e:
            return f"Research failed: {e}"

    # Tool metadata consumed by the harness when registering tools.
    research._tool_name = "research"  # type: ignore[attr-defined]
    research._tool_description = (  # type: ignore[attr-defined]
        "Delegate a research task to a sub-agent with isolated context. "
        "The sub-agent can thoroughly investigate a topic using many tool calls "
        "internally, then return only a concise summary. Use this for complex "
        "research that would otherwise pollute your main context."
    )
    research._is_tool = True  # type: ignore[attr-defined]

    return research
|
src/flow/ui/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Flow UI Backend - FastAPI server."""
|
src/flow/ui/api/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""API routes package."""
|
| 3 |
+
|
| 4 |
+
from .configs import router as configs_router
|
| 5 |
+
from .tasks import router as tasks_router
|
| 6 |
+
from .jobs import router as jobs_router
|
| 7 |
+
from .runs import router as runs_router
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"configs_router",
|
| 11 |
+
"tasks_router",
|
| 12 |
+
"jobs_router",
|
| 13 |
+
"runs_router",
|
| 14 |
+
]
|
src/flow/ui/api/configs.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Config API routes."""
|
| 3 |
+
|
| 4 |
+
from uuid import UUID
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 7 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 8 |
+
from sqlmodel import select, desc
|
| 9 |
+
|
| 10 |
+
from ..database import get_session
|
| 11 |
+
from ..models.config import AgentConfig
|
| 12 |
+
from ..schemas import ConfigCreate, ConfigUpdate, ConfigResponse
|
| 13 |
+
|
| 14 |
+
router = APIRouter(prefix="/configs", tags=["configs"])
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_uuid(id_str: str) -> UUID:
    """Parse a string to UUID, raising 400 if invalid."""
    try:
        parsed = UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
    return parsed
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@router.get("", response_model=list[ConfigResponse])
async def list_configs(session: AsyncSession = Depends(get_session)) -> list[AgentConfig]:
    """Return every stored agent configuration, newest first."""
    stmt = select(AgentConfig).order_by(desc(AgentConfig.created_at))
    rows = await session.execute(stmt)
    return list(rows.scalars().all())
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@router.post("", response_model=ConfigResponse, status_code=201)
async def create_config(
    data: ConfigCreate,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Persist a new agent configuration and return the stored row."""
    record = AgentConfig(
        name=data.name,
        description=data.description,
        config_json=data.to_config_json(),
    )
    session.add(record)
    await session.commit()
    # Refresh so DB-generated values (id, timestamps) appear in the response.
    await session.refresh(record)
    return record
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@router.get("/{config_id}", response_model=ConfigResponse)
async def get_config(
    config_id: str,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Fetch a single agent configuration by its UUID; 404 if absent."""
    key = parse_uuid(config_id)
    row = await session.execute(select(AgentConfig).where(AgentConfig.id == key))
    record = row.scalar_one_or_none()
    if record is None:
        raise HTTPException(status_code=404, detail="Config not found")
    return record
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@router.put("/{config_id}", response_model=ConfigResponse)
async def update_config(
    config_id: str,
    data: ConfigUpdate,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Apply a partial update to an agent configuration.

    Fields that belong to the nested ``config_json`` document are routed
    there; everything else is set directly on the row. Only fields the
    client explicitly supplied (``exclude_unset``) are touched.
    """
    key = parse_uuid(config_id)
    row = await session.execute(select(AgentConfig).where(AgentConfig.id == key))
    record = row.scalar_one_or_none()
    if record is None:
        raise HTTPException(status_code=404, detail="Config not found")

    changes = data.model_dump(exclude_unset=True)

    # These keys live inside config_json rather than as table columns.
    nested_keys = (
        "enable_message_compaction",
        "enable_memory_tool",
        "enable_sub_agent",
        "compaction_head_size",
        "compaction_tail_size",
        "bash_timeout",
    )

    # Build and reassign a fresh dict so the ORM detects the JSON change.
    merged_json = dict(record.config_json)
    for key_name in nested_keys:
        if key_name in changes:
            merged_json[key_name] = changes.pop(key_name)

    # Whatever remains maps directly onto top-level columns.
    for attr, value in changes.items():
        setattr(record, attr, value)

    record.config_json = merged_json

    from datetime import datetime, timezone
    record.updated_at = datetime.now(timezone.utc)

    await session.commit()
    await session.refresh(record)
    return record
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@router.delete("/{config_id}", status_code=204)
async def delete_config(
    config_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Remove an agent configuration; 404 if it does not exist."""
    key = parse_uuid(config_id)
    row = await session.execute(select(AgentConfig).where(AgentConfig.id == key))
    record = row.scalar_one_or_none()
    if record is None:
        raise HTTPException(status_code=404, detail="Config not found")
    await session.delete(record)
    await session.commit()
|
src/flow/ui/api/jobs.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Job API routes."""
|
| 3 |
+
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import Any, AsyncGenerator
|
| 6 |
+
from uuid import UUID
|
| 7 |
+
|
| 8 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 9 |
+
from fastapi.responses import StreamingResponse
|
| 10 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 11 |
+
from sqlmodel import select, desc
|
| 12 |
+
|
| 13 |
+
from ..database import get_session
|
| 14 |
+
from ..models.job import OptimizationJob, JobStatus
|
| 15 |
+
from ..models.config import AgentConfig
|
| 16 |
+
from ..models.task import TaskModel
|
| 17 |
+
from ..schemas import JobCreate, JobResponse
|
| 18 |
+
from ..services.optimizer_service import OptimizerService
|
| 19 |
+
|
| 20 |
+
router = APIRouter(prefix="/jobs", tags=["jobs"])
|
| 21 |
+
|
| 22 |
+
# Store running jobs for cancellation
|
| 23 |
+
_running_jobs: dict[str, asyncio.Task[Any]] = {}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def parse_uuid(id_str: str) -> UUID:
    """Parse a string to UUID, raising 400 if invalid."""
    try:
        parsed = UUID(id_str)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from err
    return parsed
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@router.get("", response_model=list[JobResponse])
async def list_jobs(
    status: JobStatus | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[OptimizationJob]:
    """List optimization jobs, newest first, optionally filtered by status."""
    stmt = select(OptimizationJob)
    if status:
        stmt = stmt.where(OptimizationJob.status == status)
    rows = await session.execute(stmt.order_by(desc(OptimizationJob.created_at)))
    return list(rows.scalars().all())
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.post("", response_model=JobResponse, status_code=201)
async def create_job(
    data: JobCreate,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Create a pending optimization job after validating its references.

    Every config ID and task ID must resolve to an existing row; otherwise
    a 400 is raised before anything is persisted.
    """
    for config_id in data.config_ids:
        key = parse_uuid(config_id)
        found = await session.execute(select(AgentConfig).where(AgentConfig.id == key))
        if found.scalar_one_or_none() is None:
            raise HTTPException(status_code=400, detail=f"Config {config_id} not found")

    for task_id in data.task_ids:
        key = parse_uuid(task_id)
        found = await session.execute(select(TaskModel).where(TaskModel.id == key))
        if found.scalar_one_or_none() is None:
            raise HTTPException(status_code=400, detail=f"Task {task_id} not found")

    # One experiment per (config, task) pair.
    job = OptimizationJob(
        name=data.name,
        config_ids=data.config_ids,
        task_ids=data.task_ids,
        parallel=data.parallel,
        use_llm_eval=data.use_llm_eval,
        total_experiments=len(data.config_ids) * len(data.task_ids),
    )
    session.add(job)
    await session.commit()
    await session.refresh(job)
    return job
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@router.get("/{job_id}", response_model=JobResponse)
async def get_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Fetch a single optimization job by its UUID; 404 if absent."""
    key = parse_uuid(job_id)
    row = await session.execute(select(OptimizationJob).where(OptimizationJob.id == key))
    job = row.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return job
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@router.post("/{job_id}/start")
async def start_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> StreamingResponse:
    """Start an optimization job and stream progress via SSE.

    Returns a ``text/event-stream`` response; each event carries one
    JSON-encoded progress update produced by the optimizer service.
    """
    uuid_id = parse_uuid(job_id)
    result = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = result.scalar_one_or_none()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Only jobs that have never been started may be started.
    if job.status != JobStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Job is already {job.status}")

    # NOTE(review): the job's status is not flipped to RUNNING here —
    # presumably OptimizerService.run_job does that; confirm, otherwise a
    # second concurrent /start could pass the PENDING check above.
    # NOTE(review): the streaming work is never registered in _running_jobs,
    # so cancel_job's task-cancellation lookup appears to never find it.
    async def event_stream() -> AsyncGenerator[str, None]:
        service = OptimizerService()
        async for progress in service.run_job(job_id):
            # SSE framing: "data: <payload>" terminated by a blank line.
            yield f"data: {progress.model_dump_json()}\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@router.post("/{job_id}/cancel", response_model=JobResponse)
async def cancel_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Cancel a running optimization job.

    Marks the job CANCELLED and, when a matching asyncio task is tracked in
    ``_running_jobs``, cancels that task as well.
    """
    uuid_id = parse_uuid(job_id)
    result = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = result.scalar_one_or_none()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.status != JobStatus.RUNNING:
        raise HTTPException(status_code=400, detail=f"Job is not running (status: {job.status})")

    # Cancel the running task if it exists
    # NOTE(review): nothing in this module ever inserts into _running_jobs
    # (start_job streams inline instead of spawning a tracked task), so this
    # branch looks unreachable — verify against the optimizer service.
    if job_id in _running_jobs:
        _running_jobs[job_id].cancel()
        del _running_jobs[job_id]

    job.status = JobStatus.CANCELLED
    await session.commit()
    await session.refresh(job)
    return job
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
@router.delete("/{job_id}", status_code=204)
async def delete_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a non-running job; its runs are removed via FK cascade."""
    key = parse_uuid(job_id)
    row = await session.execute(select(OptimizationJob).where(OptimizationJob.id == key))
    job = row.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.status == JobStatus.RUNNING:
        raise HTTPException(status_code=400, detail="Cannot delete a running job")

    # Runs are cascade-deleted through the foreign key relationship.
    await session.delete(job)
    await session.commit()
|
src/flow/ui/api/runs.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Run API routes."""
|
| 3 |
+
|
| 4 |
+
from typing import Any
|
| 5 |
+
from uuid import UUID
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 8 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 9 |
+
from sqlmodel import select, desc
|
| 10 |
+
|
| 11 |
+
from ..database import get_session
|
| 12 |
+
from ..models.run import ExperimentRun
|
| 13 |
+
from ..schemas import RunResponse, RunDetailResponse, CriterionResultSchema
|
| 14 |
+
|
| 15 |
+
router = APIRouter(prefix="/runs", tags=["runs"])
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def parse_uuid(id_str: str) -> UUID:
    """Parse a string to UUID, raising 400 if invalid."""
    try:
        value = UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
    else:
        return value
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@router.get("", response_model=list[RunResponse])
async def list_runs(
    job_id: str | None = None,
    config_name: str | None = None,
    task_name: str | None = None,
    is_pareto: bool | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ExperimentRun]:
    """List experiment runs, newest first, narrowed by any supplied filters."""
    stmt = select(ExperimentRun)

    if job_id:
        stmt = stmt.where(ExperimentRun.job_id == parse_uuid(job_id))
    if config_name:
        stmt = stmt.where(ExperimentRun.config_name == config_name)
    if task_name:
        stmt = stmt.where(ExperimentRun.task_name == task_name)
    # is_pareto=False is a valid filter, so compare against None explicitly.
    if is_pareto is not None:
        stmt = stmt.where(ExperimentRun.is_pareto == is_pareto)

    rows = await session.execute(stmt.order_by(desc(ExperimentRun.created_at)))
    return list(rows.scalars().all())
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@router.get("/{run_id}", response_model=RunDetailResponse)
async def get_run(
    run_id: str,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Return one run with per-criterion evaluation details from its trace."""
    key = parse_uuid(run_id)
    row = await session.execute(select(ExperimentRun).where(ExperimentRun.id == key))
    run = row.scalar_one_or_none()
    if run is None:
        raise HTTPException(status_code=404, detail="Run not found")

    # Criterion-level results live inside the raw trace JSON, if present.
    raw_criteria = (run.trace_json or {}).get("criteria_results", [])
    criteria = [
        CriterionResultSchema(
            name=item.get("name", ""),
            score=item.get("score", 0.0),
            passed=item.get("passed", False),
            reasoning=item.get("reasoning", ""),
        )
        for item in raw_criteria
    ]

    return {
        "id": str(run.id),
        "job_id": str(run.job_id),
        "config_name": run.config_name,
        "task_name": run.task_name,
        "status": run.status,
        "tokens_total": run.tokens_total,
        "tokens_input": run.tokens_input,
        "tokens_output": run.tokens_output,
        "duration_seconds": run.duration_seconds,
        "score": run.score,
        "passed": run.passed,
        "reasoning": run.reasoning,
        "criteria_results": criteria,
        "output": run.output,
        "files_created": run.files_created,
        "trace": run.trace_json,
        "is_pareto": run.is_pareto,
        "pareto_rank": run.pareto_rank,
        "created_at": run.created_at,
    }
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@router.get("/job/{job_id}/summary")
async def get_job_summary(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Get aggregated summary for a job's runs.

    Groups runs by config name and reports per-config pass counts, average
    score/tokens/duration, and Pareto membership, sorted best-score-first
    (average tokens breaks ties).

    Raises:
        HTTPException: 400 for a malformed UUID, 404 if the job has no runs.
    """
    uuid_id = parse_uuid(job_id)
    result = await session.execute(
        select(ExperimentRun).where(ExperimentRun.job_id == uuid_id)
    )
    runs = list(result.scalars().all())

    if not runs:
        raise HTTPException(status_code=404, detail="No runs found for job")

    # Aggregate by config. Metrics can be None on failed/incomplete runs
    # (score, tokens_total, duration_seconds are nullable in practice), so
    # coalesce to 0 before summing to avoid a TypeError mid-aggregation.
    config_summaries: dict[str, dict[str, Any]] = {}
    for run in runs:
        if run.config_name not in config_summaries:
            config_summaries[run.config_name] = {
                "config_name": run.config_name,
                "total_runs": 0,
                "passed_runs": 0,
                "avg_score": 0.0,
                "avg_tokens": 0.0,
                "avg_duration": 0.0,
                "is_pareto": False,
                "pareto_rank": 999,  # sentinel; real ranks replace it via min()
            }

        summary = config_summaries[run.config_name]
        summary["total_runs"] += 1
        if run.passed:
            summary["passed_runs"] += 1
        summary["avg_score"] += run.score or 0.0
        summary["avg_tokens"] += run.tokens_total or 0
        summary["avg_duration"] += run.duration_seconds or 0.0
        if run.is_pareto:
            summary["is_pareto"] = True
            if run.pareto_rank is not None:
                summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)

    # Convert running sums to averages (each config has total_runs >= 1).
    for summary in config_summaries.values():
        n = summary["total_runs"]
        summary["avg_score"] /= n
        summary["avg_tokens"] /= n
        summary["avg_duration"] /= n

    # Best average score first; fewer tokens wins ties.
    sorted_summaries = sorted(
        config_summaries.values(),
        key=lambda x: (-x["avg_score"], x["avg_tokens"]),
    )

    return {
        "job_id": job_id,
        "total_runs": len(runs),
        "config_summaries": sorted_summaries,
        "pareto_configs": [s["config_name"] for s in sorted_summaries if s["is_pareto"]],
    }
|
src/flow/ui/api/tasks.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Task API routes."""
|
| 3 |
+
|
| 4 |
+
from uuid import UUID
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 7 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 8 |
+
from sqlmodel import select, desc
|
| 9 |
+
|
| 10 |
+
from ..database import get_session
|
| 11 |
+
from ..models.task import TaskModel
|
| 12 |
+
from ..schemas import TaskCreate, TaskResponse
|
| 13 |
+
|
| 14 |
+
router = APIRouter(prefix="/tasks", tags=["tasks"])
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_uuid(id_str: str) -> UUID:
    """Convert *id_str* into a UUID, translating parse failures into HTTP 400."""
    try:
        parsed = UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
    return parsed
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@router.get("", response_model=list[TaskResponse])
async def list_tasks(
    category: str | None = None,
    suite: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[TaskModel]:
    """List all tasks, optionally filtered by category or suite."""
    # Build the statement incrementally: optional filters, then newest-first.
    stmt = select(TaskModel)
    if category:
        stmt = stmt.where(TaskModel.category == category)
    if suite:
        stmt = stmt.where(TaskModel.suite == suite)
    stmt = stmt.order_by(desc(TaskModel.created_at))
    rows = await session.execute(stmt)
    return list(rows.scalars().all())
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@router.post("", response_model=TaskResponse, status_code=201)
async def create_task(
    data: TaskCreate,
    session: AsyncSession = Depends(get_session),
) -> TaskModel:
    """Create a new task from the request payload and persist it."""
    new_task = TaskModel(
        name=data.name,
        prompt=data.prompt,
        criteria_json=data.to_criteria_json(),
        category=data.category,
    )
    session.add(new_task)
    await session.commit()
    # Refresh to pick up server-generated fields before returning.
    await session.refresh(new_task)
    return new_task
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@router.get("/{task_id}", response_model=TaskResponse)
async def get_task(
    task_id: str,
    session: AsyncSession = Depends(get_session),
) -> TaskModel:
    """Fetch a single task by id.

    Raises:
        HTTPException: 400 for a malformed UUID, 404 when no task matches.
    """
    uuid_id = parse_uuid(task_id)
    found = (
        await session.execute(select(TaskModel).where(TaskModel.id == uuid_id))
    ).scalar_one_or_none()
    if found is None:
        raise HTTPException(status_code=404, detail="Task not found")
    return found
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@router.delete("/{task_id}", status_code=204)
async def delete_task(
    task_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a task by id.

    Raises:
        HTTPException: 400 for a malformed UUID, 404 when no task matches.
    """
    uuid_id = parse_uuid(task_id)
    found = (
        await session.execute(select(TaskModel).where(TaskModel.id == uuid_id))
    ).scalar_one_or_none()
    if found is None:
        raise HTTPException(status_code=404, detail="Task not found")

    await session.delete(found)
    await session.commit()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@router.post("/import-suite", response_model=list[TaskResponse], status_code=201)
async def import_suite(
    suite_name: str,
    session: AsyncSession = Depends(get_session),
) -> list[TaskModel]:
    """Import all tasks from a built-in suite into the database.

    Raises:
        HTTPException: 400 when *suite_name* is not a known suite.
    """
    # Imported lazily; presumably to avoid an import cycle at module load —
    # TODO(review): confirm before hoisting to the top of the file.
    from flow.experiments.types import get_task_suite

    try:
        suite_tasks = get_task_suite(suite_name)
    except ValueError as e:
        # Chain the cause (raise ... from e) so the original lookup error
        # is preserved in tracebacks instead of being silently replaced.
        raise HTTPException(status_code=400, detail=str(e)) from e

    created_tasks = []
    for t in suite_tasks:
        task = TaskModel(
            name=t.name,
            prompt=t.prompt,
            criteria_json=[
                {"name": c.name, "instruction": c.instruction, "weight": c.weight}
                for c in t.criteria
            ],
            category=t.metadata.get("category", "default"),
            suite=suite_name,
        )
        session.add(task)
        created_tasks.append(task)

    # Single commit for the whole batch, then refresh each row so
    # server-generated fields (e.g. ids) are populated in the response.
    await session.commit()
    for task in created_tasks:
        await session.refresh(task)

    return created_tasks
|
src/flow/ui/database.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Database setup with SQLModel and SQLite."""
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import AsyncGenerator
|
| 6 |
+
|
| 7 |
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
| 8 |
+
from sqlmodel import SQLModel
|
| 9 |
+
|
| 10 |
+
# Database path: per-user SQLite file under ~/.flow; the directory is
# created at import time so the engine can open the file immediately.
DB_PATH = Path.home() / ".flow" / "flow_ui.db"
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Async SQLite via the aiosqlite driver.
DATABASE_URL = f"sqlite+aiosqlite:///{DB_PATH}"

engine = create_async_engine(DATABASE_URL, echo=False, future=True)

# expire_on_commit=False keeps ORM objects usable after commit without
# triggering a lazy reload on attribute access.
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
async def init_db() -> None:
    """Create all SQLModel-declared tables (no-op for tables that exist)."""
    async with engine.begin() as conn:
        # create_all is synchronous, so bridge it onto the async connection.
        await conn.run_sync(SQLModel.metadata.create_all)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
async def get_session() -> AsyncGenerator[AsyncSession, None]:
    """Yield a database session (used as a FastAPI dependency); the
    context manager closes it when the caller is done."""
    async with async_session() as session:
        yield session
|
src/flow/ui/main.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""FastAPI server for Flow UI."""
|
| 3 |
+
|
| 4 |
+
from contextlib import asynccontextmanager
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any, AsyncGenerator
|
| 7 |
+
|
| 8 |
+
from fastapi import FastAPI
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from fastapi.staticfiles import StaticFiles
|
| 11 |
+
from fastapi.responses import FileResponse
|
| 12 |
+
|
| 13 |
+
from .database import init_db
|
| 14 |
+
from .api import configs_router, tasks_router, jobs_router, runs_router
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """App lifespan: create database tables on startup; nothing on shutdown."""
    await init_db()
    yield
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
app = FastAPI(
    title="Flow Optimization UI",
    description="Web UI for running agent configuration optimization experiments",
    version="0.1.0",
    lifespan=lifespan,
)

# CORS for development: allow the local frontend dev-server origins
# (port 5173 — presumably a Vite dev server; confirm against the frontend).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# API routes, all mounted under the /api prefix.
app.include_router(configs_router, prefix="/api")
app.include_router(tasks_router, prefix="/api")
app.include_router(jobs_router, prefix="/api")
app.include_router(runs_router, prefix="/api")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Health check
@app.get("/api/health")
async def health_check() -> dict[str, Any]:
    """Report service liveness."""
    return dict(status="ok", service="flow-ui")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Static files and SPA fallback.
# The frontend build output lives in this package's ui/ directory so the
# backend package is self-contained.
UI_DIR = Path(__file__).parent / "ui"


def setup_static_files() -> None:
    """Set up static file serving if frontend is built.

    Mounts the bundled asset directory (when present) and registers a
    catch-all route that serves the SPA's index.html for any path that is
    not an existing file.
    """
    if UI_DIR.exists():
        # Serve assets directory (hashed JS/CSS bundles) at /assets.
        assets_dir = UI_DIR / "assets"
        if assets_dir.exists():
            app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str) -> FileResponse:  # pyright: ignore[reportUnusedFunction]
        """Serve SPA for all non-API routes."""
        # NOTE(review): this catch-all is registered after the /api routers
        # (the call below runs at module bottom), so API paths match first.
        file_path = UI_DIR / full_path
        if file_path.exists() and file_path.is_file():
            return FileResponse(file_path)
        return FileResponse(UI_DIR / "index.html")


# Only set up static files if the UI build output is present.
if UI_DIR.exists():
    setup_static_files()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def run_server(host: str = "0.0.0.0", port: int = 8091) -> None:  # noqa: S104
    """Start uvicorn serving this module's FastAPI app (blocking)."""
    import uvicorn

    uvicorn.run("flow.ui.main:app", host=host, port=port, reload=False)


if __name__ == "__main__":
    run_server()
|
src/flow/ui/models/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Microsoft. All rights reserved.
|
| 2 |
+
"""Database models."""
|
| 3 |
+
|
| 4 |
+
from .config import AgentConfig
|
| 5 |
+
from .task import TaskModel
|
| 6 |
+
from .job import OptimizationJob, JobStatus
|
| 7 |
+
from .run import ExperimentRun
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"AgentConfig",
|
| 11 |
+
"TaskModel",
|
| 12 |
+
"OptimizationJob",
|
| 13 |
+
"JobStatus",
|
| 14 |
+
"ExperimentRun",
|
| 15 |
+
]
|