victordibia commited on
Commit
034c2ac
·
1 Parent(s): 34635fd

Deploy 2026-01-26 07:50:36

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +54 -0
  2. Dockerfile +69 -0
  3. README.md +195 -0
  4. pyproject.toml +180 -0
  5. src/flow/__init__.py +26 -0
  6. src/flow/cli/__init__.py +11 -0
  7. src/flow/cli/app.py +216 -0
  8. src/flow/cli/optimize.py +332 -0
  9. src/flow/cli/output.py +99 -0
  10. src/flow/cli/repl.py +153 -0
  11. src/flow/experiments/__init__.py +204 -0
  12. src/flow/experiments/ablation.py +472 -0
  13. src/flow/experiments/config_export.py +184 -0
  14. src/flow/experiments/evaluators/__init__.py +17 -0
  15. src/flow/experiments/evaluators/base.py +32 -0
  16. src/flow/experiments/evaluators/composite.py +80 -0
  17. src/flow/experiments/evaluators/heuristic.py +193 -0
  18. src/flow/experiments/evaluators/llm.py +223 -0
  19. src/flow/experiments/evaluators/trace.py +149 -0
  20. src/flow/experiments/metrics.py +267 -0
  21. src/flow/experiments/optimizer.py +547 -0
  22. src/flow/experiments/reporters/__init__.py +17 -0
  23. src/flow/experiments/reporters/console_reporter.py +135 -0
  24. src/flow/experiments/reporters/json_reporter.py +133 -0
  25. src/flow/experiments/runner.py +243 -0
  26. src/flow/experiments/trace_collector.py +104 -0
  27. src/flow/experiments/types.py +266 -0
  28. src/flow/harness/__init__.py +18 -0
  29. src/flow/harness/base.py +110 -0
  30. src/flow/harness/maf/__init__.py +14 -0
  31. src/flow/harness/maf/agent.py +176 -0
  32. src/flow/harness/maf/harness.py +258 -0
  33. src/flow/harness/maf/message_store.py +177 -0
  34. src/flow/prompts.py +407 -0
  35. src/flow/py.typed +0 -0
  36. src/flow/tools/__init__.py +172 -0
  37. src/flow/tools/coding.py +391 -0
  38. src/flow/tools/core.py +100 -0
  39. src/flow/tools/execution.py +479 -0
  40. src/flow/tools/memory.py +260 -0
  41. src/flow/tools/sub_agent.py +188 -0
  42. src/flow/ui/__init__.py +2 -0
  43. src/flow/ui/api/__init__.py +14 -0
  44. src/flow/ui/api/configs.py +121 -0
  45. src/flow/ui/api/jobs.py +169 -0
  46. src/flow/ui/api/runs.py +157 -0
  47. src/flow/ui/api/tasks.py +119 -0
  48. src/flow/ui/database.py +30 -0
  49. src/flow/ui/main.py +94 -0
  50. src/flow/ui/models/__init__.py +15 -0
.dockerignore ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is relative to the build context (repo root)
2
+
3
+ # Git
4
+ .git
5
+ .gitignore
6
+
7
+ # Python
8
+ __pycache__
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+ .Python
13
+ .venv
14
+ venv
15
+ ENV
16
+ .eggs
17
+ *.egg-info
18
+ dist
19
+ build
20
+
21
+ # Testing/Dev
22
+ .pytest_cache
23
+ .coverage
24
+ htmlcov
25
+ .mypy_cache
26
+ .ruff_cache
27
+ .pyright
28
+
29
+ # IDE
30
+ .vscode
31
+ .idea
32
+ *.swp
33
+ *.swo
34
+
35
+ # Frontend source (built files are already in src/flow/ui/ui/)
36
+ app/frontend/node_modules
37
+ app/frontend/src
38
+ app/frontend/*.json
39
+ app/frontend/*.ts
40
+ app/frontend/*.js
41
+ app/frontend/*.md
42
+ app/frontend/.vite
43
+
44
+ # Docs and deploy folder itself
45
+ docs
46
+ deploy
47
+
48
+ # Local env files (pass via docker env instead)
49
+ .env
50
+ .env.*
51
+ !.env.example
52
+
53
+ # Tests (not needed in production)
54
+ tests
Dockerfile ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Flow UI Container
2
+ # Production-ready deployment with uvicorn workers
3
+
4
+ FROM python:3.11-slim AS base
5
+
6
+ WORKDIR /app
7
+
8
+ # Install system dependencies
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ git \
11
+ curl \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Install uv for fast dependency management
15
+ RUN pip install --no-cache-dir uv
16
+
17
+ # -------------------------------------------------------------------
18
+ # Builder stage: install dependencies
19
+ # -------------------------------------------------------------------
20
+ FROM base AS builder
21
+
22
+ # Copy only dependency files first (better layer caching)
23
+ COPY pyproject.toml uv.lock ./
24
+
25
+ # Install dependencies to system (no venv needed in container)
26
+ RUN uv pip install --system .
27
+
28
+ # -------------------------------------------------------------------
29
+ # Final stage: copy app and run
30
+ # -------------------------------------------------------------------
31
+ FROM base AS final
32
+
33
+ # Copy installed packages from builder
34
+ COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
35
+ COPY --from=builder /usr/local/bin /usr/local/bin
36
+
37
+ # Copy application source (includes pre-built frontend in src/flow/ui/ui/)
38
+ COPY src/ ./src/
39
+
40
+ # Install the app itself (editable, uses already-installed deps)
41
+ RUN uv pip install --system --no-deps -e .
42
+
43
+ # Create non-root user for security
44
+ RUN useradd --create-home --shell /bin/bash flowuser
45
+ RUN mkdir -p /app/data && chown -R flowuser:flowuser /app
46
+ USER flowuser
47
+
48
+ # Configuration
49
+ ENV PORT=7860
50
+ ENV FLOW_DATA_DIR=/app/data
51
+ ENV UVICORN_WORKERS=2
52
+
53
+ # Expose the port
54
+ EXPOSE ${PORT}
55
+
56
+ # Health check - matches the actual endpoint in main.py
57
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
58
+ CMD curl -f http://localhost:${PORT}/api/health || exit 1
59
+
60
+ # Production uvicorn with multiple workers
61
+ # - workers: handle concurrent requests (CPU-bound, use 2-4 for most cases)
62
+ # - For I/O bound (which this is), uvicorn's async handles concurrency well
63
+ # - limit-concurrency prevents overload
64
+ CMD uvicorn flow.ui.main:app \
65
+ --host 0.0.0.0 \
66
+ --port ${PORT} \
67
+ --workers ${UVICORN_WORKERS} \
68
+ --limit-concurrency 100 \
69
+ --timeout-keep-alive 30
README.md ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Flow - Autonomous Coding Agent
3
+ emoji: 🔄
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # Flow
12
+
13
+ **Autonomous Coding Agent with a Polished CLI**
14
+
15
+ Flow is a standalone coding agent that can read, write, and execute code autonomously. It features a clean CLI interface similar to Claude Code, with support for multiple agent runtime harnesses.
16
+
17
+ ## Features
18
+
19
+ - **Autonomous Execution**: Flow doesn't just tell you what to do—it does it. Write code, run tests, fix errors, iterate.
20
+ - **Rich CLI**: Interactive REPL with streaming output, tool call visualization, and syntax highlighting.
21
+ - **Pluggable Harnesses**: Swap out the underlying agent runtime (Microsoft Agent Framework, OpenAI Swarm, etc.)
22
+ - **Persistent Memory**: Remember patterns, decisions, and context across sessions.
23
+ - **Workspace Isolation**: Secure file operations within a sandboxed workspace.
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ # Basic installation
29
+ pip install flow-agent
30
+
31
 + # Microsoft Agent Framework support is included in the base install
32
 + pip install flow-agent
33
+
34
+ # With all optional features
35
+ pip install flow-agent[all]
36
+
37
+ # Development installation
38
+ pip install flow-agent[dev]
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ### 1. Configure Azure OpenAI
44
+
45
+ ```bash
46
+ export AZURE_OPENAI_API_KEY="your-api-key"
47
+ export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
48
+ export AZURE_OPENAI_DEPLOYMENT="gpt-4o"
49
+ ```
50
+
51
+ ### 2. Initialize Flow
52
+
53
+ ```bash
54
+ flow init
55
+ ```
56
+
57
+ ### 3. Run a Task
58
+
59
+ ```bash
60
+ # Single task
61
+ flow run "Create a Python script that calculates fibonacci numbers"
62
+
63
+ # Interactive mode
64
+ flow run -i
65
+ ```
66
+
67
+ ## CLI Commands
68
+
69
+ ```bash
70
+ flow run [TASK] # Run a task or start interactive mode
71
+ flow config # Show current configuration
72
+ flow init # Initialize Flow directories
73
+ flow --help # Show help
74
+ ```
75
+
76
+ ## Usage as a Library
77
+
78
+ ```python
79
+ import asyncio
80
+ from flow import FlowAgent
81
+
82
+ async def main():
83
+ agent = FlowAgent()
84
+
85
+ # Run a task
86
+ response = await agent.run("Create a hello world script")
87
+ print(response)
88
+
89
+ # Stream events
90
+ async for event in agent.run_stream("List files in the workspace"):
91
+ print(event.type, event.content)
92
+
93
+ await agent.close()
94
+
95
+ asyncio.run(main())
96
+ ```
97
+
98
+ ## Configuration
99
+
100
+ Flow can be configured via environment variables or a config file.
101
+
102
+ ### Environment Variables
103
+
104
+ | Variable | Description | Default |
105
+ |----------|-------------|---------|
106
+ | `FLOW_HARNESS` | Agent harness to use | `agent-framework` |
107
+ | `FLOW_MODEL` | Model name | `gpt-4o` |
108
+ | `FLOW_WORKSPACE` | Workspace directory | `~/.flow/workspace` |
109
+ | `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | - |
110
+ | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint | - |
111
+ | `AZURE_OPENAI_DEPLOYMENT` | Azure OpenAI deployment | - |
112
+
113
+ ### Directory Structure
114
+
115
+ ```
116
+ ~/.flow/
117
+ ├── workspace/ # Agent's working directory
118
+ ├── memory/ # Persistent memory storage
119
+ │ ├── patterns/ # Reusable code patterns
120
+ │ ├── projects/ # Per-project notes
121
+ │ └── decisions/ # Architecture decisions
122
+ └── skills/ # Domain-specific expertise
123
+ ```
124
+
125
+ ## Architecture
126
+
127
+ ### Harness System
128
+
129
+ Flow uses a harness abstraction to support multiple agent runtimes:
130
+
131
+ ```
132
+ ┌─────────────────┐
133
+ │ FlowAgent │
134
+ └────────┬────────┘
135
+
136
+ ┌────────▼────────┐
137
+ │ BaseHarness │ (Abstract)
138
+ └────────┬────────┘
139
+
140
+ ┌────┴────┐
141
+ │ │
142
+ ┌───▼───┐ ┌───▼───┐
143
+ │ Agent │ │ OpenAI│
144
+ │ Frmwk │ │ Swarm │
145
+ └───────┘ └───────┘
146
+ ```
147
+
148
+ Currently supported:
149
+ - **MAFHarness**: Microsoft Agent Framework with Azure OpenAI
150
+
151
+ Planned:
152
+ - LangChain
153
+ - Claude SDK
154
+
155
+ ### Tools
156
+
157
+ Flow includes a comprehensive set of tools:
158
+
159
+ | Tool | Description |
160
+ |------|-------------|
161
+ | `read_file` | Read file contents with line numbers |
162
+ | `write_file` | Write/edit files (full write, str_replace, insert) |
163
+ | `list_directory` | List directory contents |
164
+ | `grep_search` | Search for patterns in code |
165
+ | `bash_execute` | Run shell commands |
166
+ | `python_repl` | Execute Python code snippets |
167
+ | `memory` | Persistent memory operations |
168
+ | `think` | Structured reasoning |
169
+ | `task_done` | Report task completion |
170
+
171
+ ## Development
172
+
173
+ ```bash
174
+ # Clone the repository
175
+ git clone https://github.com/victordibia/flow
176
+ cd flow
177
+
178
+ # Install development dependencies
179
+ pip install -e ".[dev]"
180
+
181
+ # Run tests
182
+ pytest tests/ -v
183
+
184
+ # Type checking
185
+ pyright src/
186
+ mypy src/
187
+
188
+ # Linting
189
+ ruff check src/
190
+ ruff format src/
191
+ ```
192
+
193
+ ## License
194
+
195
+ MIT License - see [LICENSE](LICENSE) for details.
pyproject.toml ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "flow-agent"
3
+ version = "0.1.0"
4
+ description = "Autonomous coding agent with a polished CLI"
5
+ authors = [{ name = "Victor Dibia" }]
6
+ readme = "README.md"
7
+ requires-python = ">=3.10"
8
+ license = { text = "MIT" }
9
+ classifiers = [
10
+ "Development Status :: 4 - Beta",
11
+ "Environment :: Console",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ "Typing :: Typed",
20
+ ]
21
+
22
+ dependencies = [
23
+ "pydantic>=2.0.0",
24
+ "pydantic-settings>=2.0.0",
25
+ "rich>=13.0.0",
26
+ "typer>=0.9.0",
27
+ "httpx>=0.25.0",
28
+ "python-dotenv>=1.0.0",
29
+ "agent-framework-core>=1.0.0b0",
30
+ "azure-identity>=1.15.0",
31
+ "pyyaml>=6.0.0",
32
+ # OpenTelemetry for experiments tracing
33
+ "opentelemetry-api>=1.20.0",
34
+ "opentelemetry-sdk>=1.20.0",
35
+ "opentelemetry-semantic-conventions>=0.41b0",
36
+ # Web UI dependencies
37
+ "fastapi>=0.109.0",
38
+ "uvicorn>=0.27.0",
39
+ "sqlmodel>=0.0.14",
40
+ "aiosqlite>=0.19.0",
41
+ ]
42
+
43
+ [project.optional-dependencies]
44
+ # Optional features
45
+ research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
46
+
47
+ # Bundles
48
+ all = ["flow-agent[research]"]
49
+ dev = [
50
+ "pytest>=8.0.0",
51
+ "pytest-asyncio>=0.23.0",
52
+ "pytest-cov>=4.1.0",
53
+ "mypy>=1.8.0",
54
+ "pyright>=1.1.350",
55
+ "ruff>=0.2.0",
56
+ "pre-commit>=3.6.0",
57
+ "poethepoet>=0.24.0",
58
+ ]
59
+
60
+ [project.scripts]
61
+ flow = "flow.cli:main"
62
+
63
+ [project.urls]
64
+ Homepage = "https://github.com/victordibia/flow"
65
+ Repository = "https://github.com/victordibia/flow"
66
+ Issues = "https://github.com/victordibia/flow/issues"
67
+
68
+ [build-system]
69
+ requires = ["hatchling"]
70
+ build-backend = "hatchling.build"
71
+
72
+ [tool.hatch.build.targets.wheel]
73
+ packages = ["src/flow"]
74
+
75
+ # ============================================================================
76
+ # Type Checking - Strict
77
+ # ============================================================================
78
+
79
+ [tool.pyright]
80
+ include = ["src"]
81
+ exclude = ["**/tests/**", "**/.venv/**"]
82
+ typeCheckingMode = "strict"
83
+ pythonVersion = "3.10"
84
+ reportMissingTypeStubs = false
85
+ reportUnnecessaryIsInstance = false
86
+ # agent_framework is optional - ignore type issues in harness
87
+ reportUnknownMemberType = "warning"
88
+ reportUnknownVariableType = "warning"
89
+ reportUnknownArgumentType = "warning"
90
+
91
+ [tool.mypy]
92
+ plugins = ["pydantic.mypy"]
93
+ strict = true
94
+ python_version = "3.10"
95
+ ignore_missing_imports = true
96
+ disallow_untyped_defs = true
97
+ no_implicit_optional = true
98
+ check_untyped_defs = true
99
+ warn_return_any = true
100
+ show_error_codes = true
101
+ warn_unused_ignores = false
102
+ disallow_incomplete_defs = true
103
+ disallow_untyped_decorators = true
104
+
105
+ # ============================================================================
106
+ # Linting - Ruff
107
+ # ============================================================================
108
+
109
+ [tool.ruff]
110
+ line-length = 120
111
+ target-version = "py310"
112
+ src = ["src"]
113
+ fix = true
114
+ include = ["*.py", "*.pyi", "**/pyproject.toml"]
115
+ exclude = ["docs/*"]
116
+
117
+ [tool.ruff.lint]
118
+ select = [
119
+ "E", # pycodestyle errors
120
+ "F", # pyflakes
121
+ "I", # isort
122
+ "B", # bugbear
123
+ "UP", # pyupgrade
124
+ "ANN", # annotations
125
+ "S", # bandit (security)
126
+ "RUF", # ruff-specific
127
+ "ASYNC", # async checks
128
+ "D", # pydocstyle
129
+ ]
130
+ ignore = [
131
+ "D100", # allow missing docstring in public module
132
+ "D104", # allow missing docstring in public package
133
+ "D107", # allow missing docstring in __init__
134
+ "ANN401", # allow Any type (needed for generic tool/event handling)
135
+ "S101", # allow assert statements (used in tests)
136
+ ]
137
+
138
+ [tool.ruff.lint.per-file-ignores]
139
+ "**/tests/**" = ["D", "ANN", "S"]
140
+
141
+ [tool.ruff.lint.pydocstyle]
142
+ convention = "google"
143
+
144
+ [tool.ruff.format]
145
+ docstring-code-format = true
146
+
147
+ # ============================================================================
148
+ # Testing - Pytest
149
+ # ============================================================================
150
+
151
+ [tool.pytest.ini_options]
152
+ testpaths = ["tests"]
153
+ pythonpath = ["src"]
154
+ addopts = "-ra -q -r fEX"
155
+ asyncio_mode = "auto"
156
+ asyncio_default_fixture_loop_scope = "function"
157
+ filterwarnings = []
158
+
159
+ [tool.coverage.run]
160
+ source = ["src/flow"]
161
+ omit = ["**/__init__.py"]
162
+
163
+ [tool.coverage.report]
164
+ exclude_lines = [
165
+ "pragma: no cover",
166
+ "if TYPE_CHECKING:",
167
+ "raise NotImplementedError",
168
+ ]
169
+
170
+ # ============================================================================
171
+ # Task Runner - Poe
172
+ # ============================================================================
173
+
174
+ [tool.poe.tasks]
175
+ fmt = "ruff format src tests"
176
+ lint = "ruff check src tests --fix"
177
+ pyright = "pyright src"
178
+ mypy = "mypy src"
179
+ test = "pytest tests -v --cov=flow --cov-report=term-missing"
180
+ check = ["fmt", "lint", "pyright", "mypy", "test"]
src/flow/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow - Autonomous Coding Agent.
2
+
3
+ An autonomous coding agent with a polished CLI experience.
4
+ Uses Microsoft Agent Framework as the runtime.
5
+
6
+ Usage:
7
+ from flow.harness.maf import MAFHarness
8
+
9
+ # Simple - creates agent with defaults
10
+ harness = MAFHarness()
11
+ async for event in harness.run_stream("Create a hello world script"):
12
+ print(event)
13
+
14
+ # Or with custom settings
15
+ harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
16
+ """
17
+
18
+ from flow.harness.maf import MAFHarness, create_agent
19
+
20
+ __version__ = "0.1.0"
21
+
22
+ __all__ = [
23
+ "MAFHarness",
24
+ "create_agent",
25
+ "__version__",
26
+ ]
src/flow/cli/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow CLI - Command-line interface.
2
+
3
+ Provides the `flow` command for running the autonomous coding agent.
4
+ """
5
+
6
+ from flow.cli.app import app, main
7
+
8
+ __all__ = [
9
+ "app",
10
+ "main",
11
+ ]
src/flow/cli/app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow CLI application.
2
+
3
+ Main entry point for the `flow` command.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import asyncio
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Annotated
12
+
13
+ import typer
14
+ from rich.console import Console
15
+
16
+ from flow import __version__
17
+
18
+ app = typer.Typer(
19
+ name="flow",
20
+ help="Flow - Autonomous Coding Agent",
21
+ add_completion=False,
22
+ no_args_is_help=True,
23
+ )
24
+
25
+ console = Console()
26
+
27
+ # Default paths
28
+ DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
29
+ DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
30
+
31
+
32
def version_callback(value: bool) -> None:
    """Eager `--version` handler: print the version and stop the CLI."""
    if not value:
        return
    console.print(f"Flow v{__version__}")
    raise typer.Exit()
37
+
38
+
39
+ @app.callback()
40
+ def callback(
41
+ version: Annotated[
42
+ bool | None,
43
+ typer.Option("--version", "-v", callback=version_callback, is_eager=True),
44
+ ] = None,
45
+ ) -> None:
46
+ """Flow - Autonomous Coding Agent."""
47
+ pass
48
+
49
+
50
@app.command()
def run(
    task: Annotated[
        str | None,
        typer.Argument(help="Task to execute (or enter interactive mode if not provided)"),
    ] = None,
    workspace: Annotated[
        Path | None,
        typer.Option("--workspace", "-w", help="Workspace directory for writing files"),
    ] = None,
    config: Annotated[
        Path | None,
        typer.Option("--config", "-c", help="Config file from optimization (YAML)"),
    ] = None,
    interactive: Annotated[
        bool,
        typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
    ] = True,
) -> None:
    """Run the coding agent.

    If a task is provided, execute it and exit.
    Otherwise, start an interactive REPL session.

    The agent can read files from anywhere but writes go to the workspace.

    Use --config to load a configuration from a previous optimization run.
    """
    workspace_path = workspace or DEFAULT_WORKSPACE
    memory_path = DEFAULT_MEMORY_PATH

    # Both directories must exist before the harness starts writing to them.
    for directory in (workspace_path, memory_path):
        directory.mkdir(parents=True, exist_ok=True)

    if task:
        # One-shot mode: execute the task, then return to the shell.
        asyncio.run(_run_single_task(workspace_path, memory_path, task, config))
        return

    if not interactive:
        console.print("[red]Error:[/] No task provided and interactive mode disabled.")
        raise typer.Exit(1)

    # No task given: drop into the interactive REPL (imported lazily).
    from flow.cli.repl import FlowREPL

    repl = FlowREPL(workspace=workspace_path, memory_path=memory_path)
    asyncio.run(repl.run())
96
+
97
+
98
async def _run_single_task(
    workspace: Path,
    memory_path: Path,
    task: str,
    config_path: Path | None = None,
) -> None:
    """Run a single task to completion, streaming events to the console."""
    from flow.cli.output import print_event
    from flow.harness.base import EventType
    from flow.harness.maf import MAFHarness

    if config_path is None:
        harness = MAFHarness(workspace=workspace, memory_path=memory_path)
    else:
        # A config file from a previous optimization run overrides the defaults.
        from flow.experiments.ablation import create_harness_from_config
        from flow.experiments.config_export import load_config

        ablation_config = load_config(config_path)
        console.print(f"[dim]Using config: {ablation_config.name}[/]")
        harness = create_harness_from_config(ablation_config, workspace)

    try:
        console.print("\n[bold blue]Flow[/] - Executing task...\n")
        async for event in harness.run_stream(task):
            print_event(console, event)
            # Bail out with a non-zero exit code on the first error event.
            if event.type == EventType.ERROR:
                raise typer.Exit(1)
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop a run; report it without a traceback.
        console.print("\n[yellow]Cancelled.[/]")
    finally:
        # Always release harness resources, even on error or cancellation.
        await harness.close()
133
+
134
+
135
+ # Import and register the optimize command
136
+ from flow.cli.optimize import optimize as optimize_cmd
137
+
138
+ app.command()(optimize_cmd)
139
+
140
+
141
@app.command()
def serve(
    host: Annotated[
        str,
        typer.Option("--host", "-h", help="Host to bind to"),
    ] = "0.0.0.0",  # noqa: S104
    port: Annotated[
        int,
        typer.Option("--port", "-p", help="Port to bind to"),
    ] = 8091,
    reload: Annotated[
        bool,
        typer.Option("--reload", help="Enable auto-reload for development"),
    ] = False,
) -> None:
    """Start the Flow web UI server.

    Launches a web interface for managing agent configurations,
    running optimization experiments, and viewing results.
    """
    # Imported inside the command so other CLI commands don't pay the cost.
    import uvicorn

    console.print(f"\n[bold blue]Flow UI[/] starting on [cyan]http://{host}:{port}[/]\n")

    # App is passed as an import string so --reload can re-import it.
    uvicorn.run("flow.ui.main:app", host=host, port=port, reload=reload)
171
+
172
+
173
@app.command()
def config() -> None:
    """Show current configuration."""
    from rich.table import Table

    table = Table(title="Flow Configuration")
    table.add_column("Setting", style="cyan")
    table.add_column("Value", style="green")

    # Render each setting/value pair; env-backed values fall back to a marker.
    settings = (
        ("Workspace", str(DEFAULT_WORKSPACE)),
        ("Memory Path", str(DEFAULT_MEMORY_PATH)),
        ("Azure Endpoint", os.environ.get("AZURE_OPENAI_ENDPOINT", "(not set)")),
        ("Azure Deployment", os.environ.get("AZURE_OPENAI_DEPLOYMENT", "(not set)")),
    )
    for name, value in settings:
        table.add_row(name, value)

    console.print(table)
188
+
189
+
190
@app.command()
def init() -> None:
    """Initialize Flow directories and show setup instructions."""
    # Create the default directories (no-op when they already exist).
    for directory in (DEFAULT_WORKSPACE, DEFAULT_MEMORY_PATH):
        directory.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold green]Flow initialized![/]\n")
    console.print(f" Workspace: [cyan]{DEFAULT_WORKSPACE}[/]")
    console.print(f" Memory: [cyan]{DEFAULT_MEMORY_PATH}[/]")

    # Walk the user through credential setup and a first run.
    for line in (
        "\n[bold]Next steps:[/]",
        " 1. Set your Azure OpenAI credentials:",
        " [dim]export AZURE_OPENAI_API_KEY=your-key[/]",
        " [dim]export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/[/]",
        " [dim]export AZURE_OPENAI_DEPLOYMENT=your-deployment[/]",
        "\n 2. Run Flow:",
        ' [dim]flow run "Create a hello world Python script"[/]',
        " [dim]flow run -i # Interactive mode[/]",
    ):
        console.print(line)
208
+
209
+
210
def main() -> None:
    """Console-script entry point (wired as `flow = "flow.cli:main"`)."""
    app()
213
+
214
+
215
+ if __name__ == "__main__":
216
+ main()
src/flow/cli/optimize.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Optimize command for finding best agent configurations."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import importlib.util
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Annotated, Any
12
+
13
+ import typer
14
+ from rich.console import Console
15
+
16
+ from flow.experiments.ablation import AblationConfig, CONTEXT_ENGINEERING_CONFIGS
17
+ from flow.experiments.optimizer import (
18
+ FlowOptimizer,
19
+ generate_grid_configs,
20
+ load_tasks_from_jsonl,
21
+ )
22
+ from flow.experiments.types import EvalCriterion, Task
23
+
24
+ console = Console()
25
+
26
+
27
def optimize(
    tasks: Annotated[
        Path | None,
        typer.Option(
            "--tasks", "-t",
            help="Path to tasks.jsonl file",
        ),
    ] = None,
    config: Annotated[
        Path | None,
        typer.Option(
            "--config", "-c",
            help="Path to Python config file with CONFIGS or VARIATIONS",
        ),
    ] = None,
    agent: Annotated[
        Path | None,
        typer.Option(
            "--agent", "-a",
            help="Path to base agent Python file (for optimization)",
        ),
    ] = None,
    suite: Annotated[
        str | None,
        typer.Option(
            "--suite", "-s",
            help="Built-in task suite: coding, research",
        ),
    ] = None,
    parallel: Annotated[
        int,
        typer.Option(
            "--parallel", "-p",
            help="Max concurrent experiments",
        ),
    ] = 4,
    mode: Annotated[
        str,
        typer.Option(
            "--mode", "-m",
            help="Config mode: named (use CONFIGS), grid (use VARIATIONS)",
        ),
    ] = "named",
    vary: Annotated[
        str | None,
        typer.Option(
            "--vary", "-v",
            help="Comma-separated params to vary: compaction,memory,model",
        ),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option(
            "--output", "-o",
            help="Output directory for results",
        ),
    ] = None,
    no_llm_eval: Annotated[
        bool,
        typer.Option(
            "--no-llm-eval",
            help="Disable LLM-as-Judge evaluation (faster, less accurate)",
        ),
    ] = False,
) -> None:
    """Find the best agent configuration through experimentation.

    Runs experiments in parallel, evaluates with LLM-as-Judge,
    ranks via Pareto analysis, and exports winning configs.

    Examples:

        # Run with task file and default configs
        flow optimize --tasks tasks.jsonl

        # Use custom configs from Python file
        flow optimize --config my_configs.py --tasks tasks.jsonl

        # Grid search over variations
        flow optimize --config my_configs.py --tasks tasks.jsonl --mode grid

        # Use built-in task suite
        flow optimize --suite coding --parallel 2

        # Vary specific parameters
        flow optimize --vary compaction,memory --tasks tasks.jsonl
    """
    # This command is a thin synchronous shim over the async driver; note the
    # CLI's --no-llm-eval flag is inverted into use_llm_eval here.
    coroutine = _run_optimize(
        tasks_path=tasks,
        config_path=config,
        agent_path=agent,
        suite=suite,
        parallel=parallel,
        mode=mode,
        vary=vary,
        output_dir=output,
        use_llm_eval=not no_llm_eval,
    )
    asyncio.run(coroutine)
+ ))
125
+
126
+
127
async def _run_optimize(
    tasks_path: Path | None,
    config_path: Path | None,
    agent_path: Path | None,
    suite: str | None,
    parallel: int,
    mode: str,
    vary: str | None,
    output_dir: Path | None,
    use_llm_eval: bool,
) -> None:
    """Run the optimization.

    Loads tasks and candidate configs, prints a summary, then drives
    FlowOptimizer over every (config, task) combination.

    Raises:
        typer.Exit: When no tasks/configs can be resolved, or on Ctrl-C.
    """
    # Load tasks (explicit file, named suite, or default quick suite).
    tasks = _load_tasks(tasks_path, suite)
    if not tasks:
        console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
        raise typer.Exit(1)

    # Load configs (named CONFIGS, grid VARIATIONS, or --vary shorthand).
    configs = _load_configs(config_path, mode, vary)
    if not configs:
        console.print("[red]Error:[/] No configs to test. Use --config or --vary")
        raise typer.Exit(1)

    console.print(f"\n[bold]Tasks:[/] {len(tasks)}")
    for t in tasks:
        console.print(f" - {t.name}")

    console.print(f"\n[bold]Configs:[/] {len(configs)}")
    for c in configs:
        console.print(f" - {c.name}")

    # Run optimizer.  NOTE(review): agent_path is accepted but not used in
    # this function's visible body — confirm whether it should be forwarded.
    optimizer = FlowOptimizer(
        parallel=parallel,
        use_llm_evaluator=use_llm_eval,
        output_dir=output_dir,
    )

    try:
        result = await optimizer.optimize(configs, tasks)

        console.print("\n[bold green]Optimization complete![/]")
        console.print(f"\nBest configs exported to: [cyan]{result.output_dir / 'configs'}[/]")
        console.print("\nTo use a config:")
        console.print(f" [dim]flow run --config {result.output_dir / 'configs' / 'best_score.yaml'} \"your task\"[/]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Optimization cancelled.[/]")
        # Fix (ruff B904, enabled via "B" in this project's lint config):
        # re-raising inside an except block must use `from`; `from None`
        # suppresses the noisy implicit chain — Ctrl-C is already reported.
        raise typer.Exit(1) from None
177
+
178
+
179
def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
    """Load tasks from an explicit JSONL file or fall back to a built-in suite."""
    if tasks_path is None:
        # No file given: use the named suite, or the default quick suite.
        return _get_builtin_suite(suite or "quick")

    if not tasks_path.exists():
        console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
        raise typer.Exit(1)
    return load_tasks_from_jsonl(tasks_path)
192
+
193
+
194
+ def _get_builtin_suite(name: str) -> list[Task]:
195
+ """Get a built-in task suite."""
196
+ suites = {
197
+ "quick": [
198
+ Task(
199
+ name="hello_world",
200
+ prompt="Create a Python script 'hello.py' that prints 'Hello, World!' and run it.",
201
+ criteria=[
202
+ EvalCriterion(name="file_created", instruction="hello.py should be created"),
203
+ EvalCriterion(name="correct_output", instruction="Output should include 'Hello, World!'"),
204
+ ],
205
+ ),
206
+ ],
207
+ "coding": [
208
+ Task(
209
+ name="fizzbuzz",
210
+ prompt="Create fizzbuzz.py that prints 1-30 with Fizz/Buzz/FizzBuzz rules. Run it.",
211
+ criteria=[
212
+ EvalCriterion(name="file_created", instruction="fizzbuzz.py should be created"),
213
+ EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
214
+ ],
215
+ metadata={"category": "short"},
216
+ ),
217
+ Task(
218
+ name="rest_api",
219
+ prompt="Create a FastAPI app with a /health endpoint that returns JSON {'status': 'ok'}. Save as api.py.",
220
+ criteria=[
221
+ EvalCriterion(name="file_created", instruction="api.py should be created"),
222
+ EvalCriterion(name="fastapi_used", instruction="Should use FastAPI"),
223
+ EvalCriterion(name="endpoint_defined", instruction="Should have /health endpoint"),
224
+ ],
225
+ metadata={"category": "medium"},
226
+ ),
227
+ Task(
228
+ name="data_pipeline",
229
+ prompt="""Create a data processing pipeline:
230
+ 1. data_types.py - DataRecord dataclass (id, name, value)
231
+ 2. validators.py - validate_id, validate_name functions
232
+ 3. pipeline.py - chain validators together
233
+ 4. test_pipeline.py - tests for the pipeline
234
+ Run the tests.""",
235
+ criteria=[
236
+ EvalCriterion(name="modules_created", instruction="All 4 Python files created"),
237
+ EvalCriterion(name="tests_run", instruction="Tests should be executed"),
238
+ ],
239
+ metadata={"category": "long"},
240
+ ),
241
+ ],
242
+ "research": [
243
+ Task(
244
+ name="codebase_analysis",
245
+ prompt="""Analyze this workspace:
246
+ 1. Explore the directory structure
247
+ 2. Identify Python files and their purposes
248
+ 3. Create analysis_report.md with findings""",
249
+ criteria=[
250
+ EvalCriterion(name="exploration", instruction="Should explore directory"),
251
+ EvalCriterion(name="report_created", instruction="analysis_report.md created"),
252
+ ],
253
+ metadata={"category": "research"},
254
+ ),
255
+ ],
256
+ }
257
+
258
+ if name not in suites:
259
+ console.print(f"[red]Error:[/] Unknown suite '{name}'. Available: {list(suites.keys())}")
260
+ raise typer.Exit(1)
261
+
262
+ return suites[name]
263
+
264
+
265
+ def _load_configs(
266
+ config_path: Path | None,
267
+ mode: str,
268
+ vary: str | None,
269
+ ) -> list[AblationConfig]:
270
+ """Load configs from file or generate from variations."""
271
+ # Load from Python file
272
+ if config_path:
273
+ if not config_path.exists():
274
+ console.print(f"[red]Error:[/] Config file not found: {config_path}")
275
+ raise typer.Exit(1)
276
+
277
+ configs, variations = _load_python_config(config_path)
278
+
279
+ if mode == "grid" and variations:
280
+ return generate_grid_configs("grid", variations)
281
+ elif configs:
282
+ return configs
283
+ else:
284
+ console.print("[red]Error:[/] Config file has no CONFIGS or VARIATIONS")
285
+ raise typer.Exit(1)
286
+
287
+ # Generate from --vary flag
288
+ if vary:
289
+ variations = _parse_vary_flag(vary)
290
+ return generate_grid_configs("vary", variations)
291
+
292
+ # Default: use context engineering configs
293
+ return CONTEXT_ENGINEERING_CONFIGS
294
+
295
+
296
+ def _load_python_config(path: Path) -> tuple[list[AblationConfig], dict[str, Any]]:
297
+ """Load CONFIGS and VARIATIONS from a Python file."""
298
+ spec = importlib.util.spec_from_file_location("config_module", path)
299
+ if spec is None or spec.loader is None:
300
+ raise ValueError(f"Cannot load {path}")
301
+
302
+ module = importlib.util.module_from_spec(spec)
303
+ sys.modules["config_module"] = module
304
+ spec.loader.exec_module(module)
305
+
306
+ configs = getattr(module, "CONFIGS", [])
307
+ variations = getattr(module, "VARIATIONS", {})
308
+
309
+ return configs, variations
310
+
311
+
312
+ def _parse_vary_flag(vary: str) -> dict[str, Any]:
313
+ """Parse --vary flag into variations dict."""
314
+ variations = {}
315
+
316
+ for param in vary.split(","):
317
+ param = param.strip().lower()
318
+
319
+ if param in ("compaction", "compact"):
320
+ variations["enable_message_compaction"] = [True, False]
321
+ elif param in ("memory", "mem"):
322
+ variations["enable_memory_tool"] = [True, False]
323
+ elif param in ("subagent", "sub"):
324
+ variations["enable_sub_agent"] = [True, False]
325
+ elif param in ("head", "head_size"):
326
+ variations["compaction_head_size"] = [5, 10, 20]
327
+ elif param in ("tail", "tail_size"):
328
+ variations["compaction_tail_size"] = [20, 40, 60]
329
+ else:
330
+ console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
331
+
332
+ return variations
src/flow/cli/output.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Output formatting for Flow CLI.
2
+
3
+ Provides functions for rendering agent events to the terminal
4
+ with rich formatting.
5
+ """
6
+
7
+ from rich.console import Console
8
+ from rich.markdown import Markdown
9
+ from rich.markup import escape
10
+ from rich.panel import Panel
11
+ from rich.syntax import Syntax
12
+
13
+ from flow.harness.base import Event, EventType
14
+
15
+
16
+ def print_event(console: Console, event: Event) -> None:
17
+ """Print an agent event to the console.
18
+
19
+ Args:
20
+ console: Rich console instance
21
+ event: Event to print
22
+ """
23
+ if event.type == EventType.TEXT_DELTA:
24
+ # Stream text without newline
25
+ console.print(event.content, end="")
26
+
27
+ elif event.type == EventType.TEXT_DONE:
28
+ # Final text - print with newline
29
+ if event.content:
30
+ console.print(event.content)
31
+ console.print() # Extra newline for spacing
32
+
33
+ elif event.type == EventType.TOOL_CALL_START:
34
+ # Show tool being called
35
+ tool_name = event.tool_name or "unknown"
36
+ console.print(f"\n[dim]▶ Calling tool:[/] [cyan]{tool_name}[/]")
37
+
38
+ elif event.type == EventType.TOOL_CALL_ARGS:
39
+ # Show tool arguments (streaming) - escape to prevent Rich markup interpretation
40
+ if event.content:
41
+ console.print(f"[dim]{escape(event.content)}[/]", end="")
42
+
43
+ elif event.type == EventType.TOOL_CALL_DONE:
44
+ # Tool call complete
45
+ console.print() # Newline after args
46
+
47
+ elif event.type == EventType.TOOL_RESULT:
48
+ # Show tool result (truncated if long)
49
+ result = event.content or ""
50
+ if len(result) > 500:
51
+ result = result[:500] + "\n... (truncated)"
52
+
53
+ console.print(Panel(
54
+ escape(result),
55
+ title="[green]Tool Result[/]",
56
+ border_style="dim",
57
+ expand=False,
58
+ ))
59
+
60
+ elif event.type == EventType.THINKING:
61
+ # Show agent thinking
62
+ console.print(f"[dim italic]💭 {escape(event.content or '')}[/]")
63
+
64
+ elif event.type == EventType.ERROR:
65
+ # Show error
66
+ console.print(f"\n[bold red]Error:[/] {escape(event.content or '')}")
67
+
68
+ elif event.type == EventType.DONE:
69
+ # Execution complete
70
+ console.print("\n[dim]─── Done ───[/]\n")
71
+
72
+
73
+ def print_welcome(console: Console) -> None:
74
+ """Print welcome message for interactive mode."""
75
+ console.print("\n[bold blue]Flow[/] - Autonomous Coding Agent")
76
+ console.print("[dim]Type your task and press Enter. Type 'exit' or Ctrl+D to quit.[/]\n")
77
+
78
+
79
+ def print_code(console: Console, code: str, language: str = "python") -> None:
80
+ """Print syntax-highlighted code.
81
+
82
+ Args:
83
+ console: Rich console instance
84
+ code: Code to print
85
+ language: Programming language for syntax highlighting
86
+ """
87
+ syntax = Syntax(code, language, theme="monokai", line_numbers=True)
88
+ console.print(syntax)
89
+
90
+
91
+ def print_markdown(console: Console, text: str) -> None:
92
+ """Print markdown-formatted text.
93
+
94
+ Args:
95
+ console: Rich console instance
96
+ text: Markdown text to print
97
+ """
98
+ md = Markdown(text)
99
+ console.print(md)
src/flow/cli/repl.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interactive REPL for Flow.
2
+
3
+ Provides an interactive command-line interface for running
4
+ the Flow agent with streaming output.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ from rich.console import Console
12
+
13
+ from flow.cli.output import print_event, print_welcome
14
+ from flow.harness.base import EventType
15
+ from flow.harness.maf import MAFHarness
16
+
17
+ # Default paths
18
+ DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
19
+ DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
20
+
21
+
22
+ class FlowREPL:
23
+ """Interactive REPL for Flow agent.
24
+
25
+ Provides a command-line interface similar to Claude Code,
26
+ with streaming output and tool call visualization.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ workspace: Path | None = None,
32
+ memory_path: Path | None = None,
33
+ ) -> None:
34
+ """Initialize the REPL.
35
+
36
+ Args:
37
+ workspace: Workspace directory. Defaults to ~/.flow/workspace.
38
+ memory_path: Memory directory. Defaults to ~/.flow/memory.
39
+ """
40
+ self._workspace = workspace or DEFAULT_WORKSPACE
41
+ self._memory_path = memory_path or DEFAULT_MEMORY_PATH
42
+ self._console = Console()
43
+ self._harness: MAFHarness | None = None
44
+ self._thread_id: str | None = None
45
+
46
+ def _get_harness(self) -> MAFHarness:
47
+ """Get or create the harness instance."""
48
+ if self._harness is None:
49
+ self._harness = MAFHarness(
50
+ workspace=self._workspace,
51
+ memory_path=self._memory_path,
52
+ )
53
+ return self._harness
54
+
55
+ async def run(self) -> None:
56
+ """Run the interactive REPL loop."""
57
+ print_welcome(self._console)
58
+
59
+ harness = self._get_harness()
60
+
61
+ while True:
62
+ try:
63
+ # Get user input
64
+ user_input = self._get_input()
65
+
66
+ if user_input is None:
67
+ # EOF (Ctrl+D)
68
+ break
69
+
70
+ user_input = user_input.strip()
71
+
72
+ if not user_input:
73
+ continue
74
+
75
+ # Handle special commands
76
+ if user_input.lower() in ("exit", "quit", "q"):
77
+ break
78
+
79
+ if user_input.lower() == "clear":
80
+ self._console.clear()
81
+ print_welcome(self._console)
82
+ continue
83
+
84
+ if user_input.lower() == "help":
85
+ self._print_help()
86
+ continue
87
+
88
+ if user_input.lower() == "config":
89
+ self._print_config()
90
+ continue
91
+
92
+ # Run the task
93
+ await self._run_task(harness, user_input)
94
+
95
+ except KeyboardInterrupt:
96
+ self._console.print("\n[yellow]Interrupted. Type 'exit' to quit.[/]")
97
+ continue
98
+
99
+ # Cleanup
100
+ self._console.print("\n[dim]Goodbye![/]\n")
101
+ if self._harness:
102
+ await self._harness.close()
103
+
104
+ def _get_input(self) -> str | None:
105
+ """Get input from the user.
106
+
107
+ Returns:
108
+ User input string, or None on EOF.
109
+ """
110
+ try:
111
+ return self._console.input("[bold green]>[/] ")
112
+ except EOFError:
113
+ return None
114
+
115
+ async def _run_task(self, harness: MAFHarness, task: str) -> None:
116
+ """Run a task and stream the output.
117
+
118
+ Args:
119
+ harness: Harness instance
120
+ task: Task to execute
121
+ """
122
+ self._console.print() # Blank line before output
123
+
124
+ try:
125
+ async for event in harness.run_stream(task, self._thread_id):
126
+ print_event(self._console, event)
127
+
128
+ # Store thread ID for conversation continuity
129
+ if event.type == EventType.DONE:
130
+ self._thread_id = harness.get_thread_id()
131
+
132
+ except Exception as e:
133
+ self._console.print(f"\n[bold red]Error:[/] {e}")
134
+
135
+ def _print_help(self) -> None:
136
+ """Print help information."""
137
+ self._console.print("\n[bold]Flow Commands:[/]")
138
+ self._console.print(" [cyan]exit[/], [cyan]quit[/], [cyan]q[/] - Exit the REPL")
139
+ self._console.print(" [cyan]clear[/] - Clear the screen")
140
+ self._console.print(" [cyan]config[/] - Show current configuration")
141
+ self._console.print(" [cyan]help[/] - Show this help message")
142
+ self._console.print("\n[bold]Tips:[/]")
143
+ self._console.print(" - Type your task and press Enter to execute")
144
+ self._console.print(" - Press Ctrl+C to cancel a running task")
145
+ self._console.print(" - Press Ctrl+D to exit")
146
+ self._console.print()
147
+
148
+ def _print_config(self) -> None:
149
+ """Print current configuration."""
150
+ self._console.print("\n[bold]Configuration:[/]")
151
+ self._console.print(f" Workspace: [cyan]{self._workspace}[/]")
152
+ self._console.print(f" Memory: [cyan]{self._memory_path}[/]")
153
+ self._console.print()
src/flow/experiments/__init__.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Experiments framework for running and evaluating Flow agent tasks.
4
+
5
+ This package provides a structured way to:
6
+ - Define tasks with evaluation criteria
7
+ - Run agents on tasks and collect OpenTelemetry traces
8
+ - Evaluate agent outputs using LLM, heuristic, or trace-based evaluators
9
+ - Extract metrics from execution traces
10
+ - Run ablation studies comparing different configurations
11
+
12
+ Example usage:
13
+ from flow.harness.maf import MAFHarness
14
+ from flow.experiments import (
15
+ FlowExperimentRunner,
16
+ Task,
17
+ EvalCriterion,
18
+ TraceEvaluator,
19
+ HeuristicEvaluator,
20
+ extract_metrics,
21
+ format_metrics_summary,
22
+ setup_tracing,
23
+ )
24
+
25
+ # Setup tracing (call once at startup)
26
+ setup_tracing("my-experiment")
27
+
28
+ # Define a task
29
+ task = Task(
30
+ name="hello_world",
31
+ prompt="Write a Python function that prints 'Hello, World!'",
32
+ criteria=[
33
+ EvalCriterion(
34
+ name="correctness",
35
+ instruction="The function should print exactly 'Hello, World!'",
36
+ ),
37
+ ],
38
+ )
39
+
40
+ # Run the experiment
41
+ harness = MAFHarness()
42
+ runner = FlowExperimentRunner(keep_workspace=True)
43
+ result = await runner.run(harness, task)
44
+
45
+ # Extract metrics
46
+ metrics = extract_metrics(result.trace)
47
+ print(format_metrics_summary(metrics))
48
+
49
+ # Evaluate the result
50
+ evaluator = HeuristicEvaluator()
51
+ eval_result = await evaluator.evaluate(result)
52
+ print(f"Score: {eval_result.score}, Passed: {eval_result.passed}")
53
+
54
+ await harness.close()
55
+
56
+ Ablation studies:
57
+ from flow.experiments import run_ablations, AblationConfig
58
+
59
+ configs = [
60
+ AblationConfig(name="baseline", enable_message_compaction=False),
61
+ AblationConfig(name="with_compaction", enable_message_compaction=True),
62
+ ]
63
+
64
+ results = await run_ablations(
65
+ configs,
66
+ task_prompt="Create a simple HTTP server",
67
+ )
68
+ """
69
+
70
+ # Types
71
+ # Ablation
72
+ from .ablation import (
73
+ AGENT_MEMORY_ONLY,
74
+ ALL_CONTEXT_ENGINEERING,
75
+ COMPACTION_ONLY,
76
+ # Context engineering configs
77
+ CONTEXT_ENG_BASELINE,
78
+ CONTEXT_ENGINEERING_CONFIGS,
79
+ ISOLATION_ONLY,
80
+ AblationConfig,
81
+ AblationResult,
82
+ # Shared utilities
83
+ compute_pareto_frontier,
84
+ create_harness_from_config,
85
+ generate_recommendation,
86
+ run_ablations,
87
+ run_context_engineering_comparison,
88
+ run_single_ablation,
89
+ )
90
+
91
+ # Config export
92
+ from .config_export import (
93
+ export_config,
94
+ export_optimization_configs,
95
+ load_config,
96
+ )
97
+
98
+ # Evaluators
99
+ from .evaluators import (
100
+ CompositeEvaluator,
101
+ Evaluator,
102
+ HeuristicEvaluator,
103
+ LLMEvaluator,
104
+ TraceEvaluator,
105
+ )
106
+
107
+ # Metrics
108
+ from .metrics import (
109
+ LLMCallInfo,
110
+ ToolCallInfo,
111
+ TraceMetrics,
112
+ extract_metrics,
113
+ format_metrics_summary,
114
+ metrics_to_dict,
115
+ )
116
+
117
+ # Optimizer
118
+ from .optimizer import (
119
+ ConfigSummary,
120
+ FlowOptimizer,
121
+ OptimizationResult,
122
+ TaskResult,
123
+ generate_grid_configs,
124
+ load_tasks_from_jsonl,
125
+ )
126
+
127
+ # Reporters
128
+ from .reporters import (
129
+ load_run_result_summary,
130
+ print_comparison_table,
131
+ print_eval_result,
132
+ print_metrics_summary,
133
+ save_comparison,
134
+ save_run_result,
135
+ )
136
+
137
+ # Runner
138
+ from .runner import FlowExperimentRunner, setup_tracing
139
+
140
+ # Trace collection
141
+ from .trace_collector import FlowTraceCollector
142
+ from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
143
+
144
+ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
145
+ # Types
146
+ "Task",
147
+ "EvalCriterion",
148
+ "RunResult",
149
+ "EvalResult",
150
+ "CriterionResult",
151
+ # Trace collection
152
+ "FlowTraceCollector",
153
+ # Metrics
154
+ "TraceMetrics",
155
+ "LLMCallInfo",
156
+ "ToolCallInfo",
157
+ "extract_metrics",
158
+ "format_metrics_summary",
159
+ "metrics_to_dict",
160
+ # Runner
161
+ "FlowExperimentRunner",
162
+ "setup_tracing",
163
+ # Evaluators
164
+ "Evaluator",
165
+ "LLMEvaluator",
166
+ "TraceEvaluator",
167
+ "HeuristicEvaluator",
168
+ "CompositeEvaluator",
169
+ # Reporters
170
+ "save_run_result",
171
+ "load_run_result_summary",
172
+ "save_comparison",
173
+ "print_metrics_summary",
174
+ "print_comparison_table",
175
+ "print_eval_result",
176
+ # Ablation
177
+ "AblationConfig",
178
+ "AblationResult",
179
+ "run_ablations",
180
+ "run_single_ablation",
181
+ "create_harness_from_config",
182
+ # Context engineering configs
183
+ "CONTEXT_ENG_BASELINE",
184
+ "COMPACTION_ONLY",
185
+ "AGENT_MEMORY_ONLY",
186
+ "ISOLATION_ONLY",
187
+ "ALL_CONTEXT_ENGINEERING",
188
+ "CONTEXT_ENGINEERING_CONFIGS",
189
+ "run_context_engineering_comparison",
190
+ # Shared utilities
191
+ "compute_pareto_frontier",
192
+ "generate_recommendation",
193
+ # Optimizer
194
+ "FlowOptimizer",
195
+ "OptimizationResult",
196
+ "ConfigSummary",
197
+ "TaskResult",
198
+ "generate_grid_configs",
199
+ "load_tasks_from_jsonl",
200
+ # Config export
201
+ "export_config",
202
+ "load_config",
203
+ "export_optimization_configs",
204
+ ]
src/flow/experiments/ablation.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Ablation runner for comparing Flow agent configurations.
4
+
5
+ This module provides:
6
+ - AblationConfig: Dataclass for agent configuration parameters
7
+ - Pareto analysis utilities for multi-objective optimization
8
+ - Pre-defined configurations for context engineering strategies
9
+ - Convenience functions for running ablation studies
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from dataclasses import asdict, dataclass
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ from typing import TYPE_CHECKING
20
+
21
+ from .evaluators import HeuristicEvaluator
22
+ from .metrics import TraceMetrics, extract_metrics, metrics_to_dict
23
+ from .reporters import print_comparison_table, save_run_result
24
+ from .runner import FlowExperimentRunner, setup_tracing
25
+ from .types import EvalCriterion, RunResult, Task
26
+
27
+ if TYPE_CHECKING:
28
+ from flow.harness.maf import MAFHarness
29
+
30
+ from .optimizer import ConfigSummary
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ @dataclass
36
+ class AblationConfig:
37
+ """Configuration for a single ablation run.
38
+
39
+ Each config represents a different agent configuration to test.
40
+ The name is used as an identifier in comparison results.
41
+
42
+ Attributes:
43
+ name: Unique identifier for this configuration
44
+ enable_message_compaction: Whether to enable message compaction
45
+ enable_memory_tool: Whether to enable agent-managed memory
46
+ enable_sub_agent: Whether to enable sub-agent for isolated research
47
+ compaction_head_size: Number of initial messages to keep
48
+ compaction_tail_size: Number of recent messages to keep
49
+ bash_timeout: Timeout for bash commands in seconds
50
+ """
51
+
52
+ name: str
53
+ enable_message_compaction: bool = True
54
+ enable_memory_tool: bool = True
55
+ enable_sub_agent: bool = False
56
+ compaction_head_size: int = 10
57
+ compaction_tail_size: int = 40
58
+ bash_timeout: int = 120
59
+
60
+
61
+ @dataclass
62
+ class AblationResult:
63
+ """Result of a single ablation run.
64
+
65
+ Contains all data from the run including raw results,
66
+ extracted metrics, and evaluation scores.
67
+ """
68
+
69
+ config: AblationConfig
70
+ run_result: RunResult
71
+ metrics: TraceMetrics
72
+ eval_score: float
73
+ eval_passed: bool
74
+ eval_reasoning: str
75
+
76
+
77
+ def create_harness_from_config(config: AblationConfig, workspace: Path) -> MAFHarness:
78
+ """Create a MAFHarness from an ablation config.
79
+
80
+ Args:
81
+ config: The ablation configuration
82
+ workspace: Working directory
83
+
84
+ Returns:
85
+ A configured MAFHarness
86
+ """
87
+ from flow.harness.maf import MAFHarness
88
+
89
+ return MAFHarness(
90
+ workspace=workspace,
91
+ memory_path=workspace / "memory",
92
+ enable_compaction=config.enable_message_compaction,
93
+ enable_memory_tool=config.enable_memory_tool,
94
+ enable_sub_agent=config.enable_sub_agent,
95
+ compaction_head_size=config.compaction_head_size,
96
+ compaction_tail_size=config.compaction_tail_size,
97
+ bash_timeout=config.bash_timeout,
98
+ )
99
+
100
+
101
+ async def run_single_ablation(
102
+ config: AblationConfig,
103
+ task: Task,
104
+ workspace: Path,
105
+ ) -> AblationResult:
106
+ """Run a single ablation with trace capture and evaluation.
107
+
108
+ Args:
109
+ config: The ablation configuration
110
+ task: The task to run
111
+ workspace: Working directory
112
+
113
+ Returns:
114
+ AblationResult with metrics and evaluation
115
+ """
116
+ # Create harness from config
117
+ harness = create_harness_from_config(config, workspace)
118
+
119
+ try:
120
+ # Create runner
121
+ runner = FlowExperimentRunner(keep_workspace=True)
122
+
123
+ # Run the experiment
124
+ run_result = await runner.run(harness, task, workspace=workspace)
125
+
126
+ # Extract metrics
127
+ metrics = extract_metrics(run_result.trace)
128
+
129
+ # Evaluate the result
130
+ evaluator = HeuristicEvaluator()
131
+ eval_result = await evaluator.evaluate(run_result)
132
+
133
+ return AblationResult(
134
+ config=config,
135
+ run_result=run_result,
136
+ metrics=metrics,
137
+ eval_score=eval_result.score,
138
+ eval_passed=eval_result.passed,
139
+ eval_reasoning=eval_result.reasoning,
140
+ )
141
+ finally:
142
+ await harness.close()
143
+
144
+
145
+ def save_ablation_result(result: AblationResult, output_dir: Path) -> None:
146
+ """Save ablation result to files.
147
+
148
+ Creates a subdirectory for the config with all result files.
149
+
150
+ Args:
151
+ result: The ablation result to save
152
+ output_dir: Base directory for output
153
+ """
154
+ config_dir = output_dir / result.config.name
155
+ save_run_result(
156
+ result.run_result,
157
+ config_dir,
158
+ metrics=result.metrics,
159
+ )
160
+
161
+ # Save ablation-specific data
162
+ with open(config_dir / "ablation.json", "w") as f:
163
+ json.dump({
164
+ "config": asdict(result.config),
165
+ "evaluation": {
166
+ "score": result.eval_score,
167
+ "passed": result.eval_passed,
168
+ "reasoning": result.eval_reasoning,
169
+ },
170
+ }, f, indent=2)
171
+
172
+
173
+ async def run_ablations(
174
+ configs: list[AblationConfig],
175
+ task_prompt: str,
176
+ output_dir: Path | None = None,
177
+ task_name: str = "ablation_task",
178
+ ) -> list[AblationResult]:
179
+ """Run multiple ablation configurations and compare.
180
+
181
+ This function:
182
+ 1. Sets up tracing
183
+ 2. Runs each configuration on the same task
184
+ 3. Collects metrics and evaluation scores
185
+ 4. Saves results and prints comparison
186
+
187
+ Args:
188
+ configs: List of configurations to test
189
+ task_prompt: The task prompt to run
190
+ output_dir: Base directory for output (default: ~/.flow/ablations)
191
+ task_name: Name for the task (used in file paths)
192
+
193
+ Returns:
194
+ List of ablation results
195
+ """
196
+ # Setup output directory
197
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
198
+ if output_dir is None:
199
+ output_dir = Path.home() / ".flow" / "ablations"
200
+ output_dir = output_dir / timestamp
201
+ output_dir.mkdir(parents=True, exist_ok=True)
202
+
203
+ # Create task
204
+ task = Task(
205
+ name=task_name,
206
+ prompt=task_prompt,
207
+ criteria=[
208
+ EvalCriterion(
209
+ name="completion",
210
+ instruction="The task should be completed successfully",
211
+ ),
212
+ ],
213
+ )
214
+
215
+ # Save configs
216
+ with open(output_dir / "config.json", "w") as f: # noqa: ASYNC230
217
+ json.dump({
218
+ "task": task_prompt,
219
+ "timestamp": timestamp,
220
+ "configs": [asdict(c) for c in configs],
221
+ }, f, indent=2)
222
+
223
+ print("=" * 80)
224
+ print(" FLOW ABLATION RUNNER")
225
+ print("=" * 80)
226
+ print(f" Task: {task_prompt[:60]}{'...' if len(task_prompt) > 60 else ''}")
227
+ print(f" Configs: {len(configs)}")
228
+ print(f" Output: {output_dir}")
229
+ print("=" * 80)
230
+
231
+ # Setup tracing once
232
+ setup_tracing("flow-ablation")
233
+
234
+ results = []
235
+ for i, config in enumerate(configs, 1):
236
+ print(f"\n[{i}/{len(configs)}] Running: {config.name}")
237
+ print("-" * 40)
238
+
239
+ # Each config gets its own workspace
240
+ workspace = output_dir / config.name / "workspace"
241
+ workspace.mkdir(parents=True, exist_ok=True)
242
+
243
+ result = await run_single_ablation(
244
+ config=config,
245
+ task=task,
246
+ workspace=workspace,
247
+ )
248
+
249
+ results.append(result)
250
+ save_ablation_result(result, output_dir)
251
+
252
+ # Quick status
253
+ status = "OK" if result.run_result.success else "FAIL"
254
+ print(f" {status} | {result.run_result.duration_seconds:.1f}s | "
255
+ f"Tokens: {result.metrics.total_tokens} | Tools: {result.metrics.tool_call_count}")
256
+
257
+ # Save comparison
258
+ comparison_data = [
259
+ {
260
+ "name": r.config.name,
261
+ "success": r.run_result.success,
262
+ "duration_seconds": r.run_result.duration_seconds,
263
+ "metrics": metrics_to_dict(r.metrics),
264
+ "evaluation": {
265
+ "score": r.eval_score,
266
+ "passed": r.eval_passed,
267
+ },
268
+ }
269
+ for r in results
270
+ ]
271
+
272
+ with open(output_dir / "comparison.json", "w") as f: # noqa: ASYNC230
273
+ json.dump({"task": task_prompt, "results": comparison_data}, f, indent=2)
274
+
275
+ # Print comparison
276
+ print_comparison_table(comparison_data, "Ablation Comparison")
277
+
278
+ print(f"\nResults saved to: {output_dir}")
279
+
280
+ return results
281
+
282
+
283
+ # =============================================================================
284
+ # Context Engineering Baseline Configurations
285
+ # =============================================================================
286
+ # These configurations demonstrate the three main context engineering strategies:
287
+ # 1. Compaction - Reactive trimming via message stores
288
+ # 2. Agent-Managed Memory - Agent controls when to write/read/delete
289
+ # 3. Isolation - Sub-agent architecture prevents context pollution
290
+
291
+
292
+ # Baseline: No context engineering (for comparison)
293
+ CONTEXT_ENG_BASELINE = AblationConfig(
294
+ name="no_context_engineering",
295
+ enable_message_compaction=False,
296
+ enable_memory_tool=False,
297
+ enable_sub_agent=False,
298
+ )
299
+
300
+ # Strategy 1: Compaction via Message Stores
301
+ # Uses HeadTailCompactingMessageStore to keep first N + last M messages
302
+ # Good for: Long-running sessions where middle context is less important
303
+ COMPACTION_ONLY = AblationConfig(
304
+ name="compaction_only",
305
+ enable_message_compaction=True,
306
+ enable_memory_tool=False,
307
+ enable_sub_agent=False,
308
+ compaction_head_size=10, # Keep task context
309
+ compaction_tail_size=40, # Keep recent work
310
+ )
311
+
312
+ # Strategy 2: Agent-Managed Memory
313
+ # Agent decides when to save/retrieve information from persistent storage
314
+ # Good for: Cross-session memory, learning patterns, storing decisions
315
+ AGENT_MEMORY_ONLY = AblationConfig(
316
+ name="agent_memory_only",
317
+ enable_message_compaction=False,
318
+ enable_memory_tool=True,
319
+ enable_sub_agent=False,
320
+ )
321
+
322
+ # Strategy 3: Isolation via Sub-Agent
323
+ # Delegate heavy research to sub-agent with isolated context
324
+ # Good for: Complex research tasks that would pollute main context
325
+ ISOLATION_ONLY = AblationConfig(
326
+ name="isolation_only",
327
+ enable_message_compaction=False,
328
+ enable_memory_tool=False,
329
+ enable_sub_agent=True,
330
+ )
331
+
332
+ # Combined: All context engineering strategies
333
+ # Uses compaction + memory + isolation together
334
+ # Good for: Production systems with long-running, complex tasks
335
+ ALL_CONTEXT_ENGINEERING = AblationConfig(
336
+ name="all_context_engineering",
337
+ enable_message_compaction=True,
338
+ enable_memory_tool=True,
339
+ enable_sub_agent=True,
340
+ compaction_head_size=10,
341
+ compaction_tail_size=40,
342
+ )
343
+
344
+ # Predefined list for running context engineering comparison
345
+ CONTEXT_ENGINEERING_CONFIGS = [
346
+ CONTEXT_ENG_BASELINE,
347
+ COMPACTION_ONLY,
348
+ AGENT_MEMORY_ONLY,
349
+ ISOLATION_ONLY,
350
+ ALL_CONTEXT_ENGINEERING,
351
+ ]
352
+
353
+
354
+ async def run_context_engineering_comparison(
355
+ task_prompt: str,
356
+ output_dir: Path | None = None,
357
+ ) -> list[AblationResult]:
358
+ """Run a comparison of all context engineering strategies.
359
+
360
+ This is a convenience function that runs all context engineering
361
+ baseline configurations against a single task for comparison.
362
+
363
+ Args:
364
+ task_prompt: The task to run (should benefit from context management)
365
+ output_dir: Optional output directory for results
366
+
367
+ Returns:
368
+ List of AblationResult for each strategy
369
+
370
+ Example:
371
+ >>> results = await run_context_engineering_comparison(
372
+ ... "Research the authentication patterns in this codebase and "
373
+ ... "create a summary document with recommendations."
374
+ ... )
375
+ """
376
+ return await run_ablations(
377
+ configs=CONTEXT_ENGINEERING_CONFIGS,
378
+ task_prompt=task_prompt,
379
+ output_dir=output_dir,
380
+ task_name="context_engineering_comparison",
381
+ )
382
+
383
+
384
+ # =============================================================================
385
+ # Shared Utilities for Pareto Analysis
386
+ # =============================================================================
387
+
388
+
389
+ def compute_pareto_frontier(
390
+ summaries: list[ConfigSummary],
391
+ score_key: str = "avg_score",
392
+ cost_key: str = "avg_tokens",
393
+ ) -> list[str]:
394
+ """Compute Pareto frontier for multi-objective optimization.
395
+
396
+ Identifies configurations that are not dominated by any other configuration.
397
+ A config is dominated if another config has better score AND lower tokens.
398
+
399
+ Args:
400
+ summaries: List of ConfigSummary objects (or dicts with score/token keys)
401
+ score_key: Attribute name for the score metric (higher is better)
402
+ cost_key: Attribute name for the cost metric (lower is better)
403
+
404
+ Returns:
405
+ List of names of Pareto-optimal configurations
406
+ """
407
+ # Sort by cost (ascending)
408
+ def get_val(s: object, key: str) -> float:
409
+ if isinstance(s, dict):
410
+ return float(s.get(key, 0))
411
+ return float(getattr(s, key, 0))
412
+
413
+ def get_name(s: object) -> str:
414
+ if isinstance(s, dict):
415
+ return str(s.get("name", ""))
416
+ return str(getattr(s, "name", ""))
417
+
418
+ sorted_summaries = sorted(summaries, key=lambda s: get_val(s, cost_key))
419
+
420
+ pareto_names = []
421
+ best_score = -1.0
422
+
423
+ for summary in sorted_summaries:
424
+ score = get_val(summary, score_key)
425
+ if score > best_score:
426
+ pareto_names.append(get_name(summary))
427
+ best_score = score
428
+
429
+ return pareto_names
430
+
431
+
432
+ def generate_recommendation(
433
+ summaries: list[ConfigSummary],
434
+ pareto_names: list[str],
435
+ min_score: float = 0.7,
436
+ ) -> tuple[str | None, str]:
437
+ """Generate a recommendation based on Pareto analysis.
438
+
439
+ Args:
440
+ summaries: List of ConfigSummary objects
441
+ pareto_names: Names of Pareto-optimal configs
442
+ min_score: Minimum acceptable score threshold
443
+
444
+ Returns:
445
+ Tuple of (recommended_config_name, recommendation_text)
446
+ """
447
+ def get_val(s: object, key: str) -> float:
448
+ if isinstance(s, dict):
449
+ return float(s.get(key, 0))
450
+ return float(getattr(s, key, 0))
451
+
452
+ def get_name(s: object) -> str:
453
+ if isinstance(s, dict):
454
+ return str(s.get("name", ""))
455
+ return str(getattr(s, "name", ""))
456
+
457
+ # Filter to acceptable configs
458
+ acceptable = [s for s in summaries if get_val(s, "avg_score") >= min_score]
459
+ if not acceptable:
460
+ return None, "No configuration met the minimum score threshold."
461
+
462
+ # Prefer Pareto-optimal configs
463
+ pareto_acceptable = [s for s in acceptable if get_name(s) in pareto_names]
464
+ candidates = pareto_acceptable if pareto_acceptable else acceptable
465
+
466
+ # Pick the one with lowest tokens among candidates
467
+ best = min(candidates, key=lambda s: get_val(s, "avg_tokens"))
468
+ name = get_name(best)
469
+ tokens = get_val(best, "avg_tokens")
470
+ score = get_val(best, "avg_score")
471
+
472
+ return name, f"Recommended: {name} (avg {tokens:.0f} tokens, {score:.2f} score)"
src/flow/experiments/config_export.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Config export/import utilities for optimizer results.
4
+
5
+ Exports winning configurations as YAML files that can be loaded
6
+ and used directly with `flow run --config <path>`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import asdict
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import yaml
16
+
17
+ from .ablation import AblationConfig
18
+
19
+
20
def export_config(
    config: AblationConfig,
    metrics: dict[str, Any],
    path: Path,
) -> None:
    """Write an AblationConfig to disk as a reusable YAML file.

    The YAML contains every config field (directly loadable via
    ``flow run --config <path>``) plus an ``_optimization`` block of
    metadata; loaders skip it because of the leading underscore.

    Args:
        config: The AblationConfig to export
        metrics: Optimization metrics (score, tokens, etc.)
        path: Path to write the YAML file

    Example output:
        name: compaction_head10_tail40
        enable_message_compaction: true
        compaction_head_size: 10
        ...
        _optimization:
          timestamp: "2026-01-26T14:30:22"
          avg_score: 0.89
          avg_tokens: 12400
    """
    payload = asdict(config)
    payload["_optimization"] = metrics
    # Ensure the target directory exists before writing.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(yaml.dump(payload, default_flow_style=False, sort_keys=False))
50
+
51
+
52
def load_config(path: Path) -> AblationConfig:
    """Load an AblationConfig from a YAML file.

    Ignores any keys prefixed with _ (optimization metadata).

    Args:
        path: Path to the YAML config file

    Returns:
        AblationConfig instance

    Raises:
        FileNotFoundError: If the config file doesn't exist
        ValueError: If the config is invalid (not a mapping, or has
            keys AblationConfig does not accept)
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    data = yaml.safe_load(path.read_text())

    # safe_load returns None for an empty file; treat that as an empty
    # mapping so AblationConfig's defaults apply instead of crashing on
    # None.items() below.
    if data is None:
        data = {}
    if not isinstance(data, dict):
        # A YAML scalar or list is not a valid config document.
        raise ValueError(
            f"Invalid config file {path}: expected a mapping, got {type(data).__name__}"
        )

    # Filter out metadata keys (prefixed with _)
    config_data = {k: v for k, v in data.items() if not k.startswith("_")}

    try:
        return AblationConfig(**config_data)
    except TypeError as e:
        # Unknown/extra keys surface as TypeError from the dataclass ctor.
        raise ValueError(f"Invalid config file {path}: {e}") from e
79
+
80
+
81
def export_optimization_configs(
    summaries: list[dict[str, Any]],
    pareto_names: list[str],
    output_dir: Path,
    timestamp: str,
) -> dict[str, Path]:
    """Export all notable configs from an optimization run.

    Exports:
    - best_score.yaml: Highest quality config
    - best_cost.yaml: Lowest token usage config
    - best_efficiency.yaml: Best score/token ratio
    - pareto/<name>.yaml: All Pareto-optimal configs

    Args:
        summaries: List of ConfigSummary dicts with metrics
        pareto_names: Names of Pareto-optimal configs
        output_dir: Directory to write configs
        timestamp: Optimization timestamp for metadata

    Returns:
        Dict mapping config type to file path
    """
    configs_dir = output_dir / "configs"
    configs_dir.mkdir(parents=True, exist_ok=True)

    exported: dict[str, Path] = {}

    # Nothing to export for an empty run.
    if not summaries:
        return exported

    # Pick the winner for each selection criterion.
    winners = {
        "best_score": max(summaries, key=lambda s: s.get("avg_score", 0)),
        "best_cost": min(summaries, key=lambda s: s.get("avg_tokens", float("inf"))),
        "best_efficiency": max(
            summaries,
            key=lambda s: s.get("avg_score", 0) / max(s.get("avg_tokens", 1), 1),
        ),
    }

    for label, summary in winners.items():
        target = configs_dir / f"{label}.yaml"
        export_config(
            _summary_to_config(summary),
            _extract_metrics(summary, timestamp, label),
            target,
        )
        exported[label] = target

    # Export every Pareto-optimal config under configs/pareto/.
    pareto_dir = configs_dir / "pareto"
    pareto_dir.mkdir(exist_ok=True)

    for summary in summaries:
        name = summary.get("name", "unknown")
        if name not in pareto_names:
            continue
        metrics = _extract_metrics(summary, timestamp, "pareto")
        metrics["is_pareto_optimal"] = True
        target = pareto_dir / f"{name}.yaml"
        export_config(_summary_to_config(summary), metrics, target)
        exported[f"pareto/{name}"] = target

    return exported
147
+
148
+
149
def _summary_to_config(summary: dict[str, Any]) -> AblationConfig:
    """Rebuild an AblationConfig from a summary dict."""
    # Known config fields with their fallback defaults.
    defaults = (
        ("name", "unknown"),
        ("enable_message_compaction", True),
        ("enable_memory_tool", True),
        ("enable_sub_agent", False),
        ("compaction_head_size", 10),
        ("compaction_tail_size", 40),
        ("bash_timeout", 120),
    )
    fields = {key: summary.get(key, default) for key, default in defaults}

    # A nested "config" mapping, when present, overrides top-level keys.
    if "config" in summary:
        fields.update(summary["config"])

    return AblationConfig(**fields)
167
+
168
+
169
+ def _extract_metrics(
170
+ summary: dict[str, Any],
171
+ timestamp: str,
172
+ selection_reason: str,
173
+ ) -> dict[str, Any]:
174
+ """Extract optimization metrics from a summary."""
175
+ return {
176
+ "timestamp": timestamp,
177
+ "selection_reason": selection_reason,
178
+ "avg_score": summary.get("avg_score", 0),
179
+ "avg_tokens": summary.get("avg_tokens", 0),
180
+ "avg_duration": summary.get("avg_duration", 0),
181
+ "pass_rate": summary.get("pass_rate", 0),
182
+ "pareto_rank": summary.get("pareto_rank"),
183
+ "is_pareto_optimal": summary.get("is_pareto_optimal", False),
184
+ }
src/flow/experiments/evaluators/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Evaluators for the experiments framework."""
4
+
5
+ from .base import Evaluator
6
+ from .composite import CompositeEvaluator
7
+ from .heuristic import HeuristicEvaluator
8
+ from .llm import LLMEvaluator
9
+ from .trace import TraceEvaluator
10
+
11
+ __all__ = [
12
+ "CompositeEvaluator",
13
+ "Evaluator",
14
+ "HeuristicEvaluator",
15
+ "LLMEvaluator",
16
+ "TraceEvaluator",
17
+ ]
src/flow/experiments/evaluators/base.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Base evaluator protocol for the experiments framework."""
4
+
5
+ from typing import Protocol
6
+
7
+ from ..types import EvalResult, RunResult
8
+
9
+
10
class Evaluator(Protocol):
    """Structural (duck-typed) interface for run-result evaluators.

    Any object exposing an async ``evaluate`` method with this signature
    can be used wherever an Evaluator is expected — no subclassing needed.

    Known implementations:
    - TraceEvaluator: based on trace metrics (tokens, duration, tool calls)
    - LLMEvaluator: uses an LLM to judge output quality
    - HeuristicEvaluator: rule-based evaluation (files created, syntax, etc.)
    - CompositeEvaluator: combines multiple evaluators
    """

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Score a single agent run.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult carrying scores, pass/fail, and reasoning
        """
        ...
src/flow/experiments/evaluators/composite.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Composite evaluator that combines multiple evaluators."""
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from ..types import EvalResult, RunResult
8
+
9
+ if TYPE_CHECKING:
10
+ from .base import Evaluator
11
+
12
+
13
class CompositeEvaluator:
    """Evaluator that aggregates several child evaluators.

    Useful for combining different evaluation strategies:
    - LLM evaluation with trace-based metrics
    - Multiple heuristic checks
    - Weighted combination of evaluators

    Example:
        evaluator = CompositeEvaluator([
            TraceEvaluator(max_tokens=5000),
            HeuristicEvaluator(),
        ], weights=[0.3, 0.7])
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        evaluators: list["Evaluator"],
        weights: list[float] | None = None,
    ) -> None:
        """Initialize the composite evaluator.

        Args:
            evaluators: List of evaluators to combine
            weights: Optional weights for each evaluator (default: equal weights)

        Raises:
            ValueError: If number of weights doesn't match number of evaluators
        """
        self.evaluators = evaluators
        # Default to uniform weights when none were supplied.
        self.weights = weights or [1.0] * len(evaluators)

        if len(self.weights) != len(self.evaluators):
            raise ValueError("Number of weights must match number of evaluators")

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Run every child evaluator and merge their results.

        The overall score is the weighted average of child scores; the
        overall pass/fail requires ALL children to pass.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            Combined EvalResult
        """
        merged_criteria = []
        weighted_sum = 0.0
        weight_total = sum(self.weights)
        everything_passed = True
        reasons = []

        for child, weight in zip(self.evaluators, self.weights, strict=True):
            child_result = await child.evaluate(run_result)
            merged_criteria.extend(child_result.criteria_results)
            weighted_sum += child_result.score * weight
            everything_passed = everything_passed and child_result.passed
            if child_result.reasoning:
                reasons.append(child_result.reasoning)

        return EvalResult(
            score=weighted_sum / weight_total if weight_total > 0 else 0.0,
            passed=everything_passed,
            criteria_results=merged_criteria,
            reasoning=" | ".join(reasons),
        )
src/flow/experiments/evaluators/heuristic.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Heuristic evaluator using rule-based assessment."""
4
+
5
import logging
import re
import subprocess
7
+
8
+ from ..types import CriterionResult, EvalResult, RunResult
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class HeuristicEvaluator:
    """Evaluator that uses heuristic rules to assess agent output.

    This evaluator checks:
    1. Were files created?
    2. Do Python files have valid syntax?
    3. Did the agent report completion?
    4. Does the output match expected patterns based on the task?

    Useful for quick, deterministic evaluation without LLM calls.

    Scoring: up to 0.25 each for files created, completion reported, valid
    Python syntax, and task-relevant output (0.125 generic credit when files
    exist but no pattern matches); 0.25 penalty on execution error; capped
    at 1.0.

    Example:
        evaluator = HeuristicEvaluator(passing_threshold=0.5)
        result = await evaluator.evaluate(run_result)
        print(f"Score: {result.score}, Passed: {result.passed}")
    """

    # Completion phrases matched on word boundaries so that negated forms
    # such as "incomplete" or "unfinished" do NOT count as a completion
    # claim (the previous substring check had that false positive).
    _COMPLETION_RE = re.compile(r"\b(?:task_done|complete\w*|finished)\b")

    def __init__(self, passing_threshold: float = 0.5) -> None:
        """Initialize the heuristic evaluator.

        Args:
            passing_threshold: Minimum score to pass (0.0 to 1.0)
        """
        self.passing_threshold = passing_threshold

    @staticmethod
    def _reports_completion(output_lower: str) -> bool:
        """Return True when the (lowercased) output claims the task completed."""
        return HeuristicEvaluator._COMPLETION_RE.search(output_lower) is not None

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output using heuristic rules.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with heuristic-based scores
        """
        criteria_results = []
        notes = []
        score = 0.0

        # Check if files were created
        if run_result.files_created:
            criteria_results.append(
                CriterionResult(
                    name="files_created",
                    score=1.0,
                    passed=True,
                    reasoning=f"Created {len(run_result.files_created)} file(s)",
                )
            )
            score += 0.25
            notes.append(f"Created {len(run_result.files_created)} file(s)")
        else:
            criteria_results.append(
                CriterionResult(
                    name="files_created",
                    score=0.0,
                    passed=False,
                    reasoning="No files created",
                )
            )
            notes.append("No files created")

        # Check if agent reported task complete. Word-boundary matching is
        # used so e.g. "task is incomplete" does not earn completion credit.
        output_lower = run_result.output.lower()
        if self._reports_completion(output_lower):
            criteria_results.append(
                CriterionResult(
                    name="task_completed",
                    score=1.0,
                    passed=True,
                    reasoning="Agent reported completion",
                )
            )
            score += 0.25
            notes.append("Agent reported completion")
        else:
            criteria_results.append(
                CriterionResult(
                    name="task_completed",
                    score=0.0,
                    passed=False,
                    reasoning="Agent did not report completion",
                )
            )

        # Try to validate Python files (check syntax)
        python_files = [f for f in run_result.files_created if f.endswith(".py")]
        if python_files:
            all_valid = True
            syntax_notes = []
            for py_file in python_files[:5]:  # Check up to 5 files
                file_path = run_result.workspace / py_file
                if file_path.exists():
                    try:
                        result = subprocess.run(  # noqa: ASYNC221, S603
                            ["python3", "-m", "py_compile", str(file_path)],  # noqa: S607
                            capture_output=True,
                            timeout=5,
                        )
                        if result.returncode != 0:
                            all_valid = False
                            syntax_notes.append(f"Syntax error in {py_file}")
                    except subprocess.TimeoutExpired:
                        syntax_notes.append(f"Timeout checking {py_file}")
                    except FileNotFoundError:
                        # python3 not available, skip syntax check
                        pass
                    except Exception as e:
                        all_valid = False
                        syntax_notes.append(f"Error checking {py_file}: {e}")

            if all_valid and not syntax_notes:
                criteria_results.append(
                    CriterionResult(
                        name="code_syntax",
                        score=1.0,
                        passed=True,
                        reasoning="Python files have valid syntax",
                    )
                )
                score += 0.25
                notes.append("Python files have valid syntax")
            elif syntax_notes:
                criteria_results.append(
                    CriterionResult(
                        name="code_syntax",
                        score=0.0,
                        passed=False,
                        reasoning="; ".join(syntax_notes),
                    )
                )
                notes.extend(syntax_notes)

        # Check for expected patterns in output based on task
        task_lower = run_result.task.prompt.lower()
        output_correct = False

        if "hello" in task_lower and "hello" in output_lower:
            output_correct = True
        elif "api" in task_lower and (
            "fastapi" in output_lower or "endpoint" in output_lower or "flask" in output_lower
        ):
            output_correct = True
        elif "http" in task_lower and ("server" in output_lower or "port" in output_lower):
            output_correct = True
        elif "test" in task_lower and ("pytest" in output_lower or "test" in output_lower):
            output_correct = True
        elif run_result.files_created:
            # Generic: if files created, give partial credit
            score += 0.125

        if output_correct:
            criteria_results.append(
                CriterionResult(
                    name="output_relevance",
                    score=1.0,
                    passed=True,
                    reasoning="Output matches expected patterns for task",
                )
            )
            score += 0.25

        # Check for execution errors
        if run_result.error:
            criteria_results.append(
                CriterionResult(
                    name="execution_success",
                    score=0.0,
                    passed=False,
                    reasoning=f"Execution failed: {run_result.error}",
                )
            )
            score = max(0.0, score - 0.25)

        final_score = min(score, 1.0)

        return EvalResult(
            score=final_score,
            passed=final_score >= self.passing_threshold,
            criteria_results=criteria_results,
            reasoning="; ".join(notes) if notes else "Heuristic evaluation complete",
        )
193
+ )
src/flow/experiments/evaluators/llm.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """LLM-as-judge evaluator for quality assessment."""
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from ..metrics import extract_metrics
10
+ from ..types import CriterionResult, EvalResult, RunResult
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class LLMEvaluator:
    """Evaluator that uses an LLM to assess agent output against criteria.

    This implements the LLM-as-a-judge pattern, where a language model
    evaluates whether the agent's output meets specified criteria.

    Note: Requires a separate model client - not tied to FlowConfig.
    This allows using a different model for evaluation than for agent execution.

    Example:
        from openai import AsyncOpenAI

        client = AsyncOpenAI()
        evaluator = LLMEvaluator(
            model_client=client,
            model_name="gpt-4o",
            passing_threshold=0.7,
        )
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        model_client: Any,
        model_name: str = "gpt-4o",
        passing_threshold: float = 0.7,
    ) -> None:
        """Initialize the LLM evaluator.

        Args:
            model_client: An async client with chat.completions.create method
                (e.g., AsyncOpenAI, AsyncAzureOpenAI)
            model_name: Model name/deployment to use for evaluation
            passing_threshold: Minimum score to pass (0.0 to 1.0)
        """
        self.model_client = model_client
        self.model_name = model_name
        self.passing_threshold = passing_threshold

    def _get_evaluation_prompt(self, run_result: RunResult) -> str:
        """Build the evaluation prompt for the LLM."""
        # One bullet per task criterion; weight is surfaced so the judge can
        # reflect relative importance in its reasoning.
        criteria_text = "\n".join(
            f"- **{c.name}** (weight: {c.weight}): {c.instruction}"
            for c in run_result.task.criteria
        )

        # Extract execution trace summary for research/multi-step tasks
        trace_summary = self._get_trace_summary(run_result)

        # NOTE: output is truncated to 8000 chars to bound prompt size.
        return f"""You are an expert evaluator assessing an AI agent's output.

## Task
The agent was given this task:
```
{run_result.task.prompt}
```

## Agent Output
```
{run_result.output[:8000]}
```

## Files Created
{json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}

## Execution Trace
{trace_summary}

## Execution Status
{"Success" if run_result.success else f"Failed: {run_result.error}"}

## Evaluation Criteria
{criteria_text}

## Instructions
Evaluate the agent's output against each criterion. Consider both the final output AND the execution
trace (tools used, steps taken) when assessing correctness.

For each criterion:
1. Assess how well the output meets the criterion (0.0 to 1.0)
2. Determine if it passes (score >= 0.7)
3. Provide brief reasoning

Respond in this exact JSON format:
```json
{{
  "criteria_results": [
    {{
      "name": "criterion_name",
      "score": 0.85,
      "passed": true,
      "reasoning": "Brief explanation"
    }}
  ],
  "overall_reasoning": "Summary of the overall evaluation"
}}
```
"""

    def _get_trace_summary(self, run_result: RunResult) -> str:
        """Extract a summary of the execution trace for evaluation."""
        if not run_result.trace:
            return "No trace data available"

        metrics = extract_metrics(run_result.trace)

        # Build tool usage summary
        tool_summary = ""
        if metrics.tool_calls_by_name:
            tool_lines = [f"  - {name}: {count}x" for name, count in metrics.tool_calls_by_name.items()]
            tool_summary = "Tools used:\n" + "\n".join(tool_lines)
        else:
            tool_summary = "Tools used: None"

        return f"""Duration: {run_result.duration_seconds:.1f}s
LLM calls: {metrics.llm_call_count}
Total tool calls: {metrics.tool_call_count}
{tool_summary}
Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {metrics.output_tokens})"""

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output using an LLM.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with LLM-generated scores and reasoning
        """
        if not run_result.task.criteria:
            # No criteria to evaluate - return a default pass
            return EvalResult(
                score=1.0 if run_result.success else 0.0,
                passed=run_result.success,
                criteria_results=[],
                reasoning=(
                    "No evaluation criteria specified"
                    + ("" if run_result.success else f"; Error: {run_result.error}")
                ),
            )

        prompt = self._get_evaluation_prompt(run_result)

        try:
            response = await self.model_client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert evaluator. Respond only with valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # Low temperature for consistent evaluation
            )

            # Extract the response text
            response_text = response.choices[0].message.content or ""

            # Parse JSON from response: take the outermost {...} span so that
            # markdown code fences around the JSON are tolerated.
            json_start = response_text.find("{")
            json_end = response_text.rfind("}") + 1
            if json_start >= 0 and json_end > json_start:
                eval_data = json.loads(response_text[json_start:json_end])
            else:
                raise ValueError("No JSON found in response")

            # Build criterion results
            criteria_results = []
            total_weighted_score = 0.0
            total_weight = 0.0

            for cr_data in eval_data.get("criteria_results", []):
                cr = CriterionResult(
                    name=cr_data.get("name", "unknown"),
                    score=float(cr_data.get("score", 0.0)),
                    passed=bool(cr_data.get("passed", False)),
                    reasoning=cr_data.get("reasoning", ""),
                )
                criteria_results.append(cr)

                # Find the weight for this criterion; defaults to 1.0 when the
                # judge returned a name that doesn't match any task criterion.
                weight = 1.0
                for task_criterion in run_result.task.criteria:
                    if task_criterion.name == cr.name:
                        weight = task_criterion.weight
                        break

                total_weighted_score += cr.score * weight
                total_weight += weight

            # Calculate overall score (weighted average over returned criteria)
            overall_score = total_weighted_score / total_weight if total_weight > 0 else 0.0

            return EvalResult(
                score=overall_score,
                passed=overall_score >= self.passing_threshold,
                criteria_results=criteria_results,
                reasoning=eval_data.get("overall_reasoning", ""),
            )

        except Exception as e:
            # Any failure (API error, malformed JSON, bad types) degrades to a
            # zero-score fail rather than propagating to the runner.
            logger.error(f"LLM evaluation failed: {e}")
            return EvalResult(
                score=0.0,
                passed=False,
                criteria_results=[],
                reasoning=f"Evaluation failed: {e}",
            )
src/flow/experiments/evaluators/trace.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Trace-based evaluator for objective metrics assessment."""
4
+
5
+ from ..metrics import extract_metrics
6
+ from ..types import CriterionResult, EvalResult, RunResult
7
+
8
+
9
class TraceEvaluator:
    """Evaluator that assesses agent output based on trace metrics.

    Checks objective measurements from the execution trace — token usage,
    tool calls, and timing — against optional limits. Only the limits that
    were supplied are evaluated.

    Example:
        evaluator = TraceEvaluator(
            max_tokens=5000,
            max_tool_calls=20,
            max_duration_seconds=60.0,
        )
        result = await evaluator.evaluate(run_result)
    """

    def __init__(
        self,
        max_tokens: int | None = None,
        max_tool_calls: int | None = None,
        max_duration_seconds: float | None = None,
    ) -> None:
        """Initialize the trace evaluator.

        Args:
            max_tokens: Maximum allowed total tokens (None = no limit)
            max_tool_calls: Maximum allowed tool calls (None = no limit)
            max_duration_seconds: Maximum allowed duration (None = no limit)
        """
        self.max_tokens = max_tokens
        self.max_tool_calls = max_tool_calls
        self.max_duration_seconds = max_duration_seconds

    @staticmethod
    def _limit_score(actual: float, limit: float) -> tuple[bool, float]:
        """Return (within_limit, score); score decays linearly with overage."""
        if actual <= limit:
            return True, 1.0
        return False, max(0.0, 1.0 - ((actual - limit) / limit))

    async def evaluate(self, run_result: RunResult) -> EvalResult:
        """Evaluate the agent's output based on trace metrics.

        Args:
            run_result: The result from running an agent on a task

        Returns:
            EvalResult with metric-based scores
        """
        metrics = extract_metrics(run_result.trace)
        checks: list[CriterionResult] = []
        all_passed = True

        # Token budget.
        if self.max_tokens is not None:
            ok, limit_score = self._limit_score(metrics.total_tokens, self.max_tokens)
            all_passed = all_passed and ok
            checks.append(
                CriterionResult(
                    name="token_limit",
                    score=limit_score,
                    passed=ok,
                    reasoning=f"Used {metrics.total_tokens} tokens (limit: {self.max_tokens})",
                )
            )

        # Tool-call budget.
        if self.max_tool_calls is not None:
            ok, limit_score = self._limit_score(metrics.tool_call_count, self.max_tool_calls)
            all_passed = all_passed and ok
            checks.append(
                CriterionResult(
                    name="tool_call_limit",
                    score=limit_score,
                    passed=ok,
                    reasoning=f"Made {metrics.tool_call_count} tool calls (limit: {self.max_tool_calls})",
                )
            )

        # Wall-clock budget.
        if self.max_duration_seconds is not None:
            ok, limit_score = self._limit_score(
                run_result.duration_seconds, self.max_duration_seconds
            )
            all_passed = all_passed and ok
            checks.append(
                CriterionResult(
                    name="duration_limit",
                    score=limit_score,
                    passed=ok,
                    reasoning=f"Took {run_result.duration_seconds:.2f}s (limit: {self.max_duration_seconds}s)",
                )
            )

        # Hard failure reported by the run itself.
        if run_result.error:
            all_passed = False
            checks.append(
                CriterionResult(
                    name="execution_success",
                    score=0.0,
                    passed=False,
                    reasoning=f"Execution failed: {run_result.error}",
                )
            )

        # Errors observed inside the trace: each costs 0.2 on this criterion
        # but does not by itself fail the overall evaluation.
        if metrics.error_count > 0:
            checks.append(
                CriterionResult(
                    name="trace_errors",
                    score=max(0.0, 1.0 - (metrics.error_count * 0.2)),
                    passed=metrics.error_count == 0,
                    reasoning=f"Found {metrics.error_count} error(s) in trace",
                )
            )

        # Overall score: mean of criterion scores; with no criteria at all,
        # fall back to plain success/failure.
        if checks:
            overall = sum(c.score for c in checks) / len(checks)
        else:
            overall = 1.0 if run_result.success else 0.0

        return EvalResult(
            score=overall,
            passed=all_passed and run_result.success,
            criteria_results=checks,
            reasoning=f"Trace evaluation: {len(checks)} criteria checked",
        )
src/flow/experiments/metrics.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Metrics extraction utilities for the experiments framework."""
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class LLMCallInfo:
    """Information about a single LLM call extracted from one trace span."""

    model: str = "unknown"      # gen_ai.request.model attribute, "unknown" when absent
    input_tokens: int = 0       # prompt tokens reported for this call
    output_tokens: int = 0      # completion tokens reported for this call
    finish_reason: str = ""     # stringified gen_ai.response.finish_reasons attribute
    duration_ms: float = 0.0    # wall-clock duration of the span in milliseconds
18
+
19
+
20
@dataclass
class ToolCallInfo:
    """Information about a single tool call extracted from one trace span."""

    name: str = "unknown"       # tool name from gen_ai.tool.name (or a heuristic fallback)
    duration_ms: float = 0.0    # wall-clock duration of the span in milliseconds
    call_id: str = ""           # gen_ai.tool.call.id attribute; empty for fallback-detected calls
27
+
28
+
29
@dataclass
class TraceMetrics:
    """Objective metrics extracted from execution traces.

    These are factual measurements from the trace, not subjective assessments.

    Attributes:
        total_tokens: Total tokens used (input + output)
        input_tokens: Input/prompt tokens used
        output_tokens: Output/completion tokens used
        tool_call_count: Number of tool calls made
        tool_calls_by_name: Count of calls per tool name
        llm_call_count: Number of LLM API calls
        total_duration_ms: Total execution time in milliseconds
        llm_duration_ms: Time spent in LLM calls
        tool_duration_ms: Time spent in tool calls
        span_count: Total number of trace spans
        error_count: Number of error spans
        llm_calls: Detailed info for each LLM call
        tool_calls: Detailed info for each tool call
    """

    # Token usage, summed over all LLM spans.
    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    # Tool usage.
    tool_call_count: int = 0
    tool_calls_by_name: dict[str, int] = field(default_factory=dict)
    # LLM call count and timing aggregates.
    llm_call_count: int = 0
    total_duration_ms: float = 0.0
    llm_duration_ms: float = 0.0
    tool_duration_ms: float = 0.0
    # Span-level statistics.
    span_count: int = 0
    error_count: int = 0
    # Per-call detail records (one entry per counted call).
    llm_calls: list[LLMCallInfo] = field(default_factory=list)
    tool_calls: list[ToolCallInfo] = field(default_factory=list)
64
+
65
+
66
def extract_metrics(trace: list[dict[str, Any]]) -> TraceMetrics:
    """Extract objective metrics from a trace.

    Parses OpenTelemetry semantic conventions for GenAI:
    - gen_ai.operation.name == "chat" for LLM calls
    - gen_ai.usage.input_tokens / output_tokens for token counts
    - gen_ai.operation.name == "execute_tool" for tool calls
    - gen_ai.tool.name for tool identification

    Spans with no gen_ai.operation.name are handled by heuristic fallbacks
    (operation-name patterns for tool calls, alternate attribute keys for
    token counts) so traces from non-conforming instrumentation still yield
    counts. The invariants ``llm_call_count == len(llm_calls)`` and
    ``tool_call_count == len(tool_calls)`` hold on every path.

    Args:
        trace: List of trace span dictionaries

    Returns:
        TraceMetrics with extracted values
    """
    metrics = TraceMetrics()
    metrics.span_count = len(trace)

    for span in trace:
        data = span.get("data", {})
        attributes = data.get("attributes", {})
        operation_name = data.get("operation_name", "")
        # "or 0" guards against an explicit null duration in the span data.
        duration_ms = data.get("duration_ms", 0) or 0

        # Check for errors (status may not be a plain string, so stringify).
        status = data.get("status", "")
        if "ERROR" in str(status).upper():
            metrics.error_count += 1

        # Check for LLM operations (gen_ai.operation.name = "chat")
        if attributes.get("gen_ai.operation.name") == "chat":
            input_tokens = attributes.get("gen_ai.usage.input_tokens", 0) or 0
            output_tokens = attributes.get("gen_ai.usage.output_tokens", 0) or 0

            metrics.llm_call_count += 1
            metrics.input_tokens += int(input_tokens)
            metrics.output_tokens += int(output_tokens)
            metrics.llm_duration_ms += duration_ms

            metrics.llm_calls.append(LLMCallInfo(
                model=attributes.get("gen_ai.request.model", "unknown"),
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens),
                finish_reason=str(attributes.get("gen_ai.response.finish_reasons", "")),
                duration_ms=duration_ms,
            ))

        # Check for tool executions
        elif attributes.get("gen_ai.operation.name") == "execute_tool":
            tool_name = attributes.get("gen_ai.tool.name", operation_name)

            metrics.tool_call_count += 1
            metrics.tool_duration_ms += duration_ms
            metrics.tool_calls_by_name[tool_name] = metrics.tool_calls_by_name.get(tool_name, 0) + 1

            metrics.tool_calls.append(ToolCallInfo(
                name=tool_name,
                duration_ms=duration_ms,
                call_id=attributes.get("gen_ai.tool.call.id", ""),
            ))

        # Also check for generic tool patterns (fallback for spans that do
        # not follow the GenAI semantic conventions)
        elif not attributes.get("gen_ai.operation.name"):
            is_tool_call = (
                "tool" in operation_name.lower()
                or attributes.get("tool.name")
                or attributes.get("gen_ai.tool.name")
                or "function_call" in operation_name.lower()
            )

            if is_tool_call:
                tool_name = (
                    attributes.get("tool.name")
                    or attributes.get("gen_ai.tool.name")
                    or _extract_tool_name_from_operation(operation_name)
                    or "unknown"
                )
                metrics.tool_call_count += 1
                metrics.tool_duration_ms += duration_ms
                metrics.tool_calls_by_name[tool_name] = metrics.tool_calls_by_name.get(tool_name, 0) + 1

                metrics.tool_calls.append(ToolCallInfo(
                    name=tool_name,
                    duration_ms=duration_ms,
                    call_id="",
                ))

            # Check for token counts in non-chat spans (fallback)
            input_tokens = (
                attributes.get("gen_ai.usage.input_tokens")
                or attributes.get("llm.token_count.prompt")
                or attributes.get("input_tokens")
            )
            output_tokens = (
                attributes.get("gen_ai.usage.output_tokens")
                or attributes.get("llm.token_count.completion")
                or attributes.get("output_tokens")
            )

            if input_tokens or output_tokens:
                metrics.input_tokens += int(input_tokens or 0)
                metrics.output_tokens += int(output_tokens or 0)
                metrics.llm_call_count += 1
                metrics.llm_duration_ms += duration_ms
                # Fix: also record the per-call detail so llm_calls stays in
                # sync with llm_call_count (this fallback path previously
                # bumped the counter without appending an entry).
                metrics.llm_calls.append(LLMCallInfo(
                    model=attributes.get("gen_ai.request.model", "unknown"),
                    input_tokens=int(input_tokens or 0),
                    output_tokens=int(output_tokens or 0),
                    finish_reason="",
                    duration_ms=duration_ms,
                ))

        # Track total duration from root span (a span with no parent).
        if not data.get("parent_span_id"):
            metrics.total_duration_ms = max(metrics.total_duration_ms, duration_ms)

    # Calculate total tokens
    metrics.total_tokens = metrics.input_tokens + metrics.output_tokens

    return metrics
179
+
180
+
181
+ def _extract_tool_name_from_operation(operation_name: str) -> str | None:
182
+ """Try to extract a tool name from an operation name.
183
+
184
+ Args:
185
+ operation_name: The span operation name
186
+
187
+ Returns:
188
+ Extracted tool name or None
189
+ """
190
+ # Common patterns: "tool:read_file", "execute_tool:write_file", "function_call:search"
191
+ for prefix in ["tool:", "execute_tool:", "function_call:", "call_"]:
192
+ if operation_name.lower().startswith(prefix):
193
+ return operation_name[len(prefix):]
194
+
195
+ return None
196
+
197
+
198
def format_metrics_summary(metrics: TraceMetrics) -> str:
    """Format metrics as a human-readable summary.

    Args:
        metrics: TraceMetrics to format

    Returns:
        Formatted string summary
    """
    out = ["=== Trace Metrics ==="]
    out.append(
        f"Tokens: {metrics.total_tokens} total "
        f"({metrics.input_tokens} input, {metrics.output_tokens} output)"
    )
    out.append(f"LLM Calls: {metrics.llm_call_count} ({metrics.llm_duration_ms:.1f}ms)")
    out.append(f"Tool Calls: {metrics.tool_call_count} ({metrics.tool_duration_ms:.1f}ms)")

    # Per-tool breakdown, sorted for deterministic output.
    if metrics.tool_calls_by_name:
        out.append(" Tool breakdown:")
        out.extend(
            f" - {name}: {count}"
            for name, count in sorted(metrics.tool_calls_by_name.items())
        )

    out.append(f"Duration: {metrics.total_duration_ms:.2f}ms")
    out.append(f"Spans: {metrics.span_count}")
    out.append(f"Errors: {metrics.error_count}")

    return "\n".join(out)
226
+
227
+
228
def metrics_to_dict(metrics: TraceMetrics) -> dict[str, Any]:
    """Convert TraceMetrics to a JSON-serializable dictionary.

    Uses dataclasses.asdict, which recursively converts the nested
    LLMCallInfo/ToolCallInfo entries and emits keys in field-declaration
    order — exactly the structure the previous hand-written mapping built.
    Unlike the hand-written version, this stays in sync automatically when
    a field is added to or removed from TraceMetrics.

    Args:
        metrics: TraceMetrics to convert

    Returns:
        Dictionary representation
    """
    # Local import keeps the module's top-level imports untouched.
    from dataclasses import asdict

    return asdict(metrics)
src/flow/experiments/optimizer.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Optimizer service for finding best agent configurations.
4
+
5
+ Runs experiments in parallel, evaluates with LLM-as-Judge,
6
+ ranks via Pareto analysis, and exports reusable configs.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import json
13
+ import logging
14
+ import os
15
+ from collections.abc import Callable
16
+ from dataclasses import asdict, dataclass, field
17
+ from datetime import datetime
18
+ from itertools import product
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from openai import AsyncAzureOpenAI
23
+
24
+ from .ablation import (
25
+ AblationConfig,
26
+ compute_pareto_frontier,
27
+ create_harness_from_config,
28
+ )
29
+ from .config_export import export_optimization_configs
30
+ from .evaluators import LLMEvaluator
31
+ from .metrics import TraceMetrics, extract_metrics
32
+ from .runner import FlowExperimentRunner, setup_tracing
33
+ from .types import EvalCriterion, RunResult, Task
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
@dataclass
class TaskResult:
    """Result for a single config-task pair."""

    config_name: str       # name of the AblationConfig that was run
    task_name: str         # name of the Task it was run on
    run_result: RunResult  # raw harness run output (trace, timing, success flag)
    metrics: TraceMetrics  # objective metrics extracted from the run's trace
    eval_score: float      # evaluator score (0.0-1.0)
    eval_passed: bool      # whether the evaluator judged the run as passing
    eval_reasoning: str    # evaluator's textual justification for the verdict
49
+
50
+
51
@dataclass
class ConfigSummary:
    """Aggregated summary for a configuration across all tasks."""

    name: str
    config: AblationConfig
    task_results: list[TaskResult] = field(default_factory=list)

    # Aggregated metrics
    avg_score: float = 0.0
    avg_tokens: float = 0.0
    avg_duration: float = 0.0
    pass_rate: float = 0.0
    total_tokens: int = 0
    task_count: int = 0

    # Pareto analysis
    pareto_rank: int | None = None
    is_pareto_optimal: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Serialize the summary (without per-task results) to plain data.

        Key order is stable so downstream JSON output stays deterministic.
        """
        serialized: dict[str, Any] = {
            "name": self.name,
            "config": asdict(self.config),
        }
        for attr in (
            "avg_score",
            "avg_tokens",
            "avg_duration",
            "pass_rate",
            "total_tokens",
            "task_count",
            "pareto_rank",
            "is_pareto_optimal",
        ):
            serialized[attr] = getattr(self, attr)
        return serialized
85
+
86
+
87
@dataclass
class OptimizationResult:
    """Complete results from an optimization run."""

    timestamp: str
    output_dir: Path
    summaries: list[ConfigSummary]
    pareto_frontier: list[str]
    exported_configs: dict[str, Path]

    # Rankings
    rank_by_score: list[str] = field(default_factory=list)
    rank_by_tokens: list[str] = field(default_factory=list)
    rank_by_efficiency: list[str] = field(default_factory=list)

    # Stats
    total_experiments: int = 0
    total_duration_seconds: float = 0.0

    def get_best_config(self, criterion: str = "score") -> ConfigSummary | None:
        """Return the top-ranked config summary for the given criterion.

        Args:
            criterion: One of "score", "tokens", or "efficiency".

        Returns:
            The ConfigSummary ranked first under that criterion, or None
            when the criterion is unknown or no rankings exist.
        """
        rankings = {
            "score": self.rank_by_score,
            "tokens": self.rank_by_tokens,
            "efficiency": self.rank_by_efficiency,
        }
        ordered = rankings.get(criterion)
        if not ordered:
            return None

        winner = ordered[0]
        return next((s for s in self.summaries if s.name == winner), None)
124
+
125
+
126
class FlowOptimizer:
    """Optimizer for finding best agent configurations.

    Runs experiments in parallel, evaluates results, performs
    Pareto analysis, and exports winning configs.

    Example:
        optimizer = FlowOptimizer(parallel=4)
        configs = [
            AblationConfig(name="baseline", enable_message_compaction=False),
            AblationConfig(name="compaction", enable_message_compaction=True),
        ]
        tasks = [Task(name="test", prompt="Create hello world")]
        result = await optimizer.optimize(configs, tasks)
        print(f"Best: {result.rank_by_score[0]}")
    """

    def __init__(
        self,
        parallel: int = 4,
        use_llm_evaluator: bool = True,
        output_dir: Path | None = None,
    ) -> None:
        """Initialize the optimizer.

        Args:
            parallel: Max concurrent experiments
            use_llm_evaluator: Whether to use LLM for evaluation (falls back
                to a success/failure heuristic when Azure credentials are
                missing — see _create_evaluator)
            output_dir: Base directory for results (defaults to
                ~/.flow/optimizations)
        """
        self.parallel = parallel
        self.use_llm_evaluator = use_llm_evaluator
        self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"

    async def optimize(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        progress_callback: Callable[[int, int, str, str], None] | None = None,
    ) -> OptimizationResult:
        """Run optimization across all configs and tasks.

        Pipeline: run every config x task pair in parallel, aggregate
        per-config summaries, compute the Pareto frontier and rankings,
        export the winning configs, and persist everything under a
        timestamped directory.

        Args:
            configs: Configurations to test
            tasks: Tasks to run each config on
            progress_callback: Optional callback(completed, total, config, task)

        Returns:
            OptimizationResult with rankings and exported configs
        """
        start_time = datetime.now()
        timestamp = start_time.strftime("%Y%m%d_%H%M%S")
        # Each optimization run gets its own timestamped output directory.
        run_dir = self.output_dir / timestamp
        run_dir.mkdir(parents=True, exist_ok=True)

        # Setup tracing and persist the inputs before running anything.
        setup_tracing("flow-optimizer")
        self._save_config(configs, tasks, run_dir)

        print("=" * 70)
        print(" FLOW OPTIMIZER")
        print("=" * 70)
        print(f" Configs: {len(configs)}")
        print(f" Tasks: {len(tasks)}")
        print(f" Total: {len(configs) * len(tasks)} experiments")
        print(f" Parallel: {self.parallel}")
        print(f" Output: {run_dir}")
        print("=" * 70)

        # Create LLM evaluator if needed (may still be None when
        # credentials are missing).
        evaluator = None
        if self.use_llm_evaluator:
            evaluator = self._create_evaluator()

        # Run all experiments in parallel
        task_results = await self._run_parallel(
            configs, tasks, run_dir, evaluator, progress_callback
        )

        # Aggregate by config
        summaries = self._aggregate_results(task_results, configs)

        # Pareto analysis (also marks is_pareto_optimal on each summary)
        pareto_names = self._compute_pareto(summaries)

        # Compute rankings: higher score is better, fewer tokens is better,
        # efficiency is score per token (max(...,1) avoids division by zero).
        rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
        rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
        rank_by_efficiency = sorted(
            summaries,
            key=lambda s: s.avg_score / max(s.avg_tokens, 1),
            reverse=True,
        )

        # Export reusable config files for the winners
        summary_dicts = [s.to_dict() for s in summaries]
        exported = export_optimization_configs(
            summary_dicts, pareto_names, run_dir, timestamp
        )

        end_time = datetime.now()

        result = OptimizationResult(
            timestamp=timestamp,
            output_dir=run_dir,
            summaries=summaries,
            pareto_frontier=pareto_names,
            exported_configs=exported,
            rank_by_score=[s.name for s in rank_by_score],
            rank_by_tokens=[s.name for s in rank_by_tokens],
            rank_by_efficiency=[s.name for s in rank_by_efficiency],
            total_experiments=len(task_results),
            total_duration_seconds=(end_time - start_time).total_seconds(),
        )

        # Save results
        self._save_results(result, run_dir)

        # Print summary
        self._print_summary(result)

        return result

    async def _run_parallel(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        run_dir: Path,
        evaluator: LLMEvaluator | None,
        progress_callback: Callable[[int, int, str, str], None] | None,
    ) -> list[TaskResult]:
        """Run all config-task pairs in parallel with semaphore control."""
        # Semaphore bounds concurrency at self.parallel simultaneous runs.
        semaphore = asyncio.Semaphore(self.parallel)
        total = len(configs) * len(tasks)
        completed = 0
        # Lock serializes updates to the shared `completed` counter and
        # keeps progress output from interleaving.
        lock = asyncio.Lock()

        async def run_one(config: AblationConfig, task: Task) -> TaskResult:
            nonlocal completed
            async with semaphore:
                # Each experiment gets an isolated workspace directory.
                workspace = run_dir / "workspaces" / config.name / task.name
                workspace.mkdir(parents=True, exist_ok=True)

                result = await self._run_single(config, task, workspace, evaluator)

                async with lock:
                    completed += 1
                    status = "✓" if result.eval_passed else "✗"
                    print(
                        f" [{completed}/{total}] {config.name}/{task.name}: "
                        f"{status} score={result.eval_score:.2f} "
                        f"tokens={result.metrics.total_tokens:,}"
                    )
                    if progress_callback:
                        progress_callback(completed, total, config.name, task.name)

                return result

        # Create all tasks
        coroutines = [run_one(config, task) for config in configs for task in tasks]

        # return_exceptions=True so one failed experiment does not cancel
        # the rest of the batch.
        gather_results = await asyncio.gather(*coroutines, return_exceptions=True)

        # Filter out exceptions; failures are logged and dropped.
        valid_results: list[TaskResult] = []
        for r in gather_results:
            if isinstance(r, BaseException):
                logger.error(f"Experiment failed: {r}")
            else:
                valid_results.append(r)

        return valid_results

    async def _run_single(
        self,
        config: AblationConfig,
        task: Task,
        workspace: Path,
        evaluator: LLMEvaluator | None,
    ) -> TaskResult:
        """Run a single config-task experiment.

        Builds a harness for the config, runs the task, extracts trace
        metrics, and scores the run — with the LLM judge when available,
        otherwise a plain success/failure heuristic. The harness is always
        closed, even when the run or evaluation raises.
        """
        harness = create_harness_from_config(config, workspace)

        try:
            runner = FlowExperimentRunner(keep_workspace=True)
            run_result = await runner.run(harness, task, workspace=workspace)
            metrics = extract_metrics(run_result.trace)

            # Evaluate
            if evaluator:
                eval_result = await evaluator.evaluate(run_result)
                eval_score = eval_result.score
                eval_passed = eval_result.passed
                eval_reasoning = eval_result.reasoning
            else:
                # Simple heuristic: passed if no error
                eval_score = 1.0 if run_result.success else 0.0
                eval_passed = run_result.success
                eval_reasoning = "Success" if run_result.success else run_result.error or "Failed"

            return TaskResult(
                config_name=config.name,
                task_name=task.name,
                run_result=run_result,
                metrics=metrics,
                eval_score=eval_score,
                eval_passed=eval_passed,
                eval_reasoning=eval_reasoning,
            )
        finally:
            await harness.close()

    def _aggregate_results(
        self,
        task_results: list[TaskResult],
        configs: list[AblationConfig],
    ) -> list[ConfigSummary]:
        """Aggregate task results into config summaries.

        Configs with no surviving results (e.g. every run raised) are
        skipped rather than reported with zeroed averages.
        """
        config_map = {c.name: c for c in configs}
        results_by_config: dict[str, list[TaskResult]] = {c.name: [] for c in configs}

        for result in task_results:
            if result.config_name in results_by_config:
                results_by_config[result.config_name].append(result)

        summaries = []
        for name, results in results_by_config.items():
            if not results:
                continue

            config = config_map[name]
            summary = ConfigSummary(
                name=name,
                config=config,
                task_results=results,
                avg_score=sum(r.eval_score for r in results) / len(results),
                avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
                avg_duration=sum(r.run_result.duration_seconds for r in results) / len(results),
                pass_rate=sum(1 for r in results if r.eval_passed) / len(results),
                total_tokens=sum(r.metrics.total_tokens for r in results),
                task_count=len(results),
            )
            summaries.append(summary)

        return summaries

    def _compute_pareto(self, summaries: list[ConfigSummary]) -> list[str]:
        """Compute Pareto frontier (maximize score, minimize tokens)."""
        # Use shared utility
        pareto_names = compute_pareto_frontier(summaries)

        # Mark summaries with Pareto status
        for summary in summaries:
            if summary.name in pareto_names:
                summary.is_pareto_optimal = True
                summary.pareto_rank = 0
            else:
                summary.is_pareto_optimal = False
                summary.pareto_rank = 1  # Simplified: all non-Pareto get rank 1

        return pareto_names

    def _create_evaluator(self) -> LLMEvaluator | None:
        """Create LLM evaluator if credentials available.

        Reads AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT /
        AZURE_OPENAI_DEPLOYMENT from the environment; returns None (which
        triggers heuristic evaluation downstream) when key or endpoint is
        missing.
        """
        api_key = os.environ.get("AZURE_OPENAI_API_KEY")
        endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
        deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")

        if not api_key or not endpoint:
            logger.warning("No Azure OpenAI credentials, using heuristic evaluation")
            return None

        client = AsyncAzureOpenAI(
            api_key=api_key,
            api_version="2024-02-15-preview",
            azure_endpoint=endpoint,
        )

        return LLMEvaluator(
            model_client=client,
            model_name=deployment,
            passing_threshold=0.7,
        )

    def _save_config(
        self,
        configs: list[AblationConfig],
        tasks: list[Task],
        run_dir: Path,
    ) -> None:
        """Save optimization config (inputs) to optimization_config.json."""
        with open(run_dir / "optimization_config.json", "w") as f:
            json.dump(
                {
                    "configs": [asdict(c) for c in configs],
                    "tasks": [{"name": t.name, "prompt": t.prompt} for t in tasks],
                    "parallel": self.parallel,
                    "use_llm_evaluator": self.use_llm_evaluator,
                },
                f,
                indent=2,
            )

    def _save_results(self, result: OptimizationResult, run_dir: Path) -> None:
        """Save optimization results to summary.json."""
        summary_data = {
            "timestamp": result.timestamp,
            "total_experiments": result.total_experiments,
            "total_duration_seconds": result.total_duration_seconds,
            "pareto_frontier": result.pareto_frontier,
            "rank_by_score": result.rank_by_score,
            "rank_by_tokens": result.rank_by_tokens,
            "rank_by_efficiency": result.rank_by_efficiency,
            "exported_configs": {k: str(v) for k, v in result.exported_configs.items()},
            "summaries": [s.to_dict() for s in result.summaries],
        }

        with open(run_dir / "summary.json", "w") as f:
            json.dump(summary_data, f, indent=2)

    def _print_summary(self, result: OptimizationResult) -> None:
        """Print optimization summary to the console."""
        print("\n" + "=" * 70)
        print(" OPTIMIZATION RESULTS")
        print("=" * 70)

        # Rankings table, best score first
        print(f"\n{'Config':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
        print("-" * 65)

        for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
            pareto = "★" if summary.is_pareto_optimal else ""
            print(
                f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
                f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
            )

        print("\n" + "-" * 70)
        print(f"Pareto frontier: {result.pareto_frontier}")
        print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
        print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
        print("\nExported configs:")
        for name, path in result.exported_configs.items():
            print(f" {name}: {path}")
        print(f"\nResults saved to: {result.output_dir}")
472
+
473
+
474
def generate_grid_configs(
    base_name: str,
    variations: dict[str, list[Any]],
) -> list[AblationConfig]:
    """Generate one AblationConfig per combination in a parameter grid.

    Args:
        base_name: Base name for generated configs
        variations: Dict of param_name -> list of values

    Returns:
        List of AblationConfig for each combination

    Example:
        configs = generate_grid_configs("grid", {
            "enable_message_compaction": [True, False],
            "compaction_head_size": [5, 10, 20],
        })
    """
    # An empty grid degenerates to a single default config.
    if not variations:
        return [AblationConfig(name=base_name)]

    keys = list(variations)
    grid: list[AblationConfig] = []
    for combo in product(*(variations[key] for key in keys)):
        params = dict(zip(keys, combo, strict=True))
        suffix = "_".join(f"{key}={value}" for key, value in params.items())
        grid.append(AblationConfig(name=f"{base_name}_{suffix}", **params))

    return grid
506
+
507
+
508
def load_tasks_from_jsonl(path: Path) -> list[Task]:
    """Load tasks from a JSONL file.

    Each line should be a JSON object with:
    - name: Task name
    - prompt: Task prompt
    - criteria: Optional list of evaluation criteria
    - category: Optional category string

    Blank lines are skipped. Criterion entries that are dicts become
    EvalCriterion kwargs; anything else is wrapped as a "default" criterion
    with the stringified value as its instruction.

    Args:
        path: Path to JSONL file

    Returns:
        List of Task objects
    """
    loaded: list[Task] = []
    with open(path) as source:
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                continue

            record = json.loads(stripped)
            criteria = [
                EvalCriterion(**entry)
                if isinstance(entry, dict)
                else EvalCriterion(name="default", instruction=str(entry))
                for entry in record.get("criteria", [])
            ]

            loaded.append(
                Task(
                    name=record["name"],
                    prompt=record["prompt"],
                    criteria=criteria,
                    metadata={"category": record.get("category", "default")},
                )
            )

    return loaded
src/flow/experiments/reporters/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Reporters for experiment results."""
4
+
5
+ from .console_reporter import print_comparison_table, print_eval_result, print_metrics_summary
6
+ from .json_reporter import load_run_result_summary, save_comparison, save_run_result
7
+
8
+ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
9
+ # JSON reporter
10
+ "save_run_result",
11
+ "load_run_result_summary",
12
+ "save_comparison",
13
+ # Console reporter
14
+ "print_metrics_summary",
15
+ "print_comparison_table",
16
+ "print_eval_result",
17
+ ]
src/flow/experiments/reporters/console_reporter.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Console reporter for experiment results with rich formatting."""
4
+
5
+ from typing import Any
6
+
7
+ from ..metrics import TraceMetrics
8
+
9
+
10
def print_metrics_summary(metrics: TraceMetrics, title: str = "Trace Metrics") -> None:
    """Print a formatted metrics summary to console.

    Args:
        metrics: TraceMetrics to display
        title: Title for the summary section
    """
    rule = "=" * 60
    rows = [
        "",
        rule,
        f" {title}",
        rule,
        f" Tokens: {metrics.total_tokens:,} total ({metrics.input_tokens:,} in, {metrics.output_tokens:,} out)",
        f" LLM Calls: {metrics.llm_call_count} ({metrics.llm_duration_ms:.1f}ms)",
        f" Tool Calls: {metrics.tool_call_count} ({metrics.tool_duration_ms:.1f}ms)",
    ]

    # Per-tool breakdown, sorted for deterministic output.
    if metrics.tool_calls_by_name:
        rows.append(" Tool breakdown:")
        rows.extend(
            f" - {name}: {count}"
            for name, count in sorted(metrics.tool_calls_by_name.items())
        )

    rows.append(f" Duration: {metrics.total_duration_ms:.2f}ms")
    rows.append(f" Spans: {metrics.span_count}")
    # Errors line is only shown when something actually failed.
    if metrics.error_count > 0:
        rows.append(f" Errors: {metrics.error_count}")
    rows.append(rule)

    print("\n".join(rows))
34
+
35
+
36
def print_comparison_table(
    results: list[dict[str, Any]],
    title: str = "Comparison",
) -> None:
    """Print a side-by-side comparison table of multiple results.

    Args:
        results: List of result dictionaries with 'name' and 'metrics' keys
        title: Title for the comparison
    """
    if not results:
        print("No results to compare")
        return

    labels = [entry.get("name", "unknown") for entry in results]
    # Column width adapts to the longest name, with a sensible minimum.
    width = max(15, max(len(label) for label in labels) + 2)

    print(f"\n{'=' * 80}")
    print(f" {title}")
    print("=" * 80)

    # Header row
    print(f"\n{'Metric':<30} | " + " | ".join(f"{label:>{width}}" for label in labels))
    print("-" * (32 + (width + 3) * len(labels)))

    def fmt(value: Any) -> str:
        """Right-align a single cell, using one decimal place for floats."""
        if isinstance(value, float):
            return f"{value:>{width}.1f}"
        if isinstance(value, bool):
            return f"{value!s:>{width}}"
        return f"{value:>{width}}"

    def emit(label: str, values: list[Any]) -> None:
        print(f"{label:<30} | " + " | ".join(fmt(value) for value in values))

    per_result_metrics = [entry.get("metrics", {}) for entry in results]

    emit("Duration (s)", [entry.get("duration_seconds", 0) for entry in results])
    emit("Success", [entry.get("success", False) for entry in results])

    # Eval row only appears when at least one result carries an evaluation.
    if any(entry.get("evaluation") for entry in results):
        emit("Eval Score", [entry.get("evaluation", {}).get("score", 0) for entry in results])

    for label, key in (
        ("Total Tokens", "total_tokens"),
        ("Input Tokens", "input_tokens"),
        ("Output Tokens", "output_tokens"),
        ("LLM Calls", "llm_call_count"),
        ("Tool Calls", "tool_call_count"),
        ("LLM Time (ms)", "llm_duration_ms"),
        ("Tool Time (ms)", "tool_duration_ms"),
    ):
        emit(label, [m.get(key, 0) for m in per_result_metrics])

    # Tool breakdown across the union of tools seen by any result.
    tool_names: set[str] = set()
    for m in per_result_metrics:
        tool_names.update(m.get("tool_calls_by_name", {}).keys())

    if tool_names:
        print("\n" + "-" * 80)
        print("Tool Usage Breakdown:")
        for tool in sorted(tool_names):
            emit(f" {tool}", [m.get("tool_calls_by_name", {}).get(tool, 0) for m in per_result_metrics])

    print("=" * 80)
102
+
103
+
104
def print_eval_result(
    score: float,
    passed: bool,
    reasoning: str,
    criteria_results: list[dict[str, Any]] | None = None,
) -> None:
    """Print evaluation results in a formatted way.

    Args:
        score: Overall score (0.0 to 1.0)
        passed: Whether evaluation passed
        reasoning: Overall reasoning
        criteria_results: Optional list of individual criterion results
    """
    verdict = "PASS" if passed else "FAIL"
    rule = "=" * 60

    print(f"\n{rule}")
    print(f" Evaluation Result: {verdict}")
    print(rule)
    print(f" Score: {score:.2f}")
    print(f" Passed: {passed}")
    print(f" Reason: {reasoning}")

    if criteria_results:
        print("\n Criteria:")
        for entry in criteria_results:
            mark = "PASS" if entry.get("passed") else "FAIL"
            print(f" - {entry.get('name', 'unknown')}: {mark} ({entry.get('score', 0):.2f})")
            # Per-criterion reasoning is optional; only show when present.
            if entry.get("reasoning"):
                print(f" {entry['reasoning']}")

    print(rule)
src/flow/experiments/reporters/json_reporter.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """JSON reporter for experiment results."""
4
+
5
+ import json
6
+ from dataclasses import asdict
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from ..metrics import TraceMetrics, metrics_to_dict
11
+ from ..types import EvalResult, RunResult
12
+
13
+
14
def save_run_result(
    result: RunResult,
    output_dir: Path,
    eval_result: EvalResult | None = None,
    metrics: TraceMetrics | None = None,
) -> None:
    """Save a run result to JSON files.

    Creates the following files in output_dir (all written as UTF-8 so
    non-ASCII agent output does not crash on platforms whose default
    locale encoding cannot represent it):
    - traces.json: Raw OpenTelemetry spans
    - metrics.json: Extracted metrics (if provided)
    - output.txt: Agent text output
    - result.json: Full result summary

    Args:
        result: The RunResult to save
        output_dir: Directory to save files
        eval_result: Optional evaluation result
        metrics: Optional extracted metrics
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save raw traces; default=str stringifies non-JSON-serializable span values
    with open(output_dir / "traces.json", "w", encoding="utf-8") as f:
        json.dump(result.trace, f, indent=2, default=str)

    # Save extracted metrics
    if metrics:
        with open(output_dir / "metrics.json", "w", encoding="utf-8") as f:
            json.dump(metrics_to_dict(metrics), f, indent=2)

    # Save agent output: a short human-readable header, then the raw output
    with open(output_dir / "output.txt", "w", encoding="utf-8") as f:
        f.write(f"Task: {result.task.prompt}\n")
        f.write(f"Duration: {result.duration_seconds:.1f}s\n")
        f.write(f"Success: {result.success}\n")
        if eval_result:
            f.write(f"Eval Score: {eval_result.score:.2f}\n")
        if result.error:
            f.write(f"Error: {result.error}\n")
        f.write("\n" + "=" * 60 + "\n\n")
        f.write(result.output)

    # Save full result summary (trace/output stored by size only, not inlined)
    result_dict: dict[str, Any] = {
        "task": {
            "name": result.task.name,
            "prompt": result.task.prompt,
            "criteria": [asdict(c) for c in result.task.criteria],
            "metadata": result.task.metadata,
        },
        "success": result.success,
        "error": result.error,
        "duration_seconds": result.duration_seconds,
        "files_created": result.files_created,
        "trace_count": len(result.trace),
        "output_length": len(result.output),
    }

    if metrics:
        result_dict["metrics"] = metrics_to_dict(metrics)

    if eval_result:
        result_dict["evaluation"] = {
            "score": eval_result.score,
            "passed": eval_result.passed,
            "reasoning": eval_result.reasoning,
            "criteria_results": [
                {
                    "name": cr.name,
                    "score": cr.score,
                    "passed": cr.passed,
                    "reasoning": cr.reasoning,
                }
                for cr in eval_result.criteria_results
            ],
        }

    with open(output_dir / "result.json", "w", encoding="utf-8") as f:
        json.dump(result_dict, f, indent=2)
94
+
95
+
96
def load_run_result_summary(result_path: Path) -> dict[str, Any]:
    """Read back a result summary written by save_run_result.

    Args:
        result_path: Path to result.json file

    Returns:
        Dictionary with result summary
    """
    raw_text = Path(result_path).read_text()
    return json.loads(raw_text)
107
+
108
+
109
def save_comparison(
    results: list[tuple[str, dict[str, Any]]],
    output_path: Path,
) -> None:
    """Save a comparison of multiple results.

    Args:
        results: List of (name, result_dict) tuples
        output_path: Path to save comparison JSON
    """
    entries: list[dict[str, Any]] = []
    for name, summary in results:
        # Missing keys become null in the JSON via dict.get's None default.
        entries.append({
            "name": name,
            "success": summary.get("success"),
            "duration_seconds": summary.get("duration_seconds"),
            "metrics": summary.get("metrics"),
            "evaluation": summary.get("evaluation"),
        })

    with open(output_path, "w") as f:
        json.dump({"results": entries}, f, indent=2)
src/flow/experiments/runner.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Experiment runner for executing agents on tasks with trace capture."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import os
9
+ import tempfile
10
+ import time
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING
13
+
14
+ from opentelemetry import trace
15
+ from opentelemetry.sdk.resources import Resource
16
+ from opentelemetry.sdk.trace import TracerProvider
17
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
18
+ from opentelemetry.semconv._incubating.attributes.service_attributes import SERVICE_NAME
19
+
20
+ from .trace_collector import FlowTraceCollector
21
+ from .types import RunResult, Task
22
+
23
+ if TYPE_CHECKING:
24
+ from flow.harness.maf import MAFHarness
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def setup_tracing(service_name: str = "flow-experiments") -> TracerProvider:
    """Setup OpenTelemetry tracing with in-memory collection.

    This creates a new TracerProvider configured for experiment tracing.
    Call this once at the start of your experiment session.

    Args:
        service_name: Name for the tracing service

    Returns:
        The configured TracerProvider
    """
    tracer_provider = TracerProvider(
        resource=Resource.create({SERVICE_NAME: service_name})
    )
    trace.set_tracer_provider(tracer_provider)

    # Best effort: turn on Agent Framework span emission when the package is
    # importable; experiment tracing still works without it.
    try:
        from agent_framework.observability import enable_instrumentation

        enable_instrumentation()
        logger.debug("Agent Framework instrumentation enabled")
    except ImportError:
        logger.debug("Agent Framework not available, skipping instrumentation")
    except Exception as e:
        logger.debug(f"Could not enable Agent Framework instrumentation: {e}")

    return tracer_provider
56
+
57
+
58
class FlowExperimentRunner:
    """Runner for executing experiments with Flow agents.

    The runner handles:
    - Setting up temporary workspaces
    - Collecting execution traces via OpenTelemetry
    - Measuring execution time
    - Capturing files created
    - Supporting streaming execution

    Note: ``run`` changes the process-wide working directory for the duration
    of a task (see the chdir note in ``run``), so one runner instance should
    not execute tasks concurrently within a single process.

    Example:
        from flow.harness.maf import MAFHarness
        from flow.experiments import FlowExperimentRunner, Task

        harness = MAFHarness()
        runner = FlowExperimentRunner(keep_workspace=True)

        task = Task(name="hello", prompt="Create a hello world script")
        result = await runner.run(harness, task)

        print(f"Duration: {result.duration_seconds}s")
        print(f"Files: {result.files_created}")
    """

    def __init__(
        self,
        workspace_base: Path | None = None,
        keep_workspace: bool = False,
    ) -> None:
        """Initialize the experiment runner.

        Args:
            workspace_base: Base directory for workspaces (default: system temp)
            keep_workspace: Whether to keep workspace after run (default: False)
        """
        # Base directory under which per-run temp workspaces are created.
        self.workspace_base = workspace_base or Path(tempfile.gettempdir())
        # When True, the workspace directory is left on disk after the run.
        self.keep_workspace = keep_workspace

    async def run(
        self,
        harness: MAFHarness,
        task: Task,
        workspace: Path | None = None,
    ) -> RunResult:
        """Run a harness on a task and collect results.

        This method:
        1. Creates or uses a workspace directory
        2. Sets up trace collection
        3. Executes the harness with streaming
        4. Collects output and files created
        5. Returns a RunResult with all data

        Args:
            harness: The MAFHarness to run
            task: The task to execute
            workspace: Optional workspace directory (creates temp if None)

        Returns:
            RunResult with trace, output, and metrics
        """
        # Create or use workspace directory. Only auto-delete directories
        # this runner created itself (workspace_created gates the cleanup).
        if workspace is None:
            workspace = Path(tempfile.mkdtemp(
                prefix=f"flow_experiment_{task.name}_",
                dir=self.workspace_base,
            ))
            workspace_created = True
        else:
            workspace.mkdir(parents=True, exist_ok=True)
            workspace_created = False

        logger.info(f"Running task '{task.name}' in workspace: {workspace}")

        # Snapshot files before execution so files_created can be computed
        # as a set difference afterwards.
        files_before = set(self._list_files(workspace))

        # Set up trace collection by attaching a fresh collector to the
        # process-global TracerProvider (if one was configured, e.g. via
        # setup_tracing()).
        collector = FlowTraceCollector()
        processor: SimpleSpanProcessor | None = None

        try:
            provider = trace.get_tracer_provider()
            # The default no-op provider does not accept processors; only a
            # real SDK TracerProvider does.
            if isinstance(provider, TracerProvider):
                processor = SimpleSpanProcessor(collector)
                provider.add_span_processor(processor)
                logger.debug("Trace collection enabled")
        except Exception as e:
            # Tracing is best-effort; the run proceeds without it.
            logger.debug(f"Could not set up trace collection: {e}")

        # Execute the harness
        start_time = time.time()
        output_chunks: list[str] = []
        error: str | None = None

        try:
            # os.chdir is process-global: relative paths written by the
            # agent's tools land inside the workspace. Restored in finally.
            original_cwd = os.getcwd()
            os.chdir(workspace)

            try:
                # Use streaming execution to capture all output
                async for event in harness.run_stream(task.prompt):
                    # Collect text output; hasattr guards tolerate foreign
                    # event shapes from other harness implementations.
                    if hasattr(event, "content") and event.content:
                        if hasattr(event, "type"):
                            # Local import; presumably avoids an import cycle
                            # with the harness package — TODO confirm.
                            from ..harness.base import EventType
                            if event.type in (EventType.TEXT_DELTA, EventType.TEXT_DONE):
                                output_chunks.append(event.content)
                            elif event.type == EventType.TOOL_RESULT:
                                # Optionally capture tool results
                                pass
            finally:
                os.chdir(original_cwd)

        except Exception as e:
            # Record the failure instead of raising; the RunResult carries it.
            error = str(e)
            logger.error(f"Task execution failed: {e}")

        end_time = time.time()
        duration_seconds = end_time - start_time

        # Force flush so spans ended just before this point reach the collector.
        if processor:
            try:
                processor.force_flush()
            except Exception as e:
                logger.debug(f"Error flushing processor: {e}")

        # Get collected traces (drains the collector).
        trace_data = collector.get_traces()

        # Clean up trace processor.
        # NOTE(review): the processor is never detached from the provider
        # (no public removal API); repeated runs accumulate shut-down
        # processors on the global provider — confirm acceptable.
        if processor:
            try:
                processor.shutdown()
            except Exception as e:
                logger.debug(f"Error shutting down processor: {e}")

        # Find files created during the run.
        files_after = set(self._list_files(workspace))
        files_created = sorted(files_after - files_before)

        # Clean up workspace only if we created it, the caller did not ask to
        # keep it, and the run succeeded — failed runs keep their workspace
        # for debugging.
        if not self.keep_workspace and workspace_created and not error:
            try:
                import shutil
                shutil.rmtree(workspace)
                logger.debug(f"Cleaned up workspace: {workspace}")
            except Exception as e:
                logger.warning(f"Failed to clean up workspace: {e}")

        output = "".join(output_chunks)

        return RunResult(
            task=task,
            trace=trace_data,
            output=output,
            files_created=files_created,
            duration_seconds=duration_seconds,
            workspace=workspace,
            error=error,
        )

    def _list_files(self, directory: Path) -> list[str]:
        """List all files in a directory recursively.

        Hidden files (dot-prefixed names) are skipped; note that hidden
        *directories* are still traversed — only the filename is checked.

        Args:
            directory: Directory to scan

        Returns:
            List of relative file paths
        """
        files: list[str] = []
        try:
            for root, _, filenames in os.walk(directory):
                for filename in filenames:
                    # Skip hidden files and common temp files
                    if filename.startswith("."):
                        continue
                    full_path = Path(root) / filename
                    rel_path = full_path.relative_to(directory)
                    files.append(str(rel_path))
        except Exception as e:
            # Listing is best-effort; return whatever was gathered so far.
            logger.debug(f"Error listing files: {e}")
        return files
src/flow/experiments/trace_collector.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """OpenTelemetry trace collector for experiment analysis."""
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import Any
8
+
9
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class FlowTraceCollector(SpanExporter):
    """Collects OpenTelemetry spans for experiment analysis.

    This exporter captures spans during agent execution and converts them
    to a dictionary format suitable for metrics extraction and analysis.

    Example:
        collector = FlowTraceCollector()
        # Attach to TracerProvider via SimpleSpanProcessor
        # Run agent execution
        traces = collector.get_traces()
    """

    def __init__(self) -> None:
        """Initialize the trace collector."""
        # Accumulated span records; drained by get_traces().
        self.spans: list[dict[str, Any]] = []
        # Set by shutdown(); a shut-down collector rejects further exports.
        self._shut_down: bool = False

    def export(self, spans: Any) -> SpanExportResult:
        """Collect spans from OpenTelemetry.

        Args:
            spans: Sequence of OpenTelemetry ReadableSpan objects

        Returns:
            SpanExportResult.SUCCESS, or FAILURE if the collector has
            been shut down
        """
        # The span processor may remain registered with the global
        # TracerProvider after this collector's run finished; per the OTel
        # exporter contract, export after shutdown must fail. This also
        # prevents stale collectors from accumulating spans indefinitely.
        if self._shut_down:
            return SpanExportResult.FAILURE

        for span in spans:
            try:
                # Span times are integer nanoseconds; convert to seconds.
                start_time = span.start_time / 1_000_000_000
                end_time = span.end_time / 1_000_000_000 if span.end_time else None
                duration_ms = ((end_time - start_time) * 1000) if end_time else None

                self.spans.append({
                    "type": "trace_span",
                    # NOTE(review): fromtimestamp() uses the local timezone;
                    # confirm downstream consumers do not expect UTC.
                    "timestamp": datetime.fromtimestamp(start_time).isoformat(),
                    "data": {
                        "operation_name": span.name,
                        # Hex-encode ids per the W3C trace-context convention.
                        "span_id": format(span.context.span_id, "016x"),
                        "trace_id": format(span.context.trace_id, "032x"),
                        "parent_span_id": (
                            format(span.parent.span_id, "016x") if span.parent else None
                        ),
                        "duration_ms": duration_ms,
                        "attributes": dict(span.attributes) if span.attributes else {},
                        "status": str(span.status.status_code.name) if hasattr(span, "status") else "OK",
                        "events": [
                            {
                                "name": event.name,
                                "timestamp": datetime.fromtimestamp(
                                    event.timestamp / 1_000_000_000
                                ).isoformat(),
                                "attributes": dict(event.attributes) if event.attributes else {},
                            }
                            for event in (span.events or [])
                        ],
                    },
                })
            except Exception as e:
                # One malformed span must not abort the whole batch.
                logger.debug(f"Failed to collect span: {e}")

        return SpanExportResult.SUCCESS

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Force flush spans (no-op: collection is synchronous).

        Args:
            timeout_millis: Timeout in milliseconds (unused)

        Returns:
            True always
        """
        return True

    def shutdown(self) -> None:
        """Shut down the exporter; subsequent export() calls are rejected."""
        self._shut_down = True

    def get_traces(self) -> list[dict[str, Any]]:
        """Get and clear collected traces.

        Returns:
            List of collected trace spans, clearing the internal list
        """
        traces = self.spans.copy()
        self.spans.clear()
        return traces

    def clear(self) -> None:
        """Clear collected traces without returning them."""
        self.spans.clear()
src/flow/experiments/types.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Type definitions for the experiments framework."""
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
@dataclass
class EvalCriterion:
    """A criterion for evaluating agent output.

    Attributes:
        name: Short identifier for the criterion (e.g., "correctness", "completeness")
        instruction: Detailed instruction for how to evaluate this criterion
        weight: Relative weight for scoring (default 1.0)
    """

    name: str
    instruction: str
    # Relative weight used when combining per-criterion scores into an
    # overall score; 1.0 means equal weighting.
    weight: float = 1.0
23
+
24
+
25
@dataclass
class Task:
    """A task for the agent to perform.

    Attributes:
        name: Short identifier for the task
        prompt: The prompt/instruction given to the agent
        criteria: List of evaluation criteria for assessing the output
        metadata: Additional task metadata (e.g., expected output, difficulty)
    """

    name: str
    prompt: str
    criteria: list[EvalCriterion] = field(default_factory=list)
    # Free-form extras; the built-in suites below use e.g.
    # {"category": "short", "expected_duration": 60}.
    metadata: dict[str, Any] = field(default_factory=dict)
40
+
41
+
42
@dataclass
class RunResult:
    """Result of running an agent on a task.

    Attributes:
        task: The task that was executed
        trace: OpenTelemetry trace spans collected during execution
        output: The agent's final output/response
        files_created: List of files created during execution
        duration_seconds: Total execution time
        workspace: Path to the workspace directory used
        error: Error message if execution failed, None if successful
    """

    task: Task
    trace: list[dict[str, Any]]
    output: str
    files_created: list[str]
    duration_seconds: float
    workspace: Path
    # None means the run completed; any non-None string marks a failure
    # (see `success` below, which tests `is None`, not truthiness).
    error: str | None = None

    @property
    def success(self) -> bool:
        """Whether the run completed without errors.

        Success is defined strictly as "no error was recorded"; an empty
        output still counts as a successful run.
        """
        return self.error is None
68
+
69
+
70
@dataclass
class CriterionResult:
    """Result of evaluating a single criterion.

    Attributes:
        name: Name of the criterion evaluated
        score: Numeric score (0.0 to 1.0)
        passed: Whether the criterion was met
        reasoning: Explanation of the evaluation
    """

    name: str
    # Score in [0.0, 1.0] for this single criterion.
    score: float
    passed: bool
    reasoning: str
85
+
86
+
87
@dataclass
class EvalResult:
    """Result of evaluating an agent's output.

    Attributes:
        score: Overall weighted score (0.0 to 1.0)
        passed: Whether the evaluation passed overall
        criteria_results: Results for each individual criterion
        reasoning: Overall evaluation reasoning/summary
    """

    # Overall score in [0.0, 1.0], weighted across criteria.
    score: float
    passed: bool
    criteria_results: list[CriterionResult]
    reasoning: str
102
+
103
+
104
+ # =============================================================================
105
+ # Built-in Task Suites for Optimization
106
+ # =============================================================================
107
+
108
# Built-in suites keyed by name; look them up via get_task_suite().
# "quick" = 3 short smoke tasks, "core" = 5 mixed-difficulty tasks,
# "coding" = 10 tasks ranging up to "complex".
TASK_SUITES: dict[str, list[Task]] = {
    # Fast smoke tests (~60-90s expected each) for quick iteration.
    "quick": [
        Task(
            name="fizzbuzz",
            prompt="Create a Python file fizzbuzz.py that prints FizzBuzz from 1-100. Then run it.",
            criteria=[
                EvalCriterion(name="file_created", instruction="fizzbuzz.py file was created"),
                EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
            ],
            metadata={"category": "short", "expected_duration": 60},
        ),
        Task(
            name="hello_api",
            prompt="Create a FastAPI app in api.py with a /hello endpoint that returns {'message': 'hello'}.",
            criteria=[
                EvalCriterion(name="file_created", instruction="api.py file was created"),
                EvalCriterion(name="has_endpoint", instruction="Contains a /hello GET endpoint"),
            ],
            metadata={"category": "short", "expected_duration": 90},
        ),
        Task(
            name="file_counter",
            prompt="Create a Python script count_files.py that counts .py files in current directory and prints the count.",
            criteria=[
                EvalCriterion(name="file_created", instruction="count_files.py was created"),
                EvalCriterion(name="runs_correctly", instruction="Script runs and outputs a number"),
            ],
            metadata={"category": "short", "expected_duration": 60},
        ),
    ],
    # Balanced default suite: one short warm-up plus four medium tasks.
    "core": [
        Task(
            name="fizzbuzz",
            prompt="Create a Python file fizzbuzz.py that prints FizzBuzz from 1-100. Then run it.",
            criteria=[
                EvalCriterion(name="file_created", instruction="fizzbuzz.py file was created"),
                EvalCriterion(name="correct_output", instruction="Output shows correct FizzBuzz pattern"),
            ],
            metadata={"category": "short"},
        ),
        Task(
            name="rest_api",
            prompt="Create a FastAPI app with CRUD endpoints for a TODO list (in-memory storage). Include GET /todos, POST /todos, DELETE /todos/{id}.",
            criteria=[
                EvalCriterion(name="file_created", instruction="API file was created"),
                EvalCriterion(name="has_crud", instruction="Contains GET, POST, DELETE endpoints"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="data_analysis",
            prompt="Create a Python script that generates 100 random data points, calculates mean/median/std, and saves results to stats.json.",
            criteria=[
                EvalCriterion(name="script_created", instruction="Python script was created"),
                EvalCriterion(name="json_output", instruction="stats.json was created with results"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="cli_tool",
            prompt="Create a CLI tool using argparse that takes a filename and counts lines, words, and characters (like wc).",
            criteria=[
                EvalCriterion(name="file_created", instruction="CLI script was created"),
                EvalCriterion(name="uses_argparse", instruction="Uses argparse for argument parsing"),
            ],
            metadata={"category": "medium"},
        ),
        Task(
            name="unit_tests",
            prompt="Create a calculator module (calc.py) with add/subtract/multiply/divide functions, then write pytest tests for it (test_calc.py).",
            criteria=[
                EvalCriterion(name="module_created", instruction="calc.py was created"),
                EvalCriterion(name="tests_created", instruction="test_calc.py was created"),
                EvalCriterion(name="tests_pass", instruction="Tests pass when run"),
            ],
            metadata={"category": "medium"},
        ),
    ],
    # Larger coding-focused suite with terser prompts and single criteria,
    # including "complex" category tasks.
    "coding": [
        Task(
            name="fizzbuzz",
            prompt="Create fizzbuzz.py that prints FizzBuzz 1-100 and run it.",
            criteria=[EvalCriterion(name="correct", instruction="Correct FizzBuzz output")],
            metadata={"category": "short"},
        ),
        Task(
            name="rest_api",
            prompt="Create a FastAPI CRUD TODO app with GET/POST/DELETE endpoints.",
            criteria=[EvalCriterion(name="has_crud", instruction="Has working CRUD")],
            metadata={"category": "medium"},
        ),
        Task(
            name="cli_tool",
            prompt="Create an argparse CLI that counts lines/words/chars in a file.",
            criteria=[EvalCriterion(name="works", instruction="CLI works correctly")],
            metadata={"category": "medium"},
        ),
        Task(
            name="data_pipeline",
            prompt="Create a script that reads CSV data, filters rows, aggregates, and outputs JSON.",
            criteria=[EvalCriterion(name="works", instruction="Pipeline produces correct output")],
            metadata={"category": "medium"},
        ),
        Task(
            name="unit_tests",
            prompt="Create calc.py with math functions and test_calc.py with pytest tests.",
            criteria=[EvalCriterion(name="tests_pass", instruction="Tests pass")],
            metadata={"category": "medium"},
        ),
        Task(
            name="web_scraper",
            prompt="Create a script that fetches a webpage and extracts all links.",
            criteria=[EvalCriterion(name="extracts_links", instruction="Extracts links correctly")],
            metadata={"category": "medium"},
        ),
        Task(
            name="async_downloader",
            prompt="Create an async script that downloads multiple URLs concurrently using aiohttp.",
            criteria=[EvalCriterion(name="uses_async", instruction="Uses async/await correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="database_orm",
            prompt="Create a SQLAlchemy model for Users with CRUD operations.",
            criteria=[EvalCriterion(name="has_orm", instruction="Uses SQLAlchemy ORM correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="decorator_lib",
            prompt="Create a library with timing, retry, and caching decorators.",
            criteria=[EvalCriterion(name="decorators_work", instruction="Decorators function correctly")],
            metadata={"category": "complex"},
        ),
        Task(
            name="config_parser",
            prompt="Create a config parser that supports YAML, JSON, and env vars with validation.",
            criteria=[EvalCriterion(name="multi_format", instruction="Supports multiple formats")],
            metadata={"category": "complex"},
        ),
    ],
}
249
+
250
+
251
def get_task_suite(suite_name: str) -> list[Task]:
    """Get a built-in task suite by name.

    Args:
        suite_name: Name of the suite ('quick', 'core', 'coding')

    Returns:
        List of Task objects

    Raises:
        ValueError: If suite_name is not found
    """
    suite = TASK_SUITES.get(suite_name)
    if suite is None:
        available = ", ".join(TASK_SUITES.keys())
        raise ValueError(f"Unknown suite '{suite_name}'. Available: {available}")
    return suite
src/flow/harness/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Harness modules for Flow agent.
2
+
3
+ Harnesses are agent runtime adapters that convert different agent framework
4
+ events to a uniform Event format for CLI/UI consumption.
5
+
6
+ Available harnesses:
7
+ - maf: Microsoft Agent Framework harness
8
+ - (future) langchain: LangChain harness
9
+ - (future) claude: Claude SDK harness
10
+ """
11
+
12
+ from flow.harness.base import BaseHarness, Event, EventType
13
+
14
+ __all__ = [
15
+ "BaseHarness",
16
+ "Event",
17
+ "EventType",
18
+ ]
src/flow/harness/base.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base harness interface for agent runtimes.
2
+
3
+ Defines the abstract interface that all harnesses must implement,
4
+ allowing Flow to run on different agent frameworks.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from abc import ABC, abstractmethod
10
+ from collections.abc import AsyncIterator, Callable, Coroutine
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any
14
+
15
+
16
class EventType(Enum):
    """Types of events that can be streamed from an agent.

    Members carry plain string values, which keeps them readable when
    logged or serialized.
    """

    TEXT_DELTA = "text_delta"  # Streaming text chunk
    TEXT_DONE = "text_done"  # Text generation complete
    TOOL_CALL_START = "tool_call_start"  # Starting a tool call
    TOOL_CALL_ARGS = "tool_call_args"  # Tool call arguments (streaming)
    TOOL_CALL_DONE = "tool_call_done"  # Tool call complete
    TOOL_RESULT = "tool_result"  # Tool execution result
    THINKING = "thinking"  # Agent reasoning/thinking
    ERROR = "error"  # An error occurred
    DONE = "done"  # Agent run complete
28
+
29
+
30
@dataclass
class Event:
    """An event from the agent execution stream.

    Events provide real-time feedback during agent execution,
    allowing the CLI to display progress, tool calls, and results.
    """

    # What kind of activity this event reports (see EventType).
    type: EventType
    # Text payload; empty for events that carry no text.
    content: str = ""
    # Name of the tool involved, for TOOL_* events.
    tool_name: str | None = None
    # Identifier of the tool call this event belongs to (presumably used to
    # correlate start/args/done/result events — confirm in harness impls).
    tool_call_id: str | None = None
    # Extra details, restricted to scalar JSON-friendly value types.
    metadata: dict[str, str | int | float | bool | None] = field(default_factory=dict)
43
+
44
+
45
class BaseHarness(ABC):
    """Abstract base class for agent execution harnesses.

    A harness is a thin adapter that converts agent framework events
    to the uniform Flow Event format for CLI/UI consumption.

    Each harness implementation handles:
    - Taking a pre-configured agent from the framework
    - Running tasks on the agent
    - Converting framework-specific events to Flow Events
    - Managing conversation threads

    Implementations:
    - MAFHarness (flow.harness.maf): Microsoft Agent Framework
    - (Future) LangChainHarness: LangChain
    - (Future) ClaudeHarness: Claude SDK
    """

    @abstractmethod
    async def run(self, task: str, thread_id: str | None = None) -> str:
        """Run a task and return the final response.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Returns:
            The agent's final response text
        """
        ...

    @abstractmethod
    def run_stream(self, task: str, thread_id: str | None = None) -> AsyncIterator[Event]:
        """Run a task with streaming events.

        Deliberately a plain (non-async) def: implementations return an
        async iterator/generator that callers drive with ``async for``.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Yields:
            Event objects representing agent activity
        """
        ...

    @abstractmethod
    def register_tools(self, tools: list[Callable[..., Coroutine[Any, Any, str]]]) -> None:
        """Register tools with the harness.

        Args:
            tools: List of async tool functions (each returning a string result)
        """
        ...

    @abstractmethod
    def get_thread_id(self) -> str:
        """Get the current thread ID.

        Returns:
            The current conversation thread ID
        """
        ...

    @abstractmethod
    async def close(self) -> None:
        """Clean up resources used by the harness."""
        ...
src/flow/harness/maf/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Microsoft Agent Framework harness module.
2
+
3
+ Provides integration with Microsoft Agent Framework for running Flow agents.
4
+ """
5
+
6
+ from flow.harness.maf.agent import create_agent
7
+ from flow.harness.maf.harness import MAFHarness
8
+ from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
9
+
10
+ __all__ = [
11
+ "create_agent",
12
+ "HeadTailCompactingChatMessageStore",
13
+ "MAFHarness",
14
+ ]
src/flow/harness/maf/agent.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agent factory for Microsoft Agent Framework.
2
+
3
+ Provides factory functions to create configured ChatAgent instances.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ from collections.abc import Callable, Coroutine, Sequence
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ from flow.harness.maf.message_store import HeadTailCompactingChatMessageStore
13
+ from flow.prompts import FLOW_AGENT_INSTRUCTIONS
14
+ from flow.tools import create_all_tools
15
+
16
+ if TYPE_CHECKING:
17
+ from agent_framework import ChatAgent
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Default filesystem locations used when the caller does not supply
# workspace/memory_path to create_agent(). Both live under ~/.flow/.
DEFAULT_WORKSPACE = Path.home() / ".flow" / "workspace"
DEFAULT_MEMORY_PATH = Path.home() / ".flow" / "memory"
24
+
25
+
26
+ def create_agent(
27
+ *,
28
+ # Model/API configuration
29
+ endpoint: str | None = None,
30
+ api_key: str | None = None,
31
+ deployment: str | None = None,
32
+ api_version: str = "2024-02-15-preview",
33
+ # Agent configuration
34
+ name: str = "Flow",
35
+ instructions: str | None = None,
36
+ # Workspace configuration
37
+ workspace: Path | None = None,
38
+ memory_path: Path | None = None,
39
+ # Tool configuration
40
+ tools: Sequence[Callable[..., Coroutine[Any, Any, str]]] | None = None,
41
+ enable_memory_tool: bool = True,
42
+ enable_sub_agent: bool = False,
43
+ bash_timeout: int = 120,
44
+ # Context engineering
45
+ enable_compaction: bool = True,
46
+ compaction_head_size: int = 10,
47
+ compaction_tail_size: int = 40,
48
+ ) -> "ChatAgent":
49
+ """Create a configured ChatAgent for Flow.
50
+
51
+ This factory creates a Microsoft Agent Framework ChatAgent with:
52
+ - Azure OpenAI as the backend
53
+ - Flow's standard tools (coding, execution, memory)
54
+ - Optional message compaction for long conversations
55
+ - Optional agent-managed memory tool
56
+ - Optional sub-agent for isolated research
57
+
58
+ Args:
59
+ endpoint: Azure OpenAI endpoint URL. Defaults to AZURE_OPENAI_ENDPOINT env var.
60
+ api_key: Azure OpenAI API key. Defaults to AZURE_OPENAI_API_KEY env var.
61
+ deployment: Azure OpenAI deployment name. Defaults to AZURE_OPENAI_DEPLOYMENT env var.
62
+ api_version: Azure OpenAI API version.
63
+ name: Agent name.
64
+ instructions: Agent instructions. Defaults to FLOW_AGENT_INSTRUCTIONS.
65
+ workspace: Directory for file operations. Defaults to ~/.flow/workspace.
66
+ memory_path: Directory for persistent memory. Defaults to ~/.flow/memory.
67
+ tools: Custom tools to use. If None, creates standard Flow tools.
68
+ enable_memory_tool: Whether to include the memory tool (default: True).
69
+ enable_sub_agent: Whether to include the sub-agent tool (default: False).
70
+ bash_timeout: Timeout for bash commands in seconds.
71
+ enable_compaction: Whether to enable head+tail message compaction.
72
+ compaction_head_size: Number of initial messages to keep.
73
+ compaction_tail_size: Number of recent messages to keep.
74
+
75
+ Returns:
76
+ Configured ChatAgent instance.
77
+
78
+ Raises:
79
+ ImportError: If agent_framework is not installed.
80
+ ValueError: If required Azure OpenAI credentials are missing.
81
+
82
+ Example:
83
+ >>> from flow.harness.maf import create_agent
84
+ >>> agent = create_agent()
85
+ >>> thread = agent.get_new_thread()
86
+ >>> response = await agent.run("Create a hello world script", thread=thread)
87
+ """
88
+ try:
89
+ from agent_framework import ChatAgent, ai_function
90
+ from agent_framework.azure import AzureOpenAIChatClient
91
+ except ImportError as e:
92
+ raise ImportError(
93
+ "Microsoft Agent Framework is required. "
94
+ "Install with: pip install agent-framework-core"
95
+ ) from e
96
+
97
+ # Resolve configuration from environment if not provided
98
+ endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
99
+ api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY")
100
+ deployment = deployment or os.environ.get("AZURE_OPENAI_DEPLOYMENT")
101
+
102
+ if not endpoint:
103
+ raise ValueError(
104
+ "Azure OpenAI endpoint is required. "
105
+ "Set AZURE_OPENAI_ENDPOINT or pass endpoint parameter."
106
+ )
107
+ if not api_key:
108
+ raise ValueError(
109
+ "Azure OpenAI API key is required. "
110
+ "Set AZURE_OPENAI_API_KEY or pass api_key parameter."
111
+ )
112
+ if not deployment:
113
+ raise ValueError(
114
+ "Azure OpenAI deployment is required. "
115
+ "Set AZURE_OPENAI_DEPLOYMENT or pass deployment parameter."
116
+ )
117
+
118
+ # Resolve paths
119
+ workspace = workspace or DEFAULT_WORKSPACE
120
+ memory_path = memory_path or DEFAULT_MEMORY_PATH
121
+
122
+ # Ensure directories exist
123
+ workspace.mkdir(parents=True, exist_ok=True)
124
+ memory_path.mkdir(parents=True, exist_ok=True)
125
+
126
+ # Create or use provided tools
127
+ if tools is None:
128
+ tools = create_all_tools(
129
+ workspace=workspace,
130
+ memory_path=memory_path,
131
+ bash_timeout=bash_timeout,
132
+ enable_memory_tool=enable_memory_tool,
133
+ enable_sub_agent=enable_sub_agent,
134
+ )
135
+
136
+ # Wrap tools with ai_function decorator for Agent Framework
137
+ converted_tools = []
138
+ for tool_func in tools:
139
+ tool_name = getattr(tool_func, "_tool_name", tool_func.__name__)
140
+ tool_description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
141
+ wrapped = ai_function(name=tool_name, description=tool_description)(tool_func)
142
+ converted_tools.append(wrapped)
143
+
144
+ # Create the chat client
145
+ client = AzureOpenAIChatClient(
146
+ api_key=api_key,
147
+ endpoint=endpoint,
148
+ deployment=deployment,
149
+ api_version=api_version,
150
+ )
151
+
152
+ # Create message store factory if compaction is enabled
153
+ message_store_factory = None
154
+ if enable_compaction:
155
+ def create_compacting_store() -> HeadTailCompactingChatMessageStore:
156
+ return HeadTailCompactingChatMessageStore(
157
+ head_size=compaction_head_size,
158
+ tail_size=compaction_tail_size,
159
+ )
160
+
161
+ message_store_factory = create_compacting_store
162
+ logger.debug(
163
+ f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
164
+ )
165
+
166
+ # Create the agent
167
+ agent = ChatAgent(
168
+ name=name,
169
+ description="Autonomous coding agent",
170
+ instructions=instructions or FLOW_AGENT_INSTRUCTIONS,
171
+ chat_client=client,
172
+ tools=converted_tools,
173
+ chat_message_store_factory=message_store_factory,
174
+ )
175
+
176
+ return agent
src/flow/harness/maf/harness.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Microsoft Agent Framework harness.
2
+
3
+ A thin adapter that converts Agent Framework events to the uniform Flow Event format.
4
+ """
5
+
6
+ import logging
7
+ import uuid
8
+ from collections.abc import AsyncIterator
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from flow.harness.base import BaseHarness, Event, EventType
12
+
13
+ if TYPE_CHECKING:
14
+ from agent_framework import ChatAgent
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
# Process-wide flag: instrumentation must only be enabled once per
# interpreter, even when multiple MAFHarness instances are created.
_instrumentation_enabled = False
20
+
21
+
22
def _enable_instrumentation() -> None:
    """Enable OpenTelemetry instrumentation for Agent Framework.

    Idempotent: only the first successful call per process has any effect.
    Called when a harness is constructed so that trace collection works
    for experiments. Failures are logged at DEBUG and never raised.
    """
    global _instrumentation_enabled
    if _instrumentation_enabled:
        return

    try:
        from agent_framework.observability import enable_instrumentation
        enable_instrumentation()
    except ImportError:
        # Optional dependency not installed - instrumentation is a no-op.
        logger.debug("Agent Framework observability not available")
    except Exception as exc:
        # Best-effort: never let telemetry setup break harness creation.
        logger.debug("Could not enable instrumentation: %s", exc)
    else:
        _instrumentation_enabled = True
        logger.debug("Agent Framework instrumentation enabled")
41
+
42
+
43
class MAFHarness(BaseHarness):
    """Harness adapter for Microsoft Agent Framework.

    This adapter:
    1. Takes a ChatAgent (or creates one with default settings)
    2. Runs tasks on the agent
    3. Converts Agent Framework events to uniform Flow Events

    Example:
        >>> from flow.harness.maf import MAFHarness
        >>> # Simple usage - creates agent with defaults
        >>> harness = MAFHarness()
        >>> async for event in harness.run_stream("Create a hello world script"):
        ...     print(event)

        >>> # Or with custom agent
        >>> from flow.harness.maf import create_agent
        >>> agent = create_agent(enable_compaction=False)
        >>> harness = MAFHarness(agent)
    """

    def __init__(
        self,
        agent: "ChatAgent | None" = None,
        **create_agent_kwargs: Any,
    ) -> None:
        """Initialize the harness.

        Args:
            agent: Optional ChatAgent instance. If not provided, creates one
                using create_agent() with the given kwargs.
            **create_agent_kwargs: Passed to create_agent() if agent is None.
                Common options: workspace, memory_path,
                enable_compaction, enable_memory_tool.
        """
        if agent is None:
            from flow.harness.maf.agent import create_agent
            agent = create_agent(**create_agent_kwargs)
        self._agent: ChatAgent = agent  # type: ignore[assignment]
        self._thread: Any = None  # AgentThread for conversation continuity
        self._thread_id: str | None = None
        # Tool call IDs seen during the current run; used to suppress
        # duplicate TOOL_CALL_START events for the same call_id.
        self._seen_tool_calls: set[str] = set()

        # Enable OpenTelemetry instrumentation for trace collection
        _enable_instrumentation()

    def register_tools(self, tools: list[Any]) -> None:
        """Register tools with the harness.

        Note: For MAFHarness, tools should be configured when creating the agent
        via create_agent(). This method is provided for interface compatibility
        but will log a warning if called.

        Args:
            tools: List of tool functions (ignored - configure via create_agent)
        """
        logger.warning(
            "MAFHarness.register_tools() called but tools should be configured "
            "via create_agent(). These tools will be ignored."
        )

    async def run(self, task: str, thread_id: str | None = None) -> str:
        """Run a task and return the final response.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Returns:
            The agent's final response text
        """
        if thread_id:
            self._thread_id = thread_id

        # Lazily create an AgentThread so repeated run() calls share history
        if self._thread is None:
            self._thread = self._agent.get_new_thread()

        response = await self._agent.run(task, thread=self._thread)

        # Extract text from the response; fall back to str() for response
        # types without a .content attribute.
        # NOTE(review): assumes the framework response exposes .content -
        # confirm against the installed agent_framework version.
        content = getattr(response, "content", None)
        if content is not None:
            return str(content)
        return str(response)

    async def run_stream(
        self, task: str, thread_id: str | None = None
    ) -> AsyncIterator[Event]:
        """Run a task with streaming events.

        Args:
            task: The task/prompt to execute
            thread_id: Optional thread ID for conversation continuity

        Yields:
            Event objects representing agent activity. A DONE event is
            emitted on success; an ERROR event (without DONE) on failure.
        """
        if thread_id:
            self._thread_id = thread_id

        # Get or create an AgentThread for conversation continuity
        if self._thread is None:
            self._thread = self._agent.get_new_thread()

        # Fresh run: forget tool calls from previous runs
        self._seen_tool_calls.clear()

        try:
            # Check if agent supports streaming
            if hasattr(self._agent, "run_stream"):
                async for chunk in self._agent.run_stream(task, thread=self._thread):
                    # Convert agent_framework events to Flow events
                    for event in self._convert_event(chunk):
                        yield event
            else:
                # Fallback: run non-streaming and emit a single event
                response = await self._agent.run(task, thread=self._thread)
                response_content = getattr(response, "content", None)
                content = str(response_content) if response_content is not None else str(response)
                yield Event(type=EventType.TEXT_DONE, content=content)

            yield Event(type=EventType.DONE)

        except Exception as e:
            yield Event(type=EventType.ERROR, content=str(e))

    def _convert_event(self, chunk: Any) -> list[Event]:
        """Convert an agent_framework event to Flow Events.

        Args:
            chunk: Event from agent_framework (AgentResponseUpdate)

        Returns:
            List of converted Events (may be empty)
        """
        events: list[Event] = []
        chunk_type = type(chunk).__name__

        # AgentResponseUpdate/AgentRunResponseUpdate has .contents list and .text property
        if chunk_type in ("AgentResponseUpdate", "AgentRunResponseUpdate") or hasattr(chunk, "contents"):
            contents = getattr(chunk, "contents", []) or []

            for content in contents:
                content_type = type(content).__name__

                if content_type == "TextContent":
                    text = getattr(content, "text", "")
                    if text:
                        events.append(Event(type=EventType.TEXT_DELTA, content=text))

                elif content_type == "FunctionCallContent":
                    # Streaming pattern:
                    # - First chunk has call_id and name set, arguments=''
                    # - Subsequent chunks have empty call_id/name, just argument fragments
                    call_id = getattr(content, "call_id", "") or ""
                    name = getattr(content, "name", "") or ""
                    args = getattr(content, "arguments", "") or ""

                    if call_id and name:
                        # Emit TOOL_CALL_START only once per call_id
                        # (deduplicated via _seen_tool_calls).
                        if call_id not in self._seen_tool_calls:
                            self._seen_tool_calls.add(call_id)
                            events.append(Event(
                                type=EventType.TOOL_CALL_START,
                                tool_name=name,
                                tool_call_id=call_id,
                            ))
                        # Non-streaming responses may carry the full arguments
                        # on the same content item - don't drop them.
                        if args:
                            events.append(Event(
                                type=EventType.TOOL_CALL_ARGS,
                                content=args,
                            ))
                    elif args:
                        # Argument fragment - emit as TOOL_CALL_ARGS
                        events.append(Event(
                            type=EventType.TOOL_CALL_ARGS,
                            content=args,
                        ))

                elif content_type == "FunctionResultContent":
                    result = getattr(content, "result", "")
                    call_id = getattr(content, "call_id", None)
                    events.append(Event(
                        type=EventType.TOOL_RESULT,
                        content=str(result),
                        tool_call_id=call_id,
                    ))
                    # Emit TOOL_CALL_DONE after result
                    events.append(Event(type=EventType.TOOL_CALL_DONE))

            # If no contents but has text, use that
            if not events and hasattr(chunk, "text"):
                text = chunk.text
                if text:
                    events.append(Event(type=EventType.TEXT_DELTA, content=text))

        # Fallback for other chunk types
        elif hasattr(chunk, "text"):
            text = chunk.text
            if text:
                events.append(Event(type=EventType.TEXT_DELTA, content=text))

        return events

    def get_thread_id(self) -> str:
        """Get the current thread ID, generating one lazily if needed.

        Returns:
            The current conversation thread ID
        """
        if self._thread_id is None:
            self._thread_id = str(uuid.uuid4())
        return self._thread_id

    async def close(self) -> None:
        """Clean up resources used by the harness."""
        # Agent Framework doesn't require explicit cleanup
        self._thread = None
        self._thread_id = None
src/flow/harness/maf/message_store.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Message store implementations for Microsoft Agent Framework.
2
+
3
+ Provides ChatMessageStoreProtocol implementations for context management.
4
+ """
5
+
6
+ from collections.abc import MutableMapping, Sequence
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING:
10
+ from agent_framework import ChatMessage
11
+
12
+
13
+ class HeadTailCompactingChatMessageStore:
14
+ """A compacting message store that works directly with Agent Framework ChatMessage.
15
+
16
+ This store implements ChatMessageStoreProtocol and keeps the first N messages
17
+ (head) and last M messages (tail), dropping middle messages to prevent
18
+ context overflow in long conversations.
19
+
20
+ IMPORTANT: This store preserves full ChatMessage objects including:
21
+ - FunctionCallContent (tool calls)
22
+ - FunctionResultContent (tool results)
23
+ - All other content types
24
+
25
+ This is critical because OpenAI's API requires tool results to immediately
26
+ follow their corresponding tool calls.
27
+
28
+ The compaction strategy:
29
+ - Keeps the first N messages (task context, initial instructions)
30
+ - Keeps the last M messages (recent work, current state)
31
+ - Drops middle messages to prevent context overflow
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ messages: Sequence["ChatMessage"] | None = None,
37
+ head_size: int = 10,
38
+ tail_size: int = 40,
39
+ ) -> None:
40
+ """Initialize the compacting store.
41
+
42
+ Args:
43
+ messages: Initial messages to store
44
+ head_size: Number of initial messages to keep
45
+ tail_size: Number of recent messages to keep
46
+ """
47
+ if head_size < 0:
48
+ raise ValueError("head_size must be non-negative")
49
+ if tail_size < 0:
50
+ raise ValueError("tail_size must be non-negative")
51
+
52
+ self._messages: list["ChatMessage"] = list(messages) if messages else []
53
+ self._head_size = head_size
54
+ self._tail_size = tail_size
55
+
56
+ @property
57
+ def head_size(self) -> int:
58
+ """Number of messages kept from the beginning."""
59
+ return self._head_size
60
+
61
+ @property
62
+ def tail_size(self) -> int:
63
+ """Number of messages kept from the end."""
64
+ return self._tail_size
65
+
66
+ @property
67
+ def total_messages(self) -> int:
68
+ """Total number of messages stored (before compaction)."""
69
+ return len(self._messages)
70
+
71
+ @property
72
+ def compacted_count(self) -> int:
73
+ """Number of messages that would be returned by list_messages()."""
74
+ total = len(self._messages)
75
+ max_kept = self._head_size + self._tail_size
76
+ return min(total, max_kept)
77
+
78
+ @property
79
+ def dropped_count(self) -> int:
80
+ """Number of messages dropped during compaction."""
81
+ return max(0, self.total_messages - self.compacted_count)
82
+
83
+ async def add_messages(self, messages: Sequence["ChatMessage"]) -> None:
84
+ """Add messages to the store.
85
+
86
+ Messages are stored as-is, preserving all content types.
87
+
88
+ Args:
89
+ messages: Sequence of ChatMessage objects to add
90
+ """
91
+ self._messages.extend(messages)
92
+
93
+ async def list_messages(self) -> list["ChatMessage"]:
94
+ """Get messages with head+tail compaction applied.
95
+
96
+ Returns the first head_size messages plus the last tail_size messages.
97
+ If total messages <= head_size + tail_size, returns all messages.
98
+
99
+ Returns:
100
+ List of ChatMessage objects after compaction
101
+ """
102
+ total = len(self._messages)
103
+ max_kept = self._head_size + self._tail_size
104
+
105
+ # No compaction needed
106
+ if total <= max_kept:
107
+ return list(self._messages)
108
+
109
+ # Return head + tail
110
+ head = self._messages[: self._head_size]
111
+ tail = self._messages[-self._tail_size :] if self._tail_size > 0 else []
112
+
113
+ return head + tail
114
+
115
+ @classmethod
116
+ async def deserialize(
117
+ cls,
118
+ serialized_store_state: MutableMapping[str, Any],
119
+ **kwargs: Any,
120
+ ) -> "HeadTailCompactingChatMessageStore":
121
+ """Create store from serialized state."""
122
+ from agent_framework import ChatMessage
123
+
124
+ head_size = kwargs.get("head_size", serialized_store_state.get("head_size", 10))
125
+ tail_size = kwargs.get("tail_size", serialized_store_state.get("tail_size", 40))
126
+
127
+ messages_data = serialized_store_state.get("messages", [])
128
+ messages = [
129
+ ChatMessage.from_dict(m) if isinstance(m, dict) else m
130
+ for m in messages_data
131
+ ]
132
+
133
+ return cls(messages=messages, head_size=head_size, tail_size=tail_size)
134
+
135
+ async def update_from_state(
136
+ self,
137
+ serialized_store_state: MutableMapping[str, Any],
138
+ **kwargs: Any,
139
+ ) -> None:
140
+ """Update store from serialized state."""
141
+ from agent_framework import ChatMessage
142
+
143
+ if not serialized_store_state:
144
+ return
145
+
146
+ messages_data = serialized_store_state.get("messages", [])
147
+ self._messages = [
148
+ ChatMessage.from_dict(m) if isinstance(m, dict) else m
149
+ for m in messages_data
150
+ ]
151
+
152
+ if "head_size" in serialized_store_state:
153
+ self._head_size = serialized_store_state["head_size"]
154
+ if "tail_size" in serialized_store_state:
155
+ self._tail_size = serialized_store_state["tail_size"]
156
+
157
+ async def serialize(self, **kwargs: Any) -> dict[str, Any]:
158
+ """Serialize the store state.
159
+
160
+ Serializes ALL messages (not just compacted view) plus configuration.
161
+ """
162
+ return {
163
+ "messages": [m.to_dict() for m in self._messages],
164
+ "head_size": self._head_size,
165
+ "tail_size": self._tail_size,
166
+ }
167
+
168
+ @property
169
+ def stats(self) -> dict[str, int]:
170
+ """Get compaction statistics."""
171
+ return {
172
+ "total_messages": self.total_messages,
173
+ "compacted_count": self.compacted_count,
174
+ "dropped_count": self.dropped_count,
175
+ "head_size": self._head_size,
176
+ "tail_size": self._tail_size,
177
+ }
src/flow/prompts.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompts for the Flow agent.
2
+
3
+ Defines the structured workflow for software engineering tasks.
4
+ """
5
+
6
+ FLOW_AGENT_INSTRUCTIONS = """
7
+ You are an expert autonomous agent. You solve problems end-to-end by composing your available tools.
8
+
9
+ ## CORE PRINCIPLE: BE AUTONOMOUS
10
+
11
+ **You are NOT just an assistant that tells users what to do. You ARE the one who does it.**
12
+
13
+ When asked to solve a task:
14
+ 1. **DO IT YOURSELF** - Don't tell the user to run commands. Run them yourself.
15
+ 2. **COMPLETE THE LOOP** - Write code AND execute it. Don't stop at writing.
16
+ 3. **VERIFY YOUR WORK** - Test that it actually works before reporting done.
17
+ 4. **ITERATE ON FAILURES** - If something fails, fix it and try again.
18
+
19
+ **Example - BAD (passive):**
20
+ > "Here's the code. You can run it with `python script.py`"
21
+
22
+ **Example - GOOD (autonomous):**
23
+ > *writes code* → *executes code* → *sees output* → *fixes any errors*
24
+ > → "Done! The script ran successfully and output X."
25
+
26
+ ---
27
+
28
+ ## YOUR CAPABILITIES
29
+
30
+ **Coding Tools:**
31
+ - `read_file`: Read file contents with line numbers
32
+ - `write_file`: Create/edit files (full write, str_replace, or insert_at_line)
33
+ - `list_directory`: Explore project structure
34
+ - `grep_search`: Search for patterns in code (regex supported)
35
+
36
+ **Execution Tools:**
37
+ - `bash_execute`: Run shell commands (tests, git, npm, pip, builds, etc.)
38
+ - `python_repl`: Execute Python code snippets for quick validation
39
+
40
+ **Research Tools (if available):**
41
+ - `web_search`: Search the web using Google (requires GOOGLE_API_KEY and GOOGLE_CSE_ID)
42
+ - `web_fetch`: Fetch and read content from URLs
43
+
44
+ **Memory Tools:**
45
+ - `memory`: Persistent storage that survives across conversations
46
+ - view: See directory or file contents
47
+ - create: Create new files
48
+ - str_replace: Edit existing files
49
+ - append: Add to files
50
+ - search: Find text across memory
51
+ - delete: Remove files
52
+
53
+ **Thinking Tools:**
54
+ - `think`: Pause to reason through complex problems
55
+ - `task_done`: Report when task is complete or blocked
56
+
57
+ **Skills Tool (if available):**
58
+ - `skills`: Discover and load domain-specific expertise
59
+ - `skills(action='list')`: See available skills with descriptions
60
+ - `skills(action='load', name='skill-name')`: Load full skill content
61
+
62
+ ---
63
+
64
+ ## WORKFLOW
65
+
66
+ ### 1. UNDERSTAND
67
+ - Read the user's request carefully
68
+ - **If the `skills` tool is available**, call `skills(action='list')` to discover relevant expertise
69
+ - Use `list_directory` to understand the workspace structure
70
+ - Use `grep_search` to find relevant existing code
71
+ - Check memory for relevant patterns: `memory(command="view", path="/memory")`
72
+
73
+ ### 2. PLAN
74
+ - Use `think` tool to plan your approach for complex tasks
75
+ - Break down into small, testable steps
76
+ - Consider edge cases and error handling
77
+
78
+ ### 3. EXECUTE
79
+ - Create/edit files using `write_file`
80
+ - Test changes using `bash_execute` or `python_repl`
81
+ - Fix issues immediately when tests fail
82
+
83
+ ### 4. VERIFY (REQUIRED)
84
+ **You MUST test your work before calling `task_done`.** Never assume code works.
85
+
86
+ **For Python apps/scripts:**
87
+ ```
88
+ bash_execute("cd project && python -c 'import main'") # Check imports work
89
+ bash_execute("cd project && python main.py --help") # Test CLI if applicable
90
+ bash_execute("cd project && pytest") # Run tests if they exist
91
+ ```
92
+
93
+ **For JavaScript/TypeScript:**
94
+ ```
95
+ bash_execute("cd project && npm install && npm run build") # Must pass!
96
+ bash_execute("cd project && npx tsc --noEmit") # Type check
97
+ ```
98
+
99
+ **For Web APIs (FastAPI, Express, etc.):**
100
+ ```
101
+ # Start server in background, test with curl, then cleanup
102
+ bash_execute("cd project && uvicorn main:app --port 8000 &", background=True)
103
+ bash_execute("sleep 2 && curl http://localhost:8000/health") # Test endpoint
104
+ bash_execute("check_processes action=list") # Verify it's running
105
+ # When done testing, kill the process
106
+ ```
107
+
108
+ **For Frontend apps (React, Vue, etc.):**
109
+ ```
110
+ bash_execute("cd project && npm run build") # Production build must succeed
111
+ # If you need to test dev server, use background=True
112
+ ```
113
+
114
+ **For full-stack apps:**
115
+ 1. Test backend API with curl (start in background)
116
+ 2. Test frontend build succeeds
117
+ 3. Clean up background processes when done
118
+
119
+ ### 5. COMPLETE
120
+ - Clean up any background processes you started
121
+ - Call `task_done` with status and summary
122
+ - Include files created and suggested next steps
123
+
124
+ ---
125
+
126
+ ## WORKSPACE
127
+
128
+ Your workspace is at `~/.flow/workspace/`
129
+
130
+ **Organization:**
131
+ - Create a folder for each project (e.g., `todo_app/`, `calculator/`)
132
+ - Use `list_directory` to see existing projects before creating new ones
133
+ - Follow standard project structure conventions:
134
+ - Python: `src/`, `tests/`, `requirements.txt` or `pyproject.toml`
135
+ - JavaScript: `src/`, `package.json`, standard Node.js layout
136
+ - Full-stack: `backend/`, `frontend/` folders
137
+
138
+ **Important:**
139
+ - Each `bash_execute` runs from workspace root in a fresh shell
140
+ - Use `cd project && command` for commands in subdirectories
141
+ - Multiple commands: `cd project && cmd1 && cmd2`
142
+
143
+ ---
144
+
145
+ ## MEMORY
146
+
147
+ Your memory persists at `~/.flow/memory/`
148
+
149
+ **Recommended structure:**
150
+ - `/memory/patterns/` - Reusable solutions and code patterns
151
+ - `/memory/projects/` - Per-project context and notes
152
+ - `/memory/decisions/` - Why you made certain choices
153
+
154
+ **Best practices:**
155
+ When storing information, include context:
156
+ - **Date**: When was this created/learned?
157
+ - **Project**: What project did this come from?
158
+ - **Context**: Why was this approach chosen?
159
+
160
+ **Example pattern file** (`/memory/patterns/fastapi_cors.md`):
161
+ ```markdown
162
+ # FastAPI CORS Setup
163
+ Created: 2025-01-15
164
+ Source: sleep_tracker project
165
+
166
+ ## Pattern
167
+ from fastapi.middleware.cors import CORSMiddleware
168
+ app.add_middleware(
169
+ CORSMiddleware,
170
+ allow_origins=["*"],
171
+ allow_methods=["*"],
172
+ allow_headers=["*"],
173
+ )
174
+
175
+ ## When to use
176
+ - Full-stack apps with separate frontend/backend
177
+ - Frontend on different port than backend
178
+
179
+ ## Notes
180
+ - Must add before routes
181
+ - Restrict origins in production
182
+ ```
183
+
184
+ **Check memory first** - you may have solved similar problems before!
185
+
186
+ ---
187
+
188
+ ## CLI TOOLS
189
+
190
+ Many CLI tools have interactive prompts that will hang.
191
+ ALWAYS use non-interactive flags:
192
+
193
+ ```bash
194
+ # Good
195
+ npm create vite@latest myapp -- --template react-ts
196
+ pip install -q package
197
+ npx shadcn@latest init --defaults --yes
198
+
199
+ # Bad (will hang)
200
+ npm create vite@latest myapp # Interactive prompts
201
+ npx shadcn init # Interactive prompts
202
+ ```
203
+
204
+ **Shadcn UI** is a CLI tool, not an npm package:
205
+ ```bash
206
+ # Wrong
207
+ npm install @shadcn/ui
208
+
209
+ # Right
210
+ npx shadcn@latest init --defaults --yes
211
+ npx shadcn@latest add button card --yes
212
+ ```
213
+
214
+ ---
215
+
216
+ ## FULL-STACK APPS
217
+
218
+ When building apps with separate frontend and backend:
219
+
220
+ 1. **Always add CORS to backend:**
221
+ ```python
222
+ from fastapi.middleware.cors import CORSMiddleware
223
+ app.add_middleware(
224
+ CORSMiddleware,
225
+ allow_origins=["*"], # Restrict in production
226
+ allow_methods=["*"],
227
+ allow_headers=["*"],
228
+ )
229
+ ```
230
+
231
+ 2. **Document which ports each server uses**
232
+
233
+ 3. **Verify both sides build/run:**
234
+ ```bash
235
+ cd backend && python -c "from main import app; print('Backend OK')"
236
+ cd frontend && npm run build && echo "Frontend OK"
237
+ ```
238
+
239
+ ---
240
+
241
+ ## BACKGROUND PROCESSES
242
+
243
+ When you need to start long-running processes (servers, watchers, etc.):
244
+
245
+ **Use `background=True` parameter:**
246
+ ```python
247
+ # Start a server in background - returns immediately with PID
248
+ bash_execute("uvicorn main:app --port 8000", background=True)
249
+
250
+ # Then test it
251
+ bash_execute("curl http://localhost:8000/health")
252
+
253
+ # Check what's running
254
+ check_processes(action="list")
255
+
256
+ # Clean up when done
257
+ check_processes(action="kill", pid=12345)
258
+ ```
259
+
260
+ **Process registry** is at `/memory/processes.md` - view it with:
261
+ `memory(command='view', path='/memory/processes.md')`
262
+
263
+ **IMPORTANT:**
264
+ - NEVER start servers without `background=True` - they will timeout after 120s
265
+ - ALWAYS clean up background processes when done testing
266
+ - Check for port conflicts before starting servers
267
+
268
+ **Common patterns:**
269
+ ```bash
270
+ # Good - background server for testing
271
+ bash_execute("cd backend && uvicorn main:app --port 8000", background=True)
272
+ bash_execute("sleep 2") # Wait for startup
273
+ bash_execute("curl localhost:8000/docs") # Test
274
+ check_processes(action="cleanup") # Kill all when done
275
+
276
+ # Bad - will timeout!
277
+ bash_execute("uvicorn main:app --port 8000") # Blocks forever
278
+ ```
279
+
280
+ ---
281
+
282
+ ## ERROR HANDLING
283
+
284
+ - If a command fails, analyze the error and try alternatives
285
+ - Log failures and solutions to memory for future reference
286
+ - Don't give up after first failure - iterate
287
+ - If truly blocked, call `task_done` with status="incomplete" and explain why
288
+
289
+ ---
290
+
291
+ ## SKILLS
292
+
293
+ **If the `skills` tool is available**, use it to access domain-specific expertise:
294
+
295
+ ```python
296
+ # At the start of complex tasks, discover what expertise is available
297
+ skills(action='list')
298
+
299
+ # Output shows available skills with descriptions:
300
+ # - fastapi-patterns: Build REST APIs with FastAPI...
301
+ # - react-components: Build React components with hooks...
302
+ # - testing-strategies: Write comprehensive tests...
303
+
304
+ # Load relevant skills before implementation
305
+ skills(action='load', name='fastapi-patterns')
306
+ ```
307
+
308
+ **Skills provide:**
309
+ - Domain-specific patterns and best practices
310
+ - Code examples and templates
311
+ - Common pitfalls to avoid
312
+
313
+ **When to load skills:**
314
+ - Before starting a new project type (API, frontend, CLI)
315
+ - When working with unfamiliar frameworks
316
+ - For complex tasks requiring specialized knowledge
317
+
318
+ **Skills location:** `~/.flow/skills/`
319
+ Each skill is a folder with a `SKILL.md` file following the Anthropic Skills standard.
320
+
321
+ ---
322
+
323
+ ## COMPOSING TOOLS FOR COMPLEX TASKS
324
+
325
+ **You have all the tools needed to solve problems end-to-end. Compose them!**
326
+
327
+ ### Example: "What's the weather API response for Seattle?"
328
+ ```
329
+ # DON'T just tell the user how to do it. DO IT:
330
+ 1. web_search("weather API free") → Find a free weather API
331
+ 2. web_fetch(api_docs_url) → Read the API documentation
332
+ 3. write_file("weather.py", code) → Write a script to call the API
333
+ 4. bash_execute("python weather.py") → Run it and get the answer
334
+ 5. Report the actual result to the user
335
+ ```
336
+
337
+ ### Example: "Create a CLI tool that converts CSV to JSON"
338
+ ```
339
+ 1. write_file("csv_to_json.py", code) → Write the tool
340
+ 2. write_file("test.csv", sample_data) → Create test data
341
+ 3. bash_execute("python csv_to_json.py test.csv") → Test it works
342
+ 4. bash_execute("cat output.json") → Verify the output
343
+ 5. Report success with example output
344
+ ```
345
+
346
+ ### Example: "Find and summarize the latest Python 3.12 features"
347
+ ```
348
+ 1. web_search("Python 3.12 new features") → Find relevant pages
349
+ 2. web_fetch(python_docs_url) → Read the official docs
350
+ 3. Summarize findings directly OR write to a file if requested
351
+ ```
352
+
353
+ ### Example: "Debug why my FastAPI app returns 500 errors"
354
+ ```
355
+ 1. read_file("main.py") → Understand the code
356
+ 2. bash_execute("cd app && python -c 'from main import app'") → Check imports
357
+ 3. bash_execute("cd app && uvicorn main:app --port 8000", background=True) → Start server
358
+ 4. bash_execute("curl localhost:8000/endpoint") → Reproduce the error
359
+ 5. Analyze error → Fix code → Test again → Iterate until fixed
360
+ ```
361
+
362
+ ---
363
+
364
+ ## RESEARCH WORKFLOW
365
+
366
+ When you need information from the web:
367
+
368
+ 1. **Search first**: Use `web_search` to find relevant URLs
369
+ 2. **Fetch details**: Use `web_fetch` to read specific pages
370
+ 3. **Apply knowledge**: Write code, update configs, or summarize findings
371
+
372
+ **Example - Learning a new library:**
373
+ ```python
374
+ # 1. Search for docs
375
+ web_search("httpx python async http client tutorial")
376
+
377
+ # 2. Read the documentation
378
+ web_fetch("https://www.python-httpx.org/quickstart/", output_format="markdown")
379
+
380
+ # 3. Write code using what you learned
381
+ write_file("http_client.py", '''
382
+ import httpx
383
+ async def fetch_data(url):
384
+ async with httpx.AsyncClient() as client:
385
+ return await client.get(url)
386
+ ''')
387
+
388
+ # 4. Test it
389
+ python_repl("import httpx; print(httpx.__version__)")
390
+ ```
391
+
392
+ ---
393
+
394
+ ## REMEMBER
395
+
396
+ 1. **BE AUTONOMOUS** - Do the work yourself, don't instruct the user
397
+ 2. **COMPLETE THE LOOP** - Write code → Execute → Verify → Report results
398
+ 3. **COMPOSE TOOLS** - Chain multiple tools to solve complex problems
399
+ 4. **RESEARCH WHEN NEEDED** - Use web_search/web_fetch to learn new things
400
+ 5. **ITERATE ON FAILURES** - Don't give up, debug and fix issues
401
+ 6. **TEST EVERYTHING** - Never assume code works
402
+ 7. **USE NON-INTERACTIVE FLAGS** - Avoid hanging commands
403
+ 8. **CLEAN UP** - Kill background processes when done
404
+ 9. **STORE LEARNINGS** - Save patterns to memory for future use
405
+
406
+ **Your goal is to deliver RESULTS, not instructions.**
407
+ """
src/flow/py.typed ADDED
File without changes
src/flow/tools/__init__.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flow agent tools.
2
+
3
+ Provides coding, execution, memory, and core tools for software engineering tasks.
4
+ Tools are harness-agnostic - they return plain data that harnesses adapt.
5
+ """
6
+
7
import inspect
import types
from collections.abc import Callable, Sequence
from functools import wraps
from pathlib import Path
from typing import Any, Union, get_args, get_origin, get_type_hints
12
+
13
+ from flow.tools.coding import create_coding_tools
14
+ from flow.tools.core import create_core_tools
15
+ from flow.tools.execution import create_execution_tools
16
+ from flow.tools.memory import create_memory_tool
17
+ from flow.tools.sub_agent import create_sub_agent_tool
18
+
19
+ __all__ = [
20
+ "create_all_tools",
21
+ "create_coding_tools",
22
+ "create_core_tools",
23
+ "create_execution_tools",
24
+ "create_memory_tool",
25
+ "create_sub_agent_tool",
26
+ "get_tool_schema",
27
+ "tool",
28
+ ]
29
+
30
+
31
+ def tool(
32
+ name: str | None = None,
33
+ description: str | None = None,
34
+ ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
35
+ """Decorator to mark a function as an agent tool.
36
+
37
+ This decorator adds metadata to functions that allows harnesses
38
+ to discover and use them as agent tools.
39
+
40
+ Args:
41
+ name: Tool name (defaults to function name)
42
+ description: Tool description (defaults to docstring)
43
+
44
+ Returns:
45
+ Decorated function with tool metadata
46
+
47
+ Example:
48
+ @tool(name="read_file", description="Read file contents")
49
+ async def read_file(path: str) -> str:
50
+ ...
51
+ """
52
+
53
+ def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
54
+ @wraps(func)
55
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
56
+ return func(*args, **kwargs)
57
+
58
+ # Store tool metadata
59
+ wrapper._tool_name = name or func.__name__ # type: ignore[attr-defined]
60
+ wrapper._tool_description = description or func.__doc__ or "" # type: ignore[attr-defined]
61
+ wrapper._is_tool = True # type: ignore[attr-defined]
62
+
63
+ return wrapper
64
+
65
+ return decorator
66
+
67
+
68
+ def get_tool_schema(func: Callable[..., Any]) -> dict[str, Any]:
69
+ """Extract JSON schema from a tool function.
70
+
71
+ Uses type hints and Annotated metadata to build the schema.
72
+
73
+ Args:
74
+ func: Tool function to extract schema from
75
+
76
+ Returns:
77
+ JSON schema dict for the tool's parameters
78
+ """
79
+ hints = get_type_hints(func, include_extras=True)
80
+ sig = inspect.signature(func)
81
+
82
+ properties: dict[str, Any] = {}
83
+ required: list[str] = []
84
+
85
+ for param_name, param in sig.parameters.items():
86
+ if param_name in ("self", "cls"):
87
+ continue
88
+
89
+ param_schema: dict[str, Any] = {}
90
+ hint = hints.get(param_name, Any)
91
+
92
+ # Handle Annotated types
93
+ origin = getattr(hint, "__origin__", None)
94
+ if origin is not None:
95
+ # Check if it's Annotated
96
+ if hasattr(hint, "__metadata__"):
97
+ # Extract description from Annotated metadata
98
+ for meta in hint.__metadata__:
99
+ if isinstance(meta, str):
100
+ param_schema["description"] = meta
101
+ break
102
+ # Get the actual type
103
+ hint = hint.__args__[0]
104
+ origin = getattr(hint, "__origin__", None)
105
+
106
+ # Map Python types to JSON schema types
107
+ if hint is str:
108
+ param_schema["type"] = "string"
109
+ elif hint is int:
110
+ param_schema["type"] = "integer"
111
+ elif hint is float:
112
+ param_schema["type"] = "number"
113
+ elif hint is bool:
114
+ param_schema["type"] = "boolean"
115
+ elif origin is list:
116
+ param_schema["type"] = "array"
117
+ elif origin is dict:
118
+ param_schema["type"] = "object"
119
+ else:
120
+ param_schema["type"] = "string" # Default fallback
121
+
122
+ properties[param_name] = param_schema
123
+
124
+ # Check if parameter is required (no default value)
125
+ if param.default is inspect.Parameter.empty:
126
+ required.append(param_name)
127
+
128
+ return {
129
+ "type": "object",
130
+ "properties": properties,
131
+ "required": required,
132
+ }
133
+
134
+
135
def create_all_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
    *,
    enable_memory_tool: bool = True,
    enable_sub_agent: bool = False,
    sub_agent_model: str = "gpt-4o-mini",
) -> Sequence[Callable[..., Any]]:
    """Create all standard tools for the Flow agent.

    Args:
        workspace: Root directory for file operations
        memory_path: Directory for persistent memory
        bash_timeout: Timeout for bash commands in seconds
        enable_memory_tool: Whether to include the memory tool
        enable_sub_agent: Whether to include the sub-agent research tool
        sub_agent_model: Model to use for sub-agent (default: gpt-4o-mini)

    Returns:
        List of all tool functions
    """
    # Coding, execution, and core tools are unconditional.
    all_tools: list[Callable[..., Any]] = [
        *create_coding_tools(workspace),
        *create_execution_tools(workspace, memory_path, bash_timeout),
        *create_core_tools(),
    ]

    # Agent-managed memory tool is opt-out.
    if enable_memory_tool:
        all_tools.append(create_memory_tool(memory_path))

    # Sub-agent for isolated research is opt-in.
    if enable_sub_agent:
        all_tools.append(create_sub_agent_tool(workspace, model=sub_agent_model))

    return all_tools
src/flow/tools/coding.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Coding tools for file operations and code search.
2
+
3
+ These tools enable agents to read/write files, list directories,
4
+ and search for patterns in code.
5
+
6
+ The agent can read and write to any path the user has access to.
7
+ The workspace serves as the default working directory for relative paths.
8
+ """
9
+
10
+ import re
11
+ from collections.abc import Callable, Coroutine, Sequence
12
+ from pathlib import Path
13
+ from typing import Annotated, Any
14
+
15
+
16
def create_read_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build a read_file tool bound to *workspace*.

    Args:
        workspace: Default directory for relative paths (not a restriction)
    """

    async def read_file(
        file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
        max_lines: Annotated[int, "Maximum lines to return (default: 500)"] = 500,
    ) -> str:
        """Read the contents of a file. Can read from any path on the system."""
        try:
            # Relative paths resolve against the workspace; absolute paths as-is.
            candidate = Path(file_path)
            resolved = candidate.resolve() if candidate.is_absolute() else (workspace / file_path).resolve()

            if not resolved.exists():
                return f"Error: File not found: {file_path}"
            if not resolved.is_file():
                return f"Error: Not a file: {file_path}"

            all_lines = resolved.read_text(encoding="utf-8").splitlines()
            total_lines = len(all_lines)

            # Cap the output at max_lines and note the truncation if it happened.
            shown = all_lines[:max_lines]
            suffix = (
                f"\n... (truncated, showing first {max_lines} of {total_lines} lines)"
                if total_lines > max_lines
                else ""
            )

            # Right-aligned 1-based line numbers, like a pager.
            numbered = "\n".join(f"{num:5d}: {text}" for num, text in enumerate(shown, start=1))
            return f"File: {resolved} ({total_lines} lines)\n{'=' * 40}\n{numbered}{suffix}"

        except UnicodeDecodeError:
            return f"Error: Cannot read file (binary or non-UTF-8): {file_path}"
        except PermissionError:
            return f"Error: Permission denied: {file_path}"
        except Exception as e:
            return f"Error reading file: {e}"

    # Metadata consumed by harnesses for tool discovery.
    read_file._tool_name = "read_file"  # type: ignore[attr-defined]
    read_file._tool_description = (  # type: ignore[attr-defined]
        "Read the contents of a file. Accepts absolute paths (e.g., /path/to/file) "
        "or relative paths (relative to workspace). Returns content with line numbers."
    )
    read_file._is_tool = True  # type: ignore[attr-defined]

    return read_file
75
+
76
+
77
+ def create_write_file_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
78
+ """Create a write_file tool.
79
+
80
+ Args:
81
+ workspace: Default directory for relative paths
82
+ """
83
+
84
+ async def write_file(
85
+ file_path: Annotated[str, "Path to the file (absolute or relative to workspace)"],
86
+ content: Annotated[str | None, "Full content to write (for complete file write)"] = None,
87
+ old_str: Annotated[str | None, "Text to replace (for str_replace operation)"] = None,
88
+ new_str: Annotated[str | None, "Replacement text (for str_replace operation)"] = None,
89
+ insert_line: Annotated[int | None, "Line number to insert at (1-indexed)"] = None,
90
+ insert_content: Annotated[str | None, "Content to insert at line"] = None,
91
+ ) -> str:
92
+ """Write or edit file content.
93
+
94
+ Supports: (1) full file write with 'content',
95
+ (2) str_replace to replace specific text,
96
+ (3) insert_at_line to add content at a specific line.
97
+ Creates parent directories if needed.
98
+ """
99
+ try:
100
+ # Support both absolute and relative paths
101
+ path = Path(file_path)
102
+ if path.is_absolute():
103
+ full_path = path.resolve()
104
+ else:
105
+ full_path = (workspace / file_path).resolve()
106
+
107
+ # Create parent directories
108
+ full_path.parent.mkdir(parents=True, exist_ok=True)
109
+
110
+ # Operation 1: Full file write
111
+ if content is not None:
112
+ full_path.write_text(content, encoding="utf-8")
113
+ return f"Successfully wrote {len(content)} characters to {file_path}"
114
+
115
+ # Operation 2: str_replace
116
+ if old_str is not None and new_str is not None:
117
+ if not full_path.exists():
118
+ return f"Error: File not found for str_replace: {file_path}"
119
+
120
+ current_content = full_path.read_text(encoding="utf-8")
121
+
122
+ if old_str not in current_content:
123
+ # Show a snippet of the file to help debug
124
+ if len(current_content) > 500:
125
+ snippet = current_content[:500] + "..."
126
+ else:
127
+ snippet = current_content
128
+ return (
129
+ f"Error: String to replace not found in file.\n"
130
+ f"Searching for: '{old_str[:100]}...'\n"
131
+ f"File content preview:\n{snippet}"
132
+ )
133
+
134
+ # Replace first occurrence only
135
+ new_content = current_content.replace(old_str, new_str, 1)
136
+ full_path.write_text(new_content, encoding="utf-8")
137
+ return f"Successfully replaced text in {file_path}"
138
+
139
+ # Operation 3: insert_at_line
140
+ if insert_line is not None and insert_content is not None:
141
+ if full_path.exists():
142
+ current_content = full_path.read_text(encoding="utf-8")
143
+ lines = current_content.splitlines(keepends=True)
144
+ else:
145
+ lines = []
146
+
147
+ # Ensure insert_content ends with newline
148
+ if not insert_content.endswith("\n"):
149
+ insert_content += "\n"
150
+
151
+ # Insert at specified line (1-indexed)
152
+ insert_index = insert_line - 1
153
+ if insert_index < 0:
154
+ return f"Error: Invalid line number: {insert_line}. Must be >= 1."
155
+
156
+ # Allow inserting at end
157
+ if insert_index > len(lines):
158
+ insert_index = len(lines)
159
+
160
+ lines.insert(insert_index, insert_content)
161
+ new_content = "".join(lines)
162
+ full_path.write_text(new_content, encoding="utf-8")
163
+ return f"Successfully inserted content at line {insert_line} in {file_path}"
164
+
165
+ return "Error: Must provide either 'content', 'old_str' + 'new_str', or 'insert_line' + 'insert_content'"
166
+
167
+ except Exception as e:
168
+ return f"Error writing file: {e}"
169
+
170
+ # Add tool metadata
171
+ write_file._tool_name = "write_file" # type: ignore[attr-defined]
172
+ write_file._tool_description = ( # type: ignore[attr-defined]
173
+ "Write or edit file content. Accepts absolute paths or relative paths (relative to workspace). "
174
+ "Supports: (1) full file write with 'content', (2) str_replace to replace specific text, "
175
+ "(3) insert_at_line to add content at a specific line. Creates parent directories if needed."
176
+ )
177
+ write_file._is_tool = True # type: ignore[attr-defined]
178
+
179
+ return write_file
180
+
181
+
182
def create_list_directory_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Build a list_directory tool bound to *workspace*.

    Args:
        workspace: Default directory for relative paths (not a restriction)
    """

    async def list_directory(
        directory_path: Annotated[str, "Path to directory (absolute or relative to workspace, default: '.')"] = ".",
        recursive: Annotated[bool, "List subdirectories recursively (default: false)"] = False,
        max_entries: Annotated[int, "Maximum entries to return (default: 200)"] = 200,
    ) -> str:
        """List files and directories at a given path. Can list any directory on the system."""
        try:
            # Relative paths resolve against the workspace; absolute paths as-is.
            raw = Path(directory_path)
            target = raw.resolve() if raw.is_absolute() else (workspace / directory_path).resolve()

            if not target.exists():
                return f"Error: Directory not found: {directory_path}"
            if not target.is_dir():
                return f"Error: Not a directory: {directory_path}"

            listing: list[tuple[str, str, int]] = []

            if recursive:
                # Skip common vendored/cache directories when walking recursively.
                skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
                for child in target.rglob("*"):
                    if len(listing) >= max_entries:
                        break
                    if any(part in child.parts for part in skip_dirs):
                        continue
                    kind = "file" if child.is_file() else "dir"
                    byte_size = child.stat().st_size if kind == "file" else 0
                    listing.append((str(child.relative_to(target)), kind, byte_size))
            else:
                for child in target.iterdir():
                    if len(listing) >= max_entries:
                        break
                    kind = "file" if child.is_file() else "dir"
                    byte_size = child.stat().st_size if kind == "file" else 0
                    listing.append((child.name, kind, byte_size))

            # Directories first, then alphabetical by name.
            listing.sort(key=lambda entry: (entry[1] != "dir", entry[0]))

            out = [f"Directory: {directory_path} ({len(listing)} entries)", "=" * 50]
            for name, kind, byte_size in listing:
                if kind == "dir":
                    out.append(f"  [DIR]  {name}/")
                else:
                    size_str = f"{byte_size:,} bytes" if byte_size < 10000 else f"{byte_size / 1024:.1f} KB"
                    out.append(f"  [FILE] {name} ({size_str})")

            if len(listing) >= max_entries:
                out.append(f"\n... (truncated at {max_entries} entries)")

            return "\n".join(out)

        except Exception as e:
            return f"Error listing directory: {e}"

    # Metadata consumed by harnesses for tool discovery.
    list_directory._tool_name = "list_directory"  # type: ignore[attr-defined]
    list_directory._tool_description = (  # type: ignore[attr-defined]
        "List files and directories at a given path. Accepts absolute paths (e.g., /path/to/dir) "
        "or relative paths (relative to workspace). Returns names, types, and sizes."
    )
    list_directory._is_tool = True  # type: ignore[attr-defined]

    return list_directory
262
+
263
+
264
+ def create_grep_search_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
265
+ """Create a grep_search tool that can search any directory.
266
+
267
+ Args:
268
+ workspace: Default directory for relative paths (not a restriction)
269
+ """
270
+
271
+ async def grep_search(
272
+ pattern: Annotated[str, "Pattern to search for (regex supported)"],
273
+ path: Annotated[str, "Path to search in (absolute or relative to workspace, default: '.')"] = ".",
274
+ file_pattern: Annotated[str | None, "File pattern to filter (e.g., '*.py', '*.js')"] = None,
275
+ case_sensitive: Annotated[bool, "Case sensitive search (default: true)"] = True,
276
+ max_matches: Annotated[int, "Maximum matches to return (default: 50)"] = 50,
277
+ ) -> str:
278
+ """Search for text patterns in files. Can search any path on the system."""
279
+ try:
280
+ # Support both absolute and relative paths
281
+ search_path = Path(path)
282
+ if search_path.is_absolute():
283
+ full_path = search_path.resolve()
284
+ else:
285
+ full_path = (workspace / path).resolve()
286
+
287
+ if not full_path.exists():
288
+ return f"Error: Path not found: {path}"
289
+
290
+ # Compile regex
291
+ flags = 0 if case_sensitive else re.IGNORECASE
292
+ try:
293
+ regex = re.compile(pattern, flags)
294
+ except re.error as e:
295
+ return f"Error: Invalid regex pattern: {e}"
296
+
297
+ matches: list[dict[str, Any]] = []
298
+
299
+ # Get files to search
300
+ if full_path.is_file():
301
+ files = [full_path]
302
+ else:
303
+ if file_pattern:
304
+ files = list(full_path.rglob(file_pattern))
305
+ else:
306
+ files = [f for f in full_path.rglob("*") if f.is_file()]
307
+
308
+ # Search each file
309
+ for file_path_item in files:
310
+ if len(matches) >= max_matches:
311
+ break
312
+
313
+ # Skip common non-essential directories and binary files
314
+ skip_dirs = ["node_modules", "__pycache__", ".git", "venv", ".venv"]
315
+ if any(part in file_path_item.parts for part in skip_dirs):
316
+ continue
317
+
318
+ try:
319
+ # Skip large files (> 1MB)
320
+ if file_path_item.stat().st_size > 1_000_000:
321
+ continue
322
+
323
+ file_content = file_path_item.read_text(encoding="utf-8", errors="ignore")
324
+ lines = file_content.splitlines()
325
+
326
+ for line_num, line in enumerate(lines, 1):
327
+ if len(matches) >= max_matches:
328
+ break
329
+ if regex.search(line):
330
+ # Compute relative path from search root
331
+ try:
332
+ rel_path = file_path_item.relative_to(full_path)
333
+ except ValueError:
334
+ # If file is the search path itself, use filename
335
+ rel_path = file_path_item.name
336
+ matches.append({
337
+ "file": str(rel_path),
338
+ "line": line_num,
339
+ "text": line.strip()[:200],
340
+ })
341
+ except (UnicodeDecodeError, PermissionError):
342
+ continue
343
+
344
+ # Format output
345
+ if not matches:
346
+ return f"No matches found for pattern '{pattern}' in {path}"
347
+
348
+ result_lines = [f"Found {len(matches)} match(es) for '{pattern}'"]
349
+ result_lines.append("=" * 50)
350
+
351
+ for match in matches:
352
+ result_lines.append(f"{match['file']}:{match['line']}: {match['text']}")
353
+
354
+ if len(matches) >= max_matches:
355
+ result_lines.append(f"\n... (truncated at {max_matches} matches)")
356
+
357
+ return "\n".join(result_lines)
358
+
359
+ except Exception as e:
360
+ return f"Error searching: {e}"
361
+
362
+ # Add tool metadata
363
+ grep_search._tool_name = "grep_search" # type: ignore[attr-defined]
364
+ grep_search._tool_description = ( # type: ignore[attr-defined]
365
+ "Search for text patterns in files. Accepts absolute paths (e.g., /path/to/dir) "
366
+ "or relative paths (relative to workspace). Supports regex patterns and file filtering."
367
+ )
368
+ grep_search._is_tool = True # type: ignore[attr-defined]
369
+
370
+ return grep_search
371
+
372
+
373
def create_coding_tools(workspace: Path) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Create all coding tools bound to a workspace.

    Args:
        workspace: Root directory for file operations

    Returns:
        List of coding tool functions
    """
    # Resolve once so every tool sees the same canonical workspace root.
    root = Path(workspace).resolve()
    factories = (
        create_read_file_tool,
        create_write_file_tool,
        create_list_directory_tool,
        create_grep_search_tool,
    )
    return [factory(root) for factory in factories]
390
+
391
+
src/flow/tools/core.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core metacognitive tools for agent reasoning and task management.
2
+
3
+ These tools enable agents to think explicitly, track task status,
4
+ and make structured decisions during complex software engineering tasks.
5
+ """
6
+
7
+ from collections.abc import Callable, Coroutine, Sequence
8
+ from typing import Annotated, Any, Literal
9
+
10
+
11
async def think(
    thought: Annotated[
        str,
        (
            "Your detailed reasoning about the current situation. "
            "Include: what you've learned, options you're considering, "
            "potential risks, and your planned approach."
        ),
    ],
) -> str:
    """Use this tool to pause and think through a complex problem.

    Helpful when: (1) analyzing tool results, (2) planning multi-step approaches,
    (3) making design decisions, (4) debugging issues, (5) avoiding mistakes.
    Your reasoning is recorded and helps structure your approach.
    """
    # Echo back at most the first 300 characters; the value is in giving
    # the LLM dedicated space to reason, not in the return payload.
    summary = f"{thought[:300]}..." if len(thought) > 300 else thought
    return f"Thought recorded: {summary}"
30
+
31
+
32
+ async def task_done(
33
+ status: Annotated[
34
+ Literal["complete", "incomplete"],
35
+ "'complete' if task finished successfully, 'incomplete' if blocked or needs input",
36
+ ],
37
+ summary: Annotated[
38
+ str,
39
+ (
40
+ "Summary of what was accomplished. "
41
+ "If complete: what was done and how to use/test it. "
42
+ "If incomplete: what's blocking and what's needed."
43
+ ),
44
+ ],
45
+ files_created: Annotated[
46
+ list[str] | None,
47
+ "List of files created or modified (if any)",
48
+ ] = None,
49
+ next_steps: Annotated[
50
+ list[str] | None,
51
+ "Suggested next steps for the user (if any)",
52
+ ] = None,
53
+ ) -> str:
54
+ """Call this when you have completed the user's task.
55
+
56
+ Provide a summary of what was accomplished and any relevant details.
57
+ Use 'complete' if all requirements are satisfied,
58
+ 'incomplete' if blocked or need more information.
59
+ """
60
+ result_lines = [
61
+ f"Task Status: {status.upper()}",
62
+ "",
63
+ "Summary:",
64
+ summary,
65
+ ]
66
+
67
+ if files_created:
68
+ result_lines.extend([
69
+ "",
70
+ "Files Created/Modified:",
71
+ *[f" - {f}" for f in files_created],
72
+ ])
73
+
74
+ if next_steps:
75
+ result_lines.extend([
76
+ "",
77
+ "Suggested Next Steps:",
78
+ *[f" - {step}" for step in next_steps],
79
+ ])
80
+
81
+ return "\n".join(result_lines)
82
+
83
+
84
# Attach tool metadata so harnesses can discover these module-level tools.
# Names match the function names; descriptions come from the docstrings.
for _core_tool in (think, task_done):
    _core_tool._tool_name = _core_tool.__name__  # type: ignore[attr-defined]
    _core_tool._tool_description = _core_tool.__doc__ or ""  # type: ignore[attr-defined]
    _core_tool._is_tool = True  # type: ignore[attr-defined]
92
+
93
+
94
def create_core_tools() -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Create all core metacognitive tools.

    Returns:
        List of core tool functions
    """
    # Both tools are stateless module-level coroutines; return fresh list.
    return list((think, task_done))
src/flow/tools/execution.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Execution tools for running commands and code.
2
+
3
+ These tools enable agents to execute bash commands and Python code
4
+ with safety controls (timeouts, output limits), and manage background processes.
5
+ """
6
+
7
+ import asyncio
8
+ import os
9
+ import re
10
+ import signal
11
+ import sys
12
+ from collections.abc import Callable, Coroutine, Sequence
13
+ from datetime import datetime
14
+ from io import StringIO
15
+ from pathlib import Path
16
+ from typing import Annotated, Any, Literal
17
+
18
+
19
+ def _get_process_registry_path(memory_path: Path) -> Path:
20
+ """Get the path to the process registry file in memory."""
21
+ return memory_path / "processes.md"
22
+
23
+
24
def _ensure_process_registry(memory_path: Path) -> Path:
    """Create the process registry file (with its template sections) if absent.

    Returns the registry path in either case.
    """
    registry = _get_process_registry_path(memory_path)
    registry.parent.mkdir(parents=True, exist_ok=True)

    if registry.exists():
        return registry

    # Seed the registry with the sections the other helpers expect.
    template = (
        "# Background Processes\n\n"
        "This file tracks background processes started by the Flow agent.\n"
        "You can view this file with `memory(command='view', path='/memory/processes.md')`\n\n"
        "## Running\n\n"
        "## Stopped\n\n"
    )
    registry.write_text(template)
    return registry
38
+
39
+
40
def _add_process_to_registry(
    memory_path: Path,
    pid: int,
    command: str,
    workspace: str,
    log_file: str,
    port: int | None = None,
) -> None:
    """Record a newly started background process in the registry.

    Entries use a markdown checklist format: an unchecked box means the
    process is believed to be running.

    Args:
        memory_path: Root of the agent memory directory.
        pid: OS process id of the background process.
        command: Shell command that was launched.
        workspace: Workspace directory the process was started from.
        log_file: Path of the file capturing the process output.
        port: Port the process listens on, if known; when omitted it is
            heuristically extracted from the command line.
    """
    registry_path = _ensure_process_registry(memory_path)
    content = registry_path.read_text()

    # Best-effort port detection when the caller did not supply one.
    if port is None:
        port_match = re.search(r"(?:--port|-p)\s+(\d+)", command)
        if port_match:
            port = int(port_match.group(1))
        # Fall back to spotting common dev-server defaults anywhere in the
        # command ("8000" subsumes ":8000", so a single check suffices).
        elif "8000" in command:
            port = 8000
        elif "3000" in command:
            port = 3000

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    port_str = f"Port: {port}" if port else "Port: -"
    cmd_short = command[:60] + "..." if len(command) > 60 else command
    workspace_short = workspace.split("/")[-1] if "/" in workspace else workspace

    # Checklist entry. The log path is included so the agent can inspect the
    # process output later (previously the log_file argument was ignored).
    entry = (
        f"- [ ] **PID {pid}** | `{cmd_short}` | {timestamp} | {port_str} "
        f"| {workspace_short} | log: {log_file}\n"
    )

    # Insert under "## Running"; repair the section if it is missing.
    if "## Running" in content:
        content = content.replace("## Running\n\n", f"## Running\n\n{entry}")
    else:
        content += f"\n## Running\n\n{entry}"

    registry_path.write_text(content)
78
+
79
+
80
def _mark_process_stopped(memory_path: Path, pid: int, reason: str = "killed") -> None:
    """Tick off a running registry entry for *pid* and move it to '## Stopped'.

    No-op when the registry does not exist or the PID has no unchecked entry.
    """
    registry = _get_process_registry_path(memory_path)
    if not registry.exists():
        return

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    kept: list[str] = []
    moved: str | None = None

    for entry in registry.read_text().split("\n"):
        if f"**PID {pid}**" in entry and "- [ ]" in entry:
            # Check the box and annotate why/when the process stopped; the
            # line is withheld here and re-inserted under "## Stopped" below.
            moved = entry.replace("- [ ]", "- [x]") + f" | {reason} @ {stamp}"
        else:
            kept.append(entry)

    if moved is None:
        return  # PID not found among running entries; leave the file untouched.

    body = "\n".join(kept)
    if "## Stopped" in body:
        body = body.replace("## Stopped\n\n", f"## Stopped\n\n{moved}\n")
    else:
        body += f"\n## Stopped\n\n{moved}\n"
    registry.write_text(body)
108
+
109
+
110
+ def _is_process_running(pid: int) -> bool:
111
+ """Check if a process is still running."""
112
+ try:
113
+ os.kill(pid, 0)
114
+ return True
115
+ except (OSError, ProcessLookupError):
116
+ return False
117
+
118
+
119
def _get_running_pids_from_registry(memory_path: Path) -> list[tuple[int, str]]:
    """Collect ``(pid, registry_line)`` pairs for entries still marked running."""
    registry = _get_process_registry_path(memory_path)
    if not registry.exists():
        return []

    pid_pattern = re.compile(r"\*\*PID (\d+)\*\*")
    entries: list[tuple[int, str]] = []

    for raw_line in registry.read_text().split("\n"):
        # An unchecked checkbox plus a PID marker identifies a running entry.
        if "- [ ]" not in raw_line or "**PID" not in raw_line:
            continue
        found = pid_pattern.search(raw_line)
        if found:
            entries.append((int(found.group(1)), raw_line))

    return entries
137
+
138
+
139
def create_bash_execute_tool(
    workspace: Path, memory_path: Path, default_timeout: int = 120
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a bash_execute tool bound to a specific workspace.

    Args:
        workspace: Directory used as the working directory for every command.
        memory_path: Memory root that hosts the background-process registry.
        default_timeout: Timeout (seconds) applied when the caller omits one.

    Returns:
        The async ``bash_execute`` tool function.
    """

    async def bash_execute(
        command: Annotated[str, "Bash command to execute"],
        timeout: Annotated[int, f"Command timeout in seconds (default: {default_timeout})"] = default_timeout,
        background: Annotated[
            bool, "Run in background and return immediately with PID. Use for servers/long-running processes."
        ] = False,
    ) -> str:
        """Execute bash commands in the workspace.

        Returns stdout, stderr, and return code.
        Use for running tests, git commands, package managers, builds, etc.
        IMPORTANT: Each call runs in a fresh shell from workspace root -
        use 'cd dir && command' for commands in subdirectories.
        For long-running processes (servers), use background=True to avoid timeout.
        """
        try:
            if background:
                # Detach via nohup; redirect output to a timestamped log file
                # so it can be inspected later, and report the PID via $!.
                log_file = workspace / ".background_logs" / f"bg_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
                log_file.parent.mkdir(parents=True, exist_ok=True)

                # Quote the log path: the workspace directory may contain
                # spaces, which would otherwise break the redirection.
                bg_command = f'nohup {command} > "{log_file}" 2>&1 & echo $!'

                proc = await asyncio.create_subprocess_shell(
                    bg_command,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                    cwd=str(workspace),
                )

                stdout, _ = await proc.communicate()
                pid_str = stdout.decode().strip()

                try:
                    pid = int(pid_str)
                    # Persist in /memory/processes.md so later turns can
                    # find and manage the process.
                    _add_process_to_registry(
                        memory_path=memory_path,
                        pid=pid,
                        command=command,
                        workspace=str(workspace),
                        log_file=str(log_file),
                    )

                    return (
                        f"Background process started successfully.\n"
                        f"PID: {pid}\n"
                        f"Command: {command}\n"
                        f"Log file: {log_file}\n"
                        f"\nProcess registered in /memory/processes.md\n"
                        f"Use check_processes(action='list') to see all background processes.\n"
                        f"Use check_processes(action='kill', pid={pid}) to stop this process."
                    )
                except ValueError:
                    return f"Error: Could not get PID. Output: {pid_str}"

            # Regular (blocking) execution
            proc = await asyncio.create_subprocess_shell(
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                cwd=str(workspace),
            )

            try:
                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
            except asyncio.TimeoutError:
                proc.kill()
                await proc.wait()  # reap the killed child so it is not left as a zombie
                return (
                    f"Error: Command timed out after {timeout} seconds.\n"
                    f"Command: {command}\n\n"
                    f"TIP: If this is a long-running process (like a server), "
                    f"use background=True to run it in the background."
                )

            stdout_str = stdout.decode("utf-8", errors="replace")
            stderr_str = stderr.decode("utf-8", errors="replace")
            return_code = proc.returncode

            # Format output
            result_parts = [f"Command: {command}"]
            result_parts.append(f"Return code: {return_code}")
            result_parts.append("=" * 50)

            if stdout_str.strip():
                # Cap output so a chatty command cannot flood the model context.
                if len(stdout_str) > 15000:
                    stdout_str = stdout_str[:15000] + "\n... (stdout truncated)"
                result_parts.append("STDOUT:")
                result_parts.append(stdout_str)

            if stderr_str.strip():
                if len(stderr_str) > 5000:
                    stderr_str = stderr_str[:5000] + "\n... (stderr truncated)"
                result_parts.append("STDERR:")
                result_parts.append(stderr_str)

            if not stdout_str.strip() and not stderr_str.strip():
                result_parts.append("(no output)")

            return "\n".join(result_parts)

        except Exception as e:
            # Tool results must be strings; surface failures instead of raising.
            return f"Error executing command: {e}"

    # Add tool metadata
    bash_execute._tool_name = "bash_execute"  # type: ignore[attr-defined]
    bash_execute._tool_description = (  # type: ignore[attr-defined]
        "Execute bash commands in the workspace. "
        "Returns stdout, stderr, and return code. "
        "Use for running tests, git commands, package managers, builds, etc."
    )
    bash_execute._is_tool = True  # type: ignore[attr-defined]

    return bash_execute
261
+
262
+
263
def create_check_processes_tool(
    workspace: Path, memory_path: Path
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a tool to check and manage background processes.

    The returned tool treats /memory/processes.md as the source of truth and
    reconciles it against the live process table on every 'list' call.

    Args:
        workspace: Workspace the processes were started from; used by the
            'cleanup' action to decide which registry entries belong here.
        memory_path: Memory root that hosts the process registry.

    Returns:
        The async ``check_processes`` tool function.
    """

    async def check_processes(
        action: Annotated[
            Literal["list", "kill", "cleanup"],
            "'list' to see processes, 'kill' to stop one by PID, 'cleanup' to kill all",
        ],
        pid: Annotated[int | None, "PID to kill (required for 'kill' action)"] = None,
    ) -> str:
        """Check and manage background processes.

        Use 'list' to see all background processes (also viewable at /memory/processes.md),
        'kill' to stop a specific process by PID,
        'cleanup' to kill all background processes from this workspace.
        """
        _ensure_process_registry(memory_path)
        registry_path = _get_process_registry_path(memory_path)

        if action == "list":
            # Read the registry and update status of running processes:
            # any entry whose PID is no longer alive is moved to "## Stopped".
            running_pids = _get_running_pids_from_registry(memory_path)
            active_count = 0
            dead_pids: list[int] = []

            for proc_pid, _ in running_pids:
                if _is_process_running(proc_pid):
                    active_count += 1
                else:
                    dead_pids.append(proc_pid)

            # Mark dead processes as stopped
            for dead_pid in dead_pids:
                _mark_process_stopped(memory_path, dead_pid, reason="exited")

            # Return the updated registry
            content = registry_path.read_text()
            return (
                f"Active background processes: {active_count}\n"
                f"(View full registry at /memory/processes.md)\n\n"
                f"{content}"
            )

        if action == "kill":
            if pid is None:
                return "Error: 'pid' is required for 'kill' action."

            try:
                # Graceful first: SIGTERM, then escalate to SIGKILL below.
                os.kill(pid, signal.SIGTERM)
                await asyncio.sleep(0.5)  # Give it time to terminate

                # Check if it's really dead, if not SIGKILL
                if _is_process_running(pid):
                    os.kill(pid, signal.SIGKILL)
                    await asyncio.sleep(0.2)

                _mark_process_stopped(memory_path, pid, reason="killed")

                if _is_process_running(pid):
                    return f"Warning: Process {pid} may still be running after kill attempt."
                return f"Successfully killed process {pid}. Updated /memory/processes.md"

            except ProcessLookupError:
                # Already gone; still reconcile the registry.
                _mark_process_stopped(memory_path, pid, reason="not found")
                return f"Process {pid} was not running (already terminated). Updated /memory/processes.md"
            except PermissionError:
                return f"Error: Permission denied to kill process {pid}."
            except Exception as e:
                return f"Error killing process {pid}: {e}"

        if action == "cleanup":
            # Kill all processes from this workspace
            running_pids = _get_running_pids_from_registry(memory_path)
            workspace_str = str(workspace)
            killed: list[int] = []
            failed: list[tuple[int, str]] = []

            for proc_pid, line in running_pids:
                # Check if this process is from our workspace.
                # NOTE(review): substring match on the workspace name could
                # over-match entries from similarly named workspaces — confirm
                # this heuristic is acceptable.
                workspace_short = workspace_str.split("/")[-1]
                if workspace_short in line or workspace_str in line:
                    try:
                        os.kill(proc_pid, signal.SIGTERM)
                        await asyncio.sleep(0.2)
                        if _is_process_running(proc_pid):
                            os.kill(proc_pid, signal.SIGKILL)
                        _mark_process_stopped(memory_path, proc_pid, reason="cleanup")
                        killed.append(proc_pid)
                    except (ProcessLookupError, PermissionError) as e:
                        _mark_process_stopped(memory_path, proc_pid, reason=f"cleanup failed: {e}")
                        failed.append((proc_pid, str(e)))

            result = "Cleanup complete. Updated /memory/processes.md\n"
            if killed:
                result += f"Killed processes: {killed}\n"
            if failed:
                result += f"Failed to kill: {failed}\n"
            if not killed and not failed:
                result += "No active processes found for this workspace."

            return result

        return f"Unknown action: {action}"

    # Add tool metadata
    check_processes._tool_name = "check_processes"  # type: ignore[attr-defined]
    check_processes._tool_description = (  # type: ignore[attr-defined]
        "Check and manage background processes. "
        "Use 'list' to see all background processes, "
        "'kill' to stop a specific process by PID, "
        "'cleanup' to kill all background processes from this workspace."
    )
    check_processes._is_tool = True  # type: ignore[attr-defined]

    return check_processes
380
+
381
+
382
def create_python_repl_tool(workspace: Path) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a python_repl tool bound to a specific workspace.

    Args:
        workspace: Directory exposed to executed code via the ``WORKSPACE``
            variable (the code itself still runs in this process).

    Returns:
        The async ``python_repl`` tool function.
    """

    async def python_repl(
        code: Annotated[str, "Python code to execute"],
    ) -> str:
        """Execute Python code in an isolated namespace.

        Returns the output (stdout) or any errors.
        Use for testing code snippets, calculations, data manipulation, or quick validation.
        The WORKSPACE variable is available with the workspace path.
        """
        # NOTE(review): sys.stdout/sys.stderr are process-global, so this swap
        # also captures prints from any other coroutine running concurrently —
        # confirm the harness never runs tools in parallel.
        old_stdout = sys.stdout
        old_stderr = sys.stderr

        try:
            # Capture stdout and stderr
            redirected_output = StringIO()
            redirected_error = StringIO()
            sys.stdout = redirected_output
            sys.stderr = redirected_error

            # Create isolated namespace with builtins. Isolation is per-call
            # name scoping only — the code shares this interpreter/process.
            namespace: dict[str, Any] = {
                "__builtins__": __builtins__,
                "__name__": "__main__",
                "WORKSPACE": workspace,
            }

            try:
                # Try to compile and exec
                compiled = compile(code, "<repl>", "exec")
                exec(compiled, namespace)  # noqa: S102

                output = redirected_output.getvalue()
                error = redirected_error.getvalue()

                result_parts = ["Python REPL Output"]
                result_parts.append("=" * 50)

                if output.strip():
                    # Cap output so a chatty snippet cannot flood the context.
                    if len(output) > 15000:
                        output = output[:15000] + "\n... (output truncated)"
                    result_parts.append(output)

                if error.strip():
                    result_parts.append("STDERR:")
                    result_parts.append(error)

                if not output.strip() and not error.strip():
                    result_parts.append("(code executed successfully, no output)")

                return "\n".join(result_parts)

            except SyntaxError as e:
                return f"SyntaxError: {e}"
            except Exception as e:
                # Tool results must be strings; report the error type and message.
                return f"Error: {type(e).__name__}: {e}"

        finally:
            # Always restore the real streams, even if execution raised.
            sys.stdout = old_stdout
            sys.stderr = old_stderr

    # Add tool metadata
    python_repl._tool_name = "python_repl"  # type: ignore[attr-defined]
    python_repl._tool_description = (  # type: ignore[attr-defined]
        "Execute Python code in an isolated namespace. "
        "Returns the output (stdout) or any errors. "
        "Use for testing code snippets, calculations, data manipulation, or quick validation."
    )
    python_repl._is_tool = True  # type: ignore[attr-defined]

    return python_repl
455
+
456
+
457
def create_execution_tools(
    workspace: Path,
    memory_path: Path,
    bash_timeout: int = 120,
) -> Sequence[Callable[..., Coroutine[Any, Any, str]]]:
    """Build the full set of execution tools rooted at *workspace*.

    Args:
        workspace: Root directory for command execution
        memory_path: Path to memory directory for process registry
        bash_timeout: Default timeout for bash commands in seconds

    Returns:
        List of execution tool functions
    """
    # Resolve both roots once so every tool shares the same absolute paths.
    ws = Path(workspace).resolve()
    mem = Path(memory_path).resolve()

    bash_tool = create_bash_execute_tool(ws, mem, bash_timeout)
    process_tool = create_check_processes_tool(ws, mem)
    repl_tool = create_python_repl_tool(ws)
    return [bash_tool, process_tool, repl_tool]
src/flow/tools/memory.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Memory tool for persistent storage across sessions.
2
+
3
+ Provides file-based memory storage allowing agents to store and retrieve
4
+ information, patterns, and decisions across conversations.
5
+ """
6
+
7
+ from collections.abc import Callable, Coroutine
8
+ from pathlib import Path
9
+ from typing import Annotated, Any, Literal
10
+
11
+
12
class MemoryBackend:
    """File-based memory storage backend with security controls.

    Every operation is confined to ``base_path``; any path that resolves
    outside it (e.g. via ``..``) is rejected by :meth:`_validate_path`.
    """

    def __init__(self, base_path: Path) -> None:
        """Initialize memory backend, creating ``base_path`` if needed."""
        self.base_path = Path(base_path).resolve()
        self.base_path.mkdir(parents=True, exist_ok=True)

    def _validate_path(self, path: str) -> Path:
        """Validate and resolve a memory path.

        Accepts agent-style paths such as ``/memory/patterns/x.md`` as well
        as relative paths.

        Raises:
            ValueError: If the resolved path escapes the memory directory.
        """
        # Normalize path (remove /memory prefix if present)
        if path.startswith("/memory"):
            path = path[len("/memory") :]
        path = path.lstrip("/")

        # Handle empty path
        if not path:
            return self.base_path

        # Resolve to absolute path
        full_path = (self.base_path / path).resolve()

        # Security: Ensure path is within base_path
        try:
            full_path.relative_to(self.base_path)
        except ValueError as err:
            raise ValueError(f"Access denied: path '{path}' is outside memory directory") from err

        return full_path

    def view(self, path: str, view_range: list[int] | None = None) -> str:
        """View directory contents or file contents.

        Args:
            path: Directory or file to display.
            view_range: Optional 1-indexed inclusive ``[start, end]`` line
                window for file views.
        """
        full_path = self._validate_path(path)

        if not full_path.exists():
            return f"Path not found: {path}\nUse 'create' to create new files."

        # Directory listing
        if full_path.is_dir():
            contents = [f"Directory: {path or '/memory'}"]
            # Sort key puts directories before files, each group alphabetical.
            items = sorted(full_path.iterdir(), key=lambda x: (x.is_file(), x.name))

            if not items:
                contents.append("(empty directory)")
            else:
                for item in items:
                    suffix = "/" if item.is_dir() else ""
                    contents.append(f" - {item.name}{suffix}")

            return "\n".join(contents)

        # File contents, rendered with 1-indexed line numbers
        if full_path.is_file():
            content = full_path.read_text(encoding="utf-8")
            lines = content.splitlines()

            if view_range:
                start, end = view_range
                # Clamp the window to the actual file bounds.
                start = max(1, start)
                end = min(len(lines), end)
                lines = lines[start - 1 : end]
                numbered_lines = [f"{i + start:5d}: {line}" for i, line in enumerate(lines)]
            else:
                numbered_lines = [f"{i + 1:5d}: {line}" for i, line in enumerate(lines)]

            return "\n".join(numbered_lines) if numbered_lines else "(empty file)"

        return f"Unknown path type: {path}"

    def create(self, path: str, file_text: str) -> str:
        """Create or overwrite a file, creating parent directories as needed."""
        full_path = self._validate_path(path)
        full_path.parent.mkdir(parents=True, exist_ok=True)
        full_path.write_text(file_text, encoding="utf-8")
        return f"File created successfully at {path}"

    def str_replace(self, path: str, old_str: str, new_str: str) -> str:
        """Replace the first occurrence of ``old_str`` in a file.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If ``old_str`` is not present in the file.
        """
        full_path = self._validate_path(path)

        if not full_path.is_file():
            raise FileNotFoundError(f"File not found: {path}")

        content = full_path.read_text(encoding="utf-8")

        if old_str not in content:
            raise ValueError(f"Text not found in file: '{old_str[:50]}...'")

        # Only the first occurrence is replaced.
        new_content = content.replace(old_str, new_str, 1)
        full_path.write_text(new_content, encoding="utf-8")
        return f"File {path} has been edited successfully"

    def append(self, path: str, text: str) -> str:
        """Append text to end of file, creating the file if it is missing."""
        full_path = self._validate_path(path)

        if not full_path.exists():
            full_path.parent.mkdir(parents=True, exist_ok=True)
            full_path.write_text("", encoding="utf-8")

        # Ensure text starts with newline if file isn't empty
        if full_path.stat().st_size > 0:
            existing = full_path.read_text(encoding="utf-8")
            if existing and not existing.endswith("\n"):
                text = "\n" + text

        # Ensure text ends with newline
        if not text.endswith("\n"):
            text += "\n"

        with full_path.open("a", encoding="utf-8") as f:
            f.write(text)

        return f"Text appended to {path}"

    def search(self, query: str, path: str = "") -> str:
        """Search for text across memory files (case-insensitive substring).

        Shows at most 50 matches; binary/unreadable files are skipped.
        """
        full_path = self._validate_path(path)

        if not full_path.exists():
            return f"Path not found: {path or '/memory'}"

        if not full_path.is_dir():
            # Search single file
            files = [full_path]
        else:
            files = list(full_path.rglob("*"))

        matches: list[dict[str, Any]] = []
        query_lower = query.lower()

        for file_path in files:
            if not file_path.is_file():
                continue
            try:
                content = file_path.read_text(encoding="utf-8")
                lines = content.splitlines()

                for line_num, line in enumerate(lines, 1):
                    if query_lower in line.lower():
                        rel_path = file_path.relative_to(self.base_path)
                        matches.append({
                            "file": str(rel_path),
                            "line": line_num,
                            # Matched line is trimmed and capped for display.
                            "content": line.strip()[:100],
                        })
            except (UnicodeDecodeError, PermissionError):
                # Skip unreadable files rather than failing the whole search.
                continue

        if not matches:
            return f"No matches found for '{query}' in {path or '/memory'}"

        result_lines = [f"Found {len(matches)} match(es) for '{query}':\n"]
        for match in matches[:50]:
            result_lines.append(f" {match['file']}:{match['line']} - {match['content']}")

        if len(matches) > 50:
            result_lines.append(f"\n... and {len(matches) - 50} more matches")

        return "\n".join(result_lines)

    def delete(self, path: str) -> str:
        """Delete a file or empty directory.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the directory is not empty.
        """
        full_path = self._validate_path(path)

        if not full_path.exists():
            raise FileNotFoundError(f"Path not found: {path}")

        if full_path.is_file():
            full_path.unlink()
            return f"File deleted: {path}"

        if full_path.is_dir():
            # Refuse recursive deletes; the agent must empty the dir first.
            if any(full_path.iterdir()):
                raise ValueError(f"Directory not empty: {path}. Delete contents first.")
            full_path.rmdir()
            return f"Directory deleted: {path}"

        return f"Unknown path type: {path}"
191
+
192
+
193
+ def create_memory_tool(memory_path: Path) -> Callable[..., Coroutine[Any, Any, str]]:
194
+ """Create a memory tool bound to a specific memory directory."""
195
+ backend = MemoryBackend(memory_path)
196
+
197
+ async def memory(
198
+ command: Annotated[
199
+ Literal["view", "create", "str_replace", "append", "search", "delete"],
200
+ "Operation to perform",
201
+ ],
202
+ path: Annotated[str, "Path to file or directory (e.g., '/memory/patterns/cors.md')"] = "/memory",
203
+ file_text: Annotated[str | None, "Content to write (for create)"] = None,
204
+ old_str: Annotated[str | None, "Text to find (for str_replace)"] = None,
205
+ new_str: Annotated[str | None, "Replacement text (for str_replace)"] = None,
206
+ append_text: Annotated[str | None, "Text to append (for append)"] = None,
207
+ query: Annotated[str | None, "Search query (for search)"] = None,
208
+ view_range: Annotated[list[int] | None, "Line range [start, end] (for view)"] = None,
209
+ ) -> str:
210
+ """Store and retrieve information in persistent memory.
211
+
212
+ Memory persists across conversations - use it to remember patterns,
213
+ insights, project context, and decisions.
214
+ Operations: view (show directory/file), create (new file),
215
+ str_replace (edit file), append (add to file),
216
+ search (find text), delete (remove file/dir).
217
+ Organize by: /memory/patterns/, /memory/projects/, /memory/decisions/
218
+ """
219
+ try:
220
+ if command == "view":
221
+ return backend.view(path, view_range)
222
+
223
+ if command == "create":
224
+ if file_text is None:
225
+ return "Error: 'file_text' is required for create operation"
226
+ return backend.create(path, file_text)
227
+
228
+ if command == "str_replace":
229
+ if old_str is None or new_str is None:
230
+ return "Error: 'old_str' and 'new_str' are required for str_replace"
231
+ return backend.str_replace(path, old_str, new_str)
232
+
233
+ if command == "append":
234
+ if append_text is None:
235
+ return "Error: 'append_text' is required for append operation"
236
+ return backend.append(path, append_text)
237
+
238
+ if command == "search":
239
+ if query is None:
240
+ return "Error: 'query' is required for search operation"
241
+ return backend.search(query, path)
242
+
243
+ if command == "delete":
244
+ return backend.delete(path)
245
+
246
+ return f"Error: Unknown command: {command}"
247
+
248
+ except Exception as e:
249
+ return f"Memory operation failed: {e}"
250
+
251
+ # Add tool metadata
252
+ memory._tool_name = "memory" # type: ignore[attr-defined]
253
+ memory._tool_description = ( # type: ignore[attr-defined]
254
+ "Store and retrieve information in persistent memory. "
255
+ "Memory persists across conversations - use it to remember patterns, "
256
+ "insights, project context, and decisions."
257
+ )
258
+ memory._is_tool = True # type: ignore[attr-defined]
259
+
260
+ return memory
src/flow/tools/sub_agent.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sub-agent tool for isolated research tasks.
2
+
3
+ Provides context isolation by delegating complex research tasks to a
4
+ separate agent that operates in its own context window. The sub-agent
5
+ processes the request and returns only a concise summary, preventing
6
+ context pollution in the main agent.
7
+
8
+ This implements the "Isolation" strategy for context engineering:
9
+ - Coordinator agent stays lean with minimal context
10
+ - Sub-agent can use 30K+ tokens internally for research
11
+ - Only the distilled result (200-500 tokens) returns to coordinator
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ from collections.abc import Callable, Coroutine
18
+ from pathlib import Path
19
+ from typing import Annotated, Any
20
+
21
# System prompt for the research sub-agent: optimized for thorough research
# followed by a short, self-contained summary (this string is sent to the
# model verbatim — do not reformat it).
SUB_AGENT_INSTRUCTIONS = """You are a research assistant that helps with complex information gathering tasks.

Your role:
1. Thoroughly research the given topic or question
2. Gather relevant information from available tools
3. Synthesize findings into a clear, concise summary
4. Return ONLY the essential information needed by the requesting agent

Guidelines:
- Be thorough in your research but concise in your response
- Focus on facts and actionable information
- If you can't find information, say so clearly
- Your response will be passed to another agent, so make it self-contained
- Target 200-500 tokens for your final response unless more detail is explicitly requested

Do NOT:
- Include conversational fluff or preamble
- Repeat the original question back
- Add disclaimers about your limitations
- Include information that wasn't requested
"""
43
+
44
+
45
def create_sub_agent_tool(
    workspace: Path,
    model: str = "gpt-4o-mini",
    endpoint: str | None = None,
    api_key: str | None = None,
    api_version: str = "2024-02-15-preview",
) -> Callable[..., Coroutine[Any, Any, str]]:
    """Create a sub-agent tool for isolated research tasks.

    The sub-agent runs in its own isolated context, preventing context
    pollution in the main agent. This is useful for:
    - Complex research that requires many tool calls
    - Tasks that generate lots of intermediate content
    - Keeping the main agent's context lean and focused

    Args:
        workspace: Workspace directory for file operations
        model: Model to use for sub-agent (default: gpt-4o-mini for efficiency)
        endpoint: Azure OpenAI endpoint (defaults to AZURE_OPENAI_ENDPOINT env var)
        api_key: Azure OpenAI API key (defaults to AZURE_OPENAI_API_KEY env var)
        api_version: Azure OpenAI API version

    Returns:
        An async function that can be used as a tool
    """
    # Resolve credentials from environment if not provided
    _endpoint = endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT", "")
    _api_key = api_key or os.environ.get("AZURE_OPENAI_API_KEY", "")

    # Lazily-built singleton agent, shared by every call to the tool.
    _sub_agent: Any = None

    async def _ensure_sub_agent() -> Any:
        """Lazily create the sub-agent on first use.

        NOTE(review): this init is not guarded by a lock — two concurrent
        first calls could each build an agent (last assignment wins).
        Confirm the harness never invokes tools in parallel.
        """
        nonlocal _sub_agent
        if _sub_agent is not None:
            return _sub_agent

        # Imports are deferred so the package works without the optional
        # agent-framework dependency until this tool is actually used.
        try:
            from agent_framework import ChatAgent
            from agent_framework.azure import AzureOpenAIChatClient
        except ImportError as e:
            raise ImportError(
                "Microsoft Agent Framework is required for sub-agent. "
                "Install with: pip install agent-framework-core"
            ) from e

        # Create a lightweight chat client for the sub-agent
        # Uses a smaller/faster model by default for efficiency
        client = AzureOpenAIChatClient(
            api_key=_api_key,
            endpoint=_endpoint,
            deployment=model,
            api_version=api_version,
        )

        # Create basic tools for the sub-agent
        # Keep it minimal - just what's needed for research
        from flow.tools.coding import create_coding_tools
        from flow.tools.core import create_core_tools

        sub_tools: list[Callable[..., Any]] = []
        sub_tools.extend(create_coding_tools(workspace))
        sub_tools.extend(create_core_tools())

        # Convert tools to agent_framework format, carrying over the
        # _tool_name/_tool_description metadata set by the factories.
        from agent_framework import ai_function

        converted_tools = []
        for tool_func in sub_tools:
            name = getattr(tool_func, "_tool_name", tool_func.__name__)
            description = getattr(tool_func, "_tool_description", tool_func.__doc__ or "")
            wrapped = ai_function(name=name, description=description)(tool_func)
            converted_tools.append(wrapped)

        _sub_agent = ChatAgent(
            name="ResearchAssistant",
            description="Research assistant for complex information gathering",
            instructions=SUB_AGENT_INSTRUCTIONS,
            chat_client=client,
            tools=converted_tools,
        )

        return _sub_agent

    async def research(
        task: Annotated[
            str,
            "The research task or question to investigate. Be specific about what information you need.",
        ],
        context: Annotated[
            str | None,
            "Optional context to help the sub-agent understand the broader goal.",
        ] = None,
    ) -> str:
        """Delegate a research task to a sub-agent with isolated context.

        Use this tool when you need to:
        - Research a complex topic that may require multiple steps
        - Gather information without polluting your main context
        - Get a summarized answer to a specific question

        The sub-agent operates in its own context window, so it can
        use many tokens internally while only returning a concise summary.
        This keeps your main context lean and focused.

        Examples:
        - "Find all Python files that import the requests library and summarize their purpose"
        - "Research how authentication is implemented in this codebase"
        - "Analyze the error handling patterns used across the project"
        """
        sub_agent = await _ensure_sub_agent()

        # Build the research prompt; context (when given) leads the prompt.
        prompt_parts = [f"Research task: {task}"]
        if context:
            prompt_parts.insert(0, f"Context: {context}")
        prompt_parts.append("\nProvide a concise summary of your findings.")

        full_prompt = "\n\n".join(prompt_parts)

        try:
            # Run the sub-agent - it operates in isolated context
            response = await sub_agent.run(full_prompt)

            # Extract text content from response; fall back to str() for
            # response objects without a .content attribute.
            if hasattr(response, "content"):
                return str(response.content)
            return str(response)

        except Exception as e:
            # Tool results must be strings; surface failures instead of raising.
            return f"Research failed: {e}"

    # Add tool metadata
    research._tool_name = "research"  # type: ignore[attr-defined]
    research._tool_description = (  # type: ignore[attr-defined]
        "Delegate a research task to a sub-agent with isolated context. "
        "The sub-agent can thoroughly investigate a topic using many tool calls "
        "internally, then return only a concise summary. Use this for complex "
        "research that would otherwise pollute your main context."
    )
    research._is_tool = True  # type: ignore[attr-defined]

    return research
src/flow/ui/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Flow UI Backend - FastAPI server."""
src/flow/ui/api/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """API routes package."""
3
+
4
+ from .configs import router as configs_router
5
+ from .tasks import router as tasks_router
6
+ from .jobs import router as jobs_router
7
+ from .runs import router as runs_router
8
+
9
+ __all__ = [
10
+ "configs_router",
11
+ "tasks_router",
12
+ "jobs_router",
13
+ "runs_router",
14
+ ]
src/flow/ui/api/configs.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Config API routes."""
3
+
4
+ from uuid import UUID
5
+
6
+ from fastapi import APIRouter, Depends, HTTPException
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from sqlmodel import select, desc
9
+
10
+ from ..database import get_session
11
+ from ..models.config import AgentConfig
12
+ from ..schemas import ConfigCreate, ConfigUpdate, ConfigResponse
13
+
14
+ router = APIRouter(prefix="/configs", tags=["configs"])
15
+
16
+
17
def parse_uuid(id_str: str) -> UUID:
    """Parse *id_str* into a UUID, translating failure into an HTTP 400."""
    try:
        parsed = UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
    return parsed
23
+
24
+
25
@router.get("", response_model=list[ConfigResponse])
async def list_configs(session: AsyncSession = Depends(get_session)) -> list[AgentConfig]:
    """List all agent configurations, newest first."""
    stmt = select(AgentConfig).order_by(desc(AgentConfig.created_at))
    rows = await session.execute(stmt)
    return list(rows.scalars().all())
30
+
31
+
32
@router.post("", response_model=ConfigResponse, status_code=201)
async def create_config(
    data: ConfigCreate,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Create and persist a new agent configuration."""
    new_config = AgentConfig(
        name=data.name,
        description=data.description,
        config_json=data.to_config_json(),
    )
    session.add(new_config)
    await session.commit()
    # Refresh so DB-generated fields (id, created_at) are populated.
    await session.refresh(new_config)
    return new_config
47
+
48
+
49
@router.get("/{config_id}", response_model=ConfigResponse)
async def get_config(
    config_id: str,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Fetch a single agent configuration by id (404 if absent)."""
    uuid_id = parse_uuid(config_id)
    found = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
    config = found.scalar_one_or_none()
    if config is None:
        raise HTTPException(status_code=404, detail="Config not found")
    return config
61
+
62
+
63
@router.put("/{config_id}", response_model=ConfigResponse)
async def update_config(
    config_id: str,
    data: ConfigUpdate,
    session: AsyncSession = Depends(get_session),
) -> AgentConfig:
    """Apply a partial update to an agent configuration (404 if absent).

    Fields that live inside ``config_json`` are merged into that blob;
    remaining fields are set directly on the row.
    """
    from datetime import datetime, timezone

    uuid_id = parse_uuid(config_id)
    found = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
    config = found.scalar_one_or_none()
    if config is None:
        raise HTTPException(status_code=404, detail="Config not found")

    # Only fields the caller actually sent.
    changes = data.model_dump(exclude_unset=True)

    # These keys are stored inside config_json rather than as columns.
    nested_keys = (
        "enable_message_compaction",
        "enable_memory_tool",
        "enable_sub_agent",
        "compaction_head_size",
        "compaction_tail_size",
        "bash_timeout",
    )
    merged_json = dict(config.config_json)
    for key in nested_keys:
        if key in changes:
            merged_json[key] = changes.pop(key)

    # Whatever remains maps onto top-level columns.
    for attr, value in changes.items():
        setattr(config, attr, value)

    config.config_json = merged_json
    config.updated_at = datetime.now(timezone.utc)

    await session.commit()
    await session.refresh(config)
    return config
106
+
107
+
108
@router.delete("/{config_id}", status_code=204)
async def delete_config(
    config_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete an agent configuration (404 if absent)."""
    uuid_id = parse_uuid(config_id)
    found = await session.execute(select(AgentConfig).where(AgentConfig.id == uuid_id))
    config = found.scalar_one_or_none()
    if config is None:
        raise HTTPException(status_code=404, detail="Config not found")

    await session.delete(config)
    await session.commit()
src/flow/ui/api/jobs.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Job API routes."""
3
+
4
+ import asyncio
5
+ from typing import Any, AsyncGenerator
6
+ from uuid import UUID
7
+
8
+ from fastapi import APIRouter, Depends, HTTPException
9
+ from fastapi.responses import StreamingResponse
10
+ from sqlalchemy.ext.asyncio import AsyncSession
11
+ from sqlmodel import select, desc
12
+
13
+ from ..database import get_session
14
+ from ..models.job import OptimizationJob, JobStatus
15
+ from ..models.config import AgentConfig
16
+ from ..models.task import TaskModel
17
+ from ..schemas import JobCreate, JobResponse
18
+ from ..services.optimizer_service import OptimizerService
19
+
20
router = APIRouter(prefix="/jobs", tags=["jobs"])

# In-flight job tasks keyed by job id, consulted by cancel_job.
# NOTE(review): nothing in this module ever inserts entries here — start_job
# runs the job inline inside its SSE generator — so cancellation may be a
# no-op unless registration happens elsewhere; confirm.
_running_jobs: dict[str, asyncio.Task[Any]] = {}
24
+
25
+
26
def parse_uuid(id_str: str) -> UUID:
    """Return *id_str* as a UUID; malformed ids abort the request with 400."""
    try:
        return UUID(id_str)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from err
32
+
33
+
34
@router.get("", response_model=list[JobResponse])
async def list_jobs(
    status: JobStatus | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[OptimizationJob]:
    """List optimization jobs, newest first, optionally filtered by status."""
    stmt = select(OptimizationJob)
    if status:
        stmt = stmt.where(OptimizationJob.status == status)
    rows = await session.execute(stmt.order_by(desc(OptimizationJob.created_at)))
    return list(rows.scalars().all())
46
+
47
+
48
@router.post("", response_model=JobResponse, status_code=201)
async def create_job(
    data: JobCreate,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Create an optimization job after verifying its configs and tasks exist.

    Raises 400 for any unknown config or task id.
    """
    # Every referenced config must exist.
    for raw_id in data.config_ids:
        found = await session.execute(
            select(AgentConfig).where(AgentConfig.id == parse_uuid(raw_id))
        )
        if found.scalar_one_or_none() is None:
            raise HTTPException(status_code=400, detail=f"Config {raw_id} not found")

    # Every referenced task must exist.
    for raw_id in data.task_ids:
        found = await session.execute(
            select(TaskModel).where(TaskModel.id == parse_uuid(raw_id))
        )
        if found.scalar_one_or_none() is None:
            raise HTTPException(status_code=400, detail=f"Task {raw_id} not found")

    job = OptimizationJob(
        name=data.name,
        config_ids=data.config_ids,
        task_ids=data.task_ids,
        parallel=data.parallel,
        use_llm_eval=data.use_llm_eval,
        # One experiment per (config, task) pair.
        total_experiments=len(data.config_ids) * len(data.task_ids),
    )
    session.add(job)
    await session.commit()
    await session.refresh(job)
    return job
80
+
81
+
82
@router.get("/{job_id}", response_model=JobResponse)
async def get_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Fetch a single optimization job by id (404 if absent)."""
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return job
94
+
95
+
96
@router.post("/{job_id}/start")
async def start_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> StreamingResponse:
    """Start a pending job and stream its progress as server-sent events.

    Rejects with 404 for unknown jobs and 400 for jobs not in PENDING state.
    """
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != JobStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Job is already {job.status}")

    async def event_stream() -> AsyncGenerator[str, None]:
        # NOTE(review): the job runs inline in this generator and is never
        # registered in _running_jobs, so cancel_job cannot stop it — confirm
        # whether OptimizerService registers the task elsewhere.
        service = OptimizerService()
        async for progress in service.run_job(job_id):
            yield f"data: {progress.model_dump_json()}\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
124
+
125
+
126
@router.post("/{job_id}/cancel", response_model=JobResponse)
async def cancel_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> OptimizationJob:
    """Cancel a running optimization job (400 if it is not RUNNING)."""
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != JobStatus.RUNNING:
        raise HTTPException(status_code=400, detail=f"Job is not running (status: {job.status})")

    # Cancel the in-flight asyncio task, if one was registered.
    task = _running_jobs.pop(job_id, None)
    if task is not None:
        task.cancel()

    job.status = JobStatus.CANCELLED
    await session.commit()
    await session.refresh(job)
    return job
150
+
151
+
152
@router.delete("/{job_id}", status_code=204)
async def delete_job(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a finished optimization job; its runs cascade-delete via FK."""
    uuid_id = parse_uuid(job_id)
    found = await session.execute(select(OptimizationJob).where(OptimizationJob.id == uuid_id))
    job = found.scalar_one_or_none()
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status == JobStatus.RUNNING:
        raise HTTPException(status_code=400, detail="Cannot delete a running job")

    await session.delete(job)
    await session.commit()
src/flow/ui/api/runs.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Run API routes."""
3
+
4
+ from typing import Any
5
+ from uuid import UUID
6
+
7
+ from fastapi import APIRouter, Depends, HTTPException
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+ from sqlmodel import select, desc
10
+
11
+ from ..database import get_session
12
+ from ..models.run import ExperimentRun
13
+ from ..schemas import RunResponse, RunDetailResponse, CriterionResultSchema
14
+
15
+ router = APIRouter(prefix="/runs", tags=["runs"])
16
+
17
+
18
def parse_uuid(id_str: str) -> UUID:
    """Validate *id_str* as a UUID, rejecting malformed ids with HTTP 400."""
    try:
        value = UUID(id_str)
    except ValueError as bad:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from bad
    return value
24
+
25
+
26
@router.get("", response_model=list[RunResponse])
async def list_runs(
    job_id: str | None = None,
    config_name: str | None = None,
    task_name: str | None = None,
    is_pareto: bool | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ExperimentRun]:
    """List experiment runs, newest first, with optional field filters."""
    stmt = select(ExperimentRun)
    if job_id:
        stmt = stmt.where(ExperimentRun.job_id == parse_uuid(job_id))
    if config_name:
        stmt = stmt.where(ExperimentRun.config_name == config_name)
    if task_name:
        stmt = stmt.where(ExperimentRun.task_name == task_name)
    if is_pareto is not None:  # explicit None check: False is a valid filter
        stmt = stmt.where(ExperimentRun.is_pareto == is_pareto)

    rows = await session.execute(stmt.order_by(desc(ExperimentRun.created_at)))
    return list(rows.scalars().all())
50
+
51
+
52
@router.get("/{run_id}", response_model=RunDetailResponse)
async def get_run(
    run_id: str,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Return full details for one experiment run, including parsed criteria."""
    uuid_id = parse_uuid(run_id)
    found = await session.execute(select(ExperimentRun).where(ExperimentRun.id == uuid_id))
    run = found.scalar_one_or_none()
    if run is None:
        raise HTTPException(status_code=404, detail="Run not found")

    # Per-criterion evaluation results are embedded in the raw trace blob.
    criteria_results: list[CriterionResultSchema] = []
    if run.trace_json and "criteria_results" in run.trace_json:
        criteria_results = [
            CriterionResultSchema(
                name=item.get("name", ""),
                score=item.get("score", 0.0),
                passed=item.get("passed", False),
                reasoning=item.get("reasoning", ""),
            )
            for item in run.trace_json["criteria_results"]
        ]

    return {
        "id": str(run.id),
        "job_id": str(run.job_id),
        "config_name": run.config_name,
        "task_name": run.task_name,
        "status": run.status,
        "tokens_total": run.tokens_total,
        "tokens_input": run.tokens_input,
        "tokens_output": run.tokens_output,
        "duration_seconds": run.duration_seconds,
        "score": run.score,
        "passed": run.passed,
        "reasoning": run.reasoning,
        "criteria_results": criteria_results,
        "output": run.output,
        "files_created": run.files_created,
        "trace": run.trace_json,
        "is_pareto": run.is_pareto,
        "pareto_rank": run.pareto_rank,
        "created_at": run.created_at,
    }
96
+
97
+
98
@router.get("/job/{job_id}/summary")
async def get_job_summary(
    job_id: str,
    session: AsyncSession = Depends(get_session),
) -> dict[str, Any]:
    """Aggregate a job's runs into per-config summaries, best score first.

    Raises 404 when the job has no runs at all.
    """
    uuid_id = parse_uuid(job_id)
    found = await session.execute(
        select(ExperimentRun).where(ExperimentRun.job_id == uuid_id)
    )
    runs = list(found.scalars().all())
    if not runs:
        raise HTTPException(status_code=404, detail="No runs found for job")

    # First pass: accumulate totals per config; averages come in a second pass.
    summaries: dict[str, dict[str, Any]] = {}
    for run in runs:
        summary = summaries.setdefault(
            run.config_name,
            {
                "config_name": run.config_name,
                "total_runs": 0,
                "passed_runs": 0,
                "avg_score": 0.0,
                "avg_tokens": 0.0,
                "avg_duration": 0.0,
                "is_pareto": False,
                "pareto_rank": 999,  # sentinel: no pareto rank seen yet
            },
        )
        summary["total_runs"] += 1
        if run.passed:
            summary["passed_runs"] += 1
        # NOTE(review): assumes score/tokens_total/duration_seconds/pareto_rank
        # are always numeric (never None) on stored runs — confirm on the model.
        summary["avg_score"] += run.score
        summary["avg_tokens"] += run.tokens_total
        summary["avg_duration"] += run.duration_seconds
        if run.is_pareto:
            summary["is_pareto"] = True
        summary["pareto_rank"] = min(summary["pareto_rank"], run.pareto_rank)

    # Second pass: turn the accumulated sums into averages.
    for summary in summaries.values():
        count = summary["total_runs"]
        summary["avg_score"] /= count
        summary["avg_tokens"] /= count
        summary["avg_duration"] /= count

    # Best score first; cheaper (fewer tokens) wins ties.
    ordered = sorted(
        summaries.values(),
        key=lambda s: (-s["avg_score"], s["avg_tokens"]),
    )

    return {
        "job_id": job_id,
        "total_runs": len(runs),
        "config_summaries": ordered,
        "pareto_configs": [s["config_name"] for s in ordered if s["is_pareto"]],
    }
src/flow/ui/api/tasks.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Task API routes."""
3
+
4
+ from uuid import UUID
5
+
6
+ from fastapi import APIRouter, Depends, HTTPException
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from sqlmodel import select, desc
9
+
10
+ from ..database import get_session
11
+ from ..models.task import TaskModel
12
+ from ..schemas import TaskCreate, TaskResponse
13
+
14
+ router = APIRouter(prefix="/tasks", tags=["tasks"])
15
+
16
+
17
def parse_uuid(id_str: str) -> UUID:
    """Convert *id_str* to a UUID; a malformed id aborts with HTTP 400."""
    try:
        return UUID(id_str)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from exc
23
+
24
+
25
@router.get("", response_model=list[TaskResponse])
async def list_tasks(
    category: str | None = None,
    suite: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[TaskModel]:
    """List tasks, newest first, optionally filtered by category and/or suite."""
    stmt = select(TaskModel)
    if category:
        stmt = stmt.where(TaskModel.category == category)
    if suite:
        stmt = stmt.where(TaskModel.suite == suite)
    rows = await session.execute(stmt.order_by(desc(TaskModel.created_at)))
    return list(rows.scalars().all())
40
+
41
+
42
@router.post("", response_model=TaskResponse, status_code=201)
async def create_task(
    data: TaskCreate,
    session: AsyncSession = Depends(get_session),
) -> TaskModel:
    """Create and persist a new task."""
    new_task = TaskModel(
        name=data.name,
        prompt=data.prompt,
        criteria_json=data.to_criteria_json(),
        category=data.category,
    )
    session.add(new_task)
    await session.commit()
    # Refresh so DB-generated fields (id, created_at) are populated.
    await session.refresh(new_task)
    return new_task
58
+
59
+
60
@router.get("/{task_id}", response_model=TaskResponse)
async def get_task(
    task_id: str,
    session: AsyncSession = Depends(get_session),
) -> TaskModel:
    """Fetch a single task by id (404 if absent)."""
    uuid_id = parse_uuid(task_id)
    found = await session.execute(select(TaskModel).where(TaskModel.id == uuid_id))
    task = found.scalar_one_or_none()
    if task is None:
        raise HTTPException(status_code=404, detail="Task not found")
    return task
72
+
73
+
74
@router.delete("/{task_id}", status_code=204)
async def delete_task(
    task_id: str,
    session: AsyncSession = Depends(get_session),
) -> None:
    """Delete a task (404 if absent)."""
    uuid_id = parse_uuid(task_id)
    found = await session.execute(select(TaskModel).where(TaskModel.id == uuid_id))
    task = found.scalar_one_or_none()
    if task is None:
        raise HTTPException(status_code=404, detail="Task not found")

    await session.delete(task)
    await session.commit()
88
+
89
+
90
@router.post("/import-suite", response_model=list[TaskResponse], status_code=201)
async def import_suite(
    suite_name: str,
    session: AsyncSession = Depends(get_session),
) -> list[TaskModel]:
    """Import tasks from a built-in suite.

    Looks up ``suite_name`` among the built-in task suites, converts each
    suite task into a persisted ``TaskModel`` tagged with the suite name,
    and returns the created rows.

    Raises:
        HTTPException: 400 when the suite name is unknown.
    """
    from flow.experiments.types import get_task_suite

    try:
        suite_tasks = get_task_suite(suite_name)
    except ValueError as e:
        # Fix (B904): chain the original error, consistent with parse_uuid above.
        raise HTTPException(status_code=400, detail=str(e)) from e

    created_tasks = []
    for t in suite_tasks:
        task = TaskModel(
            name=t.name,
            prompt=t.prompt,
            criteria_json=[
                {"name": c.name, "instruction": c.instruction, "weight": c.weight}
                for c in t.criteria
            ],
            category=t.metadata.get("category", "default"),
            suite=suite_name,
        )
        session.add(task)
        created_tasks.append(task)

    await session.commit()
    # Refresh after commit so DB-generated fields (id, created_at) are populated.
    for task in created_tasks:
        await session.refresh(task)

    return created_tasks
src/flow/ui/database.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Database setup with SQLModel and SQLite."""
3
+
4
+ from pathlib import Path
5
+ from typing import AsyncGenerator
6
+
7
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
8
+ from sqlmodel import SQLModel
9
+
10
# On-disk SQLite database; the containing directory is created on import.
DB_PATH = Path.home() / ".flow" / "flow_ui.db"
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

DATABASE_URL = f"sqlite+aiosqlite:///{DB_PATH}"

# Single process-wide async engine and session factory.
engine = create_async_engine(DATABASE_URL, echo=False, future=True)

async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
19
+
20
+
21
async def init_db() -> None:
    """Create all SQLModel tables that do not already exist."""
    async with engine.begin() as conn:
        await conn.run_sync(SQLModel.metadata.create_all)
25
+
26
+
27
async def get_session() -> AsyncGenerator[AsyncSession, None]:
    """Yield an async database session (used as a FastAPI dependency)."""
    async with async_session() as session:
        yield session
src/flow/ui/main.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """FastAPI server for Flow UI."""
3
+
4
+ from contextlib import asynccontextmanager
5
+ from pathlib import Path
6
+ from typing import Any, AsyncGenerator
7
+
8
+ from fastapi import FastAPI
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from fastapi.staticfiles import StaticFiles
11
+ from fastapi.responses import FileResponse
12
+
13
+ from .database import init_db
14
+ from .api import configs_router, tasks_router, jobs_router, runs_router
15
+
16
+
17
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """App lifespan hook: create database tables before serving requests."""
    await init_db()
    yield
22
+
23
+
24
app = FastAPI(
    title="Flow Optimization UI",
    description="Web UI for running agent configuration optimization experiments",
    version="0.1.0",
    lifespan=lifespan,
)

# Allow the Vite dev-server origins during development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount every API router under the /api prefix.
for _router in (configs_router, tasks_router, jobs_router, runs_router):
    app.include_router(_router, prefix="/api")
45
+
46
+
47
# Health check
@app.get("/api/health")
async def health_check() -> dict[str, Any]:
    """Liveness probe for the UI backend."""
    return {"status": "ok", "service": "flow-ui"}
52
+
53
+
54
# Static files and SPA fallback.
# The frontend build output lives next to this module (flow/ui/ui/) so the
# backend package is self-contained.
UI_DIR = Path(__file__).parent / "ui"
57
+
58
+
59
def setup_static_files() -> None:
    """Mount built frontend assets and register the SPA fallback route.

    No-op when the frontend has not been built (``UI_DIR`` missing).
    """
    if not UI_DIR.exists():
        return

    # Serve fingerprinted build assets directly.
    assets_dir = UI_DIR / "assets"
    if assets_dir.exists():
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    ui_root = UI_DIR.resolve()

    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str) -> FileResponse:  # pyright: ignore[reportUnusedFunction]
        """Serve a static file when it exists, otherwise the SPA index.html."""
        candidate = (UI_DIR / full_path).resolve()
        # Security fix: resolve the requested path and reject anything that
        # escapes the UI directory (e.g. "../" traversal) before serving it.
        if candidate.is_relative_to(ui_root) and candidate.is_file():
            return FileResponse(candidate)
        return FileResponse(UI_DIR / "index.html")
74
+
75
+
76
# Register static-file routes only when a frontend build is present.
if UI_DIR.exists():
    setup_static_files()
79
+
80
+
81
def run_server(host: str = "0.0.0.0", port: int = 8091) -> None:  # noqa: S104
    """Run the FastAPI app under uvicorn (blocks until shutdown)."""
    import uvicorn

    uvicorn.run(
        "flow.ui.main:app",
        host=host,
        port=port,
        reload=False,
    )


if __name__ == "__main__":
    run_server()
src/flow/ui/models/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Database models."""
3
+
4
+ from .config import AgentConfig
5
+ from .task import TaskModel
6
+ from .job import OptimizationJob, JobStatus
7
+ from .run import ExperimentRun
8
+
9
+ __all__ = [
10
+ "AgentConfig",
11
+ "TaskModel",
12
+ "OptimizationJob",
13
+ "JobStatus",
14
+ "ExperimentRun",
15
+ ]