victordibia commited on
Commit
f4dca43
·
1 Parent(s): c02193c

Deploy 2026-02-09 15:08:25

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. pyproject.toml +31 -4
  2. src/flow/__init__.py +1 -1
  3. src/flow/cli/app.py +22 -12
  4. src/flow/cli/hf_import.py +159 -0
  5. src/flow/cli/optimize.py +193 -112
  6. src/flow/cli/repl.py +3 -1
  7. src/flow/experiments/__init__.py +79 -15
  8. src/flow/experiments/ablation.py +6 -2
  9. src/flow/experiments/agent_api.py +305 -0
  10. src/flow/experiments/evaluators/heuristic.py +1 -1
  11. src/flow/experiments/evaluators/llm.py +138 -37
  12. src/flow/experiments/expansion.py +250 -0
  13. src/flow/experiments/gaia_converter.py +216 -0
  14. src/flow/experiments/hf_datasets.py +354 -0
  15. src/flow/experiments/models.py +635 -83
  16. src/flow/experiments/optimizer.py +270 -12
  17. src/flow/experiments/presets.py +123 -0
  18. src/flow/experiments/results.py +118 -0
  19. src/flow/experiments/runner.py +63 -50
  20. src/flow/experiments/strategies/__init__.py +103 -0
  21. src/flow/experiments/strategies/llm_rewriter.py +357 -0
  22. src/flow/experiments/strategies/tool_selector.py +426 -0
  23. src/flow/experiments/trace_collector.py +92 -49
  24. src/flow/experiments/types.py +7 -2
  25. src/flow/harness/__init__.py +5 -4
  26. src/flow/harness/base.py +19 -7
  27. src/flow/harness/compaction/__init__.py +38 -0
  28. src/flow/harness/compaction/strategies.py +502 -0
  29. src/flow/harness/compaction/tokenizer.py +131 -0
  30. src/flow/harness/langgraph/__init__.py +7 -1
  31. src/flow/harness/langgraph/compaction.py +187 -19
  32. src/flow/harness/langgraph/harness.py +11 -4
  33. src/flow/harness/maf/__init__.py +1 -1
  34. src/flow/harness/maf/agent.py +7 -3
  35. src/flow/harness/maf/harness.py +11 -5
  36. src/flow/harness/maf/message_store.py +247 -69
  37. src/flow/harness/maf/tools/__init__.py +55 -20
  38. src/flow/harness/maf/wrappers.py +1 -1
  39. src/flow/harness/miniagent/__init__.py +19 -19
  40. src/flow/harness/miniagent/agent.py +13 -12
  41. src/flow/harness/miniagent/client.py +2 -2
  42. src/flow/harness/miniagent/context.py +5 -4
  43. src/flow/harness/miniagent/harness.py +54 -30
  44. src/flow/harness/miniagent/hooks.py +2 -1
  45. src/flow/harness/miniagent/instructions.py +23 -64
  46. src/flow/harness/miniagent/otel.py +8 -8
  47. src/flow/harness/miniagent/tool.py +8 -7
  48. src/flow/harness/miniagent/tools/__init__.py +27 -28
  49. src/flow/harness/miniagent/workspace.py +23 -20
  50. src/flow/harness/registry.py +43 -6
pyproject.toml CHANGED
@@ -38,21 +38,46 @@ dependencies = [
38
  "uvicorn>=0.27.0",
39
  "sqlmodel>=0.0.14",
40
  "aiosqlite>=0.19.0",
 
 
 
41
  "tiktoken>=0.12.0",
42
  ]
43
 
44
  [project.optional-dependencies]
45
  # Optional features
46
  research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
 
47
  langgraph = [
48
  "langgraph>=0.2.0",
49
  "langchain-core>=0.3.0",
50
  "langchain-openai>=0.2.0",
51
  ]
52
- optimizer = ["gepa>=0.0.20"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # Bundles
55
- all = ["flow-agent[research,langgraph,optimizer]"]
56
  dev = [
57
  "pytest>=8.0.0",
58
  "pytest-asyncio>=0.23.0",
@@ -85,7 +110,7 @@ packages = ["src/flow"]
85
 
86
  [tool.pyright]
87
  include = ["src"]
88
- exclude = ["**/tests/**", "**/.venv/**"]
89
  typeCheckingMode = "strict"
90
  pythonVersion = "3.10"
91
  reportMissingTypeStubs = false
@@ -108,6 +133,7 @@ show_error_codes = true
108
  warn_unused_ignores = false
109
  disallow_incomplete_defs = true
110
  disallow_untyped_decorators = true
 
111
 
112
  # ============================================================================
113
  # Linting - Ruff
@@ -119,7 +145,7 @@ target-version = "py310"
119
  src = ["src"]
120
  fix = true
121
  include = ["*.py", "*.pyi", "**/pyproject.toml"]
122
- exclude = ["docs/*"]
123
 
124
  [tool.ruff.lint]
125
  select = [
@@ -140,6 +166,7 @@ ignore = [
140
  "D107", # allow missing docstring in __init__
141
  "ANN401", # allow Any type (needed for generic tool/event handling)
142
  "S101", # allow assert statements (used in tests)
 
143
  ]
144
 
145
  [tool.ruff.lint.per-file-ignores]
 
38
  "uvicorn>=0.27.0",
39
  "sqlmodel>=0.0.14",
40
  "aiosqlite>=0.19.0",
41
+ "greenlet>=3.0.0", # Required for SQLAlchemy async support
42
+ # Logging dependencies
43
+ "loguru>=0.7.3",
44
  "tiktoken>=0.12.0",
45
  ]
46
 
47
  [project.optional-dependencies]
48
  # Optional features
49
  research = ["beautifulsoup4>=4.12.0", "html2text>=2024.2.26"]
50
+
51
  langgraph = [
52
  "langgraph>=0.2.0",
53
  "langchain-core>=0.3.0",
54
  "langchain-openai>=0.2.0",
55
  ]
56
+
57
+ smolagents = [
58
+ "smolagents[toolkit]>=1.24.0",
59
+ "pdfminer.six>=20240706",
60
+ "cffi>=1.16.0",
61
+ "cryptography>=42.0.0",
62
+ "Pillow>=11.0.0",
63
+ "puremagic>=1.28",
64
+ "pypdf>=5.1.0",
65
+ "youtube_transcript_api>=0.6.2",
66
+ "python_pptx>=1.0.2",
67
+ "serpapi>=0.1.5",
68
+ "mammoth>=1.8.0",
69
+ "markdownify>=0.13.1",
70
+ "pandas>=2.2.3",
71
+ "openpyxl>=3.1.0",
72
+ "wikipedia-api>=0.9.0",
73
+ ]
74
+
75
+ # Bundles
76
+ optimizer = ["gepa>=0.0.27", "litellm>=1.0.0"]
77
+ hf-datasets = ["datasets>=2.0.0"]
78
 
79
  # Bundles
80
+ all = ["flow-agent[research,langgraph,optimizer,smolagents,hf-datasets]"]
81
  dev = [
82
  "pytest>=8.0.0",
83
  "pytest-asyncio>=0.23.0",
 
110
 
111
  [tool.pyright]
112
  include = ["src"]
113
+ exclude = ["**/tests/**", "**/.venv/**", "**/skills/**"]
114
  typeCheckingMode = "strict"
115
  pythonVersion = "3.10"
116
  reportMissingTypeStubs = false
 
133
  warn_unused_ignores = false
134
  disallow_incomplete_defs = true
135
  disallow_untyped_decorators = true
136
+ exclude = ["src/flow/skills/"]
137
 
138
  # ============================================================================
139
  # Linting - Ruff
 
145
  src = ["src"]
146
  fix = true
147
  include = ["*.py", "*.pyi", "**/pyproject.toml"]
148
+ exclude = ["docs/*", "src/flow/skills/*"]
149
 
150
  [tool.ruff.lint]
151
  select = [
 
166
  "D107", # allow missing docstring in __init__
167
  "ANN401", # allow Any type (needed for generic tool/event handling)
168
  "S101", # allow assert statements (used in tests)
169
+ "B008", # allow Depends() in function defaults (FastAPI pattern)
170
  ]
171
 
172
  [tool.ruff.lint.per-file-ignores]
src/flow/__init__.py CHANGED
@@ -21,6 +21,6 @@ __version__ = "0.1.0"
21
 
22
  __all__ = [
23
  "MAFHarness",
24
- "create_agent",
25
  "__version__",
 
26
  ]
 
21
 
22
  __all__ = [
23
  "MAFHarness",
 
24
  "__version__",
25
+ "create_agent",
26
  ]
src/flow/cli/app.py CHANGED
@@ -65,7 +65,7 @@ def run(
65
  framework: Annotated[
66
  str,
67
  typer.Option("--framework", "-f", help="Agent framework: 'maf', 'miniagent', or 'langgraph'"),
68
- ] = "maf",
69
  interactive: Annotated[
70
  bool,
71
  typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
@@ -110,26 +110,34 @@ async def _run_single_task(
110
  memory_path: Path,
111
  task: str,
112
  config_path: Path | None = None,
113
- framework: str = "maf",
114
  ) -> None:
115
  """Run a single task and print the result."""
 
 
 
116
  from flow.cli.output import print_event
117
  from flow.harness.base import EventType
118
 
119
- # Import harness modules to register them
120
- import flow.harness.maf # noqa: F401
121
- import flow.harness.miniagent # noqa: F401 # pyright: ignore[reportUnusedImport]
122
 
123
  if framework == "langgraph":
124
  try:
125
- import flow.harness.langgraph # noqa: F401
 
 
126
  except ImportError:
127
  console.print("[red]Error:[/] LangGraph dependencies not installed.")
128
  console.print("[dim]Install with: pip install flow-agent[langgraph][/]")
129
  raise typer.Exit(1)
130
 
 
 
 
131
  from flow.harness import create_harness
132
- from flow.experiments.models import Agent
 
 
133
 
134
  if config_path:
135
  # Load agent config from optimization result
@@ -137,19 +145,19 @@ async def _run_single_task(
137
 
138
  agent_config = load_agent(config_path)
139
  # Override framework if specified
140
- if framework != "maf":
141
  agent_config = Agent(
142
  name=agent_config.name,
143
- framework=framework,
144
  tools=agent_config.tools,
145
- model=agent_config.model,
146
  instructions=agent_config.instructions,
147
  compaction=agent_config.compaction,
148
  )
149
  console.print(f"[dim]Using agent config: {agent_config.name} ({framework})[/]")
150
  harness = create_harness(agent_config, workspace)
151
  else:
152
- agent = Agent(name="flow-cli", framework=framework)
153
  harness = create_harness(agent, workspace)
154
 
155
  try:
@@ -167,10 +175,12 @@ async def _run_single_task(
167
  await harness.close()
168
 
169
 
170
- # Import and register the optimize command
 
171
  from flow.cli.optimize import optimize as optimize_cmd
172
 
173
  app.command()(optimize_cmd)
 
174
 
175
 
176
  @app.command()
 
65
  framework: Annotated[
66
  str,
67
  typer.Option("--framework", "-f", help="Agent framework: 'maf', 'miniagent', or 'langgraph'"),
68
+ ] = "miniagent",
69
  interactive: Annotated[
70
  bool,
71
  typer.Option("--interactive/--no-interactive", "-i", help="Interactive mode"),
 
110
  memory_path: Path,
111
  task: str,
112
  config_path: Path | None = None,
113
+ framework: str = "miniagent",
114
  ) -> None:
115
  """Run a single task and print the result."""
116
+ # Import harness modules to register them (side-effect imports)
117
+ import flow.harness.maf as _maf
118
+ import flow.harness.miniagent as _miniagent
119
  from flow.cli.output import print_event
120
  from flow.harness.base import EventType
121
 
122
+ _ = (_maf, _miniagent)
 
 
123
 
124
  if framework == "langgraph":
125
  try:
126
+ import flow.harness.langgraph as _lg
127
+
128
+ _ = _lg
129
  except ImportError:
130
  console.print("[red]Error:[/] LangGraph dependencies not installed.")
131
  console.print("[dim]Install with: pip install flow-agent[langgraph][/]")
132
  raise typer.Exit(1)
133
 
134
+ from typing import cast
135
+
136
+ from flow.experiments.models import Agent, Framework
137
  from flow.harness import create_harness
138
+
139
+ # Cast the validated framework string to Framework literal type
140
+ framework_typed = cast(Framework, framework)
141
 
142
  if config_path:
143
  # Load agent config from optimization result
 
145
 
146
  agent_config = load_agent(config_path)
147
  # Override framework if specified
148
+ if framework != "miniagent":
149
  agent_config = Agent(
150
  name=agent_config.name,
151
+ framework=framework_typed,
152
  tools=agent_config.tools,
153
+ llm_config=agent_config.llm_config,
154
  instructions=agent_config.instructions,
155
  compaction=agent_config.compaction,
156
  )
157
  console.print(f"[dim]Using agent config: {agent_config.name} ({framework})[/]")
158
  harness = create_harness(agent_config, workspace)
159
  else:
160
+ agent = Agent(name="flow-cli", framework=framework_typed)
161
  harness = create_harness(agent, workspace)
162
 
163
  try:
 
175
  await harness.close()
176
 
177
 
178
+ # Import and register commands
179
+ from flow.cli.hf_import import hf_import as hf_import_cmd
180
  from flow.cli.optimize import optimize as optimize_cmd
181
 
182
  app.command()(optimize_cmd)
183
+ app.command(name="hf-import")(hf_import_cmd)
184
 
185
 
186
  @app.command()
src/flow/cli/hf_import.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI command to import Hugging Face datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+ from rich.console import Console
10
+
11
+ from flow.experiments.hf_datasets import (
12
+ DATASET_CONVERTERS,
13
+ import_hf_dataset,
14
+ save_tasks_to_jsonl,
15
+ )
16
+
17
+ console = Console()
18
+
19
+
20
+ def hf_import(
21
+ dataset: Annotated[
22
+ str,
23
+ typer.Argument(help="Hugging Face dataset name (e.g., 'openai/gsm8k')"),
24
+ ],
25
+ output: Annotated[
26
+ Path,
27
+ typer.Option(
28
+ "--output",
29
+ "-o",
30
+ help="Output JSONL file path",
31
+ ),
32
+ ] = Path("tasks/imported.jsonl"),
33
+ config: Annotated[
34
+ str | None,
35
+ typer.Option(
36
+ "--config",
37
+ "-c",
38
+ help="Dataset configuration/subset (e.g., 'main' for gsm8k)",
39
+ ),
40
+ ] = None,
41
+ split: Annotated[
42
+ str,
43
+ typer.Option(
44
+ "--split",
45
+ "-s",
46
+ help="Dataset split to use",
47
+ ),
48
+ ] = "train",
49
+ limit: Annotated[
50
+ int | None,
51
+ typer.Option(
52
+ "--limit",
53
+ "-n",
54
+ help="Maximum number of examples to import",
55
+ ),
56
+ ] = None,
57
+ local_path: Annotated[
58
+ Path | None,
59
+ typer.Option(
60
+ "--local-path",
61
+ "-l",
62
+ help="Path to download dataset snapshot to. Uses huggingface_hub.snapshot_download(). "
63
+ "For private datasets, set HF_TOKEN env variable.",
64
+ ),
65
+ ] = None,
66
+ list_supported: Annotated[
67
+ bool,
68
+ typer.Option(
69
+ "--list",
70
+ help="List supported datasets and exit",
71
+ ),
72
+ ] = False,
73
+ ) -> None:
74
+ """Import a Hugging Face dataset for use with GEPA optimization.
75
+
76
+ Converts HF datasets into Flow's task format with evaluation criteria.
77
+
78
+ Examples:
79
+ # Import 100 GSM8K math problems
80
+ flow hf-import openai/gsm8k --config main --output tasks/gsm8k.jsonl --limit 100
81
+
82
+ # Import HumanEval coding problems
83
+ flow hf-import openai_humaneval --output tasks/humaneval.jsonl
84
+
85
+ # Download to local path first (useful for caching or private datasets)
86
+ flow hf-import openai/gsm8k --local-path /data/gsm8k --output tasks/gsm8k.jsonl
87
+
88
+ # For private datasets, set HF_TOKEN env variable
89
+ HF_TOKEN=hf_... flow hf-import org/private-dataset --local-path /data/private
90
+
91
+ # Use with GEPA
92
+ flow optimize \\
93
+ --config examples/gepa_strategy.yaml \\
94
+ --agent examples/base_agent.yaml \\
95
+ --tasks tasks/gsm8k.jsonl \\
96
+ --budget 10
97
+ """
98
+ if list_supported:
99
+ console.print("\n[bold]Supported Datasets:[/]")
100
+ console.print("\n[dim]You can add custom converters via register_converter()[/]\n")
101
+ for name in sorted(DATASET_CONVERTERS.keys()):
102
+ console.print(f" • {name}")
103
+ return
104
+
105
+ console.print(f"\n[bold]Importing dataset:[/] {dataset}")
106
+ if config:
107
+ console.print(f"[dim]Config:[/] {config}")
108
+ console.print(f"[dim]Split:[/] {split}")
109
+ if limit:
110
+ console.print(f"[dim]Limit:[/] {limit} examples")
111
+ if local_path:
112
+ console.print(f"[dim]Local path:[/] {local_path}")
113
+ console.print()
114
+
115
+ try:
116
+ # Import dataset
117
+ tasks = import_hf_dataset(
118
+ dataset_name=dataset,
119
+ config=config,
120
+ split=split,
121
+ limit=limit,
122
+ local_path=local_path,
123
+ )
124
+
125
+ if not tasks:
126
+ console.print("[red]Error:[/] No tasks were converted")
127
+ raise typer.Exit(1)
128
+
129
+ # Save to file
130
+ save_tasks_to_jsonl(tasks, output)
131
+
132
+ console.print(f"\n[green]Success![/] Imported {len(tasks)} tasks")
133
+ console.print(f"[dim]Output:[/] {output}")
134
+ console.print("\n[bold]Sample task:[/]")
135
+ console.print(f" Name: {tasks[0].name}")
136
+ console.print(f" Prompt: {tasks[0].prompt[:100]}...")
137
+ console.print(f" Criteria: {len(tasks[0].criteria)} evaluation criteria")
138
+
139
+ console.print("\n[bold]Next steps:[/]")
140
+ console.print(" [dim]# Run GEPA optimization[/]")
141
+ console.print(" flow optimize \\")
142
+ console.print(" --config examples/gepa_strategy.yaml \\")
143
+ console.print(" --agent examples/base_agent.yaml \\")
144
+ console.print(f" --tasks {output} \\")
145
+ console.print(" --budget 10")
146
+
147
+ except ImportError:
148
+ console.print("[red]Error:[/] Hugging Face datasets library not installed")
149
+ console.print("[dim]Install with:[/] pip install datasets")
150
+ raise typer.Exit(1)
151
+ except ValueError as e:
152
+ console.print(f"[red]Error:[/] {e}")
153
+ raise typer.Exit(1)
154
+ except Exception as e:
155
+ console.print(f"[red]Error:[/] {e}")
156
+ import traceback
157
+
158
+ traceback.print_exc()
159
+ raise typer.Exit(1)
src/flow/cli/optimize.py CHANGED
@@ -18,7 +18,6 @@ from flow.experiments.models import (
18
  Agent,
19
  Candidate,
20
  CompactionConfig,
21
- Experiment,
22
  ExperimentResult,
23
  GridSearchStrategy,
24
  load_experiment,
@@ -86,6 +85,13 @@ def optimize(
86
  help="Output directory for results",
87
  ),
88
  ] = None,
 
 
 
 
 
 
 
89
  no_llm_eval: Annotated[
90
  bool,
91
  typer.Option(
@@ -107,7 +113,6 @@ def optimize(
107
  ranks via Pareto analysis, and exports winning agent configs.
108
 
109
  Examples:
110
-
111
  # Use experiment YAML (recommended - defines agent, tasks, and variations)
112
  flow optimize --experiment experiment.yaml
113
 
@@ -140,6 +145,7 @@ def optimize(
140
  output_dir=output,
141
  use_llm_eval=not no_llm_eval,
142
  budget=budget,
 
143
  ))
144
 
145
 
@@ -154,15 +160,16 @@ async def _run_optimize(
154
  output_dir: Path | None,
155
  use_llm_eval: bool,
156
  budget: int,
 
157
  ) -> None:
158
  """Run the optimization."""
159
  # If experiment YAML provided, use it as the source of truth
160
  if experiment_path:
161
- await _run_from_experiment(experiment_path, output_dir)
162
  return
163
 
164
  # Load tasks
165
- tasks = _load_tasks(tasks_path, suite)
166
  if not tasks:
167
  console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
168
  raise typer.Exit(1)
@@ -171,7 +178,7 @@ async def _run_optimize(
171
  base = _load_base_agent(agent_path)
172
 
173
  # Load candidates and check if a strategy is defined in config
174
- candidates, strategy_instance = _load_candidates_and_strategy(config_path, vary, base, budget)
175
 
176
  # If a strategy was provided (like GepaStrategy), run it directly
177
  if strategy_instance is not None:
@@ -221,7 +228,7 @@ async def _run_optimize(
221
  raise typer.Exit(1)
222
 
223
 
224
- async def _run_from_experiment(experiment_path: Path, output_dir: Path | None) -> None:
225
  """Run optimization from an experiment YAML file.
226
 
227
  The experiment YAML defines:
@@ -270,10 +277,13 @@ async def _run_from_experiment(experiment_path: Path, output_dir: Path | None) -
270
  console.print("[red]Error:[/] Experiment must specify 'suite' or 'tasks'")
271
  raise typer.Exit(1)
272
 
 
 
 
273
  # Generate candidates from variations
274
  if exp.variations:
275
  strategy = GridSearchStrategy(exp.variations)
276
- candidates = strategy.generate(base, exp.budget)
277
  else:
278
  candidates = [Candidate(agent=base, mutations={}, rationale="baseline")]
279
 
@@ -283,7 +293,7 @@ async def _run_from_experiment(experiment_path: Path, output_dir: Path | None) -
283
  for t in tasks:
284
  console.print(f" - {t.name}")
285
 
286
- console.print(f"\n[bold]Variations:[/]")
287
  for key, values in exp.variations.items():
288
  console.print(f" - {key}: {len(values)} variants")
289
 
@@ -309,27 +319,31 @@ async def _run_from_experiment(experiment_path: Path, output_dir: Path | None) -
309
  raise typer.Exit(1)
310
 
311
 
312
- def _load_tasks(tasks_path: Path | None, suite: str | None) -> list[Task]:
313
  """Load tasks from file or built-in suite."""
 
314
  if tasks_path:
315
  if not tasks_path.exists():
316
  console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
317
  raise typer.Exit(1)
318
- return load_tasks_from_jsonl(tasks_path)
319
-
320
- if suite:
321
  try:
322
- return get_task_suite(suite)
323
  except ValueError as e:
324
  console.print(f"[red]Error:[/] {e}")
325
  raise typer.Exit(1)
 
 
 
 
 
 
 
326
 
327
- # Default: quick suite
328
- try:
329
- return get_task_suite("quick")
330
- except ValueError:
331
- console.print("[red]Error:[/] No built-in suites available. Use --tasks to specify a JSONL file.")
332
- raise typer.Exit(1)
333
 
334
 
335
  def _load_base_agent(agent_path: Path | None) -> Agent:
@@ -344,18 +358,18 @@ def _load_base_agent(agent_path: Path | None) -> Agent:
344
  return Agent(name="flow_agent")
345
 
346
 
347
- def _load_candidates_and_strategy(
348
  config_path: Path | None,
349
  vary: str | None,
350
  base: Agent,
351
  budget: int,
352
  ) -> tuple[list[Candidate], Any | None]:
353
  """Load candidates from file or generate from variations.
354
-
355
  Supports both YAML and Python config files:
356
  - YAML: strategy configuration (strategy_type, config)
357
  - Python: STRATEGY object, CANDIDATES list, or VARIATIONS dict
358
-
359
  Returns:
360
  Tuple of (candidates, strategy_instance)
361
  - If a STRATEGY is defined in config, returns ([], strategy_instance)
@@ -374,17 +388,17 @@ def _load_candidates_and_strategy(
374
  # YAML files currently only support strategy definitions
375
  console.print("[red]Error:[/] YAML config must define a strategy")
376
  raise typer.Exit(1)
377
-
378
  # Python config file
379
  candidates, variations, strategy_obj = _load_python_config(config_path)
380
 
381
  # If a strategy object was provided (e.g., GepaStrategy), return it
382
  if strategy_obj is not None:
383
  return [], strategy_obj
384
-
385
  if variations:
386
  strategy = GridSearchStrategy(variations)
387
- return strategy.generate(base, budget), None
388
  elif candidates:
389
  return candidates, None
390
  else:
@@ -394,7 +408,7 @@ def _load_candidates_and_strategy(
394
  if vary:
395
  variations = _parse_vary_flag(vary)
396
  strategy = GridSearchStrategy(variations)
397
- return strategy.generate(base, budget), None
398
 
399
  # Default: explore context engineering dimensions
400
  strategy = GridSearchStrategy(variations={
@@ -402,9 +416,8 @@ def _load_candidates_and_strategy(
402
  CompactionConfig.head_tail(10, 40),
403
  CompactionConfig.none(),
404
  ],
405
- "tools": ["minimal", "standard"],
406
  })
407
- return strategy.generate(base, budget), None
408
 
409
 
410
  def _load_yaml_strategy(path: Path) -> Any | None:
@@ -442,9 +455,12 @@ def _load_yaml_strategy(path: Path) -> Any | None:
442
  console.print("[red]Error:[/] GEPA optimizer not available.")
443
  console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
444
  raise typer.Exit(1)
 
 
 
445
  else:
446
  console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
447
- console.print("[dim]Supported: gepa[/]")
448
  raise typer.Exit(1)
449
 
450
 
@@ -526,128 +542,193 @@ async def _run_active_strategy(
526
  use_llm_eval: bool,
527
  budget: int,
528
  ) -> None:
529
- """Run an active optimization strategy (like GEPA)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  logger = logging.getLogger(__name__)
531
-
532
- # Create optimizer instance to run evaluations
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  optimizer_runner = FlowOptimizer(
534
  parallel=parallel,
535
  use_llm_evaluator=use_llm_eval,
536
- output_dir=None, # Don't export every intermediate run result
537
  )
538
 
539
-
540
  main_loop = asyncio.get_running_loop()
541
 
542
- # Define evaluator function to inject into strategy
543
  def evaluator(candidate: Candidate, minibatch: list[Task] | None = None) -> ExperimentResult:
544
- """Evaluate a candidate on a minibatch of tasks."""
 
 
545
  eval_tasks = minibatch if minibatch else tasks
546
-
547
- logger.info(f"[EVALUATOR] Evaluating candidate '{candidate.agent.name}' on {len(eval_tasks)} tasks")
548
- logger.info(f"[EVALUATOR] Using LLM evaluator: {use_llm_eval}")
549
- logger.debug(f"[EVALUATOR] Tasks: {[t.name for t in eval_tasks]}")
550
-
 
 
 
 
551
  try:
552
- # Run async evaluation on the main loop and wait for result
553
- # This is safe because strategy.generate (which calls this)
554
- # is running in an executor thread.
555
  future = asyncio.run_coroutine_threadsafe(
556
- optimizer_runner.optimize([candidate], eval_tasks),
557
  main_loop
558
  )
559
  optimization_result = future.result()
560
-
561
- # Check if we got any results
562
  if not optimization_result.summaries:
563
- logger.warning(f"[EVALUATOR] Optimization produced no summaries for candidate '{candidate.agent.name}'")
564
- # Return a fallback result with zero score instead of raising
565
  return ExperimentResult(
566
- candidate=candidate,
567
- run_result=None,
568
- metrics={"score": 0.0, "error": "No summaries produced"},
569
- eval_score=0.0,
570
- eval_passed=False,
571
- eval_reasoning="Evaluation failed to produce results",
572
- traces={}
573
  )
574
-
575
  summary = optimization_result.summaries[0]
576
- logger.info(f"[EVALUATOR] Candidate '{candidate.agent.name}' avg_score={summary.avg_score:.3f}, pass_rate={summary.pass_rate:.2f}")
577
-
578
- # Log individual task results for debugging
579
- if summary.task_results:
580
- for tr in summary.task_results:
581
- logger.info(f"[EVALUATOR] Task '{tr.task_name}': score={tr.eval_score:.3f}, passed={tr.eval_passed}")
582
- logger.debug(f"[EVALUATOR] Reasoning: '{tr.eval_reasoning[:150]}'")
583
- logger.debug(f"[EVALUATOR] Metrics: tokens={tr.metrics.total_tokens}, duration={tr.run_result.duration_seconds if tr.run_result else 0:.2f}s")
584
-
585
- # Convert CandidateSummary to ExperimentResult for GEPA
586
-
587
  if summary.task_results:
588
- tr = summary.task_results[0]
 
 
 
 
 
 
589
  return ExperimentResult(
590
  candidate=candidate,
591
- run_result=tr.run_result,
592
- metrics=tr.metrics,
593
- eval_score=tr.eval_score,
594
- eval_passed=tr.eval_passed,
595
- eval_reasoning=tr.eval_reasoning,
596
- traces=tr.run_result.trace if tr.run_result and isinstance(tr.run_result.trace, dict) else {}
 
597
  )
598
-
599
- # Fallback to aggregate metrics if no individual task results
600
  return ExperimentResult(
601
- candidate=candidate,
602
- run_result=None,
603
- metrics={"score": summary.avg_score},
604
- eval_score=summary.avg_score,
605
  eval_passed=summary.pass_rate > 0.5,
606
- eval_reasoning=f"Aggregate pass rate: {summary.pass_rate}",
607
- traces={}
608
  )
609
-
610
  except Exception as e:
611
  logger.error(f"Error evaluating candidate '{candidate.agent.name}': {e}", exc_info=True)
612
- # Return a fallback result instead of propagating the exception
613
  return ExperimentResult(
614
- candidate=candidate,
615
- run_result=None,
616
- metrics={"score": 0.0, "error": str(e)},
617
- eval_score=0.0,
618
- eval_passed=False,
619
- eval_reasoning=f"Evaluation error: {str(e)}",
620
- traces={}
621
  )
622
 
623
-
624
- # Inject dependencies into strategy if supported
625
- # GepaStrategy accepts them in __init__, but we might have loaded it from config
626
- # without them.
627
- if hasattr(strategy, "evaluator") and strategy.evaluator is None:
628
  strategy.evaluator = evaluator
629
  if hasattr(strategy, "dataset") and strategy.dataset is None:
630
  strategy.dataset = tasks
631
-
632
- # Execute strategy (blocking/sync)
633
- # We should run this in an executor to avoid blocking the main async loop
634
- # if we were doing other async things, but here we just wait for it.
635
- loop = asyncio.get_running_loop()
636
- candidates = await loop.run_in_executor(None, strategy.generate, base_agent, budget)
637
 
638
  console.print("\n[bold green]Optimization complete![/]")
639
  console.print(f"Generated {len(candidates)} candidates.")
640
 
641
- # Export results
642
- if output_dir:
 
 
643
  from flow.experiments.models import export_agent
644
- output_dir.mkdir(parents=True, exist_ok=True)
645
- (output_dir / "agents").mkdir(exist_ok=True)
646
-
 
647
  for i, cand in enumerate(candidates):
648
- # Basic export
649
  name = cand.agent.name or f"candidate_{i}"
650
- export_agent(cand.agent, output_dir / "agents" / f"{name}.yaml", metrics={"rationale": cand.rationale})
651
-
652
- console.print(f"\nAgents exported to: [cyan]{output_dir / 'agents'}[/]")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
 
 
18
  Agent,
19
  Candidate,
20
  CompactionConfig,
 
21
  ExperimentResult,
22
  GridSearchStrategy,
23
  load_experiment,
 
85
  help="Output directory for results",
86
  ),
87
  ] = None,
88
+ limit: Annotated[
89
+ int | None,
90
+ typer.Option(
91
+ "--limit", "-l",
92
+ help="Max number of tasks to run (use first N tasks from suite/file)",
93
+ ),
94
+ ] = None,
95
  no_llm_eval: Annotated[
96
  bool,
97
  typer.Option(
 
113
  ranks via Pareto analysis, and exports winning agent configs.
114
 
115
  Examples:
 
116
  # Use experiment YAML (recommended - defines agent, tasks, and variations)
117
  flow optimize --experiment experiment.yaml
118
 
 
145
  output_dir=output,
146
  use_llm_eval=not no_llm_eval,
147
  budget=budget,
148
+ limit=limit,
149
  ))
150
 
151
 
 
160
  output_dir: Path | None,
161
  use_llm_eval: bool,
162
  budget: int,
163
+ limit: int | None = None,
164
  ) -> None:
165
  """Run the optimization."""
166
  # If experiment YAML provided, use it as the source of truth
167
  if experiment_path:
168
+ await _run_from_experiment(experiment_path, output_dir, limit=limit)
169
  return
170
 
171
  # Load tasks
172
+ tasks = _load_tasks(tasks_path, suite, limit=limit)
173
  if not tasks:
174
  console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
175
  raise typer.Exit(1)
 
178
  base = _load_base_agent(agent_path)
179
 
180
  # Load candidates and check if a strategy is defined in config
181
+ candidates, strategy_instance = await _load_candidates_and_strategy(config_path, vary, base, budget)
182
 
183
  # If a strategy was provided (like GepaStrategy), run it directly
184
  if strategy_instance is not None:
 
228
  raise typer.Exit(1)
229
 
230
 
231
+ async def _run_from_experiment(experiment_path: Path, output_dir: Path | None, limit: int | None = None) -> None:
232
  """Run optimization from an experiment YAML file.
233
 
234
  The experiment YAML defines:
 
277
  console.print("[red]Error:[/] Experiment must specify 'suite' or 'tasks'")
278
  raise typer.Exit(1)
279
 
280
+ if limit is not None and limit > 0:
281
+ tasks = tasks[:limit]
282
+
283
  # Generate candidates from variations
284
  if exp.variations:
285
  strategy = GridSearchStrategy(exp.variations)
286
+ candidates = await strategy.generate(base, exp.budget)
287
  else:
288
  candidates = [Candidate(agent=base, mutations={}, rationale="baseline")]
289
 
 
293
  for t in tasks:
294
  console.print(f" - {t.name}")
295
 
296
+ console.print("\n[bold]Variations:[/]")
297
  for key, values in exp.variations.items():
298
  console.print(f" - {key}: {len(values)} variants")
299
 
 
319
  raise typer.Exit(1)
320
 
321
 
322
+ def _load_tasks(tasks_path: Path | None, suite: str | None, limit: int | None = None) -> list[Task]:
323
  """Load tasks from file or built-in suite."""
324
+ tasks: list[Task] = []
325
  if tasks_path:
326
  if not tasks_path.exists():
327
  console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
328
  raise typer.Exit(1)
329
+ tasks = load_tasks_from_jsonl(tasks_path)
330
+ elif suite:
 
331
  try:
332
+ tasks = get_task_suite(suite)
333
  except ValueError as e:
334
  console.print(f"[red]Error:[/] {e}")
335
  raise typer.Exit(1)
336
+ else:
337
+ # Default: quick suite
338
+ try:
339
+ tasks = get_task_suite("quick")
340
+ except ValueError:
341
+ console.print("[red]Error:[/] No built-in suites available. Use --tasks to specify a JSONL file.")
342
+ raise typer.Exit(1)
343
 
344
+ if limit is not None and limit > 0:
345
+ tasks = tasks[:limit]
346
+ return tasks
 
 
 
347
 
348
 
349
  def _load_base_agent(agent_path: Path | None) -> Agent:
 
358
  return Agent(name="flow_agent")
359
 
360
 
361
+ async def _load_candidates_and_strategy(
362
  config_path: Path | None,
363
  vary: str | None,
364
  base: Agent,
365
  budget: int,
366
  ) -> tuple[list[Candidate], Any | None]:
367
  """Load candidates from file or generate from variations.
368
+
369
  Supports both YAML and Python config files:
370
  - YAML: strategy configuration (strategy_type, config)
371
  - Python: STRATEGY object, CANDIDATES list, or VARIATIONS dict
372
+
373
  Returns:
374
  Tuple of (candidates, strategy_instance)
375
  - If a STRATEGY is defined in config, returns ([], strategy_instance)
 
388
  # YAML files currently only support strategy definitions
389
  console.print("[red]Error:[/] YAML config must define a strategy")
390
  raise typer.Exit(1)
391
+
392
  # Python config file
393
  candidates, variations, strategy_obj = _load_python_config(config_path)
394
 
395
  # If a strategy object was provided (e.g., GepaStrategy), return it
396
  if strategy_obj is not None:
397
  return [], strategy_obj
398
+
399
  if variations:
400
  strategy = GridSearchStrategy(variations)
401
+ return await strategy.generate(base, budget), None
402
  elif candidates:
403
  return candidates, None
404
  else:
 
408
  if vary:
409
  variations = _parse_vary_flag(vary)
410
  strategy = GridSearchStrategy(variations)
411
+ return await strategy.generate(base, budget), None
412
 
413
  # Default: explore context engineering dimensions
414
  strategy = GridSearchStrategy(variations={
 
416
  CompactionConfig.head_tail(10, 40),
417
  CompactionConfig.none(),
418
  ],
 
419
  })
420
+ return await strategy.generate(base, budget), None
421
 
422
 
423
  def _load_yaml_strategy(path: Path) -> Any | None:
 
455
  console.print("[red]Error:[/] GEPA optimizer not available.")
456
  console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
457
  raise typer.Exit(1)
458
+ elif strategy_type == "llm_rewriter":
459
+ from flow.experiments.strategies.llm_rewriter import LLMRewriterStrategy
460
+ return LLMRewriterStrategy(config=strategy_config)
461
  else:
462
  console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
463
+ console.print("[dim]Supported: gepa, llm_rewriter[/]")
464
  raise typer.Exit(1)
465
 
466
 
 
542
  use_llm_eval: bool,
543
  budget: int,
544
  ) -> None:
545
+ """Run an active optimization strategy.
546
+
547
+ For strategies that use the ExperimentRunner protocol (LLMRewriterStrategy),
548
+ delegates to FlowOptimizer.optimize_with_strategy() which handles setup,
549
+ evaluation, Pareto analysis, and export.
550
+
551
+ For GEPA (which uses its own evaluator callback), uses the legacy path
552
+ with a bridging evaluator function.
553
+ """
554
+ # Check if strategy uses GEPA's evaluator pattern (legacy path)
555
+ is_gepa = hasattr(strategy, "evaluator")
556
+
557
+ if is_gepa:
558
+ await _run_gepa_strategy(strategy, base_agent, tasks, output_dir, parallel, use_llm_eval, budget)
559
+ else:
560
+ # Modern path: use optimize_with_strategy which passes self as runner
561
+ optimizer = FlowOptimizer(
562
+ parallel=parallel,
563
+ use_llm_evaluator=use_llm_eval,
564
+ output_dir=output_dir,
565
+ )
566
+ result = await optimizer.optimize_with_strategy(
567
+ strategy=strategy,
568
+ base=base_agent,
569
+ tasks=tasks,
570
+ budget=budget,
571
+ )
572
+
573
+ console.print("\n[bold green]Optimization complete![/]")
574
+ console.print(f"\nBest agents exported to: [cyan]{result.output_dir / 'agents'}[/]")
575
+
576
+
577
+ async def _run_gepa_strategy(
578
+ strategy: Any,
579
+ base_agent: Agent,
580
+ tasks: list[Task],
581
+ output_dir: Path | None,
582
+ parallel: int,
583
+ use_llm_eval: bool,
584
+ budget: int,
585
+ ) -> None:
586
+ """Run GEPA strategy with its custom evaluator callback bridge."""
587
  logger = logging.getLogger(__name__)
588
+
589
+ import threading
590
+ from datetime import datetime
591
+
592
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
593
+ if output_dir is None:
594
+ base_output_dir = Path.home() / ".flow" / "optimizations"
595
+ else:
596
+ base_output_dir = output_dir
597
+ run_dir = base_output_dir / f"gepa_{timestamp}"
598
+ run_dir.mkdir(parents=True, exist_ok=True)
599
+
600
+ eval_counter = 0
601
+ counter_lock = threading.Lock()
602
+
603
  optimizer_runner = FlowOptimizer(
604
  parallel=parallel,
605
  use_llm_evaluator=use_llm_eval,
606
+ output_dir=run_dir,
607
  )
608
 
 
609
  main_loop = asyncio.get_running_loop()
610
 
 
611
  def evaluator(candidate: Candidate, minibatch: list[Task] | None = None) -> ExperimentResult:
612
+ """Evaluate a candidate on a minibatch of tasks (GEPA bridge)."""
613
+ nonlocal eval_counter
614
+
615
  eval_tasks = minibatch if minibatch else tasks
616
+ candidate_id = candidate.mutations.get("_candidate_id", "unknown")
617
+
618
+ with counter_lock:
619
+ rollout_num = eval_counter
620
+ eval_counter += 1
621
+
622
+ rollout_dir = run_dir / f"rollout_{rollout_num}_{candidate_id}"
623
+ logger.debug(f"[EVALUATOR] Evaluating {candidate_id} on {len(eval_tasks)} tasks (rollout {rollout_num})")
624
+
625
  try:
 
 
 
626
  future = asyncio.run_coroutine_threadsafe(
627
+ optimizer_runner.optimize([candidate], eval_tasks, run_dir=rollout_dir),
628
  main_loop
629
  )
630
  optimization_result = future.result()
631
+
 
632
  if not optimization_result.summaries:
 
 
633
  return ExperimentResult(
634
+ candidate=candidate, run_result=None,
635
+ metrics={"score": 0.0}, eval_score=0.0,
636
+ eval_passed=False, eval_reasoning="No results", traces={}
 
 
 
 
637
  )
638
+
639
  summary = optimization_result.summaries[0]
640
+
 
 
 
 
 
 
 
 
 
 
641
  if summary.task_results:
642
+ total_tokens = sum(tr.metrics.total_tokens for tr in summary.task_results)
643
+ avg_duration = sum(
644
+ tr.run_result.duration_seconds for tr in summary.task_results if tr.run_result
645
+ ) / max(len(summary.task_results), 1)
646
+ combined_reasoning = "\n".join(
647
+ f"Task {tr.task_name}: {tr.eval_reasoning}" for tr in summary.task_results
648
+ )
649
  return ExperimentResult(
650
  candidate=candidate,
651
+ run_result=summary.task_results[0].run_result,
652
+ metrics={"total_tokens": total_tokens, "avg_duration": avg_duration,
653
+ "pass_rate": summary.pass_rate, "num_tasks": len(summary.task_results)},
654
+ eval_score=summary.avg_score,
655
+ eval_passed=summary.pass_rate > 0.5,
656
+ eval_reasoning=combined_reasoning,
657
+ traces=summary.task_results[0].run_result.trace if summary.task_results[0].run_result else {},
658
  )
659
+
 
660
  return ExperimentResult(
661
+ candidate=candidate, run_result=None,
662
+ metrics={"score": summary.avg_score}, eval_score=summary.avg_score,
 
 
663
  eval_passed=summary.pass_rate > 0.5,
664
+ eval_reasoning=f"Aggregate pass rate: {summary.pass_rate}", traces={}
 
665
  )
666
+
667
  except Exception as e:
668
  logger.error(f"Error evaluating candidate '{candidate.agent.name}': {e}", exc_info=True)
 
669
  return ExperimentResult(
670
+ candidate=candidate, run_result=None,
671
+ metrics={"score": 0.0, "error": str(e)}, eval_score=0.0,
672
+ eval_passed=False, eval_reasoning=f"Evaluation error: {e!s}", traces={}
 
 
 
 
673
  )
674
 
675
+ # Inject GEPA-specific dependencies
676
+ if strategy.evaluator is None:
 
 
 
677
  strategy.evaluator = evaluator
678
  if hasattr(strategy, "dataset") and strategy.dataset is None:
679
  strategy.dataset = tasks
680
+
681
+ candidates = await strategy.generate(base_agent, budget, tasks=tasks, runner=None)
682
+
683
+ if hasattr(strategy, "print_report"):
684
+ strategy.print_report()
 
685
 
686
  console.print("\n[bold green]Optimization complete![/]")
687
  console.print(f"Generated {len(candidates)} candidates.")
688
 
689
+ output_path = output_dir if output_dir else run_dir
690
+ if output_path:
691
+ import json
692
+
693
  from flow.experiments.models import export_agent
694
+
695
+ output_path.mkdir(parents=True, exist_ok=True)
696
+ (output_path / "agents").mkdir(exist_ok=True)
697
+
698
  for i, cand in enumerate(candidates):
 
699
  name = cand.agent.name or f"candidate_{i}"
700
+ export_agent(cand.agent, output_path / "agents" / f"{name}.yaml", metrics={"rationale": cand.rationale})
701
+
702
+ if hasattr(strategy, "get_report") and strategy.get_report():
703
+ report = strategy.get_report()
704
+ report_data = {
705
+ "baseline_prompt": report.baseline_prompt,
706
+ "baseline_score": report.baseline_score,
707
+ "final_prompt": report.final_prompt,
708
+ "final_score": report.final_score,
709
+ "best_candidate_id": report.best_candidate_id,
710
+ "improvement": report.improvement,
711
+ "improvement_percent": (report.improvement / max(report.baseline_score, 0.001)) * 100,
712
+ "total_candidates_evaluated": report.total_candidates_evaluated,
713
+ "total_generations": report.total_generations,
714
+ "candidate_history": [
715
+ {
716
+ "generation": r.generation,
717
+ "candidate_id": r.candidate_id,
718
+ "avg_score": r.avg_score,
719
+ "best_score": r.best_score,
720
+ "best_eval_num": r.best_eval_num,
721
+ "eval_count": r.eval_count,
722
+ "pass_rate": r.pass_rate,
723
+ "is_selected": r.is_selected,
724
+ "instructions_preview": r.instructions_preview,
725
+ }
726
+ for r in report.candidate_history
727
+ ]
728
+ }
729
+ with open(output_path / "optimization_report.json", "w") as f:
730
+ json.dump(report_data, f, indent=2)
731
+ console.print(f"Optimization report saved to: [cyan]{output_path / 'optimization_report.json'}[/]")
732
+
733
+ console.print(f"\nAgents exported to: [cyan]{output_path / 'agents'}[/]")
734
 
src/flow/cli/repl.py CHANGED
@@ -47,7 +47,9 @@ class FlowREPL:
47
  """Get or create the harness instance."""
48
  if self._harness is None:
49
  # Import maf module to register the harness, then use registry
50
- import flow.harness.maf # noqa: F401
 
 
51
  from flow.harness import create_harness
52
 
53
  agent = Agent(name="flow-repl")
 
47
  """Get or create the harness instance."""
48
  if self._harness is None:
49
  # Import maf module to register the harness, then use registry
50
+ import flow.harness.maf as _maf
51
+
52
+ _ = _maf
53
  from flow.harness import create_harness
54
 
55
  agent = Agent(name="flow-repl")
src/flow/experiments/__init__.py CHANGED
@@ -27,7 +27,7 @@ Example usage:
27
  strategy = GridSearchStrategy(variations={
28
  "enable_memory": [True, False],
29
  })
30
- candidates = strategy.generate(base, budget=10)
31
 
32
  # Run optimization
33
  optimizer = FlowOptimizer(parallel=4)
@@ -37,18 +37,6 @@ Example usage:
37
  """
38
 
39
  # Core models
40
- from .models import (
41
- Agent,
42
- Candidate,
43
- CandidateStrategy,
44
- CompactionConfig,
45
- ExperimentResult,
46
- GridSearchStrategy,
47
- export_agent,
48
- export_optimization_results,
49
- load_agent,
50
- )
51
-
52
  # Experiment runner + Pareto analysis
53
  from .ablation import (
54
  compute_pareto_frontier,
@@ -66,6 +54,16 @@ from .evaluators import (
66
  TraceEvaluator,
67
  )
68
 
 
 
 
 
 
 
 
 
 
 
69
  # Metrics
70
  from .metrics import (
71
  LLMCallInfo,
@@ -75,6 +73,26 @@ from .metrics import (
75
  format_metrics_summary,
76
  metrics_to_dict,
77
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # Optimizer
80
  from .optimizer import (
@@ -82,6 +100,7 @@ from .optimizer import (
82
  FlowOptimizer,
83
  OptimizationResult,
84
  TaskResult,
 
85
  load_tasks_from_jsonl,
86
  )
87
 
@@ -96,11 +115,24 @@ from .reporters import (
96
  )
97
 
98
  # Runner
99
- from .runner import FlowExperimentRunner, setup_tracing
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  # Trace collection
102
  from .trace_collector import FlowTraceCollector
103
- from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task
104
 
105
  __all__ = [ # noqa: RUF022 # Intentionally grouped by category
106
  # Core models
@@ -108,11 +140,27 @@ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
108
  "Candidate",
109
  "CandidateStrategy",
110
  "CompactionConfig",
 
111
  "ExperimentResult",
 
 
112
  "GridSearchStrategy",
 
 
 
 
 
113
  "export_agent",
114
  "load_agent",
 
115
  "export_optimization_results",
 
 
 
 
 
 
 
116
  # Types
117
  "Task",
118
  "EvalCriterion",
@@ -130,6 +178,7 @@ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
130
  "metrics_to_dict",
131
  # Runner
132
  "FlowExperimentRunner",
 
133
  "setup_tracing",
134
  # Evaluators
135
  "Evaluator",
@@ -154,5 +203,20 @@ __all__ = [ # noqa: RUF022 # Intentionally grouped by category
154
  "OptimizationResult",
155
  "CandidateSummary",
156
  "TaskResult",
 
157
  "load_tasks_from_jsonl",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  ]
 
27
  strategy = GridSearchStrategy(variations={
28
  "enable_memory": [True, False],
29
  })
30
+ candidates = await strategy.generate(base, budget=10)
31
 
32
  # Run optimization
33
  optimizer = FlowOptimizer(parallel=4)
 
37
  """
38
 
39
  # Core models
 
 
 
 
 
 
 
 
 
 
 
 
40
  # Experiment runner + Pareto analysis
41
  from .ablation import (
42
  compute_pareto_frontier,
 
54
  TraceEvaluator,
55
  )
56
 
57
+ # Expansion pipeline
58
+ from .expansion import expand_variations, generate_candidates
59
+
60
+ # HF Dataset Integration
61
+ from .hf_datasets import (
62
+ import_hf_dataset,
63
+ register_converter,
64
+ save_tasks_to_jsonl,
65
+ )
66
+
67
  # Metrics
68
  from .metrics import (
69
  LLMCallInfo,
 
73
  format_metrics_summary,
74
  metrics_to_dict,
75
  )
76
+ from .models import (
77
+ Agent,
78
+ Candidate,
79
+ CandidateStrategy,
80
+ CompactionConfig,
81
+ Experiment,
82
+ ExperimentResult,
83
+ ExperimentRunner,
84
+ Framework,
85
+ GridSearchStrategy,
86
+ LiteralVariation,
87
+ StrategyIteration,
88
+ StrategyVariation,
89
+ VariationItem,
90
+ compute_max_experiments,
91
+ export_agent,
92
+ export_optimization_results,
93
+ load_agent,
94
+ load_experiment,
95
+ )
96
 
97
  # Optimizer
98
  from .optimizer import (
 
100
  FlowOptimizer,
101
  OptimizationResult,
102
  TaskResult,
103
+ evaluate_agent,
104
  load_tasks_from_jsonl,
105
  )
106
 
 
115
  )
116
 
117
  # Runner
118
+ from .runner import FlowExperimentRunner, get_shared_collector, setup_tracing
119
+
120
+ # Strategy registry
121
+ from .strategies import get_registered_strategies, get_strategy, register_strategy
122
+
123
+ # Presets
124
+ from .presets import AgentPreset, get_all_presets, get_preset
125
+
126
+ # Results (simple API)
127
+ from .results import (
128
+ AgentOptimizationResult,
129
+ EvaluationResult,
130
+ ImprovementMetrics,
131
+ )
132
 
133
  # Trace collection
134
  from .trace_collector import FlowTraceCollector
135
+ from .types import CriterionResult, EvalCriterion, EvalResult, RunResult, Task, get_task_suite
136
 
137
  __all__ = [ # noqa: RUF022 # Intentionally grouped by category
138
  # Core models
 
140
  "Candidate",
141
  "CandidateStrategy",
142
  "CompactionConfig",
143
+ "Experiment",
144
  "ExperimentResult",
145
+ "ExperimentRunner",
146
+ "Framework",
147
  "GridSearchStrategy",
148
+ "LiteralVariation",
149
+ "StrategyIteration",
150
+ "StrategyVariation",
151
+ "VariationItem",
152
+ "compute_max_experiments",
153
  "export_agent",
154
  "load_agent",
155
+ "load_experiment",
156
  "export_optimization_results",
157
+ # Expansion pipeline
158
+ "expand_variations",
159
+ "generate_candidates",
160
+ # Strategy registry
161
+ "get_strategy",
162
+ "register_strategy",
163
+ "get_registered_strategies",
164
  # Types
165
  "Task",
166
  "EvalCriterion",
 
178
  "metrics_to_dict",
179
  # Runner
180
  "FlowExperimentRunner",
181
+ "get_shared_collector",
182
  "setup_tracing",
183
  # Evaluators
184
  "Evaluator",
 
203
  "OptimizationResult",
204
  "CandidateSummary",
205
  "TaskResult",
206
+ "evaluate_agent",
207
  "load_tasks_from_jsonl",
208
+ # Presets
209
+ "AgentPreset",
210
+ "get_preset",
211
+ "get_all_presets",
212
+ # Results (simple API)
213
+ "EvaluationResult",
214
+ "AgentOptimizationResult",
215
+ "ImprovementMetrics",
216
+ # Task suites
217
+ "get_task_suite",
218
+ # HF Datasets
219
+ "import_hf_dataset",
220
+ "register_converter",
221
+ "save_tasks_to_jsonl",
222
  ]
src/flow/experiments/ablation.py CHANGED
@@ -46,9 +46,13 @@ async def run_single_experiment(
46
  ExperimentResult with metrics and evaluation
47
  """
48
  # Import harness modules to register them, then use registry
49
- import flow.harness.maf # noqa: F401
 
 
50
  try:
51
- import flow.harness.miniagent # noqa: F401
 
 
52
  except ImportError:
53
  pass # miniagent harness is optional
54
  from flow.harness import create_harness
 
46
  ExperimentResult with metrics and evaluation
47
  """
48
  # Import harness modules to register them, then use registry
49
+ import flow.harness.maf as _maf
50
+
51
+ _ = _maf
52
  try:
53
+ import flow.harness.miniagent as _miniagent
54
+
55
+ _ = _miniagent
56
  except ImportError:
57
  pass # miniagent harness is optional
58
  from flow.harness import create_harness
src/flow/experiments/agent_api.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Implementation of Agent.evaluate() and Agent.optimize() methods.
4
+
5
+ This module contains the implementation details, keeping the Agent class
6
+ itself clean and focused on configuration.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import contextlib
12
+ import io
13
+ import sys
14
+ from pathlib import Path
15
+ from typing import TYPE_CHECKING, Any
16
+
17
+ from .models import Candidate, CompactionConfig, GridSearchStrategy
18
+ from .optimizer import FlowOptimizer, OptimizationResult, evaluate_agent
19
+ from .results import AgentOptimizationResult, EvaluationResult, ImprovementMetrics
20
+ from .types import Task, get_task_suite, load_tasks_from_jsonl
21
+
22
+ if TYPE_CHECKING:
23
+ from .models import Agent
24
+ from .optimizer import CandidateSummary
25
+
26
+
27
+ # Default variations for optimize() when none provided
28
+ DEFAULT_VARIATIONS: dict[str, list[Any]] = {
29
+ "tools": ["minimal", "standard"],
30
+ "compaction": [
31
+ CompactionConfig.none(),
32
+ CompactionConfig.head_tail(10, 40),
33
+ CompactionConfig.sliding_window(100_000),
34
+ ],
35
+ }
36
+
37
+ # Known active strategy names and their classes
38
+ _STRATEGY_MAP: dict[str, str] = {
39
+ "tools": "flow.experiments.strategies.tool_selector.ToolSelectorStrategy",
40
+ "instructions": "flow.experiments.strategies.llm_rewriter.LLMRewriterStrategy",
41
+ }
42
+
43
+
44
+ def _resolve_tasks(tasks: str | list[Task] | Path) -> list[Task]:
45
+ """Resolve tasks specification to list of Task objects.
46
+
47
+ Args:
48
+ tasks: One of:
49
+ - str: Suite name (e.g., "quick", "coding", "gaia_level1")
50
+ - list[Task]: Already resolved tasks
51
+ - Path: Path to JSONL file
52
+
53
+ Returns:
54
+ List of Task objects
55
+ """
56
+ if isinstance(tasks, str):
57
+ return get_task_suite(tasks)
58
+ elif isinstance(tasks, Path):
59
+ return load_tasks_from_jsonl(tasks)
60
+ else:
61
+ return tasks
62
+
63
+
64
+ def _summary_to_eval_result(summary: CandidateSummary) -> EvaluationResult:
65
+ """Convert internal CandidateSummary to user-friendly EvaluationResult."""
66
+ return EvaluationResult(
67
+ score=summary.avg_score,
68
+ tokens=summary.total_tokens,
69
+ pass_rate=summary.pass_rate,
70
+ duration=summary.avg_duration * summary.task_count,
71
+ task_count=summary.task_count,
72
+ _details=summary,
73
+ )
74
+
75
+
76
+ @contextlib.contextmanager
77
+ def _suppress_output():
78
+ """Context manager to suppress stdout/stderr."""
79
+ old_stdout, old_stderr = sys.stdout, sys.stderr
80
+ sys.stdout = io.StringIO()
81
+ sys.stderr = io.StringIO()
82
+ try:
83
+ yield
84
+ finally:
85
+ sys.stdout, sys.stderr = old_stdout, old_stderr
86
+
87
+
88
+ async def _evaluate_agent_impl(
89
+ agent: Agent,
90
+ tasks: str | list[Task] | Path,
91
+ parallel: int,
92
+ use_llm_eval: bool,
93
+ quiet: bool,
94
+ agent_id: str | None = None,
95
+ ) -> EvaluationResult:
96
+ """Implementation of Agent.evaluate().
97
+
98
+ Args:
99
+ agent_id: If set (from deploy()), results are auto-persisted to DB.
100
+ """
101
+ resolved_tasks = _resolve_tasks(tasks)
102
+
103
+ if quiet:
104
+ with _suppress_output():
105
+ summary = await evaluate_agent(
106
+ agent,
107
+ resolved_tasks,
108
+ parallel=parallel,
109
+ use_llm_evaluator=use_llm_eval,
110
+ )
111
+ else:
112
+ summary = await evaluate_agent(
113
+ agent,
114
+ resolved_tasks,
115
+ parallel=parallel,
116
+ use_llm_evaluator=use_llm_eval,
117
+ )
118
+
119
+ result = _summary_to_eval_result(summary)
120
+
121
+ # Auto-persist if agent was deployed
122
+ if agent_id is not None:
123
+ try:
124
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
125
+
126
+ adapter = PersistenceAdapter()
127
+ result.job_id = await adapter.persist_evaluation(summary, agent_id)
128
+ except ImportError:
129
+ pass # DB not available, skip persistence
130
+
131
+ return result
132
+
133
+
134
+ def _resolve_strategy(name: str) -> Any:
135
+ """Import and instantiate a named strategy.
136
+
137
+ Args:
138
+ name: Strategy name ("tools", "instructions")
139
+
140
+ Returns:
141
+ Strategy instance
142
+
143
+ Raises:
144
+ ValueError: If name is not a known strategy
145
+ """
146
+ if name not in _STRATEGY_MAP:
147
+ available = ["grid"] + list(_STRATEGY_MAP.keys())
148
+ raise ValueError(f"Unknown strategy: {name!r}. Available: {available}")
149
+
150
+ module_path, class_name = _STRATEGY_MAP[name].rsplit(".", 1)
151
+ import importlib
152
+ mod = importlib.import_module(module_path)
153
+ cls = getattr(mod, class_name)
154
+ return cls(config={
155
+ "max_iterations": 3,
156
+ "min_improvement": 0.01,
157
+ })
158
+
159
+
160
+ def _opt_result_to_agent_result(
161
+ opt_result: OptimizationResult,
162
+ baseline_agent: Agent,
163
+ ) -> AgentOptimizationResult:
164
+ """Convert internal OptimizationResult to user-friendly AgentOptimizationResult."""
165
+ # Find baseline: look for the original agent name with no mutations, else first summary
166
+ baseline_summary = next(
167
+ (s for s in opt_result.summaries if s.name == baseline_agent.name and s.candidate.mutations == {}),
168
+ None,
169
+ )
170
+ best_summary = opt_result.get_best_candidate("score") or opt_result.summaries[0]
171
+
172
+ if baseline_summary is not None:
173
+ baseline_result = _summary_to_eval_result(baseline_summary)
174
+ else:
175
+ # For active strategies, the baseline is the first iteration in optimization_history
176
+ history = best_summary.candidate.optimization_history
177
+ if history:
178
+ baseline_result = EvaluationResult(
179
+ score=history[0].avg_score,
180
+ tokens=0,
181
+ pass_rate=history[0].pass_rate,
182
+ duration=0.0,
183
+ task_count=best_summary.task_count or len(history[0].change_description),
184
+ )
185
+ else:
186
+ baseline_result = _summary_to_eval_result(best_summary)
187
+
188
+ best_result = _summary_to_eval_result(best_summary)
189
+
190
+ score_delta = best_result.score - baseline_result.score
191
+ token_reduction_pct = 0.0
192
+ if baseline_result.tokens > 0:
193
+ token_reduction_pct = (baseline_result.tokens - best_result.tokens) / baseline_result.tokens * 100
194
+
195
+ return AgentOptimizationResult(
196
+ baseline=baseline_result,
197
+ best=best_result,
198
+ improvement=ImprovementMetrics(
199
+ score_delta=score_delta,
200
+ token_reduction_pct=token_reduction_pct,
201
+ ),
202
+ best_agent=best_summary.candidate.agent,
203
+ candidates_tested=len(opt_result.summaries),
204
+ pareto_frontier=opt_result.pareto_frontier,
205
+ output_dir=opt_result.output_dir,
206
+ )
207
+
208
+
209
+ async def _optimize_agent_impl(
210
+ agent: Agent,
211
+ tasks: str | list[Task] | Path,
212
+ variations: dict[str, list[Any]] | None,
213
+ parallel: int,
214
+ budget: int,
215
+ use_llm_eval: bool,
216
+ quiet: bool,
217
+ agent_id: str | None = None,
218
+ strategy: str | list[str] | None = None,
219
+ ) -> AgentOptimizationResult:
220
+ """Implementation of Agent.optimize().
221
+
222
+ Args:
223
+ agent_id: If set (from deploy()), results are auto-persisted to DB.
224
+ strategy: Active optimization strategy name(s). None or "grid" uses
225
+ grid search. A string like "tools" or "instructions" runs that
226
+ strategy. A list runs them sequentially, each starting from the
227
+ previous best.
228
+ """
229
+ resolved_tasks = _resolve_tasks(tasks)
230
+
231
+ # Normalize strategy to a list (or None for grid search)
232
+ if strategy is None or strategy == "grid":
233
+ strategy_list = None
234
+ elif isinstance(strategy, str):
235
+ strategy_list = [strategy]
236
+ else:
237
+ strategy_list = list(strategy)
238
+
239
+ # ── Grid search path (original behavior) ──
240
+ if strategy_list is None:
241
+ actual_variations = variations if variations is not None else DEFAULT_VARIATIONS
242
+
243
+ grid_strategy = GridSearchStrategy(variations=actual_variations)
244
+ candidates = await grid_strategy.generate(agent, budget=budget)
245
+
246
+ baseline_candidate = Candidate(agent=agent, mutations={}, rationale="baseline")
247
+ has_baseline = any(c.agent.name == agent.name and c.mutations == {} for c in candidates)
248
+ if not has_baseline:
249
+ candidates.insert(0, baseline_candidate)
250
+
251
+ optimizer = FlowOptimizer(parallel=parallel, use_llm_evaluator=use_llm_eval)
252
+
253
+ if quiet:
254
+ with _suppress_output():
255
+ opt_result = await optimizer.optimize(candidates, resolved_tasks)
256
+ else:
257
+ opt_result = await optimizer.optimize(candidates, resolved_tasks)
258
+
259
+ result = _opt_result_to_agent_result(opt_result, agent)
260
+
261
+ # ── Active strategy path ──
262
+ else:
263
+ current_agent = agent
264
+ last_opt_result: OptimizationResult | None = None
265
+
266
+ for strat_name in strategy_list:
267
+ strat_instance = _resolve_strategy(strat_name)
268
+ optimizer = FlowOptimizer(parallel=parallel, use_llm_evaluator=use_llm_eval)
269
+
270
+ if quiet:
271
+ with _suppress_output():
272
+ last_opt_result = await optimizer.optimize_with_strategy(
273
+ strategy=strat_instance,
274
+ base=current_agent,
275
+ tasks=resolved_tasks,
276
+ budget=budget,
277
+ )
278
+ else:
279
+ last_opt_result = await optimizer.optimize_with_strategy(
280
+ strategy=strat_instance,
281
+ base=current_agent,
282
+ tasks=resolved_tasks,
283
+ budget=budget,
284
+ )
285
+
286
+ # Next stage starts from the best agent found
287
+ best = last_opt_result.get_best_candidate("score")
288
+ if best:
289
+ current_agent = best.candidate.agent
290
+
291
+ assert last_opt_result is not None
292
+ result = _opt_result_to_agent_result(last_opt_result, agent)
293
+
294
+ # Auto-persist if agent was deployed
295
+ if agent_id is not None:
296
+ try:
297
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
298
+
299
+ adapter = PersistenceAdapter()
300
+ opt_to_persist = opt_result if strategy_list is None else last_opt_result
301
+ result.job_id = await adapter.persist_optimization(opt_to_persist, agent_id)
302
+ except ImportError:
303
+ pass # DB not available, skip persistence
304
+
305
+ return result
src/flow/experiments/evaluators/heuristic.py CHANGED
@@ -73,7 +73,7 @@ class HeuristicEvaluator:
73
 
74
  # Check if agent reported task complete
75
  output_lower = run_result.output.lower()
76
- if "complete" in output_lower or "complete" in output_lower or "finished" in output_lower:
77
  criteria_results.append(
78
  CriterionResult(
79
  name="task_completed",
 
73
 
74
  # Check if agent reported task complete
75
  output_lower = run_result.output.lower()
76
+ if "complete" in output_lower or "finished" in output_lower:
77
  criteria_results.append(
78
  CriterionResult(
79
  name="task_completed",
src/flow/experiments/evaluators/llm.py CHANGED
@@ -11,6 +11,21 @@ from ..types import CriterionResult, EvalResult, RunResult
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  class LLMEvaluator:
16
  """Evaluator that uses an LLM to assess agent output against criteria.
@@ -39,6 +54,7 @@ class LLMEvaluator:
39
  model_name: str = "gpt-4o",
40
  passing_threshold: float = 0.7,
41
  temperature: float | None = None,
 
42
  ) -> None:
43
  """Initialize the LLM evaluator.
44
 
@@ -50,13 +66,56 @@ class LLMEvaluator:
50
  temperature: Temperature for LLM calls. None means don't specify
51
  (use model default). Some models like gpt-5.2-chat
52
  only support temperature=1.0.
 
 
 
 
53
  """
54
  self.model_client = model_client
55
  self.model_name = model_name
56
  self.passing_threshold = passing_threshold
57
  self.temperature = temperature
58
 
59
- def _get_evaluation_prompt(self, run_result: RunResult) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  """Build the evaluation prompt for the LLM."""
61
  criteria_text = "\n".join(
62
  f"- **{c.name}** (weight: {c.weight}): {c.instruction}"
@@ -66,6 +125,8 @@ class LLMEvaluator:
66
  # Extract execution trace summary for research/multi-step tasks
67
  trace_summary = self._get_trace_summary(run_result)
68
 
 
 
69
  return f"""You are an expert evaluator assessing an AI agent's output.
70
 
71
  ## Task
@@ -73,15 +134,18 @@ The agent was given this task:
73
  ```
74
  {run_result.task.prompt}
75
  ```
76
-
77
  ## Agent Output
78
  ```
79
- {run_result.output[:8000]}
80
  ```
81
 
82
  ## Files Created
83
  {json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}
84
 
 
 
 
85
  ## Execution Trace
86
  {trace_summary}
87
 
@@ -95,27 +159,61 @@ The agent was given this task:
95
  Evaluate the agent's output against each criterion. Consider both the final output AND the execution
96
  trace (tools used, steps taken) when assessing correctness.
97
 
98
- For each criterion:
99
- 1. Assess how well the output meets the criterion (0.0 to 1.0)
100
- 2. Determine if it passes (score >= 0.7)
101
- 3. Provide brief reasoning
102
-
103
- Respond in this exact JSON format:
104
- ```json
105
- {{
106
- "criteria_results": [
107
- {{
108
- "name": "criterion_name",
109
- "score": 0.85,
110
- "passed": true,
111
- "reasoning": "Brief explanation"
112
- }}
113
- ],
114
- "overall_reasoning": "Summary of the overall evaluation"
115
- }}
116
- ```
117
  """
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  def _get_trace_summary(self, run_result: RunResult) -> str:
120
  """Extract a summary of the execution trace for evaluation."""
121
  if not run_result.trace:
@@ -137,11 +235,12 @@ Total tool calls: {metrics.tool_call_count}
137
  {tool_summary}
138
  Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {metrics.output_tokens})"""
139
 
140
- async def evaluate(self, run_result: RunResult) -> EvalResult:
141
  """Evaluate the agent's output using an LLM.
142
 
143
  Args:
144
  run_result: The result from running an agent on a task
 
145
 
146
  Returns:
147
  EvalResult with LLM-generated scores and reasoning
@@ -158,45 +257,44 @@ Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {met
158
  ),
159
  )
160
 
161
- prompt = self._get_evaluation_prompt(run_result)
162
 
163
  try:
164
- # Build params - only include temperature if explicitly set
165
  params: dict[str, Any] = {
166
  "model": self.model_name,
167
  "messages": [
168
  {
169
  "role": "system",
170
- "content": "You are an expert evaluator. Respond only with valid JSON.",
171
  },
172
  {"role": "user", "content": prompt},
173
  ],
 
 
 
 
174
  }
175
  if self.temperature is not None:
176
  params["temperature"] = self.temperature
177
 
178
  response = await self.model_client.chat.completions.create(**params)
179
 
180
- # Extract the response text
181
- response_text = response.choices[0].message.content or ""
182
-
183
- # Parse JSON from response
184
- json_start = response_text.find("{")
185
- json_end = response_text.rfind("}") + 1
186
- if json_start >= 0 and json_end > json_start:
187
- eval_data = json.loads(response_text[json_start:json_end])
188
- else:
189
- raise ValueError("No JSON found in response")
190
 
191
  # Build criterion results
192
  criteria_results = []
193
  total_weighted_score = 0.0
 
194
  total_weight = 0.0
195
 
196
  for cr_data in eval_data.get("criteria_results", []):
197
  cr = CriterionResult(
198
  name=cr_data.get("name", "unknown"),
199
  score=float(cr_data.get("score", 0.0)),
 
200
  passed=bool(cr_data.get("passed", False)),
201
  reasoning=cr_data.get("reasoning", ""),
202
  )
@@ -210,13 +308,16 @@ Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {met
210
  break
211
 
212
  total_weighted_score += cr.score * weight
 
213
  total_weight += weight
214
 
215
- # Calculate overall score
216
  overall_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
 
217
 
218
  return EvalResult(
219
  score=overall_score,
 
220
  passed=overall_score >= self.passing_threshold,
221
  criteria_results=criteria_results,
222
  reasoning=eval_data.get("overall_reasoning", ""),
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
14
+ # Presets for how agent output is formatted before sending to the LLM judge.
15
+ # Agent outputs can be very large (100K-600K+ chars for multi-step tasks).
16
+ # The final answer is almost always at the end, so "head_tail" (default)
17
+ # keeps both the initial approach and the final answer visible to the judge.
18
+ #
19
+ # Each preset specifies {"head": N, "tail": M} where N chars from the start
20
+ # and M chars from the end are kept. When truncation occurs, a marker like
21
+ # "... [150,000 chars truncated] ..." is inserted.
22
+ OUTPUT_FORMAT_PRESETS: dict[str, dict[str, int]] = {
23
+ "head_tail": {"head": 2000, "tail": 10000}, # Default: sees start + final answer
24
+ "head_only": {"head": 8000, "tail": 0}, # Legacy: first 8K only
25
+ "tail_only": {"head": 0, "tail": 12000}, # Only the final output
26
+ "full": {"head": 0, "tail": 0}, # No truncation (watch context limits)
27
+ }
28
+
29
 
30
  class LLMEvaluator:
31
  """Evaluator that uses an LLM to assess agent output against criteria.
 
54
  model_name: str = "gpt-4o",
55
  passing_threshold: float = 0.7,
56
  temperature: float | None = None,
57
+ output_format: str | dict[str, int] = "head_tail",
58
  ) -> None:
59
  """Initialize the LLM evaluator.
60
 
 
66
  temperature: Temperature for LLM calls. None means don't specify
67
  (use model default). Some models like gpt-5.2-chat
68
  only support temperature=1.0.
69
+ output_format: How to format agent output for the judge. Either a
70
+ preset name ("head_tail", "head_only", "tail_only", "full")
71
+ or a dict with "head" and "tail" char counts.
72
+ See OUTPUT_FORMAT_PRESETS for details.
73
  """
74
  self.model_client = model_client
75
  self.model_name = model_name
76
  self.passing_threshold = passing_threshold
77
  self.temperature = temperature
78
 
79
+ # Resolve output format
80
+ if isinstance(output_format, str):
81
+ if output_format not in OUTPUT_FORMAT_PRESETS:
82
+ raise ValueError(
83
+ f"Unknown output_format '{output_format}'. "
84
+ f"Available: {list(OUTPUT_FORMAT_PRESETS.keys())}"
85
+ )
86
+ fmt = OUTPUT_FORMAT_PRESETS[output_format]
87
+ else:
88
+ fmt = output_format
89
+ self._output_head = fmt["head"]
90
+ self._output_tail = fmt["tail"]
91
+
92
+ def _format_output(self, output: str) -> str:
93
+ """Format agent output for the evaluation prompt.
94
+
95
+ Uses head+tail truncation to ensure the judge sees both the initial
96
+ approach and the final answer. When output is truncated, a marker is
97
+ inserted showing how many characters were removed.
98
+
99
+ The strategy is configured via the output_format parameter on __init__.
100
+ """
101
+ head = self._output_head
102
+ tail = self._output_tail
103
+ budget = head + tail
104
+
105
+ # No truncation if budget is 0 (full mode) or output fits
106
+ if budget == 0 or len(output) <= budget:
107
+ return output
108
+
109
+ parts: list[str] = []
110
+ if head > 0:
111
+ parts.append(output[:head])
112
+ truncated = len(output) - budget
113
+ parts.append(f"\n\n... [{truncated:,} chars truncated] ...\n\n")
114
+ if tail > 0:
115
+ parts.append(output[-tail:])
116
+ return "".join(parts)
117
+
118
+ def _get_evaluation_prompt(self, run_result: RunResult, instructions: str | None = None) -> str:
119
  """Build the evaluation prompt for the LLM."""
120
  criteria_text = "\n".join(
121
  f"- **{c.name}** (weight: {c.weight}): {c.instruction}"
 
125
  # Extract execution trace summary for research/multi-step tasks
126
  trace_summary = self._get_trace_summary(run_result)
127
 
128
+ instructions_section = f"\n## Agent Instructions\n```\n{instructions}\n```\n" if instructions else ""
129
+
130
  return f"""You are an expert evaluator assessing an AI agent's output.
131
 
132
  ## Task
 
134
  ```
135
  {run_result.task.prompt}
136
  ```
137
+ {instructions_section}
138
  ## Agent Output
139
  ```
140
+ {self._format_output(run_result.output)}
141
  ```
142
 
143
  ## Files Created
144
  {json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}
145
 
146
+ ## Tool Results
147
+ {self._format_tool_results(run_result.tool_results)}
148
+
149
  ## Execution Trace
150
  {trace_summary}
151
 
 
159
  Evaluate the agent's output against each criterion. Consider both the final output AND the execution
160
  trace (tools used, steps taken) when assessing correctness.
161
 
162
+ For each criterion, provide TWO scores:
163
+ 1. **score** (0.0 or 1.0): Does the agent's final answer exactly match what's required? This is strict exact-match.
164
+ 2. **reasoning_score** (0.0 to 1.0): Did the agent demonstrate correct reasoning/methodology? Give partial credit for:
165
+ - Correct approach but wrong format (e.g., "17000" when "17" was expected)
166
+ - Correct methodology but wrong final number
167
+ - Identifying the right sources/data but making a calculation error
168
+ - Partial completion of a multi-part task
169
+ 3. **passed**: true if score >= 1.0 (exact match)
170
+ 4. Provide brief reasoning explaining both scores
 
 
 
 
 
 
 
 
 
 
171
  """
172
 
173
+ def _get_eval_schema(self) -> dict[str, Any]:
174
+ """Get JSON schema for structured evaluation output."""
175
+ return {
176
+ "name": "evaluation_result",
177
+ "strict": True,
178
+ "schema": {
179
+ "type": "object",
180
+ "properties": {
181
+ "criteria_results": {
182
+ "type": "array",
183
+ "items": {
184
+ "type": "object",
185
+ "properties": {
186
+ "name": {"type": "string"},
187
+ "score": {"type": "number"},
188
+ "reasoning_score": {"type": "number"},
189
+ "passed": {"type": "boolean"},
190
+ "reasoning": {"type": "string"},
191
+ },
192
+ "required": ["name", "score", "reasoning_score", "passed", "reasoning"],
193
+ "additionalProperties": False,
194
+ },
195
+ },
196
+ "overall_reasoning": {"type": "string"},
197
+ },
198
+ "required": ["criteria_results", "overall_reasoning"],
199
+ "additionalProperties": False,
200
+ },
201
+ }
202
+
203
+ def _format_tool_results(self, tool_results: list[dict[str, str]]) -> str:
204
+ """Format tool results for the evaluation prompt."""
205
+ if not tool_results:
206
+ return "None"
207
+ lines = []
208
+ for tr in tool_results:
209
+ tool = tr.get("tool", "unknown")
210
+ output = tr.get("output", "")
211
+ # Truncate long tool outputs
212
+ if len(output) > 500:
213
+ output = output[:500] + f"... [{len(output) - 500} chars truncated]"
214
+ lines.append(f"**{tool}**:\n```\n{output}\n```")
215
+ return "\n".join(lines)
216
+
217
  def _get_trace_summary(self, run_result: RunResult) -> str:
218
  """Extract a summary of the execution trace for evaluation."""
219
  if not run_result.trace:
 
235
  {tool_summary}
236
  Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {metrics.output_tokens})"""
237
 
238
+ async def evaluate(self, run_result: RunResult, instructions: str | None = None) -> EvalResult:
239
  """Evaluate the agent's output using an LLM.
240
 
241
  Args:
242
  run_result: The result from running an agent on a task
243
+ instructions: Optional instructions used by the agent during the run
244
 
245
  Returns:
246
  EvalResult with LLM-generated scores and reasoning
 
257
  ),
258
  )
259
 
260
+ prompt = self._get_evaluation_prompt(run_result, instructions=instructions)
261
 
262
  try:
263
+ # Build params with structured output
264
  params: dict[str, Any] = {
265
  "model": self.model_name,
266
  "messages": [
267
  {
268
  "role": "system",
269
+ "content": "You are an expert evaluator.",
270
  },
271
  {"role": "user", "content": prompt},
272
  ],
273
+ "response_format": {
274
+ "type": "json_schema",
275
+ "json_schema": self._get_eval_schema(),
276
+ },
277
  }
278
  if self.temperature is not None:
279
  params["temperature"] = self.temperature
280
 
281
  response = await self.model_client.chat.completions.create(**params)
282
 
283
+ # Extract and parse response - structured output guarantees valid JSON
284
+ response_text = response.choices[0].message.content or "{}"
285
+ eval_data = json.loads(response_text)
 
 
 
 
 
 
 
286
 
287
  # Build criterion results
288
  criteria_results = []
289
  total_weighted_score = 0.0
290
+ total_weighted_reasoning = 0.0
291
  total_weight = 0.0
292
 
293
  for cr_data in eval_data.get("criteria_results", []):
294
  cr = CriterionResult(
295
  name=cr_data.get("name", "unknown"),
296
  score=float(cr_data.get("score", 0.0)),
297
+ reasoning_score=float(cr_data.get("reasoning_score", 0.0)),
298
  passed=bool(cr_data.get("passed", False)),
299
  reasoning=cr_data.get("reasoning", ""),
300
  )
 
308
  break
309
 
310
  total_weighted_score += cr.score * weight
311
+ total_weighted_reasoning += cr.reasoning_score * weight
312
  total_weight += weight
313
 
314
+ # Calculate overall scores
315
  overall_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
316
+ overall_reasoning_score = total_weighted_reasoning / total_weight if total_weight > 0 else 0.0
317
 
318
  return EvalResult(
319
  score=overall_score,
320
+ reasoning_score=overall_reasoning_score,
321
  passed=overall_score >= self.passing_threshold,
322
  criteria_results=criteria_results,
323
  reasoning=eval_data.get("overall_reasoning", ""),
src/flow/experiments/expansion.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Variation expansion pipeline.
4
+
5
+ Expands experiment variations (literals + strategies) into concrete values,
6
+ then generates candidates via Cartesian product.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from dataclasses import asdict
13
+ from itertools import product as itertools_product
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ from .models import (
17
+ Agent,
18
+ Candidate,
19
+ CompactionConfig,
20
+ ExperimentRunner,
21
+ LiteralVariation,
22
+ StrategyVariation,
23
+ VariationItem,
24
+ )
25
+ from .strategies import get_strategy
26
+
27
+ if TYPE_CHECKING:
28
+ from .types import Task
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
async def expand_variations(
    variations: dict[str, list[VariationItem]],
    base: Agent,
    tasks: list[Task],
    runner: ExperimentRunner | None = None,
) -> dict[str, list[Any]]:
    """Expand all variations to concrete values.

    - LiteralVariation: value passes through directly
    - StrategyVariation: strategy.generate() is called to produce values

    Args:
        variations: Parsed variations from Experiment
        base: Base agent for strategies
        tasks: Tasks for active strategies
        runner: Optional ExperimentRunner for active strategies

    Returns:
        Dict mapping dimension names to lists of concrete values

    Raises:
        Exception: any error from a strategy's generate() call is logged
            and re-raised (strategies are not silently skipped).
    """
    expanded: dict[str, list[Any]] = {}

    for dimension, items in variations.items():
        expanded[dimension] = []
        logger.info(f"Expanding dimension '{dimension}' ({len(items)} items)")

        for item in items:
            if isinstance(item, LiteralVariation):
                # Literal: add directly
                expanded[dimension].append(item.value)
                logger.debug(f" Literal: {_format_value(item.value)}")

            elif isinstance(item, StrategyVariation):
                # Strategy: invoke and collect results
                logger.info(f" Running strategy '{item.strategy}' (max_candidates={item.max_candidates})")

                try:
                    strategy = get_strategy(item.strategy, item.config)

                    # Strategies may evaluate candidates actively, hence the
                    # optional runner and the task list.
                    candidates = await strategy.generate(
                        base=base,
                        budget=item.max_candidates,
                        tasks=tasks,
                        runner=runner,
                    )

                    # Extract mutated values for this dimension
                    for cand in candidates:
                        if dimension in cand.mutations:
                            value = cand.mutations[dimension]
                            expanded[dimension].append(value)
                            logger.debug(f" Strategy produced: {_format_value(value)}")
                        else:
                            # Strategy didn't mutate this dimension, use base value
                            # (candidates whose base value is None contribute nothing).
                            base_value = getattr(base, dimension, None)
                            if base_value is not None:
                                expanded[dimension].append(base_value)
                                logger.debug(f" Strategy kept base: {_format_value(base_value)}")

                    logger.info(f" Strategy '{item.strategy}' produced {len(candidates)} candidates")

                except Exception as e:
                    logger.error(f" Strategy '{item.strategy}' failed: {e}")
                    raise

        logger.info(f"Dimension '{dimension}': {len(expanded[dimension])} total values")

    return expanded
101
+
102
+
103
def generate_candidates(
    base: Agent,
    expanded: dict[str, list[Any]],
    budget: int = 1000,
) -> list[Candidate]:
    """Build candidates as the Cartesian product of expanded variation values.

    Args:
        base: Base agent to mutate.
        expanded: Mapping of dimension name -> concrete values for it.
        budget: Hard cap on how many candidates are produced (safety limit).

    Returns:
        List of Candidate objects; a single baseline candidate when there
        is nothing to vary or some dimension has no values.
    """
    if not expanded:
        return [Candidate(agent=base, mutations={}, rationale="baseline")]

    dimensions = list(expanded)
    value_lists = [expanded[name] for name in dimensions]

    # An empty dimension would make the whole product empty; fall back to baseline.
    for name, values in zip(dimensions, value_lists, strict=True):
        if not values:
            logger.warning(f"Dimension '{name}' has no values, using baseline")
            return [Candidate(agent=base, mutations={}, rationale="baseline (empty variations)")]

    candidates: list[Candidate] = []
    for combo in itertools_product(*value_lists):
        if len(candidates) >= budget:
            logger.warning(f"Reached budget limit ({budget}), stopping candidate generation")
            break
        mutation_map = dict(zip(dimensions, combo, strict=True))
        candidates.append(_create_candidate(base, mutation_map))

    logger.info(f"Generated {len(candidates)} candidates from {len(dimensions)} dimensions")
    return candidates
143
+
144
+
145
def _create_candidate(base: Agent, mutations: dict[str, Any]) -> Candidate:
    """Create a candidate by applying mutations to base agent.

    The base agent is flattened to a dict, mutated field-by-field, then
    rebuilt; "compaction" and "tools" get special handling because they
    are not plain scalar fields.

    Args:
        base: Base agent
        mutations: Dict of field name -> value

    Returns:
        New Candidate with mutated agent
    """
    # Build mutated agent dict
    agent_dict = asdict(base)

    for key, value in mutations.items():
        if key == "compaction" and isinstance(value, CompactionConfig):
            # Store as a plain dict here; rebuilt into a CompactionConfig below.
            agent_dict["compaction"] = asdict(value)
        elif key in agent_dict:
            agent_dict[key] = value
        # NOTE(review): mutation keys that are not Agent fields are silently
        # ignored here, though they still appear in the serialized mutations.

    # Reconstruct CompactionConfig from dict
    comp_data = agent_dict.pop("compaction")
    if isinstance(comp_data, dict):
        compaction = CompactionConfig(**comp_data)
    else:
        compaction = comp_data

    # Handle tools field - keep as-is (str, list, or dict)
    tools = agent_dict.pop("tools", "standard")

    # "compaction"/"tools" were popped above; the filter is defensive.
    mutated = Agent(
        **{k: v for k, v in agent_dict.items() if k not in ("compaction", "tools")},
        compaction=compaction,
        tools=tools,
    )

    # Build name from mutations
    name_parts = _build_name_parts(mutations)
    mutated.name = f"{base.name}_{'_'.join(name_parts)}" if name_parts else base.name

    # Serialize mutations for storage
    serializable_mutations = _serialize_mutations(mutations)

    return Candidate(
        agent=mutated,
        mutations=serializable_mutations,
        rationale=f"Variations: {', '.join(name_parts)}" if name_parts else "baseline",
    )
192
+
193
+
194
def _build_name_parts(mutations: dict[str, Any]) -> list[str]:
    """Derive human-readable name fragments from a mutation dict.

    Each mutated field contributes one (or for compaction, up to two)
    short tokens used to build the candidate's name.
    """
    parts: list[str] = []

    for field, value in mutations.items():
        if isinstance(value, CompactionConfig):
            parts.append(f"{value.strategy}")
            # head_tail carries its sizes in the name for easy comparison.
            if value.strategy == "head_tail":
                parts.append(f"h{value.head_size}_t{value.tail_size}")
        elif field == "tools":
            # Preset name for strings; element count for list/dict tool specs.
            suffix = value if isinstance(value, str) else f"[{len(value)}]"
            parts.append(f"tools={suffix}")
        elif field == "llm_config" and isinstance(value, dict):
            provider = value.get("provider", "unknown")
            model = value.get("model", "")
            parts.append(f"{provider}/{model}" if model else provider)
        elif field == "instructions":
            # Truncate instructions so names stay short and filesystem-safe.
            preview = str(value)[:30].replace(" ", "_").replace("\n", "_")
            parts.append(f"instr={preview}")
        elif isinstance(value, bool):
            parts.append(f"{field}={'on' if value else 'off'}")
        else:
            parts.append(f"{field}={value}")

    return parts
224
+
225
+
226
def _serialize_mutations(mutations: dict[str, Any]) -> dict[str, Any]:
    """Make mutations storage-friendly (CompactionConfig -> plain dict)."""
    return {
        key: asdict(value) if isinstance(value, CompactionConfig) else value
        for key, value in mutations.items()
    }
237
+
238
+
239
def _format_value(value: Any) -> str:
    """Render a value compactly for log messages."""
    if isinstance(value, CompactionConfig):
        return f"CompactionConfig({value.strategy})"
    if isinstance(value, str) and len(value) > 50:
        return f'"{value[:50]}..."'
    if isinstance(value, dict):
        return f"dict({len(value)} keys)"
    if isinstance(value, list):
        return f"list({len(value)} items)"
    return repr(value)
src/flow/experiments/gaia_converter.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from loguru import logger
6
+
7
+ from flow.experiments.types import EvalCriterion, Task
8
+ from flow.tools.text_inspector_qa import TextInspectorTool
9
+ from flow.tools.visual_inspector_qa import VisualInspectorTool
10
+
11
+
12
def _get_augmented_prompt_for_files(
    local_path: str,
    task: Task,
    visual_inspector_tool: VisualInspectorTool | None,
    text_inspector_tool: TextInspectorTool,
) -> str:
    """Build a prompt suffix describing the task's attached GAIA file(s).

    Reads ``task.metadata["gaia_file"]`` and, depending on extension,
    either points the agent at the file path directly, or (for zip
    archives) extracts the archive and captions each contained file with
    the inspector tools.

    Args:
        local_path: Directory on disk containing the attachment.
        task: Flow task whose metadata may reference a GAIA attachment.
        visual_inspector_tool: Image captioner; when None, images inside
            archives fall through to the text inspector branch.
        text_inspector_tool: Captioner for non-image files.

    Returns:
        Prompt suffix (empty string when the task has no attachment).
    """
    gaia_file = task.metadata.get("gaia_file")
    if not gaia_file:
        return ""

    file_name = str(gaia_file)
    full_file_path = str(Path(local_path) / file_name)
    ext = Path(file_name).suffix.lower()

    prompt_use_files = "\n\nTo answer the question above, you will have to use these attached files:"

    if ext in [".pdf", ".xlsx"]:
        # Prefer a pre-rendered PNG of the document when one exists next to it.
        image_path = file_name.split(".")[0] + ".png"
        full_image_path = Path(local_path) / image_path
        if full_image_path.exists():
            prompt_use_files += f"\nAttached image: {full_image_path}"
        else:
            prompt_use_files += f"\nAttached file: {full_file_path}"

    elif ext == ".zip":
        import shutil

        folder_name = full_file_path.replace(".zip", "")
        os.makedirs(folder_name, exist_ok=True)
        shutil.unpack_archive(full_file_path, folder_name)

        # Convert the extracted files
        # (the zip branch rewrites the intro sentence rather than appending to it)
        prompt_use_files = (
            "\n\nYou have been given a zip archive of supporting files. "
            "We extracted it into a directory: find the extracted files at the following paths:\n"
        )

        for root, _, files in os.walk(folder_name):
            for file in files:
                file_path = os.path.join(root, file)
                prompt_use_files += f"- {file_path}\n"
                if Path(file).suffix.lower() in [".png", ".jpg", ".jpeg"] and visual_inspector_tool is not None:
                    prompt = f"""Write a caption of 5 sentences maximum for this image. Pay special attention to any details that might be useful for someone answering the following question:
{task.prompt}. But do not try to answer the question directly!
Do not add any information that is not present in the image.
""".strip()
                    prompt_use_files += (
                        "> Description of this image: "
                        + visual_inspector_tool(image_path=file_path, question=prompt)
                        + "\n\n"
                    )
                else:
                    # Non-image files, and images when no visual tool is available.
                    prompt = f"""Write a short caption (5 sentences maximum) for this file. Pay special attention to any details that might be useful for someone answering the following question:
{task.prompt}. But do not try to answer the question directly!
Do not add any information that is not present in the file.
""".strip()
                    prompt_use_files += (
                        "> Description of this file: "
                        + text_inspector_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)
                        + "\n\n"
                    )
    elif ext in [".png", ".jpg", ".jpeg"]:
        prompt_use_files += f"\nAttached image: {full_file_path}"
    elif ext in [".mp3", ".m4a", ".wav"]:
        prompt_use_files += f"\nAttached audio: {full_file_path}"
    else:
        prompt_use_files += f"\nAttached file: {full_file_path}"

    return prompt_use_files
81
+
82
+
83
+ def extract_task(row: dict[str, Any]) -> dict[str, Any] | None:
84
+ """Extract task fields from a row with flexible field names.
85
+
86
+ GAIA dataset has inconsistent field names across versions, so we try
87
+ multiple variants for each field.
88
+
89
+ Args:
90
+ row: Raw row from parquet/jsonl
91
+
92
+ Returns:
93
+ Normalized task dict, or None if task should be skipped
94
+ """
95
+ # Question field variants
96
+ question = row.get("Question") or row.get("question") or row.get("query") or row.get("prompt")
97
+
98
+ # Answer field variants
99
+ answer = row.get("Final answer") or row.get("answer") or row.get("final_answer")
100
+
101
+ # Task ID field variants
102
+ task_id = str(row.get("task_id") or row.get("question_id") or row.get("id") or row.get("uuid"))
103
+
104
+ # Level field
105
+ level = row.get("Level") or row.get("level")
106
+ if isinstance(level, str) and level.isdigit():
107
+ level = int(level)
108
+
109
+ # File attachment
110
+ file_name = row.get("file_name") or row.get("filename")
111
+
112
+ # Skip tasks without question or valid answer (test set has "?" placeholders)
113
+ if not question or answer is None or str(answer).strip() in ["?", ""]:
114
+ return None
115
+
116
+ return {
117
+ "task_id": task_id,
118
+ "question": question,
119
+ "answer": str(answer),
120
+ "level": level,
121
+ "file_name": file_name,
122
+ }
123
+
124
+
125
def convert_to_flow_task(gaia_task: dict[str, Any]) -> Task:
    """Convert a GAIA task to Flow task format.

    The expected answer is stored twice on purpose:
    1. in the criterion instruction, so LLM-as-judge can evaluate it
    2. in metadata.gaia_answer, so custom evaluators can exact-match score

    Args:
        gaia_task: Normalized GAIA task dict

    Returns:
        Flow-compatible task
    """
    expected = gaia_task["answer"]

    answer_criterion = EvalCriterion(
        name="correct_answer",
        instruction=f"The agent's final answer must match: {expected}",
        weight=1.0,
    )
    metadata = {
        "gaia_answer": expected,
        "gaia_level": gaia_task.get("level"),
        "gaia_file": gaia_task.get("file_name"),
        "source": "gaia-benchmark",
    }

    return Task(
        name=gaia_task["task_id"],
        prompt=gaia_task["question"],
        criteria=[answer_criterion],
        metadata=metadata,
    )
157
+
158
+
159
def convert_gaia(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
    """Convert a raw GAIA example into a Flow task, augmenting the prompt
    with descriptions of any attached files.

    Args:
        example: Raw GAIA row (flexible field names; see extract_task).
        index: Position of the example in the dataset (used for logging
            and error messages).
        dataset_metadata: Must contain "config", "split", and "local_path"
            describing where the GAIA files live on disk.

    Returns:
        Flow-compatible Task. When inspector tools are available and the
        task references an attachment, the prompt is extended and the
        original prompt is preserved in metadata["original_prompt"].

    Raises:
        ValueError: If dataset_metadata (or a required key) is missing, or
            if the example has no usable question/answer.
    """
    logger.debug(f"Processing task at index: {index}")

    if dataset_metadata is None:
        raise ValueError("dataset_metadata is required and cannot be None.")

    # Validate required fields in dataset_metadata
    config = dataset_metadata.get("config")
    split = dataset_metadata.get("split")
    local_path = dataset_metadata.get("local_path")

    if config is None:
        raise ValueError("dataset_metadata 'config' is required and cannot be None.")

    if split is None:
        raise ValueError("dataset_metadata 'split' is required and cannot be None.")

    if local_path is None:
        raise ValueError("dataset_metadata 'local_path' is required and cannot be None.")

    # Derive GAIA year from the config when possible (e.g., "2023_level2" -> "2023"),
    # falling back to "2023" to preserve existing behavior if parsing fails.
    gaia_year = "2023"
    if isinstance(config, str):
        year_candidate = config.split("_", 1)[0]
        if year_candidate.isdigit() and len(year_candidate) == 4:
            gaia_year = year_candidate

    resolved_local_path = str(Path(local_path) / gaia_year / split)

    extracted_task = extract_task(example)
    # Fix: extract_task returns None for rows without a usable question/answer
    # (e.g. test-split "?" placeholders). Previously None fell straight into
    # convert_to_flow_task and crashed with an opaque TypeError.
    if extracted_task is None:
        raise ValueError(f"Example at index {index} has no usable question/answer and cannot be converted.")

    converted_task = convert_to_flow_task(extracted_task)

    try:
        visual_inspector_tool = VisualInspectorTool()
        text_inspector_tool = TextInspectorTool()
    except RuntimeError as exc:
        logger.warning(
            "Inspector tools could not be initialized (likely missing environment "
            "variables). Skipping file-based prompt augmentation. Error: {}",
            exc,
        )
        prompt_for_files = ""
    else:
        prompt_for_files = _get_augmented_prompt_for_files(
            local_path=resolved_local_path,
            task=converted_task,
            visual_inspector_tool=visual_inspector_tool,
            text_inspector_tool=text_inspector_tool,
        )

    if prompt_for_files:
        # Keep the pristine prompt around for evaluators/debugging.
        converted_task.metadata["original_prompt"] = converted_task.prompt
        converted_task.prompt = converted_task.prompt + prompt_for_files

    return converted_task
src/flow/experiments/hf_datasets.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Convert Hugging Face datasets to Flow task format.
2
+
3
+ This module provides utilities to convert HF datasets (like GSM8K, MATH, HumanEval)
4
+ into Flow's task format for use with GEPA optimization.
5
+
6
+ Usage:
7
+ # From CLI
8
+ python -m flow.cli.hf_import gsm8k --output tasks/gsm8k.jsonl --limit 100
9
+
10
+ # Programmatically
11
+ from flow.experiments.hf_datasets import import_hf_dataset
12
+ tasks = import_hf_dataset("openai/gsm8k", split="train", limit=50)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ from flow.experiments.types import EvalCriterion, Task
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ # Dataset-specific converters
29
+ # Each converter knows how to extract question/answer from a specific dataset
30
+
31
+
32
def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
    """Convert GSM8K math problem to Flow task.

    GSM8K format:
        {
            "question": "Natalia sold clips to 48 of her friends...",
            "answer": "Natalia sold 48/2 = 24 clips in May. ... #### 72"
        }
    """
    question = example["question"]
    answer = example["answer"]

    # GSM8K marks the final numeric answer with "####" at the end of the
    # worked solution; None when the marker is absent.
    final_answer = answer.split("####")[-1].strip() if "####" in answer else None

    correctness = EvalCriterion(
        name="correctness",
        instruction=f"The solution correctly answers: {question}. The correct answer is {final_answer}",
        weight=1.0,
    )
    reasoning = EvalCriterion(
        name="reasoning",
        instruction="The solution shows clear mathematical reasoning and step-by-step work",
        weight=0.7,
    )

    task_metadata: dict[str, Any] = {
        "dataset": "gsm8k",
        "index": index,
        "answer": answer,
        "final_answer": final_answer,
    }
    task_metadata.update(dataset_metadata or {})

    return Task(
        name=f"gsm8k_{index}",
        prompt=f"Solve this math problem step by step:\n\n{question}",
        criteria=[correctness, reasoning],
        metadata=task_metadata,
    )
73
+
74
+
75
def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
    """Convert MATH dataset problem to Flow task.

    MATH format:
        {
            "problem": "What is 2+2?",
            "solution": "The answer is 4",
            "level": "Level 1",
            "type": "Algebra"
        }
    """
    problem = example["problem"]
    solution = example.get("solution", "")
    level = example.get("level", "Unknown")
    problem_type = example.get("type", "Unknown")

    correctness = EvalCriterion(
        name="correctness",
        instruction=f"The solution correctly solves: {problem}",
        weight=1.0,
    )
    rigor = EvalCriterion(
        name="mathematical_rigor",
        instruction="The solution uses proper mathematical notation and reasoning",
        weight=0.8,
    )

    task_metadata: dict[str, Any] = {
        "dataset": "math",
        "index": index,
        "level": level,
        "type": problem_type,
        "solution": solution,
    }
    if dataset_metadata:
        task_metadata.update(dataset_metadata)

    return Task(
        name=f"math_{problem_type.lower()}_{index}",
        prompt=f"Solve this {level} {problem_type} problem:\n\n{problem}",
        criteria=[correctness, rigor],
        metadata=task_metadata,
    )
110
+
111
+
112
def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
    r"""Convert a HumanEval coding problem to a Flow task.

    HumanEval format:
        {
            "task_id": "HumanEval/0",
            "prompt": "def has_close_elements(numbers, threshold):\n    ...",
            "canonical_solution": "    ...",
            "test": "def check(...):\n    ...",
            "entry_point": "has_close_elements"
        }

    Args:
        example: Raw HumanEval example; "task_id", "entry_point", and "test"
            are optional and fall back to sensible defaults.
        index: Position of the example in the dataset (fallback task id).
        dataset_metadata: Optional extra metadata merged into the task metadata.
    """
    task_id = example.get("task_id", f"task_{index}")
    prompt = example["prompt"]

    # Correctness dominates; code quality is a secondary, lower-weighted signal.
    criteria = [
        EvalCriterion(
            name="correctness", instruction="The code implementation is correct and passes all test cases", weight=1.0
        ),
        EvalCriterion(
            name="code_quality",
            instruction="The code is clean, well-documented, and follows best practices",
            weight=0.6,
        ),
    ]

    task_metadata: dict[str, Any] = {
        "dataset": "humaneval",
        "task_id": task_id,
        "entry_point": example.get("entry_point", ""),
        "test": example.get("test", ""),
    }
    if dataset_metadata:
        task_metadata.update(dataset_metadata)

    # task_id contains "/" (e.g. "HumanEval/0"); make the name path-safe.
    return Task(
        name=f"humaneval_{task_id.replace('/', '_')}",
        prompt=f"Complete this Python function:\n\n{prompt}\n\nMake sure it passes the test cases.",
        criteria=criteria,
        metadata=task_metadata,
    )
150
+
151
+
152
def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
    """Convert an MBPP coding problem to a Flow task.

    MBPP format:
        {
            "task_id": 1,
            "text": "Write a function to find the minimum cost path...",
            "code": "def min_cost(cost, m, n): ...",
            "test_list": ["assert min_cost(...) == ..."]
        }

    Args:
        example: Raw MBPP example; "task_id", "text", and "test_list" are
            optional and fall back to sensible defaults.
        index: Position of the example in the dataset (fallback task id).
        dataset_metadata: Optional extra metadata merged into the task metadata.
    """
    task_id = example.get("task_id", index)
    text = example.get("text", "")

    # Functional correctness dominates; efficiency is a secondary signal.
    criteria = [
        EvalCriterion(name="correctness", instruction=f"The solution correctly implements: {text}", weight=1.0),
        EvalCriterion(name="efficiency", instruction="The solution uses an efficient algorithm", weight=0.7),
    ]

    task_metadata: dict[str, Any] = {
        "dataset": "mbpp",
        "task_id": task_id,
        "test_list": example.get("test_list", []),
    }
    if dataset_metadata:
        task_metadata.update(dataset_metadata)

    return Task(
        name=f"mbpp_{task_id}",
        prompt=f"{text}\n\nImplement this in Python.",
        criteria=criteria,
        metadata=task_metadata,
    )
182
+
183
+
184
+ # Registry of dataset converters
185
def _get_gaia_converter():
    """Return the GAIA converter via a lazy import.

    Importing inside the function keeps the optional smolagents dependency
    from being required at module import time.
    """
    from flow.experiments.gaia_converter import convert_gaia

    return convert_gaia
189
+
190
+
191
# Registry mapping dataset identifiers to converter callables. Keys are
# matched against the requested dataset name (exact or substring), so both
# the org-qualified and bare forms are registered for each dataset.
DATASET_CONVERTERS = {
    # Math word problems
    "openai/gsm8k": convert_gsm8k,
    "gsm8k": convert_gsm8k,
    "competition_math": convert_math,
    "hendrycks/math": convert_math,
    # Code generation
    "humaneval": convert_humaneval,
    "openai_humaneval": convert_humaneval,
    "mbpp": convert_mbpp,
    "google-research-datasets/mbpp": convert_mbpp,
    # GAIA is registered as a lazy loader to avoid importing smolagents eagerly
    "gaia-benchmark/GAIA": _get_gaia_converter,  # Lazy loaded
}
202
+
203
+
204
def _resolve_converter(dataset_name: str) -> Any:
    """Find a registered converter for dataset_name, or None.

    Prefers an exact registry match, then falls back to substring matching
    (so "gsm8k" also matches "openai/gsm8k"). The exact-match preference
    avoids depending on dict insertion order when several keys are
    substrings of the requested name. Lazy-loader entries are called to
    obtain the real converter.
    """
    converter = DATASET_CONVERTERS.get(dataset_name)
    if converter is None:
        for key, conv in DATASET_CONVERTERS.items():
            if key in dataset_name:
                converter = conv
                break
    # Handle lazy loaders (functions that return the actual converter)
    if converter is _get_gaia_converter:
        return converter()
    return converter


def import_hf_dataset(
    dataset_name: str,
    config: str | None = None,
    split: str = "train",
    limit: int | None = None,
    converter_override: Any = None,
    local_path: str | Path | None = None,
) -> list[Task]:
    """Import a Hugging Face dataset and convert to Flow tasks.

    Args:
        dataset_name: HF dataset name (e.g., "openai/gsm8k")
        config: Dataset configuration/subset (e.g., "main")
        split: Dataset split to use (default: "train")
        limit: Maximum number of examples to convert (default: all)
        converter_override: Custom converter function (optional)
        local_path: Path to download the dataset snapshot to using huggingface_hub.snapshot_download().
            When provided, downloads the dataset to this path first, then loads from local files.
            If the snapshot already exists at this path, it will be reused.
            For private datasets, set the HF_TOKEN environment variable with your Hugging Face token.

    Returns:
        List of Flow Task objects

    Raises:
        ImportError: If the datasets (or, with local_path, huggingface_hub)
            library is not installed.
        ValueError: If no converter is registered for the dataset and no
            converter_override is given.

    Environment Variables:
        HF_TOKEN: Hugging Face API token for accessing private/gated datasets.
            Required when using local_path with private datasets.

    Example:
        >>> # Load from Hugging Face Hub (default behavior)
        >>> tasks = import_hf_dataset("openai/gsm8k", config="main", split="train", limit=50)
        >>> print(f"Loaded {len(tasks)} tasks")

        >>> # Download to local path first, then load
        >>> tasks = import_hf_dataset("openai/gsm8k", config="main", split="train", local_path="/data/gsm8k")

        >>> # For private datasets, set HF_TOKEN env variable first
        >>> # export HF_TOKEN="hf_..."
        >>> tasks = import_hf_dataset("org/private-dataset", split="train", local_path="/data/private")
    """
    try:
        from datasets import load_dataset
    except ImportError as e:
        raise ImportError("Hugging Face datasets library is required. Install with: pip install datasets") from e

    # Download to local path if specified, then load from there
    if local_path is not None:
        try:
            from huggingface_hub import snapshot_download
        except ImportError as e:
            raise ImportError(
                "huggingface_hub library is required for local_path support. Install with: pip install huggingface_hub"
            ) from e

        local_path = Path(local_path)
        hf_token = os.environ.get("HF_TOKEN")
        logger.info(f"Downloading dataset {dataset_name} to local path: {local_path}")
        # snapshot_download is idempotent: an existing snapshot is reused.
        snapshot_path = snapshot_download(
            repo_id=dataset_name,
            repo_type="dataset",
            local_dir=str(local_path),
            token=hf_token,
        )
        logger.info(f"Loading dataset from local snapshot: {snapshot_path} (split: {split})")
        dataset = load_dataset(snapshot_path, config, split=split)
    else:
        logger.info(f"Loading dataset: {dataset_name} (config: {config}, split: {split})")
        dataset = load_dataset(dataset_name, config, split=split)

    # Apply limit (None/0 means "no limit")
    if limit:
        dataset = dataset.select(range(min(limit, len(dataset))))

    logger.info(f"Converting {len(dataset)} examples to Flow tasks...")

    # Find converter: an explicit override wins, otherwise consult the registry
    converter = converter_override if converter_override is not None else _resolve_converter(dataset_name)
    if converter is None:
        raise ValueError(
            f"No converter found for dataset '{dataset_name}'. "
            f"Available: {list(DATASET_CONVERTERS.keys())}\n"
            f"Use converter_override parameter to provide a custom converter."
        )

    # Build dataset metadata to pass to converters
    dataset_metadata: dict[str, Any] = {
        "local_path": str(local_path) if local_path else None,
        "config": config,
        "split": split,
    }

    # Convert examples; log and skip individual failures rather than aborting
    tasks = []
    for i, example in enumerate(dataset):
        try:
            tasks.append(converter(example, i, dataset_metadata))
        except Exception as e:
            logger.warning(f"Failed to convert example {i}: {e}", exc_info=True)

    logger.info(f"Successfully converted {len(tasks)} tasks")
    return tasks
316
+
317
+
318
def save_tasks_to_jsonl(tasks: list[Task], output_path: Path) -> None:
    """Save tasks to a JSONL file (one JSON object per line).

    Parent directories are created if missing, and any existing file at
    output_path is overwritten.

    Args:
        tasks: List of Task objects
        output_path: Path to output JSONL file
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Fix: write with an explicit encoding instead of the platform default
    # (which is not UTF-8 on Windows) so non-ASCII content round-trips.
    with open(output_path, "w", encoding="utf-8") as f:
        for task in tasks:
            # Serialize only the fields converters populate; criteria are
            # flattened to plain dicts so the output is framework-agnostic.
            task_dict = {
                "name": task.name,
                "prompt": task.prompt,
                "criteria": [{"name": c.name, "instruction": c.instruction, "weight": c.weight} for c in task.criteria],
                "metadata": task.metadata,
            }
            f.write(json.dumps(task_dict) + "\n")

    logger.info(f"Saved {len(tasks)} tasks to {output_path}")
339
+
340
+
341
def register_converter(dataset_name: str, converter_func: Any) -> None:
    """Register a custom converter under a dataset identifier.

    Subsequent import_hf_dataset() calls for a matching dataset name will
    use the registered function instead of a built-in converter.

    Args:
        dataset_name: Dataset identifier
        converter_func: Function that converts example dict to Task

    Example:
        >>> def my_converter(example, index):
        ...     return Task(name=f"task_{index}", prompt=example["text"], ...)
        >>> register_converter("my/dataset", my_converter)
    """
    DATASET_CONVERTERS[dataset_name] = converter_func
    logger.info(f"Registered converter for '{dataset_name}'")
src/flow/experiments/models.py CHANGED
@@ -3,6 +3,8 @@
3
  """Core data models for the optimization framework.
4
 
5
  Defines:
 
 
6
  - CompactionConfig: Extensible compaction strategy configuration
7
  - Agent: Framework-agnostic agent definition (what the customer brings)
8
  - Candidate: A mutated agent variant produced by optimization
@@ -17,14 +19,17 @@ from __future__ import annotations
17
  from dataclasses import asdict, dataclass, field
18
  from itertools import product as itertools_product
19
  from pathlib import Path
20
- from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
21
 
22
  import yaml
23
 
24
  if TYPE_CHECKING:
25
- from collections.abc import Awaitable, Callable
 
 
26
 
27
  from .evaluators.base import Evaluator
 
28
  from .types import Task
29
 
30
 
@@ -34,6 +39,89 @@ if TYPE_CHECKING:
34
 
35
  # Tool presets define common tool configurations.
36
  # Each preset maps tool names to their configuration dicts.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
38
  "full": {
39
  "read_file": {},
@@ -43,9 +131,8 @@ TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
43
  "glob_files": {},
44
  "ls": {},
45
  "grep": {},
46
- "bash": {"timeout": 120},
47
  "check_processes": {},
48
- "python_repl": {},
49
  "think": {},
50
  "todo_write": {},
51
  "todo_read": {},
@@ -65,20 +152,21 @@ TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
65
  "glob_files": {},
66
  "ls": {},
67
  "grep": {},
68
- "bash": {"timeout": 120},
69
  "check_processes": {},
70
- "python_repl": {},
71
  "think": {},
72
  "todo_write": {},
73
  "todo_read": {},
74
  "memory": {},
75
  "skills": {},
 
 
76
  },
77
  "minimal": {
78
  "read_file": {},
79
  "write_file": {},
80
  "edit_file": {},
81
- "bash": {"timeout": 120},
82
  "think": {},
83
  },
84
  "readonly": {
@@ -91,16 +179,17 @@ TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
91
  }
92
 
93
 
94
- def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]]) -> dict[str, dict[str, Any]]:
95
  """Normalize tool specification to dict form.
96
 
97
- Accepts three input formats:
 
98
  - str: Preset name (e.g., "standard", "minimal", "full", "readonly")
99
  - list[str]: List of tool names with default configs
100
  - dict[str, dict]: Full specification with per-tool configs
101
 
102
  Args:
103
- tools: Tool specification in any supported format
104
 
105
  Returns:
106
  Dict mapping tool names to their configuration dicts
@@ -109,6 +198,9 @@ def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]]) -> dict[st
109
  ValueError: If preset name is unknown
110
 
111
  Example:
 
 
 
112
  >>> resolve_tools("standard")
113
  {"read_file": {}, "write_file": {}, ...}
114
 
@@ -118,6 +210,8 @@ def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]]) -> dict[st
118
  >>> resolve_tools({"bash": {"timeout": 60}})
119
  {"bash": {"timeout": 60}}
120
  """
 
 
121
  if isinstance(tools, str):
122
  if tools not in TOOL_PRESETS:
123
  raise ValueError(f"Unknown tool preset: {tools}. Available: {list(TOOL_PRESETS.keys())}")
@@ -148,8 +242,8 @@ class CompactionConfig:
148
  token_budget: Maximum tokens for context window (used by token-based strategies)
149
  """
150
 
151
- strategy: str = "head_tail"
152
- params: dict[str, Any] = field(default_factory=lambda: {"head_size": 10, "tail_size": 40})
153
  token_budget: int = 100_000
154
 
155
  # =========================================================================
@@ -278,6 +372,10 @@ class CompactionConfig:
278
  return self.params.get("head_ratio", 0.2)
279
 
280
 
 
 
 
 
281
  @dataclass
282
  class Agent:
283
  """Framework-agnostic agent definition.
@@ -289,10 +387,10 @@ class Agent:
289
 
290
  Attributes:
291
  name: Unique identifier for this agent
292
- framework: Which harness to use ("maf", "langgraph", "claude")
293
  description: Human-readable description
294
  instructions: System prompt / instructions (optional, uses framework default if None)
295
- instructions_preset: Preset name for instructions ("coding", "benchmark", etc.)
296
  llm_config: LLM configuration with provider and model info:
297
  {"provider": "azure|openai|anthropic", "model": "gpt-4o"}
298
  If None, auto-detects from environment variables.
@@ -304,13 +402,270 @@ class Agent:
304
  """
305
 
306
  name: str
307
- framework: str = "maf"
308
  description: str = ""
309
  instructions: str | None = None
310
- instructions_preset: str | None = None # e.g., "coding", "benchmark", "research"
311
  llm_config: dict[str, Any] | None = None # {"provider": "azure", "model": "gpt-4o"}
312
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
313
- tools: str | list[str] | dict[str, dict[str, Any]] = "standard"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
 
316
  @dataclass
@@ -325,11 +680,15 @@ class Candidate:
325
  agent: The mutated agent configuration
326
  mutations: Dict describing what was changed from the base
327
  rationale: Human-readable explanation of why this candidate exists
 
 
 
328
  """
329
 
330
  agent: Agent
331
  mutations: dict[str, Any] = field(default_factory=dict)
332
  rationale: str = ""
 
333
 
334
 
335
  @dataclass
@@ -345,47 +704,93 @@ class ExperimentResult:
345
  traces: dict[str, Any] = field(default_factory=dict)
346
 
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  @runtime_checkable
349
  class CandidateStrategy(Protocol):
350
  """Protocol for generating candidate variants from a base agent.
351
 
352
  Implementations can be:
353
- - Simple (single-shot): GridSearchStrategy ignores optional params
354
- - Complex (iterative): Runs internal experiments, checks convergence,
355
- distills failures, etc. using the provided callbacks
356
 
357
- All logic is internal to the strategy - the caller just calls generate()
358
  and receives the final list of candidates.
359
 
360
  Examples:
361
  - GridSearchStrategy: Exhaustive grid over parameter combinations
362
- - (Future) AdaptivePromptOptimizer: Iteratively improves prompts from failures
363
  - (Future) BayesianStrategy: Bayesian optimization over parameters
364
  """
365
 
366
- def generate(
367
  self,
368
  base: Agent,
369
  budget: int,
370
  *,
371
  tasks: list[Task] | None = None,
372
- evaluator: Evaluator | None = None,
373
- run_experiment: Callable[[Candidate, Task], Awaitable[ExperimentResult]] | None = None,
374
  ) -> list[Candidate]:
375
  """Generate candidate variants from a base agent.
376
 
377
  Args:
378
  base: The base agent to optimize
379
  budget: Maximum number of candidates to return
380
- tasks: Optional tasks for strategies that run internal experiments
381
- evaluator: Optional evaluator for strategies that need scoring
382
- run_experiment: Optional async callback to execute a candidate on a task.
383
- Signature: async (candidate, task) -> ExperimentResult
 
384
 
385
  Returns:
386
  List of Candidate objects (at most `budget` items).
387
  For iterative strategies, returns the final/best candidates after
388
- internal optimization loops complete.
 
389
  """
390
  ...
391
 
@@ -419,23 +824,22 @@ class GridSearchStrategy:
419
  """
420
  self.variations = variations
421
 
422
- def generate(
423
  self,
424
  base: Agent,
425
  budget: int,
426
  *,
427
  tasks: list[Task] | None = None,
428
- evaluator: Evaluator | None = None,
429
- run_experiment: Callable[[Candidate, Task], Awaitable[ExperimentResult]] | None = None,
430
  ) -> list[Candidate]:
431
  """Generate all grid combinations up to budget.
432
 
433
- Note: tasks, evaluator, and run_experiment are accepted for protocol
434
- compatibility but ignored - GridSearchStrategy is a simple single-shot
435
- strategy that doesn't run experiments internally.
436
  """
437
  # Delete unused params to satisfy linters
438
- del tasks, evaluator, run_experiment
439
 
440
  if not self.variations:
441
  return [Candidate(agent=base, mutations={}, rationale="baseline")]
@@ -690,6 +1094,38 @@ def _extract_metrics(
690
  # =============================================================================
691
 
692
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
  @dataclass
694
  class Experiment:
695
  """Experiment configuration for optimization.
@@ -699,53 +1135,173 @@ class Experiment:
699
  - Experiment YAML: How to test it (variations, tasks, evaluation settings)
700
 
701
  Attributes:
702
- base_agent: Path to base agent YAML file
703
  suite: Built-in task suite name (e.g., "coding", "quick")
704
  tasks: Path to custom tasks JSONL file (alternative to suite)
705
- variations: Dict of parameter variations for grid search
706
  parallel: Max concurrent experiments
707
- budget: Maximum candidates to generate
708
  use_llm_eval: Whether to use LLM-as-Judge evaluation
709
 
710
  Example YAML:
711
  ```yaml
712
- base_agent: examples/miniagent_base.yaml
713
  suite: coding
714
 
715
  variations:
716
- compaction:
717
- - strategy: none
718
- - strategy: head_tail
719
- params: { head_size: 10, tail_size: 40 }
720
- - strategy: sliding_window
721
- token_budget: 50000
722
- - strategy: summarization
723
- token_budget: 50000
 
724
 
725
  tools:
726
  - minimal
727
  - standard
728
- - [read_file, write_file, bash, memory]
 
 
 
 
 
 
729
 
730
- parallel: 4
731
- budget: 20
732
  use_llm_eval: true
733
  ```
734
  """
735
 
736
- base_agent: str | None = None
737
  suite: str | None = None
738
  tasks: str | None = None
739
- variations: dict[str, list[Any]] = field(default_factory=dict)
740
  parallel: int = 4
741
  budget: int = 100
742
  use_llm_eval: bool = True
743
 
744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745
  def load_experiment(path: Path) -> Experiment:
746
  """Load an Experiment from a YAML file.
747
 
748
- Handles conversion of compaction variations from dict to CompactionConfig.
749
 
750
  Args:
751
  path: Path to the experiment YAML file
@@ -762,38 +1318,34 @@ def load_experiment(path: Path) -> Experiment:
762
 
763
  data = yaml.safe_load(path.read_text())
764
 
765
- # Parse variations - convert compaction dicts to CompactionConfig
766
- variations: dict[str, list[Any]] = {}
 
 
 
 
767
  raw_variations = data.get("variations", {})
768
 
769
- for key, values in raw_variations.items():
770
- if key == "compaction":
771
- # Convert each compaction dict to CompactionConfig
772
- parsed_compactions = []
773
- for v in values:
774
- if isinstance(v, dict):
775
- parsed_compactions.append(CompactionConfig(**v))
776
- elif isinstance(v, str):
777
- # Handle shorthand: "none", "head_tail", etc.
778
- if v == "none":
779
- parsed_compactions.append(CompactionConfig.none())
780
- elif v == "head_tail":
781
- parsed_compactions.append(CompactionConfig.head_tail())
782
- elif v == "sliding_window":
783
- parsed_compactions.append(CompactionConfig.sliding_window())
784
- elif v == "summarization":
785
- parsed_compactions.append(CompactionConfig.summarization())
786
- else:
787
- raise ValueError(f"Unknown compaction shorthand: {v}")
788
- else:
789
- parsed_compactions.append(v)
790
- variations["compaction"] = parsed_compactions
791
- else:
792
- # Other variations pass through as-is
793
- variations[key] = values
794
 
795
  return Experiment(
796
- base_agent=data.get("base_agent"),
797
  suite=data.get("suite"),
798
  tasks=data.get("tasks"),
799
  variations=variations,
 
3
  """Core data models for the optimization framework.
4
 
5
  Defines:
6
+ - COMPACTION_STRATEGIES: Registry of compaction strategies for schema API
7
+ - DEFAULT_TOKEN_BUDGET: Default token budget (200k) for modern models
8
  - CompactionConfig: Extensible compaction strategy configuration
9
  - Agent: Framework-agnostic agent definition (what the customer brings)
10
  - Candidate: A mutated agent variant produced by optimization
 
19
  from dataclasses import asdict, dataclass, field
20
  from itertools import product as itertools_product
21
  from pathlib import Path
22
+ from typing import TYPE_CHECKING, Any, Literal, Protocol, runtime_checkable
23
 
24
  import yaml
25
 
26
  if TYPE_CHECKING:
27
+ from collections.abc import AsyncIterator, Awaitable, Callable
28
+
29
+ from flow.harness.base import Event
30
 
31
  from .evaluators.base import Evaluator
32
+ from .results import AgentOptimizationResult, EvaluationResult
33
  from .types import Task
34
 
35
 
 
39
 
40
  # Tool presets define common tool configurations.
41
  # Each preset maps tool names to their configuration dicts.
42
+ # =============================================================================
43
+ # Compaction Strategy Configuration
44
+ # =============================================================================
45
+
46
+ # Default token budget for modern models (GPT-4o, Claude 3.5, etc.)
47
+ DEFAULT_TOKEN_BUDGET = 200_000
48
+
49
+ # Compaction strategies registry for schema API
50
+ # All strategies use token-based triggers (not message count) for safety
51
+ COMPACTION_STRATEGIES: dict[str, dict[str, Any]] = {
52
+ "head_tail": {
53
+ "label": "Head + Tail",
54
+ "description": "Keep head (system prompt, initial context) + tail (recent messages). Drops middle when over budget.",
55
+ "params": {
56
+ "head_ratio": {
57
+ "type": "number",
58
+ "default": 0.2,
59
+ "min": 0,
60
+ "max": 1,
61
+ "description": "Fraction of budget for head messages (0.2 = 20%)",
62
+ },
63
+ "token_budget": {
64
+ "type": "number",
65
+ "default": DEFAULT_TOKEN_BUDGET,
66
+ "min": 1000,
67
+ "description": "Max tokens before compaction triggers",
68
+ },
69
+ },
70
+ },
71
+ "sliding_window": {
72
+ "label": "Sliding Window",
73
+ "description": "Keep system message + most recent messages that fit within budget. Simple and effective.",
74
+ "params": {
75
+ "token_budget": {
76
+ "type": "number",
77
+ "default": DEFAULT_TOKEN_BUDGET,
78
+ "min": 1000,
79
+ "description": "Max tokens for context window",
80
+ },
81
+ },
82
+ },
83
+ "summarization": {
84
+ "label": "Summarization",
85
+ "description": "Summarize middle messages using LLM instead of dropping them. Preserves context but adds latency.",
86
+ "params": {
87
+ "head_messages": {
88
+ "type": "number",
89
+ "default": 2,
90
+ "min": 1,
91
+ "description": "Messages to preserve at head",
92
+ },
93
+ "tail_messages": {
94
+ "type": "number",
95
+ "default": 4,
96
+ "min": 1,
97
+ "description": "Messages to preserve at tail",
98
+ },
99
+ "summary_max_tokens": {
100
+ "type": "number",
101
+ "default": 1000,
102
+ "min": 100,
103
+ "description": "Max tokens for the summary",
104
+ },
105
+ "token_budget": {
106
+ "type": "number",
107
+ "default": DEFAULT_TOKEN_BUDGET,
108
+ "min": 1000,
109
+ "description": "Max tokens before compaction triggers",
110
+ },
111
+ },
112
+ },
113
+ "none": {
114
+ "label": "No Compaction",
115
+ "description": "Context grows unbounded. Only use for benchmarking or very short tasks.",
116
+ "params": {},
117
+ },
118
+ }
119
+
120
+
121
+ # =============================================================================
122
+ # Tool Configuration
123
+ # =============================================================================
124
+
125
  TOOL_PRESETS: dict[str, dict[str, dict[str, Any]]] = {
126
  "full": {
127
  "read_file": {},
 
131
  "glob_files": {},
132
  "ls": {},
133
  "grep": {},
134
+ "bash": {"timeout": 300},
135
  "check_processes": {},
 
136
  "think": {},
137
  "todo_write": {},
138
  "todo_read": {},
 
152
  "glob_files": {},
153
  "ls": {},
154
  "grep": {},
155
+ "bash": {"timeout": 300},
156
  "check_processes": {},
 
157
  "think": {},
158
  "todo_write": {},
159
  "todo_read": {},
160
  "memory": {},
161
  "skills": {},
162
+ "web_search": {},
163
+ "web_fetch": {},
164
  },
165
  "minimal": {
166
  "read_file": {},
167
  "write_file": {},
168
  "edit_file": {},
169
+ "bash": {"timeout": 300},
170
  "think": {},
171
  },
172
  "readonly": {
 
179
  }
180
 
181
 
182
+ def resolve_tools(tools: str | list[str] | dict[str, dict[str, Any]] | None) -> dict[str, dict[str, Any]]:
183
  """Normalize tool specification to dict form.
184
 
185
+ Accepts four input formats:
186
+ - None: No tools (empty set)
187
  - str: Preset name (e.g., "standard", "minimal", "full", "readonly")
188
  - list[str]: List of tool names with default configs
189
  - dict[str, dict]: Full specification with per-tool configs
190
 
191
  Args:
192
+ tools: Tool specification in any supported format, or None for no tools
193
 
194
  Returns:
195
  Dict mapping tool names to their configuration dicts
 
198
  ValueError: If preset name is unknown
199
 
200
  Example:
201
+ >>> resolve_tools(None)
202
+ {}
203
+
204
  >>> resolve_tools("standard")
205
  {"read_file": {}, "write_file": {}, ...}
206
 
 
210
  >>> resolve_tools({"bash": {"timeout": 60}})
211
  {"bash": {"timeout": 60}}
212
  """
213
+ if tools is None:
214
+ return {}
215
  if isinstance(tools, str):
216
  if tools not in TOOL_PRESETS:
217
  raise ValueError(f"Unknown tool preset: {tools}. Available: {list(TOOL_PRESETS.keys())}")
 
242
  token_budget: Maximum tokens for context window (used by token-based strategies)
243
  """
244
 
245
+ strategy: str = "none"
246
+ params: dict[str, Any] = field(default_factory=dict)
247
  token_budget: int = 100_000
248
 
249
  # =========================================================================
 
372
  return self.params.get("head_ratio", 0.2)
373
 
374
 
375
+ # Supported agent frameworks (harnesses)
376
+ Framework = Literal["maf", "miniagent", "langgraph"]
377
+
378
+
379
  @dataclass
380
  class Agent:
381
  """Framework-agnostic agent definition.
 
387
 
388
  Attributes:
389
  name: Unique identifier for this agent
390
+ framework: Which harness to use ("maf", "miniagent", "langgraph")
391
  description: Human-readable description
392
  instructions: System prompt / instructions (optional, uses framework default if None)
393
+ instructions_preset: Preset name for instructions (default: "general")
394
  llm_config: LLM configuration with provider and model info:
395
  {"provider": "azure|openai|anthropic", "model": "gpt-4o"}
396
  If None, auto-detects from environment variables.
 
402
  """
403
 
404
  name: str
405
+ framework: Framework = "miniagent"
406
  description: str = ""
407
  instructions: str | None = None
408
+ instructions_preset: str | None = None # e.g., "general"
409
  llm_config: dict[str, Any] | None = None # {"provider": "azure", "model": "gpt-4o"}
410
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
411
+ tools: str | list[str] | dict[str, dict[str, Any]] | None = None
412
+
413
+ # Set by deploy() — when set, evaluate/optimize auto-persist to DB
414
+ _id: str | None = field(default=None, repr=False, compare=False)
415
+
416
+ @property
417
+ def id(self) -> str | None:
418
+ """Agent ID in the database, set after deploy()."""
419
+ return self._id
420
+
421
+ @classmethod
422
+ def from_preset(cls, name: str) -> Agent:
423
+ """Create an Agent from a named preset.
424
+
425
+ Args:
426
+ name: Preset name (e.g., "coding", "research", "document-analysis")
427
+
428
+ Returns:
429
+ A new Agent instance with the preset's configuration
430
+
431
+ Example:
432
+ agent = Agent.from_preset("coding")
433
+ result = await agent.evaluate(tasks="quick")
434
+ """
435
+ from .presets import get_preset
436
+
437
+ preset = get_preset(name)
438
+ return cls(
439
+ name=preset.agent.name,
440
+ framework=preset.agent.framework,
441
+ description=preset.agent.description,
442
+ instructions=preset.agent.instructions,
443
+ instructions_preset=preset.agent.instructions_preset,
444
+ llm_config=preset.agent.llm_config,
445
+ compaction=preset.agent.compaction,
446
+ tools=preset.agent.tools,
447
+ )
448
+
449
+ async def run(self, task: str, workspace: Path | None = None) -> str:
450
+ """Run the agent on a task and return the final output.
451
+
452
+ This is the simplest way to use an agent — give it a task, get a result.
453
+
454
+ Args:
455
+ task: The task/prompt to execute
456
+ workspace: Optional workspace directory (creates temp dir if None)
457
+
458
+ Returns:
459
+ The agent's final text output
460
+
461
+ Example:
462
+ agent = Agent(name="coding-agent", tools="standard")
463
+ output = await agent.run("Create hello.py that prints Hello World")
464
+ print(output)
465
+ """
466
+ output_parts: list[str] = []
467
+ async for event in self.run_stream(task, workspace=workspace):
468
+ if event.type.value == "text_delta":
469
+ output_parts.append(event.content)
470
+ return "".join(output_parts)
471
+
472
+ async def run_stream(
473
+ self, task: str, *, workspace: Path | None = None
474
+ ) -> AsyncIterator[Event]:
475
+ """Run the agent on a task with streaming events.
476
+
477
+ Yields real-time events as the agent works — text chunks, tool calls,
478
+ tool results, and completion. Use this for live output in notebooks or CLIs.
479
+
480
+ Args:
481
+ task: The task/prompt to execute
482
+ workspace: Optional workspace directory (creates temp dir if None)
483
+
484
+ Yields:
485
+ Event objects (text_delta, tool_call_start, tool_result, done, etc.)
486
+
487
+ Example:
488
+ agent = Agent(name="coding-agent", tools="standard")
489
+ async for event in agent.run_stream("Create hello.py"):
490
+ if event.type.value == "text_delta":
491
+ print(event.content, end="", flush=True)
492
+ """
493
+ import tempfile
494
+
495
+ # Lazy imports to avoid circular deps and keep Agent lightweight
496
+ from flow.harness import create_harness
497
+ from flow.harness.registry import ensure_harnesses_registered
498
+
499
+ ensure_harnesses_registered()
500
+
501
+ if workspace is None:
502
+ workspace = Path(tempfile.mkdtemp(prefix="flow_run_"))
503
+
504
+ harness = create_harness(self, workspace)
505
+ try:
506
+ async for event in harness.run_stream(task):
507
+ yield event
508
+ finally:
509
+ await harness.close()
510
+
511
+ async def deploy(self) -> str:
512
+ """Register this agent in the Flow database.
513
+
514
+ Creates an AgentConfig row in the local SQLite DB (~/.flow/flow_ui.db).
515
+ No running server required — this is a pure DB write. After deploying,
516
+ all evaluate() and optimize() calls auto-persist results to the DB.
517
+
518
+ Run ``flow serve`` separately to browse results in the UI.
519
+
520
+ Returns:
521
+ The agent ID (UUID string)
522
+
523
+ Example:
524
+ agent = Agent(name="coding-agent", tools="standard")
525
+ agent_id = await agent.deploy()
526
+ # Results now auto-persist
527
+ result = await agent.evaluate(tasks="quick")
528
+ # Run `flow serve` to view at http://localhost:7860/agents/{agent_id}
529
+ """
530
+ try:
531
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
532
+ except ImportError as e:
533
+ raise ImportError(
534
+ "DB dependencies not available. Install flow with UI support "
535
+ "to use deploy(): pip install flow[ui] or uv sync"
536
+ ) from e
537
+
538
+ adapter = PersistenceAdapter()
539
+ self._id = await adapter.deploy_agent(self)
540
+ return self._id
541
+
542
+ async def evaluate(
543
+ self,
544
+ tasks: str | list[Task] | Path = "quick",
545
+ *,
546
+ parallel: int = 4,
547
+ use_llm_eval: bool = True,
548
+ quiet: bool = False,
549
+ ) -> EvaluationResult:
550
+ """Evaluate this agent on a set of tasks.
551
+
552
+ If the agent has been deployed (via deploy()), results are
553
+ automatically persisted to the database.
554
+
555
+ Args:
556
+ tasks: Task specification - suite name (str like "quick", "coding"),
557
+ list of Task objects, or Path to JSONL file
558
+ parallel: Number of concurrent task executions
559
+ use_llm_eval: Whether to use LLM-as-Judge for scoring
560
+ quiet: Suppress verbose output
561
+
562
+ Returns:
563
+ EvaluationResult with score, tokens, pass_rate, etc.
564
+
565
+ Example:
566
+ agent = Agent(name="my-agent", tools="standard")
567
+ result = await agent.evaluate(tasks="quick")
568
+ print(f"Score: {result.score:.2f}, Pass rate: {result.pass_rate:.0%}")
569
+ """
570
+ from .agent_api import _evaluate_agent_impl
571
+
572
+ return await _evaluate_agent_impl(
573
+ self, tasks, parallel, use_llm_eval, quiet, agent_id=self._id
574
+ )
575
+
576
+ async def optimize(
577
+ self,
578
+ tasks: str | list[Task] | Path = "quick",
579
+ *,
580
+ strategy: str | list[str] | None = None,
581
+ variations: dict[str, list[Any]] | None = None,
582
+ parallel: int = 4,
583
+ budget: int = 50,
584
+ use_llm_eval: bool = True,
585
+ quiet: bool = False,
586
+ ) -> AgentOptimizationResult:
587
+ """Optimize this agent's configuration.
588
+
589
+ Supports two modes:
590
+ - **Grid search** (default): Exhaustive search over parameter combinations
591
+ - **Active strategies**: Iterative evaluate-reflect-adjust optimization
592
+
593
+ If the agent has been deployed (via deploy()), results are
594
+ automatically persisted to the database.
595
+
596
+ Args:
597
+ tasks: Task specification - suite name (str), list of Tasks, or Path
598
+ strategy: Optimization strategy to use:
599
+ - None or "grid": Grid search over variations (default)
600
+ - "tools": Iteratively discover optimal tool configuration
601
+ - "instructions": Iteratively rewrite instructions from failures
602
+ - list: Run multiple strategies sequentially, e.g.
603
+ ["instructions", "tools"] optimizes instructions first,
604
+ then tools starting from the improved agent
605
+ variations: Custom grid search variations (only used with grid strategy)
606
+ parallel: Number of concurrent experiments
607
+ budget: Maximum number of candidates to test
608
+ use_llm_eval: Whether to use LLM-as-Judge for scoring
609
+ quiet: Suppress verbose output
610
+
611
+ Returns:
612
+ AgentOptimizationResult with baseline, best, and improvement metrics
613
+
614
+ Example:
615
+ agent = Agent(name="my-agent", tools="standard")
616
+
617
+ # Grid search (default)
618
+ result = await agent.optimize(tasks="quick")
619
+
620
+ # Active: discover optimal tools
621
+ result = await agent.optimize(tasks="quick", strategy="tools")
622
+
623
+ # Active: improve instructions
624
+ result = await agent.optimize(tasks="quick", strategy="instructions")
625
+
626
+ # Pipeline: instructions first, then tools
627
+ result = await agent.optimize(
628
+ tasks="quick", strategy=["instructions", "tools"]
629
+ )
630
+
631
+ print(f"Best score: {result.best.score:.2f}")
632
+ optimized = result.best_agent
633
+ """
634
+ from .agent_api import _optimize_agent_impl
635
+
636
+ return await _optimize_agent_impl(
637
+ self, tasks, variations, parallel, budget, use_llm_eval, quiet,
638
+ agent_id=self._id,
639
+ strategy=strategy,
640
+ )
641
+
642
+
643
+ @dataclass
644
+ class StrategyIteration:
645
+ """One iteration of an active strategy's optimization loop.
646
+
647
+ Tracks what was tried, how it scored, and why the change was made.
648
+ Active strategies accumulate these to provide a full audit trail.
649
+
650
+ Attributes:
651
+ iteration: Iteration number (0 = baseline)
652
+ instructions_preview: First 200 chars of instructions used
653
+ full_instructions: Complete instructions text for this iteration
654
+ avg_score: Average score across tasks for this iteration
655
+ pass_rate: Fraction of tasks that passed
656
+ failures_count: Number of tasks that failed
657
+ change_description: What was changed (e.g., "Added bash timeout instructions")
658
+ change_rationale: Why the change was made (e.g., "3/5 tasks failed due to hanging bash commands")
659
+ """
660
+
661
+ iteration: int
662
+ instructions_preview: str
663
+ avg_score: float
664
+ pass_rate: float
665
+ failures_count: int
666
+ change_description: str = ""
667
+ change_rationale: str = ""
668
+ full_instructions: str = ""
669
 
670
 
671
  @dataclass
 
680
  agent: The mutated agent configuration
681
  mutations: Dict describing what was changed from the base
682
  rationale: Human-readable explanation of why this candidate exists
683
+ optimization_history: Audit trail from active optimization strategies.
684
+ Each entry records one iteration of the optimization loop with
685
+ scores, failure counts, and descriptions of what changed and why.
686
  """
687
 
688
  agent: Agent
689
  mutations: dict[str, Any] = field(default_factory=dict)
690
  rationale: str = ""
691
+ optimization_history: list[StrategyIteration] = field(default_factory=list)
692
 
693
 
694
  @dataclass
 
704
  traces: dict[str, Any] = field(default_factory=dict)
705
 
706
 
707
+ @runtime_checkable
708
+ class ExperimentRunner(Protocol):
709
+ """Protocol for evaluating candidates against tasks.
710
+
711
+ This is the interface that active strategies use to test candidate
712
+ configurations. The FlowOptimizer implements this protocol, providing
713
+ strategies with access to the full execution pipeline (harness creation,
714
+ agent execution, trace collection, LLM evaluation, metrics extraction)
715
+ without exposing internal details.
716
+
717
+ Passive strategies (GridSearchStrategy, etc.) ignore this entirely.
718
+ Active strategies call evaluate() in a loop to iteratively refine
719
+ candidates based on real execution results.
720
+
721
+ The evaluate() method returns a CandidateSummary (from optimizer.py)
722
+ which contains:
723
+ - avg_score, pass_rate: Aggregate performance metrics
724
+ - task_results: list[TaskResult] — per-task details including:
725
+ - eval_reasoning: Why the evaluator scored it this way
726
+ - eval_score, eval_passed: Score and pass/fail status
727
+ - criteria_results: Per-criterion breakdown
728
+ - run_result.output: What the agent produced
729
+ - run_result.trace: Full OTel execution trace
730
+ - metrics: Token counts, tool usage, duration
731
+ """
732
+
733
+ async def evaluate(
734
+ self,
735
+ candidate: Candidate,
736
+ tasks: list[Task],
737
+ ) -> Any:
738
+ """Evaluate a candidate on a set of tasks.
739
+
740
+ Args:
741
+ candidate: The candidate to evaluate
742
+ tasks: Tasks to run the candidate on
743
+
744
+ Returns:
745
+ CandidateSummary with aggregated scores and per-task details.
746
+ Typed as Any to avoid circular imports — the actual return type
747
+ is flow.experiments.optimizer.CandidateSummary.
748
+ """
749
+ ...
750
+
751
+
752
  @runtime_checkable
753
  class CandidateStrategy(Protocol):
754
  """Protocol for generating candidate variants from a base agent.
755
 
756
  Implementations can be:
757
+ - Passive (single-shot): GridSearchStrategy ignores optional params
758
+ - Active (iterative): Uses runner to evaluate candidates, inspect failures,
759
+ and iteratively refine configurations based on real execution results
760
 
761
+ All logic is internal to the strategy the caller just calls generate()
762
  and receives the final list of candidates.
763
 
764
  Examples:
765
  - GridSearchStrategy: Exhaustive grid over parameter combinations
766
+ - (Future) InstructionOptimizer: Iteratively improves instructions from failures
767
  - (Future) BayesianStrategy: Bayesian optimization over parameters
768
  """
769
 
770
+ async def generate(
771
  self,
772
  base: Agent,
773
  budget: int,
774
  *,
775
  tasks: list[Task] | None = None,
776
+ runner: ExperimentRunner | None = None,
 
777
  ) -> list[Candidate]:
778
  """Generate candidate variants from a base agent.
779
 
780
  Args:
781
  base: The base agent to optimize
782
  budget: Maximum number of candidates to return
783
+ tasks: Optional tasks for active strategies that run internal experiments
784
+ runner: Optional experiment runner for active strategies.
785
+ Active strategies call runner.evaluate(candidate, tasks)
786
+ to test candidates and use results to guide optimization.
787
+ Passive strategies ignore this parameter.
788
 
789
  Returns:
790
  List of Candidate objects (at most `budget` items).
791
  For iterative strategies, returns the final/best candidates after
792
+ internal optimization loops complete. Candidates may include
793
+ optimization_history with per-iteration audit trail.
794
  """
795
  ...
796
 
 
824
  """
825
  self.variations = variations
826
 
827
+ async def generate(
828
  self,
829
  base: Agent,
830
  budget: int,
831
  *,
832
  tasks: list[Task] | None = None,
833
+ runner: ExperimentRunner | None = None,
 
834
  ) -> list[Candidate]:
835
  """Generate all grid combinations up to budget.
836
 
837
+ Note: tasks and runner are accepted for protocol compatibility but
838
+ ignored GridSearchStrategy is a passive strategy that doesn't
839
+ run experiments internally.
840
  """
841
  # Delete unused params to satisfy linters
842
+ del tasks, runner
843
 
844
  if not self.variations:
845
  return [Candidate(agent=base, mutations={}, rationale="baseline")]
 
1094
  # =============================================================================
1095
 
1096
 
1097
+ @dataclass
1098
+ class LiteralVariation:
1099
+ """A literal/static variation value.
1100
+
1101
+ Used for predefined values like tool presets, compaction configs, etc.
1102
+ """
1103
+
1104
+ value: Any
1105
+
1106
+
1107
+ @dataclass
1108
+ class StrategyVariation:
1109
+ """A strategy that generates variation values dynamically.
1110
+
1111
+ Used for active optimization strategies like GEPA (instructions)
1112
+ or agentic tool selection.
1113
+
1114
+ Attributes:
1115
+ strategy: Strategy name (e.g., "gepa", "agentic")
1116
+ max_candidates: Number of candidates this strategy will produce
1117
+ config: Strategy-specific configuration
1118
+ """
1119
+
1120
+ strategy: str
1121
+ max_candidates: int = 1
1122
+ config: dict[str, Any] = field(default_factory=dict)
1123
+
1124
+
1125
+ # Union type for variation items
1126
+ VariationItem = LiteralVariation | StrategyVariation
1127
+
1128
+
1129
  @dataclass
1130
  class Experiment:
1131
  """Experiment configuration for optimization.
 
1135
  - Experiment YAML: How to test it (variations, tasks, evaluation settings)
1136
 
1137
  Attributes:
1138
+ base_agent: Path to base agent YAML file (required)
1139
  suite: Built-in task suite name (e.g., "coding", "quick")
1140
  tasks: Path to custom tasks JSONL file (alternative to suite)
1141
+ variations: Dict mapping dimension names to lists of VariationItems
1142
  parallel: Max concurrent experiments
1143
+ budget: Maximum candidates to generate (safety limit)
1144
  use_llm_eval: Whether to use LLM-as-Judge evaluation
1145
 
1146
  Example YAML:
1147
  ```yaml
1148
+ base_agent: agents/coder.yaml
1149
  suite: coding
1150
 
1151
  variations:
1152
+ instructions:
1153
+ # Literal values
1154
+ - "You are a helpful coding assistant"
1155
+ - file: prompts/expert.md
1156
+ # Strategy (active optimization)
1157
+ - strategy: gepa
1158
+ max_candidates: 3
1159
+ config:
1160
+ reflection_lm: gpt-4o
1161
 
1162
  tools:
1163
  - minimal
1164
  - standard
1165
+ - strategy: agentic
1166
+ max_candidates: 2
1167
+
1168
+ compaction:
1169
+ - strategy: none
1170
+ - strategy: sliding_window
1171
+ token_budget: 50000
1172
 
1173
+ parallel: 8
1174
+ budget: 100
1175
  use_llm_eval: true
1176
  ```
1177
  """
1178
 
1179
+ base_agent: str
1180
  suite: str | None = None
1181
  tasks: str | None = None
1182
+ variations: dict[str, list[VariationItem]] = field(default_factory=dict)
1183
  parallel: int = 4
1184
  budget: int = 100
1185
  use_llm_eval: bool = True
1186
 
1187
 
1188
+ def compute_max_experiments(variations: dict[str, list[VariationItem]]) -> int:
1189
+ """Compute maximum number of experiments from variations.
1190
+
1191
+ Each dimension contributes its count (literals + strategy max_candidates),
1192
+ and total is the Cartesian product.
1193
+
1194
+ Args:
1195
+ variations: Parsed variations dict
1196
+
1197
+ Returns:
1198
+ Maximum number of experiments
1199
+ """
1200
+ if not variations:
1201
+ return 1
1202
+
1203
+ import math
1204
+
1205
+ counts = []
1206
+ for items in variations.values():
1207
+ dim_count = 0
1208
+ for item in items:
1209
+ if isinstance(item, StrategyVariation):
1210
+ dim_count += item.max_candidates
1211
+ else:
1212
+ dim_count += 1
1213
+ counts.append(max(dim_count, 1))
1214
+
1215
+ return math.prod(counts)
1216
+
1217
+
1218
+ def _parse_literal_value(dimension: str, value: Any) -> Any:
1219
+ """Parse a literal value for a specific dimension.
1220
+
1221
+ Handles special cases like compaction configs and file references.
1222
+
1223
+ Args:
1224
+ dimension: The dimension name (e.g., "compaction", "tools")
1225
+ value: The raw value from YAML
1226
+
1227
+ Returns:
1228
+ Parsed value appropriate for the dimension
1229
+ """
1230
+ # Handle file references
1231
+ if isinstance(value, dict) and "file" in value:
1232
+ file_path = Path(value["file"])
1233
+ if file_path.exists():
1234
+ return file_path.read_text()
1235
+ # If relative path, caller should resolve it
1236
+ return value
1237
+
1238
+ # Handle compaction dimension
1239
+ if dimension == "compaction":
1240
+ if isinstance(value, dict):
1241
+ # Dict with strategy key is a CompactionConfig
1242
+ return CompactionConfig(**value)
1243
+ elif isinstance(value, str):
1244
+ # Shorthand: "none", "head_tail", etc.
1245
+ if value == "none":
1246
+ return CompactionConfig.none()
1247
+ elif value == "head_tail":
1248
+ return CompactionConfig.head_tail()
1249
+ elif value == "sliding_window":
1250
+ return CompactionConfig.sliding_window()
1251
+ elif value == "summarization":
1252
+ return CompactionConfig.summarization()
1253
+ else:
1254
+ raise ValueError(f"Unknown compaction shorthand: {value}")
1255
+
1256
+ # All other values pass through as-is
1257
+ return value
1258
+
1259
+
1260
+ # Known compaction strategy names (these are NOT optimization strategies)
1261
+ _COMPACTION_STRATEGY_NAMES = {"none", "head_tail", "sliding_window", "summarization", "last_n", "head_tail_tokens"}
1262
+
1263
+
1264
+ def _is_strategy_variation(item: Any) -> bool:
1265
+ """Check if an item is a StrategyVariation (optimization strategy).
1266
+
1267
+ Distinguishes between:
1268
+ - StrategyVariation: {"strategy": "gepa", "max_candidates": 3, "config": {...}}
1269
+ - Compaction literal: {"strategy": "sliding_window", "token_budget": 50000}
1270
+
1271
+ The key difference:
1272
+ - Optimization strategies have max_candidates or config keys
1273
+ - Compaction configs have strategy names like "none", "sliding_window", etc.
1274
+
1275
+ Args:
1276
+ item: The raw item from YAML
1277
+
1278
+ Returns:
1279
+ True if this is a StrategyVariation, False if literal
1280
+ """
1281
+ if not isinstance(item, dict):
1282
+ return False
1283
+
1284
+ if "strategy" not in item:
1285
+ return False
1286
+
1287
+ strategy_name = item["strategy"]
1288
+
1289
+ # If it has max_candidates or config, it's definitely an optimization strategy
1290
+ if "max_candidates" in item or "config" in item:
1291
+ return True
1292
+
1293
+ # If the strategy name is a known compaction strategy, it's a literal
1294
+ if strategy_name in _COMPACTION_STRATEGY_NAMES:
1295
+ return False
1296
+
1297
+ # Otherwise assume it's an optimization strategy (will fail at runtime if invalid)
1298
+ return True
1299
+
1300
+
1301
  def load_experiment(path: Path) -> Experiment:
1302
  """Load an Experiment from a YAML file.
1303
 
1304
+ Parses variations into VariationItem objects (LiteralVariation or StrategyVariation).
1305
 
1306
  Args:
1307
  path: Path to the experiment YAML file
 
1318
 
1319
  data = yaml.safe_load(path.read_text())
1320
 
1321
+ # Validate required fields
1322
+ if "base_agent" not in data:
1323
+ raise ValueError("Experiment YAML must specify 'base_agent'")
1324
+
1325
+ # Parse variations into VariationItem objects
1326
+ variations: dict[str, list[VariationItem]] = {}
1327
  raw_variations = data.get("variations", {})
1328
 
1329
+ for dimension, items in raw_variations.items():
1330
+ parsed_items: list[VariationItem] = []
1331
+
1332
+ for item in items:
1333
+ if _is_strategy_variation(item):
1334
+ # This is a StrategyVariation (optimization strategy like "gepa")
1335
+ parsed_items.append(StrategyVariation(
1336
+ strategy=item["strategy"],
1337
+ max_candidates=item.get("max_candidates", 1),
1338
+ config=item.get("config", {}),
1339
+ ))
1340
+ else:
1341
+ # This is a LiteralVariation
1342
+ parsed_value = _parse_literal_value(dimension, item)
1343
+ parsed_items.append(LiteralVariation(value=parsed_value))
1344
+
1345
+ variations[dimension] = parsed_items
 
 
 
 
 
 
 
 
1346
 
1347
  return Experiment(
1348
+ base_agent=data["base_agent"],
1349
  suite=data.get("suite"),
1350
  tasks=data.get("tasks"),
1351
  variations=variations,
src/flow/experiments/optimizer.py CHANGED
@@ -24,11 +24,13 @@ from .ablation import compute_pareto_frontier
24
  from .evaluators import LLMEvaluator
25
  from .metrics import TraceMetrics, extract_metrics
26
  from .models import (
 
27
  Candidate,
28
  export_optimization_results,
29
  )
30
  from .runner import FlowExperimentRunner, setup_tracing
31
- from .types import RunResult, Task, load_tasks_from_jsonl as _load_tasks_impl
 
32
 
33
  logger = logging.getLogger(__name__)
34
 
@@ -45,6 +47,7 @@ class TaskResult:
45
  eval_passed: bool
46
  eval_reasoning: str
47
  criteria_results: list[dict[str, Any]] = field(default_factory=list) # Per-criterion scores
 
48
 
49
 
50
  @dataclass
@@ -57,6 +60,7 @@ class CandidateSummary:
57
 
58
  # Aggregated metrics
59
  avg_score: float = 0.0
 
60
  avg_tokens: float = 0.0
61
  avg_duration: float = 0.0
62
  pass_rate: float = 0.0
@@ -69,12 +73,17 @@ class CandidateSummary:
69
 
70
  def to_dict(self) -> dict[str, Any]:
71
  """Convert to dictionary for serialization."""
 
 
 
72
  return {
73
  "name": self.name,
 
74
  "agent": asdict(self.candidate.agent),
75
  "mutations": self.candidate.mutations,
76
  "rationale": self.candidate.rationale,
77
  "avg_score": self.avg_score,
 
78
  "avg_tokens": self.avg_tokens,
79
  "avg_duration": self.avg_duration,
80
  "pass_rate": self.pass_rate,
@@ -82,15 +91,22 @@ class CandidateSummary:
82
  "task_count": self.task_count,
83
  "pareto_rank": self.pareto_rank,
84
  "is_pareto_optimal": self.is_pareto_optimal,
85
- # Include per-task results with eval reasoning
86
  "task_results": [
87
  {
88
  "task_name": tr.task_name,
 
 
89
  "eval_score": tr.eval_score,
 
90
  "eval_passed": tr.eval_passed,
91
  "eval_reasoning": tr.eval_reasoning,
 
92
  "tokens": tr.metrics.total_tokens,
93
  "duration": tr.run_result.duration_seconds,
 
 
 
94
  }
95
  for tr in self.task_results
96
  ],
@@ -149,7 +165,7 @@ class FlowOptimizer:
149
  })
150
  optimizer = FlowOptimizer(parallel=4)
151
  base = Agent(name="my_agent")
152
- candidates = strategy.generate(base, budget=10)
153
  result = await optimizer.optimize(candidates, tasks)
154
  print(f"Best: {result.rank_by_score[0]}")
155
  """
@@ -164,11 +180,16 @@ class FlowOptimizer:
164
  self.use_llm_evaluator = use_llm_evaluator
165
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
166
 
 
 
 
 
167
  async def optimize(
168
  self,
169
  candidates: list[Candidate],
170
  tasks: list[Task],
171
  progress_callback: Callable[[int, int, str, str], None] | None = None,
 
172
  ) -> OptimizationResult:
173
  """Run optimization across all candidates and tasks.
174
 
@@ -176,13 +197,15 @@ class FlowOptimizer:
176
  candidates: Candidates to test
177
  tasks: Tasks to run each candidate on
178
  progress_callback: Optional callback(completed, total, candidate_name, task_name)
 
179
 
180
  Returns:
181
  OptimizationResult with rankings and exported agents
182
  """
183
  start_time = datetime.now()
184
  timestamp = start_time.strftime("%Y%m%d_%H%M%S")
185
- run_dir = self.output_dir / timestamp
 
186
  run_dir.mkdir(parents=True, exist_ok=True)
187
 
188
  setup_tracing("flow-optimizer")
@@ -202,6 +225,127 @@ class FlowOptimizer:
202
  if self.use_llm_evaluator:
203
  evaluator = self._create_evaluator()
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  task_results = await self._run_parallel(
206
  candidates, tasks, run_dir, evaluator, progress_callback
207
  )
@@ -266,10 +410,11 @@ class FlowOptimizer:
266
 
267
  async with lock:
268
  completed += 1
269
- status = "" if result.eval_passed else ""
270
  print(
271
  f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
272
  f"{status} score={result.eval_score:.2f} "
 
273
  f"tokens={result.metrics.total_tokens:,}"
274
  )
275
  if progress_callback:
@@ -289,6 +434,43 @@ class FlowOptimizer:
289
 
290
  return valid_results
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  async def _run_single(
293
  self,
294
  candidate: Candidate,
@@ -298,9 +480,13 @@ class FlowOptimizer:
298
  ) -> TaskResult:
299
  """Run a single candidate-task experiment."""
300
  # Import harness modules to register them, then use registry
301
- import flow.harness.maf # noqa: F401
 
 
302
  try:
303
- import flow.harness.miniagent # noqa: F401
 
 
304
  except ImportError:
305
  pass # miniagent harness is optional
306
  from flow.harness import create_harness
@@ -313,16 +499,24 @@ class FlowOptimizer:
313
  metrics = extract_metrics(run_result.trace)
314
 
315
  criteria_results: list[dict[str, Any]] = []
 
316
  if evaluator:
317
- eval_result = await evaluator.evaluate(run_result)
 
 
 
 
 
318
  eval_score = eval_result.score
319
  eval_passed = eval_result.passed
320
  eval_reasoning = eval_result.reasoning
 
321
  # Convert criteria results to dicts for serialization
322
  criteria_results = [
323
  {
324
  "name": cr.name,
325
  "score": cr.score,
 
326
  "passed": cr.passed,
327
  "reasoning": cr.reasoning,
328
  }
@@ -342,6 +536,7 @@ class FlowOptimizer:
342
  eval_passed=eval_passed,
343
  eval_reasoning=eval_reasoning,
344
  criteria_results=criteria_results,
 
345
  )
346
  finally:
347
  await harness.close()
@@ -370,6 +565,7 @@ class FlowOptimizer:
370
  candidate=candidate,
371
  task_results=results,
372
  avg_score=sum(r.eval_score for r in results) / len(results),
 
373
  avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
374
  avg_duration=sum(r.run_result.duration_seconds for r in results) / len(results),
375
  pass_rate=sum(1 for r in results if r.eval_passed) / len(results),
@@ -425,7 +621,7 @@ class FlowOptimizer:
425
  logger.info("Creating AsyncAzureOpenAI client for evaluator")
426
  client = AsyncAzureOpenAI(
427
  api_key=api_key,
428
- api_version="2024-02-15-preview",
429
  azure_endpoint=endpoint,
430
  )
431
 
@@ -480,13 +676,14 @@ class FlowOptimizer:
480
  print(" OPTIMIZATION RESULTS")
481
  print("=" * 70)
482
 
483
- print(f"\n{'Candidate':<30} | {'Score':>8} | {'Tokens':>10} | {'Pareto':>8}")
484
- print("-" * 65)
485
 
486
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
487
- pareto = "" if summary.is_pareto_optimal else ""
488
  print(
489
  f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
 
490
  f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
491
  )
492
 
@@ -510,3 +707,64 @@ def load_tasks_from_jsonl(path: Path) -> list[Task]:
510
  List of Task objects
511
  """
512
  return _load_tasks_impl(path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  from .evaluators import LLMEvaluator
25
  from .metrics import TraceMetrics, extract_metrics
26
  from .models import (
27
+ Agent,
28
  Candidate,
29
  export_optimization_results,
30
  )
31
  from .runner import FlowExperimentRunner, setup_tracing
32
+ from .types import RunResult, Task
33
+ from .types import load_tasks_from_jsonl as _load_tasks_impl
34
 
35
  logger = logging.getLogger(__name__)
36
 
 
47
  eval_passed: bool
48
  eval_reasoning: str
49
  criteria_results: list[dict[str, Any]] = field(default_factory=list) # Per-criterion scores
50
+ eval_reasoning_score: float = 0.0 # Partial credit for correct methodology
51
 
52
 
53
  @dataclass
 
60
 
61
  # Aggregated metrics
62
  avg_score: float = 0.0
63
+ avg_reasoning_score: float = 0.0
64
  avg_tokens: float = 0.0
65
  avg_duration: float = 0.0
66
  pass_rate: float = 0.0
 
73
 
74
  def to_dict(self) -> dict[str, Any]:
75
  """Convert to dictionary for serialization."""
76
+ # Extract candidate_id from mutations if available (set by GEPA adapter)
77
+ candidate_id = self.candidate.mutations.get("_candidate_id", None)
78
+
79
  return {
80
  "name": self.name,
81
+ "candidate_id": candidate_id,
82
  "agent": asdict(self.candidate.agent),
83
  "mutations": self.candidate.mutations,
84
  "rationale": self.candidate.rationale,
85
  "avg_score": self.avg_score,
86
+ "avg_reasoning_score": self.avg_reasoning_score,
87
  "avg_tokens": self.avg_tokens,
88
  "avg_duration": self.avg_duration,
89
  "pass_rate": self.pass_rate,
 
91
  "task_count": self.task_count,
92
  "pareto_rank": self.pareto_rank,
93
  "is_pareto_optimal": self.is_pareto_optimal,
94
+ # Include per-task results with full agent output and trace
95
  "task_results": [
96
  {
97
  "task_name": tr.task_name,
98
+ "task_prompt": tr.run_result.task.prompt,
99
+ "agent_output": tr.run_result.output,
100
  "eval_score": tr.eval_score,
101
+ "eval_reasoning_score": tr.eval_reasoning_score,
102
  "eval_passed": tr.eval_passed,
103
  "eval_reasoning": tr.eval_reasoning,
104
+ "criteria_results": tr.criteria_results,
105
  "tokens": tr.metrics.total_tokens,
106
  "duration": tr.run_result.duration_seconds,
107
+ "files_created": tr.run_result.files_created,
108
+ "tool_results": tr.run_result.tool_results,
109
+ "trace": tr.run_result.trace,
110
  }
111
  for tr in self.task_results
112
  ],
 
165
  })
166
  optimizer = FlowOptimizer(parallel=4)
167
  base = Agent(name="my_agent")
168
+ candidates = await strategy.generate(base, budget=10)
169
  result = await optimizer.optimize(candidates, tasks)
170
  print(f"Best: {result.rank_by_score[0]}")
171
  """
 
180
  self.use_llm_evaluator = use_llm_evaluator
181
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
182
 
183
+ # Internal state set during optimize() for use by evaluate()
184
+ self._evaluator: LLMEvaluator | None = None
185
+ self._run_dir: Path | None = None
186
+
187
  async def optimize(
188
  self,
189
  candidates: list[Candidate],
190
  tasks: list[Task],
191
  progress_callback: Callable[[int, int, str, str], None] | None = None,
192
+ run_dir: Path | None = None,
193
  ) -> OptimizationResult:
194
  """Run optimization across all candidates and tasks.
195
 
 
197
  candidates: Candidates to test
198
  tasks: Tasks to run each candidate on
199
  progress_callback: Optional callback(completed, total, candidate_name, task_name)
200
+ run_dir: Optional fixed directory for this run. If None, creates timestamped subdir.
201
 
202
  Returns:
203
  OptimizationResult with rankings and exported agents
204
  """
205
  start_time = datetime.now()
206
  timestamp = start_time.strftime("%Y%m%d_%H%M%S")
207
+ if run_dir is None:
208
+ run_dir = self.output_dir / timestamp
209
  run_dir.mkdir(parents=True, exist_ok=True)
210
 
211
  setup_tracing("flow-optimizer")
 
225
  if self.use_llm_evaluator:
226
  evaluator = self._create_evaluator()
227
 
228
+ # Store for use by evaluate() (ExperimentRunner protocol)
229
+ self._evaluator = evaluator
230
+ self._run_dir = run_dir
231
+
232
+ task_results = await self._run_parallel(
233
+ candidates, tasks, run_dir, evaluator, progress_callback
234
+ )
235
+
236
+ summaries = self._aggregate_results(task_results, candidates)
237
+ pareto_names = self._compute_pareto(summaries)
238
+
239
+ rank_by_score = sorted(summaries, key=lambda s: s.avg_score, reverse=True)
240
+ rank_by_tokens = sorted(summaries, key=lambda s: s.avg_tokens)
241
+ rank_by_efficiency = sorted(
242
+ summaries,
243
+ key=lambda s: s.avg_score / max(s.avg_tokens, 1),
244
+ reverse=True,
245
+ )
246
+
247
+ summary_dicts = [s.to_dict() for s in summaries]
248
+ exported = export_optimization_results(
249
+ summary_dicts, pareto_names, run_dir, timestamp
250
+ )
251
+
252
+ end_time = datetime.now()
253
+
254
+ result = OptimizationResult(
255
+ timestamp=timestamp,
256
+ output_dir=run_dir,
257
+ summaries=summaries,
258
+ pareto_frontier=pareto_names,
259
+ exported_agents=exported,
260
+ rank_by_score=[s.name for s in rank_by_score],
261
+ rank_by_tokens=[s.name for s in rank_by_tokens],
262
+ rank_by_efficiency=[s.name for s in rank_by_efficiency],
263
+ total_experiments=len(task_results),
264
+ total_duration_seconds=(end_time - start_time).total_seconds(),
265
+ )
266
+
267
+ self._save_results(result, run_dir)
268
+ self._print_summary(result)
269
+
270
+ return result
271
+
272
+ async def optimize_with_strategy(
273
+ self,
274
+ strategy: Any, # CandidateStrategy
275
+ base: Agent,
276
+ tasks: list[Task],
277
+ budget: int = 50,
278
+ progress_callback: Callable[[int, int, str, str], None] | None = None,
279
+ run_dir: Path | None = None,
280
+ ) -> OptimizationResult:
281
+ """Run optimization using a CandidateStrategy.
282
+
283
+ This is the entry point for strategy-driven optimization. It:
284
+ 1. Sets up infrastructure (evaluator, tracing, output dir)
285
+ 2. Passes self as ExperimentRunner to the strategy
286
+ 3. Runs the strategy's generate() to get candidates
287
+ 4. Does a final evaluation of returned candidates
288
+ 5. Performs Pareto analysis and exports results
289
+
290
+ For active strategies, the strategy will call self.evaluate()
291
+ during generate() to test candidates iteratively.
292
+
293
+ Args:
294
+ strategy: A CandidateStrategy implementation
295
+ base: Base agent to optimize
296
+ tasks: Tasks to evaluate candidates on
297
+ budget: Maximum candidates for the strategy to produce
298
+ progress_callback: Optional callback(completed, total, candidate, task)
299
+ run_dir: Optional fixed output directory
300
+
301
+ Returns:
302
+ OptimizationResult with rankings and exported agents
303
+ """
304
+ start_time = datetime.now()
305
+ timestamp = start_time.strftime("%Y%m%d_%H%M%S")
306
+ if run_dir is None:
307
+ run_dir = self.output_dir / timestamp
308
+ run_dir.mkdir(parents=True, exist_ok=True)
309
+
310
+ setup_tracing("flow-optimizer")
311
+
312
+ # Set up evaluator and store state for evaluate()
313
+ evaluator = None
314
+ if self.use_llm_evaluator:
315
+ evaluator = self._create_evaluator()
316
+ self._evaluator = evaluator
317
+ self._run_dir = run_dir
318
+
319
+ print("=" * 70)
320
+ print(" FLOW OPTIMIZER (Strategy Mode)")
321
+ print("=" * 70)
322
+ print(f" Strategy: {type(strategy).__name__}")
323
+ print(f" Base Agent: {base.name}")
324
+ print(f" Tasks: {len(tasks)}")
325
+ print(f" Budget: {budget}")
326
+ print(f" Parallel: {self.parallel}")
327
+ print(f" Output: {run_dir}")
328
+ print("=" * 70)
329
+
330
+ # Pass self as runner — FlowOptimizer implements the ExperimentRunner
331
+ # protocol via the evaluate() method above
332
+ candidates = await strategy.generate(
333
+ base=base,
334
+ budget=budget,
335
+ tasks=tasks,
336
+ runner=self,
337
+ )
338
+
339
+ if not candidates:
340
+ logger.warning("Strategy produced no candidates")
341
+ candidates = [Candidate(agent=base, mutations={}, rationale="baseline (strategy produced none)")]
342
+
343
+ print(f"\nStrategy produced {len(candidates)} candidates. Running final evaluation...")
344
+
345
+ # Save config
346
+ self._save_config(candidates, tasks, run_dir)
347
+
348
+ # Final evaluation of all candidates across all tasks
349
  task_results = await self._run_parallel(
350
  candidates, tasks, run_dir, evaluator, progress_callback
351
  )
 
410
 
411
  async with lock:
412
  completed += 1
413
+ status = "PASS" if result.eval_passed else "FAIL"
414
  print(
415
  f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
416
  f"{status} score={result.eval_score:.2f} "
417
+ f"reasoning={result.eval_reasoning_score:.2f} "
418
  f"tokens={result.metrics.total_tokens:,}"
419
  )
420
  if progress_callback:
 
434
 
435
  return valid_results
436
 
437
+ async def evaluate(
438
+ self,
439
+ candidate: Candidate,
440
+ tasks: list[Task],
441
+ ) -> CandidateSummary:
442
+ """Evaluate a candidate on a set of tasks.
443
+
444
+ Implements the ExperimentRunner protocol. Active strategies call this
445
+ to test candidates during their optimization loop, reusing the full
446
+ execution pipeline (harness, tracing, LLM evaluation, metrics).
447
+
448
+ This method requires that optimize() has been called first (or that
449
+ _evaluator and _run_dir have been set up), since it reuses the
450
+ optimizer's evaluator and output directory.
451
+
452
+ Args:
453
+ candidate: The candidate to evaluate
454
+ tasks: Tasks to run the candidate on
455
+
456
+ Returns:
457
+ CandidateSummary with aggregated scores and per-task details
458
+ """
459
+ if self._run_dir is None:
460
+ raise RuntimeError(
461
+ "evaluate() requires the optimizer to be initialized. "
462
+ "Call optimize() first, or use optimize_with_strategy() which handles setup."
463
+ )
464
+
465
+ task_results = await self._run_parallel(
466
+ [candidate], tasks, self._run_dir, self._evaluator, None
467
+ )
468
+ summaries = self._aggregate_results(task_results, [candidate])
469
+ if not summaries:
470
+ # Return empty summary if all experiments failed
471
+ return CandidateSummary(name=candidate.agent.name, candidate=candidate)
472
+ return summaries[0]
473
+
474
  async def _run_single(
475
  self,
476
  candidate: Candidate,
 
480
  ) -> TaskResult:
481
  """Run a single candidate-task experiment."""
482
  # Import harness modules to register them, then use registry
483
+ import flow.harness.maf as _maf
484
+
485
+ _ = _maf
486
  try:
487
+ import flow.harness.miniagent as _miniagent
488
+
489
+ _ = _miniagent
490
  except ImportError:
491
  pass # miniagent harness is optional
492
  from flow.harness import create_harness
 
499
  metrics = extract_metrics(run_result.trace)
500
 
501
  criteria_results: list[dict[str, Any]] = []
502
+ eval_reasoning_score = 0.0
503
  if evaluator:
504
+ if isinstance(evaluator, LLMEvaluator):
505
+ eval_result = await evaluator.evaluate(
506
+ run_result, instructions=candidate.agent.instructions
507
+ )
508
+ else:
509
+ eval_result = await evaluator.evaluate(run_result)
510
  eval_score = eval_result.score
511
  eval_passed = eval_result.passed
512
  eval_reasoning = eval_result.reasoning
513
+ eval_reasoning_score = eval_result.reasoning_score
514
  # Convert criteria results to dicts for serialization
515
  criteria_results = [
516
  {
517
  "name": cr.name,
518
  "score": cr.score,
519
+ "reasoning_score": cr.reasoning_score,
520
  "passed": cr.passed,
521
  "reasoning": cr.reasoning,
522
  }
 
536
  eval_passed=eval_passed,
537
  eval_reasoning=eval_reasoning,
538
  criteria_results=criteria_results,
539
+ eval_reasoning_score=eval_reasoning_score,
540
  )
541
  finally:
542
  await harness.close()
 
565
  candidate=candidate,
566
  task_results=results,
567
  avg_score=sum(r.eval_score for r in results) / len(results),
568
+ avg_reasoning_score=sum(r.eval_reasoning_score for r in results) / len(results),
569
  avg_tokens=sum(r.metrics.total_tokens for r in results) / len(results),
570
  avg_duration=sum(r.run_result.duration_seconds for r in results) / len(results),
571
  pass_rate=sum(1 for r in results if r.eval_passed) / len(results),
 
621
  logger.info("Creating AsyncAzureOpenAI client for evaluator")
622
  client = AsyncAzureOpenAI(
623
  api_key=api_key,
624
+ api_version="2024-08-01-preview", # Required for json_schema response_format
625
  azure_endpoint=endpoint,
626
  )
627
 
 
676
  print(" OPTIMIZATION RESULTS")
677
  print("=" * 70)
678
 
679
+ print(f"\n{'Candidate':<30} | {'Score':>8} | {'Reason':>8} | {'Tokens':>10} | {'Pareto':>8}")
680
+ print("-" * 75)
681
 
682
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
683
+ pareto = "*" if summary.is_pareto_optimal else ""
684
  print(
685
  f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
686
+ f"{summary.avg_reasoning_score:>8.2f} | "
687
  f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
688
  )
689
 
 
707
  List of Task objects
708
  """
709
  return _load_tasks_impl(path)
710
+
711
+
712
async def evaluate_agent(
    agent: Agent,
    tasks: list[Task],
    *,
    parallel: int = 4,
    use_llm_evaluator: bool = True,
    output_dir: Path | None = None,
) -> CandidateSummary:
    """Evaluate a single agent on a set of tasks.

    Typical uses:
    - Getting baseline performance before optimization
    - Testing a specific agent configuration
    - Validating an exported/promoted agent

    Example:
        from flow.experiments import Agent, evaluate_agent, get_task_suite

        agent = Agent(name="my-agent", instructions="You are helpful.")
        tasks = get_task_suite("coding")

        result = await evaluate_agent(agent, tasks)
        print(f"Score: {result.avg_score:.2f}")
        print(f"Pass rate: {result.pass_rate:.0%}")
        print(f"Avg tokens: {result.avg_tokens:,.0f}")

    Args:
        agent: The agent to evaluate
        tasks: List of tasks to run the agent on
        parallel: Number of concurrent task executions (default: 4)
        use_llm_evaluator: Whether to use LLM-as-Judge for scoring (default: True)
        output_dir: Optional directory for results (default: ~/.flow/evaluations)

    Returns:
        CandidateSummary with aggregated metrics: avg_score, pass_rate,
        avg_tokens, avg_duration, and a per-task breakdown in task_results.
    """
    # The optimizer operates on candidates, so wrap the bare agent in a
    # no-mutation candidate first.
    wrapped = Candidate(agent=agent, mutations={}, rationale="baseline evaluation")

    # Evaluations get their own default output directory, separate from
    # optimization runs.
    results_dir = output_dir or Path.home() / ".flow" / "evaluations"

    runner = FlowOptimizer(
        parallel=parallel,
        use_llm_evaluator=use_llm_evaluator,
        output_dir=results_dir,
    )
    outcome = await runner.optimize([wrapped], tasks)

    if not outcome.summaries:
        raise RuntimeError("Evaluation produced no results")

    return outcome.summaries[0]
src/flow/experiments/presets.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Agent presets — pre-configured agent bundles for common use cases.
3
+
4
+ Presets are the single source of truth for agent templates. They are:
5
+ - Defined here in Python
6
+ - Served to the UI via the /api/schema/agent endpoint
7
+ - Used in code via Agent.from_preset("coding")
8
+
9
+ Each preset bundles a full Agent configuration with metadata
10
+ (label, description, suggested datasets, tags) so users can
11
+ get started quickly without configuring every field.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass, field
17
+ from typing import TYPE_CHECKING
18
+
19
+ from .models import Agent, CompactionConfig
20
+
21
+ if TYPE_CHECKING:
22
+ pass
23
+
24
+
25
@dataclass
class AgentPreset:
    """A pre-configured agent bundle for a specific use case.

    Attributes:
        name: Machine identifier (e.g., "coding", "research")
        label: Human-readable name (e.g., "Coding Agent")
        description: What this preset is optimized for
        agent: Fully configured Agent instance
        suggested_datasets: Task suite names to evaluate this preset
        tags: Categorization tags for UI display
    """

    name: str  # machine identifier; also the key in AGENT_PRESETS
    label: str  # human-readable display name for UIs
    description: str  # what this preset is optimized for
    agent: Agent  # the fully configured Agent this preset wraps
    # Mutable defaults use default_factory so instances never share a list.
    suggested_datasets: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
44
+
45
+
46
# =============================================================================
# Preset Registry
# =============================================================================

# Single source of truth for agent templates. Keys double as the preset's
# `name` field; each entry bundles a ready-to-run Agent plus UI metadata.
AGENT_PRESETS: dict[str, AgentPreset] = {
    # File/shell-oriented agent for software engineering tasks.
    "coding": AgentPreset(
        name="coding",
        label="Coding Agent",
        description="Writes, debugs, and refactors code. Reads and edits files, "
        "runs shell commands, and tracks progress with todos.",
        agent=Agent(
            name="coding-agent",
            framework="miniagent",
            instructions_preset="general",
            compaction=CompactionConfig.none(),
            tools="standard",
        ),
        suggested_datasets=["quick", "coding"],
        tags=["code", "files", "debugging"],
    ),
    # Web-search/fact-finding agent.
    "research": AgentPreset(
        name="research",
        label="Research Agent",
        description="Answers factual questions using web search, fetches and reads "
        "web pages, and executes code for calculations. Verifies claims from sources.",
        agent=Agent(
            name="research-agent",
            framework="miniagent",
            instructions_preset="general",
            compaction=CompactionConfig.none(),
            tools="standard",
        ),
        suggested_datasets=["quick"],
        tags=["web", "search", "facts"],
    ),
    # Document-processing agent; uses the "full" tool set for format skills.
    "document-analysis": AgentPreset(
        name="document-analysis",
        label="Document Analysis Agent",
        description="Processes and analyzes documents including PDFs, Word docs, "
        "spreadsheets, and presentations. Uses specialized skills for document formats.",
        agent=Agent(
            name="document-analysis-agent",
            framework="miniagent",
            instructions_preset="general",
            compaction=CompactionConfig.none(),
            tools="full",
        ),
        suggested_datasets=["quick"],
        tags=["documents", "analysis", "skills"],
    ),
}
97
+
98
+
99
def get_preset(name: str) -> AgentPreset:
    """Look up an agent preset by name.

    Args:
        name: Preset identifier (e.g., "coding", "research", "document-analysis")

    Returns:
        The AgentPreset

    Raises:
        ValueError: If preset name is not found
    """
    preset = AGENT_PRESETS.get(name)
    if preset is None:
        available = ", ".join(AGENT_PRESETS.keys())
        raise ValueError(f"Unknown preset: {name!r}. Available: {available}")
    return preset
115
+
116
+
117
def get_all_presets() -> dict[str, AgentPreset]:
    """Return every registered agent preset.

    Returns:
        A shallow copy mapping preset names to AgentPreset instances,
        so callers cannot mutate the registry itself.
    """
    return {key: preset for key, preset in AGENT_PRESETS.items()}
src/flow/experiments/results.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Simple result types for the Agent API.
4
+
5
+ These types provide a clean, user-friendly interface for
6
+ evaluation and optimization results.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any
14
+
15
+ if TYPE_CHECKING:
16
+ from .optimizer import CandidateSummary
17
+
18
+
19
@dataclass
class EvaluationResult:
    """Outcome of running an agent against a task suite.

    Attributes:
        score: Average evaluation score (0.0 to 1.0)
        tokens: Total tokens used across all tasks
        pass_rate: Fraction of tasks that passed (0.0 to 1.0)
        duration: Total duration in seconds
        task_count: Number of tasks evaluated

    Example:
        result = await agent.evaluate(tasks="quick")
        print(f"Score: {result.score:.2f}")
        print(f"Pass rate: {result.pass_rate:.0%}")
    """

    score: float
    tokens: int
    pass_rate: float
    duration: float
    task_count: int

    # Set when agent was deployed — links to the DB job
    job_id: str | None = field(default=None, repr=False)

    # Internal reference to full details (for advanced users)
    _details: CandidateSummary | None = field(default=None, repr=False)

    def __str__(self) -> str:
        # Compact one-line summary used in logs and printed reports.
        summary = [
            f"score={self.score:.2f}",
            f"tokens={self.tokens:,}",
            f"pass_rate={self.pass_rate:.0%}",
        ]
        return ", ".join(summary)
50
+
51
+
52
@dataclass
class ImprovementMetrics:
    """Before/after deltas produced by an optimization run.

    Attributes:
        score_delta: Improvement in score (best - baseline)
        token_reduction_pct: Token reduction as percentage (positive = fewer tokens)

    Example:
        if result.improvement.token_reduction_pct > 20:
            print("Significant token savings!")
    """

    score_delta: float
    token_reduction_pct: float

    def __str__(self) -> str:
        # Zero renders as plain "0"; otherwise show an explicitly signed value.
        score_str = "0" if self.score_delta == 0 else f"{self.score_delta:+.2f}"

        pct = self.token_reduction_pct
        if pct == 0:
            token_str = "0%"
        else:
            # Positive reduction means fewer tokens, so it displays as "-N%".
            sign = "-" if pct > 0 else "+"
            token_str = f"{sign}{abs(pct):.0f}%"

        return f"score: {score_str}, tokens: {token_str}"
78
+
79
+
80
@dataclass
class AgentOptimizationResult:
    """Outcome of optimizing an agent.

    Attributes:
        baseline: Performance of the original agent
        best: Performance of the best found configuration
        improvement: Metrics showing improvement over baseline
        best_agent: The optimized agent configuration
        candidates_tested: Number of candidates evaluated
        pareto_frontier: Names of Pareto-optimal candidates
        output_dir: Directory where detailed results are saved

    Example:
        result = await agent.optimize(tasks="quick")
        print(f"Best score: {result.best.score:.2f}")
        print(f"Token reduction: {result.improvement.token_reduction_pct:.0f}%")

        # Use the optimized agent
        optimized_agent = result.best_agent
    """

    baseline: EvaluationResult
    best: EvaluationResult
    improvement: ImprovementMetrics
    best_agent: Any  # Agent type (Any to avoid circular import)
    candidates_tested: int
    pareto_frontier: list[str]
    output_dir: Path

    # Set when agent was deployed — links to the DB job
    job_id: str | None = field(default=None, repr=False)

    def __str__(self) -> str:
        # Three-line human-readable report.
        report = [
            f"Optimization: {self.baseline} → {self.best}",
            f"Improvement: {self.improvement}",
            f"Candidates tested: {self.candidates_tested}",
        ]
        return "\n".join(report)
src/flow/experiments/runner.py CHANGED
@@ -17,6 +17,8 @@ from opentelemetry.sdk.trace import TracerProvider
17
  from opentelemetry.sdk.trace.export import SimpleSpanProcessor
18
  from opentelemetry.semconv._incubating.attributes.service_attributes import SERVICE_NAME
19
 
 
 
20
  from .trace_collector import FlowTraceCollector
21
  from .types import RunResult, Task
22
 
@@ -25,23 +27,40 @@ if TYPE_CHECKING:
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
 
 
28
 
29
- def setup_tracing(service_name: str = "flow-experiments") -> TracerProvider:
30
- """Setup OpenTelemetry tracing with in-memory collection.
31
 
32
- This creates a new TracerProvider configured for experiment tracing.
33
- Call this once at the start of your experiment session.
 
 
 
34
 
35
  Args:
36
  service_name: Name for the tracing service
37
 
38
  Returns:
39
- The configured TracerProvider
40
  """
 
 
 
 
 
 
41
  resource = Resource.create({SERVICE_NAME: service_name})
42
  provider = TracerProvider(resource=resource)
43
  trace.set_tracer_provider(provider)
44
 
 
 
 
 
 
45
  # Enable agent framework instrumentation if available
46
  try:
47
  from agent_framework.observability import enable_instrumentation
@@ -52,7 +71,16 @@ def setup_tracing(service_name: str = "flow-experiments") -> TracerProvider:
52
  except Exception as e:
53
  logger.debug(f"Could not enable Agent Framework instrumentation: {e}")
54
 
55
- return provider
 
 
 
 
 
 
 
 
 
56
 
57
 
58
  class FlowExperimentRunner:
@@ -60,7 +88,7 @@ class FlowExperimentRunner:
60
 
61
  The runner handles:
62
  - Setting up temporary workspaces
63
- - Collecting execution traces via OpenTelemetry
64
  - Measuring execution time
65
  - Capturing files created
66
  - Supporting streaming execution
@@ -97,18 +125,14 @@ class FlowExperimentRunner:
97
 
98
  async def run(
99
  self,
100
- harness: "BaseHarness",
101
  task: Task,
102
  workspace: Path | None = None,
103
  ) -> RunResult:
104
  """Run a harness on a task and collect results.
105
 
106
- This method:
107
- 1. Creates or uses a workspace directory
108
- 2. Sets up trace collection
109
- 3. Executes the harness with streaming
110
- 4. Collects output and files created
111
- 5. Returns a RunResult with all data
112
 
113
  Args:
114
  harness: The harness to run (any BaseHarness implementation)
@@ -134,30 +158,30 @@ class FlowExperimentRunner:
134
  # Track files before execution
135
  files_before = set(self._list_files(workspace))
136
 
137
- # Set up trace collection
138
- collector = FlowTraceCollector()
139
- processor: SimpleSpanProcessor | None = None
140
 
141
- try:
142
- provider = trace.get_tracer_provider()
143
- if isinstance(provider, TracerProvider):
144
- processor = SimpleSpanProcessor(collector)
145
- provider.add_span_processor(processor)
146
- logger.debug("Trace collection enabled")
147
- except Exception as e:
148
- logger.debug(f"Could not set up trace collection: {e}")
149
 
150
  # Execute the harness
151
  start_time = time.time()
152
  output_chunks: list[str] = []
 
153
  error: str | None = None
154
 
155
  try:
156
- # Change to workspace directory for execution
157
- original_cwd = os.getcwd()
158
- os.chdir(workspace)
 
 
 
 
 
 
159
 
160
- try:
161
  # Use streaming execution to capture all output
162
  async for event in harness.run_stream(task.prompt):
163
  # Collect text output
@@ -167,14 +191,13 @@ class FlowExperimentRunner:
167
  if event.type in (EventType.TEXT_DELTA, EventType.TEXT_DONE):
168
  output_chunks.append(event.content)
169
  elif event.type == EventType.TOOL_RESULT:
170
- # Optionally capture tool results
171
- pass
 
 
172
  elif event.type == EventType.ERROR:
173
- # Capture error from harness
174
  error = event.content
175
  logger.error(f"Harness error: {error}")
176
- finally:
177
- os.chdir(original_cwd)
178
 
179
  except Exception as e:
180
  error = str(e)
@@ -183,22 +206,11 @@ class FlowExperimentRunner:
183
  end_time = time.time()
184
  duration_seconds = end_time - start_time
185
 
186
- # Force flush and get traces
187
- if processor:
188
- try:
189
- processor.force_flush()
190
- except Exception as e:
191
- logger.debug(f"Error flushing processor: {e}")
192
-
193
- # Get collected traces
194
- trace_data = collector.get_traces()
195
-
196
- # Clean up trace processor
197
- if processor:
198
- try:
199
- processor.shutdown()
200
- except Exception as e:
201
- logger.debug(f"Error shutting down processor: {e}")
202
 
203
  # Find files created
204
  files_after = set(self._list_files(workspace))
@@ -223,6 +235,7 @@ class FlowExperimentRunner:
223
  duration_seconds=duration_seconds,
224
  workspace=workspace,
225
  error=error,
 
226
  )
227
 
228
  def _list_files(self, directory: Path) -> list[str]:
 
17
  from opentelemetry.sdk.trace.export import SimpleSpanProcessor
18
  from opentelemetry.semconv._incubating.attributes.service_attributes import SERVICE_NAME
19
 
20
+ from flow.tools.workspace import set_workspace
21
+
22
  from .trace_collector import FlowTraceCollector
23
  from .types import RunResult, Task
24
 
 
27
 
28
  logger = logging.getLogger(__name__)
29
 
30
+ # Module-level shared collector — set up once via setup_tracing()
31
+ _shared_collector: FlowTraceCollector | None = None
32
+
33
 
34
+ def setup_tracing(service_name: str = "flow-experiments") -> FlowTraceCollector:
35
+ """Setup OpenTelemetry tracing with a single shared collector.
36
 
37
+ Creates one TracerProvider + one SimpleSpanProcessor + one FlowTraceCollector.
38
+ Idempotent: if already set up, returns the existing collector. This avoids
39
+ the issue where ``trace.set_tracer_provider()`` silently ignores subsequent
40
+ calls (OTEL SDK only allows setting the provider once), which would cause
41
+ a new collector to be created but never receive any spans.
42
 
43
  Args:
44
  service_name: Name for the tracing service
45
 
46
  Returns:
47
+ The shared FlowTraceCollector (also stored module-level)
48
  """
49
+ global _shared_collector
50
+
51
+ # Already set up — return existing collector
52
+ if _shared_collector is not None:
53
+ return _shared_collector
54
+
55
  resource = Resource.create({SERVICE_NAME: service_name})
56
  provider = TracerProvider(resource=resource)
57
  trace.set_tracer_provider(provider)
58
 
59
+ # Create ONE shared collector and ONE processor
60
+ _shared_collector = FlowTraceCollector()
61
+ processor = SimpleSpanProcessor(_shared_collector)
62
+ provider.add_span_processor(processor)
63
+
64
  # Enable agent framework instrumentation if available
65
  try:
66
  from agent_framework.observability import enable_instrumentation
 
71
  except Exception as e:
72
  logger.debug(f"Could not enable Agent Framework instrumentation: {e}")
73
 
74
+ return _shared_collector
75
+
76
+
77
+ def get_shared_collector() -> FlowTraceCollector | None:
78
+ """Get the shared trace collector (if setup_tracing was called).
79
+
80
+ Returns:
81
+ The shared FlowTraceCollector or None
82
+ """
83
+ return _shared_collector
84
 
85
 
86
  class FlowExperimentRunner:
 
88
 
89
  The runner handles:
90
  - Setting up temporary workspaces
91
+ - Collecting execution traces via OpenTelemetry (isolated per task)
92
  - Measuring execution time
93
  - Capturing files created
94
  - Supporting streaming execution
 
125
 
126
  async def run(
127
  self,
128
+ harness: BaseHarness,
129
  task: Task,
130
  workspace: Path | None = None,
131
  ) -> RunResult:
132
  """Run a harness on a task and collect results.
133
 
134
+ Uses a root span to obtain a trace_id, then retrieves only this
135
+ task's spans from the shared collector after execution.
 
 
 
 
136
 
137
  Args:
138
  harness: The harness to run (any BaseHarness implementation)
 
158
  # Track files before execution
159
  files_before = set(self._list_files(workspace))
160
 
161
+ # Get the shared collector (set up by setup_tracing)
162
+ collector = _shared_collector
 
163
 
164
+ # Create a root span to get a unique trace_id for this task
165
+ tracer = trace.get_tracer("flow.experiments", "0.1.0")
166
+ task_trace_ids: set[str] = set()
 
 
 
 
 
167
 
168
  # Execute the harness
169
  start_time = time.time()
170
  output_chunks: list[str] = []
171
+ tool_results: list[dict[str, str]] = []
172
  error: str | None = None
173
 
174
  try:
175
+ # Set workspace via contextvar (safe for concurrent async tasks —
176
+ # each task gets its own contextvar copy, no process-global cwd mutation)
177
+ set_workspace(workspace)
178
+
179
+ # Create root span — all child spans inherit its trace_id
180
+ with tracer.start_as_current_span(f"task_{task.name}") as root_span:
181
+ trace_id = format(root_span.get_span_context().trace_id, "032x")
182
+ task_trace_ids.add(trace_id)
183
+ logger.debug(f"Task '{task.name}' trace_id: {trace_id}")
184
 
 
185
  # Use streaming execution to capture all output
186
  async for event in harness.run_stream(task.prompt):
187
  # Collect text output
 
191
  if event.type in (EventType.TEXT_DELTA, EventType.TEXT_DONE):
192
  output_chunks.append(event.content)
193
  elif event.type == EventType.TOOL_RESULT:
194
+ tool_results.append({
195
+ "tool": event.tool_name or "unknown",
196
+ "output": event.content,
197
+ })
198
  elif event.type == EventType.ERROR:
 
199
  error = event.content
200
  logger.error(f"Harness error: {error}")
 
 
201
 
202
  except Exception as e:
203
  error = str(e)
 
206
  end_time = time.time()
207
  duration_seconds = end_time - start_time
208
 
209
+ # Retrieve only this task's traces from the shared collector
210
+ if collector is not None:
211
+ trace_data = collector.get_traces_for_task(task_trace_ids)
212
+ else:
213
+ trace_data = []
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  # Find files created
216
  files_after = set(self._list_files(workspace))
 
235
  duration_seconds=duration_seconds,
236
  workspace=workspace,
237
  error=error,
238
+ tool_results=tool_results,
239
  )
240
 
241
  def _list_files(self, directory: Path) -> list[str]:
src/flow/experiments/strategies/__init__.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Strategy registry for optimization.
4
+
5
+ Provides a registry of available strategies that can be used in experiment YAML
6
+ via the `strategy:` key in variations.
7
+
8
+ Example YAML:
9
+ variations:
10
+ instructions:
11
+ - "You are helpful" # Literal
12
+ - strategy: gepa # Strategy
13
+ max_candidates: 3
14
+ config:
15
+ reflection_lm: gpt-4o
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ from typing import TYPE_CHECKING, Any
22
+
23
+ if TYPE_CHECKING:
24
+ from ..models import CandidateStrategy
25
+
26
logger = logging.getLogger(__name__)

# Registry of strategy factories, keyed by the name used in experiment YAML.
# Each value is a class that get_strategy() instantiates as ``cls(config=config)``.
_STRATEGY_REGISTRY: dict[str, type] = {}


def register_strategy(name: str, strategy_class: type) -> None:
    """Register *strategy_class* under *name* for YAML lookup.

    Args:
        name: Strategy name used in YAML
        strategy_class: Strategy class to instantiate
    """
    _STRATEGY_REGISTRY[name] = strategy_class
    logger.debug(f"Registered strategy: {name}")


def get_strategy(name: str, config: dict[str, Any]) -> CandidateStrategy:
    """Look up a registered strategy by name and instantiate it.

    Args:
        name: Strategy name from YAML
        config: Strategy configuration dict

    Returns:
        Instantiated strategy

    Raises:
        ValueError: If strategy name is unknown
    """
    if name not in _STRATEGY_REGISTRY:
        raise ValueError(
            f"Unknown strategy: {name}. Available: {list(_STRATEGY_REGISTRY)}"
        )
    return _STRATEGY_REGISTRY[name](config=config)


def get_registered_strategies() -> dict[str, type]:
    """Return a snapshot copy of the registry (name -> strategy class)."""
    return dict(_STRATEGY_REGISTRY)


# =============================================================================
# Register built-in strategies
# =============================================================================

def _register_builtin_strategies() -> None:
    """Register the built-in strategies, skipping any whose imports fail.

    Each registration is guarded separately so one missing optional
    dependency does not prevent the others from being registered.
    """
    # GEPA strategy (optional - requires gepa package)
    try:
        from flow.optimizers.gepa_adapter import GepaStrategy
    except ImportError:
        logger.debug("GEPA strategy not available (gepa package not installed)")
    else:
        register_strategy("gepa", GepaStrategy)

    # LLM rewriter strategy (simple instruction variations)
    try:
        from .llm_rewriter import LLMRewriterStrategy
    except ImportError:
        logger.debug("LLM rewriter strategy not available")
    else:
        register_strategy("llm_rewriter", LLMRewriterStrategy)

    # Tool selector strategy (generates tool configurations)
    try:
        from .tool_selector import ToolSelectorStrategy
    except ImportError:
        logger.debug("Tool selector strategy not available")
    else:
        register_strategy("tool_selector", ToolSelectorStrategy)


# Register on module import
_register_builtin_strategies()
src/flow/experiments/strategies/llm_rewriter.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """LLM-based instruction rewriter strategy.
4
+
5
+ This strategy always requires a runner and tasks. It:
6
+ 1. Evaluates the current instructions on all tasks
7
+ 2. Reflects on failures to understand what went wrong
8
+ 3. Rewrites instructions to address failures
9
+ 4. Re-evaluates and repeats until convergence or budget exhausted
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import os
16
+ from dataclasses import dataclass, field
17
+ from typing import Any
18
+
19
+ from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration
20
+ from ..types import Task
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass
class LLMRewriterStrategy:
    """Strategy that uses an LLM to iteratively improve agent instructions.

    Runs an evaluate-reflect-rewrite loop. Each iteration evaluates
    the current instructions on tasks via the runner, analyzes failures,
    and rewrites the instructions to address them. Stops when:
    - All tasks pass
    - Score improvement drops below min_improvement
    - max_iterations reached

    Requires both a runner (to evaluate candidates) and tasks (to test on).

    Config options:
        model: LLM for rewriting (default: gpt-4o-mini)
        max_iterations: Max optimization iterations (default: 5; values < 1
            are clamped to 1 so at least one evaluation always runs)
        min_improvement: Min score gain to continue (default: 0.05)

    Example YAML:
        strategy:
          type: llm_rewriter
          config:
            model: gpt-4o-mini
            max_iterations: 5
            min_improvement: 0.05
    """

    # Free-form configuration from the experiment YAML (see class docstring).
    config: dict[str, Any] = field(default_factory=dict)

    async def generate(
        self,
        base: Agent,
        budget: int,
        *,
        tasks: list[Task] | None = None,
        runner: ExperimentRunner | None = None,
    ) -> list[Candidate]:
        """Generate optimized instruction variants via evaluate-reflect-rewrite loop.

        Args:
            base: Base agent with instructions to rewrite
            budget: Max candidates to generate (currently unused — this
                strategy always returns a single best candidate; kept for
                interface compatibility with other strategies)
            tasks: Tasks to evaluate on (required)
            runner: ExperimentRunner for evaluation (required)

        Returns:
            List of candidates with optimized instructions

        Raises:
            ValueError: If tasks or runner not provided
        """
        if runner is None:
            raise ValueError(
                "LLMRewriterStrategy requires a runner. "
                "Use FlowOptimizer.optimize_with_strategy() to provide one."
            )
        if not tasks:
            raise ValueError(
                "LLMRewriterStrategy requires tasks to evaluate against."
            )

        base_instructions = base.instructions or "You are a helpful assistant."
        return await self._generate_active(base, base_instructions, budget, tasks, runner)

    async def _generate_active(
        self,
        base: Agent,
        instructions: str,
        budget: int,
        tasks: list[Task],
        runner: ExperimentRunner,
    ) -> list[Candidate]:
        """Run active optimization loop with real evaluation feedback.

        Each iteration builds a candidate agent from the current instructions,
        evaluates it with the runner, records a StrategyIteration, and (unless
        a stop condition fires) rewrites the instructions from the failures.
        Returns a single candidate carrying the best-scoring instructions and
        the full iteration history.
        """
        model = self.config.get("model", "gpt-4o-mini")
        # Clamp to >= 1: with a non-positive max_iterations the loop would
        # never run and history[0] below would raise IndexError.
        max_iterations = max(1, self.config.get("max_iterations", 5))
        min_improvement = self.config.get("min_improvement", 0.05)

        logger.info(
            f"LLMRewriterStrategy: active mode (max_iterations={max_iterations}, "
            f"min_improvement={min_improvement})"
        )

        current_instructions = instructions
        best_instructions = instructions
        best_score = 0.0
        history: list[StrategyIteration] = []

        for iteration in range(max_iterations):
            # 1. Evaluate current instructions
            agent = Agent(
                name=f"{base.name}_rewrite_iter{iteration}",
                framework=base.framework,
                instructions=current_instructions,
                llm_config=base.llm_config,
                compaction=base.compaction,
                tools=base.tools,
            )
            candidate = Candidate(
                agent=agent,
                mutations={"instructions": current_instructions},
            )

            summary = await runner.evaluate(candidate, tasks)

            # getattr with defaults: tolerate summary objects missing fields.
            avg_score = getattr(summary, "avg_score", 0.0)
            pass_rate = getattr(summary, "pass_rate", 0.0)
            task_results = getattr(summary, "task_results", [])
            failures = [tr for tr in task_results if not getattr(tr, "eval_passed", True)]

            logger.info(
                f"  Iteration {iteration}: avg_score={avg_score:.3f}, "
                f"pass_rate={pass_rate:.1%}, failures={len(failures)}"
            )

            # Build per-task summary for rationale
            task_lines: list[str] = []
            for tr in task_results:
                task_name = getattr(tr, "task_name", "unknown")
                passed = getattr(tr, "eval_passed", True)
                reasoning = getattr(tr, "eval_reasoning", "")
                status = "PASS" if passed else "FAIL"
                task_lines.append(f"  [{status}] {task_name}: {reasoning[:150]}")
            tasks_summary = "\n".join(task_lines)

            # Record iteration (history[-1] is the *previous* iteration here,
            # since the current one has not been appended yet)
            change_desc = "Baseline evaluation" if iteration == 0 else f"Rewrite iteration {iteration}"
            change_rationale = f"Per-task results:\n{tasks_summary}"
            if iteration > 0:
                score_delta = avg_score - history[-1].avg_score
                change_rationale = (
                    f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
                    f"{len(failures)} failures remaining.\n{tasks_summary}"
                )

            history.append(
                StrategyIteration(
                    iteration=iteration,
                    instructions_preview=current_instructions[:200],
                    full_instructions=current_instructions,
                    avg_score=avg_score,
                    pass_rate=pass_rate,
                    failures_count=len(failures),
                    change_description=change_desc,
                    change_rationale=change_rationale,
                )
            )

            # Track best
            if avg_score > best_score:
                best_score = avg_score
                best_instructions = current_instructions

            # 2. Check stopping conditions.
            # NOTE: best_score already includes this iteration's score, so
            # the old extra "avg_score <= best_score" check was always true;
            # improvement alone decides.
            if iteration > 0:
                improvement = avg_score - history[-2].avg_score
                if improvement < min_improvement:
                    logger.info(
                        f"  Stopping: improvement ({improvement:.3f}) < "
                        f"min_improvement ({min_improvement})"
                    )
                    break

            if not failures:
                logger.info("  Stopping: all tasks passed")
                break

            if iteration == max_iterations - 1:
                break  # Don't rewrite on last iteration

            # 3. Reflect on failures and rewrite
            current_instructions = self._reflect_and_rewrite(
                current_instructions, failures, avg_score, model
            )
            logger.info(f"  Rewrote instructions ({len(current_instructions)} chars)")

        # Build final candidate with optimization history
        final_agent = Agent(
            name=f"{base.name}_llm_rewriter_optimized",
            framework=base.framework,
            instructions=best_instructions,
            llm_config=base.llm_config,
            compaction=base.compaction,
            tools=base.tools,
        )

        score_progression = f"{history[0].avg_score:.2f} → {best_score:.2f}"
        return [
            Candidate(
                agent=final_agent,
                mutations={"instructions": best_instructions},
                rationale=f"LLM rewriter active optimization: {len(history)} iterations, {score_progression}",
                optimization_history=history,
            )
        ]

    def _reflect_and_rewrite(
        self,
        instructions: str,
        failures: list[Any],
        current_score: float,
        model: str,
    ) -> str:
        """Analyze failures and rewrite instructions to address them.

        Falls back to :meth:`_fallback_rewrite` (which omits the original
        instructions) when the primary rewrite call fails — e.g. when the
        instructions themselves trip a provider content filter.
        """
        # Build failure analysis
        failure_descriptions = []
        for tr in failures[:5]:  # Limit to 5 failures for context
            task_name = getattr(tr, "task_name", "unknown")
            reasoning = getattr(tr, "eval_reasoning", "No reasoning")
            score = getattr(tr, "eval_score", 0.0)
            failure_descriptions.append(
                f"- Task '{task_name}' (score={score:.2f}): {reasoning[:200]}"
            )

        failures_text = "\n".join(failure_descriptions)

        prompt = f"""You are a prompt engineer writing guidelines for a coding assistant.

The assistant's current guidelines scored {current_score:.2f} out of 1.0 on a benchmark.

Here are the tasks where performance was low:
{failures_text}

The current guidelines are:
---
{instructions}
---

Write a new, improved version of the guidelines. The new guidelines should:
1. Help the assistant succeed on a wide range of coding tasks — the failures
   above are examples, but the guidelines must generalize beyond them
2. Include concrete strategies (e.g., always verify output, check edge cases,
   create and run files when asked)
3. Be general-purpose: do NOT reference specific task names, specific answers,
   or specific test cases from the failures above
4. Focus on transferable skills and habits (e.g., "verify output matches
   requirements" not "check that fibonacci returns 55")
5. Be concise

Output ONLY the new guidelines text, nothing else."""

        try:
            return self._call_llm(prompt, model) or instructions
        except Exception as e:
            logger.warning(f"LLM rewrite failed: {e}")
            # Primary prompt failed — the original instructions may have
            # triggered a content filter (Azure, OpenAI, etc.) or caused
            # another error. Try a fallback that omits them entirely.
            logger.info("Retrying rewrite with fallback prompt (without original instructions)")
            return self._fallback_rewrite(failures_text, current_score, model)

    def _fallback_rewrite(
        self,
        failures_text: str,
        current_score: float,
        model: str,
    ) -> str:
        """Generate new instructions from scratch when the primary rewrite is blocked.

        This avoids including the original instructions (which may trigger
        content filters) and instead writes fresh guidelines based solely on
        the task failure descriptions. If even this fails, returns a static
        default so the optimization loop can always continue.
        """
        prompt = f"""You are a prompt engineer. Write guidelines for a coding assistant.

The assistant scored {current_score:.2f} out of 1.0 on these tasks:
{failures_text}

Write concise guidelines that would help a coding assistant succeed on
a wide range of coding tasks. The failures above are examples — the
guidelines must generalize beyond them. The guidelines should:
1. Instruct the assistant to complete coding tasks by creating files and
   running code
2. Include strategies for verifying output and handling edge cases
3. Be general-purpose: do NOT reference specific task names or answers
   from the failures above
4. Focus on transferable habits and skills

Output ONLY the guidelines text, nothing else."""

        try:
            result = self._call_llm(prompt, model)
            if result:
                logger.info("Fallback rewrite succeeded")
                return result
        except Exception as e2:
            logger.warning(f"Fallback rewrite also failed: {e2}")

        # Last resort: return a sensible default
        logger.info("Using default coding assistant guidelines")
        return (
            "You are a helpful coding assistant. When given a task:\n"
            "1. Create the requested files with correct, working code\n"
            "2. Run the code and verify the output is correct\n"
            "3. Handle edge cases and validate results before finishing"
        )

    def _get_client(self, model: str) -> tuple[Any, str]:
        """Get OpenAI client and model name.

        Prefers Azure OpenAI when both AZURE_OPENAI_API_KEY and
        AZURE_OPENAI_ENDPOINT are set (using AZURE_OPENAI_DEPLOYMENT as the
        model name if present); otherwise falls back to plain OpenAI.

        Raises:
            ImportError: If the openai package is not installed
            ValueError: If no credentials are found in the environment
        """
        try:
            from openai import AzureOpenAI, OpenAI
        except ImportError as e:
            raise ImportError("openai package required for LLMRewriterStrategy") from e

        azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
        azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

        if azure_key and azure_endpoint:
            client = AzureOpenAI(
                api_key=azure_key,
                api_version="2024-08-01-preview",
                azure_endpoint=azure_endpoint,
            )
            model_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", model)
        else:
            openai_key = os.environ.get("OPENAI_API_KEY")
            if not openai_key:
                raise ValueError("No OpenAI or Azure OpenAI credentials found")
            client = OpenAI(api_key=openai_key)
            model_name = model

        return client, model_name

    def _call_llm(self, prompt: str, model: str) -> str:
        """Call the LLM with a single-user-message prompt; return its text ('' if empty)."""
        client, model_name = self._get_client(model)

        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content or ""
357
+
src/flow/experiments/strategies/tool_selector.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Active tool selector strategy.
4
+
5
+ Uses the runner to evaluate tool configurations and iteratively adjust
6
+ the tool set based on actual execution failures. The strategy:
7
+ 1. Evaluates the current tool set on all tasks
8
+ 2. Analyzes failures and trace data to identify missing/unnecessary tools
9
+ 3. Uses an LLM to recommend tool changes
10
+ 4. Re-evaluates and repeats until convergence or budget exhausted
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import os
17
+ from dataclasses import dataclass, field
18
+ from typing import Any
19
+
20
+ from ..metrics import extract_metrics
21
+ from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration, TOOL_PRESETS
22
+ from ..types import Task
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # All tools the strategy can choose from
27
+ ALL_AVAILABLE_TOOLS: list[str] = sorted(
28
+ {tool for preset in TOOL_PRESETS.values() for tool in preset}
29
+ )
30
+
31
+
32
@dataclass
class ToolSelectorStrategy:
    """Strategy that iteratively optimizes tool configurations via evaluation.

    Runs an evaluate-analyze-adjust loop. Each iteration evaluates
    the current tool set on tasks via the runner, analyzes which tools
    were used/missing from traces, and uses an LLM to recommend changes.

    Requires both a runner (to evaluate candidates) and tasks (to test on).

    Config options:
        model: LLM for tool recommendations (default: gpt-4o-mini)
        max_iterations: Max optimization iterations (default: 3)
        min_improvement: Min score gain to continue (default: 0.05)
        available_tools: List of tool names to choose from (default: all known tools)

    Example YAML:
        strategy:
          type: tool_selector
          config:
            model: gpt-4o-mini
            max_iterations: 3
    """

    # Free-form configuration from the experiment YAML (see class docstring).
    config: dict[str, Any] = field(default_factory=dict)

    async def generate(
        self,
        base: Agent,
        budget: int,
        *,
        tasks: list[Task] | None = None,
        runner: ExperimentRunner | None = None,
    ) -> list[Candidate]:
        """Generate optimized tool configurations via evaluate-analyze-adjust loop.

        Args:
            base: Base agent with initial tool configuration
            budget: Max candidates to generate (currently unused — the number
                of candidates returned is driven by the unique tool sets tried;
                kept for interface compatibility with other strategies)
            tasks: Tasks to evaluate on (required)
            runner: ExperimentRunner for evaluation (required)

        Returns:
            List of candidates with optimized tool sets

        Raises:
            ValueError: If tasks or runner not provided
        """
        if runner is None:
            raise ValueError(
                "ToolSelectorStrategy requires a runner. "
                "Use FlowOptimizer.optimize_with_strategy() to provide one."
            )
        if not tasks:
            raise ValueError(
                "ToolSelectorStrategy requires tasks to evaluate against."
            )

        # Resolve initial tools to a sorted list of tool names; treat a
        # missing or empty tool spec as "no tools".
        from ..models import resolve_tools
        if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
            current_tools = []
        else:
            current_tools = sorted(resolve_tools(base.tools).keys())

        return await self._generate_active(base, current_tools, budget, tasks, runner)

    async def _generate_active(
        self,
        base: Agent,
        tools: list[str],
        budget: int,
        tasks: list[Task],
        runner: ExperimentRunner,
    ) -> list[Candidate]:
        """Run active optimization loop with real evaluation feedback.

        Each iteration evaluates the current tool set, records a
        StrategyIteration, and (unless a stop condition fires) asks the LLM
        to adjust the tool set. Returns one candidate per unique tool set
        tried, with the best-scoring one carrying the full history.
        """
        model = self.config.get("model", "gpt-4o-mini")
        max_iterations = self.config.get("max_iterations", 3)
        min_improvement = self.config.get("min_improvement", 0.05)
        available_tools = self.config.get("available_tools", ALL_AVAILABLE_TOOLS)

        logger.info(
            f"ToolSelectorStrategy: active mode (max_iterations={max_iterations}, "
            f"available_tools={len(available_tools)})"
        )

        current_tools = tools
        # Tool set from the previous iteration, used to diff added/removed
        # tools in the rationale. Initialized here (not inside the loop) so
        # it is always bound — the old code assigned it only at the end of an
        # iteration and special-cased iteration 1 via best_tools.
        prev_tools = tools
        best_tools = tools
        best_score = 0.0
        history: list[StrategyIteration] = []
        # Track all unique tool configs tried, for returning as candidates
        iteration_candidates: list[tuple[list[str], str]] = []  # (tools, name_suffix)

        for iteration in range(max_iterations):
            # 1. Evaluate current tool set
            agent = Agent(
                name=f"{base.name}_tools_iter{iteration}",
                framework=base.framework,
                instructions=base.instructions,
                llm_config=base.llm_config,
                compaction=base.compaction,
                tools=current_tools,
            )
            candidate = Candidate(
                agent=agent,
                mutations={"tools": current_tools},
            )

            summary = await runner.evaluate(candidate, tasks)

            # getattr with defaults: tolerate summary objects missing fields.
            avg_score = getattr(summary, "avg_score", 0.0)
            pass_rate = getattr(summary, "pass_rate", 0.0)
            task_results = getattr(summary, "task_results", [])
            failures = [tr for tr in task_results if not getattr(tr, "eval_passed", True)]

            # Collect tool usage from traces (aggregated call counts by name)
            tools_used: dict[str, int] = {}
            for tr in task_results:
                metrics = getattr(tr, "metrics", None)
                if metrics and hasattr(metrics, "tool_calls_by_name"):
                    for name, count in metrics.tool_calls_by_name.items():
                        tools_used[name] = tools_used.get(name, 0) + count

            logger.info(
                f"  Iteration {iteration}: avg_score={avg_score:.3f}, "
                f"pass_rate={pass_rate:.1%}, failures={len(failures)}, "
                f"tools={current_tools}, used={tools_used}"
            )

            # Build per-task summary for rationale
            task_lines: list[str] = []
            for tr in task_results:
                task_name = getattr(tr, "task_name", "unknown")
                passed = getattr(tr, "eval_passed", True)
                reasoning = getattr(tr, "eval_reasoning", "")
                task_metrics = getattr(tr, "metrics", None)
                task_tools: dict[str, int] = {}
                if task_metrics and hasattr(task_metrics, "tool_calls_by_name"):
                    task_tools = dict(task_metrics.tool_calls_by_name)
                status = "PASS" if passed else "FAIL"
                tools_info = f" (tools used: {task_tools})" if task_tools else ""
                task_lines.append(f"  [{status}] {task_name}{tools_info}: {reasoning[:150]}")
            tasks_summary = "\n".join(task_lines)

            # Record iteration (history[-1] is the *previous* iteration here,
            # since the current one has not been appended yet)
            tools_desc = ", ".join(current_tools) or "(none)"
            used_desc = ", ".join(f"{k}={v}" for k, v in sorted(tools_used.items())) or "(none)"
            change_desc = "Baseline evaluation" if iteration == 0 else f"Tool adjustment iteration {iteration}"
            change_rationale = f"Tools used: {used_desc}\n{tasks_summary}"
            if iteration > 0:
                score_delta = avg_score - history[-1].avg_score
                added = set(current_tools) - set(prev_tools)
                removed = set(prev_tools) - set(current_tools)
                change_rationale = (
                    f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
                    f"Added: {sorted(added) or 'none'}. Removed: {sorted(removed) or 'none'}. "
                    f"{len(failures)} failures remaining.\n"
                    f"Tools used: {used_desc}\n{tasks_summary}"
                )
            history.append(
                StrategyIteration(
                    iteration=iteration,
                    instructions_preview=f"[{tools_desc}]"[:200],
                    full_instructions=f"[{tools_desc}]",
                    avg_score=avg_score,
                    pass_rate=pass_rate,
                    failures_count=len(failures),
                    change_description=change_desc,
                    change_rationale=change_rationale,
                )
            )

            # Track this iteration's config
            label = "baseline" if iteration == 0 else f"iter{iteration}"
            iteration_candidates.append((list(current_tools), label))

            # Track best
            if avg_score > best_score:
                best_score = avg_score
                best_tools = current_tools

            # 2. Check stopping conditions.
            # NOTE: best_score already includes this iteration's score, so
            # the old extra "avg_score <= best_score" check was always true;
            # improvement alone decides.
            if iteration > 0:
                improvement = avg_score - history[-2].avg_score
                if improvement < min_improvement:
                    logger.info(
                        f"  Stopping: improvement ({improvement:.3f}) < "
                        f"min_improvement ({min_improvement})"
                    )
                    break

            if not failures:
                logger.info("  Stopping: all tasks passed")
                break

            if iteration == max_iterations - 1:
                break  # Don't adjust on last iteration

            # 3. Analyze failures and adjust tools
            prev_tools = current_tools
            current_tools = self._analyze_and_adjust(
                current_tools, task_results, tools_used, available_tools, model
            )
            logger.info(f"  Adjusted tools: {current_tools}")

        # Build candidates for all unique tool configs tried
        # This gives the Pareto chart multiple data points to compare
        candidates: list[Candidate] = []
        seen_tool_sets: set[tuple[str, ...]] = set()

        for iter_tools, label in iteration_candidates:
            tool_key = tuple(sorted(iter_tools))
            if tool_key in seen_tool_sets:
                continue
            seen_tool_sets.add(tool_key)

            is_best = sorted(iter_tools) == sorted(best_tools)
            suffix = "optimized" if is_best else label
            agent = Agent(
                name=f"{base.name}_tools_{suffix}",
                framework=base.framework,
                instructions=base.instructions,
                llm_config=base.llm_config,
                compaction=base.compaction,
                tools=iter_tools,
            )
            tools_desc = ", ".join(iter_tools) or "(none)"
            candidates.append(
                Candidate(
                    agent=agent,
                    mutations={"tools": iter_tools},
                    rationale=f"Tools: [{tools_desc}]",
                    optimization_history=history if is_best else [],
                )
            )

        # Ensure best is always included (may differ from any iteration if
        # the best score was from an earlier iteration)
        best_key = tuple(sorted(best_tools))
        if best_key not in seen_tool_sets:
            final_agent = Agent(
                name=f"{base.name}_tools_optimized",
                framework=base.framework,
                instructions=base.instructions,
                llm_config=base.llm_config,
                compaction=base.compaction,
                tools=best_tools,
            )
            tools_desc = ", ".join(best_tools)
            candidates.append(
                Candidate(
                    agent=final_agent,
                    mutations={"tools": best_tools},
                    rationale=f"Tools: [{tools_desc}]",
                    optimization_history=history,
                )
            )

        return candidates

    def _analyze_and_adjust(
        self,
        current_tools: list[str],
        task_results: list[Any],
        tools_used: dict[str, int],
        available_tools: list[str],
        model: str,
    ) -> list[str]:
        """Analyze failures and traces, then recommend tool changes.

        Returns the LLM-recommended tool list (filtered to available_tools),
        a heuristic adjustment if the LLM call fails, or current_tools
        unchanged if the LLM returned nothing usable.
        """
        # Build analysis of what happened
        failure_descriptions = []
        for tr in task_results:
            task_name = getattr(tr, "task_name", "unknown")
            passed = getattr(tr, "eval_passed", True)
            reasoning = getattr(tr, "eval_reasoning", "")
            score = getattr(tr, "eval_score", 0.0)

            # Get per-task tool usage
            metrics = getattr(tr, "metrics", None)
            task_tools = {}
            if metrics and hasattr(metrics, "tool_calls_by_name"):
                task_tools = dict(metrics.tool_calls_by_name)

            status = "PASS" if passed else "FAIL"
            failure_descriptions.append(
                f"- [{status}] Task '{task_name}' (score={score:.2f}): "
                f"tools_used={task_tools}. {reasoning[:200]}"
            )

        results_text = "\n".join(failure_descriptions)
        not_in_current = sorted(set(available_tools) - set(current_tools))

        prompt = f"""You are optimizing the tool configuration for a coding assistant.

Current tools: {current_tools}
Available tools NOT currently enabled: {not_in_current}

Task results with this tool set:
{results_text}

Tool usage across all tasks: {tools_used}

Based on the failures and tool usage patterns, recommend an updated tool list.
Consider:
- Tools that were needed but missing (e.g., agent tried to search but had no grep)
- Tools that were never used (candidates for removal to reduce complexity)
- Tools that could help with the failed tasks

Rules:
- Only select from the full available set: {available_tools}
- Always include at minimum: read_file, write_file, bash
- Do NOT add tools just because they exist — only add tools that would
  address specific failure patterns seen above

Respond with ONLY a comma-separated list of tool names, nothing else.
Example: read_file, write_file, bash, grep, edit_file"""

        try:
            result = self._call_llm(prompt, model)
            if result:
                # Parse comma-separated tool names
                parsed = [t.strip() for t in result.split(",") if t.strip()]
                # Validate against available tools (the prompt's minimum-tool
                # rule is advisory only; we do not enforce it here)
                valid = [t for t in parsed if t in available_tools]
                if valid:
                    return sorted(valid)
                logger.warning(f"No valid tools in LLM response: {parsed}")
        except Exception as e:
            logger.warning(f"LLM tool adjustment failed: {e}")
            # Fallback: try adding commonly useful tools
            return self._heuristic_adjust(current_tools, tools_used, available_tools)

        # LLM returned nothing usable — keep the current configuration.
        return current_tools

    def _heuristic_adjust(
        self,
        current_tools: list[str],
        tools_used: dict[str, int],
        available_tools: list[str],
    ) -> list[str]:
        """Fallback heuristic when LLM is unavailable.

        Only ever adds tools (never removes), restricted to available_tools.
        """
        adjusted = set(current_tools)

        # If bash was used heavily but grep/glob not available, add them
        if "bash" in tools_used and tools_used["bash"] > 2:
            for tool in ["grep", "glob_files", "ls"]:
                if tool in available_tools:
                    adjusted.add(tool)

        # If write_file was used but edit_file not available, add it
        if "write_file" in tools_used and "edit_file" not in adjusted:
            if "edit_file" in available_tools:
                adjusted.add("edit_file")

        # Add think if not present (helps with reasoning)
        if "think" in available_tools:
            adjusted.add("think")

        return sorted(adjusted)

    def _get_client(self, model: str) -> tuple[Any, str]:
        """Get OpenAI client and model name.

        Prefers Azure OpenAI when both AZURE_OPENAI_API_KEY and
        AZURE_OPENAI_ENDPOINT are set (using AZURE_OPENAI_DEPLOYMENT as the
        model name if present); otherwise falls back to plain OpenAI.

        Raises:
            ImportError: If the openai package is not installed
            ValueError: If no credentials are found in the environment
        """
        try:
            from openai import AzureOpenAI, OpenAI
        except ImportError as e:
            raise ImportError("openai package required for ToolSelectorStrategy") from e

        azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
        azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

        if azure_key and azure_endpoint:
            client = AzureOpenAI(
                api_key=azure_key,
                api_version="2024-08-01-preview",
                azure_endpoint=azure_endpoint,
            )
            model_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", model)
        else:
            openai_key = os.environ.get("OPENAI_API_KEY")
            if not openai_key:
                raise ValueError("No OpenAI or Azure OpenAI credentials found")
            client = OpenAI(api_key=openai_key)
            model_name = model

        return client, model_name

    def _call_llm(self, prompt: str, model: str) -> str:
        """Call the LLM with a single-user-message prompt; return its text ('' if empty)."""
        client, model_name = self._get_client(model)

        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content or ""
src/flow/experiments/trace_collector.py CHANGED
@@ -1,8 +1,13 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
- """OpenTelemetry trace collector for experiment analysis."""
 
 
 
 
4
 
5
  import logging
 
6
  from datetime import datetime
7
  from typing import Any
8
 
@@ -12,24 +17,26 @@ logger = logging.getLogger(__name__)
12
 
13
 
14
  class FlowTraceCollector(SpanExporter):
15
- """Collects OpenTelemetry spans for experiment analysis.
16
 
17
- This exporter captures spans during agent execution and converts them
18
- to a dictionary format suitable for metrics extraction and analysis.
 
19
 
20
  Example:
21
  collector = FlowTraceCollector()
22
- # Attach to TracerProvider via SimpleSpanProcessor
23
- # Run agent execution
24
- traces = collector.get_traces()
25
  """
26
 
27
  def __init__(self) -> None:
28
  """Initialize the trace collector."""
29
- self.spans: list[dict[str, Any]] = []
 
30
 
31
  def export(self, spans: Any) -> SpanExportResult:
32
- """Collect spans from OpenTelemetry.
33
 
34
  Args:
35
  spans: Sequence of OpenTelemetry ReadableSpan objects
@@ -39,41 +46,46 @@ class FlowTraceCollector(SpanExporter):
39
  """
40
  for span in spans:
41
  try:
42
- # Convert nanoseconds to seconds for timestamps
43
- start_time = span.start_time / 1_000_000_000
44
- end_time = span.end_time / 1_000_000_000 if span.end_time else None
45
- duration_ms = ((end_time - start_time) * 1000) if end_time else None
46
-
47
- self.spans.append({
48
- "type": "trace_span",
49
- "timestamp": datetime.fromtimestamp(start_time).isoformat(),
50
- "data": {
51
- "operation_name": span.name,
52
- "span_id": format(span.context.span_id, "016x"),
53
- "trace_id": format(span.context.trace_id, "032x"),
54
- "parent_span_id": (
55
- format(span.parent.span_id, "016x") if span.parent else None
56
- ),
57
- "duration_ms": duration_ms,
58
- "attributes": dict(span.attributes) if span.attributes else {},
59
- "status": str(span.status.status_code.name) if hasattr(span, "status") else "OK",
60
- "events": [
61
- {
62
- "name": event.name,
63
- "timestamp": datetime.fromtimestamp(
64
- event.timestamp / 1_000_000_000
65
- ).isoformat(),
66
- "attributes": dict(event.attributes) if event.attributes else {},
67
- }
68
- for event in (span.events or [])
69
- ],
70
- },
71
- })
72
  except Exception as e:
73
  logger.debug(f"Failed to collect span: {e}")
74
 
75
  return SpanExportResult.SUCCESS
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def force_flush(self, timeout_millis: int = 30000) -> bool:
78
  """Force flush spans (no-op for simple collection).
79
 
@@ -89,16 +101,47 @@ class FlowTraceCollector(SpanExporter):
89
  """Shutdown the exporter (no-op)."""
90
  pass
91
 
92
- def get_traces(self) -> list[dict[str, Any]]:
93
- """Get and clear collected traces.
 
 
 
 
 
 
 
 
 
94
 
95
  Returns:
96
- List of collected trace spans, clearing the internal list
97
  """
98
- traces = self.spans.copy()
99
- self.spans.clear()
100
- return traces
101
-
102
- def clear(self) -> None:
103
- """Clear collected traces without returning them."""
104
- self.spans.clear()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
+ """OpenTelemetry trace collector for experiment analysis.
4
+
5
+ Uses trace_id-based bucketing to isolate spans per task, even when
6
+ multiple tasks run concurrently on a shared TracerProvider.
7
+ """
8
 
9
  import logging
10
+ import threading
11
  from datetime import datetime
12
  from typing import Any
13
 
 
17
 
18
 
19
  class FlowTraceCollector(SpanExporter):
20
+ """Collects OpenTelemetry spans, bucketed by trace_id for isolation.
21
 
22
+ All spans from the global TracerProvider flow into this single collector.
23
+ Spans are stored in per-trace_id buckets so that each task can retrieve
24
+ only its own spans without cross-contamination.
25
 
26
  Example:
27
  collector = FlowTraceCollector()
28
+ # Attach ONCE to the global TracerProvider via SimpleSpanProcessor
29
+ # Run multiple tasks concurrently — each gets a unique trace_id
30
+ task_traces = collector.get_traces_for_task({"abc123"})
31
  """
32
 
33
  def __init__(self) -> None:
34
  """Initialize the trace collector."""
35
+ self._spans_by_trace: dict[str, list[dict[str, Any]]] = {}
36
+ self._lock = threading.Lock()
37
 
38
def export(self, spans: Any) -> SpanExportResult:
    """Collect spans, bucketed by trace_id.

    Args:
        spans: Sequence of OpenTelemetry ReadableSpan objects

    Returns:
        SpanExportResult.SUCCESS — individual span failures are logged
        at debug level and skipped rather than failing the batch.
    """
    for readable_span in spans:
        try:
            bucket_key = format(readable_span.context.trace_id, "032x")
            converted = self._convert_span(readable_span)
            # setdefault keeps the create-or-append atomic under the lock.
            with self._lock:
                self._spans_by_trace.setdefault(bucket_key, []).append(converted)
        except Exception as e:
            logger.debug(f"Failed to collect span: {e}")

    return SpanExportResult.SUCCESS
60
 
61
def get_traces_for_task(self, trace_ids: set[str]) -> list[dict[str, Any]]:
    """Remove and return all spans recorded under any of *trace_ids*.

    Args:
        trace_ids: Set of trace_id hex strings to retrieve

    Returns:
        Every span dict bucketed under those trace_ids; the buckets are
        deleted, so a second call returns nothing for the same ids.
    """
    collected: list[dict[str, Any]] = []
    with self._lock:
        # pop() retrieves and deletes each bucket atomically under the lock.
        for trace_id in trace_ids:
            collected.extend(self._spans_by_trace.pop(trace_id, []))
    return collected
75
+
76
def get_traces(self) -> list[dict[str, Any]]:
    """Drain every collected span regardless of trace_id (legacy API).

    Returns:
        All span dicts in bucket-insertion order; internal state is cleared.
    """
    with self._lock:
        drained = [
            span
            for bucket in self._spans_by_trace.values()
            for span in bucket
        ]
        self._spans_by_trace.clear()
    return drained
88
+
89
  def force_flush(self, timeout_millis: int = 30000) -> bool:
90
  """Force flush spans (no-op for simple collection).
91
 
 
101
  """Shutdown the exporter (no-op)."""
102
  pass
103
 
104
def clear(self) -> None:
    """Discard all buffered spans without returning them."""
    # Mutate in place (not rebind) so any concurrent reader under the
    # lock always sees the same dict object.
    with self._lock:
        self._spans_by_trace.clear()
108
+
109
@staticmethod
def _convert_span(span: Any) -> dict[str, Any]:
    """Convert an OTEL ReadableSpan to a plain dict.

    Args:
        span: OpenTelemetry ReadableSpan

    Returns:
        Dictionary representation of the span
    """
    # OTEL timestamps are nanoseconds since the epoch; convert to seconds.
    started = span.start_time / 1_000_000_000
    ended = span.end_time / 1_000_000_000 if span.end_time else None
    elapsed_ms = ((ended - started) * 1000) if ended else None

    span_events = [
        {
            "name": event.name,
            "timestamp": datetime.fromtimestamp(
                event.timestamp / 1_000_000_000
            ).isoformat(),
            "attributes": dict(event.attributes) if event.attributes else {},
        }
        for event in (span.events or [])
    ]

    return {
        "type": "trace_span",
        "timestamp": datetime.fromtimestamp(started).isoformat(),
        "data": {
            "operation_name": span.name,
            "span_id": format(span.context.span_id, "016x"),
            "trace_id": format(span.context.trace_id, "032x"),
            "parent_span_id": (
                format(span.parent.span_id, "016x") if span.parent else None
            ),
            "duration_ms": elapsed_ms,
            "attributes": dict(span.attributes) if span.attributes else {},
            "status": str(span.status.status_code.name) if hasattr(span, "status") else "OK",
            "events": span_events,
        },
    }
src/flow/experiments/types.py CHANGED
@@ -61,6 +61,7 @@ class RunResult:
61
  duration_seconds: float
62
  workspace: Path
63
  error: str | None = None
 
64
 
65
  @property
66
  def success(self) -> bool:
@@ -74,7 +75,8 @@ class CriterionResult:
74
 
75
  Attributes:
76
  name: Name of the criterion evaluated
77
- score: Numeric score (0.0 to 1.0)
 
78
  passed: Whether the criterion was met
79
  reasoning: Explanation of the evaluation
80
  """
@@ -83,6 +85,7 @@ class CriterionResult:
83
  score: float
84
  passed: bool
85
  reasoning: str
 
86
 
87
 
88
  @dataclass
@@ -90,7 +93,8 @@ class EvalResult:
90
  """Result of evaluating an agent's output.
91
 
92
  Attributes:
93
- score: Overall weighted score (0.0 to 1.0)
 
94
  passed: Whether the evaluation passed overall
95
  criteria_results: Results for each individual criterion
96
  reasoning: Overall evaluation reasoning/summary
@@ -100,6 +104,7 @@ class EvalResult:
100
  passed: bool
101
  criteria_results: list[CriterionResult]
102
  reasoning: str
 
103
 
104
 
105
  # =============================================================================
 
61
  duration_seconds: float
62
  workspace: Path
63
  error: str | None = None
64
+ tool_results: list[dict[str, str]] = field(default_factory=list)
65
 
66
  @property
67
  def success(self) -> bool:
 
75
 
76
  Attributes:
77
  name: Name of the criterion evaluated
78
+ score: Numeric score (0.0 to 1.0) — exact match score
79
+ reasoning_score: Partial credit for correct reasoning/methodology (0.0 to 1.0)
80
  passed: Whether the criterion was met
81
  reasoning: Explanation of the evaluation
82
  """
 
85
  score: float
86
  passed: bool
87
  reasoning: str
88
+ reasoning_score: float = 0.0
89
 
90
 
91
  @dataclass
 
93
  """Result of evaluating an agent's output.
94
 
95
  Attributes:
96
+ score: Overall weighted exact-match score (0.0 to 1.0)
97
+ reasoning_score: Overall weighted reasoning/methodology score (0.0 to 1.0)
98
  passed: Whether the evaluation passed overall
99
  criteria_results: Results for each individual criterion
100
  reasoning: Overall evaluation reasoning/summary
 
104
  passed: bool
105
  criteria_results: list[CriterionResult]
106
  reasoning: str
107
+ reasoning_score: float = 0.0
108
 
109
 
110
  # =============================================================================
src/flow/harness/__init__.py CHANGED
@@ -16,6 +16,10 @@ Usage:
16
  harness = create_harness(agent, workspace=Path("/tmp"))
17
  """
18
 
 
 
 
 
19
  from flow.harness.base import BaseHarness, Event, EventType
20
  from flow.harness.registry import (
21
  available_frameworks,
@@ -24,10 +28,7 @@ from flow.harness.registry import (
24
  register,
25
  )
26
 
27
- # Auto-register harnesses by importing them
28
- # Each harness module calls register() on import
29
- from flow.harness import maf as _maf # noqa: F401
30
- from flow.harness import miniagent as _miniagent # noqa: F401
31
 
32
  __all__ = [
33
  "BaseHarness",
 
16
  harness = create_harness(agent, workspace=Path("/tmp"))
17
  """
18
 
19
+ # Auto-register harnesses by importing them
20
+ # Each harness module calls register() on import
21
+ from flow.harness import maf as _maf
22
+ from flow.harness import miniagent as _miniagent
23
  from flow.harness.base import BaseHarness, Event, EventType
24
  from flow.harness.registry import (
25
  available_frameworks,
 
28
  register,
29
  )
30
 
31
+ _ = (_maf, _miniagent) # Suppress unused import warnings
 
 
 
32
 
33
  __all__ = [
34
  "BaseHarness",
src/flow/harness/base.py CHANGED
@@ -10,7 +10,7 @@ from abc import ABC, abstractmethod
10
  from collections.abc import AsyncIterator
11
  from dataclasses import dataclass, field
12
  from enum import Enum
13
- from typing import TYPE_CHECKING
14
 
15
  if TYPE_CHECKING:
16
  from pathlib import Path
@@ -62,18 +62,30 @@ class BaseHarness(ABC):
62
 
63
  Implementations:
64
  - MAFHarness (flow.harness.maf): Microsoft Agent Framework
65
- - (Future) LangGraphHarness: LangGraph
66
- - (Future) ClaudeHarness: Claude SDK
 
 
 
 
 
 
67
  """
68
 
 
 
 
 
 
 
69
  @classmethod
70
  @abstractmethod
71
  def from_agent(
72
  cls,
73
- agent: "Agent",
74
- workspace: "Path",
75
- llm_config: "LLMClientConfig | None" = None,
76
- ) -> "BaseHarness":
77
  """Create a harness from an Agent definition.
78
 
79
  Args:
 
10
  from collections.abc import AsyncIterator
11
  from dataclasses import dataclass, field
12
  from enum import Enum
13
+ from typing import TYPE_CHECKING, ClassVar
14
 
15
  if TYPE_CHECKING:
16
  from pathlib import Path
 
62
 
63
  Implementations:
64
  - MAFHarness (flow.harness.maf): Microsoft Agent Framework
65
+ - LangGraphHarness (flow.harness.langgraph): LangGraph
66
+ - MiniAgentHarness (flow.harness.miniagent): MiniAgent
67
+
68
+ Class Attributes:
69
+ framework_name: Unique identifier for this framework (e.g., "maf", "langgraph")
70
+ framework_label: Human-readable label (e.g., "Microsoft Agent Framework")
71
+ framework_description: Short description of the framework
72
+ supported_compaction_strategies: List of compaction strategy names this framework supports
73
  """
74
 
75
+ # Framework metadata - subclasses should override these
76
+ framework_name: ClassVar[str] = ""
77
+ framework_label: ClassVar[str] = ""
78
+ framework_description: ClassVar[str] = ""
79
+ supported_compaction_strategies: ClassVar[list[str]] = []
80
+
81
  @classmethod
82
  @abstractmethod
83
  def from_agent(
84
  cls,
85
+ agent: Agent,
86
+ workspace: Path,
87
+ llm_config: LLMClientConfig | None = None,
88
+ ) -> BaseHarness:
89
  """Create a harness from an Agent definition.
90
 
91
  Args:
src/flow/harness/compaction/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Shared token-aware compaction strategies for all frameworks.
3
+
4
+ This module provides unified compaction strategies that work across
5
+ MAF, MiniAgent, and LangGraph frameworks. All strategies are token-based
6
+ to ensure safety against large messages.
7
+
8
+ Strategies:
9
+ - HeadTailStrategy: Keep head (system prompt) + tail (recent), drop middle
10
+ - SlidingWindowStrategy: Keep system + most recent messages within budget
11
+ - SummarizationStrategy: Summarize middle messages using LLM
12
+ - NoCompactionStrategy: Baseline (no management)
13
+
14
+ Usage:
15
+ from flow.harness.compaction import HeadTailStrategy, count_tokens
16
+
17
+ strategy = HeadTailStrategy(head_ratio=0.2, token_budget=200_000)
18
+ compacted = strategy.compact(messages)
19
+ """
20
+
21
+ from flow.harness.compaction.strategies import (
22
+ CompactionStrategy,
23
+ HeadTailStrategy,
24
+ NoCompactionStrategy,
25
+ SlidingWindowStrategy,
26
+ SummarizationStrategy,
27
+ )
28
+ from flow.harness.compaction.tokenizer import count_tokens, get_encoder
29
+
30
+ __all__ = [
31
+ "CompactionStrategy",
32
+ "HeadTailStrategy",
33
+ "NoCompactionStrategy",
34
+ "SlidingWindowStrategy",
35
+ "SummarizationStrategy",
36
+ "count_tokens",
37
+ "get_encoder",
38
+ ]
src/flow/harness/compaction/strategies.py ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Token-aware compaction strategies for context management.
3
+
4
+ All strategies use token counting (not message counting) to ensure
5
+ safety against large messages that could blow past LLM context limits.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Protocol
12
+
13
+ from flow.harness.compaction.tokenizer import (
14
+ count_message_tokens,
15
+ count_messages_tokens,
16
+ get_encoder,
17
+ )
18
+
19
+ # Default token budget (safe for GPT-4o, Claude 3.5, etc.)
20
+ DEFAULT_TOKEN_BUDGET = 200_000
21
+
22
+
23
+ class CompactionStrategy(Protocol):
24
+ """Protocol for compaction strategies.
25
+
26
+ All strategies must implement compact() which takes messages and
27
+ returns a (possibly compacted) list of messages.
28
+ """
29
+
30
+ def compact(
31
+ self,
32
+ messages: list[dict[str, Any]],
33
+ token_budget: int | None = None,
34
+ ) -> list[dict[str, Any]]:
35
+ """Compact messages to fit within token budget.
36
+
37
+ Args:
38
+ messages: List of chat message dicts
39
+ token_budget: Max tokens (uses strategy default if None)
40
+
41
+ Returns:
42
+ Compacted message list
43
+ """
44
+ ...
45
+
46
+
47
+ @dataclass
48
+ class NoCompactionStrategy:
49
+ """Baseline: no compaction, context grows unbounded.
50
+
51
+ Use this for benchmarking to see how context grows without management.
52
+ """
53
+
54
+ def compact(
55
+ self,
56
+ messages: list[dict[str, Any]],
57
+ token_budget: int | None = None,
58
+ ) -> list[dict[str, Any]]:
59
+ """Return messages unchanged."""
60
+ return messages
61
+
62
+
63
@dataclass
class HeadTailStrategy:
    """Token-aware head+tail compaction.

    Preserves:
    - Head: system prompt and initial user message (critical context)
    - Tail: recent tool calls and results (working memory)

    Middle messages are dropped when over budget, and atomic groups
    (a tool-call message plus its result messages) are never split.

    Recommended strategy for most use cases.
    """

    head_ratio: float = 0.2  # 20% of the budget reserved for the head
    token_budget: int = DEFAULT_TOKEN_BUDGET
    model: str = "gpt-4o"

    # Statistics
    compaction_count: int = field(default=0, repr=False)
    total_tokens_saved: int = field(default=0, repr=False)

    def _find_atomic_groups(
        self, messages: list[dict[str, Any]]
    ) -> list[tuple[int, ...]]:
        """Group tool_call messages with their results.

        OpenAI requires every tool_call to have a corresponding result,
        so a call message and its result messages form one atomic unit.

        Returns:
            List of index tuples; indices in each tuple stay together.
        """
        groups: list[tuple[int, ...]] = []
        idx = 0
        total = len(messages)

        while idx < total:
            calls = messages[idx].get("tool_calls")
            if not calls:
                groups.append((idx,))
                idx += 1
                continue

            # Collect the ids this message is waiting on, then scan
            # forward for the matching tool-result messages.
            pending_ids = {tc.get("id") for tc in calls if tc.get("id")}
            member_indices = [idx]
            scan = idx + 1
            while scan < total and pending_ids:
                candidate = messages[scan]
                if candidate.get("role") == "tool":
                    result_id = candidate.get("tool_call_id")
                    if result_id in pending_ids:
                        member_indices.append(scan)
                        pending_ids.remove(result_id)
                scan += 1

            groups.append(tuple(member_indices))
            idx = max(member_indices) + 1

        return groups

    def compact(
        self,
        messages: list[dict[str, Any]],
        token_budget: int | None = None,
    ) -> list[dict[str, Any]]:
        """Drop middle messages when the history exceeds the token budget.

        Args:
            messages: Chat message dicts in conversation order.
            token_budget: Max tokens (falls back to self.token_budget).

        Returns:
            The original list when under budget, otherwise a new list
            containing only the kept head and tail groups, in order.
        """
        if not messages:
            return messages

        limit = token_budget or self.token_budget
        encoder = get_encoder(self.model)
        before_tokens = count_messages_tokens(messages, self.model)
        if before_tokens <= limit:
            return messages

        # Over budget — compaction required.
        self.compaction_count += 1

        groups = self._find_atomic_groups(messages)
        head_limit = int(limit * self.head_ratio)
        tail_limit = limit - head_limit

        def group_cost(group: tuple[int, ...]) -> int:
            # Token cost of a whole atomic group (kept/dropped as a unit).
            return sum(
                count_message_tokens(messages[i], self.model, encoder)
                for i in group
            )

        # Greedily keep groups from the front until the head budget is spent.
        head_kept: list[tuple[int, ...]] = []
        head_used = 0
        for group in groups:
            cost = group_cost(group)
            if head_used + cost > head_limit:
                break
            head_kept.append(group)
            head_used += cost

        # Greedily keep groups from the back for the tail budget,
        # skipping everything already assigned to the head.
        tail_kept: list[tuple[int, ...]] = []
        tail_used = 0
        for group in reversed(groups[len(head_kept):]):
            cost = group_cost(group)
            if tail_used + cost > tail_limit:
                break
            tail_kept.insert(0, group)
            tail_used += cost

        keep: set[int] = set()
        for group in head_kept + tail_kept:
            keep.update(group)
        compacted = [messages[i] for i in sorted(keep)]

        # Track savings for reporting.
        after_tokens = count_messages_tokens(compacted, self.model)
        self.total_tokens_saved += before_tokens - after_tokens

        return compacted
193
+
194
+
195
@dataclass
class SlidingWindowStrategy:
    """Keep only recent messages within budget.

    Always preserves the leading system message(s) plus the most recent
    messages that fit in the budget, never splitting an atomic group
    (a tool-call message plus its results).

    Simpler than HeadTailStrategy but may lose important early context.
    """

    token_budget: int = DEFAULT_TOKEN_BUDGET
    model: str = "gpt-4o"

    def _find_atomic_groups(
        self, messages: list[dict[str, Any]]
    ) -> list[tuple[int, ...]]:
        """Group tool_call messages with their results."""
        groups: list[tuple[int, ...]] = []
        idx = 0
        total = len(messages)

        while idx < total:
            calls = messages[idx].get("tool_calls")
            if not calls:
                groups.append((idx,))
                idx += 1
                continue

            pending_ids = {tc.get("id") for tc in calls if tc.get("id")}
            member_indices = [idx]
            scan = idx + 1
            while scan < total and pending_ids:
                candidate = messages[scan]
                if candidate.get("role") == "tool":
                    result_id = candidate.get("tool_call_id")
                    if result_id in pending_ids:
                        member_indices.append(scan)
                        pending_ids.remove(result_id)
                scan += 1

            groups.append(tuple(member_indices))
            idx = max(member_indices) + 1

        return groups

    def compact(
        self,
        messages: list[dict[str, Any]],
        token_budget: int | None = None,
    ) -> list[dict[str, Any]]:
        """Keep system message(s) + most recent messages within budget.

        Args:
            messages: Chat message dicts in conversation order.
            token_budget: Max tokens (falls back to self.token_budget).

        Returns:
            The original list when under budget, otherwise the system
            prefix plus the newest groups that fit the remaining budget.
        """
        if not messages:
            return messages

        limit = token_budget or self.token_budget
        encoder = get_encoder(self.model)

        # The leading run of system messages is always preserved.
        prefix_end = 0
        while (
            prefix_end < len(messages)
            and messages[prefix_end].get("role") == "system"
        ):
            prefix_end += 1
        system_msgs = list(messages[:prefix_end])
        other_msgs = messages[prefix_end:]

        def cost(msgs: list[dict[str, Any]]) -> int:
            return sum(
                count_message_tokens(m, self.model, encoder) for m in msgs
            )

        remaining_budget = limit - cost(system_msgs)
        if remaining_budget <= 0:
            # System prefix alone exhausts the budget.
            return system_msgs

        if cost(other_msgs) <= remaining_budget:
            return messages  # Already within budget — no compaction.

        groups = self._find_atomic_groups(other_msgs)

        # Fill from the end, newest first, respecting atomic groups.
        kept_groups: list[tuple[int, ...]] = []
        used = 0
        for group in reversed(groups):
            group_tokens = sum(
                count_message_tokens(other_msgs[i], self.model, encoder)
                for i in group
            )
            if used + group_tokens > remaining_budget:
                break
            kept_groups.insert(0, group)
            used += group_tokens

        keep_idx: set[int] = set()
        for group in kept_groups:
            keep_idx.update(group)

        return system_msgs + [other_msgs[i] for i in sorted(keep_idx)]
307
+
308
+
309
@dataclass
class SummarizationStrategy:
    """Summarize old messages instead of dropping them.

    When over budget, this strategy:
    1. Keeps: system message + initial user message (head)
    2. Keeps: most recent messages (tail)
    3. Summarizes: everything in between into one "context so far" message

    This preserves state (files read, findings, progress) that simple
    truncation strategies would lose.

    Note: LLM-based summarization requires summarize_fn to be set and
    the async compact_async() path to be used; compact() is heuristic-only.
    """

    head_messages: int = 2  # Keep first N messages
    tail_messages: int = 4  # Keep last N messages
    summary_max_tokens: int = 1000
    token_budget: int = DEFAULT_TOKEN_BUDGET
    model: str = "gpt-4o"

    # Async function to generate summaries (must be set before use)
    summarize_fn: Any = field(default=None, repr=False)

    # Statistics
    compaction_count: int = field(default=0, repr=False)
    total_tokens_saved: int = field(default=0, repr=False)

    def _find_safe_split_points(
        self, messages: list[dict[str, Any]]
    ) -> tuple[int, int]:
        """Find split indices that never orphan a tool call from its result.

        Returns:
            (head_end, tail_start) — it is safe to summarize
            messages[head_end:tail_start].
        """
        # Build (start, end_exclusive) spans; a tool-call message and its
        # result messages form a span that must never be split.
        spans: list[tuple[int, int]] = []
        pos = 0
        while pos < len(messages):
            calls = messages[pos].get("tool_calls")
            if not calls:
                spans.append((pos, pos + 1))
                pos += 1
                continue

            pending = {tc.get("id") for tc in calls if tc.get("id")}
            last = pos
            cursor = pos + 1
            while cursor < len(messages) and pending:
                if messages[cursor].get("role") == "tool":
                    result_id = messages[cursor].get("tool_call_id")
                    if result_id in pending:
                        pending.discard(result_id)
                        last = cursor
                cursor += 1
            spans.append((pos, last + 1))
            pos = last + 1

        # Head ends after the first `head_messages` spans.
        head_end = 0
        for rank, (_lo, hi) in enumerate(spans):
            if rank >= self.head_messages:
                break
            head_end = hi

        # Tail begins at the start of the last `tail_messages` spans.
        tail_start = len(messages)
        tail_count = min(self.tail_messages, len(spans))
        if tail_count > 0 and len(spans) > tail_count:
            tail_start = spans[-tail_count][0]

        # Overlapping head/tail means nothing can be summarized safely.
        if head_end >= tail_start:
            return len(messages), len(messages)

        return head_end, tail_start

    def _extract_key_info(self, messages: list[dict[str, Any]]) -> str:
        """Heuristic, LLM-free summary of *messages* (fallback path)."""
        files_read: set[str] = set()
        key_findings: list[str] = []

        for msg in messages:
            # NOTE(review): this records the tool *name* ("read_file"),
            # not a file path — presumably the path lives in the message
            # content; confirm intended behavior before changing.
            if msg.get("role") == "tool" and msg.get("name") == "read_file":
                files_read.add(msg.get("name") or "file")
            if msg.get("role") == "assistant" and msg.get("content"):
                body = msg["content"]
                # Only short assistant notes are treated as key findings.
                if isinstance(body, str) and len(body) < 200:
                    key_findings.append(body)

        sections: list[str] = []
        if files_read:
            sections.append(f"Files accessed: {', '.join(files_read)}")
        if key_findings:
            sections.append(f"Key points: {'; '.join(key_findings[:5])}")

        return "\n".join(sections) if sections else "Previous context was processed."

    def compact(
        self,
        messages: list[dict[str, Any]],
        token_budget: int | None = None,
    ) -> list[dict[str, Any]]:
        """Summarize middle messages if over budget.

        Note: this synchronous path uses simple extraction only.
        For LLM-based summarization, use compact_async().
        """
        if not messages:
            return messages

        limit = token_budget or self.token_budget
        before_tokens = count_messages_tokens(messages, self.model)
        if before_tokens <= limit:
            return messages

        self.compaction_count += 1

        head_end, tail_start = self._find_safe_split_points(messages)
        head = messages[:head_end]
        middle = messages[head_end:tail_start]
        tail = messages[tail_start:]

        if not middle:
            return messages

        # Heuristic extraction (no LLM on the sync path).
        summary_text = self._extract_key_info(middle)

        checkpoint = {
            "role": "user",
            "content": f"[CONTEXT SUMMARY - Previous {len(middle)} messages compressed]\n\n{summary_text}\n\n[END SUMMARY - Continue from here]",
        }

        compacted = head + [checkpoint] + tail

        after_tokens = count_messages_tokens(compacted, self.model)
        self.total_tokens_saved += before_tokens - after_tokens

        return compacted

    async def compact_async(
        self,
        messages: list[dict[str, Any]],
        token_budget: int | None = None,
    ) -> list[dict[str, Any]]:
        """Async variant that can delegate summarization to an LLM."""
        if not messages:
            return messages

        limit = token_budget or self.token_budget
        before_tokens = count_messages_tokens(messages, self.model)
        if before_tokens <= limit:
            return messages

        self.compaction_count += 1

        head_end, tail_start = self._find_safe_split_points(messages)
        head = messages[:head_end]
        middle = messages[head_end:tail_start]
        tail = messages[tail_start:]

        if not middle:
            return messages

        # Prefer the configured LLM summarizer; fall back to heuristics
        # on any failure so compaction never blocks the run.
        if self.summarize_fn:
            try:
                summary_text = await self.summarize_fn(middle, self.summary_max_tokens)
            except Exception:
                summary_text = self._extract_key_info(middle)
        else:
            summary_text = self._extract_key_info(middle)

        checkpoint = {
            "role": "user",
            "content": f"""[CONTEXT CHECKPOINT - Your previous work has been summarized below]

{summary_text}

---
IMPORTANT: Continue from where you left off. Do not repeat work already done.""",
        }

        compacted = head + [checkpoint] + tail

        after_tokens = count_messages_tokens(compacted, self.model)
        self.total_tokens_saved += before_tokens - after_tokens

        return compacted
src/flow/harness/compaction/tokenizer.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Shared tiktoken wrapper for consistent token counting across frameworks."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import Any
7
+
8
+ import tiktoken
9
+
10
+ # Cache encoders to avoid repeated initialization
11
+ _ENCODER_CACHE: dict[str, tiktoken.Encoding] = {}
12
+
13
+ # Default encoding for unknown models
14
+ DEFAULT_ENCODING = "cl100k_base"
15
+
16
+
17
def get_encoder(model: str = "gpt-4o") -> tiktoken.Encoding:
    """Return the (cached) tiktoken encoder for *model*.

    Encoders are memoized in the module-level cache; model names tiktoken
    does not recognize fall back to the generic cl100k_base encoding.

    Args:
        model: Model name (e.g., "gpt-4o", "gpt-4", "gpt-3.5-turbo")

    Returns:
        tiktoken Encoding instance
    """
    encoder = _ENCODER_CACHE.get(model)
    if encoder is None:
        try:
            encoder = tiktoken.encoding_for_model(model)
        except KeyError:
            # Unknown to tiktoken (Claude, etc.) -> default encoding.
            encoder = tiktoken.get_encoding(DEFAULT_ENCODING)
        _ENCODER_CACHE[model] = encoder
    return encoder
37
+
38
+
39
def count_tokens(
    text: str,
    model: str = "gpt-4o",
    encoder: tiktoken.Encoding | None = None,
) -> int:
    """Return the token count of *text*.

    Args:
        text: The text to count tokens for
        model: Model name used to select an encoding when *encoder* is absent
        encoder: Optional pre-fetched encoder (skips the cache lookup)

    Returns:
        Number of tokens
    """
    active = encoder if encoder is not None else get_encoder(model)
    return len(active.encode(text))
57
+
58
+
59
def count_message_tokens(
    message: dict[str, Any],
    model: str = "gpt-4o",
    encoder: tiktoken.Encoding | None = None,
) -> int:
    """Count tokens in a chat message dict.

    Handles:
    - role overhead (~4 tokens per message)
    - content text (plain strings and structured content lists)
    - tool_calls (name + arguments)
    - tool results

    Args:
        message: Chat message dict with role, content, etc.
        model: Model name for encoding selection
        encoder: Optional pre-fetched encoder (for performance)

    Returns:
        Approximate token count for the message
    """
    if encoder is None:
        encoder = get_encoder(model)

    total = 4  # Role overhead (approximate)

    # Content: either a plain string or a list of content blocks.
    content = message.get("content")
    if content:
        if isinstance(content, str):
            total += len(encoder.encode(content))
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and "text" in item:
                    total += len(encoder.encode(item["text"]))
                elif isinstance(item, str):
                    # Fix: plain string blocks were previously ignored,
                    # undercounting structured content (now consistent with
                    # the MAF message store's counter).
                    total += len(encoder.encode(item))

    # Tool calls: accept both dict-style entries (flat or with OpenAI's
    # nested "function" shape) and attribute-style objects.
    tool_calls = message.get("tool_calls")
    if tool_calls:
        for tc in tool_calls:
            total += 4  # Tool call overhead
            if isinstance(tc, dict):
                name = tc.get("name") or tc.get("function", {}).get("name", "")
                args = tc.get("arguments") or tc.get("function", {}).get("arguments", "")
            else:
                # Handle object-style tool calls
                name = getattr(tc, "name", "")
                args = getattr(tc, "arguments", "")

            if name:
                total += len(encoder.encode(name))
            if args:
                total += len(encoder.encode(args))

    return total
115
+
116
+
117
def count_messages_tokens(
    messages: list[dict[str, Any]],
    model: str = "gpt-4o",
) -> int:
    """Count total tokens across all messages.

    Fetches the encoder once and reuses it for every message.

    Args:
        messages: List of chat message dicts
        model: Model name for encoding selection

    Returns:
        Total token count
    """
    shared_encoder = get_encoder(model)
    total = 0
    for message in messages:
        total += count_message_tokens(message, model, shared_encoder)
    return total
src/flow/harness/langgraph/__init__.py CHANGED
@@ -19,7 +19,11 @@ Usage:
19
  print(event.type, event.content)
20
  """
21
 
22
- from flow.harness.langgraph.compaction import create_compaction_hook
 
 
 
 
23
  from flow.harness.langgraph.harness import LangGraphHarness
24
  from flow.harness.langgraph.otel_callback import OTelCallbackHandler
25
  from flow.harness.langgraph.wrappers import build_langgraph_tools, wrap_for_langgraph
@@ -33,5 +37,7 @@ __all__ = [
33
  "OTelCallbackHandler",
34
  "build_langgraph_tools",
35
  "create_compaction_hook",
 
 
36
  "wrap_for_langgraph",
37
  ]
 
19
  print(event.type, event.content)
20
  """
21
 
22
+ from flow.harness.langgraph.compaction import (
23
+ create_compaction_hook,
24
+ create_head_tail_hook,
25
+ create_sliding_window_hook,
26
+ )
27
  from flow.harness.langgraph.harness import LangGraphHarness
28
  from flow.harness.langgraph.otel_callback import OTelCallbackHandler
29
  from flow.harness.langgraph.wrappers import build_langgraph_tools, wrap_for_langgraph
 
37
  "OTelCallbackHandler",
38
  "build_langgraph_tools",
39
  "create_compaction_hook",
40
+ "create_head_tail_hook",
41
+ "create_sliding_window_hook",
42
  "wrap_for_langgraph",
43
  ]
src/flow/harness/langgraph/compaction.py CHANGED
@@ -1,51 +1,219 @@
1
- """Message compaction for LangGraph.
 
2
 
3
- Provides a pre-model hook that implements head-tail message compaction,
4
- similar to MAF's HeadTailCompactingChatMessageStore.
5
  """
6
 
7
  from __future__ import annotations
8
 
9
  from typing import Any
10
 
11
- __all__ = ["create_compaction_hook"]
 
 
 
12
 
 
 
 
 
 
13
 
14
- def create_compaction_hook(head_size: int, tail_size: int):
15
- """Create a pre-model hook for message compaction.
16
 
17
- This hook compacts messages by keeping the first `head_size` messages
18
- and the last `tail_size` messages, dropping the middle.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  Args:
21
- head_size: Number of messages to keep from the start
22
- tail_size: Number of messages to keep from the end
 
23
 
24
  Returns:
25
  A function that can be used as a pre_model_hook in create_react_agent
26
 
27
  Example:
28
- hook = create_compaction_hook(10, 40)
29
  graph = create_react_agent(
30
  model=model,
31
  tools=tools,
32
  pre_model_hook=hook,
33
  )
34
  """
 
 
 
 
 
35
 
36
  def compact_messages(state: dict[str, Any]) -> dict[str, Any]:
37
- """Compact messages keeping head and tail, dropping middle."""
38
  messages = state.get("messages", [])
39
- total = len(messages)
40
 
41
- # No compaction needed if within limits
42
- if total <= head_size + tail_size:
43
  return {"llm_input_messages": messages}
44
 
45
- # Keep head and tail
46
- head = messages[:head_size]
47
- tail = messages[-tail_size:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- return {"llm_input_messages": head + tail}
50
 
51
  return compact_messages
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Token-aware message compaction for LangGraph.
3
 
4
+ Provides pre-model hooks that implement token-based message compaction,
5
+ ensuring safety against large messages that could exceed LLM context limits.
6
  """
7
 
8
  from __future__ import annotations
9
 
10
  from typing import Any
11
 
12
+ from flow.harness.compaction import (
13
+ HeadTailStrategy,
14
+ SlidingWindowStrategy,
15
+ )
16
 
17
+ __all__ = [
18
+ "create_compaction_hook",
19
+ "create_head_tail_hook",
20
+ "create_sliding_window_hook",
21
+ ]
22
 
23
+ # Default token budget (safe for GPT-4o, Claude 3.5, etc.)
24
+ DEFAULT_TOKEN_BUDGET = 200_000
25
 
26
+
27
+ def _langchain_msg_to_dict(msg: Any) -> dict[str, Any]:
28
+ """Convert LangChain message to dict format for compaction strategies."""
29
+ if isinstance(msg, dict):
30
+ return msg
31
+
32
+ # Handle LangChain message types
33
+ result: dict[str, Any] = {}
34
+
35
+ # Get role from type
36
+ msg_type = getattr(msg, "type", None)
37
+ if msg_type == "human":
38
+ result["role"] = "user"
39
+ elif msg_type == "ai":
40
+ result["role"] = "assistant"
41
+ elif msg_type == "system":
42
+ result["role"] = "system"
43
+ elif msg_type == "tool":
44
+ result["role"] = "tool"
45
+ result["tool_call_id"] = getattr(msg, "tool_call_id", None)
46
+ else:
47
+ result["role"] = msg_type or "user"
48
+
49
+ # Get content
50
+ content = getattr(msg, "content", "")
51
+ result["content"] = content
52
+
53
+ # Get tool calls (for AIMessage)
54
+ tool_calls = getattr(msg, "tool_calls", None)
55
+ if tool_calls:
56
+ result["tool_calls"] = [
57
+ {
58
+ "id": tc.get("id") if isinstance(tc, dict) else getattr(tc, "id", None),
59
+ "function": {
60
+ "name": tc.get("name") if isinstance(tc, dict) else getattr(tc, "name", ""),
61
+ "arguments": str(tc.get("args", {})) if isinstance(tc, dict) else str(getattr(tc, "args", {})),
62
+ },
63
+ }
64
+ for tc in tool_calls
65
+ ]
66
+
67
+ return result
68
+
69
+
70
+ def _dict_to_langchain_msg(msg_dict: dict[str, Any], original_msg: Any) -> Any:
71
+ """Preserve original LangChain message (we don't convert back)."""
72
+ # For compaction, we return the original message objects
73
+ # The strategy just tells us which indices to keep
74
+ return original_msg
75
+
76
+
77
def create_compaction_hook(
    head_ratio: float = 0.2,
    token_budget: int = DEFAULT_TOKEN_BUDGET,
    model: str = "gpt-4o",
):
    """Create a pre-model hook for token-aware head+tail compaction.

    This hook compacts messages by keeping head messages (system prompt,
    initial context) and tail messages (recent work), dropping the middle
    when token count exceeds the budget.

    Args:
        head_ratio: Fraction of budget for head messages (0.2 = 20%)
        token_budget: Max tokens before compaction triggers
        model: Model name for tokenizer selection

    Returns:
        A function that can be used as a pre_model_hook in create_react_agent

    Example:
        hook = create_compaction_hook(head_ratio=0.2, token_budget=200000)
        graph = create_react_agent(
            model=model,
            tools=tools,
            pre_model_hook=hook,
        )
    """
    strategy = HeadTailStrategy(
        head_ratio=head_ratio,
        token_budget=token_budget,
        model=model,
    )

    def compact_messages(state: dict[str, Any]) -> dict[str, Any]:
        """Compact messages using token-aware head+tail strategy."""
        messages = state.get("messages", [])

        if not messages:
            return {"llm_input_messages": messages}

        # Convert to dict format for strategy
        msg_dicts = [_langchain_msg_to_dict(m) for m in messages]

        # Apply compaction
        compacted_dicts = strategy.compact(msg_dicts)

        # Map kept dicts back to the original message objects.
        # Fast path: the strategy receives these exact dict objects, so an
        # id() lookup is O(n) and unambiguous — unlike pure content
        # matching, which was O(n*m) per model call and could mis-map two
        # messages with identical role/content.
        index_by_id = {id(d): i for i, d in enumerate(msg_dicts)}
        kept_indices: set[int] = set()
        unmatched: list[dict[str, Any]] = []
        for cd in compacted_dicts:
            idx = index_by_id.get(id(cd))
            if idx is not None and idx not in kept_indices:
                kept_indices.add(idx)
            else:
                unmatched.append(cd)

        # Fallback for dicts the strategy copied or synthesized (e.g. a
        # summary message): best-effort content matching, as before.
        for cd in unmatched:
            for i, md in enumerate(msg_dicts):
                if (
                    md.get("role") == cd.get("role")
                    and md.get("content") == cd.get("content")
                    and i not in kept_indices
                ):
                    kept_indices.add(i)
                    break

        compacted_messages = [messages[i] for i in sorted(kept_indices)]

        return {"llm_input_messages": compacted_messages}

    return compact_messages
146
+
147
+
148
def create_head_tail_hook(
    head_ratio: float = 0.2,
    token_budget: int = DEFAULT_TOKEN_BUDGET,
    model: str = "gpt-4o",
):
    """Alias for create_compaction_hook with head+tail strategy."""
    hook_config = {
        "head_ratio": head_ratio,
        "token_budget": token_budget,
        "model": model,
    }
    return create_compaction_hook(**hook_config)
159
+
160
+
161
def create_sliding_window_hook(
    token_budget: int = DEFAULT_TOKEN_BUDGET,
    model: str = "gpt-4o",
):
    """Create a pre-model hook for token-aware sliding window compaction.

    This hook keeps the system message plus the most recent messages
    that fit within the token budget.

    Args:
        token_budget: Max tokens for context window
        model: Model name for tokenizer selection

    Returns:
        A function that can be used as a pre_model_hook in create_react_agent

    Example:
        hook = create_sliding_window_hook(token_budget=100000)
        graph = create_react_agent(
            model=model,
            tools=tools,
            pre_model_hook=hook,
        )
    """
    strategy = SlidingWindowStrategy(
        token_budget=token_budget,
        model=model,
    )

    def compact_messages(state: dict[str, Any]) -> dict[str, Any]:
        """Compact messages using sliding window strategy."""
        messages = state.get("messages", [])

        if not messages:
            return {"llm_input_messages": messages}

        # Convert to dict format for strategy
        msg_dicts = [_langchain_msg_to_dict(m) for m in messages]

        # Apply compaction
        compacted_dicts = strategy.compact(msg_dicts)

        # Map kept dicts back to the original message objects.
        # Fast path: the strategy receives these exact dict objects, so an
        # id() lookup is O(n) and unambiguous — unlike pure content
        # matching, which was O(n*m) per model call and could mis-map two
        # messages with identical role/content.
        index_by_id = {id(d): i for i, d in enumerate(msg_dicts)}
        kept_indices: set[int] = set()
        unmatched: list[dict[str, Any]] = []
        for cd in compacted_dicts:
            idx = index_by_id.get(id(cd))
            if idx is not None and idx not in kept_indices:
                kept_indices.add(idx)
            else:
                unmatched.append(cd)

        # Fallback for dicts the strategy copied or synthesized:
        # best-effort content matching, as before.
        for cd in unmatched:
            for i, md in enumerate(msg_dicts):
                if (
                    md.get("role") == cd.get("role")
                    and md.get("content") == cd.get("content")
                    and i not in kept_indices
                ):
                    kept_indices.add(i)
                    break

        compacted_messages = [messages[i] for i in sorted(kept_indices)]

        return {"llm_input_messages": compacted_messages}

    return compact_messages
src/flow/harness/langgraph/harness.py CHANGED
@@ -10,7 +10,7 @@ import logging
10
  import uuid
11
  from collections.abc import AsyncIterator
12
  from pathlib import Path
13
- from typing import TYPE_CHECKING, Any
14
 
15
  from opentelemetry import trace
16
 
@@ -50,6 +50,12 @@ class LangGraphHarness(BaseHarness):
50
  print(event.type, event.content)
51
  """
52
 
 
 
 
 
 
 
53
  @classmethod
54
  def from_agent(cls, agent: Agent, workspace: Path) -> LangGraphHarness:
55
  """Create a LangGraph harness from an Agent spec.
@@ -61,11 +67,12 @@ class LangGraphHarness(BaseHarness):
61
  Returns:
62
  Configured LangGraphHarness instance
63
  """
 
 
 
64
  from flow.experiments.models import resolve_tools
65
  from flow.harness.langgraph.compaction import create_compaction_hook
66
  from flow.harness.langgraph.wrappers import build_langgraph_tools
67
- from langgraph.checkpoint.memory import InMemorySaver
68
- from langgraph.prebuilt import create_react_agent
69
 
70
  # Build tools (skip sub_agent - MAF-specific)
71
  tools_spec = resolve_tools(agent.tools)
@@ -234,7 +241,7 @@ class LangGraphHarness(BaseHarness):
234
  mode, data = chunk
235
 
236
  if mode == "messages":
237
- msg_chunk, metadata = data
238
 
239
  # Text content
240
  if hasattr(msg_chunk, "content") and msg_chunk.content:
 
10
  import uuid
11
  from collections.abc import AsyncIterator
12
  from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any, ClassVar
14
 
15
  from opentelemetry import trace
16
 
 
50
  print(event.type, event.content)
51
  """
52
 
53
+ # Framework metadata
54
+ framework_name: ClassVar[str] = "langgraph"
55
+ framework_label: ClassVar[str] = "LangGraph"
56
+ framework_description: ClassVar[str] = "Graph-based workflows with state management"
57
+ supported_compaction_strategies: ClassVar[list[str]] = ["head_tail", "sliding_window", "none"]
58
+
59
  @classmethod
60
  def from_agent(cls, agent: Agent, workspace: Path) -> LangGraphHarness:
61
  """Create a LangGraph harness from an Agent spec.
 
67
  Returns:
68
  Configured LangGraphHarness instance
69
  """
70
+ from langgraph.checkpoint.memory import InMemorySaver
71
+ from langgraph.prebuilt import create_react_agent
72
+
73
  from flow.experiments.models import resolve_tools
74
  from flow.harness.langgraph.compaction import create_compaction_hook
75
  from flow.harness.langgraph.wrappers import build_langgraph_tools
 
 
76
 
77
  # Build tools (skip sub_agent - MAF-specific)
78
  tools_spec = resolve_tools(agent.tools)
 
241
  mode, data = chunk
242
 
243
  if mode == "messages":
244
+ msg_chunk, _metadata = data
245
 
246
  # Text content
247
  if hasattr(msg_chunk, "content") and msg_chunk.content:
src/flow/harness/maf/__init__.py CHANGED
@@ -12,7 +12,7 @@ from flow.harness.registry import register
12
  register("maf", MAFHarness)
13
 
14
  __all__ = [
15
- "create_agent",
16
  "HeadTailCompactingChatMessageStore",
17
  "MAFHarness",
 
18
  ]
 
12
  register("maf", MAFHarness)
13
 
14
  __all__ = [
 
15
  "HeadTailCompactingChatMessageStore",
16
  "MAFHarness",
17
+ "create_agent",
18
  ]
src/flow/harness/maf/agent.py CHANGED
@@ -148,15 +148,19 @@ def create_agent(
148
  # Create message store factory if compaction is enabled
149
  message_store_factory = None
150
  if enable_compaction:
 
 
 
 
 
151
  def create_compacting_store() -> HeadTailCompactingChatMessageStore:
152
  return HeadTailCompactingChatMessageStore(
153
- head_size=compaction_head_size,
154
- tail_size=compaction_tail_size,
155
  )
156
 
157
  message_store_factory = create_compacting_store
158
  logger.debug(
159
- f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}"
160
  )
161
 
162
  # Determine if memory is enabled for instructions
 
148
  # Create message store factory if compaction is enabled
149
  message_store_factory = None
150
  if enable_compaction:
151
+ # Convert head/tail message counts to head_ratio for token-based compaction
152
+ # head_ratio = head_size / (head_size + tail_size)
153
+ total_size = compaction_head_size + compaction_tail_size
154
+ head_ratio = compaction_head_size / total_size if total_size > 0 else 0.2
155
+
156
  def create_compacting_store() -> HeadTailCompactingChatMessageStore:
157
  return HeadTailCompactingChatMessageStore(
158
+ head_ratio=head_ratio,
 
159
  )
160
 
161
  message_store_factory = create_compacting_store
162
  logger.debug(
163
+ f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}, head_ratio={head_ratio:.2f}"
164
  )
165
 
166
  # Determine if memory is enabled for instructions
src/flow/harness/maf/harness.py CHANGED
@@ -9,7 +9,7 @@ import logging
9
  import uuid
10
  from collections.abc import AsyncIterator
11
  from pathlib import Path
12
- from typing import TYPE_CHECKING, Any
13
 
14
  from flow.harness.base import BaseHarness, Event, EventType
15
 
@@ -67,13 +67,19 @@ class MAFHarness(BaseHarness):
67
  >>> harness = MAFHarness.from_agent(agent, workspace=Path("/tmp"))
68
  """
69
 
 
 
 
 
 
 
70
  @classmethod
71
  def from_agent(
72
  cls,
73
- agent: "Agent",
74
  workspace: Path,
75
- llm_config: "LLMClientConfig | None" = None,
76
- ) -> "MAFHarness":
77
  """Create a MAFHarness from an Agent definition.
78
 
79
  Args:
@@ -126,7 +132,7 @@ class MAFHarness(BaseHarness):
126
 
127
  def __init__(
128
  self,
129
- agent: "ChatAgent | None" = None,
130
  **create_agent_kwargs: Any,
131
  ) -> None:
132
  """Initialize the harness.
 
9
  import uuid
10
  from collections.abc import AsyncIterator
11
  from pathlib import Path
12
+ from typing import TYPE_CHECKING, Any, ClassVar
13
 
14
  from flow.harness.base import BaseHarness, Event, EventType
15
 
 
67
  >>> harness = MAFHarness.from_agent(agent, workspace=Path("/tmp"))
68
  """
69
 
70
+ # Framework metadata
71
+ framework_name: ClassVar[str] = "maf"
72
+ framework_label: ClassVar[str] = "Microsoft Agent Framework"
73
+ framework_description: ClassVar[str] = "Default agent implementation with ChatAgent"
74
+ supported_compaction_strategies: ClassVar[list[str]] = ["head_tail", "sliding_window", "none"]
75
+
76
  @classmethod
77
  def from_agent(
78
  cls,
79
+ agent: Agent,
80
  workspace: Path,
81
+ llm_config: LLMClientConfig | None = None,
82
+ ) -> MAFHarness:
83
  """Create a MAFHarness from an Agent definition.
84
 
85
  Args:
 
132
 
133
  def __init__(
134
  self,
135
+ agent: ChatAgent | None = None,
136
  **create_agent_kwargs: Any,
137
  ) -> None:
138
  """Initialize the harness.
src/flow/harness/maf/message_store.py CHANGED
@@ -1,21 +1,82 @@
1
- """Message store implementations for Microsoft Agent Framework.
 
2
 
3
- Provides ChatMessageStoreProtocol implementations for context management.
 
4
  """
5
 
6
  from collections.abc import MutableMapping, Sequence
7
  from typing import TYPE_CHECKING, Any
8
 
 
 
 
 
 
9
  if TYPE_CHECKING:
10
  from agent_framework import ChatMessage
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- class HeadTailCompactingChatMessageStore:
14
- """A compacting message store that works directly with Agent Framework ChatMessage.
 
15
 
16
- This store implements ChatMessageStoreProtocol and keeps the first N messages
17
- (head) and last M messages (tail), dropping middle messages to prevent
18
- context overflow in long conversations.
 
 
19
 
20
  IMPORTANT: This store preserves full ChatMessage objects including:
21
  - FunctionCallContent (tool calls)
@@ -24,44 +85,44 @@ class HeadTailCompactingChatMessageStore:
24
 
25
  This is critical because OpenAI's API requires tool results to immediately
26
  follow their corresponding tool calls.
27
-
28
- The compaction strategy:
29
- - Keeps the first N messages (task context, initial instructions)
30
- - Keeps the last M messages (recent work, current state)
31
- - Drops middle messages to prevent context overflow
32
  """
33
 
34
  def __init__(
35
  self,
36
  messages: Sequence["ChatMessage"] | None = None,
37
- head_size: int = 10,
38
- tail_size: int = 40,
 
39
  ) -> None:
40
- """Initialize the compacting store.
41
 
42
  Args:
43
  messages: Initial messages to store
44
- head_size: Number of initial messages to keep
45
- tail_size: Number of recent messages to keep
 
46
  """
47
- if head_size < 0:
48
- raise ValueError("head_size must be non-negative")
49
- if tail_size < 0:
50
- raise ValueError("tail_size must be non-negative")
51
-
52
- self._messages: list["ChatMessage"] = list(messages) if messages else []
53
- self._head_size = head_size
54
- self._tail_size = tail_size
 
 
 
55
 
56
  @property
57
- def head_size(self) -> int:
58
- """Number of messages kept from the beginning."""
59
- return self._head_size
60
 
61
  @property
62
- def tail_size(self) -> int:
63
- """Number of messages kept from the end."""
64
- return self._tail_size
65
 
66
  @property
67
  def total_messages(self) -> int:
@@ -69,16 +130,126 @@ class HeadTailCompactingChatMessageStore:
69
  return len(self._messages)
70
 
71
  @property
72
- def compacted_count(self) -> int:
73
- """Number of messages that would be returned by list_messages()."""
74
- total = len(self._messages)
75
- max_kept = self._head_size + self._tail_size
76
- return min(total, max_kept)
77
 
78
  @property
79
- def dropped_count(self) -> int:
80
- """Number of messages dropped during compaction."""
81
- return max(0, self.total_messages - self.compacted_count)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  async def add_messages(self, messages: Sequence["ChatMessage"]) -> None:
84
  """Add messages to the store.
@@ -91,38 +262,32 @@ class HeadTailCompactingChatMessageStore:
91
  self._messages.extend(messages)
92
 
93
  async def list_messages(self) -> list["ChatMessage"]:
94
- """Get messages with head+tail compaction applied.
95
 
96
- Returns the first head_size messages plus the last tail_size messages.
97
- If total messages <= head_size + tail_size, returns all messages.
98
 
99
  Returns:
100
  List of ChatMessage objects after compaction
101
  """
102
- total = len(self._messages)
103
- max_kept = self._head_size + self._tail_size
104
-
105
- # No compaction needed
106
- if total <= max_kept:
107
- return list(self._messages)
108
-
109
- # Return head + tail
110
- head = self._messages[: self._head_size]
111
- tail = self._messages[-self._tail_size :] if self._tail_size > 0 else []
112
-
113
- return head + tail
114
 
115
  @classmethod
116
  async def deserialize(
117
  cls,
118
  serialized_store_state: MutableMapping[str, Any],
119
  **kwargs: Any,
120
- ) -> "HeadTailCompactingChatMessageStore":
121
  """Create store from serialized state."""
122
  from agent_framework import ChatMessage
123
 
124
- head_size = kwargs.get("head_size", serialized_store_state.get("head_size", 10))
125
- tail_size = kwargs.get("tail_size", serialized_store_state.get("tail_size", 40))
 
 
 
 
 
126
 
127
  messages_data = serialized_store_state.get("messages", [])
128
  messages = [
@@ -130,7 +295,12 @@ class HeadTailCompactingChatMessageStore:
130
  for m in messages_data
131
  ]
132
 
133
- return cls(messages=messages, head_size=head_size, tail_size=tail_size)
 
 
 
 
 
134
 
135
  async def update_from_state(
136
  self,
@@ -149,10 +319,12 @@ class HeadTailCompactingChatMessageStore:
149
  for m in messages_data
150
  ]
151
 
152
- if "head_size" in serialized_store_state:
153
- self._head_size = serialized_store_state["head_size"]
154
- if "tail_size" in serialized_store_state:
155
- self._tail_size = serialized_store_state["tail_size"]
 
 
156
 
157
  async def serialize(self, **kwargs: Any) -> dict[str, Any]:
158
  """Serialize the store state.
@@ -161,17 +333,23 @@ class HeadTailCompactingChatMessageStore:
161
  """
162
  return {
163
  "messages": [m.to_dict() for m in self._messages],
164
- "head_size": self._head_size,
165
- "tail_size": self._tail_size,
 
166
  }
167
 
168
  @property
169
- def stats(self) -> dict[str, int]:
170
  """Get compaction statistics."""
171
  return {
172
  "total_messages": self.total_messages,
173
- "compacted_count": self.compacted_count,
174
- "dropped_count": self.dropped_count,
175
- "head_size": self._head_size,
176
- "tail_size": self._tail_size,
 
177
  }
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Token-aware message store for Microsoft Agent Framework.
3
 
4
+ Provides ChatMessageStoreProtocol implementations with token-based compaction
5
+ to ensure safety against large messages that could exceed LLM context limits.
6
  """
7
 
8
  from collections.abc import MutableMapping, Sequence
9
  from typing import TYPE_CHECKING, Any
10
 
11
+ from flow.harness.compaction import HeadTailStrategy
12
+ from flow.harness.compaction.tokenizer import get_encoder
13
+
14
+ _ = (HeadTailStrategy, get_encoder) # Used for external access via this module
15
+
16
  if TYPE_CHECKING:
17
  from agent_framework import ChatMessage
18
 
19
+ # Default token budget (safe for GPT-4o, Claude 3.5, etc.)
20
+ DEFAULT_TOKEN_BUDGET = 200_000
21
+
22
+
23
+ def _chat_message_to_dict(msg: "ChatMessage") -> dict[str, Any]:
24
+ """Convert ChatMessage to dict format for token counting."""
25
+ return msg.to_dict()
26
+
27
+
28
def _count_chat_message_tokens(msg: "ChatMessage", model: str = "gpt-4o") -> int:
    """Count tokens in a ChatMessage.

    Approximates the prompt cost of one message: role text + content
    (plain string or structured block list) + tool-call names/arguments,
    plus ~4 tokens of per-message overhead.

    Args:
        msg: The ChatMessage to measure.
        model: Model name used to select the tokenizer.

    Returns:
        Approximate token count.
    """
    msg_dict = _chat_message_to_dict(msg)
    encoder = get_encoder(model)

    tokens = 0
    # Count role
    if "role" in msg_dict:
        tokens += len(encoder.encode(str(msg_dict["role"])))

    # Count content: plain string, or a list of content blocks.
    content = msg_dict.get("content")
    if isinstance(content, str):
        tokens += len(encoder.encode(content))
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict):
                if "text" in item:
                    tokens += len(encoder.encode(str(item["text"])))
                elif "content" in item:
                    tokens += len(encoder.encode(str(item["content"])))
            elif isinstance(item, str):
                tokens += len(encoder.encode(item))

    # Count tool calls. Fix: `or []` also guards against an explicit
    # None value, which `.get("tool_calls", [])` did not (TypeError).
    for tc in msg_dict.get("tool_calls") or []:
        if isinstance(tc, dict):
            func = tc.get("function")
            if isinstance(func, dict):
                tokens += len(encoder.encode(func.get("name", "")))
                tokens += len(encoder.encode(func.get("arguments", "")))

    # Base overhead per message
    tokens += 4

    return tokens
66
+
67
+
68
+ class TokenAwareChatMessageStore:
69
+ """A token-aware message store for Agent Framework ChatMessage.
70
 
71
+ This store implements ChatMessageStoreProtocol and uses token counting
72
+ to trigger compaction, ensuring safety against large messages that could
73
+ exceed LLM context limits.
74
 
75
+ The compaction strategy:
76
+ - Keeps head messages (system prompt, initial context) based on head_ratio
77
+ - Keeps tail messages (recent work, current state)
78
+ - Drops middle messages when token count exceeds budget
79
+ - Respects atomic groups (tool calls + results must stay together)
80
 
81
  IMPORTANT: This store preserves full ChatMessage objects including:
82
  - FunctionCallContent (tool calls)
 
85
 
86
  This is critical because OpenAI's API requires tool results to immediately
87
  follow their corresponding tool calls.
 
 
 
 
 
88
  """
89
 
90
def __init__(
    self,
    messages: Sequence["ChatMessage"] | None = None,
    head_ratio: float = 0.2,
    token_budget: int = DEFAULT_TOKEN_BUDGET,
    model: str = "gpt-4o",
) -> None:
    """Initialize the token-aware store.

    Args:
        messages: Initial messages to store
        head_ratio: Fraction of budget for head messages (0.2 = 20%)
        token_budget: Max tokens before compaction triggers
        model: Model name for tokenizer selection

    Raises:
        ValueError: If head_ratio is outside [0, 1] or token_budget < 1000.
    """
    # Validate configuration up front (guard clauses).
    if head_ratio < 0 or head_ratio > 1:
        raise ValueError("head_ratio must be between 0 and 1")
    if token_budget < 1000:
        raise ValueError("token_budget must be at least 1000")

    self._messages: list["ChatMessage"] = list(messages) if messages else []
    self._head_ratio = head_ratio
    self._token_budget = token_budget
    self._model = model

    # Compaction telemetry counters.
    self._compaction_count = 0
    self._total_tokens_saved = 0
116
 
117
  @property
118
+ def head_ratio(self) -> float:
119
+ """Fraction of budget for head messages."""
120
+ return self._head_ratio
121
 
122
  @property
123
+ def token_budget(self) -> int:
124
+ """Max tokens before compaction triggers."""
125
+ return self._token_budget
126
 
127
  @property
128
  def total_messages(self) -> int:
 
130
  return len(self._messages)
131
 
132
  @property
133
+ def compaction_count(self) -> int:
134
+ """Number of times compaction has been triggered."""
135
+ return self._compaction_count
 
 
136
 
137
  @property
138
+ def total_tokens_saved(self) -> int:
139
+ """Total tokens saved through compaction."""
140
+ return self._total_tokens_saved
141
+
142
+ def _count_tokens(self) -> int:
143
+ """Count total tokens in all messages."""
144
+ return sum(_count_chat_message_tokens(m, self._model) for m in self._messages)
145
+
146
+ def _find_atomic_groups(
147
+ self, messages: list["ChatMessage"]
148
+ ) -> list[tuple[int, ...]]:
149
+ """Group tool_call messages with their results.
150
+
151
+ OpenAI requires every tool_call to have a corresponding result.
152
+ This ensures we never split a tool call from its results.
153
+ """
154
+ groups: list[tuple[int, ...]] = []
155
+ i = 0
156
+
157
+ while i < len(messages):
158
+ msg = messages[i]
159
+ msg_dict = _chat_message_to_dict(msg)
160
+
161
+ if msg_dict.get("tool_calls"):
162
+ # This message has tool calls - find all results
163
+ call_ids = {
164
+ tc.get("id") for tc in msg_dict["tool_calls"] if tc.get("id")
165
+ }
166
+ group_indices = [i]
167
+
168
+ # Look ahead for results
169
+ j = i + 1
170
+ while j < len(messages) and call_ids:
171
+ next_dict = _chat_message_to_dict(messages[j])
172
+ if next_dict.get("role") == "tool":
173
+ tool_call_id = next_dict.get("tool_call_id")
174
+ if tool_call_id in call_ids:
175
+ group_indices.append(j)
176
+ call_ids.remove(tool_call_id)
177
+ j += 1
178
+
179
+ groups.append(tuple(group_indices))
180
+ i = max(group_indices) + 1 if group_indices else i + 1
181
+ else:
182
+ groups.append((i,))
183
+ i += 1
184
+
185
+ return groups
186
+
187
+ def _compact_messages(
188
+ self, messages: list["ChatMessage"]
189
+ ) -> list["ChatMessage"]:
190
+ """Apply head+tail compaction to messages."""
191
+ if not messages:
192
+ return messages
193
+
194
+ current_tokens = sum(
195
+ _count_chat_message_tokens(m, self._model) for m in messages
196
+ )
197
+
198
+ if current_tokens <= self._token_budget:
199
+ return messages
200
+
201
+ # COMPACTION NEEDED
202
+ self._compaction_count += 1
203
+
204
+ groups = self._find_atomic_groups(messages)
205
+ head_budget = int(self._token_budget * self._head_ratio)
206
+ tail_budget = self._token_budget - head_budget
207
+
208
+ # Fill head from start
209
+ head_groups: list[tuple[int, ...]] = []
210
+ head_tokens = 0
211
+
212
+ for group in groups:
213
+ group_tokens = sum(
214
+ _count_chat_message_tokens(messages[i], self._model) for i in group
215
+ )
216
+
217
+ if head_tokens + group_tokens <= head_budget:
218
+ head_groups.append(group)
219
+ head_tokens += group_tokens
220
+ else:
221
+ break
222
+
223
+ # Fill tail from end (skip head groups)
224
+ remaining_groups = groups[len(head_groups):]
225
+ tail_groups: list[tuple[int, ...]] = []
226
+ tail_tokens = 0
227
+
228
+ for group in reversed(remaining_groups):
229
+ group_tokens = sum(
230
+ _count_chat_message_tokens(messages[i], self._model) for i in group
231
+ )
232
+
233
+ if tail_tokens + group_tokens <= tail_budget:
234
+ tail_groups.insert(0, group)
235
+ tail_tokens += group_tokens
236
+ else:
237
+ break
238
+
239
+ # Build compacted list
240
+ kept_indices: set[int] = set()
241
+ for group in head_groups + tail_groups:
242
+ kept_indices.update(group)
243
+
244
+ compacted = [messages[i] for i in sorted(kept_indices)]
245
+
246
+ # Track savings
247
+ compacted_tokens = sum(
248
+ _count_chat_message_tokens(m, self._model) for m in compacted
249
+ )
250
+ self._total_tokens_saved += current_tokens - compacted_tokens
251
+
252
+ return compacted
253
 
254
  async def add_messages(self, messages: Sequence["ChatMessage"]) -> None:
255
  """Add messages to the store.
 
262
  self._messages.extend(messages)
263
 
264
  async def list_messages(self) -> list["ChatMessage"]:
265
+ """Get messages with token-aware compaction applied.
266
 
267
+ Applies head+tail compaction if total tokens exceed budget.
268
+ Respects atomic groups (tool calls stay with their results).
269
 
270
  Returns:
271
  List of ChatMessage objects after compaction
272
  """
273
+ return self._compact_messages(self._messages)
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  @classmethod
276
  async def deserialize(
277
  cls,
278
  serialized_store_state: MutableMapping[str, Any],
279
  **kwargs: Any,
280
+ ) -> "TokenAwareChatMessageStore":
281
  """Create store from serialized state."""
282
  from agent_framework import ChatMessage
283
 
284
+ head_ratio = kwargs.get(
285
+ "head_ratio", serialized_store_state.get("head_ratio", 0.2)
286
+ )
287
+ token_budget = kwargs.get(
288
+ "token_budget", serialized_store_state.get("token_budget", DEFAULT_TOKEN_BUDGET)
289
+ )
290
+ model = kwargs.get("model", serialized_store_state.get("model", "gpt-4o"))
291
 
292
  messages_data = serialized_store_state.get("messages", [])
293
  messages = [
 
295
  for m in messages_data
296
  ]
297
 
298
+ return cls(
299
+ messages=messages,
300
+ head_ratio=head_ratio,
301
+ token_budget=token_budget,
302
+ model=model,
303
+ )
304
 
305
  async def update_from_state(
306
  self,
 
319
  for m in messages_data
320
  ]
321
 
322
+ if "head_ratio" in serialized_store_state:
323
+ self._head_ratio = serialized_store_state["head_ratio"]
324
+ if "token_budget" in serialized_store_state:
325
+ self._token_budget = serialized_store_state["token_budget"]
326
+ if "model" in serialized_store_state:
327
+ self._model = serialized_store_state["model"]
328
 
329
  async def serialize(self, **kwargs: Any) -> dict[str, Any]:
330
  """Serialize the store state.
 
333
  """
334
  return {
335
  "messages": [m.to_dict() for m in self._messages],
336
+ "head_ratio": self._head_ratio,
337
+ "token_budget": self._token_budget,
338
+ "model": self._model,
339
  }
340
 
341
  @property
342
+ def stats(self) -> dict[str, Any]:
343
  """Get compaction statistics."""
344
  return {
345
  "total_messages": self.total_messages,
346
+ "current_tokens": self._count_tokens(),
347
+ "token_budget": self._token_budget,
348
+ "head_ratio": self._head_ratio,
349
+ "compaction_count": self._compaction_count,
350
+ "total_tokens_saved": self._total_tokens_saved,
351
  }
352
+
353
+
354
+ # Backwards compatibility alias
355
+ HeadTailCompactingChatMessageStore = TokenAwareChatMessageStore
src/flow/harness/maf/tools/__init__.py CHANGED
@@ -6,7 +6,7 @@ the to_maf_tool adapter.
6
 
7
  Available tools:
8
  - read_file, write_file, edit_file, multi_edit, glob_files, grep, ls
9
- - bash, check_processes, python_repl
10
  - think, todo_write, todo_read
11
  - memory, skills, task
12
  - web_search, web_fetch
@@ -19,28 +19,49 @@ from pathlib import Path
19
  from typing import Any
20
 
21
  from flow.tools import (
22
- # Coding
23
- read_file, write_file, edit_file, multi_edit, glob_files, grep, ls,
 
24
  # Execution
25
- bash, check_processes, python_repl,
26
- # Planning
27
- think, todo_write, todo_read,
 
 
 
 
 
 
 
 
28
  # Memory
29
- memory, create_memory_tool,
30
- # Web
31
- web_search, web_fetch,
32
  # Notebooks
33
- notebook_edit, notebook_read,
 
 
 
 
 
34
  # Skills
35
- skills, create_skills_tool,
36
  # Sub-agent
37
- task, create_task_tool,
38
- # Workspace management
39
- set_workspace, Workspace,
 
 
40
  # Adapters
41
  to_maf_tool,
42
- # Base
43
- Tool,
 
 
 
 
 
 
44
  )
45
 
46
  __all__ = [
@@ -93,7 +114,6 @@ def build_tools(
93
  # Execution
94
  "bash": bash,
95
  "check_processes": check_processes,
96
- "python_repl": python_repl,
97
  # Planning
98
  "think": think,
99
  "todo_write": todo_write,
@@ -110,6 +130,11 @@ def build_tools(
110
  "skills": skills,
111
  # Task/sub-agent (default instance)
112
  "task": task,
 
 
 
 
 
113
  }
114
 
115
  tools: list[Callable[..., Coroutine[Any, Any, str]]] = []
@@ -128,10 +153,20 @@ def build_tools(
128
  tools.append(to_maf_tool(custom_task))
129
  elif name == "skills" and config.get("additional_paths"):
130
  # Skills with custom paths
131
- custom_skills = create_skills_tool(
132
- project_path=Path(config["additional_paths"][0])
133
- )
134
  tools.append(to_maf_tool(custom_skills))
 
 
 
 
 
 
 
 
 
 
 
 
135
  else:
136
  logger.warning(f"Unknown tool name: {name}. Skipping.")
137
 
 
6
 
7
  Available tools:
8
  - read_file, write_file, edit_file, multi_edit, glob_files, grep, ls
9
+ - bash, check_processes
10
  - think, todo_write, todo_read
11
  - memory, skills, task
12
  - web_search, web_fetch
 
19
  from typing import Any
20
 
21
  from flow.tools import (
22
+ # Base
23
+ Tool,
24
+ Workspace,
25
  # Execution
26
+ bash,
27
+ check_processes,
28
+ create_skills_tool,
29
+ # Browsing
30
+ create_smol_web_search_tool,
31
+ create_task_tool,
32
+ create_visit_webpage_tool,
33
+ edit_file,
34
+ glob_files,
35
+ grep,
36
+ ls,
37
  # Memory
38
+ memory,
39
+ multi_edit,
 
40
  # Notebooks
41
+ notebook_edit,
42
+ notebook_read,
43
+ # Coding
44
+ read_file,
45
+ # Workspace management
46
+ set_workspace,
47
  # Skills
48
+ skills,
49
  # Sub-agent
50
+ task,
51
+ # File inspection
52
+ text_inspector,
53
+ # Planning
54
+ think,
55
  # Adapters
56
  to_maf_tool,
57
+ todo_read,
58
+ todo_write,
59
+ visual_inspector,
60
+ web_fetch,
61
+ # Web
62
+ web_search,
63
+ wikipedia_search,
64
+ write_file,
65
  )
66
 
67
  __all__ = [
 
114
  # Execution
115
  "bash": bash,
116
  "check_processes": check_processes,
 
117
  # Planning
118
  "think": think,
119
  "todo_write": todo_write,
 
130
  "skills": skills,
131
  # Task/sub-agent (default instance)
132
  "task": task,
133
+ # Wikipedia search
134
+ "wikipedia_search": wikipedia_search,
135
+ # File inspection tools
136
+ "text_inspector": text_inspector,
137
+ "visual_inspector": visual_inspector,
138
  }
139
 
140
  tools: list[Callable[..., Coroutine[Any, Any, str]]] = []
 
153
  tools.append(to_maf_tool(custom_task))
154
  elif name == "skills" and config.get("additional_paths"):
155
  # Skills with custom paths
156
+ custom_skills = create_skills_tool(project_path=Path(config["additional_paths"][0]))
 
 
157
  tools.append(to_maf_tool(custom_skills))
158
+ # Web search tool
159
+ elif name == "smol_web_search":
160
+ wst_max_results = config.get("wst_max_results", 10)
161
+ wst_engine = config.get("wst_engine", "duckduckgo")
162
+ custom_smol_web_search = create_smol_web_search_tool(max_results=wst_max_results, engine=wst_engine)
163
+ tools.append(to_maf_tool(custom_smol_web_search))
164
+
165
+ elif name == "visit_webpage":
166
+ vwp_max_output_length = config.get("vwp_max_output_length", 40000)
167
+ custom_visit_webpage = create_visit_webpage_tool(max_output_length=vwp_max_output_length)
168
+ tools.append(to_maf_tool(custom_visit_webpage))
169
+
170
  else:
171
  logger.warning(f"Unknown tool name: {name}. Skipping.")
172
 
src/flow/harness/maf/wrappers.py CHANGED
@@ -14,8 +14,8 @@ from collections.abc import Callable, Coroutine
14
  from pathlib import Path
15
  from typing import Any
16
 
17
- from flow.tools import Tool, to_maf_tool
18
  from flow.harness.maf.tools import build_tools as build_maf_tools_impl
 
19
 
20
  logger = logging.getLogger(__name__)
21
 
 
14
  from pathlib import Path
15
  from typing import Any
16
 
 
17
  from flow.harness.maf.tools import build_tools as build_maf_tools_impl
18
+ from flow.tools import Tool, to_maf_tool
19
 
20
  logger = logging.getLogger(__name__)
21
 
src/flow/harness/miniagent/__init__.py CHANGED
@@ -51,38 +51,38 @@ MiniAgent's tool loop:
51
  messages.extend(results) # Next iteration will compact again
52
  """
53
 
54
- from .agent import ChatAgent, AgentThread, AgentResponse, UsageStats, StreamEvent, StreamEventType
55
- from .tool import Tool, tool
56
- from .messages import ChatMessage, ToolCall, ToolResult
 
 
 
57
  from .context import (
58
  ContextStrategy,
59
- NoCompactionStrategy,
60
  HeadTailStrategy,
 
61
  SlidingWindowStrategy,
62
  SummarizationStrategy,
63
  )
64
- from .client import ChatClient, ClientConfig, ChatCompletionResult
65
  from .hooks import (
66
- Hooks,
 
67
  HookEvent,
68
- PreToolUseEvent,
69
- PreToolUseResult,
 
70
  PostToolUseEvent,
71
  PostToolUseResult,
72
- PreModelCallEvent,
73
- PostModelCallEvent,
74
  PreCompactEvent,
75
- PostCompactEvent,
76
- AgentStartEvent,
77
- AgentEndEvent,
78
  )
79
- from .instructions import get_instructions, INSTRUCTIONS
 
 
80
  from .workspace import Workspace, get_workspace, set_workspace
81
- from . import tools
82
-
83
- # Register with Flow's harness system
84
- from flow.harness.registry import register
85
- from .harness import MiniAgentHarness
86
 
87
  register("miniagent", MiniAgentHarness)
88
 
 
51
  messages.extend(results) # Next iteration will compact again
52
  """
53
 
54
+ # Register with Flow's harness system
55
+ from flow.harness.registry import register
56
+
57
+ from . import tools
58
+ from .agent import AgentResponse, AgentThread, ChatAgent, StreamEvent, StreamEventType, UsageStats
59
+ from .client import ChatClient, ChatCompletionResult, ClientConfig
60
  from .context import (
61
  ContextStrategy,
 
62
  HeadTailStrategy,
63
+ NoCompactionStrategy,
64
  SlidingWindowStrategy,
65
  SummarizationStrategy,
66
  )
67
+ from .harness import MiniAgentHarness
68
  from .hooks import (
69
+ AgentEndEvent,
70
+ AgentStartEvent,
71
  HookEvent,
72
+ Hooks,
73
+ PostCompactEvent,
74
+ PostModelCallEvent,
75
  PostToolUseEvent,
76
  PostToolUseResult,
 
 
77
  PreCompactEvent,
78
+ PreModelCallEvent,
79
+ PreToolUseEvent,
80
+ PreToolUseResult,
81
  )
82
+ from .instructions import INSTRUCTIONS, get_instructions
83
+ from .messages import ChatMessage, ToolCall, ToolResult
84
+ from .tool import Tool, tool
85
  from .workspace import Workspace, get_workspace, set_workspace
 
 
 
 
 
86
 
87
  register("miniagent", MiniAgentHarness)
88
 
src/flow/harness/miniagent/agent.py CHANGED
@@ -5,28 +5,29 @@ The key difference: context strategy is called BEFORE each LLM call in the
5
  tool loop, and the compacted list continues to the next iteration.
6
  """
7
 
 
 
8
  from dataclasses import dataclass, field
9
- from typing import Any, AsyncGenerator
10
  from enum import Enum
11
- import json
12
 
13
- from .messages import ChatMessage, ToolCall
14
- from .tool import Tool
15
  from .client import ChatClient, ChatCompletionResult
16
  from .context import ContextStrategy, NoCompactionStrategy
17
  from .hooks import (
 
 
18
  Hooks,
19
- PreToolUseEvent,
20
- PreToolUseResult,
21
  PostToolUseEvent,
22
  PostToolUseResult,
23
- PreModelCallEvent,
24
- PostModelCallEvent,
25
  PreCompactEvent,
26
- PostCompactEvent,
27
- AgentStartEvent,
28
- AgentEndEvent,
29
  )
 
 
30
 
31
 
32
  class StreamEventType(str, Enum):
@@ -452,7 +453,7 @@ class ChatAgent:
452
  try:
453
  return await tool.invoke(**arguments)
454
  except Exception as e:
455
- return f"Error executing {name}: {str(e)}"
456
 
457
  # === Hook emission methods ===
458
 
 
5
  tool loop, and the compacted list continues to the next iteration.
6
  """
7
 
8
+ import json
9
+ from collections.abc import AsyncGenerator
10
  from dataclasses import dataclass, field
 
11
  from enum import Enum
12
+ from typing import Any
13
 
 
 
14
  from .client import ChatClient, ChatCompletionResult
15
  from .context import ContextStrategy, NoCompactionStrategy
16
  from .hooks import (
17
+ AgentEndEvent,
18
+ AgentStartEvent,
19
  Hooks,
20
+ PostCompactEvent,
21
+ PostModelCallEvent,
22
  PostToolUseEvent,
23
  PostToolUseResult,
 
 
24
  PreCompactEvent,
25
+ PreModelCallEvent,
26
+ PreToolUseEvent,
27
+ PreToolUseResult,
28
  )
29
+ from .messages import ChatMessage, ToolCall
30
+ from .tool import Tool
31
 
32
 
33
  class StreamEventType(str, Enum):
 
453
  try:
454
  return await tool.invoke(**arguments)
455
  except Exception as e:
456
+ return f"Error executing {name}: {e!s}"
457
 
458
  # === Hook emission methods ===
459
 
src/flow/harness/miniagent/client.py CHANGED
@@ -4,9 +4,9 @@ Provides a unified interface for both OpenAI and Azure OpenAI APIs.
4
  Auto-detects configuration from environment variables.
5
  """
6
 
 
7
  from dataclasses import dataclass
8
  from typing import Any
9
- import os
10
 
11
  # Load .env file if present (override=True to prefer .env over shell env)
12
  try:
@@ -88,7 +88,7 @@ class ChatClient:
88
  def _create_client(self):
89
  """Create the appropriate async client."""
90
  try:
91
- from openai import AsyncOpenAI, AsyncAzureOpenAI
92
  except ImportError:
93
  raise ImportError(
94
  "openai package is required. Install with: pip install openai"
 
4
  Auto-detects configuration from environment variables.
5
  """
6
 
7
+ import os
8
  from dataclasses import dataclass
9
  from typing import Any
 
10
 
11
  # Load .env file if present (override=True to prefer .env over shell env)
12
  try:
 
88
  def _create_client(self):
89
  """Create the appropriate async client."""
90
  try:
91
+ from openai import AsyncAzureOpenAI, AsyncOpenAI
92
  except ImportError:
93
  raise ImportError(
94
  "openai package is required. Install with: pip install openai"
src/flow/harness/miniagent/context.py CHANGED
@@ -5,8 +5,10 @@ Strategies are called BEFORE each LLM call, and the returned (potentially
5
  compacted) list continues to the next iteration.
6
  """
7
 
 
8
  from dataclasses import dataclass, field
9
- from typing import Protocol, Any
 
10
  import tiktoken
11
 
12
  from .messages import ChatMessage
@@ -471,13 +473,12 @@ SUMMARY:"""
471
  if tc.name in ("read_file", "Read"):
472
  # Try to extract path from arguments
473
  try:
474
- import json
475
  args = json.loads(tc.arguments)
476
  path = args.get("path") or args.get("file_path") or args.get("filename")
477
  if path:
478
  files.append(path)
479
- except:
480
- pass
481
  return list(dict.fromkeys(files)) # Remove duplicates, preserve order
482
 
483
  def _extract_key_info(self, messages: list[ChatMessage]) -> str:
 
5
  compacted) list continues to the next iteration.
6
  """
7
 
8
+ import json
9
  from dataclasses import dataclass, field
10
+ from typing import Any, Protocol
11
+
12
  import tiktoken
13
 
14
  from .messages import ChatMessage
 
473
  if tc.name in ("read_file", "Read"):
474
  # Try to extract path from arguments
475
  try:
 
476
  args = json.loads(tc.arguments)
477
  path = args.get("path") or args.get("file_path") or args.get("filename")
478
  if path:
479
  files.append(path)
480
+ except (json.JSONDecodeError, KeyError, TypeError):
481
+ pass # Skip malformed tool calls
482
  return list(dict.fromkeys(files)) # Remove duplicates, preserve order
483
 
484
  def _extract_key_info(self, messages: list[ChatMessage]) -> str:
src/flow/harness/miniagent/harness.py CHANGED
@@ -10,7 +10,7 @@ import logging
10
  import uuid
11
  from collections.abc import AsyncIterator
12
  from pathlib import Path
13
- from typing import TYPE_CHECKING, Any
14
 
15
  from flow.harness.base import BaseHarness, Event, EventType
16
 
@@ -18,19 +18,21 @@ if TYPE_CHECKING:
18
  from flow.experiments.models import Agent
19
  from flow.llm import LLMClientConfig
20
 
21
- from .agent import ChatAgent, AgentThread, StreamEvent, StreamEventType
 
 
 
 
 
22
  from .context import (
23
  ContextStrategy,
24
- NoCompactionStrategy,
25
  HeadTailStrategy,
 
26
  SlidingWindowStrategy,
27
  SummarizationStrategy,
28
  )
29
- from .client import ChatClient
30
- from .otel import enable_instrumentation
31
  from .instructions import get_instructions
32
-
33
- from flow.tools import Tool
34
 
35
  logger = logging.getLogger(__name__)
36
 
@@ -61,13 +63,21 @@ class MiniAgentHarness(BaseHarness):
61
  ... print(event)
62
  """
63
 
 
 
 
 
 
 
 
 
64
  @classmethod
65
  def from_agent(
66
  cls,
67
- agent: "Agent",
68
  workspace: Path,
69
- llm_config: "LLMClientConfig | None" = None,
70
- ) -> "MiniAgentHarness":
71
  """Create a MiniAgentHarness from an Agent definition.
72
 
73
  Args:
@@ -105,13 +115,13 @@ class MiniAgentHarness(BaseHarness):
105
  from .otel import create_otel_hooks
106
  otel_hooks = create_otel_hooks(model=config.model)
107
 
108
- # Resolve instructions: explicit > preset > default "coding"
109
  if agent.instructions:
110
  instructions = agent.instructions
111
  elif agent.instructions_preset:
112
  instructions = get_instructions(agent.instructions_preset)
113
  else:
114
- instructions = get_instructions("coding")
115
 
116
  chat_agent = ChatAgent(
117
  client=chat_client,
@@ -126,8 +136,8 @@ class MiniAgentHarness(BaseHarness):
126
 
127
  @classmethod
128
  def _create_client_config_from_llm_config(
129
- cls, llm_config: "LLMClientConfig"
130
- ) -> "ClientConfig":
131
  """Create MiniAgent ClientConfig from Flow LLMClientConfig.
132
 
133
  Args:
@@ -137,6 +147,7 @@ class MiniAgentHarness(BaseHarness):
137
  MiniAgent ClientConfig
138
  """
139
  from flow.llm import LLMProvider
 
140
  from .client import ClientConfig
141
 
142
  match llm_config.provider:
@@ -177,7 +188,7 @@ class MiniAgentHarness(BaseHarness):
177
  @classmethod
178
  def _create_client_config_from_dict(
179
  cls, llm_config: dict[str, Any]
180
- ) -> "ClientConfig":
181
  """Create ClientConfig from agent's llm_config dict.
182
 
183
  Supports a simple format for YAML configuration:
@@ -197,6 +208,7 @@ class MiniAgentHarness(BaseHarness):
197
  ValueError: If required fields or env vars are missing
198
  """
199
  import os
 
200
  from .client import ClientConfig
201
 
202
  provider = llm_config.get("provider", "").lower()
@@ -273,7 +285,7 @@ class MiniAgentHarness(BaseHarness):
273
  )
274
 
275
  @classmethod
276
- def _create_context_strategy(cls, agent: "Agent") -> ContextStrategy:
277
  """Map Flow's CompactionConfig to MiniAgent's ContextStrategy."""
278
  config = agent.compaction
279
 
@@ -328,24 +340,37 @@ class MiniAgentHarness(BaseHarness):
328
  """
329
  # Import shared tools
330
  from flow.tools import (
331
- # Coding
332
- read_file, write_file, edit_file, multi_edit, glob_files, grep, ls,
333
  # Execution
334
- bash, check_processes, python_repl,
335
- # Planning
336
- think, todo_write, todo_read,
 
 
 
 
337
  # Memory
338
- memory, create_memory_tool,
339
- # Web
340
- web_search, web_fetch,
341
  # Notebooks
342
- notebook_edit, notebook_read,
 
 
 
 
 
343
  # Skills
344
- skills, create_skills_tool,
345
  # Sub-agent
346
- task, create_task_tool,
347
- # Workspace management
348
- set_workspace, Workspace,
 
 
 
 
 
 
349
  )
350
 
351
  # Set workspace for tools that need it (memory, todos, etc.)
@@ -364,7 +389,6 @@ class MiniAgentHarness(BaseHarness):
364
  # Execution
365
  "bash": bash,
366
  "check_processes": check_processes,
367
- "python_repl": python_repl,
368
  # Planning
369
  "think": think,
370
  "todo_write": todo_write,
 
10
  import uuid
11
  from collections.abc import AsyncIterator
12
  from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any, ClassVar
14
 
15
  from flow.harness.base import BaseHarness, Event, EventType
16
 
 
18
  from flow.experiments.models import Agent
19
  from flow.llm import LLMClientConfig
20
 
21
+ from .client import ClientConfig
22
+
23
+ from flow.tools import Tool
24
+
25
+ from .agent import AgentThread, ChatAgent, StreamEvent, StreamEventType
26
+ from .client import ChatClient
27
  from .context import (
28
  ContextStrategy,
 
29
  HeadTailStrategy,
30
+ NoCompactionStrategy,
31
  SlidingWindowStrategy,
32
  SummarizationStrategy,
33
  )
 
 
34
  from .instructions import get_instructions
35
+ from .otel import enable_instrumentation
 
36
 
37
  logger = logging.getLogger(__name__)
38
 
 
63
  ... print(event)
64
  """
65
 
66
+ # Framework metadata
67
+ framework_name: ClassVar[str] = "miniagent"
68
+ framework_label: ClassVar[str] = "MiniAgent"
69
+ framework_description: ClassVar[str] = "Token-aware context management with advanced compaction"
70
+ supported_compaction_strategies: ClassVar[list[str]] = [
71
+ "head_tail", "sliding_window", "summarization", "none"
72
+ ]
73
+
74
  @classmethod
75
  def from_agent(
76
  cls,
77
+ agent: Agent,
78
  workspace: Path,
79
+ llm_config: LLMClientConfig | None = None,
80
+ ) -> MiniAgentHarness:
81
  """Create a MiniAgentHarness from an Agent definition.
82
 
83
  Args:
 
115
  from .otel import create_otel_hooks
116
  otel_hooks = create_otel_hooks(model=config.model)
117
 
118
+ # Resolve instructions: explicit > preset > default "general"
119
  if agent.instructions:
120
  instructions = agent.instructions
121
  elif agent.instructions_preset:
122
  instructions = get_instructions(agent.instructions_preset)
123
  else:
124
+ instructions = get_instructions("general")
125
 
126
  chat_agent = ChatAgent(
127
  client=chat_client,
 
136
 
137
  @classmethod
138
  def _create_client_config_from_llm_config(
139
+ cls, llm_config: LLMClientConfig
140
+ ) -> ClientConfig:
141
  """Create MiniAgent ClientConfig from Flow LLMClientConfig.
142
 
143
  Args:
 
147
  MiniAgent ClientConfig
148
  """
149
  from flow.llm import LLMProvider
150
+
151
  from .client import ClientConfig
152
 
153
  match llm_config.provider:
 
188
  @classmethod
189
  def _create_client_config_from_dict(
190
  cls, llm_config: dict[str, Any]
191
+ ) -> ClientConfig:
192
  """Create ClientConfig from agent's llm_config dict.
193
 
194
  Supports a simple format for YAML configuration:
 
208
  ValueError: If required fields or env vars are missing
209
  """
210
  import os
211
+
212
  from .client import ClientConfig
213
 
214
  provider = llm_config.get("provider", "").lower()
 
285
  )
286
 
287
  @classmethod
288
+ def _create_context_strategy(cls, agent: Agent) -> ContextStrategy:
289
  """Map Flow's CompactionConfig to MiniAgent's ContextStrategy."""
290
  config = agent.compaction
291
 
 
340
  """
341
  # Import shared tools
342
  from flow.tools import (
343
+ Workspace,
 
344
  # Execution
345
+ bash,
346
+ check_processes,
347
+ create_task_tool,
348
+ edit_file,
349
+ glob_files,
350
+ grep,
351
+ ls,
352
  # Memory
353
+ memory,
354
+ multi_edit,
 
355
  # Notebooks
356
+ notebook_edit,
357
+ notebook_read,
358
+ # Coding
359
+ read_file,
360
+ # Workspace management
361
+ set_workspace,
362
  # Skills
363
+ skills,
364
  # Sub-agent
365
+ task,
366
+ # Planning
367
+ think,
368
+ todo_read,
369
+ todo_write,
370
+ web_fetch,
371
+ # Web
372
+ web_search,
373
+ write_file,
374
  )
375
 
376
  # Set workspace for tools that need it (memory, todos, etc.)
 
389
  # Execution
390
  "bash": bash,
391
  "check_processes": check_processes,
 
392
  # Planning
393
  "think": think,
394
  "todo_write": todo_write,
src/flow/harness/miniagent/hooks.py CHANGED
@@ -6,9 +6,10 @@ Inspired by Claude Agent SDK's hooks system. Hooks allow applications to:
6
  - Control: Block tool calls, stop execution
7
  """
8
 
 
9
  from dataclasses import dataclass, field
10
- from typing import Any, Callable, Awaitable, Literal
11
  from enum import Enum
 
12
 
13
 
14
  class HookEvent(str, Enum):
 
6
  - Control: Block tool calls, stop execution
7
  """
8
 
9
+ from collections.abc import Awaitable, Callable
10
  from dataclasses import dataclass, field
 
11
  from enum import Enum
12
+ from typing import Any, Literal
13
 
14
 
15
  class HookEvent(str, Enum):
src/flow/harness/miniagent/instructions.py CHANGED
@@ -89,7 +89,7 @@ Never assume libraries exist. Check package.json, requirements.txt, or equivalen
89
  # Preset-specific instructions
90
  # =============================================================================
91
 
92
- CODING_AGENT_INSTRUCTIONS = f"""You are an expert coding assistant. You help users with software engineering tasks including writing code, debugging, refactoring, and explaining code.
93
 
94
  ## Response Style
95
 
@@ -111,7 +111,11 @@ CODING_AGENT_INSTRUCTIONS = f"""You are an expert coding assistant. You help use
111
  - **ls**: List directory contents.
112
 
113
  ### Execution
114
- - **bash**: Execute shell commands. Use for git, running tests, installing packages.
 
 
 
 
115
 
116
  ### Planning
117
  - **think**: Reason through complex problems before acting.
@@ -120,68 +124,25 @@ CODING_AGENT_INSTRUCTIONS = f"""You are an expert coding assistant. You help use
120
 
121
  ### Delegation (if available)
122
  - **task**: Delegate complex sub-tasks to a specialist agent with isolated context.
123
- {EFFICIENCY_INSTRUCTIONS}
124
- {BEST_PRACTICES_INSTRUCTIONS}
125
- """
126
-
127
- RESEARCH_AGENT_INSTRUCTIONS = f"""You are a research assistant. You help users find information, synthesize knowledge, and answer questions.
128
 
129
- ## Response Style
130
-
131
- - Be thorough in research, concise in presentation.
132
- - Cite sources with URLs when reporting findings.
133
- - Synthesize information - don't just list results.
134
- {TASK_COMPLETION_INSTRUCTIONS}
135
- ## Tools
136
 
137
- ### Search & Fetch
138
- - **web_search**: Search the web for information.
139
- - **web_fetch**: Fetch and read web page contents.
140
-
141
- ### Planning
142
- - **think**: Work through complex questions step by step.
143
- - **todo_write**: Track research progress on multi-part questions.
144
 
145
- ## Research Strategy
 
146
 
147
- 1. Start with broad searches to identify relevant sources
148
- 2. Fetch multiple promising URLs in parallel (batch web_fetch calls)
149
- 3. Synthesize findings into a coherent answer
150
- 4. If initial searches don't answer the question, refine and search again
151
 
152
- ## Guidelines
153
-
154
- 1. **Be thorough**: Search multiple queries if needed - batch them.
155
- 2. **Cite sources**: Include URLs when reporting findings.
156
- 3. **Synthesize**: Draw conclusions, don't just list results.
157
- 4. **Keep going**: If first searches don't work, try different queries.
158
- 5. **Acknowledge uncertainty**: If information is unclear, say so.
159
- """
160
-
161
- EXPLORE_AGENT_INSTRUCTIONS = f"""You are a codebase exploration specialist. Your job is to quickly find and understand code.
162
-
163
- ## Response Style
164
-
165
- - Be concise. Your response goes to another agent, so be self-contained.
166
- - Include file paths and line numbers in findings.
167
- - Summarize what you found, don't dump raw content.
168
- {TASK_COMPLETION_INSTRUCTIONS}
169
- ## Tools
170
-
171
- - **read_file**: Read file contents (read fully, don't chunk).
172
- - **glob_files**: Find files by pattern.
173
- - **grep**: Search file contents with regex.
174
- - **ls**: List directory contents.
175
- - **think**: Reason about what you're finding.
176
- - **todo_write**: Track exploration progress for complex searches.
177
  {EFFICIENCY_INSTRUCTIONS}
178
- ## Guidelines
179
-
180
- 1. **Start broad, then narrow**: Use glob/grep to find candidates, then batch-read.
181
- 2. **Be efficient**: Don't read files you don't need.
182
- 3. **Report clearly**: Include file paths and line numbers.
183
- 4. **Keep searching**: If first attempt doesn't find what's needed, try different patterns.
184
- 5. **Summarize**: Be self-contained for the calling agent.
185
  """
186
 
187
  # =============================================================================
@@ -189,19 +150,17 @@ EXPLORE_AGENT_INSTRUCTIONS = f"""You are a codebase exploration specialist. Your
189
  # =============================================================================
190
 
191
  INSTRUCTIONS = {
192
- "coding": CODING_AGENT_INSTRUCTIONS,
193
- "research": RESEARCH_AGENT_INSTRUCTIONS,
194
- "explore": EXPLORE_AGENT_INSTRUCTIONS,
195
  }
196
 
197
 
198
- def get_instructions(preset: str = "coding") -> str:
199
  """Get system instructions by preset name.
200
 
201
  Args:
202
- preset: One of 'coding', 'research', 'explore'
203
 
204
  Returns:
205
  System instruction string
206
  """
207
- return INSTRUCTIONS.get(preset, CODING_AGENT_INSTRUCTIONS)
 
89
  # Preset-specific instructions
90
  # =============================================================================
91
 
92
+ GENERAL_AGENT_INSTRUCTIONS = f"""You are a helpful general-purpose agent. You solve tasks by combining reasoning, code execution, file operations, and web research as needed.
93
 
94
  ## Response Style
95
 
 
111
  - **ls**: List directory contents.
112
 
113
  ### Execution
114
+ - **bash**: Execute shell commands. Use for git, running tests, installing packages, and running Python code (e.g., `python -c "print(2+2)"` or `python script.py`).
115
+
116
+ ### Web Research
117
+ - **web_search**: Search the web for current information, facts, or data.
118
+ - **web_fetch**: Fetch and read web page contents.
119
 
120
  ### Planning
121
  - **think**: Reason through complex problems before acting.
 
124
 
125
  ### Delegation (if available)
126
  - **task**: Delegate complex sub-tasks to a specialist agent with isolated context.
 
 
 
 
 
127
 
128
+ ## Problem-Solving Strategy
 
 
 
 
 
 
129
 
130
+ ### For calculations and math problems
131
+ Write and execute code rather than computing in your head. Use `bash` with `python -c "..."` or write a script with `write_file` then run it with `bash` — this avoids arithmetic errors.
 
 
 
 
 
132
 
133
+ ### For questions requiring specific facts or current information
134
+ Use `web_search` to find authoritative sources, then `web_fetch` to read them. Do NOT guess or rely on memory for factual claims like dates, numbers, names, or statistics.
135
 
136
+ ### For complex tasks (data processing, file analysis, media)
137
+ Write code to solve them. Install required libraries with `bash` (e.g., `pip install ...`). Break the problem into steps and verify each step works before moving on.
 
 
138
 
139
+ ### Anti-hallucination
140
+ - NEVER guess factual answers. If you don't know, search or compute.
141
+ - When a task asks for a specific number or name, verify it from a source.
142
+ - If web search fails, try different queries before giving up.
143
+ - State your confidence level when reporting facts.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  {EFFICIENCY_INSTRUCTIONS}
145
+ {BEST_PRACTICES_INSTRUCTIONS}
 
 
 
 
 
 
146
  """
147
 
148
  # =============================================================================
 
150
  # =============================================================================
151
 
152
  INSTRUCTIONS = {
153
+ "general": GENERAL_AGENT_INSTRUCTIONS,
 
 
154
  }
155
 
156
 
157
def get_instructions(preset: str = "general") -> str:
    """Look up the system-instruction text for a preset.

    Args:
        preset: Preset name (default: ``"general"``).

    Returns:
        The instruction string registered for ``preset``; unknown presets
        fall back to the general-purpose instructions.
    """
    try:
        return INSTRUCTIONS[preset]
    except KeyError:
        return GENERAL_AGENT_INSTRUCTIONS
src/flow/harness/miniagent/otel.py CHANGED
@@ -16,12 +16,12 @@ from opentelemetry import trace
16
  if TYPE_CHECKING:
17
  from .hooks import (
18
  Hooks,
19
- PreModelCallEvent,
20
  PostModelCallEvent,
21
- PreToolUseEvent,
22
- PreToolUseResult,
23
  PostToolUseEvent,
24
  PostToolUseResult,
 
 
 
25
  )
26
 
27
  __all__ = ["GenAIAttr", "create_otel_hooks", "enable_instrumentation"]
@@ -157,7 +157,7 @@ class OTelHooks:
157
  self._llm_spans: dict[int, trace.Span] = {} # iteration -> span
158
  self._tool_spans: dict[str, trace.Span] = {} # call_id -> span
159
 
160
- async def on_pre_model_call(self, event: "PreModelCallEvent") -> None:
161
  """Start an LLM span before model call.
162
 
163
  Args:
@@ -166,7 +166,7 @@ class OTelHooks:
166
  span = start_llm_span(model=self.model)
167
  self._llm_spans[event.iteration] = span
168
 
169
- async def on_post_model_call(self, event: "PostModelCallEvent") -> None:
170
  """End the LLM span after model call.
171
 
172
  Args:
@@ -178,7 +178,7 @@ class OTelHooks:
178
  output_tokens = event.usage.get("output_tokens", 0)
179
  end_llm_span(span, input_tokens, output_tokens)
180
 
181
- async def on_pre_tool_use(self, event: "PreToolUseEvent") -> "PreToolUseResult | None":
182
  """Start a tool span before tool execution.
183
 
184
  Args:
@@ -191,7 +191,7 @@ class OTelHooks:
191
  self._tool_spans[event.tool_call_id] = span
192
  return None # Don't block
193
 
194
- async def on_post_tool_use(self, event: "PostToolUseEvent") -> "PostToolUseResult | None":
195
  """End the tool span after tool execution.
196
 
197
  Args:
@@ -230,7 +230,7 @@ def enable_instrumentation() -> None:
230
  _instrumentation_enabled = True
231
 
232
 
233
- def create_otel_hooks(model: str = "gpt-4o") -> "Hooks":
234
  """Create a Hooks instance with OTEL instrumentation.
235
 
236
  This is the main entry point for adding OTEL tracing to a MiniAgent.
 
16
  if TYPE_CHECKING:
17
  from .hooks import (
18
  Hooks,
 
19
  PostModelCallEvent,
 
 
20
  PostToolUseEvent,
21
  PostToolUseResult,
22
+ PreModelCallEvent,
23
+ PreToolUseEvent,
24
+ PreToolUseResult,
25
  )
26
 
27
  __all__ = ["GenAIAttr", "create_otel_hooks", "enable_instrumentation"]
 
157
  self._llm_spans: dict[int, trace.Span] = {} # iteration -> span
158
  self._tool_spans: dict[str, trace.Span] = {} # call_id -> span
159
 
160
+ async def on_pre_model_call(self, event: PreModelCallEvent) -> None:
161
  """Start an LLM span before model call.
162
 
163
  Args:
 
166
  span = start_llm_span(model=self.model)
167
  self._llm_spans[event.iteration] = span
168
 
169
+ async def on_post_model_call(self, event: PostModelCallEvent) -> None:
170
  """End the LLM span after model call.
171
 
172
  Args:
 
178
  output_tokens = event.usage.get("output_tokens", 0)
179
  end_llm_span(span, input_tokens, output_tokens)
180
 
181
+ async def on_pre_tool_use(self, event: PreToolUseEvent) -> PreToolUseResult | None:
182
  """Start a tool span before tool execution.
183
 
184
  Args:
 
191
  self._tool_spans[event.tool_call_id] = span
192
  return None # Don't block
193
 
194
+ async def on_post_tool_use(self, event: PostToolUseEvent) -> PostToolUseResult | None:
195
  """End the tool span after tool execution.
196
 
197
  Args:
 
230
  _instrumentation_enabled = True
231
 
232
 
233
+ def create_otel_hooks(model: str = "gpt-4o") -> Hooks:
234
  """Create a Hooks instance with OTEL instrumentation.
235
 
236
  This is the main entry point for adding OTEL tracing to a MiniAgent.
src/flow/harness/miniagent/tool.py CHANGED
@@ -3,9 +3,10 @@
3
  Provides a simple way to define tools that can be called by the LLM.
4
  """
5
 
6
- from dataclasses import dataclass
7
- from typing import Any, Callable, Literal, get_type_hints, get_origin, get_args, Annotated
8
  import inspect
 
 
 
9
 
10
 
11
  @dataclass
@@ -43,7 +44,7 @@ class Tool:
43
  result = await result
44
  return str(result) if not isinstance(result, str) else result
45
  except Exception as e:
46
- return f"Error executing {self.name}: {str(e)}"
47
 
48
 
49
  def _python_type_to_json_schema(py_type: Any) -> dict[str, Any]:
@@ -53,13 +54,13 @@ def _python_type_to_json_schema(py_type: Any) -> dict[str, Any]:
53
  return {"type": "null"}
54
 
55
  # Handle basic types
56
- if py_type == str:
57
  return {"type": "string"}
58
- if py_type == int:
59
  return {"type": "integer"}
60
- if py_type == float:
61
  return {"type": "number"}
62
- if py_type == bool:
63
  return {"type": "boolean"}
64
 
65
  # Handle dict without type args
 
3
  Provides a simple way to define tools that can be called by the LLM.
4
  """
5
 
 
 
6
  import inspect
7
+ from collections.abc import Callable
8
+ from dataclasses import dataclass
9
+ from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
10
 
11
 
12
  @dataclass
 
44
  result = await result
45
  return str(result) if not isinstance(result, str) else result
46
  except Exception as e:
47
+ return f"Error executing {self.name}: {e!s}"
48
 
49
 
50
  def _python_type_to_json_schema(py_type: Any) -> dict[str, Any]:
 
54
  return {"type": "null"}
55
 
56
  # Handle basic types
57
+ if py_type is str:
58
  return {"type": "string"}
59
+ if py_type is int:
60
  return {"type": "integer"}
61
+ if py_type is float:
62
  return {"type": "number"}
63
+ if py_type is bool:
64
  return {"type": "boolean"}
65
 
66
  # Handle dict without type args
src/flow/harness/miniagent/tools/__init__.py CHANGED
@@ -33,51 +33,51 @@ Example:
33
  from flow.tools import (
34
  # Base
35
  Tool,
36
- # File operations
37
- read_file,
38
- write_file,
 
 
 
 
 
 
39
  edit_file,
40
- multi_edit,
41
  glob_files,
42
  grep,
43
  ls,
 
 
 
44
  # Notebook operations
45
  notebook_edit,
46
  notebook_read,
47
- # Execution
48
- bash,
49
- check_processes,
50
- python_repl,
 
 
 
 
51
  # Planning and reasoning
52
  think,
53
- todo_write,
54
  todo_read,
 
 
55
  # Web operations
56
  web_search,
57
- web_fetch,
58
- # Memory
59
- memory,
60
- create_memory_tool,
61
- # Skills
62
- skills,
63
- create_skills_tool,
64
- # Sub-agent
65
- task,
66
- create_task_tool,
67
- # Presets
68
- coding_tools,
69
- planning_tools,
70
  web_tools as research_tools,
71
- notebook_tools,
72
- all_tools,
73
  )
74
 
75
- # Compatibility: reset_todos from planning module
76
- from flow.tools.planning import reset_todos, get_todos
77
-
78
  # Compatibility: reset_memory from memory module
79
  from flow.tools.memory import reset_memory
80
 
 
 
81
 
82
  __all__ = [
83
  # Base
@@ -102,7 +102,6 @@ __all__ = [
102
  # Execution
103
  "bash",
104
  "check_processes",
105
- "python_repl",
106
  # Planning
107
  "think",
108
  "todo_write",
 
33
  from flow.tools import (
34
  # Base
35
  Tool,
36
+ all_tools,
37
+ # Execution
38
+ bash,
39
+ check_processes,
40
+ # Presets
41
+ coding_tools,
42
+ create_memory_tool,
43
+ create_skills_tool,
44
+ create_task_tool,
45
  edit_file,
 
46
  glob_files,
47
  grep,
48
  ls,
49
+ # Memory
50
+ memory,
51
+ multi_edit,
52
  # Notebook operations
53
  notebook_edit,
54
  notebook_read,
55
+ notebook_tools,
56
+ planning_tools,
57
+ # File operations
58
+ read_file,
59
+ # Skills
60
+ skills,
61
+ # Sub-agent
62
+ task,
63
  # Planning and reasoning
64
  think,
 
65
  todo_read,
66
+ todo_write,
67
+ web_fetch,
68
  # Web operations
69
  web_search,
70
+ write_file,
71
+ )
72
+ from flow.tools import (
 
 
 
 
 
 
 
 
 
 
73
  web_tools as research_tools,
 
 
74
  )
75
 
 
 
 
76
  # Compatibility: reset_memory from memory module
77
  from flow.tools.memory import reset_memory
78
 
79
+ # Compatibility: reset_todos from planning module
80
+ from flow.tools.planning import get_todos, reset_todos
81
 
82
  __all__ = [
83
  # Base
 
102
  # Execution
103
  "bash",
104
  "check_processes",
 
105
  # Planning
106
  "think",
107
  "todo_write",
src/flow/harness/miniagent/workspace.py CHANGED
@@ -31,6 +31,7 @@ Usage:
31
  ws.memory_dir # /path/to/project/.miniagent/memory
32
  """
33
 
 
34
  import json
35
  from pathlib import Path
36
  from typing import Any
@@ -97,7 +98,7 @@ class Workspace:
97
  try:
98
  with open(self.todos_file) as f:
99
  return json.load(f) # type: ignore[no-any-return]
100
- except (json.JSONDecodeError, IOError):
101
  return []
102
 
103
  def save_todos(self, todos: list[dict[str, Any]]) -> None:
@@ -118,7 +119,7 @@ class Workspace:
118
  try:
119
  with open(filepath) as f:
120
  memories.append(json.load(f))
121
- except (json.JSONDecodeError, IOError):
122
  continue
123
  return memories
124
 
@@ -130,7 +131,7 @@ class Workspace:
130
  try:
131
  with open(filepath) as f:
132
  return json.load(f) # type: ignore[no-any-return]
133
- except (json.JSONDecodeError, IOError):
134
  return None
135
 
136
  def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
@@ -157,7 +158,7 @@ class Workspace:
157
  try:
158
  with open(self.config_file) as f:
159
  return json.load(f)
160
- except (json.JSONDecodeError, IOError):
161
  return {}
162
 
163
  def save_config(self, config: dict[str, Any]) -> None:
@@ -170,29 +171,31 @@ class Workspace:
170
  return f"Workspace({self._root})"
171
 
172
 
173
- # Default workspace (current directory)
174
- _default_workspace: Workspace | None = None
 
 
175
 
176
 
177
  def get_workspace() -> Workspace:
178
- """Get the default workspace (creates if needed)."""
179
- global _default_workspace
180
- if _default_workspace is None:
181
- _default_workspace = Workspace()
182
- return _default_workspace
 
183
 
184
 
185
  def set_workspace(workspace: Workspace | str | Path) -> Workspace:
186
- """Set the default workspace."""
187
- global _default_workspace
188
  if isinstance(workspace, Workspace):
189
- _default_workspace = workspace
190
- else:
191
- _default_workspace = Workspace(workspace)
192
- return _default_workspace
 
193
 
194
 
195
  def reset_workspace() -> None:
196
- """Reset default workspace (for testing)."""
197
- global _default_workspace
198
- _default_workspace = None
 
31
  ws.memory_dir # /path/to/project/.miniagent/memory
32
  """
33
 
34
+ import contextvars
35
  import json
36
  from pathlib import Path
37
  from typing import Any
 
98
  try:
99
  with open(self.todos_file) as f:
100
  return json.load(f) # type: ignore[no-any-return]
101
+ except (OSError, json.JSONDecodeError):
102
  return []
103
 
104
  def save_todos(self, todos: list[dict[str, Any]]) -> None:
 
119
  try:
120
  with open(filepath) as f:
121
  memories.append(json.load(f))
122
+ except (OSError, json.JSONDecodeError):
123
  continue
124
  return memories
125
 
 
131
  try:
132
  with open(filepath) as f:
133
  return json.load(f) # type: ignore[no-any-return]
134
+ except (OSError, json.JSONDecodeError):
135
  return None
136
 
137
  def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
 
158
  try:
159
  with open(self.config_file) as f:
160
  return json.load(f)
161
+ except (OSError, json.JSONDecodeError):
162
  return {}
163
 
164
  def save_config(self, config: dict[str, Any]) -> None:
 
171
  return f"Workspace({self._root})"
172
 
173
 
174
+ # Per-task workspace via contextvars (safe for concurrent async tasks).
175
+ _workspace_var: contextvars.ContextVar[Workspace | None] = contextvars.ContextVar(
176
+ "miniagent_workspace", default=None
177
+ )
178
 
179
 
180
  def get_workspace() -> Workspace:
181
+ """Get the current workspace (creates from cwd if not set)."""
182
+ ws = _workspace_var.get()
183
+ if ws is None:
184
+ ws = Workspace()
185
+ _workspace_var.set(ws)
186
+ return ws
187
 
188
 
189
  def set_workspace(workspace: Workspace | str | Path) -> Workspace:
190
+ """Set the workspace for the current async task."""
 
191
  if isinstance(workspace, Workspace):
192
+ _workspace_var.set(workspace)
193
+ return workspace
194
+ ws = Workspace(workspace)
195
+ _workspace_var.set(ws)
196
+ return ws
197
 
198
 
199
  def reset_workspace() -> None:
200
+ """Reset workspace for the current context (for testing)."""
201
+ _workspace_var.set(None)
 
src/flow/harness/registry.py CHANGED
@@ -14,10 +14,10 @@ if TYPE_CHECKING:
14
  from flow.harness.base import BaseHarness
15
  from flow.llm import LLMClientConfig
16
 
17
- _HARNESSES: dict[str, type["BaseHarness"]] = {}
18
 
19
 
20
- def register(name: str, harness_class: type["BaseHarness"]) -> None:
21
  """Register a harness class for a framework.
22
 
23
  Args:
@@ -27,7 +27,7 @@ def register(name: str, harness_class: type["BaseHarness"]) -> None:
27
  _HARNESSES[name] = harness_class
28
 
29
 
30
- def get_harness_class(name: str) -> type["BaseHarness"]:
31
  """Get harness class by framework name.
32
 
33
  Args:
@@ -46,10 +46,10 @@ def get_harness_class(name: str) -> type["BaseHarness"]:
46
 
47
 
48
  def create_harness(
49
- agent: "Agent",
50
  workspace: Path,
51
- llm_config: "LLMClientConfig | None" = None,
52
- ) -> "BaseHarness":
53
  """Create a harness from an Agent spec.
54
 
55
  This is the main entry point for creating harnesses. It looks up
@@ -71,6 +71,29 @@ def create_harness(
71
  return harness_class.from_agent(agent, workspace, llm_config=llm_config)
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def available_frameworks() -> list[str]:
75
  """Get list of available framework names.
76
 
@@ -78,3 +101,17 @@ def available_frameworks() -> list[str]:
78
  List of registered framework names
79
  """
80
  return list(_HARNESSES.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  from flow.harness.base import BaseHarness
15
  from flow.llm import LLMClientConfig
16
 
17
+ _HARNESSES: dict[str, type[BaseHarness]] = {}
18
 
19
 
20
+ def register(name: str, harness_class: type[BaseHarness]) -> None:
21
  """Register a harness class for a framework.
22
 
23
  Args:
 
27
  _HARNESSES[name] = harness_class
28
 
29
 
30
+ def get_harness_class(name: str) -> type[BaseHarness]:
31
  """Get harness class by framework name.
32
 
33
  Args:
 
46
 
47
 
48
  def create_harness(
49
+ agent: Agent,
50
  workspace: Path,
51
+ llm_config: LLMClientConfig | None = None,
52
+ ) -> BaseHarness:
53
  """Create a harness from an Agent spec.
54
 
55
  This is the main entry point for creating harnesses. It looks up
 
71
  return harness_class.from_agent(agent, workspace, llm_config=llm_config)
72
 
73
 
74
+ def ensure_harnesses_registered() -> None:
75
+ """Ensure all built-in harnesses are registered.
76
+
77
+ Safe to call multiple times — only imports once.
78
+ """
79
+ if _HARNESSES:
80
+ return
81
+
82
+ # Import harness modules to trigger their self-registration
83
+ import flow.harness.maf as _maf
84
+ import flow.harness.miniagent as _miniagent
85
+
86
+ _ = (_maf, _miniagent)
87
+
88
+ # LangGraph is optional
89
+ try:
90
+ import flow.harness.langgraph as _lg
91
+
92
+ _ = _lg # type: ignore[assignment]
93
+ except ImportError:
94
+ pass
95
+
96
+
97
  def available_frameworks() -> list[str]:
98
  """Get list of available framework names.
99
 
 
101
  List of registered framework names
102
  """
103
  return list(_HARNESSES.keys())
104
+
105
+
106
+ def get_registered_harnesses() -> dict[str, type[BaseHarness]]:
107
+ """Get all registered harness classes.
108
+
109
+ Returns:
110
+ Dict mapping framework names to harness classes.
111
+ Each harness class has metadata attributes:
112
+ - framework_name: Unique identifier
113
+ - framework_label: Human-readable name
114
+ - framework_description: Short description
115
+ - supported_compaction_strategies: List of supported strategy names
116
+ """
117
+ return dict(_HARNESSES)