victordibia committed on
Commit
708a48b
·
1 Parent(s): f4dca43

Deploy 2026-02-23 09:17:49

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full set.
Files changed (50) hide show
  1. src/flow/__init__.py +8 -0
  2. src/flow/cli/app.py +4 -0
  3. src/flow/cli/deploy.py +113 -0
  4. src/flow/cli/evaluate.py +279 -0
  5. src/flow/cli/hf_import.py +2 -1
  6. src/flow/cli/optimize.py +221 -89
  7. src/flow/experiments/agent_api.py +22 -9
  8. src/flow/experiments/data/tasks/house_rules.jsonl +3 -0
  9. src/flow/experiments/eval_cache.py +223 -0
  10. src/flow/experiments/evaluators/llm.py +8 -2
  11. src/flow/experiments/gaia_converter.py +13 -18
  12. src/flow/experiments/hf_datasets.py +69 -45
  13. src/flow/experiments/models.py +108 -18
  14. src/flow/experiments/optimizer.py +160 -35
  15. src/flow/experiments/results.py +21 -0
  16. src/flow/experiments/runner.py +1 -1
  17. src/flow/experiments/strategies/__init__.py +25 -11
  18. src/flow/experiments/strategies/gepa_instruction.py +415 -0
  19. src/flow/experiments/strategies/{llm_rewriter.py → instruction.py} +118 -76
  20. src/flow/experiments/strategies/skill.py +692 -0
  21. src/flow/experiments/strategies/{tool_selector.py → tool.py} +16 -22
  22. src/flow/harness/compaction/strategies.py +4 -1
  23. src/flow/harness/maf/agent.py +16 -2
  24. src/flow/harness/maf/tools/__init__.py +9 -1
  25. src/flow/harness/miniagent/harness.py +32 -1
  26. src/flow/harness/miniagent/tool.py +4 -1
  27. src/flow/prompts.py +23 -0
  28. src/flow/tools/__init__.py +4 -4
  29. src/flow/tools/adapters.py +1 -7
  30. src/flow/tools/base.py +4 -1
  31. src/flow/tools/browsing.py +13 -3
  32. src/flow/tools/coding.py +47 -12
  33. src/flow/tools/execution.py +17 -1
  34. src/flow/tools/memory.py +17 -13
  35. src/flow/tools/notebook.py +21 -1
  36. src/flow/tools/planning.py +13 -5
  37. src/flow/tools/skills.py +71 -5
  38. src/flow/tools/subagent.py +11 -1
  39. src/flow/tools/text_inspector_qa.py +4 -1
  40. src/flow/tools/web.py +15 -1
  41. src/flow/tools/workspace.py +18 -5
  42. src/flow/ui/api/__init__.py +2 -0
  43. src/flow/ui/api/deployments.py +145 -0
  44. src/flow/ui/api/experiment.py +0 -1
  45. src/flow/ui/api/jobs.py +40 -7
  46. src/flow/ui/api/schema.py +9 -5
  47. src/flow/ui/api/tests.py +1 -35
  48. src/flow/ui/auth/__init__.py +1 -2
  49. src/flow/ui/auth/config.py +0 -4
  50. src/flow/ui/auth/middleware.py +0 -42
src/flow/__init__.py CHANGED
@@ -15,6 +15,14 @@ Usage:
15
  harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
16
  """
17
 
 
 
 
 
 
 
 
 
18
  from flow.harness.maf import MAFHarness, create_agent
19
 
20
  __version__ = "0.1.0"
 
15
  harness = MAFHarness(workspace=Path("/tmp/workspace"), enable_compaction=False)
16
  """
17
 
18
+ import sys
19
+
20
+ from loguru import logger
21
+
22
+ # Default to INFO — suppress DEBUG noise from tools/workspace/etc.
23
+ logger.remove()
24
+ logger.add(sys.stderr, level="INFO")
25
+
26
  from flow.harness.maf import MAFHarness, create_agent
27
 
28
  __version__ = "0.1.0"
src/flow/cli/app.py CHANGED
@@ -176,9 +176,13 @@ async def _run_single_task(
176
 
177
 
178
  # Import and register commands
 
 
179
  from flow.cli.hf_import import hf_import as hf_import_cmd
180
  from flow.cli.optimize import optimize as optimize_cmd
181
 
 
 
182
  app.command()(optimize_cmd)
183
  app.command(name="hf-import")(hf_import_cmd)
184
 
 
176
 
177
 
178
  # Import and register commands
179
+ from flow.cli.deploy import deploy as deploy_cmd
180
+ from flow.cli.evaluate import evaluate as evaluate_cmd
181
  from flow.cli.hf_import import hf_import as hf_import_cmd
182
  from flow.cli.optimize import optimize as optimize_cmd
183
 
184
+ app.command()(deploy_cmd)
185
+ app.command()(evaluate_cmd)
186
  app.command()(optimize_cmd)
187
  app.command(name="hf-import")(hf_import_cmd)
188
 
src/flow/cli/deploy.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Deploy command for persisting agent configs to the database."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ from pathlib import Path
9
+ from typing import Annotated
10
+
11
+ import typer
12
+ from rich.console import Console
13
+
14
+ from flow.experiments.models import load_agent
15
+
16
+ console = Console()
17
+
18
+
19
+ def deploy(
20
+ agent: Annotated[
21
+ Path,
22
+ typer.Option(
23
+ "--agent", "-a",
24
+ help="Path to agent YAML config file",
25
+ ),
26
+ ],
27
+ name: Annotated[
28
+ str | None,
29
+ typer.Option(
30
+ "--name", "-n",
31
+ help="Deployment name (defaults to agent name from YAML)",
32
+ ),
33
+ ] = None,
34
+ deployment_id: Annotated[
35
+ str | None,
36
+ typer.Option(
37
+ "--deployment-id", "-d",
38
+ help="Add version to existing deployment (UUID)",
39
+ ),
40
+ ] = None,
41
+ description: Annotated[
42
+ str,
43
+ typer.Option(
44
+ "--description",
45
+ help="Version description",
46
+ ),
47
+ ] = "",
48
+ ) -> None:
49
+ """Deploy an agent config to the FAOS database.
50
+
51
+ Creates a versioned deployment that can be tracked, evaluated,
52
+ and compared in the dashboard.
53
+
54
+ First deploy creates a new deployment (v1). Subsequent deploys
55
+ with --deployment-id add versions to the same deployment.
56
+
57
+ Examples:
58
+ # Deploy a new agent
59
+ flow deploy --agent agent_config.yaml
60
+
61
+ # Deploy with custom name
62
+ flow deploy --agent agent_config.yaml --name "trip-planner-v2"
63
+
64
+ # Add version to existing deployment
65
+ flow deploy --agent optimized.yaml --deployment-id <uuid>
66
+
67
+ # Deploy best config from optimization
68
+ flow deploy --agent ~/.flow/optimizations/<ts>/agents/best_score.yaml
69
+ """
70
+ asyncio.run(_run_deploy(
71
+ agent_path=agent,
72
+ name=name,
73
+ deployment_id=deployment_id,
74
+ description=description,
75
+ ))
76
+
77
+
78
+ async def _run_deploy(
79
+ agent_path: Path,
80
+ name: str | None,
81
+ deployment_id: str | None,
82
+ description: str,
83
+ ) -> None:
84
+ """Run deployment."""
85
+ if not agent_path.exists():
86
+ console.print(f"[red]Error:[/] Agent file not found: {agent_path}")
87
+ raise typer.Exit(1)
88
+
89
+ agent_config = load_agent(agent_path)
90
+ if name:
91
+ agent_config.name = name
92
+
93
+ try:
94
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
95
+
96
+ adapter = PersistenceAdapter()
97
+ result = await adapter.deploy_agent(
98
+ agent_config,
99
+ deployment_id=deployment_id,
100
+ source="deploy",
101
+ version_description=description,
102
+ )
103
+ except ImportError:
104
+ console.print("[red]Error:[/] Database dependencies not available.")
105
+ console.print("[dim]Make sure flow is installed with UI support.[/]")
106
+ raise typer.Exit(1)
107
+
108
+ console.print("\n[bold green]Deployed![/]\n")
109
+ console.print(f" Agent: [cyan]{agent_config.name}[/]")
110
+ console.print(f" Deployment ID: [cyan]{result.deployment_id}[/]")
111
+ console.print(f" Config ID: [cyan]{result.config_id}[/]")
112
+ console.print(f" Version: [cyan]{result.version}[/]")
113
+ console.print(f"\n[dim]View in dashboard:[/] http://localhost:8091/deployments/{result.deployment_id}")
src/flow/cli/evaluate.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Evaluate command for measuring agent performance on tasks."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Annotated
11
+
12
+ import typer
13
+ from rich.console import Console
14
+ from rich.table import Table
15
+
16
+ from flow.experiments.models import Agent, load_agent
17
+ from flow.experiments.optimizer import evaluate_agent
18
+ from flow.experiments.types import Task, get_task_suite, load_tasks_from_jsonl
19
+
20
+ console = Console()
21
+
22
+
23
+ def evaluate(
24
+ agent: Annotated[
25
+ Path | None,
26
+ typer.Option(
27
+ "--agent", "-a",
28
+ help="Path to agent YAML config file",
29
+ ),
30
+ ] = None,
31
+ tasks: Annotated[
32
+ Path | None,
33
+ typer.Option(
34
+ "--tasks", "-t",
35
+ help="Path to tasks.jsonl file",
36
+ ),
37
+ ] = None,
38
+ suite: Annotated[
39
+ str | None,
40
+ typer.Option(
41
+ "--suite", "-s",
42
+ help="Built-in task suite: quick, core, coding",
43
+ ),
44
+ ] = None,
45
+ parallel: Annotated[
46
+ int,
47
+ typer.Option(
48
+ "--parallel", "-p",
49
+ help="Max concurrent task executions",
50
+ ),
51
+ ] = 4,
52
+ limit: Annotated[
53
+ int | None,
54
+ typer.Option(
55
+ "--limit", "-l",
56
+ help="Max number of tasks to run",
57
+ ),
58
+ ] = None,
59
+ no_llm_eval: Annotated[
60
+ bool,
61
+ typer.Option(
62
+ "--no-llm-eval",
63
+ help="Disable LLM-as-Judge evaluation (faster, less accurate)",
64
+ ),
65
+ ] = False,
66
+ output_json: Annotated[
67
+ bool,
68
+ typer.Option(
69
+ "--json",
70
+ help="Output results as JSON",
71
+ ),
72
+ ] = False,
73
+ persist: Annotated[
74
+ bool,
75
+ typer.Option(
76
+ "--persist/--no-persist",
77
+ help="Persist results to the FAOS database (visible in flow serve dashboard)",
78
+ ),
79
+ ] = True,
80
+ ) -> None:
81
+ """Evaluate an agent's performance on a set of tasks.
82
+
83
+ Runs a single agent configuration against tasks and reports
84
+ score, pass rate, token usage, and per-task breakdown.
85
+ No optimization or candidate generation — just measurement.
86
+
87
+ Examples:
88
+ # Evaluate agent config on a task file
89
+ flow evaluate --agent agent_config.yaml --tasks tasks.jsonl
90
+
91
+ # Evaluate with built-in suite
92
+ flow evaluate --agent agent_config.yaml --suite quick
93
+
94
+ # Evaluate and persist to dashboard
95
+ flow evaluate --agent agent_config.yaml --tasks tasks.jsonl --persist
96
+
97
+ # JSON output for scripting
98
+ flow evaluate --agent agent_config.yaml --tasks tasks.jsonl --json
99
+ """
100
+ asyncio.run(_run_evaluate(
101
+ agent_path=agent,
102
+ tasks_path=tasks,
103
+ suite=suite,
104
+ parallel=parallel,
105
+ limit=limit,
106
+ use_llm_eval=not no_llm_eval,
107
+ output_json=output_json,
108
+ persist=persist,
109
+ ))
110
+
111
+
112
+ async def _run_evaluate(
113
+ agent_path: Path | None,
114
+ tasks_path: Path | None,
115
+ suite: str | None,
116
+ parallel: int,
117
+ limit: int | None,
118
+ use_llm_eval: bool,
119
+ output_json: bool,
120
+ persist: bool,
121
+ ) -> None:
122
+ """Run evaluation."""
123
+ agent_config, task_list = _load_agent_and_tasks(agent_path, tasks_path, suite, limit)
124
+
125
+ if not output_json:
126
+ console.print(f"\n[bold]Agent:[/] {agent_config.name}")
127
+ console.print(f"[bold]Tasks:[/] {len(task_list)}")
128
+ for t in task_list:
129
+ console.print(f" - {t.name}")
130
+ console.print()
131
+
132
+ try:
133
+ summary = await evaluate_agent(
134
+ agent_config,
135
+ task_list,
136
+ parallel=parallel,
137
+ use_llm_evaluator=use_llm_eval,
138
+ quiet=True,
139
+ )
140
+ except KeyboardInterrupt:
141
+ console.print("\n[yellow]Evaluation cancelled.[/]")
142
+ raise typer.Exit(1)
143
+
144
+ # Persist to database if requested
145
+ job_id: str | None = None
146
+ if persist:
147
+ job_id = await _persist_evaluation(summary, agent_config)
148
+
149
+ # Output results
150
+ if output_json:
151
+ result = {
152
+ "agent": agent_config.name,
153
+ "score": round(summary.avg_score, 4),
154
+ "pass_rate": round(summary.pass_rate, 4),
155
+ "total_tokens": summary.total_tokens,
156
+ "avg_tokens": round(summary.avg_tokens, 1),
157
+ "avg_duration": round(summary.avg_duration, 2),
158
+ "task_count": summary.task_count,
159
+ "job_id": job_id,
160
+ "tasks": [
161
+ {
162
+ "name": tr.task_name,
163
+ "score": round(tr.eval_score, 4),
164
+ "passed": tr.eval_passed,
165
+ "tokens": tr.metrics.total_tokens,
166
+ "reasoning": tr.eval_reasoning,
167
+ }
168
+ for tr in summary.task_results
169
+ ],
170
+ }
171
+ console.print(json.dumps(result, indent=2))
172
+ else:
173
+ _print_eval_results(summary, job_id)
174
+
175
+
176
+ def _load_agent_and_tasks(
177
+ agent_path: Path | None,
178
+ tasks_path: Path | None,
179
+ suite: str | None,
180
+ limit: int | None,
181
+ ) -> tuple[Agent, list[Task]]:
182
+ """Load agent config and task list from CLI arguments."""
183
+ if agent_path:
184
+ if not agent_path.exists():
185
+ console.print(f"[red]Error:[/] Agent file not found: {agent_path}")
186
+ raise typer.Exit(1)
187
+ agent_config = load_agent(agent_path)
188
+ else:
189
+ agent_config = Agent(name="flow_agent")
190
+
191
+ task_list: list[Task] = []
192
+ if tasks_path:
193
+ if not tasks_path.exists():
194
+ console.print(f"[red]Error:[/] Tasks file not found: {tasks_path}")
195
+ raise typer.Exit(1)
196
+ task_list = load_tasks_from_jsonl(tasks_path)
197
+ elif suite:
198
+ try:
199
+ task_list = get_task_suite(suite)
200
+ except ValueError as e:
201
+ console.print(f"[red]Error:[/] {e}")
202
+ raise typer.Exit(1)
203
+ else:
204
+ try:
205
+ task_list = get_task_suite("quick")
206
+ except ValueError:
207
+ console.print("[red]Error:[/] No tasks specified. Use --tasks or --suite")
208
+ raise typer.Exit(1)
209
+
210
+ if limit is not None and limit > 0:
211
+ task_list = task_list[:limit]
212
+
213
+ if not task_list:
214
+ console.print("[red]Error:[/] No tasks to evaluate")
215
+ raise typer.Exit(1)
216
+
217
+ return agent_config, task_list
218
+
219
+
220
+ async def _persist_evaluation(summary: object, agent_config: Agent) -> str | None:
221
+ """Deploy agent and persist evaluation results to database."""
222
+ try:
223
+ from flow.ui.services.persistence_adapter import PersistenceAdapter
224
+
225
+ adapter = PersistenceAdapter()
226
+ deploy_result = await adapter.deploy_agent(agent_config, source="evaluate")
227
+ job_id = await adapter.persist_evaluation(summary, deploy_result.config_id)
228
+ return job_id
229
+ except ImportError:
230
+ console.print("[yellow]Warning:[/] Database not available. Results not persisted.")
231
+ console.print("[dim]Start the dashboard with: flow serve[/]")
232
+ return None
233
+ except Exception as e:
234
+ console.print(f"[yellow]Warning:[/] Failed to persist results: {e}")
235
+ return None
236
+
237
+
238
+ def _print_eval_results(summary: object, job_id: str | None = None) -> None:
239
+ """Print evaluation results as Rich tables."""
240
+ from flow.experiments.optimizer import CandidateSummary
241
+
242
+ assert isinstance(summary, CandidateSummary)
243
+
244
+ console.print("[bold green]Evaluation complete![/]\n")
245
+
246
+ table = Table(title="Results")
247
+ table.add_column("Metric", style="cyan")
248
+ table.add_column("Value", style="green")
249
+
250
+ table.add_row("Score", f"{summary.avg_score:.2f}")
251
+ table.add_row("Pass Rate", f"{summary.pass_rate:.0%}")
252
+ table.add_row("Total Tokens", f"{summary.total_tokens:,}")
253
+ table.add_row("Avg Tokens", f"{summary.avg_tokens:,.0f}")
254
+ table.add_row("Avg Duration", f"{summary.avg_duration:.1f}s")
255
+ table.add_row("Tasks", str(summary.task_count))
256
+ if job_id:
257
+ table.add_row("Job ID", job_id)
258
+ console.print(table)
259
+
260
+ if summary.task_results:
261
+ console.print()
262
+ task_table = Table(title="Per-Task Breakdown")
263
+ task_table.add_column("Task", style="cyan")
264
+ task_table.add_column("Score", style="green")
265
+ task_table.add_column("Status", style="bold")
266
+ task_table.add_column("Tokens", style="dim")
267
+
268
+ for tr in summary.task_results:
269
+ status = "[green]PASS[/]" if tr.eval_passed else "[red]FAIL[/]"
270
+ task_table.add_row(
271
+ tr.task_name,
272
+ f"{tr.eval_score:.2f}",
273
+ status,
274
+ f"{tr.metrics.total_tokens:,}",
275
+ )
276
+ console.print(task_table)
277
+
278
+ if job_id:
279
+ console.print(f"\n[dim]View in dashboard:[/] http://localhost:8091/jobs/{job_id}")
src/flow/cli/hf_import.py CHANGED
@@ -10,6 +10,7 @@ from rich.console import Console
10
 
11
  from flow.experiments.hf_datasets import (
12
  DATASET_CONVERTERS,
 
13
  import_hf_dataset,
14
  save_tasks_to_jsonl,
15
  )
@@ -98,7 +99,7 @@ def hf_import(
98
  if list_supported:
99
  console.print("\n[bold]Supported Datasets:[/]")
100
  console.print("\n[dim]You can add custom converters via register_converter()[/]\n")
101
- for name in sorted(DATASET_CONVERTERS.keys()):
102
  console.print(f" • {name}")
103
  return
104
 
 
10
 
11
  from flow.experiments.hf_datasets import (
12
  DATASET_CONVERTERS,
13
+ LAZY_CONVERTERS,
14
  import_hf_dataset,
15
  save_tasks_to_jsonl,
16
  )
 
99
  if list_supported:
100
  console.print("\n[bold]Supported Datasets:[/]")
101
  console.print("\n[dim]You can add custom converters via register_converter()[/]\n")
102
+ for name in sorted({*DATASET_CONVERTERS, *LAZY_CONVERTERS}):
103
  console.print(f" • {name}")
104
  return
105
 
src/flow/cli/optimize.py CHANGED
@@ -71,13 +71,27 @@ def optimize(
71
  help="Max concurrent experiments",
72
  ),
73
  ] = 4,
74
- vary: Annotated[
75
  str | None,
76
  typer.Option(
77
- "--vary", "-v",
78
- help="Comma-separated params to vary: compaction, strategy, tools, head, tail",
79
  ),
80
  ] = None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  output: Annotated[
82
  Path | None,
83
  typer.Option(
@@ -106,33 +120,31 @@ def optimize(
106
  help="Maximum number of candidates to generate",
107
  ),
108
  ] = 100,
 
 
 
 
 
 
 
109
  ) -> None:
110
  """Find the best agent configuration through experimentation.
111
 
112
- Runs experiments in parallel, evaluates with LLM-as-Judge,
113
- ranks via Pareto analysis, and exports winning agent configs.
114
-
115
  Examples:
116
- # Use experiment YAML (recommended - defines agent, tasks, and variations)
117
- flow optimize --experiment experiment.yaml
118
-
119
- # Run with task file and default candidates
120
- flow optimize --tasks tasks.jsonl
121
-
122
- # Vary specific parameters
123
- flow optimize --vary compaction,tools --tasks tasks.jsonl
124
 
125
- # Test all compaction strategies
126
- flow optimize --vary strategy --suite coding
127
 
128
- # Use built-in task suite
129
- flow optimize --suite coding --parallel 2
130
 
131
- # Start from a base agent definition
132
- flow optimize --agent base_agent.yaml --vary compaction,tools --tasks tasks.jsonl
133
 
134
- # Use GEPA for active prompt optimization (via YAML config)
135
- flow optimize --config gepa_strategy.yaml --agent base_agent.yaml --tasks tasks.jsonl
136
  """
137
  asyncio.run(_run_optimize(
138
  tasks_path=tasks,
@@ -141,11 +153,14 @@ def optimize(
141
  agent_path=agent,
142
  suite=suite,
143
  parallel=parallel,
144
- vary=vary,
 
 
145
  output_dir=output,
146
  use_llm_eval=not no_llm_eval,
147
  budget=budget,
148
  limit=limit,
 
149
  ))
150
 
151
 
@@ -156,11 +171,14 @@ async def _run_optimize(
156
  agent_path: Path | None,
157
  suite: str | None,
158
  parallel: int,
159
- vary: str | None,
 
 
160
  output_dir: Path | None,
161
  use_llm_eval: bool,
162
  budget: int,
163
  limit: int | None = None,
 
164
  ) -> None:
165
  """Run the optimization."""
166
  # If experiment YAML provided, use it as the source of truth
@@ -177,26 +195,43 @@ async def _run_optimize(
177
  # Load base agent
178
  base = _load_base_agent(agent_path)
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  # Load candidates and check if a strategy is defined in config
181
- candidates, strategy_instance = await _load_candidates_and_strategy(config_path, vary, base, budget)
182
-
183
  # If a strategy was provided (like GepaStrategy), run it directly
184
  if strategy_instance is not None:
185
  console.print("\n[bold]Running active optimization strategy...[/]")
186
  await _run_active_strategy(
187
- strategy=strategy_instance,
188
- base_agent=base,
189
- tasks=tasks,
190
- output_dir=output_dir,
191
  parallel=parallel,
192
  use_llm_eval=use_llm_eval,
193
  budget=budget
194
  )
195
  return
196
-
197
  # Otherwise, use traditional grid search with candidates
198
  if not candidates:
199
- console.print("[red]Error:[/] No candidates to test. Use --config or --vary")
200
  raise typer.Exit(1)
201
 
202
  console.print(f"\n[bold]Base Agent:[/] {base.name}")
@@ -223,6 +258,9 @@ async def _run_optimize(
223
  console.print("\nTo use an agent config:")
224
  console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")
225
 
 
 
 
226
  except KeyboardInterrupt:
227
  console.print("\n[yellow]Optimization cancelled.[/]")
228
  raise typer.Exit(1)
@@ -360,7 +398,6 @@ def _load_base_agent(agent_path: Path | None) -> Agent:
360
 
361
  async def _load_candidates_and_strategy(
362
  config_path: Path | None,
363
- vary: str | None,
364
  base: Agent,
365
  budget: int,
366
  ) -> tuple[list[Candidate], Any | None]:
@@ -405,17 +442,13 @@ async def _load_candidates_and_strategy(
405
  console.print("[red]Error:[/] Config file has no CANDIDATES, VARIATIONS, or STRATEGY")
406
  raise typer.Exit(1)
407
 
408
- if vary:
409
- variations = _parse_vary_flag(vary)
410
- strategy = GridSearchStrategy(variations)
411
- return await strategy.generate(base, budget), None
412
-
413
- # Default: explore context engineering dimensions
414
  strategy = GridSearchStrategy(variations={
415
  "compaction": [
416
- CompactionConfig.head_tail(10, 40),
417
  CompactionConfig.none(),
 
418
  ],
 
419
  })
420
  return await strategy.generate(base, budget), None
421
 
@@ -455,12 +488,18 @@ def _load_yaml_strategy(path: Path) -> Any | None:
455
  console.print("[red]Error:[/] GEPA optimizer not available.")
456
  console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
457
  raise typer.Exit(1)
458
- elif strategy_type == "llm_rewriter":
459
- from flow.experiments.strategies.llm_rewriter import LLMRewriterStrategy
460
- return LLMRewriterStrategy(config=strategy_config)
 
 
 
 
 
 
461
  else:
462
  console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
463
- console.print("[dim]Supported: gepa, llm_rewriter[/]")
464
  raise typer.Exit(1)
465
 
466
 
@@ -488,50 +527,6 @@ def _load_python_config(path: Path) -> tuple[list[Candidate], dict[str, Any], An
488
  return candidates, variations, strategy
489
 
490
 
491
- def _parse_vary_flag(vary: str) -> dict[str, Any]:
492
- """Parse --vary flag into variations dict.
493
-
494
- Supported parameters:
495
- compaction, compact: Test head_tail vs none
496
- strategy: Test all compaction strategies (none, head_tail, sliding_window, summarization)
497
- tools: Test minimal vs standard tool sets
498
- head, head_size: Vary head sizes (5, 10, 20)
499
- tail, tail_size: Vary tail sizes (20, 40, 60)
500
- """
501
- variations: dict[str, Any] = {}
502
-
503
- for param in vary.split(","):
504
- param = param.strip().lower()
505
-
506
- if param in ("compaction", "compact"):
507
- variations["compaction"] = [
508
- CompactionConfig.head_tail(10, 40),
509
- CompactionConfig.none(),
510
- ]
511
- elif param in ("strategy", "strategies"):
512
- # Test all compaction strategies
513
- variations["compaction"] = [
514
- CompactionConfig.none(),
515
- CompactionConfig.head_tail(10, 40),
516
- CompactionConfig(strategy="sliding_window", token_budget=50_000),
517
- CompactionConfig(strategy="summarization", token_budget=50_000),
518
- ]
519
- elif param in ("tools", "toolset"):
520
- # Tool variations - memory and subagent are just tools
521
- variations["tools"] = ["minimal", "standard"]
522
- elif param in ("head", "head_size"):
523
- variations["compaction"] = [
524
- CompactionConfig.head_tail(h, 40) for h in [5, 10, 20]
525
- ]
526
- elif param in ("tail", "tail_size"):
527
- variations["compaction"] = [
528
- CompactionConfig.head_tail(10, t) for t in [20, 40, 60]
529
- ]
530
- else:
531
- console.print(f"[yellow]Warning:[/] Unknown vary param: {param}")
532
-
533
- return variations
534
-
535
 
536
  async def _run_active_strategy(
537
  strategy: Any,
@@ -544,7 +539,7 @@ async def _run_active_strategy(
544
  ) -> None:
545
  """Run an active optimization strategy.
546
 
547
- For strategies that use the ExperimentRunner protocol (LLMRewriterStrategy),
548
  delegates to FlowOptimizer.optimize_with_strategy() which handles setup,
549
  evaluation, Pareto analysis, and export.
550
 
@@ -732,3 +727,140 @@ async def _run_gepa_strategy(
732
 
733
  console.print(f"\nAgents exported to: [cyan]{output_path / 'agents'}[/]")
734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  help="Max concurrent experiments",
72
  ),
73
  ] = 4,
74
+ strategy: Annotated[
75
  str | None,
76
  typer.Option(
77
+ "--strategy", "-S",
78
+ help="Active strategy: tools, instructions, skills (comma-separated for pipeline)",
79
  ),
80
  ] = None,
81
+ max_iterations: Annotated[
82
+ int,
83
+ typer.Option(
84
+ "--max-iterations",
85
+ help="Max iterations for active strategies",
86
+ ),
87
+ ] = 3,
88
+ min_improvement: Annotated[
89
+ float,
90
+ typer.Option(
91
+ "--min-improvement",
92
+ help="Min score improvement to continue iterating",
93
+ ),
94
+ ] = 0.01,
95
  output: Annotated[
96
  Path | None,
97
  typer.Option(
 
120
  help="Maximum number of candidates to generate",
121
  ),
122
  ] = 100,
123
+ persist: Annotated[
124
+ bool,
125
+ typer.Option(
126
+ "--persist/--no-persist",
127
+ help="Persist results to the FAOS database (visible in flow serve dashboard)",
128
+ ),
129
+ ] = True,
130
  ) -> None:
131
  """Find the best agent configuration through experimentation.
132
 
 
 
 
133
  Examples:
134
+ # Optimize tools
135
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy tools
 
 
 
 
 
 
136
 
137
+ # Optimize instructions
138
+ flow optimize --agent agent.yaml --suite quick --strategy instructions
139
 
140
+ # Optimize both (pipeline: instructions then tools)
141
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy instructions,tools
142
 
143
+ # Skip persisting to dashboard
144
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy tools --no-persist
145
 
146
+ # Use experiment YAML (defines agent, tasks, and variations)
147
+ flow optimize --experiment experiment.yaml
148
  """
149
  asyncio.run(_run_optimize(
150
  tasks_path=tasks,
 
153
  agent_path=agent,
154
  suite=suite,
155
  parallel=parallel,
156
+ strategy=strategy,
157
+ max_iterations=max_iterations,
158
+ min_improvement=min_improvement,
159
  output_dir=output,
160
  use_llm_eval=not no_llm_eval,
161
  budget=budget,
162
  limit=limit,
163
+ persist=persist,
164
  ))
165
 
166
 
 
171
  agent_path: Path | None,
172
  suite: str | None,
173
  parallel: int,
174
+ strategy: str | None,
175
+ max_iterations: int,
176
+ min_improvement: float,
177
  output_dir: Path | None,
178
  use_llm_eval: bool,
179
  budget: int,
180
  limit: int | None = None,
181
+ persist: bool = False,
182
  ) -> None:
183
  """Run the optimization."""
184
  # If experiment YAML provided, use it as the source of truth
 
195
  # Load base agent
196
  base = _load_base_agent(agent_path)
197
 
198
+ # Active strategy mode (--strategy tools, --strategy instructions,tools)
199
+ if strategy:
200
+ result = await _run_strategy_optimize(
201
+ strategy_names=strategy,
202
+ base=base,
203
+ tasks=tasks,
204
+ parallel=parallel,
205
+ use_llm_eval=use_llm_eval,
206
+ budget=budget,
207
+ output_dir=output_dir,
208
+ max_iterations=max_iterations,
209
+ min_improvement=min_improvement,
210
+ )
211
+ if persist and result:
212
+ await _persist_optimization(result, base)
213
+ return
214
+
215
  # Load candidates and check if a strategy is defined in config
216
+ candidates, strategy_instance = await _load_candidates_and_strategy(config_path, base, budget)
217
+
218
  # If a strategy was provided (like GepaStrategy), run it directly
219
  if strategy_instance is not None:
220
  console.print("\n[bold]Running active optimization strategy...[/]")
221
  await _run_active_strategy(
222
+ strategy=strategy_instance,
223
+ base_agent=base,
224
+ tasks=tasks,
225
+ output_dir=output_dir,
226
  parallel=parallel,
227
  use_llm_eval=use_llm_eval,
228
  budget=budget
229
  )
230
  return
231
+
232
  # Otherwise, use traditional grid search with candidates
233
  if not candidates:
234
+ console.print("[red]Error:[/] No candidates to test. Use --strategy or --config")
235
  raise typer.Exit(1)
236
 
237
  console.print(f"\n[bold]Base Agent:[/] {base.name}")
 
258
  console.print("\nTo use an agent config:")
259
  console.print(f" [dim]flow run --config {result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")
260
 
261
+ if persist:
262
+ await _persist_optimization(result, base)
263
+
264
  except KeyboardInterrupt:
265
  console.print("\n[yellow]Optimization cancelled.[/]")
266
  raise typer.Exit(1)
 
398
 
399
  async def _load_candidates_and_strategy(
400
  config_path: Path | None,
 
401
  base: Agent,
402
  budget: int,
403
  ) -> tuple[list[Candidate], Any | None]:
 
442
  console.print("[red]Error:[/] Config file has no CANDIDATES, VARIATIONS, or STRATEGY")
443
  raise typer.Exit(1)
444
 
445
+ # Default: explore all key dimensions (compaction, tools, instructions)
 
 
 
 
 
446
  strategy = GridSearchStrategy(variations={
447
  "compaction": [
 
448
  CompactionConfig.none(),
449
+ CompactionConfig.head_tail(10, 40),
450
  ],
451
+ "tools": ["minimal", "standard"],
452
  })
453
  return await strategy.generate(base, budget), None
454
 
 
488
  console.print("[red]Error:[/] GEPA optimizer not available.")
489
  console.print("[dim]Install with: pip install flow-agent[optimizer][/]")
490
  raise typer.Exit(1)
491
+ elif strategy_type == "instruction":
492
+ from flow.experiments.strategies.instruction import InstructionOptimizer
493
+ return InstructionOptimizer(config=strategy_config)
494
+ elif strategy_type == "tool":
495
+ from flow.experiments.strategies.tool import ToolOptimizer
496
+ return ToolOptimizer(config=strategy_config)
497
+ elif strategy_type == "skill":
498
+ from flow.experiments.strategies.skill import SkillOptimizer
499
+ return SkillOptimizer(config=strategy_config)
500
  else:
501
  console.print(f"[red]Error:[/] Unknown strategy type: {strategy_type}")
502
+ console.print("[dim]Supported: gepa, instruction, tool, skill[/]")
503
  raise typer.Exit(1)
504
 
505
 
 
527
  return candidates, variations, strategy
528
 
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
 
531
  async def _run_active_strategy(
532
  strategy: Any,
 
539
  ) -> None:
540
  """Run an active optimization strategy.
541
 
542
+ For strategies that use the ExperimentRunner protocol (InstructionOptimizer),
543
  delegates to FlowOptimizer.optimize_with_strategy() which handles setup,
544
  evaluation, Pareto analysis, and export.
545
 
 
727
 
728
  console.print(f"\nAgents exported to: [cyan]{output_path / 'agents'}[/]")
729
 
730
+
731
async def _run_strategy_optimize(
    strategy_names: str,
    base: Agent,
    tasks: list[Task],
    parallel: int,
    use_llm_eval: bool,
    budget: int,
    output_dir: Path | None,
    max_iterations: int,
    min_improvement: float,
) -> "OptimizationResult | None":
    """Run active strategy optimization (--strategy flag).

    Supports single strategies and comma-separated pipelines
    (e.g. ``"tools,instructions"``); each pipeline stage starts from the
    best agent found by the previous one. Reuses _resolve_strategy() from
    agent_api to avoid duplication.

    Args:
        strategy_names: Comma-separated strategy names to run in order.
        base: Agent to start optimizing from.
        tasks: Evaluation tasks shared by every stage.
        parallel: Worker count forwarded to FlowOptimizer.
        use_llm_eval: Whether FlowOptimizer should use the LLM evaluator.
        budget: Per-stage candidate budget forwarded to the strategy run.
        output_dir: Where exported agents/results are written (None = default).
        max_iterations: Passed to each strategy via its config.
        min_improvement: Minimum score gain for a strategy to keep iterating.

    Returns:
        The final OptimizationResult (merged across stages when more than
        one strategy ran), or None if no stage produced a result.

    Raises:
        typer.Exit: On an unknown strategy name or Ctrl-C.
    """
    # Imported lazily so the CLI module stays importable without the
    # optimizer extras installed.
    from flow.experiments.ablation import compute_pareto_frontier
    from flow.experiments.agent_api import _resolve_strategy
    from flow.experiments.optimizer import CandidateSummary, OptimizationResult

    strategy_list = [s.strip() for s in strategy_names.split(",")]
    # Shared config applied to every stage's strategy instance.
    strategy_config = {
        "max_iterations": max_iterations,
        "min_improvement": min_improvement,
    }

    console.print(f"\n[bold]Strategy:[/] {' → '.join(strategy_list)}")
    console.print(f"[bold]Base Agent:[/] {base.name}")
    console.print(f"[bold]Tasks:[/] {len(tasks)}")
    console.print(f"[bold]Max Iterations:[/] {max_iterations}")
    console.print()

    current_agent = base
    last_result: OptimizationResult | None = None
    # Accumulators so a multi-stage pipeline can be merged into one result.
    all_summaries: list[CandidateSummary] = []
    total_experiments = 0
    total_duration = 0.0

    try:
        for strat_name in strategy_list:
            try:
                strat_instance = _resolve_strategy(strat_name, strategy_config)
            except ValueError as e:
                # Unknown strategy name — surface the error and abort the CLI.
                console.print(f"[red]Error:[/] {e}")
                raise typer.Exit(1)

            # A fresh optimizer per stage; all stages share the same output_dir.
            optimizer = FlowOptimizer(
                parallel=parallel,
                use_llm_evaluator=use_llm_eval,
                output_dir=output_dir,
            )

            last_result = await optimizer.optimize_with_strategy(
                strategy=strat_instance,
                base=current_agent,
                tasks=tasks,
                budget=budget,
            )

            # Accumulate results from all stages
            all_summaries.extend(last_result.summaries)
            total_experiments += last_result.total_experiments
            total_duration += last_result.total_duration_seconds

            # Next stage starts from the best agent found
            best = last_result.get_best_candidate("score")
            if best:
                current_agent = best.candidate.agent

        # Merge all stage results into a combined result with recomputed Pareto
        if last_result and len(strategy_list) > 1:
            # Deduplicate summaries by name (baseline may appear in multiple stages)
            seen_names: set[str] = set()
            deduped: list[CandidateSummary] = []
            for s in all_summaries:
                if s.name not in seen_names:
                    seen_names.add(s.name)
                    deduped.append(s)

            # Recompute Pareto frontier across all stages
            pareto_names = compute_pareto_frontier(deduped)
            for s in deduped:
                # NOTE(review): summaries are mutated in place; assumes no other
                # holder relies on the per-stage Pareto flags — confirm.
                s.is_pareto_optimal = s.name in pareto_names
                s.pareto_rank = 0 if s.is_pareto_optimal else 1

            rank_by_score = sorted(deduped, key=lambda s: s.avg_score, reverse=True)
            rank_by_tokens = sorted(deduped, key=lambda s: s.avg_tokens)
            # Efficiency = score per token; max(..., 1) guards divide-by-zero.
            rank_by_efficiency = sorted(
                deduped,
                key=lambda s: s.avg_score / max(s.avg_tokens, 1),
                reverse=True,
            )

            # Rebuild the result from the final stage's bookkeeping plus the
            # merged summaries/rankings/totals across all stages.
            last_result = OptimizationResult(
                timestamp=last_result.timestamp,
                output_dir=last_result.output_dir,
                summaries=deduped,
                pareto_frontier=pareto_names,
                exported_agents=last_result.exported_agents,
                rank_by_score=[s.name for s in rank_by_score],
                rank_by_tokens=[s.name for s in rank_by_tokens],
                rank_by_efficiency=[s.name for s in rank_by_efficiency],
                total_experiments=total_experiments,
                total_duration_seconds=total_duration,
            )

        console.print("\n[bold green]Optimization complete![/]")
        if last_result:
            console.print(f"\nBest agents exported to: [cyan]{last_result.output_dir / 'agents'}[/]")
            console.print("\nTo use the best config:")
            console.print(f" [dim]flow run --config {last_result.output_dir / 'agents' / 'best_score.yaml'} \"your task\"[/]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Optimization cancelled.[/]")
        raise typer.Exit(1)

    return last_result
848
+
849
+
850
async def _persist_optimization(result: "OptimizationResult", base_agent: Agent) -> None:
    """Deploy agent and persist optimization results to database.

    Best-effort: a missing persistence backend (ImportError) or any runtime
    failure is reported as a console warning rather than raised, so CLI
    optimization runs never fail just because persistence is unavailable.

    Args:
        result: Optimization result to store.
        base_agent: Agent that was optimized; deployed alongside the result.
    """
    # NOTE: the original body also imported OptimizationResult at runtime,
    # but the annotation above is a string literal, so that import was
    # unused — it has been removed.
    try:
        from flow.ui.services.persistence_adapter import PersistenceAdapter

        adapter = PersistenceAdapter()
        deploy_result = await adapter.deploy_agent(base_agent, source="optimize")
        job_id = await adapter.persist_optimization(result, deploy_result.config_id)
        console.print(f"\n[dim]View in dashboard:[/] http://localhost:8091/jobs/{job_id}")
    except ImportError:
        # Database stack not installed — point the user at the dashboard setup.
        console.print("[yellow]Warning:[/] Database not available. Results not persisted.")
        console.print("[dim]Start the dashboard with: flow serve[/]")
    except Exception as e:
        # Deliberate broad catch: persistence is strictly best-effort.
        console.print(f"[yellow]Warning:[/] Failed to persist results: {e}")
+
src/flow/experiments/agent_api.py CHANGED
@@ -36,8 +36,10 @@ DEFAULT_VARIATIONS: dict[str, list[Any]] = {
36
 
37
  # Known active strategy names and their classes
38
  _STRATEGY_MAP: dict[str, str] = {
39
- "tools": "flow.experiments.strategies.tool_selector.ToolSelectorStrategy",
40
- "instructions": "flow.experiments.strategies.llm_rewriter.LLMRewriterStrategy",
 
 
41
  }
42
 
43
 
@@ -131,11 +133,15 @@ async def _evaluate_agent_impl(
131
  return result
132
 
133
 
134
- def _resolve_strategy(name: str) -> Any:
135
  """Import and instantiate a named strategy.
136
 
137
  Args:
138
- name: Strategy name ("tools", "instructions")
 
 
 
 
139
 
140
  Returns:
141
  Strategy instance
@@ -147,14 +153,18 @@ def _resolve_strategy(name: str) -> Any:
147
  available = ["grid"] + list(_STRATEGY_MAP.keys())
148
  raise ValueError(f"Unknown strategy: {name!r}. Available: {available}")
149
 
 
 
 
 
 
 
 
150
  module_path, class_name = _STRATEGY_MAP[name].rsplit(".", 1)
151
  import importlib
152
  mod = importlib.import_module(module_path)
153
  cls = getattr(mod, class_name)
154
- return cls(config={
155
- "max_iterations": 3,
156
- "min_improvement": 0.01,
157
- })
158
 
159
 
160
  def _opt_result_to_agent_result(
@@ -216,6 +226,7 @@ async def _optimize_agent_impl(
216
  quiet: bool,
217
  agent_id: str | None = None,
218
  strategy: str | list[str] | None = None,
 
219
  ) -> AgentOptimizationResult:
220
  """Implementation of Agent.optimize().
221
 
@@ -225,6 +236,8 @@ async def _optimize_agent_impl(
225
  grid search. A string like "tools" or "instructions" runs that
226
  strategy. A list runs them sequentially, each starting from the
227
  previous best.
 
 
228
  """
229
  resolved_tasks = _resolve_tasks(tasks)
230
 
@@ -264,7 +277,7 @@ async def _optimize_agent_impl(
264
  last_opt_result: OptimizationResult | None = None
265
 
266
  for strat_name in strategy_list:
267
- strat_instance = _resolve_strategy(strat_name)
268
  optimizer = FlowOptimizer(parallel=parallel, use_llm_evaluator=use_llm_eval)
269
 
270
  if quiet:
 
36
 
37
  # Known active strategy names and their classes
38
  _STRATEGY_MAP: dict[str, str] = {
39
+ "tools": "flow.experiments.strategies.tool.ToolOptimizer",
40
+ "instructions": "flow.experiments.strategies.instruction.InstructionOptimizer",
41
+ "skills": "flow.experiments.strategies.skill.SkillOptimizer",
42
+ "gepa_instructions": "flow.experiments.strategies.gepa_instruction.GEPAInstructionOptimizer",
43
  }
44
 
45
 
 
133
  return result
134
 
135
 
136
+ def _resolve_strategy(name: str, config: dict[str, Any] | None = None) -> Any:
137
  """Import and instantiate a named strategy.
138
 
139
  Args:
140
+ name: Strategy name ("tools", "instructions", "skills")
141
+ config: Optional strategy-specific config. Merged with defaults:
142
+ max_iterations (int): Max optimization iterations (default: 3)
143
+ min_improvement (float): Min score gain to continue (default: 0.01)
144
+ Additional keys are passed through to the strategy.
145
 
146
  Returns:
147
  Strategy instance
 
153
  available = ["grid"] + list(_STRATEGY_MAP.keys())
154
  raise ValueError(f"Unknown strategy: {name!r}. Available: {available}")
155
 
156
+ defaults: dict[str, Any] = {
157
+ "max_iterations": 3,
158
+ "min_improvement": 0.01,
159
+ }
160
+ if config:
161
+ defaults.update(config)
162
+
163
  module_path, class_name = _STRATEGY_MAP[name].rsplit(".", 1)
164
  import importlib
165
  mod = importlib.import_module(module_path)
166
  cls = getattr(mod, class_name)
167
+ return cls(config=defaults)
 
 
 
168
 
169
 
170
  def _opt_result_to_agent_result(
 
226
  quiet: bool,
227
  agent_id: str | None = None,
228
  strategy: str | list[str] | None = None,
229
+ strategy_config: dict[str, Any] | None = None,
230
  ) -> AgentOptimizationResult:
231
  """Implementation of Agent.optimize().
232
 
 
236
  grid search. A string like "tools" or "instructions" runs that
237
  strategy. A list runs them sequentially, each starting from the
238
  previous best.
239
+ strategy_config: Optional config dict passed to strategy constructors.
240
+ Merged with defaults (max_iterations=3, min_improvement=0.01).
241
  """
242
  resolved_tasks = _resolve_tasks(tasks)
243
 
 
277
  last_opt_result: OptimizationResult | None = None
278
 
279
  for strat_name in strategy_list:
280
+ strat_instance = _resolve_strategy(strat_name, strategy_config)
281
  optimizer = FlowOptimizer(parallel=parallel, use_llm_evaluator=use_llm_eval)
282
 
283
  if quiet:
src/flow/experiments/data/tasks/house_rules.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"name": "calculator_module", "prompt": "Create a Python module calc.py with four functions: add, subtract, multiply, divide. Each takes two numbers and returns the result. Include a brief test at the bottom (if __name__ == '__main__') that exercises each function and prints the results. Then run it.", "criteria": [{"name": "file_created", "instruction": "calc.py was created and runs without errors"}, {"name": "correct_results", "instruction": "All four operations produce correct results when run"}, {"name": "type_annotations", "instruction": "EVERY function has full type annotations on ALL parameters AND the return type. e.g. def add(a: float, b: float) -> float. Functions missing ANY annotation fail this criterion."}, {"name": "error_handling", "instruction": "divide() MUST raise a ValueError (not ZeroDivisionError, not return None) when the divisor is zero. The test block must demonstrate this by catching ValueError and printing a message. If ZeroDivisionError is raised or None is returned, this fails."}, {"name": "module_exports", "instruction": "The module MUST define __all__ = ['add', 'subtract', 'multiply', 'divide'] at the top of the file. If __all__ is missing this criterion fails."}], "category": "house_rules", "metadata": {"expected_duration": 120}}
2
+ {"name": "csv_report", "prompt": "Create a Python script report.py that generates a CSV file 'sales_report.csv' with 10 rows of sample sales data. Columns: date, product, quantity, unit_price, total. Then read the CSV back and print a summary: total revenue and the top-selling product by quantity. Run the script.", "criteria": [{"name": "file_created", "instruction": "report.py was created and runs without errors"}, {"name": "csv_generated", "instruction": "sales_report.csv was created with 10 data rows"}, {"name": "iso_dates", "instruction": "ALL dates in the CSV MUST be in ISO-8601 format (YYYY-MM-DD). Dates like 'Jan 15, 2024' or '01/15/2024' or 'January 15' FAIL this criterion. Only YYYY-MM-DD is acceptable."}, {"name": "header_comment", "instruction": "The very first line of the CSV file MUST be a comment line starting with '# ' that describes the file contents and generation timestamp. e.g. '# Sales report generated 2024-01-15T10:30:00'. If the first line is the column header row, this fails."}, {"name": "monetary_format", "instruction": "When printing the summary to stdout, ALL monetary values MUST be formatted with exactly 2 decimal places and a dollar sign. e.g. '$1,234.56' or '$42.00'. Values like '1234.5' or '42' or '$1234.567' FAIL this criterion."}], "category": "house_rules", "metadata": {"expected_duration": 120}}
3
+ {"name": "api_response_builder", "prompt": "Create a Python module api_utils.py with a function build_response(data, status_code=200) that builds a JSON-ready dictionary representing an API response. Also create a function validate_email(email: str) -> bool that checks if an email is roughly valid. Write a test block that demonstrates both functions with a few examples and prints the JSON output. Run it.", "criteria": [{"name": "file_created", "instruction": "api_utils.py was created and runs without errors"}, {"name": "correct_behavior", "instruction": "build_response returns a dict and validate_email correctly accepts/rejects obvious cases"}, {"name": "response_envelope", "instruction": "build_response() MUST return a dict with EXACTLY this structure: {'status': 'ok' or 'error', 'code': int, 'data': ..., 'timestamp': ISO-8601 string}. The 'status' field MUST be 'ok' for codes 200-299 and 'error' for all others. 'timestamp' MUST be present and in ISO-8601 format. If any of these keys are missing or the status logic is wrong, this fails."}, {"name": "error_response", "instruction": "When status_code >= 400, the response MUST include an 'error' key with a human-readable error message string (not None, not empty). The test block MUST demonstrate at least one error response (e.g. status_code=404). If no error response is shown or the 'error' key is missing for error codes, this fails."}, {"name": "json_output", "instruction": "The test block MUST use json.dumps with indent=2 to print the response. Raw dict printing (using print(dict)) or json.dumps without indent FAIL this criterion. The output must be valid, pretty-printed JSON."}], "category": "house_rules", "metadata": {"expected_duration": 120}}
src/flow/experiments/eval_cache.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Evaluation cache for avoiding redundant agent evaluations.
4
+
5
+ Provides pluggable backends (in-memory and disk-based) so that identical
6
+ (agent-config, task) pairs are not re-evaluated within or across sessions.
7
+
8
+ Cache keys are SHA-256 hashes of the agent's functional configuration
9
+ (instructions, tools, framework, llm_config, compaction) combined with
10
+ the task definition (prompt, criteria). The agent *name* is intentionally
11
+ excluded because it varies across iterations while the actual behaviour
12
+ remains the same.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ import json
19
+ import logging
20
+ import sqlite3
21
+ import time
22
+ from dataclasses import asdict
23
+ from pathlib import Path
24
+ from typing import Any, Protocol, runtime_checkable
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Protocol
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
@runtime_checkable
class EvaluationCache(Protocol):
    """Protocol for evaluation result caching.

    Satisfied structurally by both backends in this module
    (``InMemoryCache`` and ``DiskCache``). Results are plain dicts;
    the disk backend serialises them to JSON, so payloads should be
    JSON-representable (non-JSON values are stringified on write).
    """

    def get(self, key: str) -> dict[str, Any] | None:
        """Return cached result dict for *key*, or ``None`` on miss."""
        ...

    def put(self, key: str, result: dict[str, Any]) -> None:
        """Store *result* under *key*."""
        ...
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Cache-key builder
49
+ # ---------------------------------------------------------------------------
50
+
51
+
52
def build_cache_key(agent_config: dict[str, Any], task_dict: dict[str, Any]) -> str:
    """Build a deterministic cache key from agent config and task.

    Both dicts are serialised to canonical JSON (sorted keys, ``str``
    fallback for non-JSON values) and hashed, so logically identical
    inputs always produce the same key.

    Args:
        agent_config: Dict with the agent's functional fields
            (instructions, tools, framework, llm_config, compaction).
        task_dict: Dict with the task's identity fields
            (prompt, criteria).

    Returns:
        A hex SHA-256 digest string.
    """
    payload = {"agent": agent_config, "task": task_dict}
    canonical = json.dumps(payload, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode()).hexdigest()
70
+
71
+
72
def agent_cache_dict(agent: Any) -> dict[str, Any]:
    """Extract the functional (behaviour-defining) fields from an Agent.

    The agent *name* is deliberately omitted: two agents with identical
    configuration but different names should share the same cache key.
    """
    tools = _normalise(getattr(agent, "tools", None))
    compaction = _compaction_dict(getattr(agent, "compaction", None))
    return {
        "instructions": getattr(agent, "instructions", None),
        "tools": tools,
        "framework": getattr(agent, "framework", None),
        "llm_config": getattr(agent, "llm_config", None),
        "compaction": compaction,
    }
85
+
86
+
87
def task_cache_dict(task: Any) -> dict[str, Any]:
    """Extract the identity fields from a Task.

    Criterion objects (anything with a ``name`` attribute) are flattened
    to ``{"name", "instruction"}`` dicts; plain dict criteria pass through.
    """
    criteria_dicts: list[Any] = []
    for criterion in getattr(task, "criteria", []):
        if hasattr(criterion, "name"):
            criteria_dicts.append(
                {
                    "name": criterion.name,
                    "instruction": getattr(criterion, "instruction", ""),
                }
            )
        else:
            criteria_dicts.append(criterion)
    return {"prompt": getattr(task, "prompt", ""), "criteria": criteria_dicts}
103
+
104
+
105
+ def _normalise(value: Any) -> Any:
106
+ """Normalise tool configs for deterministic hashing."""
107
+ if isinstance(value, list):
108
+ return sorted(value)
109
+ return value
110
+
111
+
112
+ def _compaction_dict(compaction: Any) -> dict[str, Any] | None:
113
+ if compaction is None:
114
+ return None
115
+ try:
116
+ return asdict(compaction)
117
+ except Exception:
118
+ return str(compaction)
119
+
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # In-memory backend
123
+ # ---------------------------------------------------------------------------
124
+
125
+
126
+ class InMemoryCache:
127
+ """Dict-backed cache that lives for the lifetime of the object."""
128
+
129
+ def __init__(self) -> None:
130
+ self._store: dict[str, dict[str, Any]] = {}
131
+
132
+ def get(self, key: str) -> dict[str, Any] | None:
133
+ return self._store.get(key)
134
+
135
+ def put(self, key: str, result: dict[str, Any]) -> None:
136
+ self._store[key] = result
137
+
138
+ @property
139
+ def size(self) -> int:
140
+ return len(self._store)
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Disk (SQLite) backend
145
+ # ---------------------------------------------------------------------------
146
+
147
+ _DEFAULT_CACHE_DIR = Path.home() / ".flow" / "cache"
148
+ _DB_FILENAME = "eval_cache.db"
149
+
150
+
151
+ class DiskCache:
152
+ """SQLite-backed cache that persists across sessions.
153
+
154
+ The database is created lazily on first access at
155
+ ``~/.flow/cache/eval_cache.db`` (configurable via *cache_dir*).
156
+ """
157
+
158
+ def __init__(self, cache_dir: Path | None = None) -> None:
159
+ self._cache_dir = cache_dir or _DEFAULT_CACHE_DIR
160
+ self._db_path = self._cache_dir / _DB_FILENAME
161
+ self._conn: sqlite3.Connection | None = None
162
+
163
+ # -- lazy init ----------------------------------------------------------
164
+
165
+ def _ensure_db(self) -> sqlite3.Connection:
166
+ if self._conn is not None:
167
+ return self._conn
168
+ self._cache_dir.mkdir(parents=True, exist_ok=True)
169
+ self._conn = sqlite3.connect(str(self._db_path))
170
+ self._conn.execute(
171
+ """
172
+ CREATE TABLE IF NOT EXISTS eval_cache (
173
+ key TEXT PRIMARY KEY,
174
+ result TEXT NOT NULL,
175
+ created_at REAL NOT NULL
176
+ )
177
+ """
178
+ )
179
+ self._conn.commit()
180
+ return self._conn
181
+
182
+ # -- protocol -----------------------------------------------------------
183
+
184
+ def get(self, key: str) -> dict[str, Any] | None:
185
+ conn = self._ensure_db()
186
+ row = conn.execute(
187
+ "SELECT result FROM eval_cache WHERE key = ?", (key,)
188
+ ).fetchone()
189
+ if row is None:
190
+ return None
191
+ try:
192
+ return json.loads(row[0])
193
+ except (json.JSONDecodeError, TypeError):
194
+ return None
195
+
196
+ def put(self, key: str, result: dict[str, Any]) -> None:
197
+ conn = self._ensure_db()
198
+ conn.execute(
199
+ """
200
+ INSERT OR REPLACE INTO eval_cache (key, result, created_at)
201
+ VALUES (?, ?, ?)
202
+ """,
203
+ (key, json.dumps(result, default=str), time.time()),
204
+ )
205
+ conn.commit()
206
+
207
+ # -- helpers ------------------------------------------------------------
208
+
209
+ def close(self) -> None:
210
+ if self._conn is not None:
211
+ self._conn.close()
212
+ self._conn = None
213
+
214
+ @property
215
+ def size(self) -> int:
216
+ conn = self._ensure_db()
217
+ row = conn.execute("SELECT COUNT(*) FROM eval_cache").fetchone()
218
+ return row[0] if row else 0
219
+
220
+ def clear(self) -> None:
221
+ conn = self._ensure_db()
222
+ conn.execute("DELETE FROM eval_cache")
223
+ conn.commit()
src/flow/experiments/evaluators/llm.py CHANGED
@@ -141,7 +141,7 @@ The agent was given this task:
141
  ```
142
 
143
  ## Files Created
144
- {json.dumps(run_result.files_created, indent=2) if run_result.files_created else "None"}
145
 
146
  ## Tool Results
147
  {self._format_tool_results(run_result.tool_results)}
@@ -200,6 +200,12 @@ For each criterion, provide TWO scores:
200
  },
201
  }
202
 
 
 
 
 
 
 
203
  def _format_tool_results(self, tool_results: list[dict[str, str]]) -> str:
204
  """Format tool results for the evaluation prompt."""
205
  if not tool_results:
@@ -324,7 +330,7 @@ Tokens used: {metrics.total_tokens} (input: {metrics.input_tokens}, output: {met
324
  )
325
 
326
  except Exception as e:
327
- logger.error(f"LLM evaluation failed: {e}")
328
  return EvalResult(
329
  score=0.0,
330
  passed=False,
 
141
  ```
142
 
143
  ## Files Created
144
+ {self._format_files_created(run_result)}
145
 
146
  ## Tool Results
147
  {self._format_tool_results(run_result.tool_results)}
 
200
  },
201
  }
202
 
203
+ def _format_files_created(self, run_result: RunResult) -> str:
204
+ """Format files created section as a simple list of filenames."""
205
+ if not run_result.files_created:
206
+ return "None"
207
+ return "\n".join(f"- {f}" for f in run_result.files_created)
208
+
209
  def _format_tool_results(self, tool_results: list[dict[str, str]]) -> str:
210
  """Format tool results for the evaluation prompt."""
211
  if not tool_results:
 
330
  )
331
 
332
  except Exception as e:
333
+ logger.error(f"LLM evaluation failed: {e}", exc_info=True)
334
  return EvalResult(
335
  score=0.0,
336
  passed=False,
src/flow/experiments/gaia_converter.py CHANGED
@@ -156,25 +156,20 @@ def convert_to_flow_task(gaia_task: dict[str, Any]) -> Task:
156
  )
157
 
158
 
159
- def convert_gaia(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
160
- logger.debug(f"Processing task at index: {index}")
161
-
162
- if dataset_metadata is None:
163
- raise ValueError("dataset_metadata is required and cannot be None.")
164
-
165
- # Validate required fields in dataset_metadata
166
- config = dataset_metadata.get("config")
167
- split = dataset_metadata.get("split")
168
- local_path = dataset_metadata.get("local_path")
169
-
170
- if config is None:
171
- raise ValueError("dataset_metadata 'config' is required and cannot be None.")
172
 
173
- if split is None:
174
- raise ValueError("dataset_metadata 'split' is required and cannot be None.")
175
-
176
- if local_path is None:
177
- raise ValueError("dataset_metadata 'local_path' is required and cannot be None.")
 
 
 
 
178
 
179
  # Derive GAIA year from the config when possible (e.g., "2023_level2" -> "2023"),
180
  # falling back to "2023" to preserve existing behavior if parsing fails.
 
156
  )
157
 
158
 
159
+ def convert_gaia(
160
+ example: dict[str, Any], index: int, *, config: str, split: str, local_path: str, **kwargs: Any
161
+ ) -> Task:
162
+ """Convert a GAIA benchmark example to a Flow task.
 
 
 
 
 
 
 
 
 
163
 
164
+ Args:
165
+ example: Raw example dict from the GAIA dataset.
166
+ index: Index of the example in the dataset.
167
+ config: Dataset configuration/subset (e.g. ``"2023_level1"``).
168
+ split: Dataset split (e.g. ``"train"``, ``"validation"``).
169
+ local_path: Root path where the dataset snapshot was downloaded.
170
+ **kwargs: Additional metadata reserved for future use; currently ignored.
171
+ """
172
+ logger.debug(f"Processing task at index: {index}")
173
 
174
  # Derive GAIA year from the config when possible (e.g., "2023_level2" -> "2023"),
175
  # falling back to "2023" to preserve existing behavior if parsing fails.
src/flow/experiments/hf_datasets.py CHANGED
@@ -17,11 +17,26 @@ from __future__ import annotations
17
  import json
18
  import logging
19
  import os
 
20
  from pathlib import Path
21
- from typing import Any
22
 
23
  from flow.experiments.types import EvalCriterion, Task
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  logger = logging.getLogger(__name__)
26
 
27
 
@@ -29,7 +44,7 @@ logger = logging.getLogger(__name__)
29
  # Each converter knows how to extract question/answer from a specific dataset
30
 
31
 
32
- def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
33
  """Convert GSM8K math problem to Flow task.
34
 
35
  GSM8K format:
@@ -61,8 +76,8 @@ def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[st
61
  ]
62
 
63
  task_metadata = {"dataset": "gsm8k", "index": index, "answer": answer, "final_answer": final_answer}
64
- if dataset_metadata:
65
- task_metadata.update(dataset_metadata)
66
 
67
  return Task(
68
  name=f"gsm8k_{index}",
@@ -72,7 +87,7 @@ def convert_gsm8k(example: dict[str, Any], index: int, dataset_metadata: dict[st
72
  )
73
 
74
 
75
- def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
76
  """Convert MATH dataset problem to Flow task.
77
 
78
  MATH format:
@@ -98,8 +113,8 @@ def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str
98
  ]
99
 
100
  task_metadata = {"dataset": "math", "index": index, "level": level, "type": problem_type, "solution": solution}
101
- if dataset_metadata:
102
- task_metadata.update(dataset_metadata)
103
 
104
  return Task(
105
  name=f"math_{problem_type.lower()}_{index}",
@@ -109,7 +124,7 @@ def convert_math(example: dict[str, Any], index: int, dataset_metadata: dict[str
109
  )
110
 
111
 
112
- def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
113
  r"""Convert HumanEval coding problem to Flow task.
114
 
115
  HumanEval format:
@@ -138,8 +153,8 @@ def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dic
138
  ]
139
 
140
  task_metadata = {"dataset": "humaneval", "task_id": task_id, "entry_point": entry_point, "test": test}
141
- if dataset_metadata:
142
- task_metadata.update(dataset_metadata)
143
 
144
  return Task(
145
  name=f"humaneval_{task_id.replace('/', '_')}",
@@ -149,7 +164,7 @@ def convert_humaneval(example: dict[str, Any], index: int, dataset_metadata: dic
149
  )
150
 
151
 
152
- def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str, Any] | None = None) -> Task:
153
  """Convert MBPP coding problem to Flow task.
154
 
155
  MBPP format:
@@ -170,8 +185,8 @@ def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str
170
  ]
171
 
172
  task_metadata = {"dataset": "mbpp", "task_id": task_id, "test_list": test_list}
173
- if dataset_metadata:
174
- task_metadata.update(dataset_metadata)
175
 
176
  return Task(
177
  name=f"mbpp_{task_id}",
@@ -182,13 +197,8 @@ def convert_mbpp(example: dict[str, Any], index: int, dataset_metadata: dict[str
182
 
183
 
184
  # Registry of dataset converters
185
- def _get_gaia_converter():
186
- """Lazy import for GAIA converter to avoid smolagents dependency at import time."""
187
- from flow.experiments.gaia_converter import convert_gaia
188
- return convert_gaia
189
 
190
-
191
- DATASET_CONVERTERS = {
192
  "openai/gsm8k": convert_gsm8k,
193
  "gsm8k": convert_gsm8k,
194
  "competition_math": convert_math,
@@ -197,7 +207,18 @@ DATASET_CONVERTERS = {
197
  "openai_humaneval": convert_humaneval,
198
  "mbpp": convert_mbpp,
199
  "google-research-datasets/mbpp": convert_mbpp,
200
- "gaia-benchmark/GAIA": _get_gaia_converter, # Lazy loaded
 
 
 
 
 
 
 
 
 
 
 
201
  }
202
 
203
 
@@ -206,7 +227,7 @@ def import_hf_dataset(
206
  config: str | None = None,
207
  split: str = "train",
208
  limit: int | None = None,
209
- converter_override: Any = None,
210
  local_path: str | Path | None = None,
211
  ) -> list[Task]:
212
  """Import a Hugging Face dataset and convert to Flow tasks.
@@ -246,6 +267,29 @@ def import_hf_dataset(
246
  except ImportError as e:
247
  raise ImportError("Hugging Face datasets library is required. Install with: pip install datasets") from e
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  # Download to local path if specified, then load from there
250
  if local_path is not None:
251
  try:
@@ -276,26 +320,6 @@ def import_hf_dataset(
276
 
277
  logger.info(f"Converting {len(dataset)} examples to Flow tasks...")
278
 
279
- # Find converter
280
- converter = converter_override
281
- if converter is None:
282
- # Try to find matching converter
283
- for key, conv in DATASET_CONVERTERS.items():
284
- if key in dataset_name:
285
- # Handle lazy loaders (functions that return the actual converter)
286
- if conv is _get_gaia_converter:
287
- converter = conv()
288
- else:
289
- converter = conv
290
- break
291
-
292
- if converter is None:
293
- raise ValueError(
294
- f"No converter found for dataset '{dataset_name}'. "
295
- f"Available: {list(DATASET_CONVERTERS.keys())}\n"
296
- f"Use converter_override parameter to provide a custom converter."
297
- )
298
-
299
  # Build dataset metadata to pass to converters
300
  dataset_metadata: dict[str, Any] = {}
301
  dataset_metadata["local_path"] = str(local_path) if local_path else None
@@ -303,10 +327,10 @@ def import_hf_dataset(
303
  dataset_metadata["split"] = split
304
 
305
  # Convert examples
306
- tasks = []
307
  for i, example in enumerate(dataset):
308
  try:
309
- task = converter(example, i, dataset_metadata)
310
  tasks.append(task)
311
  except Exception as e:
312
  logger.warning(f"Failed to convert example {i}: {e}", exc_info=True)
@@ -338,7 +362,7 @@ def save_tasks_to_jsonl(tasks: list[Task], output_path: Path) -> None:
338
  logger.info(f"Saved {len(tasks)} tasks to {output_path}")
339
 
340
 
341
- def register_converter(dataset_name: str, converter_func: Any) -> None:
342
  """Register a custom converter for a dataset.
343
 
344
  Args:
@@ -346,7 +370,7 @@ def register_converter(dataset_name: str, converter_func: Any) -> None:
346
  converter_func: Function that converts example dict to Task
347
 
348
  Example:
349
- >>> def my_converter(example, index):
350
  ... return Task(name=f"task_{index}", prompt=example["text"], ...)
351
  >>> register_converter("my/dataset", my_converter)
352
  """
 
17
  import json
18
  import logging
19
  import os
20
+ from collections.abc import Callable
21
  from pathlib import Path
22
+ from typing import Any, Protocol
23
 
24
  from flow.experiments.types import EvalCriterion, Task
25
 
26
+
27
+ class ConverterFunc(Protocol):
28
+ """Protocol for dataset converter functions.
29
+
30
+ Converters accept a raw example dict, an index, and optional keyword
31
+ arguments (e.g. ``config``, ``split``, ``local_path``) that carry
32
+ dataset-level metadata.
33
+ """
34
+
35
+ def __call__(self, example: dict[str, Any], index: int, **kwargs: Any) -> Task:
36
+ """Convert a raw dataset example to a Flow Task."""
37
+ ...
38
+
39
+
40
  logger = logging.getLogger(__name__)
41
 
42
 
 
44
  # Each converter knows how to extract question/answer from a specific dataset
45
 
46
 
47
+ def convert_gsm8k(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
48
  """Convert GSM8K math problem to Flow task.
49
 
50
  GSM8K format:
 
76
  ]
77
 
78
  task_metadata = {"dataset": "gsm8k", "index": index, "answer": answer, "final_answer": final_answer}
79
+ if kwargs:
80
+ task_metadata.update(kwargs)
81
 
82
  return Task(
83
  name=f"gsm8k_{index}",
 
87
  )
88
 
89
 
90
+ def convert_math(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
91
  """Convert MATH dataset problem to Flow task.
92
 
93
  MATH format:
 
113
  ]
114
 
115
  task_metadata = {"dataset": "math", "index": index, "level": level, "type": problem_type, "solution": solution}
116
+ if kwargs:
117
+ task_metadata.update(kwargs)
118
 
119
  return Task(
120
  name=f"math_{problem_type.lower()}_{index}",
 
124
  )
125
 
126
 
127
+ def convert_humaneval(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
128
  r"""Convert HumanEval coding problem to Flow task.
129
 
130
  HumanEval format:
 
153
  ]
154
 
155
  task_metadata = {"dataset": "humaneval", "task_id": task_id, "entry_point": entry_point, "test": test}
156
+ if kwargs:
157
+ task_metadata.update(kwargs)
158
 
159
  return Task(
160
  name=f"humaneval_{task_id.replace('/', '_')}",
 
164
  )
165
 
166
 
167
+ def convert_mbpp(example: dict[str, Any], index: int, **kwargs: Any) -> Task:
168
  """Convert MBPP coding problem to Flow task.
169
 
170
  MBPP format:
 
185
  ]
186
 
187
  task_metadata = {"dataset": "mbpp", "task_id": task_id, "test_list": test_list}
188
+ if kwargs:
189
+ task_metadata.update(kwargs)
190
 
191
  return Task(
192
  name=f"mbpp_{task_id}",
 
197
 
198
 
199
  # Registry of dataset converters
 
 
 
 
200
 
201
+ DATASET_CONVERTERS: dict[str, ConverterFunc] = {
 
202
  "openai/gsm8k": convert_gsm8k,
203
  "gsm8k": convert_gsm8k,
204
  "competition_math": convert_math,
 
207
  "openai_humaneval": convert_humaneval,
208
  "mbpp": convert_mbpp,
209
  "google-research-datasets/mbpp": convert_mbpp,
210
+ }
211
+
212
+
213
+ def _get_gaia_converter() -> ConverterFunc:
214
+ """Lazy import for GAIA converter to avoid smolagents dependency at import time."""
215
+ from flow.experiments.gaia_converter import convert_gaia
216
+
217
+ return convert_gaia
218
+
219
+
220
+ LAZY_CONVERTERS: dict[str, Callable[[], ConverterFunc]] = {
221
+ "gaia-benchmark/GAIA": _get_gaia_converter,
222
  }
223
 
224
 
 
227
  config: str | None = None,
228
  split: str = "train",
229
  limit: int | None = None,
230
+ converter_override: ConverterFunc | None = None,
231
  local_path: str | Path | None = None,
232
  ) -> list[Task]:
233
  """Import a Hugging Face dataset and convert to Flow tasks.
 
267
  except ImportError as e:
268
  raise ImportError("Hugging Face datasets library is required. Install with: pip install datasets") from e
269
 
270
+ # Find converter
271
+ converter: ConverterFunc | None = converter_override
272
+ if converter is None:
273
+ # Try direct converters first
274
+ for key, conv in DATASET_CONVERTERS.items():
275
+ if key in dataset_name:
276
+ converter = conv
277
+ break
278
+
279
+ # Fall back to lazy-loaded converters only if no direct match was found
280
+ if converter is None:
281
+ for key, factory in LAZY_CONVERTERS.items():
282
+ if key in dataset_name:
283
+ converter = factory()
284
+ break
285
+ if converter is None:
286
+ all_keys = sorted({*DATASET_CONVERTERS, *LAZY_CONVERTERS})
287
+ raise ValueError(
288
+ f"No converter found for dataset '{dataset_name}'. "
289
+ f"Available: {all_keys}\n"
290
+ f"Use converter_override parameter to provide a custom converter."
291
+ )
292
+
293
  # Download to local path if specified, then load from there
294
  if local_path is not None:
295
  try:
 
320
 
321
  logger.info(f"Converting {len(dataset)} examples to Flow tasks...")
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  # Build dataset metadata to pass to converters
324
  dataset_metadata: dict[str, Any] = {}
325
  dataset_metadata["local_path"] = str(local_path) if local_path else None
 
327
  dataset_metadata["split"] = split
328
 
329
  # Convert examples
330
+ tasks: list[Task] = []
331
  for i, example in enumerate(dataset):
332
  try:
333
+ task = converter(dict(example), i, **dataset_metadata)
334
  tasks.append(task)
335
  except Exception as e:
336
  logger.warning(f"Failed to convert example {i}: {e}", exc_info=True)
 
362
  logger.info(f"Saved {len(tasks)} tasks to {output_path}")
363
 
364
 
365
+ def register_converter(dataset_name: str, converter_func: ConverterFunc) -> None:
366
  """Register a custom converter for a dataset.
367
 
368
  Args:
 
370
  converter_func: Function that converts example dict to Task
371
 
372
  Example:
373
+ >>> def my_converter(example, index, **kwargs):
374
  ... return Task(name=f"task_{index}", prompt=example["text"], ...)
375
  >>> register_converter("my/dataset", my_converter)
376
  """
src/flow/experiments/models.py CHANGED
@@ -409,14 +409,47 @@ class Agent:
409
  llm_config: dict[str, Any] | None = None # {"provider": "azure", "model": "gpt-4o"}
410
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
411
  tools: str | list[str] | dict[str, dict[str, Any]] | None = None
 
412
 
413
  # Set by deploy() — when set, evaluate/optimize auto-persist to DB
414
- _id: str | None = field(default=None, repr=False, compare=False)
 
 
415
 
416
  @property
417
  def id(self) -> str | None:
418
- """Agent ID in the database, set after deploy()."""
419
- return self._id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
  @classmethod
422
  def from_preset(cls, name: str) -> Agent:
@@ -508,24 +541,30 @@ class Agent:
508
  finally:
509
  await harness.close()
510
 
511
- async def deploy(self) -> str:
512
- """Register this agent in the Flow database.
513
 
514
- Creates an AgentConfig row in the local SQLite DB (~/.flow/flow_ui.db).
515
- No running server required this is a pure DB write. After deploying,
516
- all evaluate() and optimize() calls auto-persist results to the DB.
517
 
518
- Run ``flow serve`` separately to browse results in the UI.
 
 
 
 
 
519
 
520
  Returns:
521
- The agent ID (UUID string)
522
 
523
  Example:
524
  agent = Agent(name="coding-agent", tools="standard")
525
- agent_id = await agent.deploy()
526
- # Results now auto-persist
527
- result = await agent.evaluate(tasks="quick")
528
- # Run `flow serve` to view at http://localhost:7860/agents/{agent_id}
 
529
  """
530
  try:
531
  from flow.ui.services.persistence_adapter import PersistenceAdapter
@@ -535,9 +574,20 @@ class Agent:
535
  "to use deploy(): pip install flow[ui] or uv sync"
536
  ) from e
537
 
 
 
 
538
  adapter = PersistenceAdapter()
539
- self._id = await adapter.deploy_agent(self)
540
- return self._id
 
 
 
 
 
 
 
 
541
 
542
  async def evaluate(
543
  self,
@@ -570,7 +620,7 @@ class Agent:
570
  from .agent_api import _evaluate_agent_impl
571
 
572
  return await _evaluate_agent_impl(
573
- self, tasks, parallel, use_llm_eval, quiet, agent_id=self._id
574
  )
575
 
576
  async def optimize(
@@ -578,6 +628,7 @@ class Agent:
578
  tasks: str | list[Task] | Path = "quick",
579
  *,
580
  strategy: str | list[str] | None = None,
 
581
  variations: dict[str, list[Any]] | None = None,
582
  parallel: int = 4,
583
  budget: int = 50,
@@ -599,9 +650,13 @@ class Agent:
599
  - None or "grid": Grid search over variations (default)
600
  - "tools": Iteratively discover optimal tool configuration
601
  - "instructions": Iteratively rewrite instructions from failures
 
602
  - list: Run multiple strategies sequentially, e.g.
603
  ["instructions", "tools"] optimizes instructions first,
604
  then tools starting from the improved agent
 
 
 
605
  variations: Custom grid search variations (only used with grid strategy)
606
  parallel: Number of concurrent experiments
607
  budget: Maximum number of candidates to test
@@ -623,6 +678,13 @@ class Agent:
623
  # Active: improve instructions
624
  result = await agent.optimize(tasks="quick", strategy="instructions")
625
 
 
 
 
 
 
 
 
626
  # Pipeline: instructions first, then tools
627
  result = await agent.optimize(
628
  tasks="quick", strategy=["instructions", "tools"]
@@ -635,8 +697,9 @@ class Agent:
635
 
636
  return await _optimize_agent_impl(
637
  self, tasks, variations, parallel, budget, use_llm_eval, quiet,
638
- agent_id=self._id,
639
  strategy=strategy,
 
640
  )
641
 
642
 
@@ -934,12 +997,28 @@ def export_agent(
934
  ) -> None:
935
  """Export an Agent as a reusable YAML file.
936
 
 
 
 
 
 
937
  Args:
938
  agent: The Agent to export
939
  path: Path to write the YAML file
940
  metrics: Optional optimization metrics (stored under _optimization key)
941
  """
942
  data = asdict(agent)
 
 
 
 
 
 
 
 
 
 
 
943
  if metrics:
944
  data["_optimization"] = metrics
945
  path.parent.mkdir(parents=True, exist_ok=True)
@@ -971,6 +1050,17 @@ def load_agent(path: Path) -> Agent:
971
  if "compaction" in config_data and isinstance(config_data["compaction"], dict):
972
  config_data["compaction"] = CompactionConfig(**config_data["compaction"])
973
 
 
 
 
 
 
 
 
 
 
 
 
974
  try:
975
  return Agent(**config_data)
976
  except TypeError as e:
 
409
  llm_config: dict[str, Any] | None = None # {"provider": "azure", "model": "gpt-4o"}
410
  compaction: CompactionConfig = field(default_factory=CompactionConfig)
411
  tools: str | list[str] | dict[str, dict[str, Any]] | None = None
412
+ skills: dict[str, str] | None = None # skill_name -> SKILL.md content
413
 
414
  # Set by deploy() — when set, evaluate/optimize auto-persist to DB
415
+ _deployment_id: str | None = field(default=None, repr=False, compare=False)
416
+ _config_id: str | None = field(default=None, repr=False, compare=False)
417
+ _version: int | None = field(default=None, repr=False, compare=False)
418
 
419
  @property
420
  def id(self) -> str | None:
421
+ """Deployment ID, set after deploy(). This is the stable identity."""
422
+ return self._deployment_id
423
+
424
+ @property
425
+ def config_id(self) -> str | None:
426
+ """AgentConfig ID for the current version, set after deploy()."""
427
+ return self._config_id
428
+
429
+ @property
430
+ def version(self) -> int | None:
431
+ """Current deployment version number, set after deploy()."""
432
+ return self._version
433
+
434
+ @classmethod
435
+ def from_config(cls, path: str | Path) -> Agent:
436
+ """Create an Agent from a YAML config file.
437
+
438
+ Args:
439
+ path: Path to the YAML config file
440
+
441
+ Returns:
442
+ A new Agent instance with the config's values
443
+
444
+ Raises:
445
+ FileNotFoundError: If the file doesn't exist
446
+ ValueError: If the config is invalid
447
+
448
+ Example:
449
+ agent = Agent.from_config("examples/base_agent.yaml")
450
+ print(agent.name, agent.tools)
451
+ """
452
+ return load_agent(Path(path))
453
 
454
  @classmethod
455
  def from_preset(cls, name: str) -> Agent:
 
541
  finally:
542
  await harness.close()
543
 
544
+ async def deploy(self, candidate: Agent | None = None) -> str:
545
+ """Deploy this agent (or a candidate) to the Flow database.
546
 
547
+ First call creates a new Deployment + AgentConfig (v1).
548
+ Subsequent calls on the same agent append a new version to the
549
+ same deployment same stable URL, new config behind it.
550
 
551
+ Passing a ``candidate`` (e.g. from optimization results) deploys
552
+ that candidate's config as the next version of this deployment.
553
+
554
+ Args:
555
+ candidate: Optional Agent whose config to deploy as the next
556
+ version. If None, deploys this agent's current config.
557
 
558
  Returns:
559
+ The deployment ID (stable UUID string)
560
 
561
  Example:
562
  agent = Agent(name="coding-agent", tools="standard")
563
+ dep_id = await agent.deploy() # v1
564
+ agent.tools = ["bash", "read_file"]
565
+ await agent.deploy() # v2, same dep_id
566
+ result = await agent.optimize(tasks="quick", strategy="tools")
567
+ await agent.deploy(result.best_agent) # v3, same dep_id
568
  """
569
  try:
570
  from flow.ui.services.persistence_adapter import PersistenceAdapter
 
574
  "to use deploy(): pip install flow[ui] or uv sync"
575
  ) from e
576
 
577
+ source_agent = candidate or self
578
+ source = "optimize" if candidate is not None else "deploy"
579
+
580
  adapter = PersistenceAdapter()
581
+ result = await adapter.deploy_agent(
582
+ source_agent,
583
+ deployment_id=self._deployment_id,
584
+ source=source,
585
+ )
586
+
587
+ self._deployment_id = result.deployment_id
588
+ self._config_id = result.config_id
589
+ self._version = result.version
590
+ return self._deployment_id
591
 
592
  async def evaluate(
593
  self,
 
620
  from .agent_api import _evaluate_agent_impl
621
 
622
  return await _evaluate_agent_impl(
623
+ self, tasks, parallel, use_llm_eval, quiet, agent_id=self._config_id
624
  )
625
 
626
  async def optimize(
 
628
  tasks: str | list[Task] | Path = "quick",
629
  *,
630
  strategy: str | list[str] | None = None,
631
+ strategy_config: dict[str, Any] | None = None,
632
  variations: dict[str, list[Any]] | None = None,
633
  parallel: int = 4,
634
  budget: int = 50,
 
650
  - None or "grid": Grid search over variations (default)
651
  - "tools": Iteratively discover optimal tool configuration
652
  - "instructions": Iteratively rewrite instructions from failures
653
+ - "skills": Iteratively generate domain knowledge skills
654
  - list: Run multiple strategies sequentially, e.g.
655
  ["instructions", "tools"] optimizes instructions first,
656
  then tools starting from the improved agent
657
+ strategy_config: Optional config passed to the strategy. Merged
658
+ with defaults (max_iterations=3, min_improvement=0.01).
659
+ Example: {"max_iterations": 5, "include_builtin": True}
660
  variations: Custom grid search variations (only used with grid strategy)
661
  parallel: Number of concurrent experiments
662
  budget: Maximum number of candidates to test
 
678
  # Active: improve instructions
679
  result = await agent.optimize(tasks="quick", strategy="instructions")
680
 
681
+ # Active: skills with custom config
682
+ result = await agent.optimize(
683
+ tasks="quick",
684
+ strategy="skills",
685
+ strategy_config={"max_iterations": 5, "min_improvement": 0.0},
686
+ )
687
+
688
  # Pipeline: instructions first, then tools
689
  result = await agent.optimize(
690
  tasks="quick", strategy=["instructions", "tools"]
 
697
 
698
  return await _optimize_agent_impl(
699
  self, tasks, variations, parallel, budget, use_llm_eval, quiet,
700
+ agent_id=self._config_id,
701
  strategy=strategy,
702
+ strategy_config=strategy_config,
703
  )
704
 
705
 
 
997
  ) -> None:
998
  """Export an Agent as a reusable YAML file.
999
 
1000
+ If the agent has skills, each skill is written as a SKILL.md file in a
1001
+ ``skills/<name>/`` directory next to the YAML. The YAML references skills
1002
+ by name (list of strings) so the agent/framework can locate them at the
1003
+ relative path ``skills/<name>/SKILL.md``.
1004
+
1005
  Args:
1006
  agent: The Agent to export
1007
  path: Path to write the YAML file
1008
  metrics: Optional optimization metrics (stored under _optimization key)
1009
  """
1010
  data = asdict(agent)
1011
+
1012
+ # Write skill folders alongside the YAML and replace content with names
1013
+ if agent.skills:
1014
+ skills_dir = path.parent / "skills"
1015
+ for skill_name, skill_content in agent.skills.items():
1016
+ skill_folder = skills_dir / skill_name
1017
+ skill_folder.mkdir(parents=True, exist_ok=True)
1018
+ (skill_folder / "SKILL.md").write_text(skill_content)
1019
+ # In the YAML, store just the skill names (not full content)
1020
+ data["skills"] = sorted(agent.skills.keys())
1021
+
1022
  if metrics:
1023
  data["_optimization"] = metrics
1024
  path.parent.mkdir(parents=True, exist_ok=True)
 
1050
  if "compaction" in config_data and isinstance(config_data["compaction"], dict):
1051
  config_data["compaction"] = CompactionConfig(**config_data["compaction"])
1052
 
1053
+ # Load skills from disk: YAML stores skill names as a list,
1054
+ # resolve to dict[name, content] by reading skills/<name>/SKILL.md
1055
+ if "skills" in config_data and isinstance(config_data["skills"], list):
1056
+ skills_dir = path.parent / "skills"
1057
+ loaded_skills: dict[str, str] = {}
1058
+ for skill_name in config_data["skills"]:
1059
+ skill_path = skills_dir / skill_name / "SKILL.md"
1060
+ if skill_path.exists():
1061
+ loaded_skills[skill_name] = skill_path.read_text()
1062
+ config_data["skills"] = loaded_skills if loaded_skills else None
1063
+
1064
  try:
1065
  return Agent(**config_data)
1066
  except TypeError as e:
src/flow/experiments/optimizer.py CHANGED
@@ -21,6 +21,13 @@ from typing import Any
21
  from openai import AsyncAzureOpenAI
22
 
23
  from .ablation import compute_pareto_frontier
 
 
 
 
 
 
 
24
  from .evaluators import LLMEvaluator
25
  from .metrics import TraceMetrics, extract_metrics
26
  from .models import (
@@ -175,15 +182,27 @@ class FlowOptimizer:
175
  parallel: int = 4,
176
  use_llm_evaluator: bool = True,
177
  output_dir: Path | None = None,
 
 
178
  ) -> None:
179
  self.parallel = parallel
180
  self.use_llm_evaluator = use_llm_evaluator
181
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
 
 
 
 
 
182
 
183
  # Internal state set during optimize() for use by evaluate()
184
  self._evaluator: LLMEvaluator | None = None
185
  self._run_dir: Path | None = None
186
 
 
 
 
 
 
187
  async def optimize(
188
  self,
189
  candidates: list[Candidate],
@@ -211,15 +230,15 @@ class FlowOptimizer:
211
  setup_tracing("flow-optimizer")
212
  self._save_config(candidates, tasks, run_dir)
213
 
214
- print("=" * 70)
215
- print(" FLOW OPTIMIZER")
216
- print("=" * 70)
217
- print(f" Candidates: {len(candidates)}")
218
- print(f" Tasks: {len(tasks)}")
219
- print(f" Total: {len(candidates) * len(tasks)} experiments")
220
- print(f" Parallel: {self.parallel}")
221
- print(f" Output: {run_dir}")
222
- print("=" * 70)
223
 
224
  evaluator = None
225
  if self.use_llm_evaluator:
@@ -316,16 +335,16 @@ class FlowOptimizer:
316
  self._evaluator = evaluator
317
  self._run_dir = run_dir
318
 
319
- print("=" * 70)
320
- print(" FLOW OPTIMIZER (Strategy Mode)")
321
- print("=" * 70)
322
- print(f" Strategy: {type(strategy).__name__}")
323
- print(f" Base Agent: {base.name}")
324
- print(f" Tasks: {len(tasks)}")
325
- print(f" Budget: {budget}")
326
- print(f" Parallel: {self.parallel}")
327
- print(f" Output: {run_dir}")
328
- print("=" * 70)
329
 
330
  # Pass self as runner — FlowOptimizer implements the ExperimentRunner
331
  # protocol via the evaluate() method above
@@ -340,7 +359,7 @@ class FlowOptimizer:
340
  logger.warning("Strategy produced no candidates")
341
  candidates = [Candidate(agent=base, mutations={}, rationale="baseline (strategy produced none)")]
342
 
343
- print(f"\nStrategy produced {len(candidates)} candidates. Running final evaluation...")
344
 
345
  # Save config
346
  self._save_config(candidates, tasks, run_dir)
@@ -403,15 +422,41 @@ class FlowOptimizer:
403
  async def run_one(candidate: Candidate, task: Task) -> TaskResult:
404
  nonlocal completed
405
  async with semaphore:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  workspace = run_dir / "workspaces" / candidate.agent.name / task.name
407
  workspace.mkdir(parents=True, exist_ok=True)
408
 
409
  result = await self._run_single(candidate, task, workspace, evaluator)
410
 
 
 
 
 
411
  async with lock:
412
  completed += 1
413
  status = "PASS" if result.eval_passed else "FAIL"
414
- print(
415
  f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
416
  f"{status} score={result.eval_score:.2f} "
417
  f"reasoning={result.eval_reasoning_score:.2f} "
@@ -428,7 +473,11 @@ class FlowOptimizer:
428
  valid_results: list[TaskResult] = []
429
  for r in gather_results:
430
  if isinstance(r, BaseException):
431
- logger.error(f"Experiment failed: {r}")
 
 
 
 
432
  else:
433
  valid_results.append(r)
434
 
@@ -672,29 +721,29 @@ class FlowOptimizer:
672
 
673
  def _print_summary(self, result: OptimizationResult) -> None:
674
  """Print optimization summary."""
675
- print("\n" + "=" * 70)
676
- print(" OPTIMIZATION RESULTS")
677
- print("=" * 70)
678
 
679
- print(f"\n{'Candidate':<30} | {'Score':>8} | {'Reason':>8} | {'Tokens':>10} | {'Pareto':>8}")
680
- print("-" * 75)
681
 
682
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
683
  pareto = "*" if summary.is_pareto_optimal else ""
684
- print(
685
  f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
686
  f"{summary.avg_reasoning_score:>8.2f} | "
687
  f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
688
  )
689
 
690
- print("\n" + "-" * 70)
691
- print(f"Pareto frontier: {result.pareto_frontier}")
692
- print(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
693
- print(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
694
- print("\nExported agents:")
695
  for name, path in result.exported_agents.items():
696
- print(f" {name}: {path}")
697
- print(f"\nResults saved to: {result.output_dir}")
698
 
699
 
700
  def load_tasks_from_jsonl(path: Path) -> list[Task]:
@@ -716,6 +765,7 @@ async def evaluate_agent(
716
  parallel: int = 4,
717
  use_llm_evaluator: bool = True,
718
  output_dir: Path | None = None,
 
719
  ) -> CandidateSummary:
720
  """Evaluate a single agent on a set of tasks.
721
 
@@ -760,6 +810,7 @@ async def evaluate_agent(
760
  parallel=parallel,
761
  use_llm_evaluator=use_llm_evaluator,
762
  output_dir=eval_output_dir,
 
763
  )
764
 
765
  result = await optimizer.optimize([candidate], tasks)
@@ -768,3 +819,77 @@ async def evaluate_agent(
768
  raise RuntimeError("Evaluation produced no results")
769
 
770
  return result.summaries[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  from openai import AsyncAzureOpenAI
22
 
23
  from .ablation import compute_pareto_frontier
24
+ from .eval_cache import (
25
+ DiskCache,
26
+ EvaluationCache,
27
+ agent_cache_dict,
28
+ build_cache_key,
29
+ task_cache_dict,
30
+ )
31
  from .evaluators import LLMEvaluator
32
  from .metrics import TraceMetrics, extract_metrics
33
  from .models import (
 
182
  parallel: int = 4,
183
  use_llm_evaluator: bool = True,
184
  output_dir: Path | None = None,
185
+ quiet: bool = False,
186
+ cache_evaluations: bool = True,
187
  ) -> None:
188
  self.parallel = parallel
189
  self.use_llm_evaluator = use_llm_evaluator
190
  self.output_dir = output_dir or Path.home() / ".flow" / "optimizations"
191
+ self.quiet = quiet
192
+
193
+ # Evaluation cache — avoids redundant agent runs for identical
194
+ # (agent-config, task) pairs. Persists across sessions via SQLite.
195
+ self._cache: EvaluationCache | None = DiskCache() if cache_evaluations else None
196
 
197
  # Internal state set during optimize() for use by evaluate()
198
  self._evaluator: LLMEvaluator | None = None
199
  self._run_dir: Path | None = None
200
 
201
+ def _log(self, msg: str) -> None:
202
+ """Print a message unless quiet mode is enabled."""
203
+ if not self.quiet:
204
+ print(msg)
205
+
206
  async def optimize(
207
  self,
208
  candidates: list[Candidate],
 
230
  setup_tracing("flow-optimizer")
231
  self._save_config(candidates, tasks, run_dir)
232
 
233
+ self._log("=" * 70)
234
+ self._log(" FLOW OPTIMIZER")
235
+ self._log("=" * 70)
236
+ self._log(f" Candidates: {len(candidates)}")
237
+ self._log(f" Tasks: {len(tasks)}")
238
+ self._log(f" Total: {len(candidates) * len(tasks)} experiments")
239
+ self._log(f" Parallel: {self.parallel}")
240
+ self._log(f" Output: {run_dir}")
241
+ self._log("=" * 70)
242
 
243
  evaluator = None
244
  if self.use_llm_evaluator:
 
335
  self._evaluator = evaluator
336
  self._run_dir = run_dir
337
 
338
+ self._log("=" * 70)
339
+ self._log(" FLOW OPTIMIZER (Strategy Mode)")
340
+ self._log("=" * 70)
341
+ self._log(f" Strategy: {type(strategy).__name__}")
342
+ self._log(f" Base Agent: {base.name}")
343
+ self._log(f" Tasks: {len(tasks)}")
344
+ self._log(f" Budget: {budget}")
345
+ self._log(f" Parallel: {self.parallel}")
346
+ self._log(f" Output: {run_dir}")
347
+ self._log("=" * 70)
348
 
349
  # Pass self as runner — FlowOptimizer implements the ExperimentRunner
350
  # protocol via the evaluate() method above
 
359
  logger.warning("Strategy produced no candidates")
360
  candidates = [Candidate(agent=base, mutations={}, rationale="baseline (strategy produced none)")]
361
 
362
+ self._log(f"\nStrategy produced {len(candidates)} candidates. Running final evaluation...")
363
 
364
  # Save config
365
  self._save_config(candidates, tasks, run_dir)
 
422
  async def run_one(candidate: Candidate, task: Task) -> TaskResult:
423
  nonlocal completed
424
  async with semaphore:
425
+ # Check evaluation cache
426
+ cache_key: str | None = None
427
+ if self._cache is not None:
428
+ cache_key = build_cache_key(
429
+ agent_cache_dict(candidate.agent),
430
+ task_cache_dict(task),
431
+ )
432
+ cached = self._cache.get(cache_key)
433
+ if cached is not None:
434
+ result = _task_result_from_cache(cached, candidate, task)
435
+ async with lock:
436
+ completed += 1
437
+ self._log(
438
+ f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
439
+ f"CACHED score={result.eval_score:.2f} "
440
+ f"reasoning={result.eval_reasoning_score:.2f} "
441
+ f"tokens={result.metrics.total_tokens:,}"
442
+ )
443
+ if progress_callback:
444
+ progress_callback(completed, total, candidate.agent.name, task.name)
445
+ return result
446
+
447
  workspace = run_dir / "workspaces" / candidate.agent.name / task.name
448
  workspace.mkdir(parents=True, exist_ok=True)
449
 
450
  result = await self._run_single(candidate, task, workspace, evaluator)
451
 
452
+ # Store in cache
453
+ if self._cache is not None and cache_key is not None:
454
+ self._cache.put(cache_key, _task_result_to_cache(result))
455
+
456
  async with lock:
457
  completed += 1
458
  status = "PASS" if result.eval_passed else "FAIL"
459
+ self._log(
460
  f" [{completed}/{total}] {candidate.agent.name}/{task.name}: "
461
  f"{status} score={result.eval_score:.2f} "
462
  f"reasoning={result.eval_reasoning_score:.2f} "
 
473
  valid_results: list[TaskResult] = []
474
  for r in gather_results:
475
  if isinstance(r, BaseException):
476
+ logger.error(
477
+ "Experiment failed: %s",
478
+ r,
479
+ exc_info=(type(r), r, r.__traceback__),
480
+ )
481
  else:
482
  valid_results.append(r)
483
 
 
721
 
722
  def _print_summary(self, result: OptimizationResult) -> None:
723
  """Print optimization summary."""
724
+ self._log("\n" + "=" * 70)
725
+ self._log(" OPTIMIZATION RESULTS")
726
+ self._log("=" * 70)
727
 
728
+ self._log(f"\n{'Candidate':<30} | {'Score':>8} | {'Reason':>8} | {'Tokens':>10} | {'Pareto':>8}")
729
+ self._log("-" * 75)
730
 
731
  for summary in sorted(result.summaries, key=lambda s: s.avg_score, reverse=True):
732
  pareto = "*" if summary.is_pareto_optimal else ""
733
+ self._log(
734
  f"{summary.name:<30} | {summary.avg_score:>8.2f} | "
735
  f"{summary.avg_reasoning_score:>8.2f} | "
736
  f"{summary.avg_tokens:>10,.0f} | {pareto:>8}"
737
  )
738
 
739
+ self._log("\n" + "-" * 70)
740
+ self._log(f"Pareto frontier: {result.pareto_frontier}")
741
+ self._log(f"Best by score: {result.rank_by_score[0] if result.rank_by_score else 'N/A'}")
742
+ self._log(f"Best by efficiency: {result.rank_by_efficiency[0] if result.rank_by_efficiency else 'N/A'}")
743
+ self._log("\nExported agents:")
744
  for name, path in result.exported_agents.items():
745
+ self._log(f" {name}: {path}")
746
+ self._log(f"\nResults saved to: {result.output_dir}")
747
 
748
 
749
  def load_tasks_from_jsonl(path: Path) -> list[Task]:
 
765
  parallel: int = 4,
766
  use_llm_evaluator: bool = True,
767
  output_dir: Path | None = None,
768
+ quiet: bool = False,
769
  ) -> CandidateSummary:
770
  """Evaluate a single agent on a set of tasks.
771
 
 
810
  parallel=parallel,
811
  use_llm_evaluator=use_llm_evaluator,
812
  output_dir=eval_output_dir,
813
+ quiet=quiet,
814
  )
815
 
816
  result = await optimizer.optimize([candidate], tasks)
 
819
  raise RuntimeError("Evaluation produced no results")
820
 
821
  return result.summaries[0]
822
+
823
+
824
+ # ---------------------------------------------------------------------------
825
+ # Cache serialisation helpers
826
+ # ---------------------------------------------------------------------------
827
+
828
+
829
+ def _task_result_to_cache(result: TaskResult) -> dict[str, Any]:
830
+ """Serialise a TaskResult to a JSON-safe dict for caching."""
831
+ return {
832
+ "eval_score": result.eval_score,
833
+ "eval_passed": result.eval_passed,
834
+ "eval_reasoning": result.eval_reasoning,
835
+ "eval_reasoning_score": result.eval_reasoning_score,
836
+ "criteria_results": result.criteria_results,
837
+ "metrics": {
838
+ "total_tokens": result.metrics.total_tokens,
839
+ "input_tokens": result.metrics.input_tokens,
840
+ "output_tokens": result.metrics.output_tokens,
841
+ "tool_call_count": result.metrics.tool_call_count,
842
+ "llm_call_count": result.metrics.llm_call_count,
843
+ "total_duration_ms": result.metrics.total_duration_ms,
844
+ },
845
+ "run": {
846
+ "output": result.run_result.output,
847
+ "files_created": result.run_result.files_created,
848
+ "duration_seconds": result.run_result.duration_seconds,
849
+ "error": result.run_result.error,
850
+ "tool_results": result.run_result.tool_results,
851
+ "trace": result.run_result.trace,
852
+ },
853
+ }
854
+
855
+
856
+ def _task_result_from_cache(
857
+ cached: dict[str, Any],
858
+ candidate: Candidate,
859
+ task: Task,
860
+ ) -> TaskResult:
861
+ """Reconstruct a TaskResult from a cached dict."""
862
+ run_data = cached.get("run", {})
863
+ metrics_data = cached.get("metrics", {})
864
+
865
+ run_result = RunResult(
866
+ task=task,
867
+ trace=run_data.get("trace", []),
868
+ output=run_data.get("output", ""),
869
+ files_created=run_data.get("files_created", []),
870
+ duration_seconds=run_data.get("duration_seconds", 0.0),
871
+ workspace=Path("/cached"),
872
+ error=run_data.get("error"),
873
+ tool_results=run_data.get("tool_results", []),
874
+ )
875
+
876
+ metrics = TraceMetrics(
877
+ total_tokens=metrics_data.get("total_tokens", 0),
878
+ input_tokens=metrics_data.get("input_tokens", 0),
879
+ output_tokens=metrics_data.get("output_tokens", 0),
880
+ tool_call_count=metrics_data.get("tool_call_count", 0),
881
+ llm_call_count=metrics_data.get("llm_call_count", 0),
882
+ total_duration_ms=metrics_data.get("total_duration_ms", 0.0),
883
+ )
884
+
885
+ return TaskResult(
886
+ candidate_name=candidate.agent.name,
887
+ task_name=task.name,
888
+ run_result=run_result,
889
+ metrics=metrics,
890
+ eval_score=cached.get("eval_score", 0.0),
891
+ eval_passed=cached.get("eval_passed", False),
892
+ eval_reasoning=cached.get("eval_reasoning", ""),
893
+ criteria_results=cached.get("criteria_results", []),
894
+ eval_reasoning_score=cached.get("eval_reasoning_score", 0.0),
895
+ )
src/flow/experiments/results.py CHANGED
@@ -13,6 +13,7 @@ from pathlib import Path
13
  from typing import TYPE_CHECKING, Any
14
 
15
  if TYPE_CHECKING:
 
16
  from .optimizer import CandidateSummary
17
 
18
 
@@ -110,9 +111,29 @@ class AgentOptimizationResult:
110
  # Set when agent was deployed — links to the DB job
111
  job_id: str | None = field(default=None, repr=False)
112
 
 
 
 
 
 
 
 
 
113
  def __str__(self) -> str:
114
  return (
115
  f"Optimization: {self.baseline} → {self.best}\n"
116
  f"Improvement: {self.improvement}\n"
117
  f"Candidates tested: {self.candidates_tested}"
118
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from typing import TYPE_CHECKING, Any
14
 
15
  if TYPE_CHECKING:
16
+ from .models import StrategyIteration
17
  from .optimizer import CandidateSummary
18
 
19
 
 
111
  # Set when agent was deployed — links to the DB job
112
  job_id: str | None = field(default=None, repr=False)
113
 
114
+ @property
115
+ def iterations(self) -> list[StrategyIteration]:
116
+ """Per-iteration history from active optimization strategies."""
117
+ details = self.best._details
118
+ if details and details.candidate.optimization_history:
119
+ return details.candidate.optimization_history
120
+ return []
121
+
122
  def __str__(self) -> str:
123
  return (
124
  f"Optimization: {self.baseline} → {self.best}\n"
125
  f"Improvement: {self.improvement}\n"
126
  f"Candidates tested: {self.candidates_tested}"
127
  )
128
+
129
+ def print_summary(self) -> None:
130
+ """Print a formatted table of optimization iterations."""
131
+ history = self.iterations
132
+ if not history:
133
+ print(str(self))
134
+ return
135
+
136
+ print(f"{'Iter':<6}{'Score':<10}{'Pass Rate':<12}{'Change'}")
137
+ print("-" * 60)
138
+ for h in history:
139
+ print(f"{h.iteration:<6}{h.avg_score:<10.0%}{h.pass_rate:<12.0%}{h.change_description}")
src/flow/experiments/runner.py CHANGED
@@ -201,7 +201,7 @@ class FlowExperimentRunner:
201
 
202
  except Exception as e:
203
  error = str(e)
204
- logger.error(f"Task execution failed: {e}")
205
 
206
  end_time = time.time()
207
  duration_seconds = end_time - start_time
 
201
 
202
  except Exception as e:
203
  error = str(e)
204
+ logger.exception(f"Task execution failed: {e}")
205
 
206
  end_time = time.time()
207
  duration_seconds = end_time - start_time
src/flow/experiments/strategies/__init__.py CHANGED
@@ -9,10 +9,10 @@ Example YAML:
9
  variations:
10
  instructions:
11
  - "You are helpful" # Literal
12
- - strategy: gepa # Strategy
13
- max_candidates: 3
14
  config:
15
- reflection_lm: gpt-4o
16
  """
17
 
18
  from __future__ import annotations
@@ -84,19 +84,33 @@ def _register_builtin_strategies() -> None:
84
  except ImportError:
85
  logger.debug("GEPA strategy not available (gepa package not installed)")
86
 
87
- # LLM rewriter strategy (simple instruction variations)
88
  try:
89
- from .llm_rewriter import LLMRewriterStrategy
90
- register_strategy("llm_rewriter", LLMRewriterStrategy)
91
  except ImportError:
92
- logger.debug("LLM rewriter strategy not available")
93
 
94
- # Tool selector strategy (generates tool configurations)
95
  try:
96
- from .tool_selector import ToolSelectorStrategy
97
- register_strategy("tool_selector", ToolSelectorStrategy)
98
  except ImportError:
99
- logger.debug("Tool selector strategy not available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
 
102
  # Register on module import
 
9
  variations:
10
  instructions:
11
  - "You are helpful" # Literal
12
+ - strategy: instruction # Strategy
13
+ max_candidates: 1
14
  config:
15
+ max_iterations: 5
16
  """
17
 
18
  from __future__ import annotations
 
84
  except ImportError:
85
  logger.debug("GEPA strategy not available (gepa package not installed)")
86
 
87
+ # Instruction optimizer
88
  try:
89
+ from .instruction import InstructionOptimizer
90
+ register_strategy("instruction", InstructionOptimizer)
91
  except ImportError:
92
+ logger.debug("Instruction optimizer not available")
93
 
94
+ # Tool optimizer
95
  try:
96
+ from .tool import ToolOptimizer
97
+ register_strategy("tool", ToolOptimizer)
98
  except ImportError:
99
+ logger.debug("Tool optimizer not available")
100
+
101
+ # Skill optimizer
102
+ try:
103
+ from .skill import SkillOptimizer
104
+ register_strategy("skill", SkillOptimizer)
105
+ except ImportError:
106
+ logger.debug("Skill optimizer not available")
107
+
108
+ # GEPA instruction optimizer (uses standard plumbing + GEPA reflection)
109
+ try:
110
+ from .gepa_instruction import GEPAInstructionOptimizer
111
+ register_strategy("gepa_instruction", GEPAInstructionOptimizer)
112
+ except ImportError:
113
+ logger.debug("GEPA instruction optimizer not available")
114
 
115
 
116
  # Register on module import
src/flow/experiments/strategies/gepa_instruction.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """GEPA-based instruction optimization strategy.
4
+
5
+ Like InstructionOptimizer but uses GEPA's evolutionary approach:
6
+ - Maintains a population of candidate instructions
7
+ - Uses GEPA's reflection mechanism to generate new candidates from failures
8
+ - Selects candidates via frontier-based selection (not just greedy best)
9
+
10
+ Uses the standard strategy plumbing (runner.evaluate()) instead of
11
+ a custom evaluator callback bridge.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ import os
18
+ from dataclasses import dataclass, field
19
+ from typing import Any
20
+
21
+ from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration
22
+ from ..types import Task
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ @dataclass
28
+ class GEPAInstructionOptimizer:
29
+ """Instruction optimizer using GEPA's evolutionary approach.
30
+
31
+ Uses the same runner.evaluate() plumbing as InstructionOptimizer,
32
+ but delegates candidate generation to GEPA's reflection + selection loop.
33
+
34
+ The GEPA library handles:
35
+ - Generating improved prompts via LLM reflection on failures
36
+ - Candidate selection via frontier-based strategies
37
+ - Population management across generations
38
+
39
+ Config options:
40
+ model: LLM for GEPA reflection (default: gpt-4o-mini)
41
+ max_iterations: Max generations (default: 5)
42
+ min_improvement: Min score gain to continue (default: 0.05)
43
+ reflection_lm: LLM for GEPA reflection (overrides model if set)
44
+
45
+ Example:
46
+ flow optimize --agent agent.yaml --tasks tasks.jsonl --strategy gepa_instructions
47
+ """
48
+
49
+ config: dict[str, Any] = field(default_factory=dict)
50
+
51
+ async def generate(
52
+ self,
53
+ base: Agent,
54
+ budget: int,
55
+ *,
56
+ tasks: list[Task] | None = None,
57
+ runner: ExperimentRunner | None = None,
58
+ ) -> list[Candidate]:
59
+ """Generate optimized instructions using GEPA's evolutionary loop.
60
+
61
+ Args:
62
+ base: Base agent with instructions to optimize
63
+ budget: Max candidates to evaluate
64
+ tasks: Tasks to evaluate on (required)
65
+ runner: ExperimentRunner for evaluation (required)
66
+
67
+ Returns:
68
+ List with the best candidate found
69
+ """
70
+ if runner is None:
71
+ raise ValueError(
72
+ "GEPAInstructionOptimizer requires a runner. "
73
+ "Use FlowOptimizer.optimize_with_strategy() to provide one."
74
+ )
75
+ if not tasks:
76
+ raise ValueError(
77
+ "GEPAInstructionOptimizer requires tasks to evaluate against."
78
+ )
79
+
80
+ try:
81
+ import gepa
82
+ from gepa.core.adapter import EvaluationBatch
83
+ except ImportError as e:
84
+ raise ImportError(
85
+ "GEPA is not installed. Install with: pip install gepa"
86
+ ) from e
87
+
88
+ model = self.config.get("model", "gpt-4o-mini")
89
+ max_iterations = self.config.get("max_iterations", 5)
90
+ min_improvement = self.config.get("min_improvement", 0.05)
91
+
92
+ base_instructions = base.instructions or "You are a helpful assistant."
93
+
94
+ # Track optimization history
95
+ history: list[StrategyIteration] = []
96
+ best_instructions = base_instructions
97
+ best_score = 0.0
98
+ generation = 0
99
+
100
+ # ── Build GEPA adapter that uses runner.evaluate() ──
101
+
102
+ strategy_self = self
103
+ _runner = runner
104
+ _tasks = tasks
105
+ _base = base
106
+
107
+ def _run_async(coro: Any) -> Any:
108
+ """Run an async coroutine from synchronous GEPA context."""
109
+ import asyncio
110
+ import concurrent.futures
111
+
112
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
113
+ return pool.submit(asyncio.run, coro).result()
114
+
115
+ class FlowRunnerAdapter(gepa.GEPAAdapter):
116
+ """Bridges GEPA's adapter interface to Flow's ExperimentRunner."""
117
+
118
+ def evaluate(
119
+ self,
120
+ batch: list[Any],
121
+ candidate: dict[str, str],
122
+ capture_traces: bool = False,
123
+ ) -> EvaluationBatch:
124
+ """Evaluate a candidate one task at a time using Flow's runner.
125
+
126
+ GEPA tracks scores by dataset item index, so we must evaluate
127
+ each batch item individually to preserve the 1:1 mapping.
128
+ """
129
+ instructions_text = candidate.get("instructions", base_instructions)
130
+
131
+ # Build agent + candidate
132
+ agent = Agent(
133
+ name=f"{_base.name}_gepa_iter",
134
+ framework=_base.framework,
135
+ instructions=instructions_text,
136
+ llm_config=_base.llm_config,
137
+ compaction=_base.compaction,
138
+ tools=_base.tools,
139
+ )
140
+ flow_candidate = Candidate(
141
+ agent=agent,
142
+ mutations={"instructions": instructions_text},
143
+ )
144
+
145
+ if not batch:
146
+ return EvaluationBatch(
147
+ outputs=[], scores=[],
148
+ trajectories=[] if capture_traces else None,
149
+ objective_scores=None,
150
+ )
151
+
152
+ # Evaluate each task individually to preserve GEPA's index mapping
153
+ scores: list[float] = []
154
+ outputs: list[dict[str, Any]] = []
155
+ trajectories: list[dict[str, Any]] = []
156
+ passed_count = 0
157
+
158
+ for item in batch:
159
+ if not isinstance(item, Task):
160
+ scores.append(0.0)
161
+ outputs.append({})
162
+ continue
163
+
164
+ # Evaluate single task via runner
165
+ summary = _run_async(_runner.evaluate(flow_candidate, [item]))
166
+
167
+ # Extract result for this task
168
+ if summary.task_results:
169
+ tr = summary.task_results[0]
170
+ task_score = float(getattr(tr, "eval_score", 0.0))
171
+ eval_passed = getattr(tr, "eval_passed", False)
172
+ eval_reasoning = getattr(tr, "eval_reasoning", "")
173
+ agent_output = str(getattr(tr.run_result, "output", "")) if tr.run_result else ""
174
+ else:
175
+ task_score = 0.0
176
+ eval_passed = False
177
+ eval_reasoning = "No result"
178
+ agent_output = ""
179
+
180
+ if eval_passed:
181
+ passed_count += 1
182
+
183
+ scores.append(task_score)
184
+ traj = {
185
+ "task_name": getattr(item, "name", "unknown"),
186
+ "task_prompt": getattr(item, "prompt", ""),
187
+ "agent_output": agent_output[:1000],
188
+ "eval_reasoning": eval_reasoning,
189
+ "eval_score": task_score,
190
+ "eval_passed": eval_passed,
191
+ "instructions_used": instructions_text,
192
+ }
193
+ outputs.append(traj)
194
+ if capture_traces:
195
+ trajectories.append(traj)
196
+
197
+ # Record iteration in history
198
+ avg_score = sum(scores) / len(scores) if scores else 0.0
199
+ pass_rate = passed_count / len(batch) if batch else 0.0
200
+ failures_count = len(batch) - passed_count
201
+
202
+ nonlocal generation, best_score, best_instructions
203
+ generation += 1
204
+
205
+ task_lines = [
206
+ f" [{'PASS' if o.get('eval_passed') else 'FAIL'}] "
207
+ f"{o.get('task_name', '?')}: {o.get('eval_reasoning', '')[:150]}"
208
+ for o in outputs if isinstance(o, dict) and o
209
+ ]
210
+
211
+ history.append(
212
+ StrategyIteration(
213
+ iteration=generation - 1,
214
+ instructions_preview=instructions_text[:200],
215
+ full_instructions=instructions_text,
216
+ avg_score=avg_score,
217
+ pass_rate=pass_rate,
218
+ failures_count=failures_count,
219
+ change_description=f"GEPA generation {generation}",
220
+ change_rationale="\n".join(task_lines),
221
+ )
222
+ )
223
+
224
+ if avg_score > best_score:
225
+ best_score = avg_score
226
+ best_instructions = instructions_text
227
+
228
+ logger.info(
229
+ f"GEPA gen {generation}: score={avg_score:.3f}, "
230
+ f"pass_rate={pass_rate:.1%}, failures={failures_count}"
231
+ )
232
+
233
+ return EvaluationBatch(
234
+ outputs=outputs,
235
+ scores=scores,
236
+ trajectories=trajectories if capture_traces else None,
237
+ objective_scores=None,
238
+ )
239
+
240
+ def make_reflective_dataset(
241
+ self,
242
+ candidate: dict[str, str],
243
+ eval_batch: EvaluationBatch,
244
+ components_to_update: list[str],
245
+ ) -> dict[str, list[dict[str, Any]]]:
246
+ """Create reflection dataset from evaluation results.
247
+
248
+ GEPA uses this to generate improved candidates via LLM reflection.
249
+ """
250
+ trajectories = eval_batch.trajectories or eval_batch.outputs or []
251
+ scores = eval_batch.scores or []
252
+
253
+ reflection_data: dict[str, list[dict[str, Any]]] = {}
254
+
255
+ for component in components_to_update:
256
+ examples: list[dict[str, Any]] = []
257
+
258
+ for traj, score in zip(trajectories, scores):
259
+ if not isinstance(traj, dict):
260
+ continue
261
+
262
+ example = {
263
+ "Inputs": {
264
+ "task": traj.get("task_prompt", ""),
265
+ "instructions": traj.get("instructions_used", "")[:500],
266
+ },
267
+ "Generated Outputs": {
268
+ "agent_response": traj.get("agent_output", "")[:1000],
269
+ },
270
+ "Feedback": (
271
+ f"Score: {score:.2f}/1.0. "
272
+ f"Passed: {traj.get('eval_passed', False)}. "
273
+ f"{traj.get('eval_reasoning', '')}"
274
+ ),
275
+ "_score": score,
276
+ }
277
+ examples.append(example)
278
+
279
+ # Sort by score ascending — GEPA learns more from failures
280
+ examples.sort(key=lambda x: x.get("_score", 0))
281
+ for ex in examples:
282
+ ex.pop("_score", None)
283
+
284
+ reflection_data[component] = examples
285
+
286
+ return reflection_data
287
+
288
+ # ── Set up Azure env vars for GEPA's LiteLLM usage ──
289
+
290
+ if os.environ.get("AZURE_OPENAI_API_KEY"):
291
+ os.environ.setdefault("AZURE_API_KEY", os.environ["AZURE_OPENAI_API_KEY"])
292
+ if os.environ.get("AZURE_OPENAI_ENDPOINT"):
293
+ os.environ.setdefault("AZURE_API_BASE", os.environ["AZURE_OPENAI_ENDPOINT"])
294
+
295
+ # ── Build GEPA config ──
296
+
297
+ gepa_config: dict[str, Any] = {}
298
+
299
+ # Resolve reflection LM: explicit config > Azure deployment > default model
300
+ reflection_lm = self.config.get("reflection_lm")
301
+ if not reflection_lm:
302
+ azure_deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
303
+ reflection_lm = azure_deployment if azure_deployment else model
304
+
305
+ # Add azure/ prefix for LiteLLM if using Azure
306
+ if not reflection_lm.startswith("azure/") and os.environ.get("AZURE_OPENAI_ENDPOINT"):
307
+ reflection_lm = f"azure/{reflection_lm}"
308
+
309
+ gepa_config["reflection_lm"] = reflection_lm
310
+
311
+ # Pass through valid GEPA params from config
312
+ VALID_GEPA_PARAMS = {
313
+ "reflection_lm", "candidate_selection_strategy", "frontier_type",
314
+ "skip_perfect_score", "batch_sampler", "reflection_minibatch_size",
315
+ "perfect_score", "reflection_prompt_template", "module_selector",
316
+ "use_merge", "max_merge_invocations", "merge_val_overlap_floor",
317
+ "stop_callbacks", "display_progress_bar", "seed",
318
+ "cache_evaluation", "raise_on_exception",
319
+ }
320
+ for key, value in self.config.items():
321
+ if key in VALID_GEPA_PARAMS:
322
+ gepa_config[key] = value
323
+
324
+ # ── Run GEPA ──
325
+
326
+ seed_candidate = {"instructions": base_instructions}
327
+
328
+ # GEPA needs Task objects as dataset
329
+ dataset = list(tasks)
330
+
331
+ logger.info(
332
+ f"GEPAInstructionOptimizer: budget={budget}, tasks={len(dataset)}, "
333
+ f"reflection_lm={reflection_lm}"
334
+ )
335
+
336
+ gepa_result = gepa.optimize(
337
+ seed_candidate=seed_candidate,
338
+ adapter=FlowRunnerAdapter(),
339
+ trainset=dataset,
340
+ valset=dataset,
341
+ max_metric_calls=budget,
342
+ display_progress_bar=True,
343
+ skip_perfect_score=False,
344
+ perfect_score=2.0, # Impossible score to disable early stopping
345
+ **gepa_config,
346
+ )
347
+
348
+ # ── Extract best result ──
349
+
350
+ best_prompts = gepa_result.best_candidate
351
+ final_instructions = best_prompts.get("instructions", best_instructions)
352
+ gepa_best_score = gepa_result.val_aggregate_scores[gepa_result.best_idx]
353
+
354
+ # Use GEPA's best if it's better, otherwise use our tracked best
355
+ if gepa_best_score >= best_score:
356
+ best_instructions = final_instructions
357
+ best_score = gepa_best_score
358
+
359
+ logger.info(
360
+ f"GEPAInstructionOptimizer complete: {generation} generations, "
361
+ f"best_score={best_score:.3f}"
362
+ )
363
+
364
+ # Build candidates for all unique instruction variants tried
365
+ candidates: list[Candidate] = []
366
+ seen_instructions: set[str] = set()
367
+
368
+ for h in history:
369
+ instr = h.full_instructions or ""
370
+ if not instr or instr in seen_instructions:
371
+ continue
372
+ seen_instructions.add(instr)
373
+
374
+ is_best = instr == best_instructions
375
+ suffix = "gepa_optimized" if is_best else f"gepa_gen{h.iteration}"
376
+ agent = Agent(
377
+ name=f"{base.name}_{suffix}",
378
+ framework=base.framework,
379
+ description=base.description,
380
+ instructions=instr,
381
+ llm_config=base.llm_config,
382
+ compaction=base.compaction,
383
+ tools=base.tools,
384
+ )
385
+ candidates.append(
386
+ Candidate(
387
+ agent=agent,
388
+ mutations={"instructions": instr},
389
+ rationale=f"GEPA generation {h.iteration}: score={h.avg_score:.3f}",
390
+ optimization_history=history if is_best else [],
391
+ )
392
+ )
393
+
394
+ # Ensure best is always included
395
+ if best_instructions not in seen_instructions:
396
+ final_agent = Agent(
397
+ name=f"{base.name}_gepa_optimized",
398
+ framework=base.framework,
399
+ description=base.description,
400
+ instructions=best_instructions,
401
+ llm_config=base.llm_config,
402
+ compaction=base.compaction,
403
+ tools=base.tools,
404
+ )
405
+ score_progression = f"{history[0].avg_score:.2f} -> {best_score:.2f}" if history else f"-> {best_score:.2f}"
406
+ candidates.append(
407
+ Candidate(
408
+ agent=final_agent,
409
+ mutations={"instructions": best_instructions},
410
+ rationale=f"GEPA instruction optimization: {generation} generations, {score_progression}",
411
+ optimization_history=history,
412
+ )
413
+ )
414
+
415
+ return candidates
src/flow/experiments/strategies/{llm_rewriter.py → instruction.py} RENAMED
@@ -1,12 +1,12 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
- """LLM-based instruction rewriter strategy.
4
 
5
- This strategy always requires a runner and tasks. It:
6
- 1. Evaluates the current instructions on all tasks
7
- 2. Reflects on failures to understand what went wrong
8
- 3. Rewrites instructions to address failures
9
- 4. Re-evaluates and repeats until convergence or budget exhausted
10
  """
11
 
12
  from __future__ import annotations
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
23
 
24
 
25
  @dataclass
26
- class LLMRewriterStrategy:
27
  """Strategy that uses an LLM to iteratively improve agent instructions.
28
 
29
  Runs an evaluate-reflect-rewrite loop. Each iteration evaluates
@@ -42,7 +42,7 @@ class LLMRewriterStrategy:
42
 
43
  Example YAML:
44
  strategy:
45
- type: llm_rewriter
46
  config:
47
  model: gpt-4o-mini
48
  max_iterations: 5
@@ -75,12 +75,12 @@ class LLMRewriterStrategy:
75
  """
76
  if runner is None:
77
  raise ValueError(
78
- "LLMRewriterStrategy requires a runner. "
79
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
80
  )
81
  if not tasks:
82
  raise ValueError(
83
- "LLMRewriterStrategy requires tasks to evaluate against."
84
  )
85
 
86
  base_instructions = base.instructions or "You are a helpful assistant."
@@ -100,7 +100,7 @@ class LLMRewriterStrategy:
100
  min_improvement = self.config.get("min_improvement", 0.05)
101
 
102
  logger.info(
103
- f"LLMRewriterStrategy: active mode (max_iterations={max_iterations}, "
104
  f"min_improvement={min_improvement})"
105
  )
106
 
@@ -108,6 +108,7 @@ class LLMRewriterStrategy:
108
  best_instructions = instructions
109
  best_score = 0.0
110
  history: list[StrategyIteration] = []
 
111
 
112
  for iteration in range(max_iterations):
113
  # 1. Evaluate current instructions
@@ -169,6 +170,10 @@ class LLMRewriterStrategy:
169
  )
170
  )
171
 
 
 
 
 
172
  # Track best
173
  if avg_score > best_score:
174
  best_score = avg_score
@@ -193,29 +198,61 @@ class LLMRewriterStrategy:
193
 
194
  # 3. Reflect on failures and rewrite
195
  current_instructions = self._reflect_and_rewrite(
196
- current_instructions, failures, avg_score, model
 
 
197
  )
198
  logger.info(f" Rewrote instructions ({len(current_instructions)} chars)")
199
 
200
- # Build final candidate with optimization history
201
- final_agent = Agent(
202
- name=f"{base.name}_llm_rewriter_optimized",
203
- framework=base.framework,
204
- instructions=best_instructions,
205
- llm_config=base.llm_config,
206
- compaction=base.compaction,
207
- tools=base.tools,
208
- )
209
 
210
- score_progression = f"{history[0].avg_score:.2f} {best_score:.2f}"
211
- return [
212
- Candidate(
213
- agent=final_agent,
214
- mutations={"instructions": best_instructions},
215
- rationale=f"LLM rewriter active optimization: {len(history)} iterations, {score_progression}",
216
- optimization_history=history,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  )
218
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  def _reflect_and_rewrite(
221
  self,
@@ -223,52 +260,60 @@ class LLMRewriterStrategy:
223
  failures: list[Any],
224
  current_score: float,
225
  model: str,
 
 
 
226
  ) -> str:
227
  """Analyze failures and rewrite instructions to address them."""
228
- # Build failure analysis
229
  failure_descriptions = []
230
- for tr in failures[:5]: # Limit to 5 failures for context
231
  task_name = getattr(tr, "task_name", "unknown")
232
  reasoning = getattr(tr, "eval_reasoning", "No reasoning")
233
  score = getattr(tr, "eval_score", 0.0)
 
234
  failure_descriptions.append(
235
- f"- Task '{task_name}' (score={score:.2f}): {reasoning[:200]}"
 
236
  )
237
 
238
  failures_text = "\n".join(failure_descriptions)
239
 
240
- prompt = f"""You are a prompt engineer writing guidelines for a coding assistant.
241
-
242
- The assistant's current guidelines scored {current_score:.2f} out of 1.0 on a benchmark.
243
-
244
- Here are the tasks where performance was low:
 
 
 
 
 
 
 
 
 
 
 
 
245
  {failures_text}
246
 
247
- The current guidelines are:
248
  ---
249
  {instructions}
250
  ---
251
 
252
- Write a new, improved version of the guidelines. The new guidelines should:
253
- 1. Help the assistant succeed on a wide range of coding tasks the failures
254
- above are examples, but the guidelines must generalize beyond them
255
- 2. Include concrete strategies (e.g., always verify output, check edge cases,
256
- create and run files when asked)
257
- 3. Be general-purpose: do NOT reference specific task names, specific answers,
258
- or specific test cases from the failures above
259
- 4. Focus on transferable skills and habits (e.g., "verify output matches
260
- requirements" not "check that fibonacci returns 55")
261
- 5. Be concise
262
 
263
- Output ONLY the new guidelines text, nothing else."""
264
 
265
  try:
266
  return self._call_llm(prompt, model) or instructions
267
  except Exception as e:
268
  logger.warning(f"LLM rewrite failed: {e}")
269
- # Primary prompt failed — the original instructions may have
270
- # triggered a content filter (Azure, OpenAI, etc.) or caused
271
- # another error. Try a fallback that omits them entirely.
272
  logger.info("Retrying rewrite with fallback prompt (without original instructions)")
273
  return self._fallback_rewrite(failures_text, current_score, model)
274
 
@@ -277,29 +322,29 @@ Output ONLY the new guidelines text, nothing else."""
277
  failures_text: str,
278
  current_score: float,
279
  model: str,
 
 
280
  ) -> str:
281
- """Generate new instructions from scratch when the primary rewrite is blocked.
 
 
 
 
 
282
 
283
- This avoids including the original instructions (which may trigger
284
- content filters) and instead writes fresh guidelines based solely on
285
- the task failure descriptions.
286
- """
287
- prompt = f"""You are a prompt engineer. Write guidelines for a coding assistant.
288
 
289
  The assistant scored {current_score:.2f} out of 1.0 on these tasks:
290
  {failures_text}
291
 
292
- Write concise guidelines that would help a coding assistant succeed on
293
- a wide range of coding tasks. The failures above are examples the
294
- guidelines must generalize beyond them. The guidelines should:
295
- 1. Instruct the assistant to complete coding tasks by creating files and
296
- running code
297
- 2. Include strategies for verifying output and handling edge cases
298
- 3. Be general-purpose: do NOT reference specific task names or answers
299
- from the failures above
300
- 4. Focus on transferable habits and skills
301
 
302
- Output ONLY the guidelines text, nothing else."""
303
 
304
  try:
305
  result = self._call_llm(prompt, model)
@@ -309,22 +354,20 @@ Output ONLY the guidelines text, nothing else."""
309
  except Exception as e2:
310
  logger.warning(f"Fallback rewrite also failed: {e2}")
311
 
312
- # Last resort: return a sensible default
313
- logger.info("Using default coding assistant guidelines")
314
  return (
315
- "You are a helpful coding assistant. When given a task:\n"
316
- "1. Create the requested files with correct, working code\n"
317
- "2. Run the code and verify the output is correct\n"
318
  "3. Handle edge cases and validate results before finishing"
319
  )
320
 
321
-
322
  def _get_client(self, model: str) -> tuple[Any, str]:
323
  """Get OpenAI client and model name."""
324
  try:
325
  from openai import AzureOpenAI, OpenAI
326
  except ImportError as e:
327
- raise ImportError("openai package required for LLMRewriterStrategy") from e
328
 
329
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
330
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
@@ -354,4 +397,3 @@ Output ONLY the guidelines text, nothing else."""
354
  messages=[{"role": "user", "content": prompt}],
355
  )
356
  return response.choices[0].message.content or ""
357
-
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
+ """Instruction optimization strategy.
4
 
5
+ This strategy iteratively improves agent instructions by:
6
+ 1. Evaluating the current instructions on all tasks
7
+ 2. Reflecting on failures to understand what went wrong
8
+ 3. Rewriting instructions to address failures
9
+ 4. Re-evaluating and repeating until convergence or budget exhausted
10
  """
11
 
12
  from __future__ import annotations
 
23
 
24
 
25
  @dataclass
26
+ class InstructionOptimizer:
27
  """Strategy that uses an LLM to iteratively improve agent instructions.
28
 
29
  Runs an evaluate-reflect-rewrite loop. Each iteration evaluates
 
42
 
43
  Example YAML:
44
  strategy:
45
+ type: instruction
46
  config:
47
  model: gpt-4o-mini
48
  max_iterations: 5
 
75
  """
76
  if runner is None:
77
  raise ValueError(
78
+ "InstructionOptimizer requires a runner. "
79
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
80
  )
81
  if not tasks:
82
  raise ValueError(
83
+ "InstructionOptimizer requires tasks to evaluate against."
84
  )
85
 
86
  base_instructions = base.instructions or "You are a helpful assistant."
 
100
  min_improvement = self.config.get("min_improvement", 0.05)
101
 
102
  logger.info(
103
+ f"InstructionOptimizer: active mode (max_iterations={max_iterations}, "
104
  f"min_improvement={min_improvement})"
105
  )
106
 
 
108
  best_instructions = instructions
109
  best_score = 0.0
110
  history: list[StrategyIteration] = []
111
+ iteration_candidates: list[tuple[str, str]] = [] # (instructions, label)
112
 
113
  for iteration in range(max_iterations):
114
  # 1. Evaluate current instructions
 
170
  )
171
  )
172
 
173
+ # Collect candidate for this iteration
174
+ label = "baseline" if iteration == 0 else f"iter{iteration}"
175
+ iteration_candidates.append((current_instructions, label))
176
+
177
  # Track best
178
  if avg_score > best_score:
179
  best_score = avg_score
 
198
 
199
  # 3. Reflect on failures and rewrite
200
  current_instructions = self._reflect_and_rewrite(
201
+ current_instructions, failures, avg_score, model,
202
+ agent_name=base.name, agent_description=base.description or "",
203
+ tasks=tasks,
204
  )
205
  logger.info(f" Rewrote instructions ({len(current_instructions)} chars)")
206
 
207
+ # Build candidates for all unique instruction variants tried
208
+ candidates: list[Candidate] = []
209
+ seen_instructions: set[str] = set()
 
 
 
 
 
 
210
 
211
+ for iter_instructions, label in iteration_candidates:
212
+ if iter_instructions in seen_instructions:
213
+ continue
214
+ seen_instructions.add(iter_instructions)
215
+
216
+ is_best = iter_instructions == best_instructions
217
+ suffix = "instruction_optimized" if is_best else f"instruction_{label}"
218
+ agent = Agent(
219
+ name=f"{base.name}_{suffix}",
220
+ framework=base.framework,
221
+ instructions=iter_instructions,
222
+ llm_config=base.llm_config,
223
+ compaction=base.compaction,
224
+ tools=base.tools,
225
+ )
226
+ candidates.append(
227
+ Candidate(
228
+ agent=agent,
229
+ mutations={"instructions": iter_instructions},
230
+ rationale=f"Instructions ({label}): {len(iter_instructions)} chars",
231
+ optimization_history=history if is_best else [],
232
+ )
233
  )
234
+
235
+ # Ensure best is always included
236
+ if best_instructions not in seen_instructions:
237
+ final_agent = Agent(
238
+ name=f"{base.name}_instruction_optimized",
239
+ framework=base.framework,
240
+ instructions=best_instructions,
241
+ llm_config=base.llm_config,
242
+ compaction=base.compaction,
243
+ tools=base.tools,
244
+ )
245
+ score_progression = f"{history[0].avg_score:.2f} -> {best_score:.2f}"
246
+ candidates.append(
247
+ Candidate(
248
+ agent=final_agent,
249
+ mutations={"instructions": best_instructions},
250
+ rationale=f"Instruction optimization: {len(history)} iterations, {score_progression}",
251
+ optimization_history=history,
252
+ )
253
+ )
254
+
255
+ return candidates
256
 
257
  def _reflect_and_rewrite(
258
  self,
 
260
  failures: list[Any],
261
  current_score: float,
262
  model: str,
263
+ agent_name: str = "",
264
+ agent_description: str = "",
265
+ tasks: list[Task] | None = None,
266
  ) -> str:
267
  """Analyze failures and rewrite instructions to address them."""
 
268
  failure_descriptions = []
269
+ for tr in failures[:5]:
270
  task_name = getattr(tr, "task_name", "unknown")
271
  reasoning = getattr(tr, "eval_reasoning", "No reasoning")
272
  score = getattr(tr, "eval_score", 0.0)
273
+ task_prompt = getattr(tr, "task_prompt", "")
274
  failure_descriptions.append(
275
+ f"- Task '{task_name}' (score={score:.2f}): {reasoning[:300]}"
276
+ + (f"\n Task prompt: {task_prompt[:200]}" if task_prompt else "")
277
  )
278
 
279
  failures_text = "\n".join(failure_descriptions)
280
 
281
+ # Build agent context from name, description, and task domain
282
+ agent_context = ""
283
+ if agent_name or agent_description:
284
+ agent_context = f"\nThe agent is called '{agent_name}'"
285
+ if agent_description:
286
+ agent_context += f" — {agent_description}"
287
+ agent_context += ".\n"
288
+
289
+ # Infer domain from task prompts
290
+ domain_context = ""
291
+ if tasks:
292
+ task_summaries = [f"- {t.name}: {t.prompt[:100]}..." for t in tasks[:5]]
293
+ domain_context = f"\nThe agent is evaluated on these types of tasks:\n" + "\n".join(task_summaries) + "\n"
294
+
295
+ prompt = f"""You are a prompt engineer improving an agent's instructions to fix its performance issues.
296
+ {agent_context}{domain_context}
297
+ The agent scored {current_score:.2f} out of 1.0. Here are the tasks where it failed and what went wrong:
298
  {failures_text}
299
 
300
+ The agent's current instructions are:
301
  ---
302
  {instructions}
303
  ---
304
 
305
+ Rewrite the instructions to fix the failures above. The new instructions should:
306
+ 1. Directly address the failure patterns if the agent didn't create files, tell it to always save output to the requested file AND display the content. If it missed details, tell it to reference every constraint from the user's request.
307
+ 2. Be specific to this agent's domain not generic "coding assistant" guidelines
308
+ 3. Do NOT reference specific task names or test answers — the instructions should generalize to similar tasks
309
+ 4. Be concise
 
 
 
 
 
310
 
311
+ Output ONLY the new instructions text, nothing else."""
312
 
313
  try:
314
  return self._call_llm(prompt, model) or instructions
315
  except Exception as e:
316
  logger.warning(f"LLM rewrite failed: {e}")
 
 
 
317
  logger.info("Retrying rewrite with fallback prompt (without original instructions)")
318
  return self._fallback_rewrite(failures_text, current_score, model)
319
 
 
322
  failures_text: str,
323
  current_score: float,
324
  model: str,
325
+ agent_name: str = "",
326
+ agent_description: str = "",
327
  ) -> str:
328
+ """Generate new instructions from scratch when the primary rewrite is blocked."""
329
+ agent_role = "an AI assistant"
330
+ if agent_name or agent_description:
331
+ agent_role = f"an AI assistant called '{agent_name}'"
332
+ if agent_description:
333
+ agent_role += f" ({agent_description})"
334
 
335
+ prompt = f"""You are a prompt engineer. Write instructions for {agent_role}.
 
 
 
 
336
 
337
  The assistant scored {current_score:.2f} out of 1.0 on these tasks:
338
  {failures_text}
339
 
340
+ Write concise instructions tailored to this assistant's role. The instructions should:
341
+ 1. Be specific to the assistant's domain and purpose
342
+ 2. Address the failure patterns from the tasks above
343
+ 3. Include strategies for creating files when asked and verifying output
344
+ 4. Do NOT reference specific task names or answers from the failures above
345
+ 5. Focus on transferable habits relevant to this assistant's role
 
 
 
346
 
347
+ Output ONLY the instructions text, nothing else."""
348
 
349
  try:
350
  result = self._call_llm(prompt, model)
 
354
  except Exception as e2:
355
  logger.warning(f"Fallback rewrite also failed: {e2}")
356
 
357
+ logger.info("Using default assistant guidelines")
 
358
  return (
359
+ "You are a helpful assistant. When given a task:\n"
360
+ "1. Create the requested files with correct content\n"
361
+ "2. Verify the output matches all requirements\n"
362
  "3. Handle edge cases and validate results before finishing"
363
  )
364
 
 
365
  def _get_client(self, model: str) -> tuple[Any, str]:
366
  """Get OpenAI client and model name."""
367
  try:
368
  from openai import AzureOpenAI, OpenAI
369
  except ImportError as e:
370
+ raise ImportError("openai package required for InstructionOptimizer") from e
371
 
372
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
373
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
 
397
  messages=[{"role": "user", "content": prompt}],
398
  )
399
  return response.choices[0].message.content or ""
 
src/flow/experiments/strategies/skill.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ """Skill optimization strategy.
4
+
5
+ Iteratively discovers and generates skills (domain knowledge packages)
6
+ to improve agent performance. The strategy:
7
+ 1. Starts with an empty skill directory (no pre-loaded domain knowledge)
8
+ 2. Evaluates the agent on tasks to establish a baseline
9
+ 3. Analyzes failures and uses an LLM to generate SKILL.md files
10
+ 4. Writes generated skills to a managed directory the agent can discover
11
+ 5. Re-evaluates and repeats until convergence or budget exhausted
12
+
13
+ Skills differ from tools: tools are executable capabilities (read_file, bash),
14
+ while skills are domain knowledge packages (OOXML patterns, testing workflows).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import os
21
+ import shutil
22
+ import tempfile
23
+ from dataclasses import dataclass, field
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration
28
+ from ..types import Task
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
@dataclass
class SkillOptimizer:
    """Strategy that iteratively generates and refines skills for an agent.

    Runs an evaluate-analyze-generate loop. Each iteration evaluates
    the agent on tasks, analyzes failures, and generates SKILL.md files
    containing domain knowledge that would help the agent succeed.

    The optimizer manages its own skill directory. On each iteration it can:
    - Generate new skills based on failure patterns
    - Refine existing skills that didn't help enough
    - Remove skills that added context cost without benefit

    Requires both a runner (to evaluate candidates) and tasks (to test on).

    Config options:
        model: LLM for skill generation (default: gpt-4o-mini)
        max_iterations: Max optimization iterations (default: 3)
        min_improvement: Min score gain to continue (default: 0.05)
        include_builtin: Whether to include built-in skills in the catalog
            for selection (default: True)

    Example YAML:
        strategy:
          type: skill
          config:
            model: gpt-4o-mini
            max_iterations: 3
            include_builtin: true
    """

    # Free-form strategy configuration; recognized keys are listed in the
    # class docstring ("Config options").
    config: dict[str, Any] = field(default_factory=dict)

    # Managed skill directory (created during generate, cleaned up after).
    # None outside an active generate() call.
    _skill_dir: Path | None = field(default=None, init=False, repr=False)
69
+ async def generate(
70
+ self,
71
+ base: Agent,
72
+ budget: int,
73
+ *,
74
+ tasks: list[Task] | None = None,
75
+ runner: ExperimentRunner | None = None,
76
+ ) -> list[Candidate]:
77
+ """Generate candidates with optimized skill configurations.
78
+
79
+ Args:
80
+ base: Base agent configuration
81
+ budget: Max candidates to generate
82
+ tasks: Tasks to evaluate on (required)
83
+ runner: ExperimentRunner for evaluation (required)
84
+
85
+ Returns:
86
+ List of candidates with optimized skill sets
87
+
88
+ Raises:
89
+ ValueError: If tasks or runner not provided
90
+ """
91
+ if runner is None:
92
+ raise ValueError(
93
+ "SkillOptimizer requires a runner. "
94
+ "Use FlowOptimizer.optimize_with_strategy() to provide one."
95
+ )
96
+ if not tasks:
97
+ raise ValueError(
98
+ "SkillOptimizer requires tasks to evaluate against."
99
+ )
100
+
101
+ # Create a temp directory that the optimizer owns
102
+ self._skill_dir = Path(tempfile.mkdtemp(prefix="flow_skills_opt_"))
103
+ logger.info(f"SkillOptimizer: managing skills in {self._skill_dir}")
104
+
105
+ try:
106
+ return await self._generate_active(base, budget, tasks, runner)
107
+ finally:
108
+ # Clean up temp dir after optimization
109
+ if self._skill_dir and self._skill_dir.exists():
110
+ shutil.rmtree(self._skill_dir, ignore_errors=True)
111
+ self._skill_dir = None
112
+
113
+ async def _generate_active(
114
+ self,
115
+ base: Agent,
116
+ budget: int,
117
+ tasks: list[Task],
118
+ runner: ExperimentRunner,
119
+ ) -> list[Candidate]:
120
+ """Run active optimization loop with real evaluation feedback."""
121
+ model = self.config.get("model", "gpt-4o-mini")
122
+ max_iterations = self.config.get("max_iterations", 3)
123
+ min_improvement = self.config.get("min_improvement", 0.05)
124
+ include_builtin = self.config.get("include_builtin", True)
125
+
126
+ assert self._skill_dir is not None
127
+
128
+ logger.info(
129
+ f"SkillOptimizer: active mode (max_iterations={max_iterations}, "
130
+ f"min_improvement={min_improvement}, include_builtin={include_builtin})"
131
+ )
132
+
133
+ # Collect built-in skill catalog for LLM reference
134
+ builtin_catalog = self._get_builtin_catalog() if include_builtin else {}
135
+
136
+ best_score = 0.0
137
+ best_skills: dict[str, str] = {} # skill_name -> SKILL.md content
138
+ current_skills: dict[str, str] = {} # starts empty
139
+ _prev_skills: dict[str, str] = {}
140
+ history: list[StrategyIteration] = []
141
+ iteration_candidates: list[tuple[dict[str, str], str]] = []
142
+
143
+ for iteration in range(max_iterations):
144
+ # 1. Write current skills to the managed directory
145
+ self._write_skills_to_dir(current_skills)
146
+
147
+ # 2. Build agent with skills embedded in instructions
148
+ tools_config = self._build_tools_with_skills(base)
149
+ enriched_instructions = self._build_instructions_with_skills(
150
+ base.instructions, current_skills
151
+ )
152
+ agent = Agent(
153
+ name=f"{base.name}_skills_iter{iteration}",
154
+ framework=base.framework,
155
+ instructions=enriched_instructions,
156
+ llm_config=base.llm_config,
157
+ compaction=base.compaction,
158
+ tools=tools_config,
159
+ skills=dict(current_skills) if current_skills else None,
160
+ )
161
+ candidate = Candidate(
162
+ agent=agent,
163
+ mutations={"skills": sorted(current_skills.keys())},
164
+ )
165
+
166
+ summary = await runner.evaluate(candidate, tasks)
167
+
168
+ avg_score = getattr(summary, "avg_score", 0.0)
169
+ pass_rate = getattr(summary, "pass_rate", 0.0)
170
+ task_results = getattr(summary, "task_results", [])
171
+ failures = [tr for tr in task_results if not getattr(tr, "eval_passed", True)]
172
+
173
+ skills_list = sorted(current_skills.keys()) or ["(none)"]
174
+ logger.info(
175
+ f" Iteration {iteration}: avg_score={avg_score:.3f}, "
176
+ f"pass_rate={pass_rate:.1%}, failures={len(failures)}, "
177
+ f"skills={skills_list}"
178
+ )
179
+
180
+ # Build per-task summary (include full reasoning for history)
181
+ task_lines: list[str] = []
182
+ for tr in task_results:
183
+ task_name = getattr(tr, "task_name", "unknown")
184
+ passed = getattr(tr, "eval_passed", True)
185
+ reasoning = getattr(tr, "eval_reasoning", "")
186
+ status = "PASS" if passed else "FAIL"
187
+ task_lines.append(f" [{status}] {task_name}: {reasoning[:500]}")
188
+ tasks_summary = "\n".join(task_lines)
189
+
190
+ # Record iteration
191
+ skills_desc = ", ".join(skills_list)
192
+ change_desc = "Baseline evaluation (no skills)" if iteration == 0 else f"Skill adjustment iteration {iteration}"
193
+ change_rationale = f"Skills: {skills_desc}\n{tasks_summary}"
194
+ if iteration > 0:
195
+ score_delta = avg_score - history[-1].avg_score
196
+ prev_skills = set(_prev_skills.keys())
197
+ curr_skill_set = set(current_skills.keys())
198
+ added = sorted(curr_skill_set - prev_skills)
199
+ removed = sorted(prev_skills - curr_skill_set)
200
+ change_rationale = (
201
+ f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
202
+ f"Added skills: {added or 'none'}. Removed: {removed or 'none'}. "
203
+ f"{len(failures)} failures remaining.\n"
204
+ f"Skills: {skills_desc}\n{tasks_summary}"
205
+ )
206
+
207
+ history.append(
208
+ StrategyIteration(
209
+ iteration=iteration,
210
+ instructions_preview=f"[{skills_desc}]"[:200],
211
+ full_instructions=f"Skills: [{skills_desc}]",
212
+ avg_score=avg_score,
213
+ pass_rate=pass_rate,
214
+ failures_count=len(failures),
215
+ change_description=change_desc,
216
+ change_rationale=change_rationale,
217
+ )
218
+ )
219
+
220
+ label = "baseline" if iteration == 0 else f"iter{iteration}"
221
+ iteration_candidates.append((dict(current_skills), label))
222
+
223
+ # Track best (>= so that skills are preferred over no-skills on ties)
224
+ if avg_score >= best_score and (current_skills or not best_skills):
225
+ best_score = avg_score
226
+ best_skills = dict(current_skills)
227
+
228
+ # 2. Check stopping conditions
229
+ if iteration > 0:
230
+ improvement = avg_score - history[-2].avg_score
231
+ if improvement < min_improvement and avg_score <= best_score:
232
+ logger.info(
233
+ f" Stopping: improvement ({improvement:.3f}) < "
234
+ f"min_improvement ({min_improvement})"
235
+ )
236
+ break
237
+
238
+ if not failures:
239
+ logger.info(" Stopping: all tasks passed")
240
+ break
241
+
242
+ if iteration == max_iterations - 1:
243
+ break # Don't generate on last iteration
244
+
245
+ # 3. Analyze failures and generate/adjust skills
246
+ _prev_skills = dict(current_skills)
247
+ current_skills = self._analyze_and_generate(
248
+ current_skills, task_results, builtin_catalog, model, tasks
249
+ )
250
+ logger.info(f" Updated skills: {sorted(current_skills.keys())}")
251
+
252
+ # Build candidates for all unique skill configs tried
253
+ candidates: list[Candidate] = []
254
+ seen_skill_sets: set[tuple[str, ...]] = set()
255
+
256
+ for iter_skills, label in iteration_candidates:
257
+ skill_key = tuple(sorted(iter_skills.keys()))
258
+ if skill_key in seen_skill_sets:
259
+ continue
260
+ seen_skill_sets.add(skill_key)
261
+
262
+ is_best = sorted(iter_skills.keys()) == sorted(best_skills.keys())
263
+ suffix = "skills_optimized" if is_best else f"skills_{label}"
264
+
265
+ self._write_skills_to_dir(iter_skills)
266
+ tools_config = self._build_tools_with_skills(base)
267
+ enriched_instructions = self._build_instructions_with_skills(
268
+ base.instructions, iter_skills
269
+ )
270
+ skills_desc = ", ".join(sorted(iter_skills.keys())) or "(none)"
271
+ candidates.append(
272
+ Candidate(
273
+ agent=Agent(
274
+ name=f"{base.name}_{suffix}",
275
+ framework=base.framework,
276
+ instructions=enriched_instructions,
277
+ llm_config=base.llm_config,
278
+ compaction=base.compaction,
279
+ tools=tools_config,
280
+ skills=dict(iter_skills) if iter_skills else None,
281
+ ),
282
+ mutations={
283
+ "skills": sorted(iter_skills.keys()),
284
+ },
285
+ rationale=f"Skills: [{skills_desc}]",
286
+ optimization_history=history if is_best else [],
287
+ )
288
+ )
289
+
290
+ # Ensure best is always included
291
+ best_key = tuple(sorted(best_skills.keys()))
292
+ if best_key not in seen_skill_sets:
293
+ self._write_skills_to_dir(best_skills)
294
+ tools_config = self._build_tools_with_skills(base)
295
+ enriched_instructions = self._build_instructions_with_skills(
296
+ base.instructions, best_skills
297
+ )
298
+ skills_desc = ", ".join(sorted(best_skills.keys())) or "(none)"
299
+ candidates.append(
300
+ Candidate(
301
+ agent=Agent(
302
+ name=f"{base.name}_skills_optimized",
303
+ framework=base.framework,
304
+ instructions=enriched_instructions,
305
+ llm_config=base.llm_config,
306
+ compaction=base.compaction,
307
+ tools=tools_config,
308
+ skills=dict(best_skills) if best_skills else None,
309
+ ),
310
+ mutations={
311
+ "skills": sorted(best_skills.keys()),
312
+ },
313
+ rationale=f"Skills: [{skills_desc}]",
314
+ optimization_history=history,
315
+ )
316
+ )
317
+
318
+ # Restore best skills as final state on disk
319
+ self._write_skills_to_dir(best_skills)
320
+
321
+ return candidates
322
+
323
+ def _build_tools_with_skills(self, base: Agent) -> list[str] | dict[str, Any]:
324
+ """Build a tools config that includes the skills tool pointing to our managed dir.
325
+
326
+ Ensures the agent has the skills tool configured to only see our managed
327
+ skill directory (no built-in or user skills auto-loaded).
328
+ """
329
+ from ..models import resolve_tools
330
+
331
+ # Start from the base agent's tools
332
+ if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
333
+ base_tools: dict[str, Any] = {}
334
+ elif isinstance(base.tools, str):
335
+ base_tools = dict(resolve_tools(base.tools))
336
+ elif isinstance(base.tools, list):
337
+ base_tools = dict(resolve_tools(base.tools))
338
+ else:
339
+ base_tools = dict(base.tools)
340
+
341
+ # Ensure skills tool is present with our managed path
342
+ assert self._skill_dir is not None
343
+ base_tools["skills"] = {
344
+ "skills_path": str(self._skill_dir),
345
+ }
346
+
347
+ return base_tools
348
+
349
+ def _build_instructions_with_skills(
350
+ self, base_instructions: str | None, skills: dict[str, str]
351
+ ) -> str:
352
+ """Inject full skill content into the agent's instructions.
353
+
354
+ The harness layer injects skill *summaries* (name + description) into
355
+ the system prompt for normal agents. The optimizer intentionally injects
356
+ *full* skill content here because optimization requires the agent to
357
+ see and follow the complete domain knowledge, not just a summary.
358
+
359
+ Setting explicit instructions on the Agent causes the harness to skip
360
+ its own summary injection, so these two approaches don't conflict.
361
+ """
362
+ parts: list[str] = []
363
+ if base_instructions:
364
+ parts.append(base_instructions)
365
+
366
+ if skills:
367
+ parts.append("\n## Domain Knowledge (Skills)\n")
368
+ parts.append(
369
+ "The following skills provide domain-specific patterns and "
370
+ "best practices. Follow these guidelines when completing tasks.\n"
371
+ )
372
+ for name, content in sorted(skills.items()):
373
+ parts.append(f"### {name}\n{content}\n")
374
+
375
+ return "\n".join(parts) if parts else ""
376
+
377
+ def _write_skills_to_dir(self, skills: dict[str, str]) -> None:
378
+ """Write skill content to the managed directory.
379
+
380
+ Clears the directory first, then writes each skill as a folder
381
+ with a SKILL.md file.
382
+ """
383
+ assert self._skill_dir is not None
384
+
385
+ # Clear existing skills
386
+ if self._skill_dir.exists():
387
+ for item in self._skill_dir.iterdir():
388
+ if item.is_dir():
389
+ shutil.rmtree(item)
390
+
391
+ # Write each skill
392
+ for name, content in skills.items():
393
+ skill_dir = self._skill_dir / name
394
+ skill_dir.mkdir(parents=True, exist_ok=True)
395
+ (skill_dir / "SKILL.md").write_text(content)
396
+
397
+ def _get_builtin_catalog(self) -> dict[str, str]:
398
+ """Get descriptions of all built-in skills for LLM reference."""
399
+ from flow.tools.skills import _discover_skills, _get_builtin_skills_path
400
+
401
+ builtin_path = _get_builtin_skills_path()
402
+ if not builtin_path.exists():
403
+ return {}
404
+
405
+ discovered = _discover_skills([builtin_path])
406
+ catalog: dict[str, str] = {}
407
+ for skill_name, (skill_md, meta) in discovered.items():
408
+ description = meta.get("description", "No description")
409
+ catalog[skill_name] = description
410
+
411
+ return catalog
412
+
413
    def _analyze_and_generate(
        self,
        current_skills: dict[str, str],
        task_results: list[Any],
        builtin_catalog: dict[str, str],
        model: str,
        tasks: list[Task] | None = None,
    ) -> dict[str, str]:
        """Analyze failures and incrementally evolve skills.

        The LLM sees the full content of every current skill and decides
        per-skill what to do:
        - "keep": skill is helping, leave it unchanged
        - "drop": skill isn't helping, remove it
        - "refine": skill has the right idea but needs improved content
          (LLM provides the updated SKILL.md)
        - New skills can be added (LLM provides full SKILL.md content)
        - "builtin": select a built-in skill by name from the catalog

        Args:
            current_skills: Mapping of skill name -> SKILL.md content.
            task_results: Per-task evaluation results from the runner.
            builtin_catalog: name -> description of selectable built-in skills.
            model: LLM model name used for generation.
            tasks: Tasks being optimized against (for criteria context).

        Returns:
            The updated skill mapping; falls back to ``current_skills`` when
            the LLM call or response parsing fails.
        """
        # Build task->criteria lookup for enriching the prompt.
        task_criteria_map: dict[str, list[dict[str, str]]] = {}
        if tasks:
            for t in tasks:
                task_criteria_map[t.name] = [
                    {"name": c.name, "instruction": c.instruction}
                    for c in t.criteria
                ]

        # Build task results summary with full reasoning and criteria.
        task_descriptions = []
        for tr in task_results:
            # getattr with defaults: result objects may omit fields.
            task_name = getattr(tr, "task_name", "unknown")
            passed = getattr(tr, "eval_passed", True)
            reasoning = getattr(tr, "eval_reasoning", "")
            score = getattr(tr, "eval_score", 0.0)
            status = "PASS" if passed else "FAIL"

            # Include full reasoning (not truncated).
            entry = f"- [{status}] Task '{task_name}' (score={score:.2f}):\n  Reasoning: {reasoning}"

            # Include the task's evaluation criteria so the LLM knows
            # the exact rules the agent must follow (failing tasks only).
            criteria = task_criteria_map.get(task_name, [])
            if criteria and not passed:
                criteria_lines = []
                for c in criteria:
                    criteria_lines.append(f"    - {c['name']}: {c['instruction']}")
                entry += "\n  Evaluation criteria (the agent MUST satisfy ALL of these):\n"
                entry += "\n".join(criteria_lines)

            task_descriptions.append(entry)
        results_text = "\n".join(task_descriptions)

        # Build current skills section with full content so the LLM can refine.
        current_skills_section = ""
        if current_skills:
            skill_entries = []
            for name, content in sorted(current_skills.items()):
                # Show full content so LLM can refine it.
                skill_entries.append(
                    f"### Skill: {name}\n```\n{content}\n```"
                )
            current_skills_section = (
                "\n## Current Skills (full content)\n"
                + "\n\n".join(skill_entries)
                + "\n"
            )
        else:
            current_skills_section = "\n## Current Skills\nNone — this is the first iteration.\n"

        # Build catalog section of selectable built-in skills.
        catalog_section = ""
        if builtin_catalog:
            catalog_lines = []
            for name, desc in sorted(builtin_catalog.items()):
                catalog_lines.append(f"  - {name}: {desc}")
            catalog_section = (
                "\n## Available Built-in Skills (can be selected by name)\n"
                + "\n".join(catalog_lines)
                + "\n"
            )

        prompt = f"""You are optimizing the skill configuration for a coding assistant.
Skills are domain knowledge packages (SKILL.md files) that give the agent specialized
expertise, patterns, and best practices for specific domains.

## Task Results
{results_text}
{current_skills_section}{catalog_section}
## Your Job
Analyze the failing tasks above. Each failing task includes its **evaluation criteria** —
these are the exact rules the evaluator checks. Your skills MUST encode these specific
requirements so the agent follows them.

**Critical**: The agent fails because it doesn't know about specific conventions
(e.g., exact data formats, specific error types to raise, required fields in output).
Your skills must spell out these conventions as concrete, actionable rules — not
general advice.

For EACH current skill, decide:
- **"keep"** — the skill is helping (tasks it targets are passing). Leave it as-is.
- **"drop"** — the skill isn't contributing. Remove it to reduce noise.
- **Provide updated SKILL.md content** — the skill targets the right problem but
  its content should be refined to better address the failures.

You can also:
- **Add new skills** with full SKILL.md content to address uncovered failure patterns
- **Select a built-in** by setting the value to "builtin"

## What Makes a Good Skill
- **Specific, not generic**: "Always use `json.dumps(data, indent=2)`" is better than
  "Use proper JSON formatting"
- **Actionable rules**: "Define `__all__` at module top" is better than "Follow best practices"
- **Directly addresses criteria**: Each skill rule should map to a specific evaluation
  criterion that the agent is currently failing
- **Concise**: Include only the rules needed; avoid padding with obvious advice
- **Evidence-producing**: The agent is evaluated ONLY on what appears in its tool
  outputs and final response. If the agent writes a file, the evaluator does NOT
  read that file — it only sees the tool's return message (e.g., "Successfully wrote
  625 characters"). Skills MUST instruct the agent to make its work verifiable:
  * After writing a file, read it back or print its contents so the output is visible
  * After creating structured output (CSV, JSON, code), display the result
  * Run scripts and show their output rather than just writing them
  * The evaluator cannot verify what it cannot see — always produce visible evidence

## Response Format
Respond with a JSON object. Keys are skill names, values are one of:
- `"keep"` — retain this skill unchanged
- `"drop"` — remove this skill
- `"builtin"` — load from the built-in catalog
- A string containing the full SKILL.md content (for new or refined skills)

SKILL.md content MUST start with YAML frontmatter:
---
name: skill-name
description: What this skill does
---
# Content...

## Rules
- Keep skills that are working (their target tasks pass) — don't drop what works
- Refine skills whose target tasks still fail — tweak the content, don't start over
- Only add new skills for failure patterns not covered by existing skills
- Keep skills focused and concise (domain knowledge, not general advice)
- ALWAYS include a "Verification" section in every skill telling the agent to
  display/print/cat its output after creating it — this is the #1 cause of false
  failures (correct code that the evaluator can't see)

## Example Response
{{"git-log-parsing": "keep", "executable-verification": "---\\nname: executable-verification\\ndescription: Improved verification patterns\\n---\\n# Updated content here...", "new-skill": "---\\nname: new-skill\\ndescription: Addresses regex failures\\n---\\n# Content..."}}

Respond with ONLY the JSON object, nothing else."""

        try:
            result = self._call_llm(prompt, model)
            if result:
                return self._parse_skill_response(result, current_skills, builtin_catalog)
        except Exception as e:
            logger.warning(f"LLM skill generation failed: {e}")

        # Fallback: keep current skills unchanged.
        return current_skills
575
+
576
+ def _parse_skill_response(
577
+ self,
578
+ response: str,
579
+ current_skills: dict[str, str],
580
+ builtin_catalog: dict[str, str],
581
+ ) -> dict[str, str]:
582
+ """Parse LLM response into skill name -> content mapping.
583
+
584
+ Supports incremental operations:
585
+ - "keep": retain existing skill content unchanged
586
+ - "drop": remove the skill (omit from result)
587
+ - "builtin": load from built-in catalog
588
+ - string content: new or refined skill (replaces existing)
589
+ """
590
+ import json
591
+
592
+ # Try to extract JSON from the response
593
+ response = response.strip()
594
+ # Handle markdown code blocks
595
+ if response.startswith("```"):
596
+ lines = response.split("\n")
597
+ lines = [l for l in lines if not l.strip().startswith("```")]
598
+ response = "\n".join(lines)
599
+
600
+ try:
601
+ skills_dict = json.loads(response)
602
+ except json.JSONDecodeError:
603
+ logger.warning(f"Failed to parse skill response as JSON: {response[:200]}")
604
+ return current_skills
605
+
606
+ if not isinstance(skills_dict, dict):
607
+ logger.warning(f"Skill response is not a dict: {type(skills_dict)}")
608
+ return current_skills
609
+
610
+ new_skills: dict[str, str] = {}
611
+ for name, value in skills_dict.items():
612
+ if not isinstance(name, str):
613
+ continue
614
+
615
+ if value == "keep":
616
+ # Retain existing skill unchanged
617
+ if name in current_skills:
618
+ new_skills[name] = current_skills[name]
619
+ else:
620
+ logger.warning(f"Cannot keep unknown skill: {name}")
621
+ elif value == "drop":
622
+ # Explicitly remove — just don't add to new_skills
623
+ logger.info(f"Dropping skill: {name}")
624
+ elif value == "builtin" and name in builtin_catalog:
625
+ content = self._load_builtin_skill(name)
626
+ if content:
627
+ new_skills[name] = content
628
+ else:
629
+ logger.warning(f"Failed to load built-in skill: {name}")
630
+ elif isinstance(value, str) and value not in ("builtin", "keep", "drop"):
631
+ # New or refined skill content
632
+ new_skills[name] = value
633
+ else:
634
+ logger.warning(f"Skipping invalid skill entry: {name}={value!r}")
635
+
636
+ if not new_skills and current_skills:
637
+ logger.warning("LLM dropped all skills, keeping current set")
638
+ return current_skills
639
+
640
+ return new_skills
641
+
642
+ def _load_builtin_skill(self, name: str) -> str | None:
643
+ """Load the full content of a built-in skill."""
644
+ from flow.tools.skills import _discover_skills, _get_builtin_skills_path
645
+
646
+ builtin_path = _get_builtin_skills_path()
647
+ discovered = _discover_skills([builtin_path])
648
+
649
+ if name in discovered:
650
+ skill_md, _ = discovered[name]
651
+ try:
652
+ return skill_md.read_text()
653
+ except Exception as e:
654
+ logger.warning(f"Error reading built-in skill {name}: {e}")
655
+
656
+ return None
657
+
658
+ def _get_client(self, model: str) -> tuple[Any, str]:
659
+ """Get OpenAI client and model name."""
660
+ try:
661
+ from openai import AzureOpenAI, OpenAI
662
+ except ImportError as e:
663
+ raise ImportError("openai package required for SkillOptimizer") from e
664
+
665
+ azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
666
+ azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
667
+
668
+ if azure_key and azure_endpoint:
669
+ client = AzureOpenAI(
670
+ api_key=azure_key,
671
+ api_version="2024-08-01-preview",
672
+ azure_endpoint=azure_endpoint,
673
+ )
674
+ model_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", model)
675
+ else:
676
+ openai_key = os.environ.get("OPENAI_API_KEY")
677
+ if not openai_key:
678
+ raise ValueError("No OpenAI or Azure OpenAI credentials found")
679
+ client = OpenAI(api_key=openai_key)
680
+ model_name = model
681
+
682
+ return client, model_name
683
+
684
+ def _call_llm(self, prompt: str, model: str) -> str:
685
+ """Call LLM with a prompt."""
686
+ client, model_name = self._get_client(model)
687
+
688
+ response = client.chat.completions.create(
689
+ model=model_name,
690
+ messages=[{"role": "user", "content": prompt}],
691
+ )
692
+ return response.choices[0].message.content or ""
src/flow/experiments/strategies/{tool_selector.py → tool.py} RENAMED
@@ -1,6 +1,6 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
- """Active tool selector strategy.
4
 
5
  Uses the runner to evaluate tool configurations and iteratively adjust
6
  the tool set based on actual execution failures. The strategy:
@@ -17,7 +17,6 @@ import os
17
  from dataclasses import dataclass, field
18
  from typing import Any
19
 
20
- from ..metrics import extract_metrics
21
  from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration, TOOL_PRESETS
22
  from ..types import Task
23
 
@@ -30,7 +29,7 @@ ALL_AVAILABLE_TOOLS: list[str] = sorted(
30
 
31
 
32
  @dataclass
33
- class ToolSelectorStrategy:
34
  """Strategy that iteratively optimizes tool configurations via evaluation.
35
 
36
  Runs an evaluate-analyze-adjust loop. Each iteration evaluates
@@ -47,7 +46,7 @@ class ToolSelectorStrategy:
47
 
48
  Example YAML:
49
  strategy:
50
- type: tool_selector
51
  config:
52
  model: gpt-4o-mini
53
  max_iterations: 3
@@ -79,18 +78,22 @@ class ToolSelectorStrategy:
79
  """
80
  if runner is None:
81
  raise ValueError(
82
- "ToolSelectorStrategy requires a runner. "
83
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
84
  )
85
  if not tasks:
86
  raise ValueError(
87
- "ToolSelectorStrategy requires tasks to evaluate against."
88
  )
89
 
90
  # Resolve initial tools to a list
 
 
 
 
91
  from ..models import resolve_tools
92
  if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
93
- current_tools = []
94
  else:
95
  current_tools = sorted(resolve_tools(base.tools).keys())
96
 
@@ -111,13 +114,14 @@ class ToolSelectorStrategy:
111
  available_tools = self.config.get("available_tools", ALL_AVAILABLE_TOOLS)
112
 
113
  logger.info(
114
- f"ToolSelectorStrategy: active mode (max_iterations={max_iterations}, "
115
  f"available_tools={len(available_tools)})"
116
  )
117
 
118
  current_tools = tools
119
  best_tools = tools
120
  best_score = 0.0
 
121
  history: list[StrategyIteration] = []
122
  # Track all unique tool configs tried, for returning as candidates
123
  iteration_candidates: list[tuple[list[str], str]] = [] # (tools, name_suffix)
@@ -180,8 +184,8 @@ class ToolSelectorStrategy:
180
  change_rationale = f"Tools used: {used_desc}\n{tasks_summary}"
181
  if iteration > 0:
182
  score_delta = avg_score - history[-1].avg_score
183
- added = set(current_tools) - set(best_tools if iteration == 1 else _prev_tools)
184
- removed = set(_prev_tools) - set(current_tools) if iteration > 0 else set()
185
  change_rationale = (
186
  f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
187
  f"Added: {sorted(added) or 'none'}. Removed: {sorted(removed) or 'none'}. "
@@ -235,7 +239,6 @@ class ToolSelectorStrategy:
235
  logger.info(f" Adjusted tools: {current_tools}")
236
 
237
  # Build candidates for all unique tool configs tried
238
- # This gives the Pareto chart multiple data points to compare
239
  candidates: list[Candidate] = []
240
  seen_tool_sets: set[tuple[str, ...]] = set()
241
 
@@ -265,8 +268,7 @@ class ToolSelectorStrategy:
265
  )
266
  )
267
 
268
- # Ensure best is always included (may differ from any iteration if
269
- # the best score was from an earlier iteration)
270
  best_key = tuple(sorted(best_tools))
271
  if best_key not in seen_tool_sets:
272
  final_agent = Agent(
@@ -298,7 +300,6 @@ class ToolSelectorStrategy:
298
  model: str,
299
  ) -> list[str]:
300
  """Analyze failures and traces, then recommend tool changes."""
301
- # Build analysis of what happened
302
  failure_descriptions = []
303
  for tr in task_results:
304
  task_name = getattr(tr, "task_name", "unknown")
@@ -306,7 +307,6 @@ class ToolSelectorStrategy:
306
  reasoning = getattr(tr, "eval_reasoning", "")
307
  score = getattr(tr, "eval_score", 0.0)
308
 
309
- # Get per-task tool usage
310
  metrics = getattr(tr, "metrics", None)
311
  task_tools = {}
312
  if metrics and hasattr(metrics, "tool_calls_by_name"):
@@ -349,16 +349,13 @@ Example: read_file, write_file, bash, grep, edit_file"""
349
  try:
350
  result = self._call_llm(prompt, model)
351
  if result:
352
- # Parse comma-separated tool names
353
  parsed = [t.strip() for t in result.split(",") if t.strip()]
354
- # Validate against available tools
355
  valid = [t for t in parsed if t in available_tools]
356
  if valid:
357
  return sorted(valid)
358
  logger.warning(f"No valid tools in LLM response: {parsed}")
359
  except Exception as e:
360
  logger.warning(f"LLM tool adjustment failed: {e}")
361
- # Fallback: try adding commonly useful tools
362
  return self._heuristic_adjust(current_tools, tools_used, available_tools)
363
 
364
  return current_tools
@@ -372,18 +369,15 @@ Example: read_file, write_file, bash, grep, edit_file"""
372
  """Fallback heuristic when LLM is unavailable."""
373
  adjusted = set(current_tools)
374
 
375
- # If bash was used heavily but grep/glob not available, add them
376
  if "bash" in tools_used and tools_used["bash"] > 2:
377
  for tool in ["grep", "glob_files", "ls"]:
378
  if tool in available_tools:
379
  adjusted.add(tool)
380
 
381
- # If write_file was used but edit_file not available, add it
382
  if "write_file" in tools_used and "edit_file" not in adjusted:
383
  if "edit_file" in available_tools:
384
  adjusted.add("edit_file")
385
 
386
- # Add think if not present (helps with reasoning)
387
  if "think" in available_tools:
388
  adjusted.add("think")
389
 
@@ -394,7 +388,7 @@ Example: read_file, write_file, bash, grep, edit_file"""
394
  try:
395
  from openai import AzureOpenAI, OpenAI
396
  except ImportError as e:
397
- raise ImportError("openai package required for ToolSelectorStrategy") from e
398
 
399
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
400
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
 
3
+ """Tool optimization strategy.
4
 
5
  Uses the runner to evaluate tool configurations and iteratively adjust
6
  the tool set based on actual execution failures. The strategy:
 
17
  from dataclasses import dataclass, field
18
  from typing import Any
19
 
 
20
  from ..models import Agent, Candidate, ExperimentRunner, StrategyIteration, TOOL_PRESETS
21
  from ..types import Task
22
 
 
29
 
30
 
31
  @dataclass
32
+ class ToolOptimizer:
33
  """Strategy that iteratively optimizes tool configurations via evaluation.
34
 
35
  Runs an evaluate-analyze-adjust loop. Each iteration evaluates
 
46
 
47
  Example YAML:
48
  strategy:
49
+ type: tool
50
  config:
51
  model: gpt-4o-mini
52
  max_iterations: 3
 
78
  """
79
  if runner is None:
80
  raise ValueError(
81
+ "ToolOptimizer requires a runner. "
82
  "Use FlowOptimizer.optimize_with_strategy() to provide one."
83
  )
84
  if not tasks:
85
  raise ValueError(
86
+ "ToolOptimizer requires tasks to evaluate against."
87
  )
88
 
89
  # Resolve initial tools to a list
90
+ # When starting from no tools, seed with "standard" preset so the
91
+ # optimizer has a working baseline to iterate from. An agent with
92
+ # zero tools produces zero signal (no tool calls, no files created),
93
+ # which makes iterative improvement impossible.
94
  from ..models import resolve_tools
95
  if base.tools is None or (isinstance(base.tools, list) and len(base.tools) == 0):
96
+ current_tools = sorted(resolve_tools("standard").keys())
97
  else:
98
  current_tools = sorted(resolve_tools(base.tools).keys())
99
 
 
114
  available_tools = self.config.get("available_tools", ALL_AVAILABLE_TOOLS)
115
 
116
  logger.info(
117
+ f"ToolOptimizer: active mode (max_iterations={max_iterations}, "
118
  f"available_tools={len(available_tools)})"
119
  )
120
 
121
  current_tools = tools
122
  best_tools = tools
123
  best_score = 0.0
124
+ _prev_tools: list[str] = []
125
  history: list[StrategyIteration] = []
126
  # Track all unique tool configs tried, for returning as candidates
127
  iteration_candidates: list[tuple[list[str], str]] = [] # (tools, name_suffix)
 
184
  change_rationale = f"Tools used: {used_desc}\n{tasks_summary}"
185
  if iteration > 0:
186
  score_delta = avg_score - history[-1].avg_score
187
+ added = set(current_tools) - set(_prev_tools)
188
+ removed = set(_prev_tools) - set(current_tools)
189
  change_rationale = (
190
  f"Score {'improved' if score_delta > 0 else 'declined'} by {abs(score_delta):.3f}. "
191
  f"Added: {sorted(added) or 'none'}. Removed: {sorted(removed) or 'none'}. "
 
239
  logger.info(f" Adjusted tools: {current_tools}")
240
 
241
  # Build candidates for all unique tool configs tried
 
242
  candidates: list[Candidate] = []
243
  seen_tool_sets: set[tuple[str, ...]] = set()
244
 
 
268
  )
269
  )
270
 
271
+ # Ensure best is always included
 
272
  best_key = tuple(sorted(best_tools))
273
  if best_key not in seen_tool_sets:
274
  final_agent = Agent(
 
300
  model: str,
301
  ) -> list[str]:
302
  """Analyze failures and traces, then recommend tool changes."""
 
303
  failure_descriptions = []
304
  for tr in task_results:
305
  task_name = getattr(tr, "task_name", "unknown")
 
307
  reasoning = getattr(tr, "eval_reasoning", "")
308
  score = getattr(tr, "eval_score", 0.0)
309
 
 
310
  metrics = getattr(tr, "metrics", None)
311
  task_tools = {}
312
  if metrics and hasattr(metrics, "tool_calls_by_name"):
 
349
  try:
350
  result = self._call_llm(prompt, model)
351
  if result:
 
352
  parsed = [t.strip() for t in result.split(",") if t.strip()]
 
353
  valid = [t for t in parsed if t in available_tools]
354
  if valid:
355
  return sorted(valid)
356
  logger.warning(f"No valid tools in LLM response: {parsed}")
357
  except Exception as e:
358
  logger.warning(f"LLM tool adjustment failed: {e}")
 
359
  return self._heuristic_adjust(current_tools, tools_used, available_tools)
360
 
361
  return current_tools
 
369
  """Fallback heuristic when LLM is unavailable."""
370
  adjusted = set(current_tools)
371
 
 
372
  if "bash" in tools_used and tools_used["bash"] > 2:
373
  for tool in ["grep", "glob_files", "ls"]:
374
  if tool in available_tools:
375
  adjusted.add(tool)
376
 
 
377
  if "write_file" in tools_used and "edit_file" not in adjusted:
378
  if "edit_file" in available_tools:
379
  adjusted.add("edit_file")
380
 
 
381
  if "think" in available_tools:
382
  adjusted.add("think")
383
 
 
388
  try:
389
  from openai import AzureOpenAI, OpenAI
390
  except ImportError as e:
391
+ raise ImportError("openai package required for ToolOptimizer") from e
392
 
393
  azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
394
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
src/flow/harness/compaction/strategies.py CHANGED
@@ -10,6 +10,8 @@ from __future__ import annotations
10
  from dataclasses import dataclass, field
11
  from typing import Any, Protocol
12
 
 
 
13
  from flow.harness.compaction.tokenizer import (
14
  count_message_tokens,
15
  count_messages_tokens,
@@ -479,7 +481,8 @@ class SummarizationStrategy:
479
  if self.summarize_fn:
480
  try:
481
  summary_text = await self.summarize_fn(middle, self.summary_max_tokens)
482
- except Exception:
 
483
  summary_text = self._extract_key_info(middle)
484
  else:
485
  summary_text = self._extract_key_info(middle)
 
10
  from dataclasses import dataclass, field
11
  from typing import Any, Protocol
12
 
13
+ from loguru import logger
14
+
15
  from flow.harness.compaction.tokenizer import (
16
  count_message_tokens,
17
  count_messages_tokens,
 
481
  if self.summarize_fn:
482
  try:
483
  summary_text = await self.summarize_fn(middle, self.summary_max_tokens)
484
+ except Exception as e:
485
+ logger.warning(f"Summarization function failed, falling back to key info extraction: {e}")
486
  summary_text = self._extract_key_info(middle)
487
  else:
488
  summary_text = self._extract_key_info(middle)
src/flow/harness/maf/agent.py CHANGED
@@ -163,14 +163,26 @@ def create_agent(
163
  f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}, head_ratio={head_ratio:.2f}"
164
  )
165
 
166
- # Determine if memory is enabled for instructions
167
  enable_memory = False
 
168
  if isinstance(tools, str):
169
- enable_memory = "memory" in TOOL_PRESETS.get(tools, {})
 
 
170
  elif isinstance(tools, list):
171
  enable_memory = "memory" in tools
 
172
  elif isinstance(tools, dict):
173
  enable_memory = "memory" in tools
 
 
 
 
 
 
 
 
174
 
175
  # Create the agent
176
  agent = ChatAgent(
@@ -178,6 +190,8 @@ def create_agent(
178
  description="Autonomous coding agent",
179
  instructions=instructions or build_instructions(
180
  enable_memory=enable_memory,
 
 
181
  ),
182
  chat_client=client,
183
  tools=converted_tools,
 
163
  f"Message compaction enabled: head={compaction_head_size}, tail={compaction_tail_size}, head_ratio={head_ratio:.2f}"
164
  )
165
 
166
+ # Determine if memory and skills are enabled for instructions
167
  enable_memory = False
168
+ enable_skills = False
169
  if isinstance(tools, str):
170
+ preset = TOOL_PRESETS.get(tools, {})
171
+ enable_memory = "memory" in preset
172
+ enable_skills = "skills" in preset
173
  elif isinstance(tools, list):
174
  enable_memory = "memory" in tools
175
+ enable_skills = "skills" in tools
176
  elif isinstance(tools, dict):
177
  enable_memory = "memory" in tools
178
+ enable_skills = "skills" in tools
179
+
180
+ # Discover skill metadata when skills are enabled and no explicit instructions
181
+ skills_metadata: dict[str, dict[str, str]] | None = None
182
+ if enable_skills and instructions is None:
183
+ from flow.tools.skills import discover_skills_from_tools_spec
184
+
185
+ skills_metadata = discover_skills_from_tools_spec(tools_spec)
186
 
187
  # Create the agent
188
  agent = ChatAgent(
 
190
  description="Autonomous coding agent",
191
  instructions=instructions or build_instructions(
192
  enable_memory=enable_memory,
193
+ enable_skills=enable_skills,
194
+ skills_metadata=skills_metadata,
195
  ),
196
  chat_client=client,
197
  tools=converted_tools,
src/flow/harness/maf/tools/__init__.py CHANGED
@@ -151,8 +151,16 @@ def build_tools(
151
  model=config.get("model"),
152
  )
153
  tools.append(to_maf_tool(custom_task))
 
 
 
 
 
 
 
 
154
  elif name == "skills" and config.get("additional_paths"):
155
- # Skills with custom paths
156
  custom_skills = create_skills_tool(project_path=Path(config["additional_paths"][0]))
157
  tools.append(to_maf_tool(custom_skills))
158
  # Web search tool
 
151
  model=config.get("model"),
152
  )
153
  tools.append(to_maf_tool(custom_task))
154
+ elif name == "skills" and config.get("skills_path"):
155
+ # Skills with explicit managed path (used by SkillOptimizer)
156
+ # Only uses the specified path — no built-in or user skills
157
+ custom_skills = create_skills_tool(
158
+ builtin_path=Path(config["skills_path"]),
159
+ exclusive=True,
160
+ )
161
+ tools.append(to_maf_tool(custom_skills))
162
  elif name == "skills" and config.get("additional_paths"):
163
+ # Skills with additional project paths (keeps built-in skills too)
164
  custom_skills = create_skills_tool(project_path=Path(config["additional_paths"][0]))
165
  tools.append(to_maf_tool(custom_skills))
166
  # Web search tool
src/flow/harness/miniagent/harness.py CHANGED
@@ -116,6 +116,8 @@ class MiniAgentHarness(BaseHarness):
116
  otel_hooks = create_otel_hooks(model=config.model)
117
 
118
  # Resolve instructions: explicit > preset > default "general"
 
 
119
  if agent.instructions:
120
  instructions = agent.instructions
121
  elif agent.instructions_preset:
@@ -123,6 +125,27 @@ class MiniAgentHarness(BaseHarness):
123
  else:
124
  instructions = get_instructions("general")
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  chat_agent = ChatAgent(
127
  client=chat_client,
128
  instructions=instructions,
@@ -410,7 +433,15 @@ class MiniAgentHarness(BaseHarness):
410
  tools: list[Tool] = []
411
 
412
  for name, config in tools_spec.items():
413
- if name in tool_map:
 
 
 
 
 
 
 
 
414
  tools.append(tool_map[name])
415
  elif name == "task" and config:
416
  # Task tool with custom config
 
116
  otel_hooks = create_otel_hooks(model=config.model)
117
 
118
  # Resolve instructions: explicit > preset > default "general"
119
+ # When using default instructions, discover skill metadata and append
120
+ # a summary so the agent knows what skills are available upfront.
121
  if agent.instructions:
122
  instructions = agent.instructions
123
  elif agent.instructions_preset:
 
125
  else:
126
  instructions = get_instructions("general")
127
 
128
+ # Inject skill metadata into instructions (unless explicit instructions were set)
129
+ if not agent.instructions and "skills" in tools_spec:
130
+ from flow.tools.skills import discover_skills_from_tools_spec
131
+
132
+ skills_metadata = discover_skills_from_tools_spec(tools_spec)
133
+ if skills_metadata:
134
+ lines = ["\n\n## AVAILABLE SKILLS\n"]
135
+ lines.append(
136
+ "The following domain-specific skills are available. "
137
+ "Use `skills(action='load', name='...')` to load full content "
138
+ "when relevant to your task.\n"
139
+ )
140
+ for skill_name, meta in sorted(skills_metadata.items()):
141
+ description = meta.get("description", "No description")
142
+ triggers = meta.get("triggers", "")
143
+ entry = f"- **{skill_name}**: {description}"
144
+ if triggers:
145
+ entry += f" _(triggers: {triggers})_"
146
+ lines.append(entry)
147
+ instructions += "\n".join(lines)
148
+
149
  chat_agent = ChatAgent(
150
  client=chat_client,
151
  instructions=instructions,
 
433
  tools: list[Tool] = []
434
 
435
  for name, config in tools_spec.items():
436
+ if name == "skills" and config.get("skills_path"):
437
+ # Skills with explicit managed path (used by SkillOptimizer)
438
+ from flow.tools.skills import create_skills_tool as _create_skills
439
+ custom_skills = _create_skills(
440
+ builtin_path=Path(config["skills_path"]),
441
+ exclusive=True,
442
+ )
443
+ tools.append(custom_skills)
444
+ elif name in tool_map:
445
  tools.append(tool_map[name])
446
  elif name == "task" and config:
447
  # Task tool with custom config
src/flow/harness/miniagent/tool.py CHANGED
@@ -8,6 +8,8 @@ from collections.abc import Callable
8
  from dataclasses import dataclass
9
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
10
 
 
 
11
 
12
  @dataclass
13
  class Tool:
@@ -115,7 +117,8 @@ def tool(func: Callable[..., Any]) -> Tool:
115
  # Get type hints (with extras for Annotated)
116
  try:
117
  hints = get_type_hints(func, include_extras=True)
118
- except Exception:
 
119
  hints = {}
120
 
121
  # Build JSON Schema for parameters
 
8
  from dataclasses import dataclass
9
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
10
 
11
+ from loguru import logger
12
+
13
 
14
  @dataclass
15
  class Tool:
 
117
  # Get type hints (with extras for Annotated)
118
  try:
119
  hints = get_type_hints(func, include_extras=True)
120
+ except Exception as e:
121
+ logger.warning(f"Failed to get type hints for function {func.__name__}: {e}")
122
  hints = {}
123
 
124
  # Build JSON Schema for parameters
src/flow/prompts.py CHANGED
@@ -464,6 +464,7 @@ def build_instructions(
464
  *,
465
  enable_memory: bool = True,
466
  enable_skills: bool = True,
 
467
  ) -> str:
468
  """Build agent instructions dynamically based on enabled tools.
469
 
@@ -474,6 +475,11 @@ def build_instructions(
474
  Args:
475
  enable_memory: Include memory tool documentation.
476
  enable_skills: Include skills tool documentation.
 
 
 
 
 
477
 
478
  Returns:
479
  Complete instruction string.
@@ -529,6 +535,23 @@ def build_instructions(
529
  if enable_skills:
530
  sections.append(_SKILLS_SECTION)
531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  sections.extend([
533
  _CORE_EXAMPLES,
534
  _CORE_RESEARCH,
 
464
  *,
465
  enable_memory: bool = True,
466
  enable_skills: bool = True,
467
+ skills_metadata: dict[str, dict[str, str]] | None = None,
468
  ) -> str:
469
  """Build agent instructions dynamically based on enabled tools.
470
 
 
475
  Args:
476
  enable_memory: Include memory tool documentation.
477
  enable_skills: Include skills tool documentation.
478
+ skills_metadata: Optional dict of discovered skill metadata
479
+ (name -> {"description": ..., "triggers": ...}).
480
+ When provided, injects a concrete listing of available skills
481
+ into the system prompt so the agent knows what's available
482
+ without needing to call ``skills(action='list')``.
483
 
484
  Returns:
485
  Complete instruction string.
 
535
  if enable_skills:
536
  sections.append(_SKILLS_SECTION)
537
 
538
+ # Inject concrete skill listing when metadata is available
539
+ if enable_skills and skills_metadata:
540
+ lines = ["\n## AVAILABLE SKILLS\n"]
541
+ lines.append(
542
+ "The following domain-specific skills are available. "
543
+ "Use `skills(action='load', name='...')` to load full content "
544
+ "when relevant to your task.\n"
545
+ )
546
+ for skill_name, meta in sorted(skills_metadata.items()):
547
+ description = meta.get("description", "No description")
548
+ triggers = meta.get("triggers", "")
549
+ entry = f"- **{skill_name}**: {description}"
550
+ if triggers:
551
+ entry += f" _(triggers: {triggers})_"
552
+ lines.append(entry)
553
+ sections.append("\n".join(lines))
554
+
555
  sections.extend([
556
  _CORE_EXAMPLES,
557
  _CORE_RESEARCH,
src/flow/tools/__init__.py CHANGED
@@ -50,10 +50,11 @@ Sub-agents:
50
 
51
  from __future__ import annotations
52
 
53
- import logging
54
  from pathlib import Path
55
  from typing import Any
56
 
 
 
57
  # Adapters for framework integration
58
  from .adapters import to_maf_tool, to_openai_tool, tools_to_maf, tools_to_openai
59
  from .base import Tool, tool
@@ -96,7 +97,7 @@ from .notebook import notebook_edit, notebook_read
96
  from .planning import think, todo_read, todo_write
97
 
98
  # Skills tools
99
- from .skills import create_skills_tool, skills
100
 
101
  # Sub-agent tools
102
  from .subagent import create_task_tool, task
@@ -148,6 +149,7 @@ __all__ = [
148
  # Skills tools
149
  "skills",
150
  "create_skills_tool",
 
151
  # Sub-agent tools
152
  "task",
153
  "create_task_tool",
@@ -175,8 +177,6 @@ __all__ = [
175
  "visual_inspector",
176
  ]
177
 
178
- logger = logging.getLogger(__name__)
179
-
180
 
181
  # =============================================================================
182
  # Tool Presets - Convenient groupings of tools
 
50
 
51
  from __future__ import annotations
52
 
 
53
  from pathlib import Path
54
  from typing import Any
55
 
56
+ from loguru import logger
57
+
58
  # Adapters for framework integration
59
  from .adapters import to_maf_tool, to_openai_tool, tools_to_maf, tools_to_openai
60
  from .base import Tool, tool
 
97
  from .planning import think, todo_read, todo_write
98
 
99
  # Skills tools
100
+ from .skills import create_skills_tool, discover_skills_from_tools_spec, skills
101
 
102
  # Sub-agent tools
103
  from .subagent import create_task_tool, task
 
149
  # Skills tools
150
  "skills",
151
  "create_skills_tool",
152
+ "discover_skills_from_tools_spec",
153
  # Sub-agent tools
154
  "task",
155
  "create_task_tool",
 
177
  "visual_inspector",
178
  ]
179
 
 
 
180
 
181
  # =============================================================================
182
  # Tool Presets - Convenient groupings of tools
src/flow/tools/adapters.py CHANGED
@@ -9,7 +9,6 @@ across different agent frameworks without code duplication.
9
 
10
  from __future__ import annotations
11
 
12
- import logging
13
  from typing import TYPE_CHECKING
14
 
15
  from .base import Tool
@@ -18,8 +17,6 @@ if TYPE_CHECKING:
18
  from collections.abc import Callable
19
  from typing import Any
20
 
21
- logger = logging.getLogger(__name__)
22
-
23
 
24
  def to_maf_tool(tool: Tool) -> Callable[..., Any]:
25
  """Convert a Flow Tool to a MAF-decorated function.
@@ -42,10 +39,7 @@ def to_maf_tool(tool: Tool) -> Callable[..., Any]:
42
  try:
43
  from agent_framework import tool as maf_tool
44
  except ImportError:
45
- raise ImportError(
46
- "Microsoft Agent Framework not installed. "
47
- "Install with: pip install agent-framework"
48
- )
49
 
50
  return maf_tool(
51
  name=tool.name,
 
9
 
10
  from __future__ import annotations
11
 
 
12
  from typing import TYPE_CHECKING
13
 
14
  from .base import Tool
 
17
  from collections.abc import Callable
18
  from typing import Any
19
 
 
 
20
 
21
  def to_maf_tool(tool: Tool) -> Callable[..., Any]:
22
  """Convert a Flow Tool to a MAF-decorated function.
 
39
  try:
40
  from agent_framework import tool as maf_tool
41
  except ImportError:
42
+ raise ImportError("Microsoft Agent Framework not installed. Install with: pip install agent-framework")
 
 
 
43
 
44
  return maf_tool(
45
  name=tool.name,
src/flow/tools/base.py CHANGED
@@ -15,6 +15,8 @@ from collections.abc import Callable
15
  from dataclasses import dataclass
16
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
17
 
 
 
18
 
19
  @dataclass
20
  class Tool:
@@ -128,7 +130,8 @@ def tool(func: Callable[..., Any]) -> Tool:
128
  # Get type hints (with extras for Annotated)
129
  try:
130
  hints = get_type_hints(func, include_extras=True)
131
- except Exception:
 
132
  hints = {}
133
 
134
  # Build JSON Schema for parameters
 
15
  from dataclasses import dataclass
16
  from typing import Annotated, Any, Literal, get_args, get_origin, get_type_hints
17
 
18
+ from loguru import logger
19
+
20
 
21
  @dataclass
22
  class Tool:
 
130
  # Get type hints (with extras for Annotated)
131
  try:
132
  hints = get_type_hints(func, include_extras=True)
133
+ except Exception as e:
134
+ logger.warning(f"Failed to get type hints for function {func.__name__}: {e}")
135
  hints = {}
136
 
137
  # Build JSON Schema for parameters
src/flow/tools/browsing.py CHANGED
@@ -26,7 +26,10 @@ def create_smol_web_search_tool(max_results: int = 10, engine: str = "duckduckgo
26
  """
27
  logger.info("Performing web search for query: {}", query)
28
  tool = WebSearchTool(max_results=max_results, engine=engine)
29
- return tool.forward(query=query)
 
 
 
30
 
31
  return smol_web_search
32
 
@@ -39,7 +42,11 @@ def wikipedia_search(
39
  """Searches Wikipedia and returns a summary or full text of the given topic, along with the page URL."""
40
  logger.info("Performing wikipedia search for query: {}", query)
41
  tool = WikipediaSearchTool(language=language)
42
- return tool.forward(query=query)
 
 
 
 
43
 
44
  def create_visit_webpage_tool(max_output_length: int = 40000) -> Tool:
45
  """Create a tool for visiting webpages and reading their content as markdown.
@@ -59,7 +66,10 @@ def create_visit_webpage_tool(max_output_length: int = 40000) -> Tool:
59
  """Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."""
60
  logger.info("Visiting webpage at URL: {}", url)
61
  tool = VisitWebpageTool(max_output_length=max_output_length)
62
- return tool.forward(url=url)
 
 
 
63
 
64
  return visit_webpage
65
 
 
26
  """
27
  logger.info("Performing web search for query: {}", query)
28
  tool = WebSearchTool(max_results=max_results, engine=engine)
29
+ output = tool.forward(query=query)
30
+ logger.debug("Web search output length: {}", len(output))
31
+ logger.debug("Web search output first 200 chars: {}", output[:200] if len(output) > 200 else output)
32
+ return output
33
 
34
  return smol_web_search
35
 
 
42
  """Searches Wikipedia and returns a summary or full text of the given topic, along with the page URL."""
43
  logger.info("Performing wikipedia search for query: {}", query)
44
  tool = WikipediaSearchTool(language=language)
45
+ output = tool.forward(query=query)
46
+ logger.debug("Wikipedia search output length: {}", len(output))
47
+ logger.debug("Wikipedia search output first 200 chars: {}", output[:200] if len(output) > 200 else output)
48
+ return output
49
+
50
 
51
  def create_visit_webpage_tool(max_output_length: int = 40000) -> Tool:
52
  """Create a tool for visiting webpages and reading their content as markdown.
 
66
  """Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."""
67
  logger.info("Visiting webpage at URL: {}", url)
68
  tool = VisitWebpageTool(max_output_length=max_output_length)
69
+ output = tool.forward(url=url)
70
+ logger.debug("Visit webpage output length: {}", len(output))
71
+ logger.debug("Visit webpage output first 200 chars: {}", output[:200] if len(output) > 200 else output)
72
+ return output
73
 
74
  return visit_webpage
75
 
src/flow/tools/coding.py CHANGED
@@ -8,6 +8,8 @@ import re
8
  from pathlib import Path
9
  from typing import Annotated
10
 
 
 
11
  from .base import tool
12
  from .workspace import get_workspace
13
 
@@ -35,13 +37,16 @@ def read_file(
35
 
36
  Returns the file content with line numbers for easy reference.
37
  """
 
38
  try:
39
  path_obj = _resolve_path(path)
40
 
41
  if not path_obj.exists():
 
42
  return f"Error: File not found: {path}"
43
 
44
  if not path_obj.is_file():
 
45
  return f"Error: Not a file: {path}"
46
 
47
  with open(path_obj, encoding="utf-8", errors="replace") as f:
@@ -66,9 +71,11 @@ def read_file(
66
  if start > 0 or end < total_lines:
67
  result += f"\n\n[Showing lines {start + 1}-{end} of {total_lines}]"
68
 
 
69
  return result
70
 
71
  except Exception as e:
 
72
  return f"Error reading file: {e!s}"
73
 
74
 
@@ -83,6 +90,7 @@ def write_file(
83
  Use this to create new files or completely replace file contents.
84
  For partial edits, use edit_file instead.
85
  """
 
86
  try:
87
  path_obj = _resolve_path(path)
88
 
@@ -95,9 +103,11 @@ def write_file(
95
  # Count lines for feedback
96
  line_count = content.count("\n") + (1 if content and not content.endswith("\n") else 0)
97
 
 
98
  return f"Successfully wrote {len(content)} characters ({line_count} lines) to {path}"
99
 
100
  except Exception as e:
 
101
  return f"Error writing file: {e!s}"
102
 
103
 
@@ -112,10 +122,12 @@ def edit_file(
112
  The old_string must appear exactly once in the file.
113
  For multiple replacements, call this tool multiple times.
114
  """
 
115
  try:
116
  path_obj = _resolve_path(path)
117
 
118
  if not path_obj.exists():
 
119
  return f"Error: File not found: {path}"
120
 
121
  with open(path_obj, encoding="utf-8") as f:
@@ -125,9 +137,11 @@ def edit_file(
125
  count = content.count(old_string)
126
 
127
  if count == 0:
 
128
  return f"Error: Could not find the specified text in {path}"
129
 
130
  if count > 1:
 
131
  return f"Error: Found {count} occurrences of the text. Please provide more context to make it unique."
132
 
133
  # Perform replacement
@@ -136,9 +150,11 @@ def edit_file(
136
  with open(path_obj, "w", encoding="utf-8") as f:
137
  f.write(new_content)
138
 
 
139
  return f"Successfully edited {path}"
140
 
141
  except Exception as e:
 
142
  return f"Error editing file: {e!s}"
143
 
144
 
@@ -152,13 +168,16 @@ def glob_files(
152
 
153
  Returns a list of matching file paths, sorted by modification time (newest first).
154
  """
 
155
  try:
156
  base_path = _resolve_path(path)
157
 
158
  if not base_path.exists():
 
159
  return f"Error: Directory not found: {path}"
160
 
161
  if not base_path.is_dir():
 
162
  return f"Error: Not a directory: {path}"
163
 
164
  # Find matching files
@@ -188,9 +207,11 @@ def glob_files(
188
  if len(matches) > limit:
189
  result += f"\n\n[Showing {limit} of {len(matches)} matches]"
190
 
 
191
  return result
192
 
193
  except Exception as e:
 
194
  return f"Error searching files: {e!s}"
195
 
196
 
@@ -206,6 +227,7 @@ def grep(
206
 
207
  Returns matching lines with file paths and line numbers.
208
  """
 
209
  try:
210
  base_path = _resolve_path(path)
211
  regex = re.compile(pattern)
@@ -216,10 +238,7 @@ def grep(
216
  files = [base_path]
217
  else:
218
  # Find files matching include pattern
219
- files = [
220
- p for p in base_path.rglob("*")
221
- if p.is_file() and fnmatch.fnmatch(p.name, include)
222
- ]
223
 
224
  for file_path in files:
225
  try:
@@ -247,13 +266,15 @@ def grep(
247
  if len(matches) >= limit:
248
  break
249
 
250
- except Exception:
 
251
  continue # Skip files that can't be read
252
 
253
  if len(matches) >= limit:
254
  break
255
 
256
  if not matches:
 
257
  return f"No matches found for pattern: {pattern}"
258
 
259
  result = "\n\n".join(matches)
@@ -261,11 +282,14 @@ def grep(
261
  if len(matches) >= limit:
262
  result += f"\n\n[Results limited to {limit} matches]"
263
 
 
264
  return result
265
 
266
  except re.error as e:
 
267
  return f"Error: Invalid regex pattern: {e!s}"
268
  except Exception as e:
 
269
  return f"Error searching: {e!s}"
270
 
271
 
@@ -279,13 +303,16 @@ def ls(
279
 
280
  Returns a formatted listing of directory contents.
281
  """
 
282
  try:
283
  dir_path = _resolve_path(path)
284
 
285
  if not dir_path.exists():
 
286
  return f"Error: Path not found: {path}"
287
 
288
  if not dir_path.is_dir():
 
289
  return f"Error: Not a directory: {path}"
290
 
291
  entries = list(dir_path.iterdir())
@@ -309,27 +336,31 @@ def ls(
309
  if size < 1024:
310
  size_str = f"{size:>6}B"
311
  elif size < 1024 * 1024:
312
- size_str = f"{size/1024:>6.1f}K"
313
  else:
314
- size_str = f"{size/(1024*1024):>6.1f}M"
315
 
316
  # Format time
317
  from datetime import datetime
 
318
  mtime = datetime.fromtimestamp(stat.st_mtime)
319
  time_str = mtime.strftime("%Y-%m-%d %H:%M")
320
 
321
  type_char = "d" if entry.is_dir() else "-"
322
  name = entry.name + ("/" if entry.is_dir() else "")
323
  output_lines.append(f"{type_char} {size_str} {time_str} {name}")
324
- except Exception:
 
325
  output_lines.append(entry.name)
326
  else:
327
  name = entry.name + ("/" if entry.is_dir() else "")
328
  output_lines.append(name)
329
 
 
330
  return "\n".join(output_lines)
331
 
332
  except Exception as e:
 
333
  return f"Error listing directory: {e!s}"
334
 
335
 
@@ -346,10 +377,12 @@ def multi_edit(
346
 
347
  Edits are applied sequentially, so later edits see the result of earlier ones.
348
  """
 
349
  try:
350
  path_obj = _resolve_path(path)
351
 
352
  if not path_obj.exists():
 
353
  return f"Error: File not found: {path}"
354
 
355
  with open(path_obj, encoding="utf-8") as f:
@@ -358,7 +391,7 @@ def multi_edit(
358
  # Validate all edits first
359
  for i, edit in enumerate(edits):
360
  if "old_string" not in edit or "new_string" not in edit:
361
- return f"Error: Edit {i+1} missing 'old_string' or 'new_string'"
362
 
363
  # Apply edits sequentially
364
  applied: list[str] = []
@@ -371,26 +404,28 @@ def multi_edit(
371
  if count == 0:
372
  # Rollback - restore original
373
  return (
374
- f"Error: Edit {i+1} failed - could not find text.\n"
375
  f"Applied {len(applied)} edit(s) before failure.\n"
376
  f"File unchanged (atomic rollback)."
377
  )
378
 
379
  if count > 1:
380
  return (
381
- f"Error: Edit {i+1} failed - found {count} occurrences.\n"
382
  f"Applied {len(applied)} edit(s) before failure.\n"
383
  f"File unchanged (atomic rollback)."
384
  )
385
 
386
  content = content.replace(old_str, new_str, 1)
387
- applied.append(f"Edit {i+1}: replaced {len(old_str)} chars with {len(new_str)} chars")
388
 
389
  # All edits succeeded - write the file
390
  with open(path_obj, "w", encoding="utf-8") as f:
391
  f.write(content)
392
 
 
393
  return f"Successfully applied {len(edits)} edit(s) to {path}:\n" + "\n".join(applied)
394
 
395
  except Exception as e:
 
396
  return f"Error editing file: {e!s}"
 
8
  from pathlib import Path
9
  from typing import Annotated
10
 
11
+ from loguru import logger
12
+
13
  from .base import tool
14
  from .workspace import get_workspace
15
 
 
37
 
38
  Returns the file content with line numbers for easy reference.
39
  """
40
+ logger.debug(f"read_file: path={path}, offset={offset}, limit={limit}")
41
  try:
42
  path_obj = _resolve_path(path)
43
 
44
  if not path_obj.exists():
45
+ logger.warning(f"read_file: file not found: {path}")
46
  return f"Error: File not found: {path}"
47
 
48
  if not path_obj.is_file():
49
+ logger.warning(f"read_file: not a file: {path}")
50
  return f"Error: Not a file: {path}"
51
 
52
  with open(path_obj, encoding="utf-8", errors="replace") as f:
 
71
  if start > 0 or end < total_lines:
72
  result += f"\n\n[Showing lines {start + 1}-{end} of {total_lines}]"
73
 
74
+ logger.debug(f"read_file: read {end - start} lines from {path}")
75
  return result
76
 
77
  except Exception as e:
78
+ logger.warning(f"read_file: error reading {path}: {e}")
79
  return f"Error reading file: {e!s}"
80
 
81
 
 
90
  Use this to create new files or completely replace file contents.
91
  For partial edits, use edit_file instead.
92
  """
93
+ logger.info(f"write_file: writing to {path}")
94
  try:
95
  path_obj = _resolve_path(path)
96
 
 
103
  # Count lines for feedback
104
  line_count = content.count("\n") + (1 if content and not content.endswith("\n") else 0)
105
 
106
+ logger.debug(f"write_file: wrote {len(content)} chars ({line_count} lines) to {path}")
107
  return f"Successfully wrote {len(content)} characters ({line_count} lines) to {path}"
108
 
109
  except Exception as e:
110
+ logger.warning(f"write_file: error writing to {path}: {e}")
111
  return f"Error writing file: {e!s}"
112
 
113
 
 
122
  The old_string must appear exactly once in the file.
123
  For multiple replacements, call this tool multiple times.
124
  """
125
+ logger.info(f"edit_file: editing {path}")
126
  try:
127
  path_obj = _resolve_path(path)
128
 
129
  if not path_obj.exists():
130
+ logger.warning(f"edit_file: file not found: {path}")
131
  return f"Error: File not found: {path}"
132
 
133
  with open(path_obj, encoding="utf-8") as f:
 
137
  count = content.count(old_string)
138
 
139
  if count == 0:
140
+ logger.warning(f"edit_file: text not found in {path}")
141
  return f"Error: Could not find the specified text in {path}"
142
 
143
  if count > 1:
144
+ logger.warning(f"edit_file: found {count} occurrences in {path}, expected 1")
145
  return f"Error: Found {count} occurrences of the text. Please provide more context to make it unique."
146
 
147
  # Perform replacement
 
150
  with open(path_obj, "w", encoding="utf-8") as f:
151
  f.write(new_content)
152
 
153
+ logger.debug(f"edit_file: successfully edited {path}")
154
  return f"Successfully edited {path}"
155
 
156
  except Exception as e:
157
+ logger.warning(f"edit_file: error editing {path}: {e}")
158
  return f"Error editing file: {e!s}"
159
 
160
 
 
168
 
169
  Returns a list of matching file paths, sorted by modification time (newest first).
170
  """
171
+ logger.debug(f"glob_files: pattern={pattern}, path={path}, limit={limit}")
172
  try:
173
  base_path = _resolve_path(path)
174
 
175
  if not base_path.exists():
176
+ logger.warning(f"glob_files: directory not found: {path}")
177
  return f"Error: Directory not found: {path}"
178
 
179
  if not base_path.is_dir():
180
+ logger.warning(f"glob_files: not a directory: {path}")
181
  return f"Error: Not a directory: {path}"
182
 
183
  # Find matching files
 
207
  if len(matches) > limit:
208
  result += f"\n\n[Showing {limit} of {len(matches)} matches]"
209
 
210
+ logger.debug(f"glob_files: found {len(files)} files matching '{pattern}'")
211
  return result
212
 
213
  except Exception as e:
214
+ logger.warning(f"glob_files: error searching: {e}")
215
  return f"Error searching files: {e!s}"
216
 
217
 
 
227
 
228
  Returns matching lines with file paths and line numbers.
229
  """
230
+ logger.debug(f"grep: pattern='{pattern}', path={path}, include={include}")
231
  try:
232
  base_path = _resolve_path(path)
233
  regex = re.compile(pattern)
 
238
  files = [base_path]
239
  else:
240
  # Find files matching include pattern
241
+ files = [p for p in base_path.rglob("*") if p.is_file() and fnmatch.fnmatch(p.name, include)]
 
 
 
242
 
243
  for file_path in files:
244
  try:
 
266
  if len(matches) >= limit:
267
  break
268
 
269
+ except Exception as e:
270
+ logger.debug(f"grep: skipping unreadable file {file_path}: {e}")
271
  continue # Skip files that can't be read
272
 
273
  if len(matches) >= limit:
274
  break
275
 
276
  if not matches:
277
+ logger.debug(f"grep: no matches found for pattern '{pattern}'")
278
  return f"No matches found for pattern: {pattern}"
279
 
280
  result = "\n\n".join(matches)
 
282
  if len(matches) >= limit:
283
  result += f"\n\n[Results limited to {limit} matches]"
284
 
285
+ logger.debug(f"grep: found {len(matches)} matches for pattern '{pattern}'")
286
  return result
287
 
288
  except re.error as e:
289
+ logger.warning(f"grep: invalid regex '{pattern}': {e}")
290
  return f"Error: Invalid regex pattern: {e!s}"
291
  except Exception as e:
292
+ logger.warning(f"grep: error searching: {e}")
293
  return f"Error searching: {e!s}"
294
 
295
 
 
303
 
304
  Returns a formatted listing of directory contents.
305
  """
306
+ logger.debug(f"ls: path={path}, show_hidden={show_hidden}, long_format={long_format}")
307
  try:
308
  dir_path = _resolve_path(path)
309
 
310
  if not dir_path.exists():
311
+ logger.warning(f"ls: path not found: {path}")
312
  return f"Error: Path not found: {path}"
313
 
314
  if not dir_path.is_dir():
315
+ logger.warning(f"ls: not a directory: {path}")
316
  return f"Error: Not a directory: {path}"
317
 
318
  entries = list(dir_path.iterdir())
 
336
  if size < 1024:
337
  size_str = f"{size:>6}B"
338
  elif size < 1024 * 1024:
339
+ size_str = f"{size / 1024:>6.1f}K"
340
  else:
341
+ size_str = f"{size / (1024 * 1024):>6.1f}M"
342
 
343
  # Format time
344
  from datetime import datetime
345
+
346
  mtime = datetime.fromtimestamp(stat.st_mtime)
347
  time_str = mtime.strftime("%Y-%m-%d %H:%M")
348
 
349
  type_char = "d" if entry.is_dir() else "-"
350
  name = entry.name + ("/" if entry.is_dir() else "")
351
  output_lines.append(f"{type_char} {size_str} {time_str} {name}")
352
+ except Exception as e:
353
+ logger.debug(f"ls: failed to stat entry {entry.name}: {e}")
354
  output_lines.append(entry.name)
355
  else:
356
  name = entry.name + ("/" if entry.is_dir() else "")
357
  output_lines.append(name)
358
 
359
+ logger.debug(f"ls: listed {len(output_lines)} entries in {path}")
360
  return "\n".join(output_lines)
361
 
362
  except Exception as e:
363
+ logger.warning(f"ls: error listing {path}: {e}")
364
  return f"Error listing directory: {e!s}"
365
 
366
 
 
377
 
378
  Edits are applied sequentially, so later edits see the result of earlier ones.
379
  """
380
+ logger.info(f"multi_edit: applying {len(edits)} edits to {path}")
381
  try:
382
  path_obj = _resolve_path(path)
383
 
384
  if not path_obj.exists():
385
+ logger.warning(f"multi_edit: file not found: {path}")
386
  return f"Error: File not found: {path}"
387
 
388
  with open(path_obj, encoding="utf-8") as f:
 
391
  # Validate all edits first
392
  for i, edit in enumerate(edits):
393
  if "old_string" not in edit or "new_string" not in edit:
394
+ return f"Error: Edit {i + 1} missing 'old_string' or 'new_string'"
395
 
396
  # Apply edits sequentially
397
  applied: list[str] = []
 
404
  if count == 0:
405
  # Rollback - restore original
406
  return (
407
+ f"Error: Edit {i + 1} failed - could not find text.\n"
408
  f"Applied {len(applied)} edit(s) before failure.\n"
409
  f"File unchanged (atomic rollback)."
410
  )
411
 
412
  if count > 1:
413
  return (
414
+ f"Error: Edit {i + 1} failed - found {count} occurrences.\n"
415
  f"Applied {len(applied)} edit(s) before failure.\n"
416
  f"File unchanged (atomic rollback)."
417
  )
418
 
419
  content = content.replace(old_str, new_str, 1)
420
+ applied.append(f"Edit {i + 1}: replaced {len(old_str)} chars with {len(new_str)} chars")
421
 
422
  # All edits succeeded - write the file
423
  with open(path_obj, "w", encoding="utf-8") as f:
424
  f.write(content)
425
 
426
+ logger.debug(f"multi_edit: successfully applied {len(edits)} edits to {path}")
427
  return f"Successfully applied {len(edits)} edit(s) to {path}:\n" + "\n".join(applied)
428
 
429
  except Exception as e:
430
+ logger.warning(f"multi_edit: error editing {path}: {e}")
431
  return f"Error editing file: {e!s}"
src/flow/tools/execution.py CHANGED
@@ -6,6 +6,8 @@ Execute shell commands and manage processes.
6
  import subprocess
7
  from typing import Annotated
8
 
 
 
9
  from .base import tool
10
  from .workspace import get_workspace
11
 
@@ -21,6 +23,8 @@ def bash(
21
  Use this to run shell commands, scripts, or system utilities.
22
  Be careful with destructive commands.
23
  """
 
 
24
  try:
25
  # Default to workspace root so concurrent tasks don't share process cwd
26
  effective_cwd = cwd if cwd is not None else str(get_workspace().root)
@@ -40,13 +44,18 @@ def bash(
40
  output += result.stderr
41
 
42
  if result.returncode != 0:
 
43
  output += f"\n[Exit code: {result.returncode}]"
 
 
44
 
45
  return output.strip() if output else "(No output)"
46
 
47
  except subprocess.TimeoutExpired:
 
48
  return f"Error: Command timed out after {timeout} seconds"
49
  except Exception as e:
 
50
  return f"Error executing command: {e!s}"
51
 
52
 
@@ -64,6 +73,8 @@ def check_processes(
64
  import os
65
  import signal
66
 
 
 
67
  if action == "list":
68
  try:
69
  # Use ps to list processes
@@ -73,8 +84,10 @@ def check_processes(
73
  text=True,
74
  timeout=10,
75
  )
 
76
  return result.stdout if result.stdout else "No processes found"
77
  except Exception as e:
 
78
  return f"Error listing processes: {e!s}"
79
 
80
  elif action == "kill":
@@ -82,14 +95,17 @@ def check_processes(
82
  return "Error: PID required for 'kill' action"
83
  try:
84
  os.kill(pid, signal.SIGTERM)
 
85
  return f"Sent SIGTERM to process {pid}"
86
  except ProcessLookupError:
 
87
  return f"Error: Process {pid} not found"
88
  except PermissionError:
 
89
  return f"Error: Permission denied to kill process {pid}"
90
  except Exception as e:
 
91
  return f"Error killing process: {e!s}"
92
 
93
  else:
94
  return f"Unknown action: {action}. Use 'list' or 'kill'."
95
-
 
6
  import subprocess
7
  from typing import Annotated
8
 
9
+ from loguru import logger
10
+
11
  from .base import tool
12
  from .workspace import get_workspace
13
 
 
23
  Use this to run shell commands, scripts, or system utilities.
24
  Be careful with destructive commands.
25
  """
26
+ logger.info(f"bash: executing command (timeout={timeout}s, cwd={cwd})")
27
+ logger.debug(f"bash: command={command[:200] if len(command) > 200 else command}")
28
  try:
29
  # Default to workspace root so concurrent tasks don't share process cwd
30
  effective_cwd = cwd if cwd is not None else str(get_workspace().root)
 
44
  output += result.stderr
45
 
46
  if result.returncode != 0:
47
+ logger.debug(f"bash: command exited with code {result.returncode}")
48
  output += f"\n[Exit code: {result.returncode}]"
49
+ else:
50
+ logger.debug("bash: command completed successfully")
51
 
52
  return output.strip() if output else "(No output)"
53
 
54
  except subprocess.TimeoutExpired:
55
+ logger.warning(f"bash: command timed out after {timeout}s")
56
  return f"Error: Command timed out after {timeout} seconds"
57
  except Exception as e:
58
+ logger.warning(f"bash: error executing command: {e}")
59
  return f"Error executing command: {e!s}"
60
 
61
 
 
73
  import os
74
  import signal
75
 
76
+ logger.debug(f"check_processes: action={action}, pid={pid}")
77
+
78
  if action == "list":
79
  try:
80
  # Use ps to list processes
 
84
  text=True,
85
  timeout=10,
86
  )
87
+ logger.debug("check_processes: listed processes")
88
  return result.stdout if result.stdout else "No processes found"
89
  except Exception as e:
90
+ logger.warning(f"check_processes: error listing: {e}")
91
  return f"Error listing processes: {e!s}"
92
 
93
  elif action == "kill":
 
95
  return "Error: PID required for 'kill' action"
96
  try:
97
  os.kill(pid, signal.SIGTERM)
98
+ logger.info(f"check_processes: sent SIGTERM to pid {pid}")
99
  return f"Sent SIGTERM to process {pid}"
100
  except ProcessLookupError:
101
+ logger.warning(f"check_processes: process {pid} not found")
102
  return f"Error: Process {pid} not found"
103
  except PermissionError:
104
+ logger.warning(f"check_processes: permission denied for pid {pid}")
105
  return f"Error: Permission denied to kill process {pid}"
106
  except Exception as e:
107
+ logger.warning(f"check_processes: error killing {pid}: {e}")
108
  return f"Error killing process: {e!s}"
109
 
110
  else:
111
  return f"Unknown action: {action}. Use 'list' or 'kill'."
 
src/flow/tools/memory.py CHANGED
@@ -24,6 +24,8 @@ import uuid
24
  from datetime import datetime
25
  from typing import Annotated, Any, Literal
26
 
 
 
27
  from .base import Tool, tool
28
  from .workspace import Workspace, get_workspace
29
 
@@ -76,6 +78,7 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
76
  Returns:
77
  A Tool instance for memory operations.
78
  """
 
79
  def get_ws() -> Workspace:
80
  if workspace is not None:
81
  return workspace
@@ -85,20 +88,13 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
85
  def memory(
86
  action: Annotated[
87
  Literal["store", "recall", "list", "forget"],
88
- "Action: store (save), recall (search), list (show all), forget (delete)"
89
  ],
90
  content: Annotated[
91
- str,
92
- "For 'store': info to remember. For 'recall': search query. For 'forget': memory ID."
93
- ] = "",
94
- key: Annotated[
95
- str,
96
- "Optional short identifier (e.g., 'user_preferences', 'project_structure')"
97
  ] = "",
98
- tags: Annotated[
99
- list[str],
100
- "Optional tags for categorization (e.g., ['important', 'user-info'])"
101
- ] = [],
102
  ) -> str:
103
  """Store and retrieve information across sessions.
104
 
@@ -132,38 +128,44 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
132
  }
133
  ws.save_memory(memory_id, data)
134
 
 
135
  return f"Stored memory '{data['key']}' (id: {memory_id}) in {ws.memory_dir}"
136
 
137
  elif action == "recall":
138
  if not content:
139
  return "Error: content (search query) is required for 'recall' action"
140
 
 
141
  results = _search_memories(ws, content)
142
 
143
  if not results:
 
144
  return f"No memories found matching '{content}'"
145
 
146
  output = [f"Found {len(results)} memory(ies) matching '{content}':\n"]
147
  for mem in results:
148
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
149
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {mem['content'][:200]}")
150
- if len(mem['content']) > 200:
151
  output[-1] += "..."
152
 
 
153
  return "\n".join(output)
154
 
155
  elif action == "list":
156
  memories = ws.list_memories()
157
 
158
  if not memories:
 
159
  return f"No memories stored in {ws.memory_dir}"
160
 
161
  output = [f"Stored memories ({len(memories)} total):\n"]
162
  for mem in sorted(memories, key=lambda m: m.get("created_at", ""), reverse=True):
163
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
164
- preview = mem['content'][:100] + "..." if len(mem['content']) > 100 else mem['content']
165
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {preview}")
166
 
 
167
  return "\n".join(output)
168
 
169
  elif action == "forget":
@@ -171,8 +173,10 @@ def create_memory_tool(workspace: Workspace | None = None) -> Tool:
171
  return "Error: content (memory ID) is required for 'forget' action"
172
 
173
  if ws.delete_memory(content):
 
174
  return f"Deleted memory with id: {content}"
175
  else:
 
176
  return f"No memory found with id: {content}"
177
 
178
  else:
 
24
  from datetime import datetime
25
  from typing import Annotated, Any, Literal
26
 
27
+ from loguru import logger
28
+
29
  from .base import Tool, tool
30
  from .workspace import Workspace, get_workspace
31
 
 
78
  Returns:
79
  A Tool instance for memory operations.
80
  """
81
+
82
  def get_ws() -> Workspace:
83
  if workspace is not None:
84
  return workspace
 
88
  def memory(
89
  action: Annotated[
90
  Literal["store", "recall", "list", "forget"],
91
+ "Action: store (save), recall (search), list (show all), forget (delete)",
92
  ],
93
  content: Annotated[
94
+ str, "For 'store': info to remember. For 'recall': search query. For 'forget': memory ID."
 
 
 
 
 
95
  ] = "",
96
+ key: Annotated[str, "Optional short identifier (e.g., 'user_preferences', 'project_structure')"] = "",
97
+ tags: Annotated[list[str], "Optional tags for categorization (e.g., ['important', 'user-info'])"] = [],
 
 
98
  ) -> str:
99
  """Store and retrieve information across sessions.
100
 
 
128
  }
129
  ws.save_memory(memory_id, data)
130
 
131
+ logger.info(f"memory: stored '{data['key']}' (id: {memory_id}) with {len(content)} chars")
132
  return f"Stored memory '{data['key']}' (id: {memory_id}) in {ws.memory_dir}"
133
 
134
  elif action == "recall":
135
  if not content:
136
  return "Error: content (search query) is required for 'recall' action"
137
 
138
+ logger.debug(f"memory: recalling with query '{content}'")
139
  results = _search_memories(ws, content)
140
 
141
  if not results:
142
+ logger.debug(f"memory: no matches found for '{content}'")
143
  return f"No memories found matching '{content}'"
144
 
145
  output = [f"Found {len(results)} memory(ies) matching '{content}':\n"]
146
  for mem in results:
147
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
148
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {mem['content'][:200]}")
149
+ if len(mem["content"]) > 200:
150
  output[-1] += "..."
151
 
152
+ logger.debug(f"memory: found {len(results)} memories matching '{content}'")
153
  return "\n".join(output)
154
 
155
  elif action == "list":
156
  memories = ws.list_memories()
157
 
158
  if not memories:
159
+ logger.debug("memory: no memories stored")
160
  return f"No memories stored in {ws.memory_dir}"
161
 
162
  output = [f"Stored memories ({len(memories)} total):\n"]
163
  for mem in sorted(memories, key=lambda m: m.get("created_at", ""), reverse=True):
164
  tags_str = f" [{', '.join(mem['tags'])}]" if mem.get("tags") else ""
165
+ preview = mem["content"][:100] + "..." if len(mem["content"]) > 100 else mem["content"]
166
  output.append(f"- [{mem['id']}] {mem['key']}{tags_str}: {preview}")
167
 
168
+ logger.debug(f"memory: listed {len(memories)} memories")
169
  return "\n".join(output)
170
 
171
  elif action == "forget":
 
173
  return "Error: content (memory ID) is required for 'forget' action"
174
 
175
  if ws.delete_memory(content):
176
+ logger.info(f"memory: deleted memory with id '{content}'")
177
  return f"Deleted memory with id: {content}"
178
  else:
179
+ logger.warning(f"memory: no memory found with id '{content}'")
180
  return f"No memory found with id: {content}"
181
 
182
  else:
src/flow/tools/notebook.py CHANGED
@@ -7,6 +7,8 @@ import json
7
  from pathlib import Path
8
  from typing import Annotated, Any, Literal
9
 
 
 
10
  from .base import tool
11
 
12
 
@@ -27,13 +29,16 @@ def notebook_edit(
27
 
28
  For insert mode, cell_type is required.
29
  """
 
30
  try:
31
  path_obj = Path(path).expanduser().resolve()
32
 
33
  if not path_obj.exists():
 
34
  return f"Error: Notebook not found: {path}"
35
 
36
  if not path_obj.suffix == ".ipynb":
 
37
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
38
 
39
  # Read notebook
@@ -53,6 +58,7 @@ def notebook_edit(
53
  with open(path_obj, "w", encoding="utf-8") as f:
54
  json.dump(notebook, f, indent=1)
55
 
 
56
  return f"Successfully deleted {deleted_type} cell at index {cell_index}"
57
 
58
  elif edit_mode == "insert":
@@ -79,6 +85,7 @@ def notebook_edit(
79
  with open(path_obj, "w", encoding="utf-8") as f:
80
  json.dump(notebook, f, indent=1)
81
 
 
82
  return f"Successfully inserted {cell_type} cell at index {cell_index}"
83
 
84
  else: # replace
@@ -107,11 +114,14 @@ def notebook_edit(
107
  with open(path_obj, "w", encoding="utf-8") as f:
108
  json.dump(notebook, f, indent=1)
109
 
 
110
  return f"Successfully replaced cell at index {cell_index}"
111
 
112
  except json.JSONDecodeError as e:
 
113
  return f"Error: Invalid notebook JSON: {e!s}"
114
  except Exception as e:
 
115
  return f"Error editing notebook: {e!s}"
116
 
117
 
@@ -125,13 +135,16 @@ def notebook_read(
125
 
126
  Returns formatted cell contents with indices for easy reference.
127
  """
 
128
  try:
129
  path_obj = Path(path).expanduser().resolve()
130
 
131
  if not path_obj.exists():
 
132
  return f"Error: Notebook not found: {path}"
133
 
134
  if not path_obj.suffix == ".ipynb":
 
135
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
136
 
137
  with open(path_obj, encoding="utf-8") as f:
@@ -179,7 +192,11 @@ def notebook_read(
179
  data = output.get("data", {})
180
  if isinstance(data, dict) and "text/plain" in data:
181
  plain_data = data["text/plain"]
182
- text = "".join(str(t) for t in plain_data) if isinstance(plain_data, list) else str(plain_data)
 
 
 
 
183
  output_texts.append(f"[result]\n{text}")
184
  elif output.get("output_type") == "error":
185
  ename = str(output.get("ename", "Error"))
@@ -195,9 +212,12 @@ def notebook_read(
195
  if cell_index is None:
196
  result = f"Notebook: {path} ({len(notebook.get('cells', []))} cells)\n\n" + result
197
 
 
198
  return result
199
 
200
  except json.JSONDecodeError as e:
 
201
  return f"Error: Invalid notebook JSON: {e!s}"
202
  except Exception as e:
 
203
  return f"Error reading notebook: {e!s}"
 
7
  from pathlib import Path
8
  from typing import Annotated, Any, Literal
9
 
10
+ from loguru import logger
11
+
12
  from .base import tool
13
 
14
 
 
29
 
30
  For insert mode, cell_type is required.
31
  """
32
+ logger.info(f"notebook_edit: {edit_mode} cell at index {cell_index} in {path}")
33
  try:
34
  path_obj = Path(path).expanduser().resolve()
35
 
36
  if not path_obj.exists():
37
+ logger.warning(f"notebook_edit: notebook not found: {path}")
38
  return f"Error: Notebook not found: {path}"
39
 
40
  if not path_obj.suffix == ".ipynb":
41
+ logger.warning(f"notebook_edit: not a notebook file: {path}")
42
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
43
 
44
  # Read notebook
 
58
  with open(path_obj, "w", encoding="utf-8") as f:
59
  json.dump(notebook, f, indent=1)
60
 
61
+ logger.debug(f"notebook_edit: deleted {deleted_type} cell at index {cell_index}")
62
  return f"Successfully deleted {deleted_type} cell at index {cell_index}"
63
 
64
  elif edit_mode == "insert":
 
85
  with open(path_obj, "w", encoding="utf-8") as f:
86
  json.dump(notebook, f, indent=1)
87
 
88
+ logger.debug(f"notebook_edit: inserted {cell_type} cell at index {cell_index}")
89
  return f"Successfully inserted {cell_type} cell at index {cell_index}"
90
 
91
  else: # replace
 
114
  with open(path_obj, "w", encoding="utf-8") as f:
115
  json.dump(notebook, f, indent=1)
116
 
117
+ logger.debug(f"notebook_edit: replaced cell at index {cell_index}")
118
  return f"Successfully replaced cell at index {cell_index}"
119
 
120
  except json.JSONDecodeError as e:
121
+ logger.warning(f"notebook_edit: invalid JSON in {path}: {e}")
122
  return f"Error: Invalid notebook JSON: {e!s}"
123
  except Exception as e:
124
+ logger.warning(f"notebook_edit: error editing {path}: {e}")
125
  return f"Error editing notebook: {e!s}"
126
 
127
 
 
135
 
136
  Returns formatted cell contents with indices for easy reference.
137
  """
138
+ logger.debug(f"notebook_read: path={path}, cell_index={cell_index}, include_outputs={include_outputs}")
139
  try:
140
  path_obj = Path(path).expanduser().resolve()
141
 
142
  if not path_obj.exists():
143
+ logger.warning(f"notebook_read: notebook not found: {path}")
144
  return f"Error: Notebook not found: {path}"
145
 
146
  if not path_obj.suffix == ".ipynb":
147
+ logger.warning(f"notebook_read: not a notebook file: {path}")
148
  return f"Error: Not a Jupyter notebook (must be .ipynb): {path}"
149
 
150
  with open(path_obj, encoding="utf-8") as f:
 
192
  data = output.get("data", {})
193
  if isinstance(data, dict) and "text/plain" in data:
194
  plain_data = data["text/plain"]
195
+ text = (
196
+ "".join(str(t) for t in plain_data)
197
+ if isinstance(plain_data, list)
198
+ else str(plain_data)
199
+ )
200
  output_texts.append(f"[result]\n{text}")
201
  elif output.get("output_type") == "error":
202
  ename = str(output.get("ename", "Error"))
 
212
  if cell_index is None:
213
  result = f"Notebook: {path} ({len(notebook.get('cells', []))} cells)\n\n" + result
214
 
215
+ logger.debug(f"notebook_read: read {len(cells_list)} cells from {path}")
216
  return result
217
 
218
  except json.JSONDecodeError as e:
219
+ logger.warning(f"notebook_read: invalid JSON in {path}: {e}")
220
  return f"Error: Invalid notebook JSON: {e!s}"
221
  except Exception as e:
222
+ logger.warning(f"notebook_read: error reading {path}: {e}")
223
  return f"Error reading notebook: {e!s}"
src/flow/tools/planning.py CHANGED
@@ -6,6 +6,8 @@ Todos are persisted to the workspace's .flow/todos.json file.
6
 
7
  from typing import Annotated, Any
8
 
 
 
9
  from .base import tool
10
  from .workspace import Workspace, get_workspace
11
 
@@ -34,6 +36,7 @@ def think(
34
 
35
  Your thought is recorded in conversation history for reference.
36
  """
 
37
  # The thought is recorded in the tool result, becoming part of context
38
  return "Thought recorded."
39
 
@@ -42,7 +45,7 @@ def think(
42
  def todo_write(
43
  todos: Annotated[
44
  list[dict[str, Any]],
45
- "List of todo items. Each item needs: content (str), status ('pending'|'in_progress'|'completed'), activeForm (str describing current action)"
46
  ],
47
  ) -> str:
48
  """Create or update the task list for this session.
@@ -75,17 +78,18 @@ def todo_write(
75
  valid_statuses = {"pending", "in_progress", "completed"}
76
  for i, todo in enumerate(todos):
77
  if "content" not in todo:
78
- return f"Error: Todo {i+1} missing 'content'"
79
  if "status" not in todo:
80
- return f"Error: Todo {i+1} missing 'status'"
81
  if todo["status"] not in valid_statuses:
82
- return f"Error: Todo {i+1} has invalid status '{todo['status']}'. Must be: {valid_statuses}"
83
  if "activeForm" not in todo:
84
- return f"Error: Todo {i+1} missing 'activeForm'"
85
 
86
  # Check only one in_progress
87
  in_progress_count = sum(1 for t in todos if t["status"] == "in_progress")
88
  if in_progress_count > 1:
 
89
  return f"Error: {in_progress_count} tasks marked 'in_progress'. Only one task should be in progress at a time."
90
 
91
  # Save to workspace
@@ -100,6 +104,7 @@ def todo_write(
100
  current = next((t for t in todos if t["status"] == "in_progress"), None)
101
  current_msg = f"Current: {current['activeForm']}" if current else "No task in progress"
102
 
 
103
  return f"Todo list updated: {completed} completed, {in_progress} in progress, {pending} pending. {current_msg}"
104
 
105
 
@@ -110,10 +115,12 @@ def todo_read() -> str:
110
  Returns the current state of all tasks with their status.
111
  Todos are loaded from {workspace}/.flow/todos.json
112
  """
 
113
  ws = _get_workspace()
114
  todos = ws.load_todos()
115
 
116
  if not todos:
 
117
  return "No todos. Use todo_write to create a task list."
118
 
119
  lines: list[str] = []
@@ -133,6 +140,7 @@ def todo_read() -> str:
133
  completed = sum(1 for t in todos if t["status"] == "completed")
134
  total = len(todos)
135
 
 
136
  return f"Progress: {completed}/{total}\n\n" + "\n".join(lines)
137
 
138
 
 
6
 
7
  from typing import Annotated, Any
8
 
9
+ from loguru import logger
10
+
11
  from .base import tool
12
  from .workspace import Workspace, get_workspace
13
 
 
36
 
37
  Your thought is recorded in conversation history for reference.
38
  """
39
+ logger.debug(f"think: recording thought ({len(thought)} chars)")
40
  # The thought is recorded in the tool result, becoming part of context
41
  return "Thought recorded."
42
 
 
45
  def todo_write(
46
  todos: Annotated[
47
  list[dict[str, Any]],
48
+ "List of todo items. Each item needs: content (str), status ('pending'|'in_progress'|'completed'), activeForm (str describing current action)",
49
  ],
50
  ) -> str:
51
  """Create or update the task list for this session.
 
78
  valid_statuses = {"pending", "in_progress", "completed"}
79
  for i, todo in enumerate(todos):
80
  if "content" not in todo:
81
+ return f"Error: Todo {i + 1} missing 'content'"
82
  if "status" not in todo:
83
+ return f"Error: Todo {i + 1} missing 'status'"
84
  if todo["status"] not in valid_statuses:
85
+ return f"Error: Todo {i + 1} has invalid status '{todo['status']}'. Must be: {valid_statuses}"
86
  if "activeForm" not in todo:
87
+ return f"Error: Todo {i + 1} missing 'activeForm'"
88
 
89
  # Check only one in_progress
90
  in_progress_count = sum(1 for t in todos if t["status"] == "in_progress")
91
  if in_progress_count > 1:
92
+ logger.warning(f"todo_write: {in_progress_count} tasks marked in_progress, should be 1")
93
  return f"Error: {in_progress_count} tasks marked 'in_progress'. Only one task should be in progress at a time."
94
 
95
  # Save to workspace
 
104
  current = next((t for t in todos if t["status"] == "in_progress"), None)
105
  current_msg = f"Current: {current['activeForm']}" if current else "No task in progress"
106
 
107
+ logger.info(f"todo_write: updated ({completed} completed, {in_progress} in progress, {pending} pending)")
108
  return f"Todo list updated: {completed} completed, {in_progress} in progress, {pending} pending. {current_msg}"
109
 
110
 
 
115
  Returns the current state of all tasks with their status.
116
  Todos are loaded from {workspace}/.flow/todos.json
117
  """
118
+ logger.debug("todo_read: loading todos")
119
  ws = _get_workspace()
120
  todos = ws.load_todos()
121
 
122
  if not todos:
123
+ logger.debug("todo_read: no todos found")
124
  return "No todos. Use todo_write to create a task list."
125
 
126
  lines: list[str] = []
 
140
  completed = sum(1 for t in todos if t["status"] == "completed")
141
  total = len(todos)
142
 
143
+ logger.debug(f"todo_read: loaded {total} todos ({completed} completed)")
144
  return f"Progress: {completed}/{total}\n\n" + "\n".join(lines)
145
 
146
 
src/flow/tools/skills.py CHANGED
@@ -26,7 +26,9 @@ Usage:
26
 
27
  import re
28
  from pathlib import Path
29
- from typing import Annotated, Literal
 
 
30
 
31
  from .base import Tool, tool
32
 
@@ -88,8 +90,9 @@ def _discover_skills(skills_paths: list[Path]) -> dict[str, tuple[Path, dict[str
88
  meta = _parse_frontmatter(content)
89
  skill_name = meta.get("name", item.name)
90
  skills[skill_name] = (skill_md, meta)
91
- except Exception:
92
- # Skip broken skills
 
93
  skills[item.name] = (
94
  skill_md,
95
  {"name": item.name, "description": "Error reading skill"},
@@ -113,6 +116,7 @@ def create_skills_tool(
113
  builtin_path: Path | None = None,
114
  user_path: Path | None = None,
115
  project_path: Path | None = None,
 
116
  ) -> Tool:
117
  """Create a skills tool for discovering and loading domain expertise.
118
 
@@ -120,6 +124,8 @@ def create_skills_tool(
120
  builtin_path: Path to built-in skills (shipped with package)
121
  user_path: Path to user skills (defaults to ~/.flow/skills/)
122
  project_path: Path to project-local skills (highest priority)
 
 
123
 
124
  Returns:
125
  A Tool that can be added to an agent's tool list
@@ -138,7 +144,7 @@ def create_skills_tool(
138
  # Built-in skills
139
  if builtin_path:
140
  all_paths.append(builtin_path)
141
- else:
142
  default_builtin = _get_builtin_skills_path()
143
  if default_builtin.exists():
144
  all_paths.append(default_builtin)
@@ -146,7 +152,7 @@ def create_skills_tool(
146
  # User skills
147
  if user_path:
148
  all_paths.append(user_path)
149
- else:
150
  default_user = _get_user_skills_path()
151
  if default_user.exists():
152
  all_paths.append(default_user)
@@ -177,6 +183,7 @@ def create_skills_tool(
177
  discovered = _discover_skills(all_paths)
178
 
179
  if action == "list":
 
180
  if not discovered:
181
  paths_str = "\n".join(f" - {p}" for p in all_paths) if all_paths else " (no paths configured)"
182
  return (
@@ -210,6 +217,7 @@ def create_skills_tool(
210
  return "Error: 'name' parameter is required for 'load' action."
211
 
212
  if name not in discovered:
 
213
  available = sorted(discovered.keys())
214
  msg = f"Skill '{name}' not found."
215
  if available:
@@ -223,8 +231,10 @@ def create_skills_tool(
223
 
224
  # Return full content (body only, frontmatter already parsed)
225
  body = _get_skill_body(content)
 
226
  return f"# Skill: {skill_name}\n\n{body}"
227
  except Exception as e:
 
228
  return f"Error loading skill '{name}': {e}"
229
 
230
  else:
@@ -233,5 +243,61 @@ def create_skills_tool(
233
  return skills
234
 
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  # Default skills tool (includes built-in skills from Flow)
237
  skills = create_skills_tool()
 
26
 
27
  import re
28
  from pathlib import Path
29
+ from typing import Annotated, Any, Literal
30
+
31
+ from loguru import logger
32
 
33
  from .base import Tool, tool
34
 
 
90
  meta = _parse_frontmatter(content)
91
  skill_name = meta.get("name", item.name)
92
  skills[skill_name] = (skill_md, meta)
93
+ except Exception as e:
94
+ # Skip broken skills but log the error
95
+ logger.debug(f"Failed to load skill '{item.name}': {e}")
96
  skills[item.name] = (
97
  skill_md,
98
  {"name": item.name, "description": "Error reading skill"},
 
116
  builtin_path: Path | None = None,
117
  user_path: Path | None = None,
118
  project_path: Path | None = None,
119
+ exclusive: bool = False,
120
  ) -> Tool:
121
  """Create a skills tool for discovering and loading domain expertise.
122
 
 
124
  builtin_path: Path to built-in skills (shipped with package)
125
  user_path: Path to user skills (defaults to ~/.flow/skills/)
126
  project_path: Path to project-local skills (highest priority)
127
+ exclusive: If True, only use explicitly provided paths (no defaults).
128
+ Useful for optimization where you want a clean slate.
129
 
130
  Returns:
131
  A Tool that can be added to an agent's tool list
 
144
  # Built-in skills
145
  if builtin_path:
146
  all_paths.append(builtin_path)
147
+ elif not exclusive:
148
  default_builtin = _get_builtin_skills_path()
149
  if default_builtin.exists():
150
  all_paths.append(default_builtin)
 
152
  # User skills
153
  if user_path:
154
  all_paths.append(user_path)
155
+ elif not exclusive:
156
  default_user = _get_user_skills_path()
157
  if default_user.exists():
158
  all_paths.append(default_user)
 
183
  discovered = _discover_skills(all_paths)
184
 
185
  if action == "list":
186
+ logger.debug(f"skills: listing {len(discovered)} discovered skills")
187
  if not discovered:
188
  paths_str = "\n".join(f" - {p}" for p in all_paths) if all_paths else " (no paths configured)"
189
  return (
 
217
  return "Error: 'name' parameter is required for 'load' action."
218
 
219
  if name not in discovered:
220
+ logger.warning(f"skills: skill '{name}' not found")
221
  available = sorted(discovered.keys())
222
  msg = f"Skill '{name}' not found."
223
  if available:
 
231
 
232
  # Return full content (body only, frontmatter already parsed)
233
  body = _get_skill_body(content)
234
+ logger.info(f"skills: loaded skill '{skill_name}' ({len(body)} chars)")
235
  return f"# Skill: {skill_name}\n\n{body}"
236
  except Exception as e:
237
+ logger.warning(f"skills: error loading skill '{name}': {e}")
238
  return f"Error loading skill '{name}': {e}"
239
 
240
  else:
 
243
  return skills
244
 
245
 
246
+ def discover_skills_from_tools_spec(
247
+ tools_spec: dict[str, dict[str, Any]],
248
+ ) -> dict[str, dict[str, str]]:
249
+ """Discover available skills based on a resolved tools specification.
250
+
251
+ Extracts skill metadata (name, description, triggers) from the same
252
+ paths that the skills tool would use. Returns a lightweight dict
253
+ suitable for injecting into the system prompt.
254
+
255
+ Args:
256
+ tools_spec: Resolved tools specification dict (from resolve_tools()).
257
+ Looks for a ``"skills"`` key with optional ``skills_path``,
258
+ ``additional_paths``, or ``exclusive`` config.
259
+
260
+ Returns:
261
+ Dict mapping skill_name -> frontmatter metadata dict
262
+ (keys: ``name``, ``description``, ``triggers``).
263
+ Returns empty dict if no skills tool is configured.
264
+ """
265
+ if "skills" not in tools_spec:
266
+ return {}
267
+
268
+ config = tools_spec.get("skills", {})
269
+ exclusive = bool(config.get("skills_path")) # skills_path implies exclusive
270
+
271
+ # Build the same paths that create_skills_tool would use
272
+ all_paths: list[Path] = []
273
+
274
+ if config.get("skills_path"):
275
+ all_paths.append(Path(config["skills_path"]))
276
+ elif not exclusive:
277
+ default_builtin = _get_builtin_skills_path()
278
+ if default_builtin.exists():
279
+ all_paths.append(default_builtin)
280
+
281
+ if not exclusive:
282
+ default_user = _get_user_skills_path()
283
+ if default_user.exists():
284
+ all_paths.append(default_user)
285
+
286
+ if config.get("additional_paths"):
287
+ for p in config["additional_paths"]:
288
+ all_paths.append(Path(p))
289
+
290
+ if config.get("project_path"):
291
+ all_paths.append(Path(config["project_path"]))
292
+
293
+ discovered = _discover_skills(all_paths)
294
+
295
+ # Return metadata only (strip file paths)
296
+ return {
297
+ skill_name: dict(meta)
298
+ for skill_name, (_, meta) in discovered.items()
299
+ }
300
+
301
+
302
  # Default skills tool (includes built-in skills from Flow)
303
  skills = create_skills_tool()
src/flow/tools/subagent.py CHANGED
@@ -16,6 +16,8 @@ from __future__ import annotations
16
 
17
  from typing import TYPE_CHECKING, Annotated, Literal
18
 
 
 
19
  from .base import Tool, tool
20
 
21
  if TYPE_CHECKING:
@@ -118,7 +120,7 @@ def create_task_tool(
118
  description: Annotated[str, "Short 3-5 word summary of what the sub-agent will do"],
119
  agent_type: Annotated[
120
  Literal["explore", "research", "general"],
121
- "Type of sub-agent: 'explore' for codebase search, 'research' for web research, 'general' for other tasks"
122
  ] = "general",
123
  ) -> str:
124
  """Launch a sub-agent to handle a complex task in isolated context.
@@ -139,12 +141,16 @@ def create_task_tool(
139
  - research: Web research (web_search, web_fetch)
140
  - general: All tools available to you
141
  """
 
 
 
142
  # Lazy imports to avoid circular dependencies
143
  try:
144
  from flow.harness.miniagent.agent import ChatAgent
145
  from flow.harness.miniagent.client import ChatClient, ClientConfig
146
  from flow.harness.miniagent.context import HeadTailStrategy
147
  except ImportError:
 
148
  return "Error: MiniAgent harness not available. Install flow with miniagent extras."
149
 
150
  # Get agent type config
@@ -222,9 +228,13 @@ def create_task_tool(
222
  f"{response.usage.tool_calls} tool calls]"
223
  )
224
 
 
 
 
225
  return result + usage_info
226
 
227
  except Exception as e:
 
228
  return f"Sub-agent failed: {e!s}"
229
 
230
  return task
 
16
 
17
  from typing import TYPE_CHECKING, Annotated, Literal
18
 
19
+ from loguru import logger
20
+
21
  from .base import Tool, tool
22
 
23
  if TYPE_CHECKING:
 
120
  description: Annotated[str, "Short 3-5 word summary of what the sub-agent will do"],
121
  agent_type: Annotated[
122
  Literal["explore", "research", "general"],
123
+ "Type of sub-agent: 'explore' for codebase search, 'research' for web research, 'general' for other tasks",
124
  ] = "general",
125
  ) -> str:
126
  """Launch a sub-agent to handle a complex task in isolated context.
 
141
  - research: Web research (web_search, web_fetch)
142
  - general: All tools available to you
143
  """
144
+ logger.info(f"task: spawning {agent_type} sub-agent for '{description}'")
145
+ logger.debug(f"task: prompt preview={prompt[:200] if len(prompt) > 200 else prompt}")
146
+
147
  # Lazy imports to avoid circular dependencies
148
  try:
149
  from flow.harness.miniagent.agent import ChatAgent
150
  from flow.harness.miniagent.client import ChatClient, ClientConfig
151
  from flow.harness.miniagent.context import HeadTailStrategy
152
  except ImportError:
153
+ logger.error("task: MiniAgent harness not available")
154
  return "Error: MiniAgent harness not available. Install flow with miniagent extras."
155
 
156
  # Get agent type config
 
228
  f"{response.usage.tool_calls} tool calls]"
229
  )
230
 
231
+ logger.debug(
232
+ f"task: sub-agent completed - {response.iterations} iterations, {response.usage.tool_calls} tool calls"
233
+ )
234
  return result + usage_info
235
 
236
  except Exception as e:
237
+ logger.warning(f"task: sub-agent failed: {e}")
238
  return f"Sub-agent failed: {e!s}"
239
 
240
  return task
src/flow/tools/text_inspector_qa.py CHANGED
@@ -191,4 +191,7 @@ def text_inspector(
191
  """
192
  logger.info("Inspecting file at path: {}", file_path)
193
  ti_tool = TextInspectorTool()
194
- return ti_tool.forward(file_path=file_path, question=question)
 
 
 
 
191
  """
192
  logger.info("Inspecting file at path: {}", file_path)
193
  ti_tool = TextInspectorTool()
194
+ output = ti_tool.forward(file_path=file_path, question=question)
195
+ logger.debug("Text inspector output length: {}", len(output))
196
+ logger.debug("Text inspector output first 200 chars: {}", output[:200] if len(output) > 200 else output)
197
+ return output
src/flow/tools/web.py CHANGED
@@ -7,6 +7,8 @@ import os
7
  from typing import Annotated
8
  from urllib.parse import urlparse
9
 
 
 
10
  from .base import tool
11
 
12
 
@@ -20,10 +22,12 @@ def web_search(
20
  Requires GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables.
21
  Returns a list of search results with titles, URLs, and snippets.
22
  """
 
23
  api_key = os.environ.get("GOOGLE_API_KEY")
24
  cse_id = os.environ.get("GOOGLE_CSE_ID")
25
 
26
  if not api_key or not cse_id:
 
27
  return (
28
  "Error: Web search requires GOOGLE_API_KEY and GOOGLE_CSE_ID "
29
  "environment variables to be set."
@@ -32,6 +36,7 @@ def web_search(
32
  try:
33
  import httpx
34
  except ImportError:
 
35
  return "Error: httpx package required. Install with: pip install httpx"
36
 
37
  try:
@@ -51,6 +56,7 @@ def web_search(
51
  items = data.get("items", [])
52
 
53
  if not items:
 
54
  return f"No results found for: {query}"
55
 
56
  results: list[str] = []
@@ -60,9 +66,11 @@ def web_search(
60
  snippet = item.get("snippet", "No description")
61
  results.append(f"{i}. {title}\n {link}\n {snippet}")
62
 
 
63
  return "\n\n".join(results)
64
 
65
  except Exception as e:
 
66
  return f"Error performing search: {e!s}"
67
 
68
 
@@ -77,6 +85,7 @@ def web_fetch(
77
  Returns the page content in the specified format.
78
  Useful for reading documentation, articles, and web pages.
79
  """
 
80
  # Validate URL
81
  try:
82
  parsed = urlparse(url)
@@ -84,13 +93,16 @@ def web_fetch(
84
  url = "https://" + url
85
  parsed = urlparse(url)
86
  if not parsed.netloc:
 
87
  return f"Error: Invalid URL: {url}"
88
- except Exception:
 
89
  return f"Error: Invalid URL format: {url}"
90
 
91
  try:
92
  import httpx
93
  except ImportError:
 
94
  return "Error: httpx package required. Install with: pip install httpx"
95
 
96
  try:
@@ -139,7 +151,9 @@ def web_fetch(
139
  if len(content) > max_length:
140
  content = content[:max_length] + "\n\n[Content truncated...]"
141
 
 
142
  return content
143
 
144
  except Exception as e:
 
145
  return f"Error fetching URL: {e!s}"
 
7
  from typing import Annotated
8
  from urllib.parse import urlparse
9
 
10
+ from loguru import logger
11
+
12
  from .base import tool
13
 
14
 
 
22
  Requires GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables.
23
  Returns a list of search results with titles, URLs, and snippets.
24
  """
25
+ logger.info(f"web_search: searching for '{query}' (num_results={num_results})")
26
  api_key = os.environ.get("GOOGLE_API_KEY")
27
  cse_id = os.environ.get("GOOGLE_CSE_ID")
28
 
29
  if not api_key or not cse_id:
30
+ logger.warning("web_search: missing API credentials")
31
  return (
32
  "Error: Web search requires GOOGLE_API_KEY and GOOGLE_CSE_ID "
33
  "environment variables to be set."
 
36
  try:
37
  import httpx
38
  except ImportError:
39
+ logger.warning("web_search: httpx package not installed")
40
  return "Error: httpx package required. Install with: pip install httpx"
41
 
42
  try:
 
56
  items = data.get("items", [])
57
 
58
  if not items:
59
+ logger.debug(f"web_search: no results for '{query}'")
60
  return f"No results found for: {query}"
61
 
62
  results: list[str] = []
 
66
  snippet = item.get("snippet", "No description")
67
  results.append(f"{i}. {title}\n {link}\n {snippet}")
68
 
69
+ logger.debug(f"web_search: found {len(items)} results for '{query}'")
70
  return "\n\n".join(results)
71
 
72
  except Exception as e:
73
+ logger.warning(f"web_search: error searching: {e}")
74
  return f"Error performing search: {e!s}"
75
 
76
 
 
85
  Returns the page content in the specified format.
86
  Useful for reading documentation, articles, and web pages.
87
  """
88
+ logger.info(f"web_fetch: fetching {url}")
89
  # Validate URL
90
  try:
91
  parsed = urlparse(url)
 
93
  url = "https://" + url
94
  parsed = urlparse(url)
95
  if not parsed.netloc:
96
+ logger.warning(f"web_fetch: invalid URL: {url}")
97
  return f"Error: Invalid URL: {url}"
98
+ except Exception as e:
99
+ logger.warning(f"web_fetch: invalid URL format: {url}: {e}")
100
  return f"Error: Invalid URL format: {url}"
101
 
102
  try:
103
  import httpx
104
  except ImportError:
105
+ logger.warning("web_fetch: httpx package not installed")
106
  return "Error: httpx package required. Install with: pip install httpx"
107
 
108
  try:
 
151
  if len(content) > max_length:
152
  content = content[:max_length] + "\n\n[Content truncated...]"
153
 
154
+ logger.debug(f"web_fetch: fetched {len(content)} chars from {url}")
155
  return content
156
 
157
  except Exception as e:
158
+ logger.warning(f"web_fetch: error fetching {url}: {e}")
159
  return f"Error fetching URL: {e!s}"
src/flow/tools/workspace.py CHANGED
@@ -36,6 +36,8 @@ import json
36
  from pathlib import Path
37
  from typing import Any
38
 
 
 
39
 
40
  class Workspace:
41
  """Manages workspace paths and agent data storage.
@@ -53,6 +55,7 @@ class Workspace:
53
  if root is None:
54
  root = Path.cwd()
55
  self._root = Path(root).resolve()
 
56
 
57
  @property
58
  def root(self) -> Path:
@@ -97,8 +100,11 @@ class Workspace:
97
  return []
98
  try:
99
  with open(self.todos_file) as f:
100
- return json.load(f) # type: ignore[no-any-return]
101
- except (OSError, json.JSONDecodeError):
 
 
 
102
  return []
103
 
104
  def save_todos(self, todos: list[dict[str, Any]]) -> None:
@@ -106,6 +112,7 @@ class Workspace:
106
  self.ensure_data_dir()
107
  with open(self.todos_file, "w") as f:
108
  json.dump(todos, f, indent=2)
 
109
 
110
  # --- Memory ---
111
 
@@ -119,8 +126,10 @@ class Workspace:
119
  try:
120
  with open(filepath) as f:
121
  memories.append(json.load(f))
122
- except (OSError, json.JSONDecodeError):
 
123
  continue
 
124
  return memories
125
 
126
  def load_memory(self, memory_id: str) -> dict[str, Any] | None:
@@ -131,7 +140,8 @@ class Workspace:
131
  try:
132
  with open(filepath) as f:
133
  return json.load(f) # type: ignore[no-any-return]
134
- except (OSError, json.JSONDecodeError):
 
135
  return None
136
 
137
  def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
@@ -140,12 +150,14 @@ class Workspace:
140
  filepath = self.memory_dir / f"{memory_id}.json"
141
  with open(filepath, "w") as f:
142
  json.dump(data, f, indent=2, default=str)
 
143
 
144
  def delete_memory(self, memory_id: str) -> bool:
145
  """Delete a memory entry. Returns True if deleted."""
146
  filepath = self.memory_dir / f"{memory_id}.json"
147
  if filepath.exists():
148
  filepath.unlink()
 
149
  return True
150
  return False
151
 
@@ -158,7 +170,8 @@ class Workspace:
158
  try:
159
  with open(self.config_file) as f:
160
  return json.load(f)
161
- except (OSError, json.JSONDecodeError):
 
162
  return {}
163
 
164
  def save_config(self, config: dict[str, Any]) -> None:
 
36
  from pathlib import Path
37
  from typing import Any
38
 
39
+ from loguru import logger
40
+
41
 
42
  class Workspace:
43
  """Manages workspace paths and agent data storage.
 
55
  if root is None:
56
  root = Path.cwd()
57
  self._root = Path(root).resolve()
58
+ logger.debug(f"Workspace initialized at {self._root}")
59
 
60
  @property
61
  def root(self) -> Path:
 
100
  return []
101
  try:
102
  with open(self.todos_file) as f:
103
+ todos = json.load(f)
104
+ logger.debug(f"Loaded {len(todos)} todos from {self.todos_file}")
105
+ return todos # type: ignore[no-any-return]
106
+ except (OSError, json.JSONDecodeError) as e:
107
+ logger.warning(f"Failed to load todos: {e}")
108
  return []
109
 
110
  def save_todos(self, todos: list[dict[str, Any]]) -> None:
 
112
  self.ensure_data_dir()
113
  with open(self.todos_file, "w") as f:
114
  json.dump(todos, f, indent=2)
115
+ logger.debug(f"Saved {len(todos)} todos to {self.todos_file}")
116
 
117
  # --- Memory ---
118
 
 
126
  try:
127
  with open(filepath) as f:
128
  memories.append(json.load(f))
129
+ except (OSError, json.JSONDecodeError) as e:
130
+ logger.debug(f"Failed to load memory {filepath}: {e}")
131
  continue
132
+ logger.debug(f"Listed {len(memories)} memories from {self.memory_dir}")
133
  return memories
134
 
135
  def load_memory(self, memory_id: str) -> dict[str, Any] | None:
 
140
  try:
141
  with open(filepath) as f:
142
  return json.load(f) # type: ignore[no-any-return]
143
+ except (OSError, json.JSONDecodeError) as e:
144
+ logger.debug(f"Failed to load memory '{memory_id}': {e}")
145
  return None
146
 
147
  def save_memory(self, memory_id: str, data: dict[str, Any]) -> None:
 
150
  filepath = self.memory_dir / f"{memory_id}.json"
151
  with open(filepath, "w") as f:
152
  json.dump(data, f, indent=2, default=str)
153
+ logger.debug(f"Saved memory '{memory_id}' to {filepath}")
154
 
155
  def delete_memory(self, memory_id: str) -> bool:
156
  """Delete a memory entry. Returns True if deleted."""
157
  filepath = self.memory_dir / f"{memory_id}.json"
158
  if filepath.exists():
159
  filepath.unlink()
160
+ logger.debug(f"Deleted memory '{memory_id}'")
161
  return True
162
  return False
163
 
 
170
  try:
171
  with open(self.config_file) as f:
172
  return json.load(f)
173
+ except (OSError, json.JSONDecodeError) as e:
174
+ logger.debug(f"Failed to load config: {e}")
175
  return {}
176
 
177
  def save_config(self, config: dict[str, Any]) -> None:
src/flow/ui/api/__init__.py CHANGED
@@ -2,6 +2,7 @@
2
  """API routes package."""
3
 
4
  from .configs import router as configs_router
 
5
  from .evaluate import router as evaluate_router
6
  from .experiment import router as experiment_router
7
  from .jobs import router as jobs_router
@@ -14,6 +15,7 @@ from .tools import router as tools_router
14
 
15
  __all__ = [
16
  "configs_router",
 
17
  "evaluate_router",
18
  "experiment_router",
19
  "jobs_router",
 
2
  """API routes package."""
3
 
4
  from .configs import router as configs_router
5
+ from .deployments import router as deployments_router
6
  from .evaluate import router as evaluate_router
7
  from .experiment import router as experiment_router
8
  from .jobs import router as jobs_router
 
15
 
16
  __all__ = [
17
  "configs_router",
18
+ "deployments_router",
19
  "evaluate_router",
20
  "experiment_router",
21
  "jobs_router",
src/flow/ui/api/deployments.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+ """Deployment API routes."""
3
+
4
+ from typing import Annotated
5
+ from uuid import UUID
6
+
7
+ from fastapi import APIRouter, Depends, HTTPException
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+ from sqlmodel import desc, or_, select
10
+
11
+ from ..auth import TokenData, get_current_user, get_effective_user_id, should_filter_by_user
12
+ from ..database import get_session
13
+ from ..models.deployment import Deployment, DeploymentVersion
14
+ from ..schemas.deployment import DeploymentDetailResponse, DeploymentResponse, DeploymentVersionResponse
15
+
16
+ router = APIRouter(prefix="/deployments", tags=["deployments"])
17
+
18
+
19
+ def _parse_uuid(id_str: str) -> UUID:
20
+ """Parse a string to UUID, raising 400 if invalid."""
21
+ try:
22
+ return UUID(id_str)
23
+ except ValueError as e:
24
+ raise HTTPException(status_code=400, detail=f"Invalid UUID: {id_str}") from e
25
+
26
+
27
+ @router.get("", response_model=list[DeploymentResponse])
28
+ async def list_deployments(
29
+ session: AsyncSession = Depends(get_session),
30
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
31
+ ) -> list[Deployment]:
32
+ """List all deployments."""
33
+ query = select(Deployment)
34
+
35
+ if should_filter_by_user():
36
+ effective_user_id = get_effective_user_id(user)
37
+ query = query.where(
38
+ or_(
39
+ Deployment.user_id == effective_user_id,
40
+ Deployment.is_public == True, # noqa: E712
41
+ )
42
+ )
43
+
44
+ query = query.order_by(desc(Deployment.updated_at))
45
+ result = await session.execute(query)
46
+ return list(result.scalars().all())
47
+
48
+
49
+ @router.get("/{deployment_id}", response_model=DeploymentDetailResponse)
50
+ async def get_deployment(
51
+ deployment_id: str,
52
+ session: AsyncSession = Depends(get_session),
53
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
54
+ ) -> dict:
55
+ """Get a deployment with its version history."""
56
+ uuid_id = _parse_uuid(deployment_id)
57
+ query = select(Deployment).where(Deployment.id == uuid_id)
58
+
59
+ if should_filter_by_user():
60
+ effective_user_id = get_effective_user_id(user)
61
+ query = query.where(
62
+ or_(
63
+ Deployment.is_public == True, # noqa: E712
64
+ Deployment.user_id == effective_user_id,
65
+ )
66
+ )
67
+
68
+ result = await session.execute(query)
69
+ deployment = result.scalar_one_or_none()
70
+ if not deployment:
71
+ raise HTTPException(status_code=404, detail="Deployment not found")
72
+
73
+ # Fetch versions
74
+ versions_result = await session.execute(
75
+ select(DeploymentVersion)
76
+ .where(DeploymentVersion.deployment_id == uuid_id)
77
+ .order_by(desc(DeploymentVersion.version))
78
+ )
79
+ versions = list(versions_result.scalars().all())
80
+
81
+ return {
82
+ **deployment.__dict__,
83
+ "versions": versions,
84
+ }
85
+
86
+
87
+ @router.get("/{deployment_id}/versions", response_model=list[DeploymentVersionResponse])
88
+ async def list_versions(
89
+ deployment_id: str,
90
+ session: AsyncSession = Depends(get_session),
91
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
92
+ ) -> list[DeploymentVersion]:
93
+ """List all versions of a deployment."""
94
+ uuid_id = _parse_uuid(deployment_id)
95
+
96
+ # Verify deployment exists and user has access
97
+ dep_query = select(Deployment).where(Deployment.id == uuid_id)
98
+ if should_filter_by_user():
99
+ effective_user_id = get_effective_user_id(user)
100
+ dep_query = dep_query.where(
101
+ or_(
102
+ Deployment.is_public == True, # noqa: E712
103
+ Deployment.user_id == effective_user_id,
104
+ )
105
+ )
106
+ dep_result = await session.execute(dep_query)
107
+ if not dep_result.scalar_one_or_none():
108
+ raise HTTPException(status_code=404, detail="Deployment not found")
109
+
110
+ result = await session.execute(
111
+ select(DeploymentVersion)
112
+ .where(DeploymentVersion.deployment_id == uuid_id)
113
+ .order_by(desc(DeploymentVersion.version))
114
+ )
115
+ return list(result.scalars().all())
116
+
117
+
118
+ @router.delete("/{deployment_id}", status_code=204)
119
+ async def delete_deployment(
120
+ deployment_id: str,
121
+ session: AsyncSession = Depends(get_session),
122
+ user: Annotated[TokenData | None, Depends(get_current_user)] = None,
123
+ ) -> None:
124
+ """Delete a deployment and all its versions."""
125
+ uuid_id = _parse_uuid(deployment_id)
126
+ query = select(Deployment).where(Deployment.id == uuid_id)
127
+
128
+ if should_filter_by_user():
129
+ effective_user_id = get_effective_user_id(user)
130
+ query = query.where(Deployment.user_id == effective_user_id)
131
+
132
+ result = await session.execute(query)
133
+ deployment = result.scalar_one_or_none()
134
+ if not deployment:
135
+ raise HTTPException(status_code=404, detail="Deployment not found")
136
+
137
+ # Delete versions first
138
+ versions_result = await session.execute(
139
+ select(DeploymentVersion).where(DeploymentVersion.deployment_id == uuid_id)
140
+ )
141
+ for version in versions_result.scalars().all():
142
+ await session.delete(version)
143
+
144
+ await session.delete(deployment)
145
+ await session.commit()
src/flow/ui/api/experiment.py CHANGED
@@ -94,7 +94,6 @@ async def design_experiment(
94
  Returns the YAML content and candidate count for preview.
95
  """
96
  # Look up base agent to get its path/name
97
- from uuid import UUID
98
  try:
99
  agent_uuid = UUID(data.base_agent_id)
100
  except ValueError as e:
 
94
  Returns the YAML content and candidate count for preview.
95
  """
96
  # Look up base agent to get its path/name
 
97
  try:
98
  agent_uuid = UUID(data.base_agent_id)
99
  except ValueError as e:
src/flow/ui/api/jobs.py CHANGED
@@ -7,7 +7,7 @@ from collections.abc import AsyncGenerator
7
  from typing import Annotated, Any
8
  from uuid import UUID
9
 
10
- from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
11
  from fastapi.responses import StreamingResponse
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
  from sqlmodel import desc, or_, select
@@ -80,15 +80,44 @@ async def create_job(
80
  """Create a new optimization job."""
81
  effective_user_id = get_effective_user_id(user)
82
 
83
- # Validate candidate_ids exist AND belong to user
84
- for candidate_id in data.candidate_ids:
85
- uuid_id = parse_uuid(candidate_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  query = select(AgentConfig).where(AgentConfig.id == uuid_id)
87
  if should_filter_by_user():
88
  query = query.where(AgentConfig.user_id == effective_user_id)
89
  result = await session.execute(query)
90
  if not result.scalar_one_or_none():
91
- raise HTTPException(status_code=400, detail=f"Candidate {candidate_id} not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Validate task_ids exist AND are accessible (shared or user's own)
94
  for task_id in data.task_ids:
@@ -114,9 +143,14 @@ async def create_job(
114
  name=data.name,
115
  candidate_ids=data.candidate_ids,
116
  task_ids=data.task_ids,
 
 
 
117
  parallel=data.parallel,
118
  use_llm_eval=data.use_llm_eval,
119
- total_experiments=len(data.candidate_ids) * len(data.task_ids),
 
 
120
  user_id=effective_user_id,
121
  created_by_name=created_by_name,
122
  )
@@ -223,7 +257,6 @@ async def _run_job_background(job_id: str) -> None:
223
  @router.post("/{job_id}/start")
224
  async def start_job(
225
  job_id: str,
226
- background_tasks: BackgroundTasks,
227
  session: AsyncSession = Depends(get_session),
228
  user: Annotated[TokenData | None, Depends(get_current_user)] = None,
229
  ) -> StreamingResponse:
 
7
  from typing import Annotated, Any
8
  from uuid import UUID
9
 
10
+ from fastapi import APIRouter, Depends, HTTPException
11
  from fastapi.responses import StreamingResponse
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
  from sqlmodel import desc, or_, select
 
80
  """Create a new optimization job."""
81
  effective_user_id = get_effective_user_id(user)
82
 
83
+ is_strategy_mode = len(data.strategies) > 0
84
+
85
+ if is_strategy_mode:
86
+ # Strategy mode: validate strategy names and base_agent_id
87
+ if not data.base_agent_id:
88
+ raise HTTPException(status_code=400, detail="base_agent_id is required for strategy mode")
89
+
90
+ from flow.experiments.strategies import get_registered_strategies
91
+
92
+ available = list(get_registered_strategies().keys())
93
+ for sname in data.strategies:
94
+ if sname not in available:
95
+ raise HTTPException(
96
+ status_code=400,
97
+ detail=f"Unknown strategy: {sname}. Available: {available}",
98
+ )
99
+
100
+ # Validate base agent exists and belongs to user
101
+ uuid_id = parse_uuid(data.base_agent_id)
102
  query = select(AgentConfig).where(AgentConfig.id == uuid_id)
103
  if should_filter_by_user():
104
  query = query.where(AgentConfig.user_id == effective_user_id)
105
  result = await session.execute(query)
106
  if not result.scalar_one_or_none():
107
+ raise HTTPException(status_code=400, detail=f"Base agent {data.base_agent_id} not found")
108
+ else:
109
+ # Grid mode: validate candidate_ids exist AND belong to user
110
+ if not data.candidate_ids:
111
+ raise HTTPException(status_code=400, detail="candidate_ids required for grid mode (or use strategy mode)")
112
+
113
+ for candidate_id in data.candidate_ids:
114
+ uuid_id = parse_uuid(candidate_id)
115
+ query = select(AgentConfig).where(AgentConfig.id == uuid_id)
116
+ if should_filter_by_user():
117
+ query = query.where(AgentConfig.user_id == effective_user_id)
118
+ result = await session.execute(query)
119
+ if not result.scalar_one_or_none():
120
+ raise HTTPException(status_code=400, detail=f"Candidate {candidate_id} not found")
121
 
122
  # Validate task_ids exist AND are accessible (shared or user's own)
123
  for task_id in data.task_ids:
 
143
  name=data.name,
144
  candidate_ids=data.candidate_ids,
145
  task_ids=data.task_ids,
146
+ strategies=data.strategies,
147
+ strategy_config=data.strategy_config,
148
+ base_agent_id=data.base_agent_id,
149
  parallel=data.parallel,
150
  use_llm_eval=data.use_llm_eval,
151
+ # Strategy mode: estimate based on tasks (will be updated by progress callback)
152
+ # Grid mode: exact count = candidates × tasks
153
+ total_experiments=len(data.task_ids) if is_strategy_mode else len(data.candidate_ids) * len(data.task_ids),
154
  user_id=effective_user_id,
155
  created_by_name=created_by_name,
156
  )
 
257
  @router.post("/{job_id}/start")
258
  async def start_job(
259
  job_id: str,
 
260
  session: AsyncSession = Depends(get_session),
261
  user: Annotated[TokenData | None, Depends(get_current_user)] = None,
262
  ) -> StreamingResponse:
src/flow/ui/api/schema.py CHANGED
@@ -74,7 +74,7 @@ class LLMProviderSchema(BaseModel):
74
 
75
 
76
  class OptimizationStrategySchema(BaseModel):
77
- """Schema for an optimization strategy (GEPA, llm_rewriter, etc.)."""
78
 
79
  name: str = Field(description="Strategy identifier")
80
  description: str = Field(description="What this strategy does")
@@ -238,14 +238,18 @@ async def get_agent_schema() -> AgentSchema:
238
  "description": "GEPA: Reflective prompt evolution using LLM feedback",
239
  "applicable_dimensions": ["instructions"],
240
  },
241
- "llm_rewriter": {
242
- "description": "LLM-based instruction rewriting with variations",
243
  "applicable_dimensions": ["instructions"],
244
  },
245
- "tool_selector": {
246
- "description": "Intelligent tool set selection based on task",
247
  "applicable_dimensions": ["tools"],
248
  },
 
 
 
 
249
  }
250
  optimization_strategies = [
251
  OptimizationStrategySchema(
 
74
 
75
 
76
  class OptimizationStrategySchema(BaseModel):
77
+ """Schema for an optimization strategy (GEPA, instruction, tool, skill)."""
78
 
79
  name: str = Field(description="Strategy identifier")
80
  description: str = Field(description="What this strategy does")
 
238
  "description": "GEPA: Reflective prompt evolution using LLM feedback",
239
  "applicable_dimensions": ["instructions"],
240
  },
241
+ "instruction": {
242
+ "description": "LLM-based instruction optimization via evaluate-reflect-rewrite",
243
  "applicable_dimensions": ["instructions"],
244
  },
245
+ "tool": {
246
+ "description": "Intelligent tool set optimization based on task failures",
247
  "applicable_dimensions": ["tools"],
248
  },
249
+ "skill": {
250
+ "description": "Skill generation and selection to provide domain knowledge",
251
+ "applicable_dimensions": ["skills"],
252
+ },
253
  }
254
  optimization_strategies = [
255
  OptimizationStrategySchema(
src/flow/ui/api/tests.py CHANGED
@@ -1,7 +1,6 @@
1
  # Copyright (c) Microsoft. All rights reserved.
2
  """Test run API routes for interactive agent testing."""
3
 
4
- import asyncio
5
  import logging
6
  from collections.abc import AsyncGenerator
7
  from typing import Annotated, Any
@@ -13,7 +12,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
13
  from sqlmodel import desc, select
14
 
15
  from ..auth import TokenData, get_current_user, get_effective_user_id, should_filter_by_user
16
- from ..database import async_session, get_session
17
  from ..models.config import AgentConfig
18
  from ..models.test_run import TestRun, TestRunStatus
19
  from ..schemas.test import TestRunCreate, TestRunDetailResponse, TestRunResponse
@@ -22,10 +21,6 @@ from ..services.test_service import TestService
22
  router = APIRouter(prefix="/tests", tags=["tests"])
23
  logger = logging.getLogger(__name__)
24
 
25
- # Store running tests for cancellation
26
- _running_tests: dict[str, asyncio.Task[Any]] = {}
27
-
28
-
29
  def parse_uuid(id_str: str) -> UUID:
30
  """Parse a string to UUID, raising 400 if invalid."""
31
  try:
@@ -150,30 +145,6 @@ async def get_test(
150
  }
151
 
152
 
153
- async def _run_test_background(test_id: str) -> None:
154
- """Run test in background, updating DB with progress."""
155
- service = TestService()
156
- try:
157
- async for progress in service.run_test(test_id):
158
- logger.debug(f"Test {test_id[:8]} progress: {progress.event} - {progress.message}")
159
- except Exception as e:
160
- logger.error(f"Background test {test_id[:8]} failed: {e}")
161
- # Ensure test is marked as failed
162
- async with async_session() as session:
163
- from datetime import datetime, timezone
164
- result = await session.execute(
165
- select(TestRun).where(TestRun.id == UUID(test_id))
166
- )
167
- test_run = result.scalar_one_or_none()
168
- if test_run and test_run.status == TestRunStatus.RUNNING:
169
- test_run.status = TestRunStatus.FAILED
170
- test_run.error = f"Background execution failed: {e}"
171
- test_run.completed_at = datetime.now(timezone.utc)
172
- await session.commit()
173
- finally:
174
- _running_tests.pop(test_id, None)
175
-
176
-
177
  @router.post("/{test_id}/start")
178
  async def start_test(
179
  test_id: str,
@@ -246,11 +217,6 @@ async def cancel_test(
246
  if test_run.status != TestRunStatus.RUNNING:
247
  raise HTTPException(status_code=400, detail=f"Test is not running (status: {test_run.status})")
248
 
249
- # Cancel the running task if it exists
250
- if test_id in _running_tests:
251
- _running_tests[test_id].cancel()
252
- del _running_tests[test_id]
253
-
254
  test_run.status = TestRunStatus.CANCELLED
255
  await session.commit()
256
  await session.refresh(test_run)
 
1
  # Copyright (c) Microsoft. All rights reserved.
2
  """Test run API routes for interactive agent testing."""
3
 
 
4
  import logging
5
  from collections.abc import AsyncGenerator
6
  from typing import Annotated, Any
 
12
  from sqlmodel import desc, select
13
 
14
  from ..auth import TokenData, get_current_user, get_effective_user_id, should_filter_by_user
15
+ from ..database import get_session
16
  from ..models.config import AgentConfig
17
  from ..models.test_run import TestRun, TestRunStatus
18
  from ..schemas.test import TestRunCreate, TestRunDetailResponse, TestRunResponse
 
21
  router = APIRouter(prefix="/tests", tags=["tests"])
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
24
  def parse_uuid(id_str: str) -> UUID:
25
  """Parse a string to UUID, raising 400 if invalid."""
26
  try:
 
145
  }
146
 
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  @router.post("/{test_id}/start")
149
  async def start_test(
150
  test_id: str,
 
217
  if test_run.status != TestRunStatus.RUNNING:
218
  raise HTTPException(status_code=400, detail=f"Test is not running (status: {test_run.status})")
219
 
 
 
 
 
 
220
  test_run.status = TestRunStatus.CANCELLED
221
  await session.commit()
222
  await session.refresh(test_run)
src/flow/ui/auth/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
  """Authentication module for Flow UI."""
3
 
4
  from .config import AuthMode, AuthSettings, get_auth_settings, init_auth_settings
5
- from .middleware import get_current_user, require_auth
6
  from .router import router as auth_router
7
  from .tokens import TokenData, create_access_token, verify_access_token
8
  from .user_context import ANONYMOUS_USER_ID, get_effective_user_id, should_filter_by_user
@@ -18,7 +18,6 @@ __all__ = [
18
  "get_current_user",
19
  "get_effective_user_id",
20
  "init_auth_settings",
21
- "require_auth",
22
  "should_filter_by_user",
23
  "verify_access_token",
24
  ]
 
2
  """Authentication module for Flow UI."""
3
 
4
  from .config import AuthMode, AuthSettings, get_auth_settings, init_auth_settings
5
+ from .middleware import get_current_user
6
  from .router import router as auth_router
7
  from .tokens import TokenData, create_access_token, verify_access_token
8
  from .user_context import ANONYMOUS_USER_ID, get_effective_user_id, should_filter_by_user
 
18
  "get_current_user",
19
  "get_effective_user_id",
20
  "init_auth_settings",
 
21
  "should_filter_by_user",
22
  "verify_access_token",
23
  ]
src/flow/ui/auth/config.py CHANGED
@@ -5,14 +5,10 @@ from __future__ import annotations
5
 
6
  import secrets
7
  from enum import Enum
8
- from typing import TYPE_CHECKING
9
 
10
  from pydantic import Field, field_validator
11
  from pydantic_settings import BaseSettings, SettingsConfigDict
12
 
13
- if TYPE_CHECKING:
14
- pass
15
-
16
 
17
  class AuthMode(str, Enum):
18
  """Authentication mode."""
 
5
 
6
  import secrets
7
  from enum import Enum
 
8
 
9
  from pydantic import Field, field_validator
10
  from pydantic_settings import BaseSettings, SettingsConfigDict
11
 
 
 
 
12
 
13
  class AuthMode(str, Enum):
14
  """Authentication mode."""
src/flow/ui/auth/middleware.py CHANGED
@@ -60,48 +60,6 @@ async def get_current_user(
60
  ) from e
61
 
62
 
63
- async def require_auth(
64
- user: Annotated[TokenData | None, Depends(get_current_user)],
65
- ) -> TokenData | None:
66
- """Require authentication if enabled.
67
-
68
- Use this as a dependency on routes that should be protected when auth is enabled.
69
- This is essentially an alias for get_current_user that makes intent clearer.
70
-
71
- Args:
72
- user: The current user from get_current_user
73
-
74
- Returns:
75
- TokenData if authenticated, None if auth is disabled
76
- """
77
- return user
78
-
79
-
80
- def get_optional_user(
81
- credentials: Annotated[HTTPAuthorizationCredentials | None, Depends(bearer_scheme)],
82
- ) -> TokenData | None:
83
- """Get the current user if a valid token is provided, otherwise None.
84
-
85
- Unlike get_current_user, this never raises an error - it's for routes
86
- that work differently based on whether the user is authenticated.
87
-
88
- Args:
89
- credentials: The bearer token credentials
90
-
91
- Returns:
92
- TokenData if valid token provided, None otherwise
93
- """
94
- settings = get_auth_settings()
95
-
96
- if credentials is None:
97
- return None
98
-
99
- try:
100
- return verify_access_token(credentials.credentials, settings.secret)
101
- except TokenError:
102
- return None
103
-
104
-
105
  class AuthMiddleware:
106
  """Middleware to check authentication on all /api/* routes except /api/auth/*.
107
 
 
60
  ) from e
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  class AuthMiddleware:
64
  """Middleware to check authentication on all /api/* routes except /api/auth/*.
65