CreativeEngineer committed on
Commit
ede4c5c
·
1 Parent(s): 8254ade

feat: add model-driven llm reward evaluation

Browse files
Files changed (2) hide show
  1. training/README.md +6 -0
  2. training/llm_rollout.py +202 -16
training/README.md CHANGED
@@ -23,6 +23,8 @@ Training policy:
23
  - replay an LLM completion or action plan: `uv run python training/llm_rollout.py replay --seed 0 --completion-file <path>`
24
  - monitor reward terms, action clamping, and verifier outcomes across seeds:
25
  `uv run python training/llm_rollout.py monitor --completion-file <path> --seeds 0,1,2`
 
 
26
 
27
  ## Shared LLM Contract
28
 
@@ -36,3 +38,7 @@ Use that module as the source of truth for:
36
  - action-plan parsing
37
  - local rollout replay
38
  - rollout telemetry structure used by the monitor command
 
 
 
 
 
23
  - replay an LLM completion or action plan: `uv run python training/llm_rollout.py replay --seed 0 --completion-file <path>`
24
  - monitor reward terms, action clamping, and verifier outcomes across seeds:
25
  `uv run python training/llm_rollout.py monitor --completion-file <path> --seeds 0,1,2`
26
+ - generate fresh model completions per seed and save aggregate reward/outcome metrics:
27
+ `uv run python training/llm_rollout.py evaluate --completion-command 'python path/to/model_cli.py' --seeds 0,1,2`
28
 
29
  ## Shared LLM Contract
30
 
 
38
  - action-plan parsing
39
  - local rollout replay
40
  - rollout telemetry structure used by the monitor command
41
+
42
+ For `evaluate`, the completion command reads the prompt from `stdin` and writes a raw completion to `stdout`.
43
+ The current seed is exposed as the `FUSION_LAB_SEED` environment variable so the same command can be used
44
+ for fixed-seed before/after comparisons of untrained and trained checkpoints.
training/llm_rollout.py CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations
2
 
3
  import argparse
4
  import json
 
 
 
5
  from datetime import UTC, datetime
6
  from pathlib import Path
7
  from typing import Final
@@ -10,12 +13,14 @@ from fusion_lab.llm_agent import (
10
  build_prompt,
11
  parse_action_plan,
12
  run_episode_with_actions,
 
13
  )
14
  from fusion_lab.models import StellaratorAction
15
  from server.environment import StellaratorEnvironment
16
 
17
  DEFAULT_OUTPUT_DIR: Final[Path] = Path("training/artifacts/llm_rollout")
18
  DEFAULT_MONITOR_OUTPUT_DIR: Final[Path] = Path("training/artifacts/llm_monitor")
 
19
 
20
 
21
  def add_action_source_args(parser: argparse.ArgumentParser) -> None:
@@ -81,6 +86,41 @@ def parse_args() -> argparse.Namespace:
81
  default=DEFAULT_MONITOR_OUTPUT_DIR,
82
  help="Directory for monitoring artifacts.",
83
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  return parser.parse_args()
85
 
86
 
@@ -174,6 +214,76 @@ def _reward_terms_summary(reward_breakdown: dict[str, object]) -> str:
174
  return ", ".join(non_zero_terms) if non_zero_terms else "none"
175
 
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  def monitor_payload(
178
  *,
179
  source: str,
@@ -181,43 +291,102 @@ def monitor_payload(
181
  seeds: list[int],
182
  ) -> dict[str, object]:
183
  traces = [run_episode_with_actions(actions, seed_idx=seed) for seed in seeds]
184
- feasible_count = sum(1 for trace in traces if trace.constraints_satisfied)
185
- high_fidelity_count = sum(1 for trace in traces if trace.final_evaluation_fidelity == "high")
186
- mean_reward = sum(trace.total_reward for trace in traces) / len(traces)
187
  return {
188
  "created_at_utc": datetime.now(UTC).isoformat(),
189
  "source": source,
190
  "parsed_action_count": len(actions),
191
  "actions": [action.model_dump(exclude_none=True) for action in actions],
192
  "seeds": seeds,
193
- "summary": {
194
- "episode_count": len(traces),
195
- "feasible_episode_count": feasible_count,
196
- "high_fidelity_episode_count": high_fidelity_count,
197
- "mean_total_reward": round(mean_reward, 4),
198
- },
199
  "episodes": [trace.asdict() for trace in traces],
200
  }
201
 
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  def write_monitor_summary(payload: dict[str, object]) -> None:
204
  summary = payload["summary"]
205
  print(
206
  "episodes="
207
  f"{summary['episode_count']} feasible={summary['feasible_episode_count']} "
208
  f"high_fidelity={summary['high_fidelity_episode_count']} "
209
- f"mean_total_reward={summary['mean_total_reward']:+.4f}"
 
 
 
210
  )
211
  for episode in payload["episodes"]:
 
212
  print(
213
  "seed="
214
- f"{episode['seed']} total_reward={episode['total_reward']:+.4f} "
215
- f"final_fidelity={episode['final_evaluation_fidelity']} "
216
- f"feasible={episode['constraints_satisfied']} "
217
- f"score={episode['final_score']:.6f} "
218
- f"feasibility={episode['final_feasibility']:.6f}"
219
  )
220
- for step in episode["steps"]:
 
 
221
  action_monitor = step["action_monitor"]
222
  print(
223
  " step="
@@ -240,6 +409,20 @@ def run_monitor(args: argparse.Namespace) -> None:
240
  write_monitor_summary(payload)
241
 
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  def main() -> None:
244
  args = parse_args()
245
  if args.command == "prompt":
@@ -248,6 +431,9 @@ def main() -> None:
248
  if args.command == "replay":
249
  run_replay(args)
250
  return
 
 
 
251
  run_monitor(args)
252
 
253
 
 
2
 
3
  import argparse
4
  import json
5
+ import math
6
+ import os
7
+ import subprocess
8
  from datetime import UTC, datetime
9
  from pathlib import Path
10
  from typing import Final
 
13
  build_prompt,
14
  parse_action_plan,
15
  run_episode_with_actions,
16
+ LLMEpisodeTrace,
17
  )
18
  from fusion_lab.models import StellaratorAction
19
  from server.environment import StellaratorEnvironment
20
 
21
  DEFAULT_OUTPUT_DIR: Final[Path] = Path("training/artifacts/llm_rollout")
22
  DEFAULT_MONITOR_OUTPUT_DIR: Final[Path] = Path("training/artifacts/llm_monitor")
23
+ DEFAULT_EVALUATE_OUTPUT_DIR: Final[Path] = Path("training/artifacts/llm_evaluate")
24
 
25
 
26
  def add_action_source_args(parser: argparse.ArgumentParser) -> None:
 
86
  default=DEFAULT_MONITOR_OUTPUT_DIR,
87
  help="Directory for monitoring artifacts.",
88
  )
89
+
90
+ evaluate_parser = subparsers.add_parser(
91
+ "evaluate",
92
+ help=(
93
+ "Generate fresh completions per seed with a model command, replay them, "
94
+ "and save aggregate reward/outcome metrics."
95
+ ),
96
+ )
97
+ evaluate_parser.add_argument(
98
+ "--completion-command",
99
+ type=str,
100
+ required=True,
101
+ help=(
102
+ "Shell command that reads the prompt from stdin and writes a completion to stdout. "
103
+ "The current seed is exposed as FUSION_LAB_SEED."
104
+ ),
105
+ )
106
+ evaluate_parser.add_argument(
107
+ "--seeds",
108
+ type=str,
109
+ default="0,1,2",
110
+ help="Comma-separated reset seed indices to evaluate.",
111
+ )
112
+ evaluate_parser.add_argument(
113
+ "--label",
114
+ type=str,
115
+ default="model",
116
+ help="Short label stored in the evaluation artifact.",
117
+ )
118
+ evaluate_parser.add_argument(
119
+ "--output-dir",
120
+ type=Path,
121
+ default=DEFAULT_EVALUATE_OUTPUT_DIR,
122
+ help="Directory for evaluation artifacts.",
123
+ )
124
  return parser.parse_args()
125
 
126
 
 
214
  return ", ".join(non_zero_terms) if non_zero_terms else "none"
215
 
216
 
217
+ def _mean(values: list[float]) -> float | None:
218
+ if not values:
219
+ return None
220
+ return sum(values) / len(values)
221
+
222
+
223
+ def _round_metric(value: float | None) -> float | None:
224
+ if value is None:
225
+ return None
226
+ return round(value, 4)
227
+
228
+
229
+ def _format_metric(value: object, precision: int = 4, signed: bool = False) -> str:
230
+ if not isinstance(value, (int, float)):
231
+ return "n/a"
232
+ if signed:
233
+ return f"{float(value):+.{precision}f}"
234
+ return f"{float(value):.{precision}f}"
235
+
236
+
237
+ def _pearson_correlation(xs: list[float], ys: list[float]) -> float | None:
238
+ if len(xs) != len(ys) or len(xs) < 2:
239
+ return None
240
+ mean_x = sum(xs) / len(xs)
241
+ mean_y = sum(ys) / len(ys)
242
+ centered_x = [value - mean_x for value in xs]
243
+ centered_y = [value - mean_y for value in ys]
244
+ variance_x = sum(value * value for value in centered_x)
245
+ variance_y = sum(value * value for value in centered_y)
246
+ if math.isclose(variance_x, 0.0) or math.isclose(variance_y, 0.0):
247
+ return None
248
+ covariance = sum(x_value * y_value for x_value, y_value in zip(centered_x, centered_y))
249
+ return covariance / math.sqrt(variance_x * variance_y)
250
+
251
+
252
def summarize_traces(traces: list[LLMEpisodeTrace]) -> dict[str, object]:
    """Aggregate episode traces into counts, rates, reward means, and
    reward/outcome correlations for the monitor and evaluate artifacts.

    Rates and means are None rather than a crash when *traces* is empty
    (e.g. an empty --seeds list) or when the relevant subset (such as
    high-fidelity episodes) is empty.
    """
    episode_count = len(traces)
    feasible_count = sum(1 for trace in traces if trace.constraints_satisfied)
    high_fidelity_traces = [trace for trace in traces if trace.final_evaluation_fidelity == "high"]
    high_fidelity_count = len(high_fidelity_traces)
    failed_count = sum(1 for trace in traces if trace.evaluation_failed)
    total_rewards = [trace.total_reward for trace in traces]
    final_scores = [trace.final_score for trace in traces]
    final_feasibilities = [trace.final_feasibility for trace in traces]
    high_fidelity_scores = [trace.final_score for trace in high_fidelity_traces]
    high_fidelity_feasibilities = [trace.final_feasibility for trace in high_fidelity_traces]
    feasible_flags = [1.0 if trace.constraints_satisfied else 0.0 for trace in traces]

    def _rate(count: int) -> float | None:
        # Guard against ZeroDivisionError for an empty evaluation run.
        return _round_metric(count / episode_count) if episode_count else None

    return {
        "episode_count": episode_count,
        "feasible_episode_count": feasible_count,
        "high_fidelity_episode_count": high_fidelity_count,
        "evaluation_failed_episode_count": failed_count,
        "feasible_rate": _rate(feasible_count),
        "high_fidelity_rate": _rate(high_fidelity_count),
        "evaluation_failed_rate": _rate(failed_count),
        "mean_total_reward": _round_metric(_mean(total_rewards)),
        "mean_final_score": _round_metric(_mean(final_scores)),
        "mean_final_feasibility": _round_metric(_mean(final_feasibilities)),
        "mean_high_fidelity_score": _round_metric(_mean(high_fidelity_scores)),
        "mean_high_fidelity_feasibility": _round_metric(_mean(high_fidelity_feasibilities)),
        "reward_final_score_correlation": _round_metric(
            _pearson_correlation(total_rewards, final_scores)
        ),
        "reward_feasible_correlation": _round_metric(
            _pearson_correlation(total_rewards, feasible_flags)
        ),
    }
285
+
286
+
287
  def monitor_payload(
288
  *,
289
  source: str,
 
291
  seeds: list[int],
292
  ) -> dict[str, object]:
293
  traces = [run_episode_with_actions(actions, seed_idx=seed) for seed in seeds]
 
 
 
294
  return {
295
  "created_at_utc": datetime.now(UTC).isoformat(),
296
  "source": source,
297
  "parsed_action_count": len(actions),
298
  "actions": [action.model_dump(exclude_none=True) for action in actions],
299
  "seeds": seeds,
300
+ "summary": summarize_traces(traces),
 
 
 
 
 
301
  "episodes": [trace.asdict() for trace in traces],
302
  }
303
 
304
 
305
+ def _run_completion_command(*, prompt: str, seed: int, command: str) -> str:
306
+ env = os.environ.copy()
307
+ env["FUSION_LAB_SEED"] = str(seed)
308
+ shell_path = env.get("SHELL", "/bin/sh")
309
+ completed = subprocess.run(
310
+ [shell_path, "-lc", command],
311
+ input=prompt,
312
+ text=True,
313
+ capture_output=True,
314
+ env=env,
315
+ check=False,
316
+ )
317
+ if completed.returncode != 0:
318
+ raise RuntimeError(
319
+ "completion command failed "
320
+ f"(seed={seed}, exit_code={completed.returncode}): {completed.stderr.strip()}"
321
+ )
322
+ return completed.stdout
323
+
324
+
325
def evaluate_payload(
    *,
    completion_command: str,
    label: str,
    seeds: list[int],
) -> dict[str, object]:
    """Generate a fresh completion per seed, replay it, and build the
    evaluation artifact: per-seed records plus an aggregate summary.
    """
    per_seed_records: list[dict[str, object]] = []
    episode_traces: list[LLMEpisodeTrace] = []

    for seed in seeds:
        # Fresh environment per seed so each prompt reflects a clean reset.
        initial_observation = StellaratorEnvironment().reset(seed=seed)
        seed_prompt = build_prompt(initial_observation)
        raw_completion = _run_completion_command(
            prompt=seed_prompt,
            seed=seed,
            command=completion_command,
        )
        plan = parse_action_plan(raw_completion)
        episode_trace = run_episode_with_actions(plan, seed_idx=seed)
        episode_traces.append(episode_trace)
        per_seed_records.append(
            {
                "seed": seed,
                "prompt": seed_prompt,
                "completion": raw_completion,
                "parsed_action_count": len(plan),
                "actions": [item.model_dump(exclude_none=True) for item in plan],
                "trace": episode_trace.asdict(),
            }
        )

    return {
        "created_at_utc": datetime.now(UTC).isoformat(),
        "label": label,
        "completion_command": completion_command,
        "seeds": seeds,
        "summary": summarize_traces(episode_traces),
        "episodes": per_seed_records,
    }
364
+
365
+
366
  def write_monitor_summary(payload: dict[str, object]) -> None:
367
  summary = payload["summary"]
368
  print(
369
  "episodes="
370
  f"{summary['episode_count']} feasible={summary['feasible_episode_count']} "
371
  f"high_fidelity={summary['high_fidelity_episode_count']} "
372
+ f"failed={summary['evaluation_failed_episode_count']} "
373
+ f"mean_total_reward={_format_metric(summary['mean_total_reward'], signed=True)} "
374
+ f"mean_high_fidelity_score={_format_metric(summary['mean_high_fidelity_score'], signed=True)} "
375
+ f"reward_score_corr={summary['reward_final_score_correlation']}"
376
  )
377
  for episode in payload["episodes"]:
378
+ trace = episode.get("trace", episode)
379
  print(
380
  "seed="
381
+ f"{trace['seed']} total_reward={trace['total_reward']:+.4f} "
382
+ f"final_fidelity={trace['final_evaluation_fidelity']} "
383
+ f"feasible={trace['constraints_satisfied']} "
384
+ f"score={trace['final_score']:.6f} "
385
+ f"feasibility={trace['final_feasibility']:.6f}"
386
  )
387
+ if "parsed_action_count" in episode:
388
+ print(f" parsed_actions={episode['parsed_action_count']}")
389
+ for step in trace["steps"]:
390
  action_monitor = step["action_monitor"]
391
  print(
392
  " step="
 
409
  write_monitor_summary(payload)
410
 
411
 
412
def run_evaluate(args: argparse.Namespace) -> None:
    """Handle the `evaluate` CLI command: generate and replay per-seed
    completions, persist the JSON artifact, and print the console summary.
    """
    seed_list = parse_seed_list(args.seeds)
    payload = evaluate_payload(
        completion_command=args.completion_command,
        label=args.label,
        seeds=seed_list,
    )
    # Timestamped filename keeps successive evaluation runs side by side.
    stamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    artifact_path = args.output_dir / f"llm_evaluate_{stamp}.json"
    write_json(artifact_path, payload)
    print(artifact_path)
    write_monitor_summary(payload)
425
+
426
  def main() -> None:
427
  args = parse_args()
428
  if args.command == "prompt":
 
431
  if args.command == "replay":
432
  run_replay(args)
433
  return
434
+ if args.command == "evaluate":
435
+ run_evaluate(args)
436
+ return
437
  run_monitor(args)
438
 
439