Spaces:

KChad
/

Pulse_ER_env

Sleeping

File size: 6,762 Bytes

9b1756a

"""Generate a submission-friendly before/after mock policy comparison plot.

This script is intentionally honest about what "before" means:

- before: a weak random tool-calling baseline on the selected mock scenario
- after: the trained/expert policy on the same scenario

It also includes the verified mock benchmark scores for context so the figure
is useful both as a quick demo asset and as a reproducible benchmark artifact.
"""

from __future__ import annotations

import argparse
import textwrap
from pathlib import Path

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

from pulse_physiology_env.demo_llm_policy import heuristic_infer_fn
from pulse_physiology_env.episode_runner import EpisodeRunner
from pulse_physiology_env.eval_mock import score_policy, score_random_policy
from pulse_physiology_env.policies import LLMPolicy, RandomPolicy, build_expert_policy, build_no_action_policy
from pulse_physiology_env.server.adapters import MockPulseAdapter
from pulse_physiology_env.server.mock_scenarios import MOCK_SCENARIOS


def _run_trace(policy, *, scenario_id: str):
    backend = MockPulseAdapter(default_scenario_id=scenario_id, seed=0)
    runner = EpisodeRunner(backend=backend, max_steps=8)
    try:
        return runner.run(policy=policy, scenario_id=scenario_id)
    finally:
        close_method = getattr(backend, "close", None)
        if callable(close_method):
            close_method()


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--scenario", default="respiratory_distress", choices=sorted(MOCK_SCENARIOS))
    parser.add_argument(
        "--output",
        default="artifacts/mock_before_after_comparison.png",
        help="Output PNG path.",
    )
    args = parser.parse_args()

    expert_scores = score_policy(lambda scenario_id: build_expert_policy(), "expert")
    llm_scores = score_policy(
        lambda scenario_id: LLMPolicy(infer_fn=heuristic_infer_fn, name="llm_demo"),
        "llm_demo",
    )
    random_scores = score_random_policy()
    no_action_scores = score_policy(lambda scenario_id: build_no_action_policy(), "no_action")

    scenario_id = args.scenario
    expert_trace = _run_trace(build_expert_policy(), scenario_id=scenario_id)
    random_trace = _run_trace(RandomPolicy(seed=0), scenario_id=scenario_id)

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(14, 9), facecolor="white")
    gs = fig.add_gridspec(2, 2, height_ratios=[2.2, 1.2], hspace=0.35, wspace=0.25)

    ax_bar = fig.add_subplot(gs[0, 0])
    labels = ["Expert", "LLM Demo", "Random", "No Action"]
    values = [
        expert_scores.per_scenario[scenario_id],
        llm_scores.per_scenario[scenario_id],
        random_scores.per_scenario[scenario_id],
        no_action_scores.per_scenario[scenario_id],
    ]
    colors = ["#1f9d73", "#4c84c4", "#d68c2f", "#cc4b5a"]
    bars = ax_bar.bar(labels, values, color=colors, edgecolor="#1d2736", linewidth=1.2)
    ax_bar.axhline(0.0, color="#364152", linewidth=1.1)
    ax_bar.set_title(f"Scenario Reward Comparison: {scenario_id.replace('_', ' ').title()}", fontsize=15, pad=12)
    ax_bar.set_ylabel("Episode reward")
    ax_bar.grid(axis="y", alpha=0.18)
    for bar, value in zip(bars, values, strict=True):
        va = "bottom" if value >= 0 else "top"
        offset = 0.18 if value >= 0 else -0.18
        ax_bar.text(
            bar.get_x() + bar.get_width() / 2,
            value + offset,
            f"{value:+.3f}",
            ha="center",
            va=va,
            fontsize=11,
            fontweight="bold",
        )

    ax_trace = fig.add_subplot(gs[0, 1])
    expert_steps = [0] + [step.step_index + 1 for step in expert_trace.steps]
    expert_spo2 = [expert_trace.initial_observation.spo2 * 100] + [step.observation.spo2 * 100 for step in expert_trace.steps]
    random_steps = [0] + [step.step_index + 1 for step in random_trace.steps]
    random_spo2 = [random_trace.initial_observation.spo2 * 100] + [step.observation.spo2 * 100 for step in random_trace.steps]
    ax_trace.plot(expert_steps, expert_spo2, marker="o", linewidth=2.6, color="#1f9d73", label="Expert / after")
    ax_trace.plot(random_steps, random_spo2, marker="o", linewidth=2.2, color="#d68c2f", label="Random seed=0 / before")
    ax_trace.set_title("SpO2 Trajectory on the Same Scenario", fontsize=15, pad=12)
    ax_trace.set_xlabel("Episode step")
    ax_trace.set_ylabel("SpO2 (%)")
    ax_trace.grid(alpha=0.18)
    ax_trace.legend(frameon=False, loc="lower right")

    ax_text_left = fig.add_subplot(gs[1, 0])
    ax_text_left.axis("off")
    ax_text_left.text(
        0,
        1,
        "Before / after framing",
        fontsize=14,
        fontweight="bold",
        va="top",
    )
    ax_text_left.text(
        0,
        0.82,
        textwrap.fill(
            f"Before: random tool-calling baseline on {scenario_id} "
            f"finishes at {random_trace.total_reward:+.3f} reward.\n"
            f"After: expert policy on the same scenario finishes at {expert_trace.total_reward:+.3f} reward.\n\n"
            "This is a better submission artifact than reusing the same training curve twice, "
            "because it shows policy quality separation on an actual episode.",
            width=55,
        ),
        fontsize=11.5,
        color="#334155",
        va="top",
        linespacing=1.5,
        wrap=True,
    )

    ax_text_right = fig.add_subplot(gs[1, 1])
    ax_text_right.axis("off")
    ax_text_right.text(
        0,
        1,
        "Verified mock benchmark ordering",
        fontsize=14,
        fontweight="bold",
        va="top",
    )
    ax_text_right.text(
        0,
        0.82,
        textwrap.fill(
            f"Expert average: {expert_scores.average_reward:+.3f}\n"
            f"LLM demo average: {llm_scores.average_reward:+.3f}\n"
            f"Random average: {random_scores.average_reward:+.3f}\n"
            f"No-action average: {no_action_scores.average_reward:+.3f}",
            width=32,
        ),
        fontsize=11.5,
        color="#334155",
        va="top",
        linespacing=1.6,
        wrap=True,
    )

    fig.suptitle(
        "Pulse-ER Mock Before vs After\nRandom Tool Calls vs Trained Policy",
        fontsize=20,
        fontweight="bold",
        y=0.98,
    )
    fig.text(
        0.5,
        0.03,
        "Before = random tool-calling baseline (seed 0). After = expert/trained policy. All values are generated from the current repo.",
        ha="center",
        fontsize=10.5,
        color="#475569",
    )
    fig.savefig(output_path, dpi=180, bbox_inches="tight")
    print(output_path.resolve())


if __name__ == "__main__":
    main()