# replicalab/tests/test_logging.py
# (Initial HF Spaces deployment, commit 80d8c84)
"""Tests for replicalab.utils.logging (MOD 07 + JDG 07)."""
from __future__ import annotations
import csv
import json
from pathlib import Path
import pytest
from replicalab.models import (
ConversationEntry,
EpisodeLog,
EpisodeState,
RewardBreakdown,
)
from replicalab.utils.logging import (
append_reward_csv,
append_reward_jsonl,
load_episode_log,
log_episode_reward,
write_episode_log,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_episode_log(episode_id: str = "test-ep-001") -> EpisodeLog:
    """Build a fully-populated EpisodeLog fixture for the tests below.

    The fixture covers a completed two-round episode with an "accept"
    verdict so round-trip tests can check every nested field.
    """
    final_state = EpisodeState(
        seed=42,
        scenario_template="math_reasoning",
        difficulty="easy",
        paper_title="Test Title",
        paper_hypothesis="Test Hypothesis",
        paper_method="Test Method",
        paper_key_finding="Test Finding",
        experiment_goal="Test Goal",
        round_number=2,
        max_rounds=6,
        done=True,
        agreement_reached=True,
        reward=5.0,
        rigor_score=0.8,
        feasibility_score=0.7,
        fidelity_score=0.9,
    )
    # One exchange per role so transcript ordering can be asserted.
    transcript = [
        ConversationEntry(
            role="scientist",
            message="Proposing protocol.",
            round_number=1,
            action_type="propose_protocol",
        ),
        ConversationEntry(
            role="lab_manager",
            message="Feasible.",
            round_number=1,
            action_type="report_feasibility",
        ),
    ]
    breakdown = RewardBreakdown(rigor=0.8, feasibility=0.7, fidelity=0.9)
    return EpisodeLog(
        episode_id=episode_id,
        seed=42,
        scenario_template="math_reasoning",
        difficulty="easy",
        final_state=final_state,
        transcript=transcript,
        reward_breakdown=breakdown,
        total_reward=5.0,
        rounds_used=2,
        agreement_reached=True,
        judge_notes="All checks passed.",
        verdict="accept",
    )
# ---------------------------------------------------------------------------
# write_episode_log / load_episode_log
# ---------------------------------------------------------------------------
class TestWriteAndLoadEpisodeLog:
    """Round-trip and filesystem behaviour of write/load_episode_log."""

    def test_round_trip_lossless(self, tmp_path: Path) -> None:
        """Writing then loading must reproduce the log exactly."""
        original = _make_episode_log()
        out_path = write_episode_log(original, directory=tmp_path)
        assert out_path.exists()
        assert out_path.suffix == ".json"
        assert load_episode_log(out_path) == original

    def test_filename_uses_episode_id(self, tmp_path: Path) -> None:
        """The JSON file is named after the episode id."""
        written = write_episode_log(
            _make_episode_log(episode_id="my-episode-42"), directory=tmp_path
        )
        assert written.name == "my-episode-42.json"

    def test_creates_missing_directories(self, tmp_path: Path) -> None:
        """Intermediate directories are created on demand."""
        deep_dir = tmp_path / "a" / "b" / "c"
        written = write_episode_log(_make_episode_log(), directory=deep_dir)
        assert written.exists()
        assert deep_dir.exists()

    def test_overwrites_existing_file(self, tmp_path: Path) -> None:
        """A second write with the same episode id replaces the first file."""
        first = _make_episode_log(episode_id="overwrite-test")
        # Mutate the second copy so its content differs from the first.
        second = _make_episode_log(episode_id="overwrite-test").model_copy(
            update={"total_reward": 9.9}
        )
        first_path = write_episode_log(first, directory=tmp_path)
        second_path = write_episode_log(second, directory=tmp_path)
        assert first_path == second_path
        assert load_episode_log(second_path).total_reward == 9.9

    def test_load_nonexistent_raises(self, tmp_path: Path) -> None:
        """Loading a missing path raises FileNotFoundError."""
        missing = tmp_path / "does-not-exist.json"
        with pytest.raises(FileNotFoundError):
            load_episode_log(missing)

    def test_default_directory_used(self) -> None:
        """write_episode_log with no directory should target the default replays dir."""
        episode = _make_episode_log(episode_id="default-dir-test")
        written = write_episode_log(episode)
        try:
            assert "replays" in str(written)
            assert written.exists()
            assert load_episode_log(written) == episode
        finally:
            # Clean up: remove the file we just wrote to the real replays dir
            written.unlink(missing_ok=True)

    def test_transcript_preserved(self, tmp_path: Path) -> None:
        """Both transcript entries survive the round trip, in order."""
        saved = write_episode_log(_make_episode_log(), directory=tmp_path)
        transcript = load_episode_log(saved).transcript
        assert len(transcript) == 2
        assert transcript[0].role == "scientist"
        assert transcript[1].role == "lab_manager"

    def test_reward_breakdown_preserved(self, tmp_path: Path) -> None:
        """The nested reward breakdown survives the round trip."""
        saved = write_episode_log(_make_episode_log(), directory=tmp_path)
        breakdown = load_episode_log(saved).reward_breakdown
        assert breakdown is not None
        assert breakdown.rigor == 0.8
        assert breakdown.feasibility == 0.7
        assert breakdown.fidelity == 0.9
# ---------------------------------------------------------------------------
# append_reward_csv
# ---------------------------------------------------------------------------
class TestAppendRewardCsv:
    """Behaviour of append_reward_csv: header handling and row appends."""

    @staticmethod
    def _read_rows(path: Path) -> list:
        """Parse the CSV at *path* into a list of dict rows."""
        with open(path, newline="", encoding="utf-8") as handle:
            return list(csv.DictReader(handle))

    def test_creates_file_with_header(self, tmp_path: Path) -> None:
        """First append creates the file and writes a single data row."""
        csv_path = tmp_path / "rewards.csv"
        returned = append_reward_csv(
            csv_path,
            episode_id="ep-1",
            seed=42,
            scenario_template="math_reasoning",
            difficulty="easy",
            total_reward=5.0,
            rigor=0.8,
            feasibility=0.7,
            fidelity=0.9,
            parsimony=0.95,
            efficiency_bonus=0.4,
            penalty_total=0.1,
            rounds_used=2,
            agreement_reached=True,
            verdict="accept",
        )
        assert returned == csv_path
        assert csv_path.exists()
        rows = self._read_rows(csv_path)
        assert len(rows) == 1
        first = rows[0]
        assert first["episode_id"] == "ep-1"
        assert first["seed"] == "42"
        assert first["total_reward"] == "5.0"
        assert first["agreement_reached"] == "True"

    def test_v2_columns_present(self, tmp_path: Path) -> None:
        """The v2 reward columns all appear with their stringified values."""
        csv_path = tmp_path / "rewards.csv"
        append_reward_csv(
            csv_path,
            episode_id="ep-v2",
            parsimony=0.85,
            efficiency_bonus=0.6,
            communication_bonus=0.0,
            penalty_total=0.2,
            verdict="accept",
        )
        row = self._read_rows(csv_path)[0]
        assert row["parsimony"] == "0.85"
        assert row["efficiency_bonus"] == "0.6"
        assert row["communication_bonus"] == "0.0"
        assert row["penalty_total"] == "0.2"
        assert row["verdict"] == "accept"

    def test_appends_multiple_rows(self, tmp_path: Path) -> None:
        """Repeated appends accumulate rows in call order."""
        csv_path = tmp_path / "rewards.csv"
        for index in range(3):
            append_reward_csv(
                csv_path,
                episode_id=f"ep-{index}",
                seed=index,
                total_reward=float(index),
            )
        rows = self._read_rows(csv_path)
        assert len(rows) == 3
        assert [r["episode_id"] for r in rows] == ["ep-0", "ep-1", "ep-2"]

    def test_no_duplicate_headers(self, tmp_path: Path) -> None:
        """Appending twice must not repeat the header line."""
        csv_path = tmp_path / "rewards.csv"
        append_reward_csv(csv_path, episode_id="a")
        append_reward_csv(csv_path, episode_id="b")
        raw_lines = csv_path.read_text(encoding="utf-8").strip().split("\n")
        headers = [line for line in raw_lines if line.startswith("episode_id")]
        assert len(headers) == 1

    def test_csv_header_has_all_expected_columns(self, tmp_path: Path) -> None:
        """The header row contains exactly the documented column set."""
        csv_path = tmp_path / "rewards.csv"
        append_reward_csv(csv_path, episode_id="hdr")
        with open(csv_path, newline="", encoding="utf-8") as handle:
            fieldnames = csv.DictReader(handle).fieldnames
        assert fieldnames is not None
        expected = {
            "episode_id", "seed", "scenario_template", "difficulty",
            "total_reward", "rigor", "feasibility", "fidelity",
            "parsimony", "efficiency_bonus", "communication_bonus",
            "penalty_total", "rounds_used", "agreement_reached", "verdict",
        }
        assert set(fieldnames) == expected
# ---------------------------------------------------------------------------
# append_reward_jsonl (JDG 07)
# ---------------------------------------------------------------------------
class TestAppendRewardJsonl:
    """Behaviour of append_reward_jsonl (JDG 07)."""

    @staticmethod
    def _records(path: Path) -> list:
        """Parse the JSONL file at *path* into a list of dicts."""
        raw = path.read_text(encoding="utf-8").strip()
        return [json.loads(line) for line in raw.split("\n")]

    def test_creates_file_with_one_record(self, tmp_path: Path) -> None:
        """First append creates the file with one fully-flattened record."""
        jsonl_path = tmp_path / "rewards.jsonl"
        returned = append_reward_jsonl(
            jsonl_path,
            episode_id="ep-j1",
            seed=7,
            scenario_template="ml_benchmark",
            difficulty="hard",
            total_reward=3.5,
            breakdown=RewardBreakdown(
                rigor=0.6,
                feasibility=0.8,
                fidelity=0.7,
                parsimony=0.9,
                efficiency_bonus=0.3,
                penalties={"invalid_tool_use": 0.1},
            ),
            rounds_used=4,
            agreement_reached=True,
            verdict="accept",
            judge_notes="Good protocol.",
            bounded_tool_metrics={"search_evidence": 2, "run_code_check": 1},
        )
        assert returned == jsonl_path
        assert jsonl_path.exists()
        records = self._records(jsonl_path)
        assert len(records) == 1
        record = records[0]
        assert record["episode_id"] == "ep-j1"
        assert record["rigor"] == 0.6
        assert record["parsimony"] == 0.9
        assert record["penalties"] == {"invalid_tool_use": 0.1}
        assert record["penalty_total"] == 0.1
        assert record["bounded_tool_metrics"]["search_evidence"] == 2
        assert record["verdict"] == "accept"
        assert record["judge_notes"] == "Good protocol."

    def test_appends_multiple_records(self, tmp_path: Path) -> None:
        """Repeated appends accumulate one JSON line per call, in order."""
        jsonl_path = tmp_path / "rewards.jsonl"
        for index in range(3):
            append_reward_jsonl(jsonl_path, episode_id=f"ep-{index}", seed=index)
        records = self._records(jsonl_path)
        assert len(records) == 3
        assert [r["episode_id"] for r in records] == ["ep-0", "ep-1", "ep-2"]

    def test_default_breakdown_used_when_none(self, tmp_path: Path) -> None:
        """Omitting the breakdown falls back to neutral default values."""
        jsonl_path = tmp_path / "rewards.jsonl"
        append_reward_jsonl(jsonl_path, episode_id="no-bd")
        record = self._records(jsonl_path)[0]
        assert record["rigor"] == 0.0
        assert record["feasibility"] == 0.0
        assert record["fidelity"] == 0.0
        assert record["parsimony"] == 1.0
        assert record["penalties"] == {}
        assert record["bounded_tool_metrics"] == {}

    def test_penalties_dict_preserved(self, tmp_path: Path) -> None:
        """Each named penalty survives and penalty_total is their sum."""
        jsonl_path = tmp_path / "rewards.jsonl"
        breakdown = RewardBreakdown(
            rigor=0.5,
            feasibility=0.5,
            fidelity=0.5,
            penalties={"unsupported_claim": 0.05, "invalid_tool_use": 0.1},
        )
        append_reward_jsonl(jsonl_path, breakdown=breakdown)
        record = self._records(jsonl_path)[0]
        assert record["penalties"]["unsupported_claim"] == 0.05
        assert record["penalties"]["invalid_tool_use"] == 0.1
        assert record["penalty_total"] == pytest.approx(0.15)

    def test_creates_missing_directories(self, tmp_path: Path) -> None:
        """Parent directories are created on demand."""
        nested_path = tmp_path / "a" / "b" / "rewards.jsonl"
        append_reward_jsonl(nested_path, episode_id="nested")
        assert nested_path.exists()
# ---------------------------------------------------------------------------
# log_episode_reward (JDG 07 convenience wrapper)
# ---------------------------------------------------------------------------
class TestLogEpisodeReward:
    """Behaviour of the log_episode_reward dual-sink convenience wrapper."""

    def test_writes_both_csv_and_jsonl(self, tmp_path: Path) -> None:
        """One call must produce a matching CSV row and JSONL record."""
        csv_path = tmp_path / "rewards.csv"
        jsonl_path = tmp_path / "rewards.jsonl"
        breakdown = RewardBreakdown(
            rigor=0.7,
            feasibility=0.9,
            fidelity=0.8,
            parsimony=0.95,
            efficiency_bonus=0.5,
            penalties={"stalling": 0.05},
        )
        csv_result, jsonl_result = log_episode_reward(
            episode_id="dual-1",
            seed=99,
            scenario_template="finance_trading",
            difficulty="medium",
            total_reward=4.2,
            breakdown=breakdown,
            rounds_used=3,
            agreement_reached=True,
            verdict="accept",
            judge_notes="Solid plan.",
            bounded_tool_metrics={"search_evidence": 1},
            csv_path=csv_path,
            jsonl_path=jsonl_path,
        )
        assert csv_result == csv_path
        assert jsonl_result == jsonl_path
        # CSV check
        with open(csv_path, newline="", encoding="utf-8") as handle:
            csv_rows = list(csv.DictReader(handle))
        assert len(csv_rows) == 1
        first_row = csv_rows[0]
        assert first_row["episode_id"] == "dual-1"
        assert first_row["parsimony"] == "0.95"
        assert first_row["verdict"] == "accept"
        # JSONL check
        record = json.loads(jsonl_path.read_text(encoding="utf-8").strip())
        assert record["episode_id"] == "dual-1"
        assert record["penalties"] == {"stalling": 0.05}
        assert record["bounded_tool_metrics"] == {"search_evidence": 1}
        assert record["judge_notes"] == "Solid plan."

    def test_deterministic_across_calls(self, tmp_path: Path) -> None:
        """Identical inputs must produce byte-identical output files."""
        breakdown = RewardBreakdown(rigor=0.6, feasibility=0.7, fidelity=0.8)
        shared_kwargs = dict(
            episode_id="det",
            seed=1,
            scenario_template="math_reasoning",
            difficulty="easy",
            total_reward=3.0,
            breakdown=breakdown,
            rounds_used=2,
            agreement_reached=False,
            verdict="timeout",
        )
        outputs = []
        for run_name in ("run1", "run2"):
            run_dir = tmp_path / run_name
            run_dir.mkdir()
            log_episode_reward(
                **shared_kwargs,
                csv_path=run_dir / "r.csv",
                jsonl_path=run_dir / "r.jsonl",
            )
            outputs.append(
                ((run_dir / "r.csv").read_text(), (run_dir / "r.jsonl").read_text())
            )
        assert outputs[0][0] == outputs[1][0]
        assert outputs[0][1] == outputs[1][1]

    def test_default_breakdown_when_none(self, tmp_path: Path) -> None:
        """Omitting the breakdown yields neutral defaults in both sinks."""
        csv_path = tmp_path / "rewards.csv"
        jsonl_path = tmp_path / "rewards.jsonl"
        log_episode_reward(
            episode_id="no-bd",
            csv_path=csv_path,
            jsonl_path=jsonl_path,
        )
        with open(csv_path, newline="", encoding="utf-8") as handle:
            csv_rows = list(csv.DictReader(handle))
        assert csv_rows[0]["parsimony"] == "1.0"
        record = json.loads(jsonl_path.read_text(encoding="utf-8").strip())
        assert record["parsimony"] == 1.0
        assert record["penalties"] == {}