File size: 3,297 Bytes

f440f03

"""Tests for richer preference dataset loading and summaries."""

from pathlib import Path

from maris_core.training.preferences import (
    build_blind_side_by_side_artifact,
    build_human_eval_summary,
    load_preference_dataset,
    summarize_preference_dataset,
)


def test_load_preference_dataset_reads_richer_coder_fields(tmp_path: Path) -> None:
    """Round-trip one fully populated preference record through the loader.

    A single JSON record carrying the extended coder metadata is written to a
    temporary file, loaded back, and each extended attribute of the first
    loaded example is compared against the value that was written.
    """
    raw_payload = """[
          {
            "prompt": "Salabo helperi",
            "chosen": "Labs variants",
            "rejected": "Slikts variants",
            "source": "human_review",
            "branch": "coder",
            "task_type": "bugfix",
            "language": "python",
            "source_type": "real_reviewer",
            "reviewer_segment": "ops",
            "risk_level": "high",
            "grounding_scope": "repo-grounded",
            "failure_bucket": "production_regression",
            "confidence": 0.91,
            "pair_id": "pair-1",
            "blind": true,
            "production_like": true,
            "repo_context": ["core-python", "backend-rust"],
            "execution_required": true,
            "tags": ["bugfix", "python"]
          }
        ]"""
    dataset_file = tmp_path / "preferences.json"
    dataset_file.write_text(raw_payload, encoding="utf-8")

    loaded = load_preference_dataset(dataset_file)
    first = loaded[0]

    # Task-describing fields.
    assert (first.branch, first.task_type, first.language) == (
        "coder",
        "bugfix",
        "python",
    )
    # The JSON list is expected to come back as a tuple.
    assert first.repo_context == ("core-python", "backend-rust")
    assert first.execution_required is True

    # Reviewer / provenance metadata.
    assert (first.source_type, first.reviewer_segment) == ("real_reviewer", "ops")
    assert (first.risk_level, first.grounding_scope) == ("high", "repo-grounded")
    assert first.failure_bucket == "production_regression"
    assert first.confidence == 0.91

    # Boolean flags must be real booleans, not merely truthy values.
    assert first.blind is True
    assert first.production_like is True


def test_summarize_preference_dataset_counts_branch_and_execution() -> None:
    """Check the summary, human-eval summary, and blind artifact for the bundled dataset.

    The dataset file is located by walking up from this test module and
    looking for ``evals/coder_preference_dataset.json`` (either directly in an
    ancestor directory or under an ancestor's ``core-python`` directory), so
    the test does not depend on one machine's checkout location. When no such
    file is found, the original CI runner path is used as a fallback.
    """
    # Absolute location on the hosted CI runner; kept only as a fallback so
    # behaviour there is unchanged even if the upward search finds nothing.
    ci_dataset_path = Path(
        "/home/runner/work/Maris-MI/Maris-MI/core-python/evals/coder_preference_dataset.json"
    )
    relative_dataset_path = Path("evals") / "coder_preference_dataset.json"
    dataset_path = next(
        (
            candidate
            for ancestor in Path(__file__).resolve().parents
            for candidate in (
                ancestor / relative_dataset_path,
                ancestor / "core-python" / relative_dataset_path,
            )
            if candidate.is_file()
        ),
        ci_dataset_path,
    )

    examples = load_preference_dataset(dataset_path)

    summary = summarize_preference_dataset(examples)

    # Aggregate counts: the dataset must cover several languages and sources.
    assert summary["total_examples"] >= 10
    assert summary["branches"]["coder"] >= 1
    assert summary["task_types"]["bugfix"] >= 1
    assert summary["languages"]["python"] >= 1
    assert summary["execution_required_examples"] >= 1
    assert summary["languages"]["typescript"] >= 1
    assert summary["languages"]["rust"] >= 1
    assert summary["languages"]["sql"] >= 1
    assert summary["source_types"]["real_reviewer"] >= 1
    assert summary["blind_examples"] >= 1
    assert summary["pairwise_win_rate"] > 0.0
    assert summary["production_like_examples"] >= 1

    human_eval_summary = build_human_eval_summary(examples)
    assert human_eval_summary["artifact_type"] == "human-eval-summary"
    assert human_eval_summary["pairwise_win_rate"] > 0.0

    # Built from the first two examples with a fixed seed.
    blind_artifact = build_blind_side_by_side_artifact(examples[:2], seed=7)
    assert blind_artifact["artifact_type"] == "blind-side-by-side-eval-set"
    assert blind_artifact["pairs"][0]["candidates"][0]["slot"] == "A"
    # The pair entry itself must not carry a "source" key.
    assert "source" not in blind_artifact["pairs"][0]