File size: 3,297 Bytes
f440f03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""Tests for richer preference dataset loading and summaries."""

from pathlib import Path

from maris_core.training.preferences import (
    build_blind_side_by_side_artifact,
    build_human_eval_summary,
    load_preference_dataset,
    summarize_preference_dataset,
)


def test_load_preference_dataset_reads_richer_coder_fields(tmp_path: Path) -> None:
    """Write a one-record preference file and check the loaded example's fields.

    The record carries the extended coder metadata (branch, task type,
    language, reviewer info, risk, grounding, flags, ...); the assertions
    verify each of those attributes on the first loaded example.
    """
    raw_payload = """[
          {
            "prompt": "Salabo helperi",
            "chosen": "Labs variants",
            "rejected": "Slikts variants",
            "source": "human_review",
            "branch": "coder",
            "task_type": "bugfix",
            "language": "python",
            "source_type": "real_reviewer",
            "reviewer_segment": "ops",
            "risk_level": "high",
            "grounding_scope": "repo-grounded",
            "failure_bucket": "production_regression",
            "confidence": 0.91,
            "pair_id": "pair-1",
            "blind": true,
            "production_like": true,
            "repo_context": ["core-python", "backend-rust"],
            "execution_required": true,
            "tags": ["bugfix", "python"]
          }
        ]"""
    dataset_file = tmp_path / "preferences.json"
    dataset_file.write_text(raw_payload, encoding="utf-8")

    loaded = load_preference_dataset(dataset_file)
    record = loaded[0]

    # Core coder routing fields.
    assert record.branch == "coder"
    assert record.task_type == "bugfix"
    assert record.language == "python"
    # The JSON list is expected back as a tuple.
    assert record.repo_context == ("core-python", "backend-rust")
    assert record.execution_required is True
    # Reviewer / risk metadata.
    assert record.source_type == "real_reviewer"
    assert record.reviewer_segment == "ops"
    assert record.risk_level == "high"
    assert record.grounding_scope == "repo-grounded"
    assert record.failure_bucket == "production_regression"
    assert record.confidence == 0.91
    # Boolean flags must be real booleans, not merely truthy values.
    assert record.blind is True
    assert record.production_like is True


def test_summarize_preference_dataset_counts_branch_and_execution() -> None:
    """Summarize the bundled coder preference dataset and check the artifacts.

    Loads ``evals/coder_preference_dataset.json``, then verifies:

    * the counters returned by ``summarize_preference_dataset``,
    * the artifact type and win rate from ``build_human_eval_summary``,
    * the artifact type, first slot label, and absence of a ``"source"`` key
      in the first pair from ``build_blind_side_by_side_artifact``.
    """
    # The dataset used to be addressed only by an absolute CI-runner path,
    # which made the test fail on any other checkout location.  Look for it
    # relative to this test file first and keep the CI path as a fallback.
    dataset_relpath = Path("evals") / "coder_preference_dataset.json"
    dataset_path = Path(
        "/home/runner/work/Maris-MI/Maris-MI/core-python/evals/coder_preference_dataset.json"
    )
    for ancestor in Path(__file__).resolve().parents:
        candidate = ancestor / dataset_relpath
        if candidate.is_file():
            # Nearest ancestor wins, so a checkout's own copy is preferred.
            dataset_path = candidate
            break

    examples = load_preference_dataset(dataset_path)

    summary = summarize_preference_dataset(examples)

    assert summary["total_examples"] >= 10
    assert summary["branches"]["coder"] >= 1
    assert summary["task_types"]["bugfix"] >= 1
    assert summary["languages"]["python"] >= 1
    assert summary["execution_required_examples"] >= 1
    assert summary["languages"]["typescript"] >= 1
    assert summary["languages"]["rust"] >= 1
    assert summary["languages"]["sql"] >= 1
    assert summary["source_types"]["real_reviewer"] >= 1
    assert summary["blind_examples"] >= 1
    assert summary["pairwise_win_rate"] > 0.0
    assert summary["production_like_examples"] >= 1

    human_eval_summary = build_human_eval_summary(examples)
    assert human_eval_summary["artifact_type"] == "human-eval-summary"
    assert human_eval_summary["pairwise_win_rate"] > 0.0

    # A fixed seed is passed so the slot assignment checked below is stable
    # (presumably the seed drives candidate shuffling -- confirm in the
    # implementation).
    blind_artifact = build_blind_side_by_side_artifact(examples[:2], seed=7)
    assert blind_artifact["artifact_type"] == "blind-side-by-side-eval-set"
    assert blind_artifact["pairs"][0]["candidates"][0]["slot"] == "A"
    assert "source" not in blind_artifact["pairs"][0]