maris-ai-master / core-python /tests /test_preferences.py
MarisUK's picture
Maris AI model sync
f440f03 verified
"""Tests for richer preference dataset loading and summaries."""
from pathlib import Path
from maris_core.training.preferences import (
build_blind_side_by_side_artifact,
build_human_eval_summary,
load_preference_dataset,
summarize_preference_dataset,
)
def test_load_preference_dataset_reads_richer_coder_fields(tmp_path: Path) -> None:
    """Round-trip one fully populated preference record through the loader.

    A single-record JSON file is written under ``tmp_path``, loaded with
    ``load_preference_dataset``, and each extended field of the first
    loaded example is compared against the value written to disk.
    """
    payload = """[
{
"prompt": "Salabo helperi",
"chosen": "Labs variants",
"rejected": "Slikts variants",
"source": "human_review",
"branch": "coder",
"task_type": "bugfix",
"language": "python",
"source_type": "real_reviewer",
"reviewer_segment": "ops",
"risk_level": "high",
"grounding_scope": "repo-grounded",
"failure_bucket": "production_regression",
"confidence": 0.91,
"pair_id": "pair-1",
"blind": true,
"production_like": true,
"repo_context": ["core-python", "backend-rust"],
"execution_required": true,
"tags": ["bugfix", "python"]
}
]"""
    dataset_file = tmp_path / "preferences.json"
    dataset_file.write_text(payload, encoding="utf-8")

    loaded = load_preference_dataset(dataset_file)
    first = loaded[0]

    # Coder-specific routing fields.
    assert first.branch == "coder"
    assert first.task_type == "bugfix"
    assert first.language == "python"
    # The JSON array is expected back as a tuple, not a list.
    assert first.repo_context == ("core-python", "backend-rust")
    assert first.execution_required is True

    # Reviewer / risk metadata.
    assert first.source_type == "real_reviewer"
    assert first.reviewer_segment == "ops"
    assert first.risk_level == "high"
    assert first.grounding_scope == "repo-grounded"
    assert first.failure_bucket == "production_regression"
    assert first.confidence == 0.91

    # Boolean flags must be real ``True`` values, not merely truthy.
    assert first.blind is True
    assert first.production_like is True
def test_summarize_preference_dataset_counts_branch_and_execution() -> None:
    """Check summaries and artifacts built from the bundled coder dataset.

    Loads ``evals/coder_preference_dataset.json`` and asserts on the output of
    ``summarize_preference_dataset``, ``build_human_eval_summary`` and
    ``build_blind_side_by_side_artifact``.
    """
    # Locate the dataset relative to this test file (<root>/tests -> <root>/evals)
    # rather than through a CI-runner absolute path, so the test also passes on
    # developer machines and in checkouts placed elsewhere.
    dataset_path = (
        Path(__file__).resolve().parents[1] / "evals" / "coder_preference_dataset.json"
    )
    examples = load_preference_dataset(dataset_path)

    summary = summarize_preference_dataset(examples)
    assert summary["total_examples"] >= 10
    assert summary["branches"]["coder"] >= 1
    assert summary["task_types"]["bugfix"] >= 1
    assert summary["languages"]["python"] >= 1
    assert summary["execution_required_examples"] >= 1
    assert summary["languages"]["typescript"] >= 1
    assert summary["languages"]["rust"] >= 1
    assert summary["languages"]["sql"] >= 1
    assert summary["source_types"]["real_reviewer"] >= 1
    assert summary["blind_examples"] >= 1
    assert summary["pairwise_win_rate"] > 0.0
    assert summary["production_like_examples"] >= 1

    human_eval_summary = build_human_eval_summary(examples)
    assert human_eval_summary["artifact_type"] == "human-eval-summary"
    assert human_eval_summary["pairwise_win_rate"] > 0.0

    # A fixed seed is passed so the slot assertion below is checked against the
    # same artifact on every run.
    blind_artifact = build_blind_side_by_side_artifact(examples[:2], seed=7)
    assert blind_artifact["artifact_type"] == "blind-side-by-side-eval-set"
    assert blind_artifact["pairs"][0]["candidates"][0]["slot"] == "A"
    # The pair entry must not carry a "source" key.
    assert "source" not in blind_artifact["pairs"][0]