"""Tests for richer preference dataset loading and summaries.""" from pathlib import Path from maris_core.training.preferences import ( build_blind_side_by_side_artifact, build_human_eval_summary, load_preference_dataset, summarize_preference_dataset, ) def test_load_preference_dataset_reads_richer_coder_fields(tmp_path: Path) -> None: preference_path = tmp_path / "preferences.json" preference_path.write_text( """[ { "prompt": "Salabo helperi", "chosen": "Labs variants", "rejected": "Slikts variants", "source": "human_review", "branch": "coder", "task_type": "bugfix", "language": "python", "source_type": "real_reviewer", "reviewer_segment": "ops", "risk_level": "high", "grounding_scope": "repo-grounded", "failure_bucket": "production_regression", "confidence": 0.91, "pair_id": "pair-1", "blind": true, "production_like": true, "repo_context": ["core-python", "backend-rust"], "execution_required": true, "tags": ["bugfix", "python"] } ]""", encoding="utf-8", ) examples = load_preference_dataset(preference_path) assert examples[0].branch == "coder" assert examples[0].task_type == "bugfix" assert examples[0].language == "python" assert examples[0].repo_context == ("core-python", "backend-rust") assert examples[0].execution_required is True assert examples[0].source_type == "real_reviewer" assert examples[0].reviewer_segment == "ops" assert examples[0].risk_level == "high" assert examples[0].grounding_scope == "repo-grounded" assert examples[0].failure_bucket == "production_regression" assert examples[0].confidence == 0.91 assert examples[0].blind is True assert examples[0].production_like is True def test_summarize_preference_dataset_counts_branch_and_execution() -> None: examples = load_preference_dataset( "/home/runner/work/Maris-MI/Maris-MI/core-python/evals/coder_preference_dataset.json" ) summary = summarize_preference_dataset(examples) assert summary["total_examples"] >= 10 assert summary["branches"]["coder"] >= 1 assert summary["task_types"]["bugfix"] >= 1 assert summary["languages"]["python"] >= 1 assert summary["execution_required_examples"] >= 1 assert summary["languages"]["typescript"] >= 1 assert summary["languages"]["rust"] >= 1 assert summary["languages"]["sql"] >= 1 assert summary["source_types"]["real_reviewer"] >= 1 assert summary["blind_examples"] >= 1 assert summary["pairwise_win_rate"] > 0.0 assert summary["production_like_examples"] >= 1 human_eval_summary = build_human_eval_summary(examples) assert human_eval_summary["artifact_type"] == "human-eval-summary" assert human_eval_summary["pairwise_win_rate"] > 0.0 blind_artifact = build_blind_side_by_side_artifact(examples[:2], seed=7) assert blind_artifact["artifact_type"] == "blind-side-by-side-eval-set" assert blind_artifact["pairs"][0]["candidates"][0]["slot"] == "A" assert "source" not in blind_artifact["pairs"][0]