| """Tests for richer preference dataset loading and summaries.""" |
|
|
| from pathlib import Path |
|
|
| from maris_core.training.preferences import ( |
| build_blind_side_by_side_artifact, |
| build_human_eval_summary, |
| load_preference_dataset, |
| summarize_preference_dataset, |
| ) |
|
|
|
|
def test_load_preference_dataset_reads_richer_coder_fields(tmp_path: Path) -> None:
    """Check that the loader exposes every extended coder field of a record.

    A single-record JSON file is written into the pytest temporary
    directory, loaded back, and each extended attribute of the first
    loaded example is compared against the value stored in the file.
    """
    dataset_file = tmp_path / "preferences.json"
    dataset_file.write_text(
        """[
  {
    "prompt": "Salabo helperi",
    "chosen": "Labs variants",
    "rejected": "Slikts variants",
    "source": "human_review",
    "branch": "coder",
    "task_type": "bugfix",
    "language": "python",
    "source_type": "real_reviewer",
    "reviewer_segment": "ops",
    "risk_level": "high",
    "grounding_scope": "repo-grounded",
    "failure_bucket": "production_regression",
    "confidence": 0.91,
    "pair_id": "pair-1",
    "blind": true,
    "production_like": true,
    "repo_context": ["core-python", "backend-rust"],
    "execution_required": true,
    "tags": ["bugfix", "python"]
  }
]""",
        encoding="utf-8",
    )

    first_example = load_preference_dataset(dataset_file)[0]

    # Attributes compared by equality; note that the JSON list for
    # "repo_context" is expected to come back as a tuple.
    expected_values = {
        "branch": "coder",
        "task_type": "bugfix",
        "language": "python",
        "repo_context": ("core-python", "backend-rust"),
        "source_type": "real_reviewer",
        "reviewer_segment": "ops",
        "risk_level": "high",
        "grounding_scope": "repo-grounded",
        "failure_bucket": "production_regression",
        "confidence": 0.91,
    }
    for attribute, expected in expected_values.items():
        assert getattr(first_example, attribute) == expected, attribute

    # Boolean flags must be the real ``True`` singleton, not merely truthy.
    for flag in ("execution_required", "blind", "production_like"):
        assert getattr(first_example, flag) is True, flag
|
|
|
|
def _locate_coder_preference_dataset() -> Path:
    """Return the path of the bundled coder preference dataset.

    The dataset is searched for relative to every ancestor directory of
    this test file, so the test works from any checkout location rather
    than only on the CI runner. If no ancestor contains the file, the
    original CI runner path is returned so behavior there is unchanged.
    """
    relative_candidates = (
        Path("evals") / "coder_preference_dataset.json",
        Path("core-python") / "evals" / "coder_preference_dataset.json",
    )
    for ancestor in Path(__file__).resolve().parents:
        for relative in relative_candidates:
            candidate = ancestor / relative
            if candidate.is_file():
                return candidate
    # Fallback: the absolute location used on the GitHub Actions runner.
    return Path(
        "/home/runner/work/Maris-MI/Maris-MI/core-python/evals/coder_preference_dataset.json"
    )


def test_summarize_preference_dataset_counts_branch_and_execution() -> None:
    """Check summary counters and derived artifacts for the bundled dataset.

    Loads the coder preference dataset shipped with the repository and
    verifies the aggregate summary, the human-eval summary, and the blind
    side-by-side artifact built from the first two examples.
    """
    examples = load_preference_dataset(_locate_coder_preference_dataset())

    summary = summarize_preference_dataset(examples)

    assert summary["total_examples"] >= 10
    assert summary["branches"]["coder"] >= 1
    assert summary["task_types"]["bugfix"] >= 1
    assert summary["languages"]["python"] >= 1
    assert summary["execution_required_examples"] >= 1
    assert summary["languages"]["typescript"] >= 1
    assert summary["languages"]["rust"] >= 1
    assert summary["languages"]["sql"] >= 1
    assert summary["source_types"]["real_reviewer"] >= 1
    assert summary["blind_examples"] >= 1
    assert summary["pairwise_win_rate"] > 0.0
    assert summary["production_like_examples"] >= 1

    human_eval_summary = build_human_eval_summary(examples)
    assert human_eval_summary["artifact_type"] == "human-eval-summary"
    assert human_eval_summary["pairwise_win_rate"] > 0.0

    blind_artifact = build_blind_side_by_side_artifact(examples[:2], seed=7)
    assert blind_artifact["artifact_type"] == "blind-side-by-side-eval-set"
    assert blind_artifact["pairs"][0]["candidates"][0]["slot"] == "A"
    # The blind artifact must not carry the "source" key on a pair.
    assert "source" not in blind_artifact["pairs"][0]
|
|