"""Tests for the dataset scoring/weighting helpers."""
|
|
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| from maris_core.data.scoring import ( |
| DatasetBenchmarkFeedback, |
| DatasetScoringConfig, |
| apply_scoring_to_records, |
| build_benchmark_feedback_artifact, |
| load_benchmark_feedback, |
| score_record, |
| ) |
|
|
|
|
def test_score_record_prefers_richer_structured_examples() -> None:
    """A prompt with metadata and a source must outscore a bare ``{"text": "ok"}`` record."""
    rich_metadata = {
        "channel": "voice",
        "language": "lv",
        "focus": "livekit",
        "mode": "streaming",
    }
    rich_record = {
        "prompt": "Izveido detalizētu arhitektūras plānu realtime voice assistantam latviešu valodā.",
        "metadata": rich_metadata,
        "source": "maris-production-bootstrap",
    }
    sparse_record = {"text": "ok"}

    # Both records are scored with the same text budget so only content differs.
    rich_score = score_record(rich_record, max_text_chars=2048)
    sparse_score = score_record(sparse_record, max_text_chars=2048)

    assert rich_score > sparse_score
|
|
|
|
def test_apply_scoring_to_records_expands_high_scoring_examples() -> None:
    """Weight expansion over a rich and a minimal record must emit repeated copies.

    The report is expected to show more expanded records than inputs, at least
    one repeated record, and a positive average score.
    """
    detailed_record = {
        "prompt": "Uztaisi pilnu dataset scoring un weighting pipeline, kas dod priekšroku detalizētiem, strukturētiem un bagātiem ierakstiem.",
        "metadata": {
            "channel": "training",
            "language": "lv",
            "priority": "high",
            "type": "design",
        },
        "source": "maris-production-bootstrap",
    }
    minimal_record = {"text": "ok"}

    weighted_records, summary = apply_scoring_to_records(
        [detailed_record, minimal_record],
        split_name="train",
        config=DatasetScoringConfig(),
        expand_weights=True,
    )

    assert summary.input_records == 2
    assert summary.expanded_records > summary.input_records
    assert summary.repeated_records >= 1
    assert summary.average_score > 0.0
    # At least one emitted record must carry a repeat count above one.
    assert any(
        entry["maris_dataset_repeat_count"] > 1 for entry in weighted_records
    )
|
|
|
|
def test_source_aware_weighting_prefers_production_over_noisy_records() -> None:
    """Two records differing only in ``source_tier`` must expand unevenly.

    The ``production`` record is expected to receive more copies than the
    ``noisy`` one, while the report counts exactly one input record per tier.
    """
    shared_prompt = "Izveido arhitektūras plānu balss asistentam."
    tiered_records = [
        {
            "prompt": shared_prompt,
            "metadata": {"source_tier": "production", "category": "reasoning"},
        },
        {
            "prompt": shared_prompt,
            "metadata": {"source_tier": "noisy", "category": "reasoning"},
        },
    ]

    weighted_records, summary = apply_scoring_to_records(
        tiered_records,
        split_name="train",
        config=DatasetScoringConfig(),
        expand_weights=True,
    )

    # Collect the tier tag of every emitted record once, then count per tier.
    emitted_tiers = [
        entry.get("maris_dataset_source_tier") for entry in weighted_records
    ]
    production_copies = emitted_tiers.count("production")
    noisy_copies = emitted_tiers.count("noisy")

    assert production_copies > noisy_copies
    assert summary.source_tiers["production"] == 1
    assert summary.source_tiers["noisy"] == 1
    assert summary.source_dashboard["production"]["records"] == 1
    assert summary.source_dashboard["noisy"]["records"] == 1
|
|
|
|
def test_benchmark_feedback_boosts_matching_categories(tmp_path: Path) -> None:
    """Feedback loaded from a benchmark manifest must boost the lagging category.

    The manifest puts ``reasoning`` well below its target and ``coding`` above
    its target, so the reasoning record is expected to get more copies and to
    be reported as boosted.
    """
    manifest_payload = {
        "score_manifest": {
            "overall": 0.62,
            "reasoning": 0.41,
            "coding": 0.79,
        }
    }
    feedback_path = tmp_path / "benchmark-manifest.json"
    feedback_path.write_text(json.dumps(manifest_payload), encoding="utf-8")

    metric_targets = {"overall": 0.72, "reasoning": 0.7, "coding": 0.72}
    feedback = load_benchmark_feedback(
        feedback_path,
        targets=metric_targets,
        boost_scale=2.0,
        max_multiplier=1.75,
    )

    category_records = [
        {
            "prompt": "Izanalizē sistēmas kompromisus.",
            "metadata": {"category": "reasoning"},
        },
        {
            "prompt": "Uzraksti Python funkciju.",
            "metadata": {"category": "coding"},
        },
    ]
    weighted_records, summary = apply_scoring_to_records(
        category_records,
        split_name="train",
        config=DatasetScoringConfig(),
        expand_weights=True,
        benchmark_feedback=feedback,
    )

    # Gather the category of every emitted record, then count per category.
    emitted_categories = [
        entry.get("metadata", {}).get("category") for entry in weighted_records
    ]
    reasoning_copies = emitted_categories.count("reasoning")
    coding_copies = emitted_categories.count("coding")

    assert reasoning_copies > coding_copies
    assert summary.feedback_boosted_records >= 1
    assert summary.feedback_metric_hits["reasoning"] >= 1
    assert summary.category_dashboard["reasoning"]["boosted_records"] >= 1
|
|
|
|
def test_build_benchmark_feedback_artifact_preserves_deficits() -> None:
    """The built artifact must carry its type tag and the per-metric multiplier."""
    reasoning_deficit = {
        "target": 0.7,
        "actual": 0.45,
        "deficit": 0.25,
        "multiplier": 1.5,
    }
    feedback = DatasetBenchmarkFeedback(
        artifact_path="/tmp/benchmark-manifest.json",
        overall_multiplier=1.2,
        deficient_metrics={"reasoning": reasoning_deficit},
    )

    built = build_benchmark_feedback_artifact(feedback)

    assert built["artifact_type"] == "benchmark-feedback-reweighting"
    assert built["deficient_metrics"]["reasoning"]["multiplier"] == 1.5
|
|
|
|
def test_apply_scoring_to_records_uses_category_weight_map() -> None:
    """A ``category_weight_map`` entry must show up as the record's category multiplier.

    With ``debugging`` weighted at 2.0, the first sample score is expected to
    report that multiplier and the single input must expand to two or more
    records.
    """
    debugging_record = {
        "prompt": "Debug SSE mismatch",
        "completion": "Check complete event handling.",
        "category": "debugging",
        "source": "maris-production-bootstrap",
        "metadata": {"language": "python"},
    }
    weighted_config = DatasetScoringConfig(
        category_weight_map={"debugging": 2.0},
        high_score_repeat_count=3,
        medium_score_repeat_count=2,
    )

    weighted_records, summary = apply_scoring_to_records(
        [debugging_record],
        split_name="train",
        config=weighted_config,
        expand_weights=True,
    )

    first_sample = summary.sample_scores[0]
    assert first_sample["category_multiplier"] == 2.0
    assert len(weighted_records) >= 2
|
|