"""Tests dataset scoring/weighting helperiem.""" from __future__ import annotations import json from pathlib import Path from maris_core.data.scoring import ( DatasetBenchmarkFeedback, DatasetScoringConfig, apply_scoring_to_records, build_benchmark_feedback_artifact, load_benchmark_feedback, score_record, ) def test_score_record_prefers_richer_structured_examples() -> None: high_value = { "prompt": "Izveido detalizētu arhitektūras plānu realtime voice assistantam latviešu valodā.", "metadata": { "channel": "voice", "language": "lv", "focus": "livekit", "mode": "streaming", }, "source": "maris-production-bootstrap", } low_value = {"text": "ok"} assert score_record(high_value, max_text_chars=2048) > score_record( low_value, max_text_chars=2048, ) def test_apply_scoring_to_records_expands_high_scoring_examples() -> None: records = [ { "prompt": "Uztaisi pilnu dataset scoring un weighting pipeline, kas dod priekšroku detalizētiem, strukturētiem un bagātiem ierakstiem.", "metadata": { "channel": "training", "language": "lv", "priority": "high", "type": "design", }, "source": "maris-production-bootstrap", }, {"text": "ok"}, ] expanded, report = apply_scoring_to_records( records, split_name="train", config=DatasetScoringConfig(), expand_weights=True, ) assert report.input_records == 2 assert report.expanded_records > report.input_records assert report.repeated_records >= 1 assert report.average_score > 0.0 assert any(item["maris_dataset_repeat_count"] > 1 for item in expanded) def test_source_aware_weighting_prefers_production_over_noisy_records() -> None: records = [ { "prompt": "Izveido arhitektūras plānu balss asistentam.", "metadata": {"source_tier": "production", "category": "reasoning"}, }, { "prompt": "Izveido arhitektūras plānu balss asistentam.", "metadata": {"source_tier": "noisy", "category": "reasoning"}, }, ] expanded, report = apply_scoring_to_records( records, split_name="train", config=DatasetScoringConfig(), expand_weights=True, ) production_copies = sum( 1 for item in expanded if item.get("maris_dataset_source_tier") == "production" ) noisy_copies = sum(1 for item in expanded if item.get("maris_dataset_source_tier") == "noisy") assert production_copies > noisy_copies assert report.source_tiers["production"] == 1 assert report.source_tiers["noisy"] == 1 assert report.source_dashboard["production"]["records"] == 1 assert report.source_dashboard["noisy"]["records"] == 1 def test_benchmark_feedback_boosts_matching_categories(tmp_path: Path) -> None: feedback_path = tmp_path / "benchmark-manifest.json" feedback_path.write_text( json.dumps( { "score_manifest": { "overall": 0.62, "reasoning": 0.41, "coding": 0.79, } } ), encoding="utf-8", ) feedback = load_benchmark_feedback( feedback_path, targets={"overall": 0.72, "reasoning": 0.7, "coding": 0.72}, boost_scale=2.0, max_multiplier=1.75, ) expanded, report = apply_scoring_to_records( [ { "prompt": "Izanalizē sistēmas kompromisus.", "metadata": {"category": "reasoning"}, }, { "prompt": "Uzraksti Python funkciju.", "metadata": {"category": "coding"}, }, ], split_name="train", config=DatasetScoringConfig(), expand_weights=True, benchmark_feedback=feedback, ) reasoning_copies = sum( 1 for item in expanded if item.get("metadata", {}).get("category") == "reasoning" ) coding_copies = sum( 1 for item in expanded if item.get("metadata", {}).get("category") == "coding" ) assert reasoning_copies > coding_copies assert report.feedback_boosted_records >= 1 assert report.feedback_metric_hits["reasoning"] >= 1 assert report.category_dashboard["reasoning"]["boosted_records"] >= 1 def test_build_benchmark_feedback_artifact_preserves_deficits() -> None: artifact = build_benchmark_feedback_artifact( DatasetBenchmarkFeedback( artifact_path="/tmp/benchmark-manifest.json", overall_multiplier=1.2, deficient_metrics={ "reasoning": { "target": 0.7, "actual": 0.45, "deficit": 0.25, "multiplier": 1.5, } }, ) ) assert artifact["artifact_type"] == "benchmark-feedback-reweighting" assert artifact["deficient_metrics"]["reasoning"]["multiplier"] == 1.5 def test_apply_scoring_to_records_uses_category_weight_map() -> None: records = [ { "prompt": "Debug SSE mismatch", "completion": "Check complete event handling.", "category": "debugging", "source": "maris-production-bootstrap", "metadata": {"language": "python"}, } ] expanded, report = apply_scoring_to_records( records, split_name="train", config=DatasetScoringConfig( category_weight_map={"debugging": 2.0}, high_score_repeat_count=3, medium_score_repeat_count=2, ), expand_weights=True, ) assert report.sample_scores[0]["category_multiplier"] == 2.0 assert len(expanded) >= 2