# maris-ai-master / core-python /tests /test_dataset_scoring.py
# MarisUK's picture
# Maris AI model sync
# f440f03 verified
"""Tests for the dataset scoring/weighting helpers."""
from __future__ import annotations
import json
from pathlib import Path
from maris_core.data.scoring import (
DatasetBenchmarkFeedback,
DatasetScoringConfig,
apply_scoring_to_records,
build_benchmark_feedback_artifact,
load_benchmark_feedback,
score_record,
)
def test_score_record_prefers_richer_structured_examples() -> None:
    """A record with a prompt, metadata and source outscores a bare ``{"text": "ok"}``."""
    rich_record = {
        "prompt": "Izveido detalizētu arhitektūras plānu realtime voice assistantam latviešu valodā.",
        "metadata": {
            "channel": "voice",
            "language": "lv",
            "focus": "livekit",
            "mode": "streaming",
        },
        "source": "maris-production-bootstrap",
    }
    sparse_record = {"text": "ok"}

    # Both records are scored with the same text-length cap.
    rich_score = score_record(rich_record, max_text_chars=2048)
    sparse_score = score_record(sparse_record, max_text_chars=2048)

    assert rich_score > sparse_score
def test_apply_scoring_to_records_expands_high_scoring_examples() -> None:
    """Weight expansion yields more output records than inputs, with at least one repeat."""
    detailed_record = {
        "prompt": "Uztaisi pilnu dataset scoring un weighting pipeline, kas dod priekšroku detalizētiem, strukturētiem un bagātiem ierakstiem.",
        "metadata": {
            "channel": "training",
            "language": "lv",
            "priority": "high",
            "type": "design",
        },
        "source": "maris-production-bootstrap",
    }
    minimal_record = {"text": "ok"}

    expanded, report = apply_scoring_to_records(
        [detailed_record, minimal_record],
        split_name="train",
        config=DatasetScoringConfig(),
        expand_weights=True,
    )

    # Report-level bookkeeping.
    assert report.input_records == 2
    assert report.expanded_records > report.input_records
    assert report.repeated_records >= 1
    assert report.average_score > 0.0
    # At least one emitted record is tagged with a repeat count above one.
    assert any(row["maris_dataset_repeat_count"] > 1 for row in expanded)
def test_source_aware_weighting_prefers_production_over_noisy_records() -> None:
    """Identical prompts: the ``production`` tier gets more expanded copies than ``noisy``.

    Also checks that the report counts exactly one input record per tier, both
    in ``source_tiers`` and in ``source_dashboard``.
    """
    shared_prompt = "Izveido arhitektūras plānu balss asistentam."
    records = [
        {
            "prompt": shared_prompt,
            "metadata": {"source_tier": "production", "category": "reasoning"},
        },
        {
            "prompt": shared_prompt,
            "metadata": {"source_tier": "noisy", "category": "reasoning"},
        },
    ]

    expanded, report = apply_scoring_to_records(
        records,
        split_name="train",
        config=DatasetScoringConfig(),
        expand_weights=True,
    )

    def copies_in_tier(tier: str) -> int:
        # Count expanded records whose annotated source tier equals ``tier``.
        return sum(
            1 for row in expanded if row.get("maris_dataset_source_tier") == tier
        )

    production_copies = copies_in_tier("production")
    noisy_copies = copies_in_tier("noisy")

    assert production_copies > noisy_copies
    assert report.source_tiers["production"] == 1
    assert report.source_tiers["noisy"] == 1
    assert report.source_dashboard["production"]["records"] == 1
    assert report.source_dashboard["noisy"]["records"] == 1
def test_benchmark_feedback_boosts_matching_categories(tmp_path: Path) -> None:
    """Feedback from a benchmark manifest repeats ``reasoning`` records more than ``coding``.

    The manifest scores reasoning at 0.41 against a 0.7 target, while coding
    (0.79) is above its 0.72 target. The test expects more reasoning copies
    than coding copies and expects the report to record the boost.
    """
    manifest = {
        "score_manifest": {
            "overall": 0.62,
            "reasoning": 0.41,
            "coding": 0.79,
        }
    }
    manifest_file = tmp_path / "benchmark-manifest.json"
    manifest_file.write_text(json.dumps(manifest), encoding="utf-8")

    metric_targets = {"overall": 0.72, "reasoning": 0.7, "coding": 0.72}
    feedback = load_benchmark_feedback(
        manifest_file,
        targets=metric_targets,
        boost_scale=2.0,
        max_multiplier=1.75,
    )

    records = [
        {
            "prompt": "Izanalizē sistēmas kompromisus.",
            "metadata": {"category": "reasoning"},
        },
        {
            "prompt": "Uzraksti Python funkciju.",
            "metadata": {"category": "coding"},
        },
    ]
    expanded, report = apply_scoring_to_records(
        records,
        split_name="train",
        config=DatasetScoringConfig(),
        expand_weights=True,
        benchmark_feedback=feedback,
    )

    def copies_of(category: str) -> int:
        # Count expanded records whose metadata category equals ``category``.
        return sum(
            1
            for row in expanded
            if row.get("metadata", {}).get("category") == category
        )

    reasoning_copies = copies_of("reasoning")
    coding_copies = copies_of("coding")

    assert reasoning_copies > coding_copies
    assert report.feedback_boosted_records >= 1
    assert report.feedback_metric_hits["reasoning"] >= 1
    assert report.category_dashboard["reasoning"]["boosted_records"] >= 1
def test_build_benchmark_feedback_artifact_preserves_deficits() -> None:
    """The built artifact carries its type tag and the per-metric multiplier."""
    reasoning_deficit = {
        "target": 0.7,
        "actual": 0.45,
        "deficit": 0.25,
        "multiplier": 1.5,
    }
    feedback = DatasetBenchmarkFeedback(
        artifact_path="/tmp/benchmark-manifest.json",
        overall_multiplier=1.2,
        deficient_metrics={"reasoning": reasoning_deficit},
    )

    artifact = build_benchmark_feedback_artifact(feedback)

    assert artifact["artifact_type"] == "benchmark-feedback-reweighting"
    assert artifact["deficient_metrics"]["reasoning"]["multiplier"] == 1.5
def test_apply_scoring_to_records_uses_category_weight_map() -> None:
    """A ``category_weight_map`` entry appears as the sample's ``category_multiplier``.

    With ``{"debugging": 2.0}`` configured, the first sample score must report
    a multiplier of 2.0 and the single input must expand to at least two records.
    """
    debugging_record = {
        "prompt": "Debug SSE mismatch",
        "completion": "Check complete event handling.",
        "category": "debugging",
        "source": "maris-production-bootstrap",
        "metadata": {"language": "python"},
    }
    scoring_config = DatasetScoringConfig(
        category_weight_map={"debugging": 2.0},
        high_score_repeat_count=3,
        medium_score_repeat_count=2,
    )

    expanded, report = apply_scoring_to_records(
        [debugging_record],
        split_name="train",
        config=scoring_config,
        expand_weights=True,
    )

    first_sample = report.sample_scores[0]
    assert first_sample["category_multiplier"] == 2.0
    assert len(expanded) >= 2