File size: 2,690 Bytes
"""Tests for the dataset quality gate helpers."""
from __future__ import annotations
from maris_core.data.quality import DatasetQualityGateConfig, apply_quality_gate_to_records
def test_apply_quality_gate_to_records_removes_duplicates_and_low_quality_examples() -> None:
    """Run the quality gate over a mixed batch and check the rejection bookkeeping.

    Five records go in with ``min_text_chars=24``; the test expects two to
    survive, one duplicate to be counted, and one hit each for the
    ``duplicate_training_text``, ``invalid_conversation_pair`` and
    ``too_short`` reasons.
    """
    greeting_pair = {"user": "Sveiki, Maris!", "assistant": "Sveiki! Kā varu palīdzēt šodien?"}
    # Separate dict object with equal content, mirroring the two literals a
    # hand-written fixture would contain.
    greeting_pair_again = dict(greeting_pair)
    # Presumably the trigger for "invalid_conversation_pair": the reply
    # repeats the user turn verbatim.
    echoed_pair = {"user": "Atkārto mani", "assistant": "Atkārto mani"}
    # Four characters of text, well under the configured 24-character minimum.
    tiny_text = {"text": "test"}
    prompt_only = {
        "prompt": "Izveido detalizētu plānu latviešu valodā klientu atbalsta balss asistentam.",
        "metadata": {"channel": "voice", "language": "lv"},
    }
    batch = [greeting_pair, greeting_pair_again, echoed_pair, tiny_text, prompt_only]
    gate_config = DatasetQualityGateConfig(min_text_chars=24)

    survivors, summary = apply_quality_gate_to_records(
        batch,
        split_name="train",
        config=gate_config,
    )

    assert len(survivors) == 2
    assert summary.kept_records == 2
    assert summary.duplicates_removed == 1

    expected_reason_counts = {
        "duplicate_training_text": 1,
        "invalid_conversation_pair": 1,
        "too_short": 1,
    }
    for reason, expected_count in expected_reason_counts.items():
        assert summary.reasons[reason] == expected_count
def test_apply_quality_gate_rejects_short_echo_responses_and_repeated_line_noise() -> None:
    """Check three response/context rejection reasons against a four-record batch.

    With ``min_text_chars=12`` the test expects exactly one record to be
    kept and one hit each for ``response_too_short``,
    ``prompt_echo_response`` and ``repeated_line_noise``.
    """
    # A long prompt answered with a two-character reply.
    terse_reply = {
        "user": "Izskaidro SSE kontrakta regresijas cēloni starp backend un frontend.",
        "assistant": "ok",
    }
    # The completion is the prompt text with one extra trailing word.
    echoing_completion = {
        "prompt": "Uzraksti kopsavilkumu par incidentu",
        "completion": "Uzraksti kopsavilkumu par incidentu lūdzu",
    }
    # The context is the same log line repeated four times, newline-separated.
    log_line = "Timeout retry budget exceeded in production stream worker."
    noisy_context = {
        "prompt": "Izanalizē incidenta trokšņaino logu.",
        "context": "\n".join([log_line] * 4),
    }
    # Presumably the single survivor: a distinct prompt with a substantive answer.
    solid_pair = {
        "user": "Izveido drošu rollout plānu SSE kontrakta labojumam.",
        "assistant": "Sākam ar backward-compatible delta/complete kontrakta salāgošanu un rollout drošības pārbaudēm.",
    }
    batch = [terse_reply, echoing_completion, noisy_context, solid_pair]
    gate_config = DatasetQualityGateConfig(min_text_chars=12)

    survivors, summary = apply_quality_gate_to_records(
        batch,
        split_name="train",
        config=gate_config,
    )

    assert len(survivors) == 1
    for reason in ("response_too_short", "prompt_echo_response", "repeated_line_noise"):
        assert summary.reasons[reason] == 1
|