"""Tests for the dataset quality gate helpers."""
| from __future__ import annotations | |
| from maris_core.data.quality import DatasetQualityGateConfig, apply_quality_gate_to_records | |
def test_apply_quality_gate_to_records_removes_duplicates_and_low_quality_examples() -> None:
    """Run the quality gate over five mixed records and verify the report.

    The input contains two identical greeting pairs, a pair whose answer
    repeats the question verbatim, a bare four-character ``text`` record and
    one long prompt that carries metadata.  With ``min_text_chars=24`` the
    test expects two records to survive, one duplicate to be removed, and
    exactly one rejection counted under each of the three reason keys
    asserted below.
    """
    greeting_pair = {"user": "Sveiki, Maris!", "assistant": "Sveiki! Kā varu palīdzēt šodien?"}
    echoed_pair = {"user": "Atkārto mani", "assistant": "Atkārto mani"}
    tiny_text = {"text": "test"}
    voice_prompt = {
        "prompt": "Izveido detalizētu plānu latviešu valodā klientu atbalsta balss asistentam.",
        "metadata": {"channel": "voice", "language": "lv"},
    }
    gate_config = DatasetQualityGateConfig(min_text_chars=24)

    # Two separate but equal dicts, so the duplicate is a distinct object
    # just as it would be when loaded from a dataset file.
    kept, summary = apply_quality_gate_to_records(
        [dict(greeting_pair), dict(greeting_pair), echoed_pair, tiny_text, voice_prompt],
        split_name="train",
        config=gate_config,
    )

    assert len(kept) == 2
    assert summary.kept_records == 2
    assert summary.duplicates_removed == 1
    for reason in ("duplicate_training_text", "invalid_conversation_pair", "too_short"):
        assert summary.reasons[reason] == 1
def test_apply_quality_gate_rejects_short_echo_responses_and_repeated_line_noise() -> None:
    """Feed the gate three low-quality records plus one good one.

    The input contains a long question answered with just ``"ok"``, a
    completion that restates its prompt with one extra word, a prompt whose
    context is the same log line four times, and one substantive
    question/answer pair.  With ``min_text_chars=12`` the test expects a
    single surviving record and one rejection under each of the three
    reason keys asserted below.
    """
    noisy_log_line = "Timeout retry budget exceeded in production stream worker."

    terse_answer = {
        "user": "Izskaidro SSE kontrakta regresijas cēloni starp backend un frontend.",
        "assistant": "ok",
    }
    parroted_prompt = {
        "prompt": "Uzraksti kopsavilkumu par incidentu",
        "completion": "Uzraksti kopsavilkumu par incidentu lūdzu",
    }
    repetitive_context = {
        "prompt": "Izanalizē incidenta trokšņaino logu.",
        # The same line four times, newline-separated.
        "context": "\n".join([noisy_log_line] * 4),
    }
    solid_pair = {
        "user": "Izveido drošu rollout plānu SSE kontrakta labojumam.",
        "assistant": "Sākam ar backward-compatible delta/complete kontrakta salāgošanu un rollout drošības pārbaudēm.",
    }
    gate_config = DatasetQualityGateConfig(min_text_chars=12)

    kept, summary = apply_quality_gate_to_records(
        [terse_answer, parroted_prompt, repetitive_context, solid_pair],
        split_name="train",
        config=gate_config,
    )

    assert len(kept) == 1
    for reason in ("response_too_short", "prompt_echo_response", "repeated_line_noise"):
        assert summary.reasons[reason] == 1