| {"timestamp": "2026-04-17T17:00:00Z", "type": "conversation", "session_id": "eval-conv-001", "user": "Kā īsi paskaidrot operatoram, ko nozīmē regression evalā?", "assistant": "Regression evalā nozīmē, ka jaunais modelis uz stabila benchmark uzrāda sliktāku rezultātu nekā iepriekšējais salīdzināmais run, tāpēc pirms publicēšanas jānoskaidro krituma cēlonis un apjoms.", "language": "lv", "source": "maris-eval-benchmark", "task_id": "conversation-regression-001", "benchmark_version": "maris-evals-v1", "suite": "regression", "difficulty": "medium", "evaluation_mode": "reference-review", "risk_level": "medium", "expected_behavior": ["Paskaidro regression jēdzienu vienā īsā operacionālā skaidrojumā.", "Nesajauc benchmark kritumu ar treniņa kļūdu vai infrastruktūras avāriju."], "scoring_hints": ["Meklē skaidru salīdzinājumu ar iepriekšējo baseline.", "Atbildei jābūt īsai un uzturamai operatoram."], "reference_answer": "Regression evalā nozīmē, ka jaunais modelis uz stabila benchmark uzrāda sliktāku rezultātu nekā iepriekšējais salīdzināmais run, tāpēc pirms publicēšanas jānoskaidro krituma cēlonis un apjoms.", "acceptance_criteria": ["Atbilde piemin stabilu benchmark vai baseline salīdzinājumu.", "Atbilde norāda, ka pirms publicēšanas jāizmeklē krituma cēlonis."], "branch": "master"} | |
| {"timestamp": "2026-04-17T17:02:00Z", "type": "conversation", "session_id": "eval-conv-002", "user": "How should Maris AI summarize a failed CI workflow for a maintainer?", "assistant": "Summarize the failing workflow, the specific broken job, the likely regression area, the user impact, and the next corrective step in one compact operational update.", "language": "en", "source": "maris-eval-benchmark", "task_id": "conversation-sanity-002", "benchmark_version": "maris-evals-v1", "suite": "sanity", "difficulty": "easy", "evaluation_mode": "reference-review", "risk_level": "low", "expected_behavior": ["Provide an actionable CI summary for a maintainer.", "Identify the broken job and next corrective step without unnecessary detail."], "scoring_hints": ["Look for failing workflow, broken job, likely regression area, user impact, next step.", "Prefer one concise operational paragraph."], "reference_answer": "Summarize the failing workflow, the specific broken job, the likely regression area, the user impact, and the next corrective step in one compact operational update.", "acceptance_criteria": ["Mentions the failing workflow or job.", "Includes a concrete next step for the maintainer."], "branch": "master"} | |