maris-ai-master / core-python /benchmarks /chat-release.json
MarisUK's picture
Maris AI model sync
f440f03 verified
{
"cases": [
{
"name": "long-context-release",
"message": "Balstoties uz iepriekšējo sarunu, nosauc nākamos divus prioritāros soļus.",
"history": [
{
"role": "user",
"content": "Mēs būvējam incident response roadmap lielai komandai."
},
{
"role": "assistant",
"content": "Tu gribi prioritizēt alerting, ownership un postmortem procesu."
}
],
"expected_terms": ["alerting", "ownership"],
"reference_facts": ["alerting", "ownership"],
"category": "long_context",
"level": "release",
"difficulty": "hard",
"tags": ["memory", "continuity"]
},
{
"name": "helpfulness-release",
"message": "Palīdzi man izlemt, ko darīt tālāk, ja benchmark score krīt zem gate sliekšņa.",
"expected_terms": ["benchmark", "sliekšņa"],
"reference_facts": ["benchmark", "slieksnis"],
"category": "helpfulness",
"level": "release",
"difficulty": "hard",
"tags": ["release-gate", "operations"]
}
]
}