# maris-ai-master/core-python/tests/test_text_benchmark.py
# Maris AI model sync (MarisUK, commit f440f03, verified)
"""Tests for the chat benchmark runner."""
from __future__ import annotations
import json
from pathlib import Path
import httpx
import pytest
from maris_core.code.execution_eval import CodeExecutionResult
from maris_core.text.benchmark import (
ChatBenchmarkResult,
build_chat_benchmark_history_artifact,
build_chat_benchmark_manifest,
build_chat_benchmark_regression_report,
load_chat_benchmark_dataset,
run_chat_benchmark,
select_chat_benchmark_cases,
summarize_chat_benchmark,
)
from maris_core.text.evals import ChatEvalResult
def test_load_chat_benchmark_dataset_reads_json_cases(tmp_path: Path) -> None:
    """Load a ``{"cases": [...]}`` JSON dataset and verify each field of its one case.

    The list-valued fields (``expected_terms``, ``tags``, ``branches``) are
    compared against tuples on the loaded case.
    """
    raw_case = {
        "name": "case-1",
        "message": "Sveiki",
        "expected_terms": ["Sveiki"],
        "tags": ["smoke"],
        "branches": ["master", "coder"],
        "level": "local",
        "difficulty": "easy",
        "category": "latvian_quality",
        "failure_bucket": "reviewer_bias",
        "risk_level": "medium",
        "production_like": True,
    }
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps({"cases": [raw_case]}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(dataset_file)

    assert len(loaded) == 1
    first = loaded[0]
    assert first.name == "case-1"
    assert first.expected_terms == ("Sveiki",)
    assert first.tags == ("smoke",)
    assert first.branches == ("master", "coder")
    assert first.level == "local"
    assert first.difficulty == "easy"
    assert first.category == "latvian_quality"
    assert first.failure_bucket == "reviewer_bias"
    assert first.risk_level == "medium"
    assert first.production_like is True
def test_load_chat_benchmark_dataset_reads_history_for_multiturn_cases(tmp_path: Path) -> None:
    """Check that a case's ``history`` turns are loaded in order with role and content."""
    assistant_reply = "Tad jāsalīdzina workflow vide ar lokālo izpildi."
    raw_case = {
        "name": "multi-turn-case",
        "message": "Turpini iepriekšējo pavedienu.",
        "history": [
            {"role": "user", "content": "Mums flaky tests parādās tikai CI."},
            {"role": "assistant", "content": assistant_reply},
        ],
        "expected_terms": ["CI", "workflow"],
    }
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps({"cases": [raw_case]}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(dataset_file)

    turns = loaded[0].history
    assert len(turns) == 2
    assert turns[0]["role"] == "user"
    assert turns[1]["content"] == assistant_reply
def test_load_chat_benchmark_dataset_reads_multimodal_vision_context(tmp_path: Path) -> None:
    """Check that a case's ``vision_context`` mapping is present after loading."""
    vision_payload = {
        "summary": "Dashboard ar sarkanu incident alert un augstu latency grafiku.",
        "source": "upload",
        "model": "test-vision",
    }
    raw_case = {
        "name": "vision-case",
        "message": "Ko redzi incident screenshotā?",
        "vision_context": vision_payload,
        "tags": ["multimodal", "vision"],
    }
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps({"cases": [raw_case]}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(dataset_file)

    vision = loaded[0].vision_context
    assert vision is not None
    assert vision["summary"].startswith("Dashboard")
def test_select_chat_benchmark_cases_filters_by_level_and_branch(tmp_path: Path) -> None:
    """Select with ``levels=["ci"]`` and ``branch="coder"`` and expect only the matching case.

    The dataset holds one ci/master case, one ci/coder case and one
    release/master case; only ``coder-only`` satisfies both filters.
    """
    raw_cases = [
        {
            "name": "master-only",
            "message": "Sveiki",
            "branches": ["master"],
            "level": "ci",
        },
        {
            "name": "coder-only",
            "message": "Uzraksti kodu",
            "branches": ["coder"],
            "level": "ci",
        },
        {
            "name": "release-only",
            "message": "Izveido release rollout plānu",
            "branches": ["master"],
            "level": "release",
        },
    ]
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps({"cases": raw_cases}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(dataset_file)
    picked = select_chat_benchmark_cases(loaded, levels=["ci"], branch="coder")

    picked_names = [case.name for case in picked]
    assert picked_names == ["coder-only"]
@pytest.mark.asyncio
async def test_run_chat_benchmark_reports_execution_pass_rate(tmp_path: Path) -> None:
    """Run one code-execution case against a mocked endpoint and check execution metrics.

    The dataset is a bare JSON list (no ``"cases"`` wrapper). The mocked
    response contains a fenced Python implementation of ``normalize_email``
    that satisfies the case's ``execution_test_code``.
    """
    exec_case = {
        "name": "exec-case",
        "message": "Uzraksti Python funkciju normalize_email(email: str) -> str.",
        "expects_code": True,
        "execution_language": "python",
        "execution_test_code": "assert normalize_email(' A@Example.COM ') == 'a@example.com'",
    }
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps([exec_case]), encoding="utf-8")

    generated_answer = (
        "```python\n"
        "def normalize_email(email: str) -> str:\n"
        " return email.strip().lower()\n"
        "```"
    )

    async def handler(request: httpx.Request) -> httpx.Response:
        # The request content is irrelevant here; every call gets the same answer.
        del request
        body = {
            "response": generated_answer,
            "model": "MarisUK/test-model",
            "tokens_used": 12,
        }
        return httpx.Response(200, json=body)

    benchmark_cases = load_chat_benchmark_dataset(dataset_file)
    results = await run_chat_benchmark(
        benchmark_cases,
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(handler),
    )
    summary = summarize_chat_benchmark(results)

    execution = results[0].execution
    assert execution is not None
    assert execution.passed is True
    assert summary["execution_cases"] == 1
    assert summary["score_manifest"]["execution"] == 1.0
    assert summary["execution_language_pass_rates"]["python"] == 1.0
    assert summary["execution_language_scores"]["python"] > 0.0
    assert summary["category_execution_pass_rates"]["general"] == 1.0
@pytest.mark.asyncio
async def test_run_chat_benchmark_forwards_multimodal_vision_context(tmp_path: Path) -> None:
    """Verify the runner sends the case's ``vision_context`` in the request body.

    The mocked endpoint records each JSON payload it receives. The payload
    checks run in the test body after the benchmark finishes, so a failure is
    reported directly by pytest instead of travelling through the runner.
    """
    vision_case = {
        "name": "vision-case",
        "message": "Ko redzi incident screenshotā?",
        "vision_context": {
            "summary": "Dashboard ar sarkanu incident alert un augstu latency grafiku.",
            "source": "upload",
        },
        "expected_terms": ["incident", "latency"],
        "tags": ["multimodal", "vision"],
        "category": "multimodal",
    }
    dataset_path = tmp_path / "dataset.json"
    dataset_path.write_text(json.dumps([vision_case]), encoding="utf-8")

    received_payloads: list[dict] = []

    async def handler(request: httpx.Request) -> httpx.Response:
        # Record only. An assertion raised here would propagate through
        # run_chat_benchmark, which may handle exceptions itself (not visible
        # from this test) and hide the real cause of the failure.
        received_payloads.append(json.loads(request.content.decode("utf-8")))
        return httpx.Response(
            200,
            json={
                "response": "Screenshot rāda incident alert un latency pieaugumu.",
                "model": "MarisUK/test-model",
                "tokens_used": 19,
            },
        )

    results = await run_chat_benchmark(
        load_chat_benchmark_dataset(dataset_path),
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(handler),
    )

    # Guard against a vacuous pass: the endpoint must actually have been hit.
    assert received_payloads, "mock endpoint was never called"
    for payload in received_payloads:
        assert payload["vision_context"]["summary"].startswith("Dashboard")
    assert results[0].ok is True
@pytest.mark.asyncio
async def test_run_chat_benchmark_enforces_grounding_requirements(tmp_path: Path) -> None:
    """Run a case with grounding requirements against a response that meets them.

    The case asks for at least two tool steps and two grounding sources; the
    mocked ``tool_trace`` supplies exactly two of each, with source labels
    equal to the expected grounding terms.
    """
    backend_file = "backend-rust/src/api/chat.rs"
    frontend_file = "frontend/app/chat/page.tsx"
    grounded_case = {
        "name": "grounded-case",
        "message": "Debug SSE mismatch starp backend-rust/src/api/chat.rs un frontend/app/chat/page.tsx.",
        "min_tool_steps": 2,
        "min_grounding_sources": 2,
        "expected_grounding_terms": [backend_file, frontend_file],
    }
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps([grounded_case]), encoding="utf-8")

    tool_trace = {
        "mode": "multi_step",
        "steps": [{"name": "workspace_search"}, {"name": "workspace_read"}],
        "grounding_sources": [{"label": backend_file}, {"label": frontend_file}],
    }

    async def grounded_handler(request: httpx.Request) -> httpx.Response:
        # Request content is not inspected; the reply is fixed.
        del request
        body = {
            "response": "Pamatojoties uz abiem failiem, jāpārbauda SSE complete event apstrāde.",
            "model": "MarisUK/test-model",
            "tokens_used": 22,
            "tool_trace": tool_trace,
        }
        return httpx.Response(200, json=body)

    benchmark_cases = load_chat_benchmark_dataset(dataset_file)
    results = await run_chat_benchmark(
        benchmark_cases,
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(grounded_handler),
    )

    outcome = results[0]
    assert outcome.ok is True
    assert outcome.grounding_ok is True
@pytest.mark.asyncio
async def test_run_chat_benchmark_scores_successful_cases(tmp_path: Path) -> None:
    """Run one reasoning case end to end and check the result, summary and manifest.

    Covers three layers: the per-case result (``ok``, ``eval``, token count),
    the aggregated summary scores, and the manifest built with an attached
    human-eval summary.
    """
    reasoning_case = {
        "name": "case-1",
        "message": "Izveido strukturētu plānu retry loģikai",
        "expected_terms": ["retry", "plānu"],
        "reference_facts": ["retry", "plānu"],
        "category": "reasoning",
        "level": "ci",
    }
    dataset_file = tmp_path / "dataset.json"
    dataset_file.write_text(json.dumps([reasoning_case]), encoding="utf-8")

    async def handler(request: httpx.Request) -> httpx.Response:
        # Request content is not inspected; the reply is fixed.
        del request
        body = {
            "response": "Šeit ir skaidrs plāns ar retry robežām un nākamo soli.",
            "model": "MarisUK/test-model",
            "tokens_used": 123,
            "persona_title": "Systems Strategist",
        }
        return httpx.Response(200, json=body)

    mock_transport = httpx.MockTransport(handler)
    benchmark_cases = load_chat_benchmark_dataset(dataset_file)
    results = await run_chat_benchmark(
        benchmark_cases,
        url="http://test/v1/text/generate",
        transport=mock_transport,
    )
    summary = summarize_chat_benchmark(results)

    # Per-case result.
    outcome = results[0]
    assert outcome.ok is True
    assert outcome.eval is not None
    assert outcome.tokens_used == 123

    # Aggregated summary.
    assert summary["successful_cases"] == 1
    assert summary["average_overall_score"] > 0.0
    assert summary["category_scores"]["reasoning"] > 0.0
    assert summary["judge_summary"]["overall"] > 0.0

    # Manifest, including the human-eval summary passed in.
    human_eval = {
        "artifact_type": "human-eval-summary",
        "pairwise_win_rate": 0.75,
        "average_confidence": 0.8,
    }
    manifest = build_chat_benchmark_manifest(
        results,
        benchmark_name="chat-quality",
        branch="master",
        model="MarisUK/test-model",
        human_eval_summary=human_eval,
    )
    score_manifest = manifest["score_manifest"]
    assert manifest["artifact_type"] == "chat-benchmark-manifest"
    assert score_manifest["reasoning"] > 0.0
    assert score_manifest["pairwise_win_rate"] == 0.75
    assert manifest["human_eval_summary"]["artifact_type"] == "human-eval-summary"
def test_build_chat_benchmark_history_and_regression_track_category_and_language() -> None:
    """Compare two runs of the same benchmark and check the regression and history output.

    Between the previous and current run, ``python-pass`` gets lower eval
    scores and ``ts-exec`` goes from a passing to a failing TypeScript
    execution. The regression report must flag this per execution language,
    per category and per failure bucket.
    """
    python_response = "```python\nprint('ok')\n```"

    # --- previous run ---------------------------------------------------
    previous_python_eval = ChatEvalResult(
        name="python-pass",
        helpfulness=0.8,
        reasoning=0.8,
        factuality=0.8,
        latvian_quality=0.8,
        coding=0.9,
        long_context=0.7,
        safety=1.0,
        category="coding",
    )
    previous_python = ChatBenchmarkResult(
        name="python-pass",
        ok=True,
        latency_ms=100,
        status_code=200,
        response=python_response,
        model="MarisUK/test-model",
        tokens_used=10,
        eval=previous_python_eval,
        execution=None,
        category="coding",
        failure_bucket="production_regression",
        execution_language="python",
    )
    previous_ts_eval = ChatEvalResult(
        name="ts-exec",
        helpfulness=0.75,
        reasoning=0.76,
        factuality=0.77,
        latvian_quality=0.72,
        coding=0.88,
        long_context=0.7,
        safety=1.0,
        category="grounding",
    )
    previous_ts_execution = CodeExecutionResult(
        language="typescript",
        available=True,
        passed=True,
        summary="ok",
        exit_code=0,
    )
    previous_ts = ChatBenchmarkResult(
        name="ts-exec",
        ok=True,
        latency_ms=120,
        status_code=200,
        response="```ts\nexport const ok = true;\n```",
        model="MarisUK/test-model",
        tokens_used=12,
        eval=previous_ts_eval,
        execution=previous_ts_execution,
        category="grounding",
        failure_bucket="broken_contract",
        execution_language="typescript",
    )
    previous_results = [previous_python, previous_ts]

    # --- current run ----------------------------------------------------
    current_python_eval = ChatEvalResult(
        name="python-pass",
        helpfulness=0.7,
        reasoning=0.7,
        factuality=0.72,
        latvian_quality=0.74,
        coding=0.78,
        long_context=0.68,
        safety=1.0,
        category="coding",
    )
    current_python = ChatBenchmarkResult(
        name="python-pass",
        ok=True,
        latency_ms=105,
        status_code=200,
        response=python_response,
        model="MarisUK/test-model",
        tokens_used=11,
        eval=current_python_eval,
        execution=None,
        category="coding",
        failure_bucket="production_regression",
        execution_language="python",
    )
    current_ts_eval = ChatEvalResult(
        name="ts-exec",
        helpfulness=0.6,
        reasoning=0.62,
        factuality=0.63,
        latvian_quality=0.68,
        coding=0.7,
        long_context=0.64,
        safety=1.0,
        category="grounding",
    )
    current_ts_execution = CodeExecutionResult(
        language="typescript",
        available=True,
        passed=False,
        summary="failed",
        exit_code=1,
    )
    current_ts = ChatBenchmarkResult(
        name="ts-exec",
        ok=False,
        latency_ms=140,
        status_code=200,
        response="```ts\nthrow new Error('fail');\n```",
        model="MarisUK/test-model",
        tokens_used=13,
        eval=current_ts_eval,
        execution=current_ts_execution,
        category="grounding",
        failure_bucket="broken_contract",
        execution_language="typescript",
    )
    current_results = [current_python, current_ts]

    def manifest_for(run_results: list, generated_at: str) -> dict:
        """Build the ``coder-exec`` manifest and pin its ``generated_at`` value."""
        built = build_chat_benchmark_manifest(
            run_results,
            benchmark_name="coder-exec",
            branch="coder",
            model="MarisUK/test-model",
        )
        built["generated_at"] = generated_at
        return built

    # Fixed timestamps one minute apart, previous run first.
    previous_manifest = manifest_for(previous_results, "2026-04-16T00:00:00Z")
    current_manifest = manifest_for(current_results, "2026-04-16T00:01:00Z")

    regression = build_chat_benchmark_regression_report(
        current_manifest,
        previous_run=previous_manifest,
    )
    history = build_chat_benchmark_history_artifact(
        current_manifest,
        previous_history={"runs": [previous_manifest]},
    )

    assert regression["has_baseline"] is True
    assert regression["has_regressions"] is True
    typescript_delta = regression["execution_language_pass_rate_deltas"]["typescript"]
    assert typescript_delta["delta"] == -1.0
    assert regression["category_score_deltas"]["coding"]["delta"] < 0.0
    assert regression["failure_bucket_score_deltas"]["broken_contract"]["delta"] < 0.0
    assert history["run_count"] == 2
    assert history["latest_regression_summary"]["has_regressions"] is True
def test_summarize_chat_benchmark_tracks_memory_metrics_and_trends() -> None:
    """Summarize two memory-tagged results and check memory metrics at every layer.

    One result passes (``multi_turn_continuity``) and one fails
    (``cross_session_recall``), so the memory pass rates are expected to be
    0.5 overall. The same figures are checked in the summary, the manifest
    and the history trend summary.
    """
    continuity_eval = ChatEvalResult(
        name="memory-followup",
        helpfulness=0.82,
        reasoning=0.81,
        factuality=0.8,
        latvian_quality=0.84,
        coding=0.2,
        long_context=0.86,
        safety=1.0,
        category="multi_turn_continuity",
    )
    continuity_result = ChatBenchmarkResult(
        name="memory-followup",
        ok=True,
        latency_ms=80,
        status_code=200,
        response="Atceries workflow timeoutus un cache invalidāciju.",
        model="MarisUK/test-model",
        tokens_used=18,
        eval=continuity_eval,
        execution=None,
        category="multi_turn_continuity",
        tags=("memory", "continuity"),
        production_like=True,
    )
    recall_eval = ChatEvalResult(
        name="memory-cross-session",
        helpfulness=0.45,
        reasoning=0.48,
        factuality=0.44,
        latvian_quality=0.52,
        coding=0.1,
        long_context=0.35,
        safety=0.95,
        category="cross_session_recall",
    )
    recall_result = ChatBenchmarkResult(
        name="memory-cross-session",
        ok=False,
        latency_ms=110,
        status_code=200,
        response="Neatceros iepriekšējo preference.",
        model="MarisUK/test-model",
        tokens_used=20,
        eval=recall_eval,
        execution=None,
        category="cross_session_recall",
        tags=("memory", "cross-session"),
    )
    results = [continuity_result, recall_result]

    summary = summarize_chat_benchmark(results)
    manifest = build_chat_benchmark_manifest(
        results,
        benchmark_name="memory-quality",
        branch="master",
        model="MarisUK/test-model",
    )
    history = build_chat_benchmark_history_artifact(manifest)

    # Summary-level memory metrics.
    assert summary["memory_retrieval_cases"] == 2
    assert summary["memory_retrieval_pass_rate"] == 0.5
    assert summary["memory_quality_scores"]["multi_turn_continuity"] > 0.7
    assert summary["memory_quality_pass_rates"]["cross_session_recall"] == 0.0
    assert summary["tag_pass_rates"]["memory"] == 0.5

    # Manifest-level metrics.
    score_manifest = manifest["score_manifest"]
    assert score_manifest["memory_multi_turn_continuity"] > 0.7
    assert score_manifest["memory_retrieval_pass_rate"] == 0.5
    assert manifest["human_eval_cadence"] == "weekly + pre-release"

    # History trends.
    trends = history["trend_summary"]
    assert trends["memory_retrieval_pass_rate"]["latest"] == 0.5
    # NOTE(review): 80.0 equals the latency of the only ok=True result (inputs
    # are 80 and 110); presumably failed cases are excluded from the average.
    # Confirm against summarize_chat_benchmark.
    assert trends["average_latency_ms"]["latest"] == 80.0
def test_summarize_chat_benchmark_tracks_tool_multimodal_latency_and_hallucination_metrics() -> (
    None
):
    """Summarize one passing tool-use result and one failing multimodal result.

    Checks the tool-use, multimodal, hallucination and p95-latency figures in
    the summary, then checks that the three rate metrics carry the same
    values into the manifest and the history trend summary.
    """
    tooling_eval = ChatEvalResult(
        name="tooling-case",
        helpfulness=0.82,
        reasoning=0.8,
        factuality=0.86,
        latvian_quality=0.84,
        coding=0.4,
        long_context=0.75,
        safety=1.0,
        category="grounding",
    )
    tooling_result = ChatBenchmarkResult(
        name="tooling-case",
        ok=True,
        latency_ms=90,
        status_code=200,
        response="Balstoties uz README.md, /ready ir readiness checks ceļš.",
        model="MarisUK/test-model",
        tokens_used=24,
        eval=tooling_eval,
        execution=None,
        grounding_required=True,
        grounding_ok=True,
        category="grounding",
        tags=("tool_use", "repo"),
    )
    multimodal_eval = ChatEvalResult(
        name="multimodal-case",
        helpfulness=0.42,
        reasoning=0.38,
        factuality=0.32,
        latvian_quality=0.7,
        coding=0.1,
        long_context=0.4,
        safety=0.92,
        category="multimodal",
    )
    multimodal_result = ChatBenchmarkResult(
        name="multimodal-case",
        ok=False,
        latency_ms=180,
        status_code=200,
        response="Droši vien tur nav nekāda incidenta.",
        model="MarisUK/test-model",
        tokens_used=21,
        eval=multimodal_eval,
        execution=None,
        category="multimodal",
        tags=("multimodal", "vision"),
    )
    results = [tooling_result, multimodal_result]

    summary = summarize_chat_benchmark(results)
    manifest = build_chat_benchmark_manifest(
        results,
        benchmark_name="assistant-quality",
        branch="master",
        model="MarisUK/test-model",
    )
    history = build_chat_benchmark_history_artifact(manifest)

    # Summary-level figures.
    assert summary["tool_use_cases"] == 1
    assert summary["tool_use_pass_rate"] == 1.0
    assert summary["multimodal_cases"] == 1
    assert summary["multimodal_pass_rate"] == 0.0
    assert summary["hallucination_incidents"] == 1
    assert summary["hallucination_rate"] == 0.5
    assert summary["latency_p95_ms"] == 180.0
    quality_dimensions = summary["quality_dimensions"]
    assert quality_dimensions["tool_use"]["pass_rate"] == 1.0
    assert quality_dimensions["multimodality"]["cases"] == 1

    # The same rate metrics must appear in the manifest and in the trends.
    expected_rates = {
        "tool_use_pass_rate": 1.0,
        "multimodal_pass_rate": 0.0,
        "hallucination_rate": 0.5,
    }
    score_manifest = manifest["score_manifest"]
    for metric, expected in expected_rates.items():
        assert score_manifest[metric] == expected
    trends = history["trend_summary"]
    for metric, expected in expected_rates.items():
        assert trends[metric]["latest"] == expected