| """Tests for the chat benchmark runner.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| import httpx |
| import pytest |
|
|
| from maris_core.code.execution_eval import CodeExecutionResult |
| from maris_core.text.benchmark import ( |
| ChatBenchmarkResult, |
| build_chat_benchmark_history_artifact, |
| build_chat_benchmark_manifest, |
| build_chat_benchmark_regression_report, |
| load_chat_benchmark_dataset, |
| run_chat_benchmark, |
| select_chat_benchmark_cases, |
| summarize_chat_benchmark, |
| ) |
| from maris_core.text.evals import ChatEvalResult |
|
|
|
|
def test_load_chat_benchmark_dataset_reads_json_cases(tmp_path: Path) -> None:
    """Check that a ``{"cases": [...]}`` document loads as one case with every declared field."""
    raw_case = {
        "name": "case-1",
        "message": "Sveiki",
        "expected_terms": ["Sveiki"],
        "tags": ["smoke"],
        "branches": ["master", "coder"],
        "level": "local",
        "difficulty": "easy",
        "category": "latvian_quality",
        "failure_bucket": "reviewer_bias",
        "risk_level": "medium",
        "production_like": True,
    }
    source = tmp_path / "dataset.json"
    source.write_text(json.dumps({"cases": [raw_case]}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(source)

    assert len(loaded) == 1
    case = loaded[0]
    assert case.name == "case-1"
    # The JSON arrays are compared against tuples on the loaded case.
    assert case.expected_terms == ("Sveiki",)
    assert case.tags == ("smoke",)
    assert case.branches == ("master", "coder")
    assert case.level == "local"
    assert case.difficulty == "easy"
    assert case.category == "latvian_quality"
    assert case.failure_bucket == "reviewer_bias"
    assert case.risk_level == "medium"
    assert case.production_like is True
|
|
|
|
def test_load_chat_benchmark_dataset_reads_history_for_multiturn_cases(tmp_path: Path) -> None:
    """Check that the ``history`` turns of a multi-turn case are exposed on the loaded case."""
    prior_turns = [
        {"role": "user", "content": "Mums flaky tests parādās tikai CI."},
        {
            "role": "assistant",
            "content": "Tad jāsalīdzina workflow vide ar lokālo izpildi.",
        },
    ]
    multi_turn_case = {
        "name": "multi-turn-case",
        "message": "Turpini iepriekšējo pavedienu.",
        "history": prior_turns,
        "expected_terms": ["CI", "workflow"],
    }
    source = tmp_path / "dataset.json"
    source.write_text(json.dumps({"cases": [multi_turn_case]}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(source)

    history = loaded[0].history
    assert len(history) == 2
    assert history[0]["role"] == "user"
    assert history[1]["content"] == "Tad jāsalīdzina workflow vide ar lokālo izpildi."
|
|
|
|
def test_load_chat_benchmark_dataset_reads_multimodal_vision_context(tmp_path: Path) -> None:
    """Check that a case's ``vision_context`` mapping is available after loading."""
    vision_context = {
        "summary": "Dashboard ar sarkanu incident alert un augstu latency grafiku.",
        "source": "upload",
        "model": "test-vision",
    }
    vision_case = {
        "name": "vision-case",
        "message": "Ko redzi incident screenshotā?",
        "vision_context": vision_context,
        "tags": ["multimodal", "vision"],
    }
    source = tmp_path / "dataset.json"
    source.write_text(json.dumps({"cases": [vision_case]}), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(source)

    assert loaded[0].vision_context is not None
    assert loaded[0].vision_context["summary"].startswith("Dashboard")
|
|
|
|
def test_select_chat_benchmark_cases_filters_by_level_and_branch(tmp_path: Path) -> None:
    """Check that selecting level ``ci`` on branch ``coder`` keeps only the matching case."""
    # (name, message, branch, level) — one case per branch/level combination under test.
    rows = (
        ("master-only", "Sveiki", "master", "ci"),
        ("coder-only", "Uzraksti kodu", "coder", "ci"),
        ("release-only", "Izveido release rollout plānu", "master", "release"),
    )
    document = {
        "cases": [
            {"name": name, "message": message, "branches": [branch], "level": level}
            for name, message, branch, level in rows
        ]
    }
    source = tmp_path / "dataset.json"
    source.write_text(json.dumps(document), encoding="utf-8")

    loaded = load_chat_benchmark_dataset(source)
    selected = select_chat_benchmark_cases(loaded, levels=["ci"], branch="coder")

    selected_names = [item.name for item in selected]
    assert selected_names == ["coder-only"]
|
|
|
|
@pytest.mark.asyncio
async def test_run_chat_benchmark_reports_execution_pass_rate(tmp_path: Path) -> None:
    """Check execution results and execution pass-rate summary fields for one code case."""
    exec_case = {
        "name": "exec-case",
        "message": "Uzraksti Python funkciju normalize_email(email: str) -> str.",
        "expects_code": True,
        "execution_language": "python",
        "execution_test_code": "assert normalize_email(' A@Example.COM ') == 'a@example.com'",
    }
    source = tmp_path / "dataset.json"
    # This dataset is a bare JSON list of cases, not a {"cases": [...]} object.
    source.write_text(json.dumps([exec_case]), encoding="utf-8")

    # Canned model reply: a fenced Python block defining the requested function.
    fenced_answer = "\n".join(
        [
            "```python",
            "def normalize_email(email: str) -> str:",
            " return email.strip().lower()",
            "```",
        ]
    )

    async def handler(request: httpx.Request) -> httpx.Response:
        del request  # the canned reply does not depend on the request
        reply = {
            "response": fenced_answer,
            "model": "MarisUK/test-model",
            "tokens_used": 12,
        }
        return httpx.Response(200, json=reply)

    loaded = load_chat_benchmark_dataset(source)
    results = await run_chat_benchmark(
        loaded,
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(handler),
    )
    summary = summarize_chat_benchmark(results)

    execution = results[0].execution
    assert execution is not None
    assert execution.passed is True
    assert summary["execution_cases"] == 1
    assert summary["score_manifest"]["execution"] == 1.0
    assert summary["execution_language_pass_rates"]["python"] == 1.0
    assert summary["execution_language_scores"]["python"] > 0.0
    # The case declares no category; the summary is expected to file it under "general".
    assert summary["category_execution_pass_rates"]["general"] == 1.0
|
|
|
|
@pytest.mark.asyncio
async def test_run_chat_benchmark_forwards_multimodal_vision_context(tmp_path: Path) -> None:
    """Check that the case's ``vision_context`` is present in the request sent to the endpoint.

    The mock handler only records request payloads; the payload assertions run
    in the test body after the benchmark finishes.
    """
    dataset_path = tmp_path / "dataset.json"
    dataset_path.write_text(
        json.dumps(
            [
                {
                    "name": "vision-case",
                    "message": "Ko redzi incident screenshotā?",
                    "vision_context": {
                        "summary": "Dashboard ar sarkanu incident alert un augstu latency grafiku.",
                        "source": "upload",
                    },
                    "expected_terms": ["incident", "latency"],
                    "tags": ["multimodal", "vision"],
                    "category": "multimodal",
                }
            ]
        ),
        encoding="utf-8",
    )

    seen_payloads: list[dict] = []

    async def handler(request: httpx.Request) -> httpx.Response:
        # Record instead of asserting here: an AssertionError raised inside the
        # transport callback could be handled by the runner as a request failure
        # (NOTE(review): runner error handling is not visible from this test),
        # which would hide the real cause behind a bare ``ok is False``.
        seen_payloads.append(json.loads(request.content.decode("utf-8")))
        return httpx.Response(
            200,
            json={
                "response": "Screenshot rāda incident alert un latency pieaugumu.",
                "model": "MarisUK/test-model",
                "tokens_used": 19,
            },
        )

    results = await run_chat_benchmark(
        load_chat_benchmark_dataset(dataset_path),
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(handler),
    )

    assert results[0].ok is True
    assert seen_payloads, "benchmark runner never sent a request to the mock endpoint"
    for payload in seen_payloads:
        assert payload["vision_context"]["summary"].startswith("Dashboard")
|
|
|
|
@pytest.mark.asyncio
async def test_run_chat_benchmark_enforces_grounding_requirements(tmp_path: Path) -> None:
    """Check that a reply whose tool trace meets the case's grounding minimums is marked grounded."""
    required_sources = ["backend-rust/src/api/chat.rs", "frontend/app/chat/page.tsx"]
    grounded_case = {
        "name": "grounded-case",
        "message": "Debug SSE mismatch starp backend-rust/src/api/chat.rs un frontend/app/chat/page.tsx.",
        "min_tool_steps": 2,
        "min_grounding_sources": 2,
        "expected_grounding_terms": list(required_sources),
    }
    source = tmp_path / "dataset.json"
    source.write_text(json.dumps([grounded_case]), encoding="utf-8")

    # Two tool steps and one grounding source per required file: exactly the declared minimums.
    tool_trace = {
        "mode": "multi_step",
        "steps": [{"name": tool} for tool in ("workspace_search", "workspace_read")],
        "grounding_sources": [{"label": path} for path in required_sources],
    }

    async def grounded_handler(request: httpx.Request) -> httpx.Response:
        del request  # the canned reply does not depend on the request
        reply = {
            "response": "Pamatojoties uz abiem failiem, jāpārbauda SSE complete event apstrāde.",
            "model": "MarisUK/test-model",
            "tokens_used": 22,
            "tool_trace": tool_trace,
        }
        return httpx.Response(200, json=reply)

    loaded = load_chat_benchmark_dataset(source)
    results = await run_chat_benchmark(
        loaded,
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(grounded_handler),
    )

    outcome = results[0]
    assert outcome.ok is True
    assert outcome.grounding_ok is True
|
|
|
|
@pytest.mark.asyncio
async def test_run_chat_benchmark_scores_successful_cases(tmp_path: Path) -> None:
    """Check scoring, summary and manifest output for one successful reasoning case.

    Also checks that a supplied human-eval summary is carried into the manifest
    and that its pairwise win rate appears in the manifest's score table.
    """
    dataset_path = tmp_path / "dataset.json"
    dataset_path.write_text(
        json.dumps(
            [
                {
                    "name": "case-1",
                    "message": "Izveido strukturētu plānu retry loģikai",
                    "expected_terms": ["retry", "plānu"],
                    "reference_facts": ["retry", "plānu"],
                    "category": "reasoning",
                    "level": "ci",
                }
            ]
        ),
        encoding="utf-8",
    )

    async def handler(request: httpx.Request) -> httpx.Response:
        # Discard the unused argument explicitly, as the other canned handlers
        # in this module do, so unused-argument linting stays quiet.
        del request
        return httpx.Response(
            200,
            json={
                "response": "Šeit ir skaidrs plāns ar retry robežām un nākamo soli.",
                "model": "MarisUK/test-model",
                "tokens_used": 123,
                "persona_title": "Systems Strategist",
            },
        )

    results = await run_chat_benchmark(
        load_chat_benchmark_dataset(dataset_path),
        url="http://test/v1/text/generate",
        transport=httpx.MockTransport(handler),
    )
    summary = summarize_chat_benchmark(results)

    assert results[0].ok is True
    assert results[0].eval is not None
    assert results[0].tokens_used == 123
    assert summary["successful_cases"] == 1
    assert summary["average_overall_score"] > 0.0
    assert summary["category_scores"]["reasoning"] > 0.0
    assert summary["judge_summary"]["overall"] > 0.0

    manifest = build_chat_benchmark_manifest(
        results,
        benchmark_name="chat-quality",
        branch="master",
        model="MarisUK/test-model",
        human_eval_summary={
            "artifact_type": "human-eval-summary",
            "pairwise_win_rate": 0.75,
            "average_confidence": 0.8,
        },
    )
    assert manifest["artifact_type"] == "chat-benchmark-manifest"
    assert manifest["score_manifest"]["reasoning"] > 0.0
    assert manifest["score_manifest"]["pairwise_win_rate"] == 0.75
    assert manifest["human_eval_summary"]["artifact_type"] == "human-eval-summary"
|
|
|
|
def test_build_chat_benchmark_history_and_regression_track_category_and_language() -> None:
    """Compare two runs of the same benchmark and check the regression and history outputs.

    The later run scores lower on the Python case and fails the TypeScript
    execution that passed in the earlier run.
    """

    def _result(
        name: str,
        *,
        ok: bool,
        latency_ms: int,
        response: str,
        tokens_used: int,
        scores: dict[str, float],
        category: str,
        failure_bucket: str,
        execution_language: str,
        execution: CodeExecutionResult | None = None,
    ) -> ChatBenchmarkResult:
        # Shared constants: every result is an HTTP 200 from the same model, and the
        # eval record repeats the result's own name and category.
        return ChatBenchmarkResult(
            name=name,
            ok=ok,
            latency_ms=latency_ms,
            status_code=200,
            response=response,
            model="MarisUK/test-model",
            tokens_used=tokens_used,
            eval=ChatEvalResult(name=name, category=category, **scores),
            execution=execution,
            category=category,
            failure_bucket=failure_bucket,
            execution_language=execution_language,
        )

    def _manifest(results: list[ChatBenchmarkResult], generated_at: str) -> dict:
        built = build_chat_benchmark_manifest(
            results,
            benchmark_name="coder-exec",
            branch="coder",
            model="MarisUK/test-model",
        )
        # Pin the timestamp so the two runs have fixed, distinct generation times.
        built["generated_at"] = generated_at
        return built

    previous_results = [
        _result(
            "python-pass",
            ok=True,
            latency_ms=100,
            response="```python\nprint('ok')\n```",
            tokens_used=10,
            scores={
                "helpfulness": 0.8,
                "reasoning": 0.8,
                "factuality": 0.8,
                "latvian_quality": 0.8,
                "coding": 0.9,
                "long_context": 0.7,
                "safety": 1.0,
            },
            category="coding",
            failure_bucket="production_regression",
            execution_language="python",
        ),
        _result(
            "ts-exec",
            ok=True,
            latency_ms=120,
            response="```ts\nexport const ok = true;\n```",
            tokens_used=12,
            scores={
                "helpfulness": 0.75,
                "reasoning": 0.76,
                "factuality": 0.77,
                "latvian_quality": 0.72,
                "coding": 0.88,
                "long_context": 0.7,
                "safety": 1.0,
            },
            category="grounding",
            failure_bucket="broken_contract",
            execution_language="typescript",
            execution=CodeExecutionResult(
                language="typescript",
                available=True,
                passed=True,
                summary="ok",
                exit_code=0,
            ),
        ),
    ]
    current_results = [
        _result(
            "python-pass",
            ok=True,
            latency_ms=105,
            response="```python\nprint('ok')\n```",
            tokens_used=11,
            scores={
                "helpfulness": 0.7,
                "reasoning": 0.7,
                "factuality": 0.72,
                "latvian_quality": 0.74,
                "coding": 0.78,
                "long_context": 0.68,
                "safety": 1.0,
            },
            category="coding",
            failure_bucket="production_regression",
            execution_language="python",
        ),
        _result(
            "ts-exec",
            ok=False,
            latency_ms=140,
            response="```ts\nthrow new Error('fail');\n```",
            tokens_used=13,
            scores={
                "helpfulness": 0.6,
                "reasoning": 0.62,
                "factuality": 0.63,
                "latvian_quality": 0.68,
                "coding": 0.7,
                "long_context": 0.64,
                "safety": 1.0,
            },
            category="grounding",
            failure_bucket="broken_contract",
            execution_language="typescript",
            execution=CodeExecutionResult(
                language="typescript",
                available=True,
                passed=False,
                summary="failed",
                exit_code=1,
            ),
        ),
    ]

    previous_manifest = _manifest(previous_results, "2026-04-16T00:00:00Z")
    current_manifest = _manifest(current_results, "2026-04-16T00:01:00Z")

    regression = build_chat_benchmark_regression_report(
        current_manifest,
        previous_run=previous_manifest,
    )
    history = build_chat_benchmark_history_artifact(
        current_manifest,
        previous_history={"runs": [previous_manifest]},
    )

    assert regression["has_baseline"] is True
    assert regression["has_regressions"] is True
    # TypeScript execution went from passing to failing between the two runs.
    assert regression["execution_language_pass_rate_deltas"]["typescript"]["delta"] == -1.0
    assert regression["category_score_deltas"]["coding"]["delta"] < 0.0
    assert regression["failure_bucket_score_deltas"]["broken_contract"]["delta"] < 0.0
    assert history["run_count"] == 2
    assert history["latest_regression_summary"]["has_regressions"] is True
|
|
|
|
def test_summarize_chat_benchmark_tracks_memory_metrics_and_trends() -> None:
    """Check memory metrics in the summary, manifest and history for two memory-tagged cases.

    One case passes (multi-turn continuity) and one fails (cross-session recall).
    """
    continuity_eval = ChatEvalResult(
        name="memory-followup",
        helpfulness=0.82,
        reasoning=0.81,
        factuality=0.8,
        latvian_quality=0.84,
        coding=0.2,
        long_context=0.86,
        safety=1.0,
        category="multi_turn_continuity",
    )
    recall_eval = ChatEvalResult(
        name="memory-cross-session",
        helpfulness=0.45,
        reasoning=0.48,
        factuality=0.44,
        latvian_quality=0.52,
        coding=0.1,
        long_context=0.35,
        safety=0.95,
        category="cross_session_recall",
    )
    passing_case = ChatBenchmarkResult(
        name="memory-followup",
        ok=True,
        latency_ms=80,
        status_code=200,
        response="Atceries workflow timeoutus un cache invalidāciju.",
        model="MarisUK/test-model",
        tokens_used=18,
        eval=continuity_eval,
        execution=None,
        category="multi_turn_continuity",
        tags=("memory", "continuity"),
        production_like=True,
    )
    failing_case = ChatBenchmarkResult(
        name="memory-cross-session",
        ok=False,
        latency_ms=110,
        status_code=200,
        response="Neatceros iepriekšējo preference.",
        model="MarisUK/test-model",
        tokens_used=20,
        eval=recall_eval,
        execution=None,
        category="cross_session_recall",
        tags=("memory", "cross-session"),
    )
    results = [passing_case, failing_case]

    summary = summarize_chat_benchmark(results)
    manifest = build_chat_benchmark_manifest(
        results,
        benchmark_name="memory-quality",
        branch="master",
        model="MarisUK/test-model",
    )
    history = build_chat_benchmark_history_artifact(manifest)

    assert summary["memory_retrieval_cases"] == 2
    assert summary["memory_retrieval_pass_rate"] == 0.5
    assert summary["memory_quality_scores"]["multi_turn_continuity"] > 0.7
    assert summary["memory_quality_pass_rates"]["cross_session_recall"] == 0.0
    assert summary["tag_pass_rates"]["memory"] == 0.5

    score_manifest = manifest["score_manifest"]
    assert score_manifest["memory_multi_turn_continuity"] > 0.7
    assert score_manifest["memory_retrieval_pass_rate"] == 0.5
    assert manifest["human_eval_cadence"] == "weekly + pre-release"

    trends = history["trend_summary"]
    assert trends["memory_retrieval_pass_rate"]["latest"] == 0.5
    # NOTE(review): 80.0 equals the passing case's latency alone, so failed cases
    # are presumably excluded from the average — confirm against the summarizer.
    assert trends["average_latency_ms"]["latest"] == 80.0
|
|
|
|
def test_summarize_chat_benchmark_tracks_tool_multimodal_latency_and_hallucination_metrics() -> (
    None
):
    """Check tool-use, multimodal, hallucination and p95 latency metrics for two cases.

    One case is a passing grounded tool-use result; the other is a failing
    multimodal result.
    """
    tooling_eval = ChatEvalResult(
        name="tooling-case",
        helpfulness=0.82,
        reasoning=0.8,
        factuality=0.86,
        latvian_quality=0.84,
        coding=0.4,
        long_context=0.75,
        safety=1.0,
        category="grounding",
    )
    multimodal_eval = ChatEvalResult(
        name="multimodal-case",
        helpfulness=0.42,
        reasoning=0.38,
        factuality=0.32,
        latvian_quality=0.7,
        coding=0.1,
        long_context=0.4,
        safety=0.92,
        category="multimodal",
    )
    tooling_case = ChatBenchmarkResult(
        name="tooling-case",
        ok=True,
        latency_ms=90,
        status_code=200,
        response="Balstoties uz README.md, /ready ir readiness checks ceļš.",
        model="MarisUK/test-model",
        tokens_used=24,
        eval=tooling_eval,
        execution=None,
        grounding_required=True,
        grounding_ok=True,
        category="grounding",
        tags=("tool_use", "repo"),
    )
    multimodal_case = ChatBenchmarkResult(
        name="multimodal-case",
        ok=False,
        latency_ms=180,
        status_code=200,
        response="Droši vien tur nav nekāda incidenta.",
        model="MarisUK/test-model",
        tokens_used=21,
        eval=multimodal_eval,
        execution=None,
        category="multimodal",
        tags=("multimodal", "vision"),
    )
    results = [tooling_case, multimodal_case]

    summary = summarize_chat_benchmark(results)
    manifest = build_chat_benchmark_manifest(
        results,
        benchmark_name="assistant-quality",
        branch="master",
        model="MarisUK/test-model",
    )
    history = build_chat_benchmark_history_artifact(manifest)

    assert summary["tool_use_cases"] == 1
    assert summary["tool_use_pass_rate"] == 1.0
    assert summary["multimodal_cases"] == 1
    assert summary["multimodal_pass_rate"] == 0.0
    assert summary["hallucination_incidents"] == 1
    assert summary["hallucination_rate"] == 0.5
    # The expected p95 equals the slower of the two latencies (180 ms).
    assert summary["latency_p95_ms"] == 180.0
    assert summary["quality_dimensions"]["tool_use"]["pass_rate"] == 1.0
    assert summary["quality_dimensions"]["multimodality"]["cases"] == 1

    score_manifest = manifest["score_manifest"]
    assert score_manifest["tool_use_pass_rate"] == 1.0
    assert score_manifest["multimodal_pass_rate"] == 0.0
    assert score_manifest["hallucination_rate"] == 0.5

    trends = history["trend_summary"]
    assert trends["tool_use_pass_rate"]["latest"] == 1.0
    assert trends["multimodal_pass_rate"]["latest"] == 0.0
    assert trends["hallucination_rate"]["latest"] == 0.5
|
|