"""Tests for the chat benchmark runner.""" from __future__ import annotations import json from pathlib import Path import httpx import pytest from maris_core.code.execution_eval import CodeExecutionResult from maris_core.text.benchmark import ( ChatBenchmarkResult, build_chat_benchmark_history_artifact, build_chat_benchmark_manifest, build_chat_benchmark_regression_report, load_chat_benchmark_dataset, run_chat_benchmark, select_chat_benchmark_cases, summarize_chat_benchmark, ) from maris_core.text.evals import ChatEvalResult def test_load_chat_benchmark_dataset_reads_json_cases(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( { "cases": [ { "name": "case-1", "message": "Sveiki", "expected_terms": ["Sveiki"], "tags": ["smoke"], "branches": ["master", "coder"], "level": "local", "difficulty": "easy", "category": "latvian_quality", "failure_bucket": "reviewer_bias", "risk_level": "medium", "production_like": True, } ] } ), encoding="utf-8", ) cases = load_chat_benchmark_dataset(dataset_path) assert len(cases) == 1 assert cases[0].name == "case-1" assert cases[0].expected_terms == ("Sveiki",) assert cases[0].tags == ("smoke",) assert cases[0].branches == ("master", "coder") assert cases[0].level == "local" assert cases[0].difficulty == "easy" assert cases[0].category == "latvian_quality" assert cases[0].failure_bucket == "reviewer_bias" assert cases[0].risk_level == "medium" assert cases[0].production_like is True def test_load_chat_benchmark_dataset_reads_history_for_multiturn_cases(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( { "cases": [ { "name": "multi-turn-case", "message": "Turpini iepriekšējo pavedienu.", "history": [ {"role": "user", "content": "Mums flaky tests parādās tikai CI."}, { "role": "assistant", "content": "Tad jāsalīdzina workflow vide ar lokālo izpildi.", }, ], "expected_terms": ["CI", "workflow"], } ] } ), encoding="utf-8", ) cases = load_chat_benchmark_dataset(dataset_path) assert len(cases[0].history) == 2 assert cases[0].history[0]["role"] == "user" assert cases[0].history[1]["content"] == "Tad jāsalīdzina workflow vide ar lokālo izpildi." def test_load_chat_benchmark_dataset_reads_multimodal_vision_context(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( { "cases": [ { "name": "vision-case", "message": "Ko redzi incident screenshotā?", "vision_context": { "summary": "Dashboard ar sarkanu incident alert un augstu latency grafiku.", "source": "upload", "model": "test-vision", }, "tags": ["multimodal", "vision"], } ] } ), encoding="utf-8", ) cases = load_chat_benchmark_dataset(dataset_path) assert cases[0].vision_context is not None assert cases[0].vision_context["summary"].startswith("Dashboard") def test_select_chat_benchmark_cases_filters_by_level_and_branch(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( { "cases": [ { "name": "master-only", "message": "Sveiki", "branches": ["master"], "level": "ci", }, { "name": "coder-only", "message": "Uzraksti kodu", "branches": ["coder"], "level": "ci", }, { "name": "release-only", "message": "Izveido release rollout plānu", "branches": ["master"], "level": "release", }, ] } ), encoding="utf-8", ) cases = load_chat_benchmark_dataset(dataset_path) selected = select_chat_benchmark_cases(cases, levels=["ci"], branch="coder") assert [case.name for case in selected] == ["coder-only"] @pytest.mark.asyncio async def test_run_chat_benchmark_reports_execution_pass_rate(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( [ { "name": "exec-case", "message": "Uzraksti Python funkciju normalize_email(email: str) -> str.", "expects_code": True, "execution_language": "python", "execution_test_code": "assert normalize_email(' A@Example.COM ') == 'a@example.com'", } ] ), encoding="utf-8", ) async def handler(request: httpx.Request) -> httpx.Response: del request return httpx.Response( 200, json={ "response": ( "```python\n" "def normalize_email(email: str) -> str:\n" " return email.strip().lower()\n" "```" ), "model": "MarisUK/test-model", "tokens_used": 12, }, ) results = await run_chat_benchmark( load_chat_benchmark_dataset(dataset_path), url="http://test/v1/text/generate", transport=httpx.MockTransport(handler), ) summary = summarize_chat_benchmark(results) assert results[0].execution is not None assert results[0].execution.passed is True assert summary["execution_cases"] == 1 assert summary["score_manifest"]["execution"] == 1.0 assert summary["execution_language_pass_rates"]["python"] == 1.0 assert summary["execution_language_scores"]["python"] > 0.0 assert summary["category_execution_pass_rates"]["general"] == 1.0 @pytest.mark.asyncio async def test_run_chat_benchmark_forwards_multimodal_vision_context(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( [ { "name": "vision-case", "message": "Ko redzi incident screenshotā?", "vision_context": { "summary": "Dashboard ar sarkanu incident alert un augstu latency grafiku.", "source": "upload", }, "expected_terms": ["incident", "latency"], "tags": ["multimodal", "vision"], "category": "multimodal", } ] ), encoding="utf-8", ) async def handler(request: httpx.Request) -> httpx.Response: payload = json.loads(request.content.decode("utf-8")) assert payload["vision_context"]["summary"].startswith("Dashboard") return httpx.Response( 200, json={ "response": "Screenshot rāda incident alert un latency pieaugumu.", "model": "MarisUK/test-model", "tokens_used": 19, }, ) results = await run_chat_benchmark( load_chat_benchmark_dataset(dataset_path), url="http://test/v1/text/generate", transport=httpx.MockTransport(handler), ) assert results[0].ok is True @pytest.mark.asyncio async def test_run_chat_benchmark_enforces_grounding_requirements(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( [ { "name": "grounded-case", "message": "Debug SSE mismatch starp backend-rust/src/api/chat.rs un frontend/app/chat/page.tsx.", "min_tool_steps": 2, "min_grounding_sources": 2, "expected_grounding_terms": [ "backend-rust/src/api/chat.rs", "frontend/app/chat/page.tsx", ], } ] ), encoding="utf-8", ) async def grounded_handler(request: httpx.Request) -> httpx.Response: del request return httpx.Response( 200, json={ "response": "Pamatojoties uz abiem failiem, jāpārbauda SSE complete event apstrāde.", "model": "MarisUK/test-model", "tokens_used": 22, "tool_trace": { "mode": "multi_step", "steps": [{"name": "workspace_search"}, {"name": "workspace_read"}], "grounding_sources": [ {"label": "backend-rust/src/api/chat.rs"}, {"label": "frontend/app/chat/page.tsx"}, ], }, }, ) results = await run_chat_benchmark( load_chat_benchmark_dataset(dataset_path), url="http://test/v1/text/generate", transport=httpx.MockTransport(grounded_handler), ) assert results[0].ok is True assert results[0].grounding_ok is True @pytest.mark.asyncio async def test_run_chat_benchmark_scores_successful_cases(tmp_path: Path) -> None: dataset_path = tmp_path / "dataset.json" dataset_path.write_text( json.dumps( [ { "name": "case-1", "message": "Izveido strukturētu plānu retry loģikai", "expected_terms": ["retry", "plānu"], "reference_facts": ["retry", "plānu"], "category": "reasoning", "level": "ci", } ] ), encoding="utf-8", ) async def handler(request: httpx.Request) -> httpx.Response: return httpx.Response( 200, json={ "response": "Šeit ir skaidrs plāns ar retry robežām un nākamo soli.", "model": "MarisUK/test-model", "tokens_used": 123, "persona_title": "Systems Strategist", }, ) transport = httpx.MockTransport(handler) results = await run_chat_benchmark( load_chat_benchmark_dataset(dataset_path), url="http://test/v1/text/generate", transport=transport, ) summary = summarize_chat_benchmark(results) assert results[0].ok is True assert results[0].eval is not None assert results[0].tokens_used == 123 assert summary["successful_cases"] == 1 assert summary["average_overall_score"] > 0.0 assert summary["category_scores"]["reasoning"] > 0.0 assert summary["judge_summary"]["overall"] > 0.0 manifest = build_chat_benchmark_manifest( results, benchmark_name="chat-quality", branch="master", model="MarisUK/test-model", human_eval_summary={ "artifact_type": "human-eval-summary", "pairwise_win_rate": 0.75, "average_confidence": 0.8, }, ) assert manifest["artifact_type"] == "chat-benchmark-manifest" assert manifest["score_manifest"]["reasoning"] > 0.0 assert manifest["score_manifest"]["pairwise_win_rate"] == 0.75 assert manifest["human_eval_summary"]["artifact_type"] == "human-eval-summary" def test_build_chat_benchmark_history_and_regression_track_category_and_language() -> None: previous_results = [ ChatBenchmarkResult( name="python-pass", ok=True, latency_ms=100, status_code=200, response="```python\nprint('ok')\n```", model="MarisUK/test-model", tokens_used=10, eval=ChatEvalResult( name="python-pass", helpfulness=0.8, reasoning=0.8, factuality=0.8, latvian_quality=0.8, coding=0.9, long_context=0.7, safety=1.0, category="coding", ), execution=None, category="coding", failure_bucket="production_regression", execution_language="python", ), ChatBenchmarkResult( name="ts-exec", ok=True, latency_ms=120, status_code=200, response="```ts\nexport const ok = true;\n```", model="MarisUK/test-model", tokens_used=12, eval=ChatEvalResult( name="ts-exec", helpfulness=0.75, reasoning=0.76, factuality=0.77, latvian_quality=0.72, coding=0.88, long_context=0.7, safety=1.0, category="grounding", ), execution=CodeExecutionResult( language="typescript", available=True, passed=True, summary="ok", exit_code=0, ), category="grounding", failure_bucket="broken_contract", execution_language="typescript", ), ] current_results = [ ChatBenchmarkResult( name="python-pass", ok=True, latency_ms=105, status_code=200, response="```python\nprint('ok')\n```", model="MarisUK/test-model", tokens_used=11, eval=ChatEvalResult( name="python-pass", helpfulness=0.7, reasoning=0.7, factuality=0.72, latvian_quality=0.74, coding=0.78, long_context=0.68, safety=1.0, category="coding", ), execution=None, category="coding", failure_bucket="production_regression", execution_language="python", ), ChatBenchmarkResult( name="ts-exec", ok=False, latency_ms=140, status_code=200, response="```ts\nthrow new Error('fail');\n```", model="MarisUK/test-model", tokens_used=13, eval=ChatEvalResult( name="ts-exec", helpfulness=0.6, reasoning=0.62, factuality=0.63, latvian_quality=0.68, coding=0.7, long_context=0.64, safety=1.0, category="grounding", ), execution=CodeExecutionResult( language="typescript", available=True, passed=False, summary="failed", exit_code=1, ), category="grounding", failure_bucket="broken_contract", execution_language="typescript", ), ] previous_manifest = build_chat_benchmark_manifest( previous_results, benchmark_name="coder-exec", branch="coder", model="MarisUK/test-model", ) previous_manifest["generated_at"] = "2026-04-16T00:00:00Z" current_manifest = build_chat_benchmark_manifest( current_results, benchmark_name="coder-exec", branch="coder", model="MarisUK/test-model", ) current_manifest["generated_at"] = "2026-04-16T00:01:00Z" regression = build_chat_benchmark_regression_report( current_manifest, previous_run=previous_manifest, ) history = build_chat_benchmark_history_artifact( current_manifest, previous_history={"runs": [previous_manifest]}, ) assert regression["has_baseline"] is True assert regression["has_regressions"] is True assert regression["execution_language_pass_rate_deltas"]["typescript"]["delta"] == -1.0 assert regression["category_score_deltas"]["coding"]["delta"] < 0.0 assert regression["failure_bucket_score_deltas"]["broken_contract"]["delta"] < 0.0 assert history["run_count"] == 2 assert history["latest_regression_summary"]["has_regressions"] is True def test_summarize_chat_benchmark_tracks_memory_metrics_and_trends() -> None: results = [ ChatBenchmarkResult( name="memory-followup", ok=True, latency_ms=80, status_code=200, response="Atceries workflow timeoutus un cache invalidāciju.", model="MarisUK/test-model", tokens_used=18, eval=ChatEvalResult( name="memory-followup", helpfulness=0.82, reasoning=0.81, factuality=0.8, latvian_quality=0.84, coding=0.2, long_context=0.86, safety=1.0, category="multi_turn_continuity", ), execution=None, category="multi_turn_continuity", tags=("memory", "continuity"), production_like=True, ), ChatBenchmarkResult( name="memory-cross-session", ok=False, latency_ms=110, status_code=200, response="Neatceros iepriekšējo preference.", model="MarisUK/test-model", tokens_used=20, eval=ChatEvalResult( name="memory-cross-session", helpfulness=0.45, reasoning=0.48, factuality=0.44, latvian_quality=0.52, coding=0.1, long_context=0.35, safety=0.95, category="cross_session_recall", ), execution=None, category="cross_session_recall", tags=("memory", "cross-session"), ), ] summary = summarize_chat_benchmark(results) manifest = build_chat_benchmark_manifest( results, benchmark_name="memory-quality", branch="master", model="MarisUK/test-model", ) history = build_chat_benchmark_history_artifact(manifest) assert summary["memory_retrieval_cases"] == 2 assert summary["memory_retrieval_pass_rate"] == 0.5 assert summary["memory_quality_scores"]["multi_turn_continuity"] > 0.7 assert summary["memory_quality_pass_rates"]["cross_session_recall"] == 0.0 assert summary["tag_pass_rates"]["memory"] == 0.5 assert manifest["score_manifest"]["memory_multi_turn_continuity"] > 0.7 assert manifest["score_manifest"]["memory_retrieval_pass_rate"] == 0.5 assert manifest["human_eval_cadence"] == "weekly + pre-release" assert history["trend_summary"]["memory_retrieval_pass_rate"]["latest"] == 0.5 assert history["trend_summary"]["average_latency_ms"]["latest"] == 80.0 def test_summarize_chat_benchmark_tracks_tool_multimodal_latency_and_hallucination_metrics() -> ( None ): results = [ ChatBenchmarkResult( name="tooling-case", ok=True, latency_ms=90, status_code=200, response="Balstoties uz README.md, /ready ir readiness checks ceļš.", model="MarisUK/test-model", tokens_used=24, eval=ChatEvalResult( name="tooling-case", helpfulness=0.82, reasoning=0.8, factuality=0.86, latvian_quality=0.84, coding=0.4, long_context=0.75, safety=1.0, category="grounding", ), execution=None, grounding_required=True, grounding_ok=True, category="grounding", tags=("tool_use", "repo"), ), ChatBenchmarkResult( name="multimodal-case", ok=False, latency_ms=180, status_code=200, response="Droši vien tur nav nekāda incidenta.", model="MarisUK/test-model", tokens_used=21, eval=ChatEvalResult( name="multimodal-case", helpfulness=0.42, reasoning=0.38, factuality=0.32, latvian_quality=0.7, coding=0.1, long_context=0.4, safety=0.92, category="multimodal", ), execution=None, category="multimodal", tags=("multimodal", "vision"), ), ] summary = summarize_chat_benchmark(results) manifest = build_chat_benchmark_manifest( results, benchmark_name="assistant-quality", branch="master", model="MarisUK/test-model", ) history = build_chat_benchmark_history_artifact(manifest) assert summary["tool_use_cases"] == 1 assert summary["tool_use_pass_rate"] == 1.0 assert summary["multimodal_cases"] == 1 assert summary["multimodal_pass_rate"] == 0.0 assert summary["hallucination_incidents"] == 1 assert summary["hallucination_rate"] == 0.5 assert summary["latency_p95_ms"] == 180.0 assert summary["quality_dimensions"]["tool_use"]["pass_rate"] == 1.0 assert summary["quality_dimensions"]["multimodality"]["cases"] == 1 assert manifest["score_manifest"]["tool_use_pass_rate"] == 1.0 assert manifest["score_manifest"]["multimodal_pass_rate"] == 0.0 assert manifest["score_manifest"]["hallucination_rate"] == 0.5 assert history["trend_summary"]["tool_use_pass_rate"]["latest"] == 1.0 assert history["trend_summary"]["multimodal_pass_rate"]["latest"] == 0.0 assert history["trend_summary"]["hallucination_rate"]["latest"] == 0.5