| """Chat benchmark runner for JSON eval datasets.""" |
|
|
| from __future__ import annotations |
|
|
| import asyncio |
| import json |
| from collections.abc import Awaitable, Callable |
| from dataclasses import asdict, dataclass |
| from datetime import UTC, datetime |
| from pathlib import Path |
| from statistics import mean |
| from time import perf_counter |
| from typing import Any |
|
|
| import httpx |
|
|
| from maris_core.code.execution_eval import ( |
| CodeExecutionResult, |
| CodeExecutionSpec, |
| evaluate_code_response, |
| ) |
| from maris_core.text.evals import ChatEvalCase, ChatEvalResult, evaluate_chat_case |
|
|
# Result categories counted as memory-quality cases when a run is summarized.
MEMORY_QUALITY_CATEGORIES = (
    "multi_turn_continuity",
    "cross_session_recall",
    "user_preferences_recall",
    "cross_lingual_retrieval",
    "stale_memory_rejection",
)

# Lower-cased tags that mark a result as a tool-use case.
TOOL_USE_TAGS = (
    "tool",
    "tools",
    "tool_use",
    "grounding",
    "repo",
    "browser",
)

# Lower-cased tags that mark a result as a multimodal case.
MULTIMODAL_TAGS = (
    "multimodal",
    "vision",
    "image",
    "video",
    "audio",
    "voice",
    "music",
)

# Lower-cased categories that mark a result as a multimodal case.
MULTIMODAL_CATEGORIES = (
    "multimodal",
    "vision",
    "vision_analysis",
    "image_generation",
    "video_generation",
    "voice_conversation",
    "audio",
    "music_generation",
)

# An eval whose factuality score is below this value counts as a hallucination incident.
HALLUCINATION_FACTUALITY_THRESHOLD = 0.6

# Metric names followed across runs.
# NOTE(review): presumably consumed by the trend-summary helper outside this view -- confirm.
TREND_TRACKED_METRICS = (
    "average_overall_score",
    "average_latency_ms",
    "latency_p95_ms",
    "average_tokens_used",
    "success_rate",
    "reasoning",
    "coding",
    "safety",
    "execution_pass_rate",
    "grounding_pass_rate",
    "tool_use_pass_rate",
    "memory_retrieval_pass_rate",
    "multimodal_pass_rate",
    "hallucination_rate",
    "production_like_pass_rate",
    "pairwise_win_rate",
)
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ChatBenchmarkCase: |
| name: str |
| message: str |
| history: tuple[dict[str, str], ...] = () |
| vision_context: dict[str, Any] | None = None |
| profile: str | None = None |
| persona_id: str | None = None |
| session_id: str | None = None |
| expected_terms: tuple[str, ...] = () |
| forbidden_terms: tuple[str, ...] = () |
| tags: tuple[str, ...] = () |
| reference_answer: str | None = None |
| reference_facts: tuple[str, ...] = () |
| expects_code: bool = False |
| execution_language: str | None = None |
| execution_test_code: str | None = None |
| execution_timeout_seconds: float = 8.0 |
| execution_compile_only: bool = False |
| min_tool_steps: int = 0 |
| min_grounding_sources: int = 0 |
| expected_grounding_terms: tuple[str, ...] = () |
| branches: tuple[str, ...] = () |
| level: str = "ci" |
| difficulty: str = "standard" |
| category: str = "general" |
| failure_bucket: str = "general" |
| risk_level: str = "standard" |
| production_like: bool = False |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ChatBenchmarkResult: |
| name: str |
| ok: bool |
| latency_ms: int |
| status_code: int | None |
| response: str |
| model: str |
| tokens_used: int |
| eval: ChatEvalResult | None |
| execution: CodeExecutionResult | None |
| grounding_required: bool = False |
| grounding_ok: bool = True |
| error: str | None = None |
| tags: tuple[str, ...] = () |
| level: str = "ci" |
| difficulty: str = "standard" |
| category: str = "general" |
| failure_bucket: str = "general" |
| risk_level: str = "standard" |
| production_like: bool = False |
| execution_language: str | None = None |
|
|
|
|
def load_chat_benchmark_dataset(path: str | Path) -> list[ChatBenchmarkCase]:
    """Load benchmark cases from a UTF-8 JSON file.

    The file must contain either a JSON array of case objects or a JSON object
    whose ``cases`` key holds that array.

    Args:
        path: Location of the dataset file.

    Returns:
        One ``ChatBenchmarkCase`` per entry, in file order.

    Raises:
        ValueError: If the top-level shape is wrong, an entry is not an object,
            or an entry lacks a non-empty ``name`` or ``message``. Malformed JSON
            also surfaces as a ``ValueError`` subclass from ``json.loads``.
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    entries = raw.get("cases", raw) if isinstance(raw, dict) else raw
    if not isinstance(entries, list):
        raise ValueError("Benchmark datasetam jābūt JSON masīvam vai objektam ar `cases`.")

    cases: list[ChatBenchmarkCase] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError("Katram benchmark ierakstam jābūt JSON objektam.")

        # An explicit JSON null must count as missing, not become the text "None".
        raw_name = entry.get("name")
        raw_message = entry.get("message")
        name = "" if raw_name is None else str(raw_name).strip()
        message = "" if raw_message is None else str(raw_message).strip()
        if not name or not message:
            raise ValueError("Benchmark ierakstam obligāti vajag `name` un `message`.")

        # `"history": null` (or any non-list value) is treated as an empty history
        # instead of raising TypeError while iterating.
        raw_history = entry.get("history")
        history_items = raw_history if isinstance(raw_history, list) else []
        history = tuple(
            item
            for item in history_items
            if isinstance(item, dict) and item.get("role") and item.get("content")
        )

        cases.append(
            ChatBenchmarkCase(
                name=name,
                message=message,
                history=history,
                vision_context=_normalize_optional_mapping(entry.get("vision_context")),
                profile=_normalize_optional_text(entry.get("profile")),
                persona_id=_normalize_optional_text(entry.get("persona_id")),
                session_id=_normalize_optional_text(entry.get("session_id")),
                expected_terms=_normalize_text_list(entry.get("expected_terms")),
                forbidden_terms=_normalize_text_list(entry.get("forbidden_terms")),
                tags=_normalize_text_list(entry.get("tags")),
                reference_answer=_normalize_optional_text(entry.get("reference_answer")),
                reference_facts=_normalize_text_list(entry.get("reference_facts")),
                expects_code=bool(entry.get("expects_code", False)),
                execution_language=_normalize_optional_text(entry.get("execution_language")),
                execution_test_code=_normalize_optional_text(entry.get("execution_test_code")),
                # `or` fallbacks map null / 0 / "" onto the default value.
                execution_timeout_seconds=float(
                    entry.get("execution_timeout_seconds", 8.0) or 8.0
                ),
                execution_compile_only=bool(entry.get("execution_compile_only", False)),
                min_tool_steps=max(0, int(entry.get("min_tool_steps", 0) or 0)),
                min_grounding_sources=max(0, int(entry.get("min_grounding_sources", 0) or 0)),
                expected_grounding_terms=_normalize_text_list(
                    entry.get("expected_grounding_terms")
                ),
                branches=_normalize_text_list(entry.get("branches")),
                level=_normalize_optional_text(entry.get("level")) or "ci",
                difficulty=_normalize_optional_text(entry.get("difficulty")) or "standard",
                category=_normalize_optional_text(entry.get("category")) or "general",
                failure_bucket=_normalize_optional_text(entry.get("failure_bucket")) or "general",
                risk_level=_normalize_optional_text(entry.get("risk_level")) or "standard",
                production_like=bool(entry.get("production_like", False)),
            )
        )
    return cases
|
|
|
|
def summarize_chat_benchmark(results: list[ChatBenchmarkResult]) -> dict[str, Any]:
    """Aggregate per-case results into one summary dictionary.

    Eval-based averages use only results that are ``ok`` and carry an eval;
    grouped scores, pass rates and the latency percentile are computed over the
    full ``results`` list by the helpers they delegate to.

    Args:
        results: Outcomes of every executed benchmark case.

    Returns:
        A dictionary with counts, rates, grouped scores, ``quality_dimensions``,
        ``judge_summary``, ``score_manifest`` and the serialized ``results``.
    """

    def ratio(part: list[Any], whole: list[Any], *, when_empty: float) -> float:
        # Share of `whole` covered by `part`, rounded to 3 places.
        return round(len(part) / len(whole), 3) if whole else when_empty

    successful = [item for item in results if item.ok]
    failed = [item for item in results if not item.ok]
    evals = [item.eval for item in successful if item.eval is not None]

    # Average eval score grouped by case metadata.
    category_scores = _group_average_scores(results, key=lambda item: item.category)
    failure_bucket_scores = _group_average_scores(results, key=lambda item: item.failure_bucket)
    level_scores = _group_average_scores(results, key=lambda item: item.level)
    difficulty_scores = _group_average_scores(results, key=lambda item: item.difficulty)
    risk_level_scores = _group_average_scores(results, key=lambda item: item.risk_level)
    tag_scores = _group_tag_average_scores(results)
    tag_pass_rates = _group_tag_pass_rates(results)

    # Split cases that carry an execution result into "ran" and "skipped".
    execution_supported: list[ChatBenchmarkResult] = []
    execution_skipped: list[ChatBenchmarkResult] = []
    for item in results:
        if item.execution is None:
            continue
        if item.execution.available:
            execution_supported.append(item)
        else:
            execution_skipped.append(item)
    execution_passed = [item for item in execution_supported if item.execution.passed]

    grounding_cases = [item for item in results if item.grounding_required]
    grounding_passed = [
        item
        for item in grounding_cases
        if item.grounding_ok is True and item.error != "grounding requirements not met"
    ]

    execution_language_pass_rates = _group_execution_pass_rates(
        execution_supported,
        key=lambda item: item.execution_language or "unspecified",
    )
    execution_language_scores = _group_average_scores(
        [item for item in results if item.execution_language],
        key=lambda item: item.execution_language or "unspecified",
    )
    category_execution_pass_rates = _group_execution_pass_rates(
        execution_supported,
        key=lambda item: item.category,
    )

    production_like_results = [item for item in results if item.production_like]
    production_like_passed = [item for item in production_like_results if item.ok]

    memory_results = [
        item
        for item in results
        if item.category in MEMORY_QUALITY_CATEGORIES or "memory" in item.tags
    ]
    memory_passed = [item for item in memory_results if item.ok]
    memory_quality_scores = {
        name: score
        for name, score in category_scores.items()
        if name in MEMORY_QUALITY_CATEGORIES
    }
    memory_quality_pass_rates = _group_case_pass_rates(
        memory_results,
        key=lambda item: item.category,
    )

    tool_use_results = [item for item in results if _is_tool_use_result(item)]
    tool_use_passed = [item for item in tool_use_results if item.ok and item.grounding_ok]
    multimodal_results = [item for item in results if _is_multimodal_result(item)]
    multimodal_passed = [item for item in multimodal_results if item.ok]
    hallucination_population = [item for item in results if item.eval is not None]
    hallucination_incidents = [
        item for item in hallucination_population if _is_hallucination_incident(item)
    ]

    latency_p95_ms = _latency_percentile_ms(results, percentile=95.0)
    judge_summary = _summarize_judge_results(evals)
    judge_dimensions = judge_summary["dimension_scores"]

    # Scalar metrics, computed once and reused by every section below.
    overall_score = round(mean(item.overall for item in evals), 3) if evals else 0.0
    if successful:
        average_latency_ms = round(mean(item.latency_ms for item in successful), 1)
        average_tokens_used = round(mean(item.tokens_used for item in successful), 1)
    else:
        average_latency_ms = 0.0
        average_tokens_used = 0.0
    success_rate = ratio(successful, results, when_empty=0.0)
    execution_pass_rate = ratio(execution_passed, execution_supported, when_empty=0.0)
    hallucination_rate = ratio(
        hallucination_incidents, hallucination_population, when_empty=0.0
    )
    # A dimension without any cases counts as fully passing.
    grounding_pass_rate = ratio(grounding_passed, grounding_cases, when_empty=1.0)
    tool_use_pass_rate = ratio(tool_use_passed, tool_use_results, when_empty=1.0)
    multimodal_pass_rate = ratio(multimodal_passed, multimodal_results, when_empty=1.0)
    memory_retrieval_pass_rate = ratio(memory_passed, memory_results, when_empty=1.0)
    production_like_pass_rate = ratio(
        production_like_passed, production_like_results, when_empty=1.0
    )

    score_manifest = {
        "overall": overall_score,
        "reasoning": _mean_eval_metric(evals, "reasoning"),
        "factuality": _mean_eval_metric(evals, "factuality"),
        "latvian_quality": _mean_eval_metric(evals, "latvian_quality"),
        "coding": _mean_eval_metric(evals, "coding"),
        "long_context": _mean_eval_metric(evals, "long_context"),
        "helpfulness": _mean_eval_metric(evals, "helpfulness"),
        "safety": _mean_eval_metric(evals, "safety"),
        "tool_use_pass_rate": tool_use_pass_rate,
        "multimodal_pass_rate": multimodal_pass_rate,
        "hallucination_rate": hallucination_rate,
        "latency_p95_ms": latency_p95_ms,
        "execution": execution_pass_rate,
        "grounding": grounding_pass_rate,
        "judge_overall": judge_summary["overall"],
        "judge_task_completion": judge_dimensions["task_completion"],
        "judge_instruction_following": judge_dimensions["instruction_following"],
        "judge_grounding": judge_dimensions["grounding"],
        "judge_safety": judge_dimensions["safety"],
        "judge_multi_turn_continuity": judge_dimensions["multi_turn_continuity"],
        "judge_code_quality": judge_dimensions["code_quality"],
        "judge_regression_risk": judge_dimensions["regression_risk"],
        "memory_retrieval_pass_rate": memory_retrieval_pass_rate,
    }
    score_manifest.update(
        {f"memory_{category}": score for category, score in memory_quality_scores.items()}
    )

    total_cases = len(results)
    quality_dimensions = {
        "reasoning": {"score": score_manifest["reasoning"], "cases": total_cases},
        "coding": {"score": score_manifest["coding"], "cases": total_cases},
        "tool_use": {"cases": len(tool_use_results), "pass_rate": tool_use_pass_rate},
        "memory": {"cases": len(memory_results), "pass_rate": memory_retrieval_pass_rate},
        "multimodality": {
            "cases": len(multimodal_results),
            "pass_rate": multimodal_pass_rate,
        },
        "latency": {"average_ms": average_latency_ms, "p95_ms": latency_p95_ms},
        "hallucination": {
            "cases": len(hallucination_population),
            "incident_rate": hallucination_rate,
        },
        "safety": {"score": score_manifest["safety"], "cases": total_cases},
    }

    return {
        "total_cases": total_cases,
        "successful_cases": len(successful),
        "failed_cases": len(failed),
        "success_rate": success_rate,
        "average_latency_ms": average_latency_ms,
        "latency_p95_ms": latency_p95_ms,
        "average_tokens_used": average_tokens_used,
        "average_overall_score": overall_score,
        "execution_cases": len(execution_supported),
        "execution_skipped": len(execution_skipped),
        "execution_passed": len(execution_passed),
        "execution_pass_rate": execution_pass_rate,
        "grounding_cases": len(grounding_cases),
        "grounding_pass_rate": grounding_pass_rate,
        "category_scores": category_scores,
        "failure_bucket_scores": failure_bucket_scores,
        "execution_language_pass_rates": execution_language_pass_rates,
        "execution_language_scores": execution_language_scores,
        "category_execution_pass_rates": category_execution_pass_rates,
        "level_scores": level_scores,
        "difficulty_scores": difficulty_scores,
        "risk_level_scores": risk_level_scores,
        "tag_scores": tag_scores,
        "tag_pass_rates": tag_pass_rates,
        "memory_retrieval_cases": len(memory_results),
        "memory_retrieval_pass_rate": memory_retrieval_pass_rate,
        "tool_use_cases": len(tool_use_results),
        "tool_use_pass_rate": tool_use_pass_rate,
        "multimodal_cases": len(multimodal_results),
        "multimodal_pass_rate": multimodal_pass_rate,
        "hallucination_eval_cases": len(hallucination_population),
        "hallucination_incidents": len(hallucination_incidents),
        "hallucination_rate": hallucination_rate,
        "memory_quality_scores": memory_quality_scores,
        "memory_quality_pass_rates": memory_quality_pass_rates,
        "production_like_cases": len(production_like_results),
        "production_like_pass_rate": production_like_pass_rate,
        "quality_dimensions": quality_dimensions,
        "judge_summary": judge_summary,
        "score_manifest": score_manifest,
        "results": [benchmark_result_to_dict(item) for item in results],
    }
|
|
|
|
def build_chat_benchmark_manifest(
    results: list[ChatBenchmarkResult],
    *,
    benchmark_name: str,
    branch: str,
    model: str,
    human_eval_summary: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Wrap the benchmark summary in a manifest with run identity fields.

    Args:
        results: Outcomes of every executed case.
        benchmark_name: Name recorded in the manifest.
        branch: Branch recorded in the manifest.
        model: Model identifier recorded in the manifest.
        human_eval_summary: Optional human-eval summary; when it is a dict it is
            embedded and two of its values are copied into ``score_manifest``.

    Returns:
        The manifest dictionary: identity fields followed by the summary keys.
    """
    summary = summarize_chat_benchmark(results)

    if isinstance(human_eval_summary, dict):
        summary["human_eval_summary"] = human_eval_summary
        manifest_scores = summary["score_manifest"]
        # (score_manifest key, key read from the human-eval summary)
        copied_metrics = (
            ("pairwise_win_rate", "pairwise_win_rate"),
            ("human_eval_confidence", "average_confidence"),
        )
        for target_key, source_key in copied_metrics:
            raw_value = human_eval_summary.get(source_key, 0.0) or 0.0
            manifest_scores[target_key] = round(float(raw_value), 3)

    summary["human_eval_cadence"] = _resolve_human_eval_cadence(
        benchmark_name=benchmark_name,
        branch=branch,
        human_eval_summary=human_eval_summary,
    )

    manifest: dict[str, Any] = {
        "benchmark_name": benchmark_name,
        "branch": branch,
        "model": model,
        "generated_at": _utc_timestamp(),
        "artifact_type": "chat-benchmark-manifest",
    }
    manifest.update(summary)
    return manifest
|
|
|
|
def build_chat_benchmark_history_artifact(
    current_manifest: dict[str, Any],
    *,
    previous_history: dict[str, Any] | None = None,
    max_runs: int = 30,
) -> dict[str, Any]:
    """Fold the current manifest into a bounded run history.

    The newest stored run is replaced when it has the same benchmark identity as
    the current run; otherwise the current run is appended. At most
    ``max(1, max_runs)`` of the most recent runs are kept.

    Args:
        current_manifest: Manifest of the run that just finished.
        previous_history: Previously stored history artifact, if any.
        max_runs: Upper bound on retained runs.

    Returns:
        The history artifact, including a trend summary and a condensed view of
        the regression report against the selected baseline.
    """
    prior_runs = _coerce_history_runs(previous_history)
    latest_run = _build_benchmark_history_run(current_manifest)
    baseline_run = _select_previous_history_baseline(prior_runs, latest_run)

    history = list(prior_runs)
    replaces_newest = bool(history) and _same_benchmark_identity(history[-1], latest_run)
    if replaces_newest:
        history[-1] = latest_run
    else:
        history.append(latest_run)
    keep = max(1, max_runs)
    history = history[-keep:]

    report = build_chat_benchmark_regression_report(
        current_manifest,
        previous_run=baseline_run,
    )
    summary_fields = ("status", "has_baseline", "has_regressions", "regression_count")

    return {
        "artifact_type": "chat-benchmark-history",
        "benchmark_name": str(current_manifest.get("benchmark_name", "")).strip(),
        "branch": str(current_manifest.get("branch", "")).strip(),
        "model": str(current_manifest.get("model", "")).strip(),
        "latest_generated_at": latest_run["generated_at"],
        "run_count": len(history),
        "runs": history,
        "trend_summary": _build_benchmark_trend_summary(history),
        "latest_regression_summary": {field: report[field] for field in summary_fields},
    }
|
|
|
|
def build_chat_benchmark_regression_report(
    current_manifest: dict[str, Any],
    *,
    previous_run: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Compare the current manifest's metrics with a previous run.

    Args:
        current_manifest: Manifest of the current run.
        previous_run: Baseline run; when no reference metrics can be extracted
            from it the report has status ``"no-baseline"`` and empty deltas.

    Returns:
        The regression report: identity fields, baseline flags, one delta
        mapping per metric section, and ``regressions`` holding the negative
        deltas of each section.
    """
    # (key in the reference metrics / in `regressions`, key of the delta mapping)
    sections = (
        ("score_manifest", "score_manifest_deltas"),
        ("category_scores", "category_score_deltas"),
        ("failure_bucket_scores", "failure_bucket_score_deltas"),
        ("execution_language_pass_rates", "execution_language_pass_rate_deltas"),
        ("execution_language_scores", "execution_language_score_deltas"),
        ("category_execution_pass_rates", "category_execution_pass_rate_deltas"),
        ("memory_quality_scores", "memory_quality_score_deltas"),
        ("memory_quality_pass_rates", "memory_quality_pass_rate_deltas"),
    )

    current = _extract_benchmark_reference_metrics(current_manifest)
    previous = _extract_benchmark_reference_metrics(previous_run)
    has_baseline = previous is not None

    deltas: dict[str, Any]
    regressions: dict[str, Any]
    if has_baseline:
        deltas = {
            delta_key: _calculate_metric_deltas(current[section], previous[section])
            for section, delta_key in sections
        }
        regressions = {
            section: _filter_negative_deltas(deltas[delta_key])
            for section, delta_key in sections
        }
        regression_count = sum(len(payload) for payload in regressions.values())
        status = "regression-detected" if regression_count > 0 else "ok"
    else:
        deltas = {delta_key: {} for _, delta_key in sections}
        regressions = {}
        regression_count = 0
        status = "no-baseline"

    return {
        "artifact_type": "chat-benchmark-regression-report",
        "benchmark_name": current["benchmark_name"],
        "branch": current["branch"],
        "model": current["model"],
        "current_generated_at": current["generated_at"],
        "previous_generated_at": previous["generated_at"] if has_baseline else None,
        "has_baseline": has_baseline,
        "has_regressions": regression_count > 0,
        "regression_count": regression_count,
        "status": status,
        **deltas,
        "regressions": regressions,
    }
|
|
|
|
def select_chat_benchmark_cases(
    cases: list[ChatBenchmarkCase],
    *,
    levels: list[str] | tuple[str, ...],
    branch: str | None = None,
) -> list[ChatBenchmarkCase]:
    """Filter cases by level and, optionally, by branch.

    Args:
        cases: Candidate cases.
        levels: Levels to keep; a case is kept when its ``level`` is among them.
        branch: When given and non-empty, additionally keep only cases that
            declare no branches or list this branch (compared after stripping
            whitespace and lower-casing).

    Returns:
        The selected cases in their original order.
    """
    # Build the lookup set once: this avoids rebuilding it for every case and
    # keeps one-shot iterables from being exhausted after the first case.
    wanted_levels = set(levels)
    selected = [case for case in cases if case.level in wanted_levels]
    if not branch:
        return selected

    normalized_branch = branch.strip().lower()
    return [
        case
        for case in selected
        if not case.branches
        or any(item.strip().lower() == normalized_branch for item in case.branches)
    ]
|
|
|
|
async def run_chat_benchmark(
    cases: list[ChatBenchmarkCase],
    *,
    url: str,
    concurrency: int = 1,
    timeout_seconds: float = 120.0,
    transport: httpx.AsyncBaseTransport | None = None,
) -> list[ChatBenchmarkResult]:
    """Run every case by POSTing it to ``url`` and evaluating the JSON reply.

    A single ``httpx.AsyncClient`` is shared by all cases so connections are
    pooled and a caller-supplied ``transport`` is closed only once, after the
    last case has finished, rather than when the first per-case client exits.

    Args:
        cases: Cases to execute.
        url: Chat endpoint that receives one POST per case.
        concurrency: Maximum number of cases in flight at once.
        timeout_seconds: Timeout handed to the HTTP client.
        transport: Optional custom transport for the HTTP client.

    Returns:
        One result per case, in the same order as ``cases``.
    """
    async with httpx.AsyncClient(
        timeout=timeout_seconds,
        follow_redirects=True,
        transport=transport,
    ) as client:

        async def responder(case: ChatBenchmarkCase) -> dict[str, Any]:
            response = await client.post(
                url,
                json={
                    "message": case.message,
                    "history": list(case.history),
                    "vision_context": case.vision_context,
                    "profile": case.profile,
                    "persona_id": case.persona_id,
                    "session_id": case.session_id,
                },
            )
            # Non-2xx responses raise httpx.HTTPStatusError (an httpx.HTTPError).
            response.raise_for_status()
            return response.json()

        return await run_chat_benchmark_with_responder(
            cases, responder=responder, concurrency=concurrency
        )
|
|
|
|
async def run_chat_benchmark_with_responder(
    cases: list[ChatBenchmarkCase],
    *,
    responder: Callable[[ChatBenchmarkCase], Awaitable[dict[str, Any]]],
    concurrency: int = 1,
) -> list[ChatBenchmarkResult]:
    """Execute all cases through ``responder`` with bounded concurrency.

    Args:
        cases: Cases to execute.
        responder: Coroutine function producing the response payload for a case.
        concurrency: Maximum number of cases running at once; values below 1
            are treated as 1.

    Returns:
        One result per case, in the same order as ``cases``.
    """
    gate = asyncio.Semaphore(max(1, concurrency))

    async def guarded(case: ChatBenchmarkCase) -> ChatBenchmarkResult:
        # Hold a semaphore slot for the whole duration of the case.
        async with gate:
            return await _execute_case(responder, case)

    pending = [guarded(case) for case in cases]
    return await asyncio.gather(*pending)
|
|
|
|
def benchmark_result_to_dict(result: ChatBenchmarkResult) -> dict[str, Any]:
    """Convert a result into a plain dictionary.

    When the result carries an eval, its dictionary form additionally receives
    an ``overall`` entry read from the eval object.
    """
    payload = asdict(result)
    evaluation = result.eval
    if evaluation is None:
        return payload

    eval_payload = asdict(evaluation)
    eval_payload["overall"] = evaluation.overall
    payload["eval"] = eval_payload
    return payload
|
|
|
|
| def _normalize_optional_text(value: Any) -> str | None: |
| normalized = str(value or "").strip() |
| return normalized or None |
|
|
|
|
| def _normalize_text_list(value: Any) -> tuple[str, ...]: |
| if not isinstance(value, list): |
| return () |
| return tuple(str(item).strip() for item in value if str(item).strip()) |
|
|
|
|
| def _normalize_optional_mapping(value: Any) -> dict[str, Any] | None: |
| if not isinstance(value, dict): |
| return None |
| return dict(value) |
|
|
|
|
async def _execute_case(
    responder: Callable[[ChatBenchmarkCase], Awaitable[dict[str, Any]]],
    case: ChatBenchmarkCase,
) -> ChatBenchmarkResult:
    """Run one case through ``responder`` and evaluate the reply.

    The reply is scored with ``evaluate_chat_case``; when the case names an
    execution language the response is also passed to ``evaluate_code_response``.
    A case is ``ok`` when its grounding requirements are met and the execution
    check either did not run, was unavailable, or passed.

    Args:
        responder: Coroutine function returning the response payload for a case.
        case: The case to execute.

    Returns:
        The populated result. ``httpx.HTTPError``, ``ValueError`` and
        ``TypeError`` raised while requesting or evaluating -- including a
        payload that is not a JSON object -- produce a failed result with
        ``error`` set instead of propagating.
    """
    # Case metadata copied onto the result on both the success and failure path.
    case_fields: dict[str, Any] = {
        "name": case.name,
        "grounding_required": _case_requires_grounding(case),
        "tags": case.tags,
        "level": case.level,
        "difficulty": case.difficulty,
        "category": case.category,
        "failure_bucket": case.failure_bucket,
        "risk_level": case.risk_level,
        "production_like": case.production_like,
        "execution_language": case.execution_language,
    }
    started_at = perf_counter()
    try:
        data = await responder(case)
        latency_ms = int((perf_counter() - started_at) * 1000)
        if not isinstance(data, dict):
            # Without this guard `data.get` raises AttributeError, which is not
            # caught below and would abort every other case in the run.
            raise TypeError(
                f"responder returned {type(data).__name__}, expected a JSON object"
            )
        response_text = str(data.get("response", "")).strip()
        execution_result = (
            evaluate_code_response(
                response_text,
                CodeExecutionSpec(
                    language=case.execution_language,
                    test_code=case.execution_test_code or "",
                    timeout_seconds=case.execution_timeout_seconds,
                    compile_only=case.execution_compile_only,
                ),
            )
            if case.execution_language
            else None
        )
        grounding_ok = _grounding_requirements_met(data, case)
        eval_result = evaluate_chat_case(
            ChatEvalCase(
                name=case.name,
                prompt=case.message,
                response=response_text,
                persona_title=str(data.get("persona_title") or "Core Assistant"),
                reference_answer=case.reference_answer or "",
                reference_facts=case.reference_facts,
                expected_terms=case.expected_terms,
                forbidden_terms=case.forbidden_terms,
                history_turns=len(case.history),
                expects_code=case.expects_code,
                level=case.level,
                difficulty=case.difficulty,
                category=case.category,
                failure_bucket=case.failure_bucket,
                risk_level=case.risk_level,
                production_like=case.production_like,
            )
        )
        # An execution check that did not run or was unavailable does not fail the case.
        execution_ok = (
            execution_result is None or not execution_result.available or execution_result.passed
        )
        return ChatBenchmarkResult(
            ok=execution_ok and grounding_ok,
            latency_ms=latency_ms,
            status_code=int(data.get("status_code", 200) or 200),
            response=response_text,
            model=str(data.get("model", "")).strip(),
            tokens_used=int(data.get("tokens_used", 0) or 0),
            eval=eval_result,
            execution=execution_result,
            grounding_ok=grounding_ok,
            error=_failure_summary(execution_result=execution_result, grounding_ok=grounding_ok),
            **case_fields,
        )
    except (httpx.HTTPError, ValueError, TypeError) as exc:
        latency_ms = int((perf_counter() - started_at) * 1000)
        # Only exceptions that carry a `response` attribute provide a status code.
        status_code = getattr(getattr(exc, "response", None), "status_code", None)
        return ChatBenchmarkResult(
            ok=False,
            latency_ms=latency_ms,
            status_code=status_code,
            response="",
            model="",
            tokens_used=0,
            eval=None,
            execution=None,
            grounding_ok=False,
            error=str(exc),
            **case_fields,
        )
|
|
|
|
| def _case_requires_grounding(case: ChatBenchmarkCase) -> bool: |
| return ( |
| case.min_tool_steps > 0 |
| or case.min_grounding_sources > 0 |
| or bool(case.expected_grounding_terms) |
| ) |
|
|
|
|
def _grounding_requirements_met(data: dict[str, Any], case: ChatBenchmarkCase) -> bool:
    """Check the response's ``tool_trace`` against the case's grounding needs.

    Args:
        data: Response payload; its ``tool_trace`` entry is inspected.
        case: Case whose minimum step / source counts and expected terms apply.

    Returns:
        ``True`` when the case has no grounding requirement or all of its
        requirements are satisfied, otherwise ``False``.
    """
    if not _case_requires_grounding(case):
        return True

    trace = data.get("tool_trace")
    if not isinstance(trace, dict):
        return False

    steps = trace.get("steps")
    sources = trace.get("grounding_sources")

    if case.min_tool_steps > 0:
        if not isinstance(steps, list) or len(steps) < case.min_tool_steps:
            return False
    if case.min_grounding_sources > 0:
        if not isinstance(sources, list) or len(sources) < case.min_grounding_sources:
            return False

    if not case.expected_grounding_terms:
        return True

    # Every expected term must appear, case-insensitively, somewhere in the
    # label, uri or snippet of the dict-shaped sources.
    usable_sources = (
        [source for source in sources if isinstance(source, dict)]
        if isinstance(sources, list)
        else []
    )
    combined = "\n".join(
        str(source.get(field, "")).lower()
        for source in usable_sources
        for field in ("label", "uri", "snippet")
    )
    for term in case.expected_grounding_terms:
        if term.lower() not in combined:
            return False
    return True
|
|
|
|
| def _failure_summary( |
| *, execution_result: CodeExecutionResult | None, grounding_ok: bool |
| ) -> str | None: |
| if execution_result is not None and execution_result.available and not execution_result.passed: |
| return execution_result.summary |
| if not grounding_ok: |
| return "grounding requirements not met" |
| return None |
|
|
|
|
| def _mean_eval_metric(evals: list[ChatEvalResult], field: str) -> float: |
| if not evals: |
| return 0.0 |
| return round(mean(float(getattr(item, field, 0.0)) for item in evals), 3) |
|
|
|
|
| def _group_average_scores( |
| results: list[ChatBenchmarkResult], |
| *, |
| key: Callable[[ChatBenchmarkResult], str], |
| ) -> dict[str, float]: |
| buckets: dict[str, list[float]] = {} |
| for result in results: |
| if result.eval is None: |
| continue |
| bucket = key(result).strip() |
| if not bucket: |
| continue |
| buckets.setdefault(bucket, []).append(result.eval.overall) |
| return {bucket: round(mean(scores), 3) for bucket, scores in sorted(buckets.items()) if scores} |
|
|
|
|
| def _group_case_pass_rates( |
| results: list[ChatBenchmarkResult], |
| *, |
| key: Callable[[ChatBenchmarkResult], str], |
| ) -> dict[str, float]: |
| buckets: dict[str, list[float]] = {} |
| for result in results: |
| bucket = key(result).strip() |
| if not bucket: |
| continue |
| buckets.setdefault(bucket, []).append(1.0 if result.ok else 0.0) |
| return {bucket: round(mean(scores), 3) for bucket, scores in sorted(buckets.items()) if scores} |
|
|
|
|
| def _group_tag_average_scores(results: list[ChatBenchmarkResult]) -> dict[str, float]: |
| buckets: dict[str, list[float]] = {} |
| for result in results: |
| if result.eval is None: |
| continue |
| for tag in result.tags: |
| normalized = str(tag).strip() |
| if normalized: |
| buckets.setdefault(normalized, []).append(result.eval.overall) |
| return {bucket: round(mean(scores), 3) for bucket, scores in sorted(buckets.items()) if scores} |
|
|
|
|
| def _group_tag_pass_rates(results: list[ChatBenchmarkResult]) -> dict[str, float]: |
| buckets: dict[str, list[float]] = {} |
| for result in results: |
| for tag in result.tags: |
| normalized = str(tag).strip() |
| if normalized: |
| buckets.setdefault(normalized, []).append(1.0 if result.ok else 0.0) |
| return {bucket: round(mean(scores), 3) for bucket, scores in sorted(buckets.items()) if scores} |
|
|
|
|
| def _group_execution_pass_rates( |
| results: list[ChatBenchmarkResult], |
| *, |
| key: Callable[[ChatBenchmarkResult], str], |
| ) -> dict[str, float]: |
| buckets: dict[str, list[float]] = {} |
| for result in results: |
| if result.execution is None or not result.execution.available: |
| continue |
| bucket = key(result).strip() |
| if not bucket: |
| continue |
| buckets.setdefault(bucket, []).append(1.0 if result.execution.passed else 0.0) |
| return {bucket: round(mean(scores), 3) for bucket, scores in sorted(buckets.items()) if scores} |
|
|
|
|
| def _is_tool_use_result(result: ChatBenchmarkResult) -> bool: |
| if result.grounding_required: |
| return True |
| normalized_tags = {str(tag).strip().lower() for tag in result.tags} |
| return any(tag in normalized_tags for tag in TOOL_USE_TAGS) |
|
|
|
|
def _is_multimodal_result(result: ChatBenchmarkResult) -> bool:
    """Tell whether a result counts as a multimodal case.

    A result qualifies when its stripped, lower-cased category is listed in
    ``MULTIMODAL_CATEGORIES``, or when any of its tags (stringified, stripped
    and lower-cased) is listed in ``MULTIMODAL_TAGS``.
    """
    if result.category.strip().lower() in MULTIMODAL_CATEGORIES:
        return True
    for raw_tag in result.tags:
        if str(raw_tag).strip().lower() in MULTIMODAL_TAGS:
            return True
    return False
|
|
|
|
def _is_hallucination_incident(result: ChatBenchmarkResult) -> bool:
    """Tell whether a result should be counted as a hallucination incident.

    Returns ``False`` when the result has no eval. Otherwise the result is an
    incident when its factuality score is below
    ``HALLUCINATION_FACTUALITY_THRESHOLD``, or when a judge verdict exists, the
    category is one of ``grounding`` / ``factuality`` / ``multimodal`` and the
    judge's grounding dimension did not pass.
    """
    evaluation = result.eval
    if evaluation is None:
        return False
    if evaluation.factuality < HALLUCINATION_FACTUALITY_THRESHOLD:
        return True
    judge = evaluation.judge
    if judge is None:
        return False
    # Normalize the category the same way _is_multimodal_result does, so that
    # values such as "Grounding" or " grounding " are not silently excluded.
    normalized_category = result.category.strip().lower()
    return (
        normalized_category in {"grounding", "factuality", "multimodal"}
        and not judge.grounding.passed
    )
|
|
|
|
def _latency_percentile_ms(
    results: list[ChatBenchmarkResult],
    *,
    percentile: float,
) -> float:
    """Pick the latency at the requested percentile from the sorted latencies.

    The position is ``round((count - 1) * percentile / 100)`` clamped to the
    valid index range, so out-of-range percentiles select the smallest or the
    largest latency. Returns ``0.0`` for an empty result list; otherwise the
    selected latency rounded to one decimal.
    """
    if not results:
        return 0.0
    ordered = sorted(item.latency_ms for item in results)
    last_index = len(ordered) - 1
    position = round(last_index * (percentile / 100.0))
    if position > last_index:
        position = last_index
    if position < 0:
        position = 0
    return round(float(ordered[position]), 1)
|
|
|
|
def _summarize_judge_results(evals: list[ChatEvalResult]) -> dict[str, Any]:
    """Aggregate the judge verdicts attached to a list of eval results.

    Evals without a judge verdict are ignored. The summary holds the mean
    judge ``overall`` score, the fraction of verdicts that passed, the mean
    score of each judge dimension and a count per failure reason (sorted by
    reason). All numeric values are rounded to three decimals. When no eval
    has a judge verdict, every value is zero and no failure reasons are listed.
    """
    dimensions = (
        "task_completion",
        "instruction_following",
        "grounding",
        "safety",
        "multi_turn_continuity",
        "code_quality",
        "regression_risk",
    )
    verdicts = [entry.judge for entry in evals if entry.judge is not None]
    if not verdicts:
        return {
            "overall": 0.0,
            "pass_rate": 0.0,
            "dimension_scores": dict.fromkeys(dimensions, 0.0),
            "failure_reasons": {},
        }

    reason_counts: dict[str, int] = {}
    for reason in (text for verdict in verdicts for text in verdict.failure_reasons):
        reason_counts[reason] = reason_counts.get(reason, 0) + 1

    overall = round(mean(verdict.overall for verdict in verdicts), 3)
    passed_count = len([verdict for verdict in verdicts if verdict.passed])
    pass_rate = round(passed_count / len(verdicts), 3)

    dimension_scores: dict[str, float] = {}
    for name in dimensions:
        scores = [float(getattr(verdict, name).score) for verdict in verdicts]
        dimension_scores[name] = round(mean(scores), 3)

    return {
        "overall": overall,
        "pass_rate": pass_rate,
        "dimension_scores": dimension_scores,
        "failure_reasons": dict(sorted(reason_counts.items())),
    }
|
|
|
|
| def _utc_timestamp() -> str: |
| return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") |
|
|
|
|
def _normalize_metric_map(value: Any) -> dict[str, float]:
    """Coerce an arbitrary value into a ``{str: float}`` metric mapping.

    Anything that is not a dict yields an empty mapping. Keys are stringified,
    values are converted with ``float`` and rounded to three decimals; entries
    whose value raises ``TypeError`` or ``ValueError`` on conversion are dropped.
    """
    cleaned: dict[str, float] = {}
    if isinstance(value, dict):
        for name, raw in value.items():
            try:
                number = float(raw)
            except (TypeError, ValueError):
                continue
            cleaned[str(name)] = round(number, 3)
    return cleaned
|
|
|
|
def _extract_benchmark_reference_metrics(reference: dict[str, Any] | None) -> dict[str, Any] | None:
    """Project a benchmark manifest onto the fixed set of fields kept for comparison.

    Returns ``None`` when ``reference`` is not a dict. Otherwise builds a new
    dict, in a fixed key order, containing: stripped identity strings, a
    ``generated_at`` value (the current UTC timestamp when the manifest has
    none), normalized metric maps, float headline metrics, the human-eval
    cadence string and integer case counts. Missing or falsy numeric fields
    become zero.
    """
    if not isinstance(reference, dict):
        return None

    identity_fields = ("benchmark_name", "branch", "model")
    metric_map_fields = (
        "score_manifest",
        "category_scores",
        "failure_bucket_scores",
        "tag_scores",
        "tag_pass_rates",
        "execution_language_pass_rates",
        "execution_language_scores",
        "category_execution_pass_rates",
        "memory_quality_scores",
        "memory_quality_pass_rates",
        "risk_level_scores",
    )
    float_fields = (
        "average_overall_score",
        "average_latency_ms",
        "average_tokens_used",
        "success_rate",
        "execution_pass_rate",
        "grounding_pass_rate",
        "memory_retrieval_pass_rate",
        "production_like_pass_rate",
    )
    count_fields = ("total_cases", "failed_cases")

    # Insertion order below mirrors the layout of the emitted snapshot.
    snapshot: dict[str, Any] = {}
    for field in identity_fields:
        snapshot[field] = str(reference.get(field, "")).strip()
    snapshot["generated_at"] = str(reference.get("generated_at") or _utc_timestamp())
    for field in metric_map_fields:
        snapshot[field] = _normalize_metric_map(reference.get(field))
    for field in float_fields:
        snapshot[field] = float(reference.get(field, 0.0) or 0.0)
    snapshot["human_eval_cadence"] = str(reference.get("human_eval_cadence", "") or "")
    for field in count_fields:
        snapshot[field] = int(reference.get(field, 0) or 0)
    return snapshot
|
|
|
|
def _build_benchmark_history_run(manifest: dict[str, Any]) -> dict[str, Any]:
    """Turn a benchmark manifest into a history-run snapshot.

    Raises:
        ValueError: if the manifest cannot be extracted (it is not a dict).
    """
    snapshot = _extract_benchmark_reference_metrics(manifest)
    if snapshot is not None:
        return snapshot
    raise ValueError("Benchmark manifest history snapshotam jābūt objektam.")
|
|
|
|
def _coerce_history_runs(history: dict[str, Any] | None) -> list[dict[str, Any]]:
    """Return shallow copies of the dict entries stored under ``history["runs"]``.

    Yields an empty list when ``history`` is not a dict or its ``runs`` value
    is not a list; non-dict entries inside the list are skipped.
    """
    entries = history.get("runs") if isinstance(history, dict) else None
    if isinstance(entries, list):
        return [dict(entry) for entry in entries if isinstance(entry, dict)]
    return []
|
|
|
|
def _same_benchmark_identity(left: dict[str, Any], right: dict[str, Any]) -> bool:
    """Check whether two runs share benchmark name, branch, model and timestamp.

    Each field is read with an empty-string default, stringified and stripped
    before comparison.
    """
    identity_fields = ("benchmark_name", "branch", "model", "generated_at")
    return all(
        str(left.get(field, "")).strip() == str(right.get(field, "")).strip()
        for field in identity_fields
    )
|
|
|
|
def _select_previous_history_baseline(
    runs: list[dict[str, Any]], current_run: dict[str, Any]
) -> dict[str, Any] | None:
    """Pick the history run to compare the current run against.

    The newest stored run is the baseline unless it has the same identity as
    ``current_run``; in that case the run before it is used. Returns ``None``
    when no suitable run exists.
    """
    if not runs:
        return None
    newest = runs[-1]
    if not _same_benchmark_identity(newest, current_run):
        return newest
    if len(runs) > 1:
        return runs[-2]
    return None
|
|
|
|
def _calculate_metric_deltas(
    current_metrics: dict[str, float],
    previous_metrics: dict[str, float],
) -> dict[str, dict[str, float]]:
    """Compare two metric maps on the keys they have in common.

    For every shared key (in sorted order) the result holds the current value,
    the previous value and ``current - previous``, each rounded to three
    decimals. Keys present in only one map are left out.
    """
    deltas: dict[str, dict[str, float]] = {}
    for name in sorted(current_metrics.keys() & previous_metrics.keys()):
        now = current_metrics[name]
        before = previous_metrics[name]
        deltas[name] = {
            "current": round(now, 3),
            "previous": round(before, 3),
            "delta": round(now - before, 3),
        }
    return deltas
|
|
|
|
def _filter_negative_deltas(
    deltas: dict[str, dict[str, float]],
) -> dict[str, dict[str, float]]:
    """Keep only the entries whose ``delta`` is strictly below zero.

    A missing or falsy ``delta`` is treated as ``0.0`` and therefore dropped.
    The kept payloads are the same objects as in the input mapping.
    """
    regressions: dict[str, dict[str, float]] = {}
    for name, entry in deltas.items():
        change = float(entry.get("delta", 0.0) or 0.0)
        if change < 0.0:
            regressions[name] = entry
    return regressions
|
|
|
|
def _resolve_human_eval_cadence(
    *,
    benchmark_name: str,
    branch: str,
    human_eval_summary: dict[str, Any] | None,
) -> str:
    """Decide how often human evaluation should run for a benchmark.

    Args:
        benchmark_name: Benchmark name; matched case-insensitively for "memory".
        branch: Branch name; compared case-insensitively after stripping.
        human_eval_summary: Optional summary whose non-empty ``cadence`` value
            takes precedence over the defaults.

    Returns:
        The explicit cadence from ``human_eval_summary`` when present,
        ``"weekly + pre-release"`` for memory benchmarks and the ``master``
        branch, and ``"per release"`` for everything else.
    """
    if isinstance(human_eval_summary, dict):
        cadence = str(human_eval_summary.get("cadence", "") or "").strip()
        if cadence:
            return cadence
    normalized_benchmark = benchmark_name.strip().lower()
    normalized_branch = branch.strip().lower()
    if "memory" in normalized_benchmark or normalized_branch == "master":
        return "weekly + pre-release"
    # The former explicit check for the "coder"/"planner" branches returned the
    # same value as this fall-through, so it was redundant and has been dropped.
    return "per release"
|
|
|
|
def _build_benchmark_trend_summary(runs: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Summarize how each tracked metric moved across the given runs.

    For every name in ``TREND_TRACKED_METRICS`` that has at least one readable
    value, the summary records the last value (``latest``), the first value
    (``baseline``), their difference and up to the five most recent values.
    Values are rounded to three decimals; metrics with no readable value are
    omitted.
    """
    trends: dict[str, dict[str, Any]] = {}
    if not runs:
        return trends
    for metric in TREND_TRACKED_METRICS:
        readings = (_history_metric_value(run, metric) for run in runs)
        series = [round(reading, 3) for reading in readings if reading is not None]
        if series:
            first, last = series[0], series[-1]
            trends[metric] = {
                "latest": last,
                "baseline": first,
                "delta": round(last - first, 3),
                "recent_values": series[-5:],
            }
    return trends
|
|
|
|
def _history_metric_value(run: dict[str, Any], metric: str) -> float | None:
    """Read a metric from a history run as a float.

    A top-level key on the run takes precedence; otherwise the metric is looked
    up in the run's ``score_manifest`` dict. Returns ``None`` when the metric
    is absent or when the value found cannot be converted to ``float`` (a
    non-convertible top-level value does not fall back to ``score_manifest``).
    """
    if metric in run:
        raw = run[metric]
    else:
        manifest = run.get("score_manifest")
        if not isinstance(manifest, dict) or metric not in manifest:
            return None
        raw = manifest[metric]
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None
|
|