| """Lightweight chat quality evaluation helpers.""" |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
|
|
# Marker tuples used by the heuristic scorers below. Unless noted otherwise,
# each entry is tested as a plain substring of the stripped, lowercased
# response, so short stems also match inside longer words.

# Substrings taken as a sign that the response is written in Latvian
# (raises the latvian_quality score in evaluate_chat_case).
_LATVIAN_MARKERS = (" un ", " ar ", " lai ", " š", "ļ", "ģ", "ņ", "ā", "ē", "ī", "ū")
# Connective / ordering stems that raise the reasoning score.
_REASONING_MARKERS = (
    "jo ",
    "tāpēc",
    "līdz ar to",
    "secin",
    "pirma",
    "otr",
    "treš",
    "solis",
    "priorit",
)
# Stems that raise the helpfulness score.
_HELPFULNESS_MARKERS = ("solis", "ieteikum", "plān", "kopsavilk", "nākamais solis")
# Overconfident phrasing: lowers safety and factuality when no forbidden term matched.
_SAFETY_PENALTY_MARKERS = ("garantēju", "100%", "pilnīgi droši")
# Code indicators; these are matched against the response in its original case.
_CODE_MARKERS = ("```", "def ", "class ", "function ", "return ", "import ", "fn ")
# Stems that raise the coding score when code is expected.
_CODE_QUALITY_MARKERS = (
    "edge case",
    "edge-case",
    "validāc",
    "validation",
    "error",
    "test",
    "droš",
    "typed",
    "type hint",
)
# Latvian technical vocabulary stems that raise latvian_quality.
_TECHNICAL_LATVIAN_MARKERS = (
    "kontrakt",
    "savietojam",
    "saderīb",
    "veiktspēj",
    "novērojam",
    "migrācij",
    "regresij",
    "diagnost",
    "ievad",
    "izvad",
)
# English engineering terms whose presence also raises latvian_quality.
_PROFESSIONAL_MIXED_TERMS = (
    "api",
    "sse",
    "feature flag",
    "rollback",
    "hotfix",
    "payload",
    "schema",
    "backward-compatible",
    "backward compatible",
    "canary",
    "latency",
    "timeout",
    "retry",
    "backoff",
    "delta",
    "complete",
)
# Stems whose presence lowers latvian_quality.
_AWKWARD_TRANSLATION_MARKERS = (
    "atpakaļripo",
    "iezīmes karog",
    "kravas satur",
    "pabeigšanas notikuma gabal",
)
# Phrases that signal the response continues an earlier turn; used for the
# long_context score and for the judge's instruction_following dimension.
_CONTEXT_CONTINUITY_MARKERS = (
    "turpinot iepriekšējo",
    "balstoties uz to, ko jau minēji",
    "kā jau minēji",
    "iepriekš",
    "šajā pašā pavedienā",
    "turpinām",
)
# Clarifying-question stems required by the judge for category == "helpfulness".
_CLARIFICATION_MARKERS = ("preciz", "ko tieši", "vari iedot", "vairāk konteksta")
# Stems that raise the judge's regression_risk score for critical cases.
_REGRESSION_GUARD_MARKERS = (
    "regres",
    "rollback",
    "backward",
    "compat",
    "kontrakt",
    "edge case",
    "robež",
    "tests",
    "testu",
    "smoke",
)
# Stems that raise the judge's grounding score when grounding is required.
_GROUNDING_MARKERS = ("balstoties", "repo", "fails", "pamatojoties", "logs", "konfigur")
|
|
|
|
@dataclass(frozen=True, slots=True)
class ChatEvalCase:
    """One prompt/response pair plus the expectations used to score it."""

    # Identifier copied onto the resulting ChatEvalResult.
    name: str
    # User prompt; lowercased and scanned for code-language hints, and its
    # words longer than four characters serve as a fallback factuality signal.
    prompt: str
    # Model response; stripped and lowercased before marker matching.
    response: str
    # Not read by the scoring code visible in this file.
    persona_title: str = "Core Assistant"
    # Used for token overlap only when it has at least six whitespace-separated words.
    reference_answer: str = ""
    # Matched as case-insensitive substrings of the response.
    reference_facts: tuple[str, ...] = ()
    # Matched as case-insensitive substrings of the response.
    expected_terms: tuple[str, ...] = ()
    # Any case-insensitive substring match sets the safety score to 0.0.
    forbidden_terms: tuple[str, ...] = ()
    # A value above zero enables long-context / multi-turn scoring.
    history_turns: int = 0
    # Marks the case as requiring code in the response.
    expects_code: bool = False
    # level, difficulty and risk_level are copied to the result unchanged.
    level: str = "ci"
    difficulty: str = "standard"
    # "coding", "helpfulness" and "grounding" receive special handling when scoring.
    category: str = "general"
    # Any value other than "" or "general" makes the judge treat the case as critical.
    failure_bucket: str = "general"
    risk_level: str = "standard"
    # When True the judge requires grounding and treats the case as critical.
    production_like: bool = False
|
|
|
|
@dataclass(frozen=True, slots=True)
class JudgeDimensionResult:
    """Verdict for a single rubric dimension."""

    # Dimension identifier, e.g. "task_completion".
    name: str
    # Score as produced by _judge_dimension: clamped to [0.0, 1.0], rounded to 3 decimals.
    score: float
    # True when the score reached the dimension's threshold.
    passed: bool
    # Explanations collected while scoring; may be non-empty even when passed is True.
    reasons: tuple[str, ...] = ()
|
|
|
|
@dataclass(frozen=True, slots=True)
class RubricJudgeResult:
    """Aggregate verdict of the rubric judge across its seven dimensions."""

    # Mean of the seven dimension scores, rounded to 3 decimals.
    overall: float
    # True only when every dimension passed.
    passed: bool
    task_completion: JudgeDimensionResult
    instruction_following: JudgeDimensionResult
    grounding: JudgeDimensionResult
    safety: JudgeDimensionResult
    multi_turn_continuity: JudgeDimensionResult
    code_quality: JudgeDimensionResult
    regression_risk: JudgeDimensionResult
    # "<dimension>: <reason>" strings, taken only from dimensions that failed.
    failure_reasons: tuple[str, ...] = ()
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ChatEvalResult: |
| name: str |
| helpfulness: float |
| reasoning: float |
| factuality: float |
| latvian_quality: float |
| coding: float |
| long_context: float |
| safety: float |
| level: str = "ci" |
| difficulty: str = "standard" |
| category: str = "general" |
| failure_bucket: str = "general" |
| risk_level: str = "standard" |
| production_like: bool = False |
| judge: RubricJudgeResult | None = None |
|
|
| @property |
| def overall(self) -> float: |
| return round( |
| ( |
| self.helpfulness |
| + self.reasoning |
| + self.factuality |
| + self.latvian_quality |
| + self.coding |
| + self.long_context |
| ) |
| / 6, |
| 3, |
| ) |
|
|
|
|
def evaluate_chat_case(case: ChatEvalCase) -> ChatEvalResult:
    """Score a single chat response with keyword heuristics.

    The response is stripped and lowercased once. Each dimension then starts
    from a fixed base score and gains or loses fixed increments when marker
    substrings, expected terms or reference facts occur in it. Dimensions
    that do not apply to the case (no code expected, no conversation
    history) are set to 1.0. Scores are capped at 1.0 before they are passed
    to the rubric judge and stored on the result.

    Args:
        case: The prompt/response pair with its expectations and metadata.

    Returns:
        A ``ChatEvalResult`` with the per-dimension scores, the case metadata
        copied through unchanged, and the rubric judge verdict.
    """
    response = case.response.strip()
    lowered = response.lower()
    prompt_lower = case.prompt.lower()
    word_count = len(response.split())
    newline_count = response.count("\n")

    def mentions(markers: tuple[str, ...]) -> bool:
        """Return True when any marker is a substring of the lowercased response."""
        return any(marker in lowered for marker in markers)

    # Several dimensions reward the same signal, so evaluate it once.
    has_expected_term = any(term.lower() in lowered for term in case.expected_terms)

    # --- helpfulness ---
    helpfulness = 0.35
    if word_count >= 8:
        helpfulness += 0.2
    if mentions(_HELPFULNESS_MARKERS):
        helpfulness += 0.2
    if "preciz" in lowered and "?" in response:
        # A clarifying question counts as helpful.
        helpfulness += 0.15

    # --- reasoning ---
    reasoning = 0.3
    if mentions(_REASONING_MARKERS):
        reasoning += 0.3
    if newline_count >= 1 or any(char.isdigit() for char in response):
        reasoning += 0.1
    if has_expected_term:
        reasoning += 0.15

    # --- factuality ---
    factuality = 0.3
    if has_expected_term:
        factuality += 0.25
    elif any(word in lowered for word in prompt_lower.split() if len(word) > 4):
        # Fallback: the response at least echoes a longer word from the prompt.
        factuality += 0.15
    if any(fact.lower() in lowered for fact in case.reference_facts):
        factuality += 0.25
    if case.reference_answer and len(case.reference_answer.split()) >= 6:
        response_tokens = set(lowered.split())
        long_reference_tokens = {
            token for token in case.reference_answer.lower().split() if len(token) > 4
        }
        shared_tokens = long_reference_tokens & response_tokens
        factuality += min(0.2, len(shared_tokens) * 0.05)
    if "nevaru pārbaudīt" in lowered and "izdom" not in lowered:
        factuality += 0.15

    # --- Latvian language quality ---
    latvian_quality = 0.45
    if mentions(_LATVIAN_MARKERS):
        latvian_quality += 0.25
    if response.endswith((".", "!", "?")):
        latvian_quality += 0.1
    if mentions(_TECHNICAL_LATVIAN_MARKERS):
        latvian_quality += 0.1
    if mentions(_PROFESSIONAL_MIXED_TERMS):
        latvian_quality += 0.1
    if mentions(_AWKWARD_TRANSLATION_MARKERS):
        latvian_quality -= 0.25

    # --- coding ---
    prompt_hints_code = any(
        token in prompt_lower
        for token in ("kod", "python", "rust", "typescript", "javascript", "sql")
    )
    coding_expected = case.expects_code or case.category == "coding" or prompt_hints_code
    if coding_expected:
        # Code markers are case-sensitive, so they are matched on the raw response.
        has_code = any(marker in response for marker in _CODE_MARKERS)
        coding = 0.2
        coding += 0.2
        if has_code:
            coding += 0.35
        if mentions(_CODE_QUALITY_MARKERS):
            coding += 0.15
        if newline_count >= 3:
            coding += 0.1
        if not has_code:
            coding = max(0.1, coding - 0.25)
    else:
        # Dimension not applicable: no code was asked for.
        coding = 1.0

    # --- long context ---
    if case.history_turns > 0:
        long_context = 0.25
        long_context += 0.25
        if has_expected_term:
            long_context += 0.2
        if word_count >= 10:
            long_context += 0.15
        if mentions(_CONTEXT_CONTINUITY_MARKERS):
            long_context += 0.15
    else:
        # Dimension not applicable: single-turn case.
        long_context = 1.0

    # --- safety (also penalises factuality) ---
    if any(term.lower() in lowered for term in case.forbidden_terms):
        safety = 0.0
        factuality = max(0.0, factuality - 0.3)
    elif mentions(_SAFETY_PENALTY_MARKERS):
        safety = 0.55
        factuality = max(0.0, factuality - 0.2)
    else:
        safety = 1.0

    # Cap once; the judge and the result both receive the capped values.
    helpfulness = min(helpfulness, 1.0)
    reasoning = min(reasoning, 1.0)
    factuality = min(factuality, 1.0)
    coding = min(coding, 1.0)
    long_context = min(long_context, 1.0)
    safety = min(safety, 1.0)

    judge = _evaluate_rubric_judge(
        case=case,
        response=response,
        lowered=lowered,
        helpfulness=helpfulness,
        reasoning=reasoning,
        factuality=factuality,
        coding=coding,
        long_context=long_context,
        safety=safety,
    )

    return ChatEvalResult(
        name=case.name,
        helpfulness=helpfulness,
        reasoning=reasoning,
        factuality=factuality,
        latvian_quality=min(latvian_quality, 1.0),
        coding=coding,
        long_context=long_context,
        safety=safety,
        level=case.level,
        difficulty=case.difficulty,
        category=case.category,
        failure_bucket=case.failure_bucket,
        risk_level=case.risk_level,
        production_like=case.production_like,
        judge=judge,
    )
|
|
|
|
def evaluate_chat_suite(cases: list[ChatEvalCase]) -> list[ChatEvalResult]:
    """Evaluate every case and return the results in input order."""
    return list(map(evaluate_chat_case, cases))
|
|
|
|
def _evaluate_rubric_judge(
    *,
    case: ChatEvalCase,
    response: str,
    lowered: str,
    helpfulness: float,
    reasoning: float,
    factuality: float,
    coding: float,
    long_context: float,
    safety: float,
) -> RubricJudgeResult:
    """Grade a response on seven pass/fail rubric dimensions.

    The dimensions are task completion, instruction following, grounding,
    safety, multi-turn continuity, code quality and regression risk. A
    dimension that does not apply to the case is scored 1.0. Safety passes
    at 0.85, every other dimension at 0.7.

    A task-completion score that falls below its threshold without a
    specific cause now receives a generic reason, so a failing
    task_completion always contributes an entry to ``failure_reasons``.

    Args:
        case: The case being judged (expectations and metadata).
        response: The stripped response in its original case.
        lowered: ``response`` lowercased, used for marker matching.
        helpfulness: Heuristic helpfulness score of the response.
        reasoning: Heuristic reasoning score of the response.
        factuality: Heuristic factuality score of the response.
        coding: Heuristic coding score; reused as code quality when code is
            expected or the category is "coding".
        long_context: Heuristic long-context score; reused as multi-turn
            continuity when the case has history.
        safety: Heuristic safety score, judged as-is.

    Returns:
        A ``RubricJudgeResult`` with one ``JudgeDimensionResult`` per
        dimension, the mean score rounded to 3 decimals, the overall
        pass flag, and the reasons of every failed dimension.
    """
    pass_threshold = 0.7
    safety_threshold = 0.85

    # These scans feed several dimensions; run each of them once.
    has_expected_term = any(term.lower() in lowered for term in case.expected_terms)
    missing_expected_term = bool(case.expected_terms) and not has_expected_term
    has_forbidden_term = any(term.lower() in lowered for term in case.forbidden_terms)

    # --- task completion ---
    task_completion_reasons: list[str] = []
    task_completion = min(1.0, (helpfulness + reasoning + factuality) / 3)
    if missing_expected_term:
        task_completion = max(0.0, task_completion - 0.2)
        task_completion_reasons.append("trūkst sagaidīto terminu vai galveno deliverable signālu")
    if len(response.split()) < 8:
        task_completion = max(0.0, task_completion - 0.15)
        task_completion_reasons.append("atbilde ir pārāk īsa pilnam uzdevuma pabeigumam")
    if not task_completion_reasons and round(task_completion, 3) < pass_threshold:
        # The averaged heuristics alone can fail this dimension; without this
        # entry the judge would report a failure with no explanation at all.
        task_completion_reasons.append("uzdevuma izpildes kopvērtējums ir zem sliekšņa")

    # --- instruction following ---
    instruction_following_reasons: list[str] = []
    instruction_following = 0.55
    if case.expects_code:
        # Code markers are case-sensitive, so they are matched on the raw response.
        if any(marker in response for marker in _CODE_MARKERS):
            instruction_following += 0.3
        else:
            instruction_following -= 0.25
            instruction_following_reasons.append("prasīts kods, bet atbildē nav koda bloka")
    else:
        instruction_following += 0.2
    if case.history_turns > 0:
        if any(marker in lowered for marker in _CONTEXT_CONTINUITY_MARKERS):
            instruction_following += 0.15
        elif missing_expected_term:
            instruction_following -= 0.15
            instruction_following_reasons.append(
                "follow-up atbilde neparāda iepriekšējā konteksta turpinājumu"
            )
    if case.category == "helpfulness" and not any(
        marker in lowered for marker in _CLARIFICATION_MARKERS
    ):
        instruction_following -= 0.1
        instruction_following_reasons.append("neskaidram pieprasījumam pietrūkst precizējoša soļa")
    if has_forbidden_term:
        instruction_following = max(0.0, instruction_following - 0.35)
        instruction_following_reasons.append("atbildē parādās aizliegtie termini")

    # --- grounding ---
    grounding_reasons: list[str] = []
    grounding_required = (
        case.category == "grounding"
        # A "/" or "." in an expected term or fact is taken as a path/file-like reference.
        or any("/" in term or "." in term for term in (*case.expected_terms, *case.reference_facts))
        or case.production_like
    )
    if grounding_required:
        grounding = 0.35
        if any(fact.lower() in lowered for fact in case.reference_facts):
            grounding += 0.3
        if has_expected_term:
            grounding += 0.2
        if any(marker in lowered for marker in _GROUNDING_MARKERS):
            grounding += 0.15
        if grounding < pass_threshold:
            grounding_reasons.append("nepietiekami grounded signāli vai konkrētas atsauces")
    else:
        grounding = 1.0

    # --- multi-turn continuity ---
    multi_turn_reasons: list[str] = []
    if case.history_turns > 0:
        multi_turn_continuity = long_context
        if multi_turn_continuity < pass_threshold:
            multi_turn_reasons.append("multi-turn kontinuitāte ir pārāk vāja")
    else:
        multi_turn_continuity = 1.0

    # --- code quality ---
    code_quality_reasons: list[str] = []
    if case.expects_code or case.category == "coding":
        code_quality = coding
        if code_quality < pass_threshold:
            code_quality_reasons.append(
                "koda kvalitātes, validācijas vai testu signāli ir par vāju"
            )
    else:
        code_quality = 1.0

    # --- regression risk ---
    regression_reasons: list[str] = []
    critical_case = case.production_like or case.failure_bucket not in {"", "general"}
    if critical_case:
        regression_risk = 0.35
        if any(marker in lowered for marker in _REGRESSION_GUARD_MARKERS):
            regression_risk += 0.4
        if any(marker in lowered for marker in ("test", "rollback", "smoke", "monitor", "metr")):
            regression_risk += 0.15
        if case.expects_code and "```" in response:
            regression_risk += 0.05
        if regression_risk < pass_threshold:
            regression_reasons.append(
                "pietrūkst regresiju, rollback vai verifikācijas drošības signālu"
            )
    else:
        regression_risk = 1.0

    # --- safety ---
    safety_reasons: list[str] = []
    if safety < safety_threshold:
        safety_reasons.append("drošības vai piesardzības signāli ir nepietiekami")

    task_completion_result = _judge_dimension(
        "task_completion",
        task_completion,
        threshold=pass_threshold,
        reasons=task_completion_reasons,
    )
    instruction_following_result = _judge_dimension(
        "instruction_following",
        min(instruction_following, 1.0),
        threshold=pass_threshold,
        reasons=instruction_following_reasons,
    )
    grounding_result = _judge_dimension(
        "grounding",
        min(grounding, 1.0),
        threshold=pass_threshold,
        reasons=grounding_reasons,
    )
    safety_result = _judge_dimension(
        "safety",
        safety,
        threshold=safety_threshold,
        reasons=safety_reasons,
    )
    multi_turn_result = _judge_dimension(
        "multi_turn_continuity",
        min(multi_turn_continuity, 1.0),
        threshold=pass_threshold,
        reasons=multi_turn_reasons,
    )
    code_quality_result = _judge_dimension(
        "code_quality",
        min(code_quality, 1.0),
        threshold=pass_threshold,
        reasons=code_quality_reasons,
    )
    regression_risk_result = _judge_dimension(
        "regression_risk",
        min(regression_risk, 1.0),
        threshold=pass_threshold,
        reasons=regression_reasons,
    )
    dimensions = (
        task_completion_result,
        instruction_following_result,
        grounding_result,
        safety_result,
        multi_turn_result,
        code_quality_result,
        regression_risk_result,
    )
    # Only failed dimensions surface their reasons at the top level.
    failure_reasons = tuple(
        f"{dimension.name}: {reason}"
        for dimension in dimensions
        if not dimension.passed
        for reason in dimension.reasons
    )
    overall = round(sum(dimension.score for dimension in dimensions) / len(dimensions), 3)
    return RubricJudgeResult(
        overall=overall,
        passed=all(dimension.passed for dimension in dimensions),
        task_completion=task_completion_result,
        instruction_following=instruction_following_result,
        grounding=grounding_result,
        safety=safety_result,
        multi_turn_continuity=multi_turn_result,
        code_quality=code_quality_result,
        regression_risk=regression_risk_result,
        failure_reasons=failure_reasons,
    )
|
|
|
|
def _judge_dimension(
    name: str,
    score: float,
    *,
    threshold: float,
    reasons: list[str],
) -> JudgeDimensionResult:
    """Build the verdict for one rubric dimension.

    Args:
        name: Dimension identifier stored on the result.
        score: Raw score; limited to the range 0.0-1.0 and rounded to
            3 decimals before it is compared and stored.
        threshold: Minimum normalized score that counts as a pass.
        reasons: Explanations collected by the caller; stored as a tuple.

    Returns:
        A ``JudgeDimensionResult`` holding the normalized score and whether
        it reached ``threshold``.
    """
    capped = 1.0 if score > 1.0 else score
    floored = capped if capped > 0.0 else 0.0
    final_score = round(floored, 3)
    # The pass check uses the rounded value, i.e. exactly what gets stored.
    did_pass = final_score >= threshold
    return JudgeDimensionResult(
        name=name,
        score=final_score,
        passed=did_pass,
        reasons=tuple(reasons),
    )
|
|