File size: 3,272 Bytes
a32fa97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# core/evaluators.py

import json
import logging
from typing import List, Optional, Tuple

from core.constants import AVAILABLE_JUDGES, DEFAULT_WEIGHTS, METRIC_NAMES
from core.providers import JudgeProvider
from core.schema import HealthEvalInput, HealthEvalOutput


class HealthEvalEvaluator:
    """
    Evaluates conversations against health-related metrics using judge models.

    Each selected judge is asked (through the injected provider) to rate a
    query/response pair. The judge's JSON reply is parsed into one score per
    metric, and the scores are combined into a single weighted average.
    """

    # JSON keys read from a judge's reply. Their order fixes the order of the
    # parsed ``scores`` list, and therefore which weight applies to which metric.
    _SCORE_KEYS = (
        "Evidence & Transparency Fit",
        "Clinical Safety & Escalation",
        "Empathy & Relationship Quality",
        "Clarity & Comprehension",
        "Plan Quality & Behavior Support",
        "Trust, Explainability & User Agency",
    )

    def __init__(self, judge_provider: "JudgeProvider"):
        """
        Args:
            judge_provider: Object whose ``ask_model(model=, query=, response=)``
                method returns a ``(raw_response, tokens)`` pair.
        """
        self.judge_provider = judge_provider
        logging.debug("HealthEvalEvaluator initialized with JudgeProvider")

    def evaluate(
        self,
        input_data: "HealthEvalInput",
        weights: Optional[List[float]] = None,
        selected_judges: Optional[List[str]] = None
    ) -> "HealthEvalOutput":
        """
        Run evaluation across selected judges and return aggregated scores.

        Args:
            input_data: Supplies the ``query`` and ``response`` sent to each judge.
            weights: One weight per metric, paired positionally with the parsed
                scores. ``None`` uses a copy of ``DEFAULT_WEIGHTS``.
            selected_judges: Keys of ``AVAILABLE_JUDGES`` to consult. ``None`` or
                an empty list selects every available judge.

        Returns:
            A ``HealthEvalOutput`` built from the query, the weights used, the
            judges consulted and, per judge, a dict with the keys ``response``,
            ``tokens``, ``scores``, ``total_score`` and ``comment``.

        Raises:
            KeyError: If any selected judge is not a key of ``AVAILABLE_JUDGES``.
                Raised before any judge is queried.
        """
        if weights is None:
            # Copy so the module-level default is never handed out by reference.
            weights = list(DEFAULT_WEIGHTS)

        if not selected_judges:
            selected_judges = list(AVAILABLE_JUDGES.keys())

        # Reject bad names up front instead of failing part-way through the loop,
        # after other judges have already been called.
        unknown = [name for name in selected_judges if name not in AVAILABLE_JUDGES]
        if unknown:
            raise KeyError(
                f"Unknown judge(s): {unknown}. Available: {list(AVAILABLE_JUDGES)}"
            )

        logging.debug(
            "Running evaluation with judges=%s weights=%s", selected_judges, weights
        )

        # Collect results
        model_scores = {}
        for judge in selected_judges:
            raw_response, tokens = self.judge_provider.ask_model(
                model=AVAILABLE_JUDGES[judge],
                query=input_data.query,
                response=input_data.response
            )

            scores, comment = self._parse_model_output(raw_response)

            model_scores[judge] = {
                "response": raw_response,
                "tokens": tokens,
                "scores": scores,
                "total_score": self._weighted_total(scores, weights),
                "comment": comment,
            }

        # Build HealthEvalOutput
        output = HealthEvalOutput(
            query=input_data.query,
            weights=weights,
            selected_judges=selected_judges,
            models=model_scores
        )

        logging.debug("Evaluation completed: %s", output)
        return output

    @staticmethod
    def _weighted_total(scores: List[float], weights: List[float]) -> float:
        """
        Weighted average of ``scores``: ``sum(s * w) / sum(weights)``.

        Returns 0.0 when every score is zero (which includes the parse-failure
        fallback) or when the weights sum to zero.
        """
        if not any(scores):
            return 0.0

        weight_sum = sum(weights)
        if weight_sum == 0:
            # Dividing would raise ZeroDivisionError; report "no score" instead.
            logging.warning("Weights sum to zero; reporting total_score=0.0")
            return 0.0

        if len(scores) != len(weights):
            # zip() below silently drops the unpaired tail, so make it visible.
            logging.warning(
                "Got %d scores but %d weights; unpaired entries are ignored",
                len(scores),
                len(weights),
            )

        return sum(s * w for s, w in zip(scores, weights)) / weight_sum

    def _parse_model_output(self, raw_response: str) -> Tuple[List[float], str]:
        """
        Parse JSON response from model into scores and comment.

        Args:
            raw_response: Text returned by the judge; expected to be a JSON
                object keyed by the names in ``_SCORE_KEYS`` plus ``"Comment"``.

        Returns:
            ``(scores, comment)``. A metric missing from the reply scores 0.0 and
            a missing comment is ``""``. If the reply cannot be parsed at all
            (invalid JSON, not a JSON object, non-numeric score, or not even a
            string), the error is logged and the result is all-zero scores with a
            comment that describes the failure.
        """
        try:
            parsed = json.loads(raw_response)
            if not isinstance(parsed, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(parsed).__name__}"
                )

            scores = [float(parsed.get(key, 0.0)) for key in self._SCORE_KEYS]
            comment = parsed.get("Comment", "")
            return scores, comment

        except Exception as e:
            logging.error(
                "Failed to parse model output: %s\nRaw response: %s", e, raw_response
            )
            # str() keeps this handler from raising when raw_response is not
            # sliceable (e.g. None).
            preview = str(raw_response)[:200]
            # NOTE(review): presumably METRIC_NAMES has one entry per key in
            # _SCORE_KEYS, so both paths return the same length - confirm.
            return [0.0] * len(METRIC_NAMES), f"Parsing error: {e}. Raw: {preview}..."