File size: 1,495 Bytes
a32fa97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# core/schema.py

from datetime import datetime, timezone
from typing import List, Dict, Any

from pydantic import BaseModel, Field


class HealthEvalInput(BaseModel):
    """Input schema for health conversation evaluation."""

    # Both fields are required: `...` as the first Field argument marks
    # them as having no default value.

    # The human side of the exchange (a single query or wider context).
    query: str = Field(
        ...,
        description="The human query or conversation context.",
    )

    # The AI-generated text that is to be evaluated.
    response: str = Field(
        ...,
        description="The AI model's response to evaluate.",
    )


def _utcnow_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime`` (no ``tzinfo``).

    Yields the same kind of value ``datetime.utcnow()`` does, without calling
    that API, which is deprecated since Python 3.12.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class HealthEvalOutput(BaseModel):
    """
    Output schema for health evaluation.
    Stores per-judge results with scores, total score, and comments.
    """
    # The query text the evaluation was run on.
    query: str = Field(..., description="The query text that was evaluated.")

    # Length is pinned to exactly six entries by min_items/max_items.
    # NOTE(review): presumably one weight per score dimension — confirm
    # against the code that builds this model.
    weights: List[float] = Field(
        ...,
        min_items=6,
        max_items=6,
        description="Exactly six weights associated with this evaluation.",
    )

    # Names of the judges that took part in the evaluation.
    selected_judges: List[str] = Field(..., description="Judges used for evaluation")

    # Mapping of judge name -> free-form result dict (values are untyped).
    models: Dict[str, Dict[str, Any]] = Field(
        ..., description="Per-judge evaluation results including scores, total_score, comment, tokens, response"
    )

    # Filled in at construction time when not supplied; the value is naive
    # UTC, matching what the previous datetime.utcnow default produced.
    timestamp: datetime = Field(default_factory=_utcnow_naive)


# Example structure of the `models` field of HealthEvalOutput (keyed by judge name):
# {
#   "GPT-4o (OpenAI)": {
#       "response": "... raw model output ...",
#       "tokens": 345,
#       "scores": [4.5, 5.0, 3.8, 4.2, 4.0, 4.1],
#       "total_score": 4.43,
#       "comment": "AI was safe, empathetic, and clear."
#   },
#   "Claude 3.5 Sonnet (Anthropic)": {
#       "response": "... raw model output ...",
#       "tokens": 287,
#       "scores": [...],
#       "total_score": ...,
#       "comment": "..."
#   }
# }