File size: 3,862 Bytes
266d7bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from opik.evaluation import models
from opik.evaluation.metrics import GEval

from src.config import settings
from src.utils.logger_util import setup_logging

logger = setup_logging()

# -----------------------
# Evaluation helper
# -----------------------


async def evaluate_metrics(output: str, context: str) -> dict:
    """Evaluate multiple metrics for a given LLM output.

    Metrics included: faithfulness, coherence, completeness.

    Args:
        output (str): The LLM-generated output to evaluate.
        context (str): The context used to generate the output.

    Returns:
        dict: A dictionary with metric names as keys and their evaluation
            results as values. Each result contains "score", "reason",
            and "failed" keys.

    """
    # Guard: nothing to judge when the model produced no text.
    # Report a hard failure (score 0.0) for every metric.
    if not output.strip():
        logger.warning("Output is empty. Skipping evaluation.")
        return {
            "faithfulness": {"score": 0.0, "reason": "Empty output", "failed": True},
            "coherence": {"score": 0.0, "reason": "Empty output", "failed": True},
            "completeness": {"score": 0.0, "reason": "Empty output", "failed": True},
        }

    # Without an API key the judge model cannot be called; report a skip
    # (score None) rather than a failure score of 0.0 so callers can tell
    # "not evaluated" apart from "evaluated badly".
    if not getattr(settings.openai, "api_key", None):
        logger.info("OpenAI API key not set. Skipping metrics evaluation.")
        return {
            "faithfulness": {"score": None, "reason": "Skipped – no API key", "failed": True},
            "coherence": {"score": None, "reason": "Skipped – no API key", "failed": True},
            "completeness": {"score": None, "reason": "Skipped – no API key", "failed": True},
        }

    judge_model = models.LiteLLMChatModel(
        model_name="gpt-4o",  # gpt-4o, gpt-5-mini
        api_key=settings.openai.api_key,
    )

    # Per metric: (task_introduction, evaluation_criteria) fed to G-Eval.
    metric_configs = {
        "faithfulness": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer is "
                "faithful to the provided Substack excerpts."
            ),
            (
                "The OUTPUT must not introduce new information beyond "
                "what is contained in the CONTEXT. "
                "All claims in the OUTPUT should be directly supported by the CONTEXT."
            ),
        ),
        "coherence": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer is "
                "logically coherent."
            ),
            "The answer should be well-structured, readable, and maintain consistent reasoning.",
        ),
        "completeness": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer "
                "covers all relevant aspects of the query."
            ),
            (
                "The answer should include all major points from the CONTEXT "
                "and address the user's "
                "query "
                "fully."
            ),
        ),
    }

    results = {}
    for name, (task_intro, eval_criteria) in metric_configs.items():
        try:
            metric = GEval(
                task_introduction=task_intro,
                evaluation_criteria=eval_criteria,
                model=judge_model,
                name=f"G-Eval {name.capitalize()}",
            )

            eval_input = f"""
            OUTPUT: {output}
            CONTEXT: {context}
            """

            score_result = await metric.ascore(eval_input)

            results[name] = {
                "score": score_result.value,
                "reason": score_result.reason,
                "failed": score_result.scoring_failed,
            }

        except Exception as e:
            # One metric failing must not abort the others: record the
            # error against this metric and continue the loop.
            logger.warning(f"G-Eval {name} failed: {e}")
            results[name] = {"score": 0.0, "reason": str(e), "failed": True}

    return results