File size: 3,862 Bytes
266d7bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from opik.evaluation import models
from opik.evaluation.metrics import GEval

from src.config import settings
from src.utils.logger_util import setup_logging

logger = setup_logging()

# -----------------------
# Evaluation helper
# -----------------------


async def evaluate_metrics(output: str, context: str) -> dict:
    """Evaluate multiple metrics for a given LLM output.

    Metrics included: faithfulness, coherence, completeness.

    Args:
        output (str): The LLM-generated output to evaluate.
        context (str): The context used to generate the output.

    Returns:
        dict: A dictionary with metric names as keys and their evaluation
            results as values. Each result contains "score", "reason",
            and "failed" keys.

    """
    # Guard: nothing to judge when the model produced no text.
    # Report a hard failure (score 0.0) for every metric.
    if not output.strip():
        logger.warning("Output is empty. Skipping evaluation.")
        return {
            "faithfulness": {"score": 0.0, "reason": "Empty output", "failed": True},
            "coherence": {"score": 0.0, "reason": "Empty output", "failed": True},
            "completeness": {"score": 0.0, "reason": "Empty output", "failed": True},
        }

    # Without an API key the judge model cannot be called; report a skip
    # (score None) rather than a failure score of 0.0 so callers can tell
    # "not evaluated" apart from "evaluated badly".
    if not getattr(settings.openai, "api_key", None):
        logger.info("OpenAI API key not set. Skipping metrics evaluation.")
        return {
            "faithfulness": {"score": None, "reason": "Skipped – no API key", "failed": True},
            "coherence": {"score": None, "reason": "Skipped – no API key", "failed": True},
            "completeness": {"score": None, "reason": "Skipped – no API key", "failed": True},
        }

    judge_model = models.LiteLLMChatModel(
        model_name="gpt-4o",  # gpt-4o, gpt-5-mini
        api_key=settings.openai.api_key,
    )

    # Per metric: (task_introduction, evaluation_criteria) fed to G-Eval.
    metric_configs = {
        "faithfulness": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer is "
                "faithful to the provided Substack excerpts."
            ),
            (
                "The OUTPUT must not introduce new information beyond "
                "what is contained in the CONTEXT. "
                "All claims in the OUTPUT should be directly supported by the CONTEXT."
            ),
        ),
        "coherence": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer is "
                "logically coherent."
            ),
            "The answer should be well-structured, readable, and maintain consistent reasoning.",
        ),
        "completeness": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer "
                "covers all relevant aspects of the query."
            ),
            (
                "The answer should include all major points from the CONTEXT "
                "and address the user's "
                "query "
                "fully."
            ),
        ),
    }

    results = {}
    for name, (task_intro, eval_criteria) in metric_configs.items():
        try:
            metric = GEval(
                task_introduction=task_intro,
                evaluation_criteria=eval_criteria,
                model=judge_model,
                name=f"G-Eval {name.capitalize()}",
            )

            eval_input = f"""
            OUTPUT: {output}
            CONTEXT: {context}
            """

            score_result = await metric.ascore(eval_input)

            results[name] = {
                "score": score_result.value,
                "reason": score_result.reason,
                "failed": score_result.scoring_failed,
            }

        except Exception as e:
            # One metric failing must not abort the others: record the
            # error against this metric and continue the loop.
            logger.warning(f"G-Eval {name} failed: {e}")
            results[name] = {"score": 0.0, "reason": str(e), "failed": True}

    return results