"""RAGAS-based scoring of RAG answers, plus small helpers to render scores."""

import math
import os
from typing import Optional

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional: without it, configuration is read straight
    # from the process environment, and the pure formatting helpers below
    # stay importable.
    load_dotenv = None

if load_dotenv is not None:
    load_dotenv()

# Result columns read from the RAGAS output, in the order they are reported.
_METRIC_NAMES = ("faithfulness", "answer_relevancy", "context_precision")


def _parse_score(raw) -> Optional[float]:
    """Return *raw* as a float rounded to 2 decimals, or None if unusable.

    "Unusable" means the value is missing (None), cannot be converted to a
    float, or is NaN.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return None
    if math.isnan(value):
        return None
    return round(value, 2)


def evaluate_answer(
    question: str,
    answer: str,
    contexts: list[str],
) -> Optional[dict]:
    """
    Evaluate a RAG answer using RAGAS metrics.

    Runs three metrics:
    - Faithfulness: Does the answer only say things supported by the chunks?
    - Answer Relevancy: Does the answer actually address the question?
    - Context Precision: Were the retrieved chunks relevant to the question?

    Args:
        question: The user's original question
        answer: The answer generated by the RAG pipeline
        contexts: List of text chunks that were retrieved from ChromaDB

    Returns:
        A dict with the keys "faithfulness", "answer_relevancy" and
        "context_precision", each a float rounded to two decimals. A metric
        whose value is missing or NaN is reported as 0.0 and named in a
        printed message. Returns None (after printing the error) if the
        evaluation itself raises, including when the optional evaluation
        dependencies cannot be imported.
    """
    try:
        # Imported lazily so a missing or broken evaluation stack degrades to
        # "no scores" instead of breaking import of this module.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import (
            faithfulness,
            answer_relevancy,
            context_precision,
        )
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        api_key = os.getenv("OPENROUTER_API_KEY")
        base_url = "https://openrouter.ai/api/v1"

        ragas_llm = ChatOpenAI(
            model=os.getenv("OPENROUTER_MODEL", "anthropic/claude-3-haiku"),
            api_key=api_key,
            base_url=base_url,
            temperature=0,
        )
        ragas_embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            api_key=api_key,
            base_url=base_url,
        )

        # RAGAS expects data in Dataset format.
        data = {
            "question": [question],
            "answer": [answer],
            # contexts must be a list of lists (one list of chunks per question)
            "contexts": [contexts],
            # ground_truth is optional, we skip it since we have no labeled data
            # NOTE(review): context_precision is normally judged against a
            # reference answer; with an empty ground_truth its score may not
            # be meaningful - confirm against the installed ragas version.
            "ground_truth": [""],
        }
        dataset = Dataset.from_dict(data)

        # Run evaluation. With raise_exceptions=False a metric that fails does
        # not raise; its cell in the result is NaN instead.
        result = evaluate(
            dataset=dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
            ],
            llm=ragas_llm,
            embeddings=ragas_embeddings,
            raise_exceptions=False,
        )

        # Extract scores (single-row dataset -> first row of the frame).
        row = result.to_pandas().iloc[0].to_dict()

        report = {}
        for metric_name in _METRIC_NAMES:
            parsed = _parse_score(row.get(metric_name))
            if parsed is None:
                # Keep one failed metric from leaking NaN to callers or
                # discarding the metrics that did succeed.
                print("RAGAS metric unavailable, reporting 0.0: " + metric_name)
                parsed = 0.0
            report[metric_name] = parsed
        return report

    except Exception as e:
        # Deliberate best-effort boundary: evaluation problems must not take
        # down the caller.
        print("RAGAS evaluation error: " + str(e))
        return None


def get_score_emoji(score: float) -> str:
    """Return a traffic-light emoji for *score*.

    Green for scores of at least 0.75, yellow for at least 0.5, red otherwise
    (NaN compares false against both thresholds and therefore maps to red).
    """
    if score >= 0.75:
        return "🟢"
    if score >= 0.5:
        return "🟡"
    return "🔴"


def format_score_bar(score: float, width: int = 10) -> str:
    """Render *score* as a text progress bar exactly *width* cells long.

    Args:
        score: Fraction to display. Values outside [0, 1] are clamped into
            that range and NaN is drawn as an empty bar.
        width: Total number of cells; negative values are treated as 0.

    Returns:
        A string of filled cells followed by empty cells.
    """
    cells = max(width, 0)
    # NaN would make int() raise, and min/max do not order it reliably.
    fraction = 0.0 if math.isnan(score) else min(max(score, 0.0), 1.0)
    # Truncate (not round) so a bar only looks full at a score of 1.0.
    filled = int(fraction * cells)
    return "█" * filled + "░" * (cells - filled)