import os
from typing import Optional

from dotenv import load_dotenv

load_dotenv()


def evaluate_answer(
    question: str,
    answer: str,
    contexts: list[str],
) -> Optional[dict]:
    """
    Evaluate a RAG answer using RAGAS metrics.

    Runs three metrics:
    - Faithfulness: Does the answer only say things supported by the chunks?
    - Answer Relevancy: Does the answer actually address the question?
    - Context Precision: Were the retrieved chunks relevant to the question?

    Args:
        question: The user's original question
        answer: The answer generated by the RAG pipeline
        contexts: List of text chunks that were retrieved from ChromaDB

    Returns:
        A dict with one rounded score per metric, or None if evaluation fails.
    """
    try:
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import (
            faithfulness,
            answer_relevancy,
            context_precision,
        )
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        # Judge LLM for the RAGAS metrics, routed through OpenRouter.
        ragas_llm = ChatOpenAI(
            model=os.getenv("OPENROUTER_MODEL", "anthropic/claude-3-haiku"),
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            temperature=0,
        )

        # Embedding model used by answer_relevancy. If your OpenRouter key
        # does not serve the /embeddings endpoint, point this at the OpenAI
        # API instead.
        ragas_embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
        )

        # RAGAS expects a single-row dataset with these column names.
        data = {
            "question": [question],
            "answer": [answer],
            # contexts is a list of lists: one list of chunks per row.
            "contexts": [contexts],
            # context_precision expects a ground_truth column; there is no
            # reference answer here, so pass an empty string.
            "ground_truth": [""],
        }

        dataset = Dataset.from_dict(data)

        # Run all three metrics; raise_exceptions=False keeps a single
        # failing metric from aborting the whole evaluation.
        result = evaluate(
            dataset=dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
            ],
            llm=ragas_llm,
            embeddings=ragas_embeddings,
            raise_exceptions=False,
        )

        # One row in, one row out: pull that row's scores into a plain dict.
        scores = result.to_pandas().iloc[0].to_dict()

        return {
            "faithfulness": round(float(scores.get("faithfulness", 0)), 2),
            "answer_relevancy": round(float(scores.get("answer_relevancy", 0)), 2),
            "context_precision": round(float(scores.get("context_precision", 0)), 2),
        }

    except Exception as e:
        print(f"RAGAS evaluation error: {e}")
        return None


def get_score_emoji(score: float) -> str:
    """Map a 0-1 score to a traffic-light emoji."""
    if score >= 0.75:
        return "🟢"
    elif score >= 0.5:
        return "🟡"
    else:
        return "🔴"


def format_score_bar(score: float, width: int = 10) -> str:
    """Render a 0-1 score as a fixed-width text bar, e.g. "███░░░░░░░"."""
    filled = int(score * width)
    empty = width - filled
    return "█" * filled + "░" * empty
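

if __name__ == "__main__":
    # Illustrative smoke test only: the question, answer, and contexts below
    # are made-up placeholders; real values would come from the RAG pipeline.
    sample = evaluate_answer(
        question="What is retrieval-augmented generation?",
        answer="RAG retrieves relevant chunks and passes them to an LLM.",
        contexts=["RAG combines a vector-store retriever with a generator LLM."],
    )
    if sample is None:
        print("Evaluation failed; check OPENROUTER_API_KEY and dependencies.")
    else:
        for metric, score in sample.items():
            print(f"{get_score_emoji(score)} {metric:<18} {format_score_bar(score)} {score:.2f}")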