# chatpaper/src/evaluation/ragas_eval.py
import math
import os
from typing import Optional

from dotenv import load_dotenv

load_dotenv()


def evaluate_answer(
    question: str,
    answer: str,
    contexts: list[str],
) -> Optional[dict]:
"""
Evaluate a RAG answer using RAGAS metrics.
Runs three metrics:
- Faithfulness: Does the answer only say things supported by the chunks?
- Answer Relevancy: Does the answer actually address the question?
- Context Precision: Were the retrieved chunks relevant to the question?
Args:
question: The user's original question
answer: The answer generated by the RAG pipeline
contexts: List of text chunks that were retrieved from ChromaDB
"""
    try:
        # Heavy dependencies are imported lazily, so they only load when an
        # evaluation is actually requested; any import or evaluation failure
        # is caught below and surfaces as a None return.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import (
            faithfulness,
            answer_relevancy,
            context_precision,
        )
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        # LLM that RAGAS uses as the judge model, routed through OpenRouter.
        ragas_llm = ChatOpenAI(
            model=os.getenv("OPENROUTER_MODEL", "anthropic/claude-3-haiku"),
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            temperature=0,
        )
        # Embeddings used by answer_relevancy. Note this points the OpenAI
        # embeddings client at OpenRouter; if the router does not expose an
        # /embeddings endpoint, point api_key/base_url at OpenAI instead.
        ragas_embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
        )
        # RAGAS expects data in Dataset format.
        data = {
            "question": [question],
            "answer": [answer],
            # contexts must be a list of lists (one list of chunks per question).
            "contexts": [contexts],
            # We have no labeled data, so ground_truth is left as an empty
            # string; the column itself is still expected by context_precision.
            "ground_truth": [""],
        }
        dataset = Dataset.from_dict(data)
        # Run evaluation.
        result = evaluate(
            dataset=dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
            ],
            llm=ragas_llm,
            embeddings=ragas_embeddings,
            raise_exceptions=False,
        )
        # Extract scores. With raise_exceptions=False a failed metric comes
        # back as NaN, so map NaN to 0.0 before rounding.
        scores = result.to_pandas().iloc[0].to_dict()

        def clean(value) -> float:
            value = float(value)
            return 0.0 if math.isnan(value) else value

        return {
            "faithfulness": round(clean(scores.get("faithfulness", 0.0)), 2),
            "answer_relevancy": round(clean(scores.get("answer_relevancy", 0.0)), 2),
            "context_precision": round(clean(scores.get("context_precision", 0.0)), 2),
        }
    except Exception as e:
        print(f"RAGAS evaluation error: {e}")
        return None
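

# A hypothetical convenience wrapper (not in the original module), shown to
# illustrate the call pattern: each row carries one question, one answer, and
# one list of retrieved chunks, matching the list-of-lists Dataset shape used
# above. A real batch run could instead build a single Dataset with all rows,
# since RAGAS scores whole datasets in one pass.
def evaluate_answers(rows: list[dict]) -> list[Optional[dict]]:
    """Evaluate several {"question", "answer", "contexts"} rows one by one."""
    return [
        evaluate_answer(row["question"], row["answer"], row["contexts"])
        for row in rows
    ]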


def get_score_emoji(score: float) -> str:
    """Map a 0-1 score to a traffic-light emoji."""
    if score >= 0.75:
        return "🟒"
    elif score >= 0.5:
        return "🟑"
    else:
        return "πŸ”΄"


def format_score_bar(score: float, width: int = 10) -> str:
    """Render a 0-1 score as a fixed-width block-character bar."""
    # Clamp so out-of-range scores cannot over- or under-fill the bar.
    filled = int(max(0.0, min(1.0, score)) * width)
    empty = width - filled
    return "β–ˆ" * filled + "β–‘" * empty