DoAn / evaluation /ragas_eval.py

change commit

b91b0a5 18 days ago

4.66 kB

	"""Script đánh giá RAG bằng RAGAS framework."""

	import os
	import sys
	import json
	from pathlib import Path
	from datetime import datetime
	from dotenv import find_dotenv, load_dotenv

	# Repository root: the parent of the directory that contains this file.
	# It is pushed to the front of sys.path (once) so the `evaluation.*`
	# import further down resolves when the script is run directly.
	REPO_ROOT = Path(__file__).resolve().parents[1]
	if sys.path.count(str(REPO_ROOT)) == 0:
	    sys.path.insert(0, str(REPO_ROOT))
	# Load variables from a .env file located via find_dotenv(usecwd=True).
	load_dotenv(find_dotenv(usecwd=True))

	from pydantic import SecretStr
	from datasets import Dataset
	from langchain_openai import ChatOpenAI
	from ragas import evaluate
	from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, RougeScore
	from ragas.llms import LangchainLLMWrapper
	from ragas.embeddings import LangchainEmbeddingsWrapper
	from ragas.run_config import RunConfig

	from evaluation.eval_utils import load_csv_data, init_rag, generate_answers

	# Configuration
	# CSV_PATH and OUTPUT_DIR are joined onto REPO_ROOT by run_evaluation.
	CSV_PATH = "data/data.csv" # Test data file
	OUTPUT_DIR = "evaluation/results" # Output directory for result files
	# Model name used both for answer generation and as the RAGAS evaluator LLM;
	# can be overridden with the EVAL_LLM_MODEL environment variable.
	LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1") # Evaluation model
	# Base URL handed to ChatOpenAI for the evaluator LLM.
	API_BASE = "https://api.siliconflow.com/v1"


	def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
	    """Run the RAGAS evaluation on the test data and save the averaged scores.

	    The pipeline loads question/ground-truth pairs from ``CSV_PATH``,
	    generates answers and contexts through ``generate_answers``, scores them
	    with RAGAS, then writes a JSON report and a one-row CSV summary under
	    ``OUTPUT_DIR`` (both resolved against ``REPO_ROOT``).

	    Args:
	        sample_size: Forwarded to ``load_csv_data`` (presumably the number of
	            rows to evaluate -- confirm against ``eval_utils``).
	        retrieval_mode: Forwarded to ``generate_answers`` and embedded in the
	            output file names and the printed banners.

	    Returns:
	        A dict mapping each metric column to the mean of its non-null
	        per-sample scores. Columns without any non-null value are omitted.
	    """
	    print(f"\n{'='*60}")
	    print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
	    print(f"{'='*60}")

	    # Initialise the RAG components.
	    rag, embeddings, llm_client = init_rag()

	    # Load the test data.
	    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
	    print(f" Đã tải {len(questions)} samples")

	    # Generate an answer (and its retrieved contexts) for every question.
	    answers, contexts = generate_answers(
	        rag, questions, llm_client,
	        llm_model=LLM_MODEL,
	        retrieval_mode=retrieval_mode,
	    )

	    # Set up the RAGAS evaluator.
	    # NOTE(review): when SILICONFLOW_API_KEY is unset an empty key is passed
	    # through; combined with raise_exceptions=False below, failures may only
	    # surface as missing scores -- confirm this is acceptable.
	    api_key = os.getenv("SILICONFLOW_API_KEY", "")
	    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
	        model=LLM_MODEL,
	        api_key=SecretStr(api_key),
	        base_url=API_BASE,
	        temperature=0,
	        timeout=120,
	        max_retries=3,
	    ))
	    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

	    # Convert the data into the Dataset format.
	    dataset = Dataset.from_dict({
	        "question": questions,
	        "answer": answers,
	        "contexts": contexts,
	        "ground_truth": ground_truths,
	    })

	    # Run the RAGAS metrics.
	    # NOTE(review): the three RougeScore instances may report under the same
	    # metric name, in which case their result columns would collide -- verify
	    # against the installed ragas version.
	    print("\n Đang chạy RAGAS metrics...")
	    results = evaluate(
	        dataset=dataset,
	        metrics=[
	            faithfulness,  # faithfulness to the context
	            answer_relevancy,  # relevance of the answer
	            context_precision,  # precision of the context
	            context_recall,  # coverage (recall) of the context
	            RougeScore(rouge_type='rouge1', mode='fmeasure'),  # ROUGE-1
	            RougeScore(rouge_type='rouge2', mode='fmeasure'),  # ROUGE-2
	            RougeScore(rouge_type='rougeL', mode='fmeasure'),  # ROUGE-L
	        ],
	        llm=evaluator_llm,
	        embeddings=evaluator_embeddings,
	        raise_exceptions=False,
	        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
	    )

	    # Extract the scores: every column that is not one of the known input
	    # columns (old or new naming) is treated as a metric column.
	    df = results.to_pandas()
	    input_cols = (
	        "question", "answer", "contexts", "ground_truth",
	        "user_input", "response", "reference", "retrieved_contexts",
	    )
	    metric_cols = [c for c in df.columns if c not in input_cols]

	    # Average each metric over the samples that produced a value.
	    avg_scores = {}
	    for col in metric_cols:
	        values = df[col].dropna().tolist()
	        if values:
	            avg_scores[col] = sum(values) / len(values)

	    # Save the results.
	    out_path = REPO_ROOT / OUTPUT_DIR
	    out_path.mkdir(parents=True, exist_ok=True)
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

	    # JSON report: summary plus per-sample rows. `json_path` used to be
	    # referenced by the final print without ever being defined (NameError).
	    # The rows go through DataFrame.to_json so NaN becomes null.
	    json_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.json"
	    report = {
	        "retrieval_mode": retrieval_mode,
	        "sample_size": len(questions),
	        "llm_model": LLM_MODEL,
	        "timestamp": timestamp,
	        "avg_scores": avg_scores,
	        "details": json.loads(df.to_json(orient="records", force_ascii=False)),
	    }
	    with open(json_path, 'w', encoding='utf-8') as f:
	        json.dump(report, f, ensure_ascii=False, indent=2)

	    # CSV file (summary). Joining full lists avoids a trailing comma when
	    # no metric produced a score.
	    csv_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.csv"
	    header = ["retrieval_mode", "sample_size", *avg_scores.keys()]
	    row = [retrieval_mode, str(len(questions)), *(f"{v:.4f}" for v in avg_scores.values())]
	    with open(csv_path, 'w', encoding='utf-8') as f:
	        f.write(",".join(header) + "\n")
	        f.write(",".join(row) + "\n")

	    # Print the results.
	    print(f"\n{'='*60}")
	    print(f"KẾT QUẢ - {retrieval_mode} ({len(questions)} samples)")
	    print(f"{'='*60}")
	    for metric, score in avg_scores.items():
	        # Clamp to the 20-cell bar so an out-of-range score cannot distort it.
	        filled = max(0, min(20, int(score * 20)))
	        bar = "#" * filled + "-" * (20 - filled)
	        print(f" {metric:25} [{bar}] {score:.4f}")

	    print(f"\nĐã lưu: {json_path}")
	    print(f"Đã lưu: {csv_path}")

	    return avg_scores