|
|
"""Script đánh giá RAG bằng RAGAS framework.""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import json |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
from dotenv import find_dotenv, load_dotenv |
|
|
|
|
|
# Repository root: the parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Put the repository root at the front of sys.path (only once) so that the
# project-local imports further down resolve when this file is run directly.
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

# Load environment variables from the .env file that find_dotenv locates
# with usecwd=True.
load_dotenv(find_dotenv(usecwd=True))
|
|
|
|
|
from pydantic import SecretStr |
|
|
from datasets import Dataset |
|
|
from langchain_openai import ChatOpenAI |
|
|
from ragas import evaluate |
|
|
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, RougeScore |
|
|
from ragas.llms import LangchainLLMWrapper |
|
|
from ragas.embeddings import LangchainEmbeddingsWrapper |
|
|
from ragas.run_config import RunConfig |
|
|
|
|
|
from evaluation.eval_utils import load_csv_data, init_rag, generate_answers |
|
|
|
|
|
|
|
|
# Test dataset location; joined onto REPO_ROOT where it is used.
CSV_PATH = "data/data.csv"

# Directory for evaluation reports; joined onto REPO_ROOT where it is used.
OUTPUT_DIR = "evaluation/results"

# Model name, overridable through the EVAL_LLM_MODEL environment variable.
LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1")

# Base URL handed to the OpenAI-compatible chat client.
API_BASE = "https://api.siliconflow.com/v1"
|
|
|
|
|
|
|
|
def _average_metric_scores(df) -> dict:
    """Average every metric column of the per-sample result frame.

    Columns that hold the sample itself (the two naming variants listed
    below) are skipped. Missing values are dropped before averaging, and a
    column with no remaining values is left out of the result.
    """
    non_metric_columns = {
        "question", "answer", "contexts", "ground_truth",
        "user_input", "response", "reference", "retrieved_contexts",
    }
    averages = {}
    for column in df.columns:
        if column in non_metric_columns:
            continue
        values = df[column].dropna().tolist()
        if values:
            averages[column] = sum(values) / len(values)
    return averages


def _write_summary_files(out_dir, retrieval_mode, sample_size, avg_scores, timestamp):
    """Write the averaged scores as a JSON and a CSV summary; return both paths."""
    import csv  # local import: the file's top-level import block does not include csv

    stem = f"ragas_{retrieval_mode}_{timestamp}"
    json_path = out_dir / f"{stem}.json"
    csv_path = out_dir / f"{stem}.csv"

    summary = {
        "retrieval_mode": retrieval_mode,
        "sample_size": sample_size,
        "llm_model": LLM_MODEL,
        "timestamp": timestamp,
        "scores": avg_scores,
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # csv.writer quotes any field containing a delimiter; lineterminator="\n"
    # keeps the plain newline endings of the previous hand-built output.
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["retrieval_mode", "sample_size", *avg_scores.keys()])
        writer.writerow(
            [retrieval_mode, sample_size, *(f"{v:.4f}" for v in avg_scores.values())]
        )

    return json_path, csv_path


def _print_score_table(retrieval_mode, sample_size, avg_scores):
    """Print the averaged scores with a 20-character text bar per metric."""
    bar_width = 20
    print(f"\n{'='*60}")
    print(f"KẾT QUẢ - {retrieval_mode} ({sample_size} samples)")
    print(f"{'='*60}")
    for metric, score in avg_scores.items():
        # Clamp so a score outside [0, 1] cannot produce a malformed bar.
        filled = max(0, min(bar_width, int(score * bar_width)))
        bar = "#" * filled + "-" * (bar_width - filled)
        print(f" {metric:25} [{bar}] {score:.4f}")


def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
    """Run the RAGAS evaluation on the test data and save a score summary.

    The function initialises the RAG pipeline, loads questions and ground
    truths from the CSV dataset, generates answers and contexts, scores them
    with RAGAS, then writes a JSON and a CSV summary under ``OUTPUT_DIR`` and
    prints the averages.

    Args:
        sample_size: Sample count forwarded to ``load_csv_data``.
        retrieval_mode: Retrieval mode forwarded to ``generate_answers``;
            also embedded in the output file names.

    Returns:
        Mapping from metric column name to its average over the samples that
        produced a value. Metrics with no values at all are omitted.
    """
    print(f"\n{'='*60}")
    print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
    print(f"{'='*60}")

    # Pipeline components: the RAG object, its embeddings and the LLM client.
    rag, embeddings, llm_client = init_rag()

    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
    print(f" Đã tải {len(questions)} samples")

    answers, contexts = generate_answers(
        rag, questions, llm_client,
        llm_model=LLM_MODEL,
        retrieval_mode=retrieval_mode,
    )

    api_key = os.getenv("SILICONFLOW_API_KEY", "")
    if not api_key:
        # evaluate() runs with raise_exceptions=False below, so without this
        # warning a missing key would go unreported here.
        print(" Cảnh báo: chưa đặt SILICONFLOW_API_KEY; các metric dùng LLM có thể thất bại.")

    # Judge model and embeddings wrapped in the adapters RAGAS expects.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
        model=LLM_MODEL,
        api_key=SecretStr(api_key),
        base_url=API_BASE,
        temperature=0,
        timeout=120,
        max_retries=3,
    ))
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    print("\n Đang chạy RAGAS metrics...")
    # NOTE(review): the three RougeScore instances differ only in rouge_type.
    # If RAGAS derives the result column from the metric name and mode alone,
    # they may collapse into a single column - verify, and pass distinct
    # name= values if so.
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            RougeScore(rouge_type='rouge1', mode='fmeasure'),
            RougeScore(rouge_type='rouge2', mode='fmeasure'),
            RougeScore(rouge_type='rougeL', mode='fmeasure'),
        ],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        raise_exceptions=False,
        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
    )

    avg_scores = _average_metric_scores(results.to_pandas())

    out_path = REPO_ROOT / OUTPUT_DIR
    out_path.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Previously json_path was printed below without ever being defined, which
    # raised NameError after the whole evaluation had already run.
    json_path, csv_path = _write_summary_files(
        out_path, retrieval_mode, len(questions), avg_scores, timestamp
    )

    _print_score_table(retrieval_mode, len(questions), avg_scores)

    print(f"\nĐã lưu: {json_path}")
    print(f"Đã lưu: {csv_path}")

    return avg_scores