File size: 4,655 Bytes
b91b0a5 4ff2e4d 9681056 c429a2d 4ff2e4d 9681056 4ff2e4d c429a2d 4ff2e4d c429a2d 4ff2e4d b91b0a5 c429a2d 4ff2e4d c429a2d b91b0a5 c429a2d 4ff2e4d b91b0a5 c429a2d 9681056 b91b0a5 c429a2d b91b0a5 794ce9a b91b0a5 c429a2d 794ce9a b91b0a5 c429a2d 9681056 c429a2d 794ce9a 9681056 c429a2d 4ff2e4d b91b0a5 c429a2d 4ff2e4d 9681056 4ff2e4d b91b0a5 9681056 c429a2d b91b0a5 c429a2d 9681056 c429a2d 9681056 4ff2e4d b91b0a5 c429a2d 9681056 b91b0a5 c429a2d 794ce9a b91b0a5 c429a2d b91b0a5 794ce9a c429a2d 794ce9a b91b0a5 794ce9a b91b0a5 794ce9a b91b0a5 943f176 b91b0a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
"""Script đánh giá RAG bằng RAGAS framework."""
import os
import sys
import json
from pathlib import Path
from datetime import datetime
from dotenv import find_dotenv, load_dotenv
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
load_dotenv(find_dotenv(usecwd=True))
from pydantic import SecretStr
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, RougeScore
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig
from evaluation.eval_utils import load_csv_data, init_rag, generate_answers
# Configuration
CSV_PATH = "data/data.csv"  # Test data file, resolved against REPO_ROOT when loaded
OUTPUT_DIR = "evaluation/results"  # Output directory, resolved against REPO_ROOT when written
# Model name passed both to answer generation and to the RAGAS evaluator LLM;
# override with the EVAL_LLM_MODEL environment variable.
LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1")
# Base URL handed to ChatOpenAI for the evaluator LLM; override with the
# EVAL_API_BASE environment variable (same pattern as LLM_MODEL).
API_BASE = os.getenv("EVAL_API_BASE", "https://api.siliconflow.com/v1")
def _score_bar(score: float, width: int = 20) -> str:
    """Render *score* as a fixed-width ASCII bar of ``#`` and ``-`` characters."""
    # Clamp so a score outside [0, 1] can never make the bar longer or
    # shorter than ``width`` characters.
    filled = max(0, min(width, int(score * width)))
    return "#" * filled + "-" * (width - filled)


def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
    """Run a RAGAS evaluation on the test data and save the results.

    The function loads question / ground-truth pairs from ``CSV_PATH``,
    generates answers and contexts with the RAG pipeline, scores them with
    RAGAS, then writes a detailed JSON report and a one-row CSV summary into
    ``OUTPUT_DIR`` (both resolved against ``REPO_ROOT``).

    Args:
        sample_size: Passed to ``load_csv_data`` to limit the number of samples.
        retrieval_mode: Passed to ``generate_answers``; also used in the
            output file names and the printed report.

    Returns:
        A dict mapping each metric column name to the mean of its non-null
        per-sample scores. Metrics with no non-null score are omitted.
    """
    print(f"\n{'='*60}")
    print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
    print(f"{'='*60}")

    # Initialise the RAG components.
    rag, embeddings, llm_client = init_rag()

    # Load the test data.
    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
    print(f" Đã tải {len(questions)} samples")

    # Generate an answer (and its retrieved contexts) for every question.
    answers, contexts = generate_answers(
        rag, questions, llm_client,
        llm_model=LLM_MODEL,
        retrieval_mode=retrieval_mode,
    )

    # Set up the RAGAS evaluator.
    # NOTE(review): an unset SILICONFLOW_API_KEY falls back to "" here; with
    # raise_exceptions=False below, the resulting failures are presumably
    # reported as missing scores rather than as an error -- confirm.
    api_key = os.getenv("SILICONFLOW_API_KEY", "")
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
        model=LLM_MODEL,
        api_key=SecretStr(api_key),
        base_url=API_BASE,
        temperature=0,
        timeout=120,
        max_retries=3,
    ))
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    # Convert the data to the Dataset format.
    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    # Run the RAGAS evaluation.
    # NOTE(review): the three RougeScore metrics differ only in rouge_type;
    # confirm that ragas reports them under distinct column names, otherwise
    # they may overwrite each other in the results.
    print("\n Đang chạy RAGAS metrics...")
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,  # Faithfulness of the answer to the context
            answer_relevancy,  # Relevance of the answer
            context_precision,  # Precision of the retrieved context
            context_recall,  # Recall (coverage) of the retrieved context
            RougeScore(rouge_type='rouge1', mode='fmeasure'),  # ROUGE-1
            RougeScore(rouge_type='rouge2', mode='fmeasure'),  # ROUGE-2
            RougeScore(rouge_type='rougeL', mode='fmeasure'),  # ROUGE-L
        ],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        raise_exceptions=False,
        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
    )

    # Extract the scores: every column that is not a known input column is
    # treated as a metric.
    df = results.to_pandas()
    non_metric_cols = (
        "question", "answer", "contexts", "ground_truth",
        "user_input", "response", "reference", "retrieved_contexts",
    )
    metric_cols = [c for c in df.columns if c not in non_metric_cols]

    # Average each metric over the samples that actually produced a score.
    avg_scores = {}
    for col in metric_cols:
        values = df[col].dropna().tolist()
        if values:
            # Cast to a plain float so the value is JSON-serialisable.
            avg_scores[col] = float(sum(values) / len(values))

    # Save the results.
    out_path = REPO_ROOT / OUTPUT_DIR
    out_path.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the JSON file (details). The original code printed ``json_path``
    # without ever defining it, which raised NameError at the end of every run.
    json_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.json"
    report = {
        "retrieval_mode": retrieval_mode,
        "sample_size": len(questions),
        "llm_model": LLM_MODEL,
        "timestamp": timestamp,
        "avg_scores": avg_scores,
        # Round-trip through pandas' JSON encoder so NaN and numpy values
        # become plain JSON-compatible Python objects.
        "samples": json.loads(df.to_json(orient="records", force_ascii=False)),
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    # Save the CSV file (summary).
    csv_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.csv"
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write("retrieval_mode,sample_size," + ",".join(avg_scores.keys()) + "\n")
        f.write(f"{retrieval_mode},{len(questions)}," + ",".join(f"{v:.4f}" for v in avg_scores.values()) + "\n")

    # Print the results.
    print(f"\n{'='*60}")
    print(f"KẾT QUẢ - {retrieval_mode} ({len(questions)} samples)")
    print(f"{'='*60}")
    for metric, score in avg_scores.items():
        bar = _score_bar(score)
        print(f" {metric:25} [{bar}] {score:.4f}")
    print(f"\nĐã lưu: {json_path}")
    print(f"Đã lưu: {csv_path}")
    return avg_scores