File size: 4,655 Bytes
b91b0a5
 
4ff2e4d
 
 
 
 
 
 
 
 
 
 
9681056
c429a2d
4ff2e4d
9681056
4ff2e4d
c429a2d
4ff2e4d
 
 
 
c429a2d
4ff2e4d
b91b0a5
 
 
 
c429a2d
4ff2e4d
c429a2d
 
b91b0a5
c429a2d
 
 
4ff2e4d
b91b0a5
c429a2d
9681056
b91b0a5
c429a2d
b91b0a5
794ce9a
b91b0a5
c429a2d
 
 
 
 
794ce9a
b91b0a5
c429a2d
9681056
c429a2d
 
 
 
 
794ce9a
9681056
c429a2d
4ff2e4d
b91b0a5
c429a2d
4ff2e4d
9681056
 
4ff2e4d
 
 
b91b0a5
 
9681056
c429a2d
 
b91b0a5
 
 
 
 
 
 
c429a2d
9681056
 
 
c429a2d
9681056
4ff2e4d
b91b0a5
c429a2d
 
9681056
b91b0a5
c429a2d
 
 
 
 
794ce9a
b91b0a5
c429a2d
 
 
b91b0a5
 
794ce9a
 
 
c429a2d
794ce9a
b91b0a5
794ce9a
b91b0a5
794ce9a
 
 
 
 
b91b0a5
 
943f176
b91b0a5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Script đánh giá RAG bằng RAGAS framework."""

import os
import sys
import json
from pathlib import Path
from datetime import datetime
from dotenv import find_dotenv, load_dotenv

# Put the repository root on sys.path before the project-local import further
# down (`evaluation.eval_utils`). parents[1] is the directory one level above
# the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
# Load variables from a .env file (located via find_dotenv with usecwd=True)
# so the os.getenv() lookups below can see them.
load_dotenv(find_dotenv(usecwd=True))

from pydantic import SecretStr
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, RougeScore
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig

from evaluation.eval_utils import load_csv_data, init_rag, generate_answers

# Configuration
CSV_PATH = "data/data.csv"                                        # Test data file (joined onto REPO_ROOT when loaded)
OUTPUT_DIR = "evaluation/results"                                  # Output directory (joined onto REPO_ROOT when saving)
LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1")  # Evaluation model; overridable via EVAL_LLM_MODEL
# Base URL passed to ChatOpenAI for the evaluator LLM.
API_BASE = "https://api.siliconflow.com/v1"


def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
    """Run a RAGAS evaluation on the test data and save the averaged scores.

    The function loads question / ground-truth pairs from ``CSV_PATH``,
    generates answers and retrieved contexts through the RAG pipeline, scores
    them with RAGAS, and writes the per-metric averages to a timestamped JSON
    file and a timestamped CSV file under ``OUTPUT_DIR``.

    Args:
        sample_size: Forwarded to ``load_csv_data``; presumably the number of
            samples to load (confirm against ``evaluation.eval_utils``).
        retrieval_mode: Forwarded to ``generate_answers`` and embedded in the
            output file names and the printed report.

    Returns:
        A dict mapping each metric column name to the mean of its non-null
        per-sample scores. Metrics without any non-null score are omitted.
    """
    separator = "=" * 60
    print(f"\n{separator}")
    print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
    print(f"{separator}")

    # Initialise the RAG components.
    rag, embeddings, llm_client = init_rag()

    # Load the test data.
    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
    print(f"  Đã tải {len(questions)} samples")

    # Generate answers (and the contexts retrieved for them).
    answers, contexts = generate_answers(
        rag, questions, llm_client,
        llm_model=LLM_MODEL,
        retrieval_mode=retrieval_mode,
    )

    # Set up the RAGAS evaluator LLM and embeddings.
    api_key = os.getenv("SILICONFLOW_API_KEY", "")
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
        model=LLM_MODEL,
        api_key=SecretStr(api_key),
        base_url=API_BASE,
        temperature=0,
        timeout=120,
        max_retries=3,
    ))
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    # Convert the data into the Dataset format consumed by RAGAS.
    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    # Run the RAGAS metrics.
    # NOTE(review): the three RougeScore instances differ only in rouge_type;
    # confirm that the installed ragas version reports them as three distinct
    # result columns rather than one shared metric name.
    print("\n  Đang chạy RAGAS metrics...")
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,           # Faithfulness to the context
            answer_relevancy,       # Relevance of the answer
            context_precision,      # Precision of the context
            context_recall,         # Coverage (recall) of the context
            RougeScore(rouge_type='rouge1', mode='fmeasure'),  # ROUGE-1
            RougeScore(rouge_type='rouge2', mode='fmeasure'),  # ROUGE-2
            RougeScore(rouge_type='rougeL', mode='fmeasure'),  # ROUGE-L
        ],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        raise_exceptions=False,
        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
    )

    # Extract the scores: every column that is not an input column is a metric.
    df = results.to_pandas()
    input_cols = (
        "question", "answer", "contexts", "ground_truth",
        "user_input", "response", "reference", "retrieved_contexts",
    )
    metric_cols = [c for c in df.columns if c not in input_cols]

    # Average each metric over the samples that produced a value; null scores
    # are dropped so they do not skew the mean.
    avg_scores = {}
    for col in metric_cols:
        values = df[col].dropna().tolist()
        if values:
            avg_scores[col] = sum(values) / len(values)

    # Save the results.
    out_path = REPO_ROOT / OUTPUT_DIR
    out_path.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the JSON file. The final report prints json_path, so it has to be
    # defined here; previously it was not, and the function raised NameError
    # after the evaluation had already completed.
    json_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.json"
    summary = {
        "retrieval_mode": retrieval_mode,
        "sample_size": len(questions),
        "llm_model": LLM_MODEL,
        "timestamp": timestamp,
        "scores": avg_scores,
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # Save the CSV file (summary). Joining full column lists avoids a dangling
    # trailing comma when no metric produced a score.
    csv_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.csv"
    header = ["retrieval_mode", "sample_size", *avg_scores.keys()]
    row = [retrieval_mode, str(len(questions)), *(f"{v:.4f}" for v in avg_scores.values())]
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write(",".join(header) + "\n")
        f.write(",".join(row) + "\n")

    # Print the results.
    print(f"\n{separator}")
    print(f"KẾT QUẢ - {retrieval_mode} ({len(questions)} samples)")
    print(f"{separator}")
    bar_width = 20
    for metric, score in avg_scores.items():
        # Clamp so a score outside [0, 1] cannot distort the bar length.
        filled = max(0, min(bar_width, int(score * bar_width)))
        bar = "#" * filled + "-" * (bar_width - filled)
        print(f"  {metric:25} [{bar}] {score:.4f}")

    print(f"\nĐã lưu: {json_path}")
    print(f"Đã lưu: {csv_path}")

    return avg_scores