"""Script đánh giá RAG bằng RAGAS framework."""

import csv
import json
import os
import sys
from datetime import datetime
from pathlib import Path

from dotenv import find_dotenv, load_dotenv

REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
load_dotenv(find_dotenv(usecwd=True))

from pydantic import SecretStr
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, RougeScore
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig

from evaluation.eval_utils import load_csv_data, init_rag, generate_answers

# Configuration
CSV_PATH = "data/data.csv"                                        # Test data file, resolved relative to REPO_ROOT
OUTPUT_DIR = "evaluation/results"                                  # Output directory, resolved relative to REPO_ROOT
LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1")  # Model for answer generation and RAGAS judging; override via EVAL_LLM_MODEL
API_BASE = "https://api.siliconflow.com/v1"  # Base URL handed to ChatOpenAI for the evaluator LLM


def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
    """Run a RAGAS evaluation of the RAG pipeline on the test data.

    Loads question / ground-truth pairs from ``CSV_PATH``, generates answers
    and retrieved contexts with the RAG components from ``init_rag()``, scores
    them with RAGAS, prints a summary, and writes two files into
    ``OUTPUT_DIR``:

    * ``ragas_<mode>_<timestamp>.csv``  - one-row summary of average scores.
    * ``ragas_<mode>_<timestamp>.json`` - the same averages plus per-sample rows.

    Args:
        sample_size: Sample count forwarded to ``load_csv_data``.
        retrieval_mode: Retrieval mode forwarded to ``generate_answers``; also
            used in the output file names.

    Returns:
        Mapping of metric column name to its average score over the samples
        that produced a value (NaN results are ignored). Metrics with no
        valid value at all are omitted.
    """
    print(f"\n{'='*60}")
    print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
    print(f"{'='*60}")

    # Initialise the RAG components.
    rag, embeddings, llm_client = init_rag()

    # Load the test data.
    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
    print(f"  Đã tải {len(questions)} samples")

    # Generate an answer (and collect the retrieved contexts) per question.
    answers, contexts = generate_answers(
        rag, questions, llm_client,
        llm_model=LLM_MODEL,
        retrieval_mode=retrieval_mode,
    )

    # Set up the RAGAS evaluator (judge LLM + embeddings).
    api_key = os.getenv("SILICONFLOW_API_KEY", "")
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
        model=LLM_MODEL,
        api_key=SecretStr(api_key),
        base_url=API_BASE,
        temperature=0,
        timeout=120,
        max_retries=3,
    ))
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    # Convert the data into the Dataset format RAGAS consumes.
    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    # Run the RAGAS metrics.
    # NOTE(review): the three RougeScore instances may share one default metric
    # name, in which case their result columns could collide - confirm against
    # the installed ragas version.
    print("\n  Đang chạy RAGAS metrics...")
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,           # Faithfulness of the answer to the context
            answer_relevancy,       # Relevance of the answer to the question
            context_precision,      # Precision of the retrieved context
            context_recall,         # Coverage of the retrieved context
            RougeScore(rouge_type='rouge1', mode='fmeasure'),  # ROUGE-1
            RougeScore(rouge_type='rouge2', mode='fmeasure'),  # ROUGE-2
            RougeScore(rouge_type='rougeL', mode='fmeasure'),  # ROUGE-L
        ],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        raise_exceptions=False,
        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
    )

    # Extract the per-sample scores.
    df = results.to_pandas()
    non_metric_cols = {
        "question", "answer", "contexts", "ground_truth",
        "user_input", "response", "reference", "retrieved_contexts",
    }
    # Only numeric columns can be averaged; this keeps an unexpected text
    # column from breaking the summation below.
    numeric_cols = set(df.select_dtypes(include="number").columns)
    metric_cols = [
        c for c in df.columns
        if c not in non_metric_cols and c in numeric_cols
    ]

    # Average every metric over the samples that produced a value.
    avg_scores = {}
    for col in metric_cols:
        values = df[col].dropna().tolist()
        if values:
            avg_scores[col] = sum(values) / len(values)

    # Persist the results.
    out_path = REPO_ROOT / OUTPUT_DIR
    out_path.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Summary CSV: one header row and one row of averages.
    csv_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["retrieval_mode", "sample_size", *avg_scores])
        writer.writerow(
            [retrieval_mode, len(questions), *(f"{v:.4f}" for v in avg_scores.values())]
        )

    # Detailed JSON: averages plus every per-sample row. Round-tripping through
    # DataFrame.to_json turns NaN into null and numpy scalars into plain JSON.
    json_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.json"
    report = {
        "retrieval_mode": retrieval_mode,
        "sample_size": len(questions),
        "llm_model": LLM_MODEL,
        "timestamp": timestamp,
        "avg_scores": avg_scores,
        "details": json.loads(df.to_json(orient="records", force_ascii=False)),
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    # Print the summary.
    print(f"\n{'='*60}")
    print(f"KẾT QUẢ - {retrieval_mode} ({len(questions)} samples)")
    print(f"{'='*60}")
    bar_width = 20
    for metric, score in avg_scores.items():
        # Clamp so a score outside [0, 1] cannot distort the bar length.
        filled = max(0, min(bar_width, int(score * bar_width)))
        bar = "#" * filled + "-" * (bar_width - filled)
        print(f"  {metric:25} [{bar}] {score:.4f}")

    print(f"\nĐã lưu: {json_path}")
    print(f"Đã lưu: {csv_path}")

    return avg_scores