"""
RAGAS evaluation for the Rabbook retrieval pipeline.

Metrics computed (requires ground_truth from eval_dataset.json):
  Faithfulness                    — is the answer faithful to the retrieved context?
  AnswerRelevancy                 — does the answer address the question?
  AnswerCorrectness               — does the answer match the ground truth?
  LLMContextRecall                — does the context cover the ground truth?
  LLMContextPrecisionWithReference — are the retrieved chunks relevant to the ground truth?

Only "answer" cases (expected_behavior == "answer") are evaluated.
Fallback cases have no ground-truth context, so RAGAS metrics do not apply.
"""
import json
import warnings
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

warnings.filterwarnings("ignore", category=DeprecationWarning)

from ragas import EvaluationDataset, SingleTurnSample, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    AnswerCorrectness,
    AnswerRelevancy,
    Faithfulness,
    LLMContextPrecisionWithReference,
    LLMContextRecall,
)
from ragas.run_config import RunConfig

from .eval_common import (
    build_embeddings,
    build_evaluator_llm,
    build_llm,
    build_reranker,
    load_dataset,
    load_retrieval_bundle,
)
from core.config import DEFAULT_RETRIEVAL_K
from rag.retrieve import retrieve_documents_with_query_transform
from core.config import DEFAULT_BM25_CANDIDATE_K, DEFAULT_RERANK_CANDIDATE_K

# Cache lives alongside the other eval data artifacts
CACHE_PATH = Path(__file__).resolve().parent / "data" / "ragas_cache.json"


def retrieve_contexts(question: str, vectorstore, bm25_index, reranker, llm) -> list[str]:
    """Run retrieval for a question and return a list of page-content strings."""
    documents = retrieve_documents_with_query_transform(
        vectorstore,
        question,
        k=DEFAULT_RETRIEVAL_K,
        reranker=reranker,
        bm25_index=bm25_index,
        query_transformer=llm,
        enable_query_transform=False,
        candidate_k=DEFAULT_RERANK_CANDIDATE_K,
        bm25_candidate_k=DEFAULT_BM25_CANDIDATE_K,
        metadata_filter=None,
        include_debug=False,
    )
    return [doc.page_content for doc, _score in documents]


def generate_answer(question: str, contexts: list[str], llm) -> str:
    """Generate a concise answer from the retrieved contexts."""
    from langchain_core.messages import HumanMessage
    context_block = "\n\n".join(contexts)
    prompt = (
        "Answer the question using only the context below. "
        "Be concise and specific.\n\n"
        f"Context:\n{context_block}\n\n"
        f"Question: {question}"
    )
    return llm.invoke([HumanMessage(content=prompt)]).content


def main():
    print("Initializing models...")
    llm = build_llm()
    embeddings = build_embeddings()
    reranker = build_reranker()

    print("Loading retrieval bundle...")
    vectorstore, bm25_index = load_retrieval_bundle(embeddings)

    dataset = load_dataset()

    # Only RAGAS-evaluate cases that have a ground truth and are expected to answer.
    answer_cases = [c for c in dataset if c.get("expected_behavior") == "answer"]
    fallback_cases = [c for c in dataset if c.get("expected_behavior") != "answer"]
    print(f"Dataset: {len(answer_cases)} answer cases, {len(fallback_cases)} fallback case(s) skipped.")

    # Load cache so reruns skip already-collected questions.
    cache: dict[str, dict] = {}
    if CACHE_PATH.exists():
        cache = json.loads(CACHE_PATH.read_text(encoding="utf-8"))
        print(f"Loaded {len(cache)} cached answers from {CACHE_PATH.name}")

    print(f"\nCollecting answers for {len(answer_cases)} questions...")
    samples = []
    skipped = 0

    for i, case in enumerate(answer_cases, 1):
        question = case["question"]
        ground_truth = case.get("ground_truth", "")

        if question in cache:
            print(f"  [{i}/{len(answer_cases)}] (cached) {question[:65]}...")
            entry = cache[question]
            samples.append(SingleTurnSample(
                user_input=question,
                response=entry["answer"],
                retrieved_contexts=entry["contexts"],
                reference=ground_truth,
            ))
            continue

        print(f"  [{i}/{len(answer_cases)}] {question[:70]}...")
        contexts = retrieve_contexts(question, vectorstore, bm25_index, reranker, llm)
        if not contexts:
            print("    skipped — no context retrieved")
            skipped += 1
            continue

        answer = generate_answer(question, contexts, llm)

        cache[question] = {"answer": answer, "contexts": contexts}
        CACHE_PATH.write_text(json.dumps(cache, ensure_ascii=False, indent=2), encoding="utf-8")

        samples.append(SingleTurnSample(
            user_input=question,
            response=answer,
            retrieved_contexts=contexts,
            reference=ground_truth,
        ))

    if not samples:
        print("\nNo samples to evaluate.")
        print("Ensure documents are ingested: python ingest_docs.py")
        return

    dataset_obj = EvaluationDataset(samples=samples)

    evaluator_llm = LangchainLLMWrapper(build_evaluator_llm())
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    faithfulness_metric = Faithfulness(llm=evaluator_llm)
    answer_relevancy_metric = AnswerRelevancy(
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
    )
    answer_correctness_metric = AnswerCorrectness(
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
    )
    context_recall_metric = LLMContextRecall(llm=evaluator_llm)
    context_precision_metric = LLMContextPrecisionWithReference(llm=evaluator_llm)

    run_config = RunConfig(timeout=120, max_workers=2, max_wait=120)

    print(f"\nRunning RAGAS on {len(samples)} samples (2 workers)...")
    result = evaluate(
        dataset_obj,
        metrics=[
            faithfulness_metric,
            answer_relevancy_metric,
            answer_correctness_metric,
            context_recall_metric,
            context_precision_metric,
        ],
        run_config=run_config,
    )

    scores = result.to_pandas()
    print("\nRAGAS Results")
    print("=" * 60)
    print(f"  Faithfulness:              {scores['faithfulness'].mean():.3f}  (answer stays within retrieved context)")
    print(f"  Answer Relevancy:          {scores['answer_relevancy'].mean():.3f}  (answer addresses the question)")
    print(f"  Answer Correctness:        {scores['answer_correctness'].mean():.3f}  (answer matches the ground truth)")
    print(f"  Context Recall:            {scores['context_recall'].mean():.3f}  (context covers the ground truth)")
    print(f"  Context Precision:         {scores['llm_context_precision_with_reference'].mean():.3f}  (retrieved chunks are relevant to ground truth)")
    print(f"\n  Evaluated: {len(samples)}/{len(answer_cases)} answer cases", end="")
    if skipped:
        print(f"  ({skipped} skipped — empty retrieval)", end="")
    print()
    print(f"  Skipped:   {len(fallback_cases)} fallback case(s) — no ground-truth context to judge against")


if __name__ == "__main__":
    main()