import json
import re
from pathlib import Path

from dotenv import load_dotenv

# Parent of the directory that contains this file (resolved to an absolute path).
BASE_DIR = Path(__file__).resolve().parent.parent
# JSON file holding the evaluation cases read by load_eval_cases().
EVAL_PATH = BASE_DIR.joinpath("tests", "eval_question.json")

from app.web import answer_query
from core.config import DEFAULT_GROUNDED_FALLBACK_MESSAGE


def load_eval_cases():
    """Parse and return the JSON document stored at ``EVAL_PATH``.

    The file is decoded as UTF-8. Whatever top-level JSON value it holds is
    returned unchanged; ``main`` iterates over it as a sequence of cases.
    """
    with EVAL_PATH.open(encoding="utf-8") as handle:
        return json.load(handle)


def evaluate_case(case):
    """Run one evaluation case through ``answer_query`` and grade the outcome.

    Args:
        case: Mapping for a single case. ``question`` is required; the
            optional keys ``expected_behavior``, ``expected_concepts`` and
            ``notes`` are copied or summarised into the result.

    Returns:
        A dict with the question, the produced answer, the answer and
        grounding verdicts, concept match counts, the retrieved chunk ids,
        the citation numbers, the grounding reason and the case notes.
    """
    answer, sources, citations, debug_data = answer_query(case["question"], debug_mode=True)

    answer_verdict, matched_concepts = evaluate_answer(case, answer)
    grounding_verdict = evaluate_grounding(answer, citations, debug_data)

    # Derived fields, computed in the same order the result dict lists them.
    concept_total = len(case.get("expected_concepts", []))
    chunk_ids = [item["chunk_id"] for item in sources]
    citation_numbers = [item["number"] for item in citations]
    # debug_data, or its "grounding" entry, may be missing or None.
    grounding_info = (debug_data or {}).get("grounding") or {}

    return {
        "question": case["question"],
        "expected_behavior": case.get("expected_behavior", "answer"),
        "answer": answer,
        "answer_verdict": answer_verdict,
        "grounding_verdict": grounding_verdict,
        "matched_concepts": matched_concepts,
        "total_concepts": concept_total,
        "sources": chunk_ids,
        "citations": citation_numbers,
        "grounding_reason": grounding_info.get("reason", "unknown"),
        "notes": case.get("notes", ""),
    }


def evaluate_answer(case, answer):
    """Grade an answer against the expectations recorded in an evaluation case.

    Args:
        case: Mapping describing the case. Reads the optional keys
            ``expected_behavior`` (defaults to ``"answer"``),
            ``expected_concepts`` and ``forbidden_concepts``. Each concept key
            holds a list of groups, and each group is an iterable of
            alternative phrases.
        answer: The answer text to grade.

    Returns:
        A ``(verdict, matched_concepts)`` tuple. ``verdict`` is ``"correct"``,
        ``"partially_correct"`` or ``"wrong"``. ``matched_concepts`` is the
        number of expected concept groups found in the answer; it is always 0
        for fallback cases and for answers rejected by a forbidden concept.
    """
    answer_text = normalize_text(answer)

    if case.get("expected_behavior", "answer") == "fallback":
        fallback_text = normalize_text(DEFAULT_GROUNDED_FALLBACK_MESSAGE)
        return ("correct" if answer_text == fallback_text else "wrong"), 0

    def mentions_any(concept_group):
        # A group is satisfied when any one of its alternative phrases occurs
        # in the answer. Phrases that normalize to "" are skipped: the empty
        # string is a substring of every answer and would always match.
        for phrase in concept_group:
            needle = normalize_text(phrase)
            if needle and needle in answer_text:
                return True
        return False

    # Forbidden concepts are checked before the "no expected concepts" early
    # return, so they still disqualify an answer when the case lists nothing
    # to match positively.
    forbidden_concepts = case.get("forbidden_concepts") or []
    if any(mentions_any(group) for group in forbidden_concepts):
        return "wrong", 0

    expected_concepts = case.get("expected_concepts") or []
    if not expected_concepts:
        # Nothing to match against, so the answer cannot be fully confirmed.
        return "partially_correct", 0

    matched_concepts = sum(1 for group in expected_concepts if mentions_any(group))

    if matched_concepts == len(expected_concepts):
        return "correct", matched_concepts
    if matched_concepts > 0:
        return "partially_correct", matched_concepts
    return "wrong", matched_concepts


def evaluate_grounding(answer, citations, debug_data):
    """Classify how well an answer is grounded.

    Args:
        answer: The answer text.
        citations: Citations attached to the answer; only truthiness is used.
        debug_data: Optional mapping whose ``grounding`` entry may carry a
            ``passed`` flag. ``None`` or a missing entry is treated as empty.

    Returns:
        ``"grounded_answer"`` when the grounding check passed and citations
        exist; ``"safe_fallback"`` when the answer is the fallback message and
        the check explicitly failed; otherwise ``"partially_grounded"`` if
        there are citations, else ``"not_grounded"``.
    """
    passed = ((debug_data or {}).get("grounding") or {}).get("passed")
    normalized_answer = normalize_text(answer)
    fallback_text = normalize_text(DEFAULT_GROUNDED_FALLBACK_MESSAGE)

    if passed is True and citations:
        return "grounded_answer"
    if normalized_answer == fallback_text and passed is False:
        return "safe_fallback"
    return "partially_grounded" if citations else "not_grounded"


def build_summary(results):
    """Count how many results fall under each reported verdict.

    Args:
        results: Sequence of result dicts, each carrying ``answer_verdict``
            and ``grounding_verdict``.

    Returns:
        A dict with the total number of cases plus the counts of correct and
        partially correct answers, grounded answers, safe fallbacks and
        partially grounded answers.
    """

    def tally(field, verdict):
        # Number of results whose ``field`` equals ``verdict``.
        return sum(1 for entry in results if entry[field] == verdict)

    correct = tally("answer_verdict", "correct")
    partially_correct = tally("answer_verdict", "partially_correct")
    grounded = tally("grounding_verdict", "grounded_answer")
    fallbacks = tally("grounding_verdict", "safe_fallback")
    partially_grounded = tally("grounding_verdict", "partially_grounded")

    return {
        "cases": len(results),
        "correct_answers": correct,
        "partially_correct_answers": partially_correct,
        "grounded_answers": grounded,
        "safe_fallbacks": fallbacks,
        "partially_grounded_answers": partially_grounded,
    }


def print_report(summary, results):
    """Print the aggregate summary and a per-question breakdown to stdout.

    Args:
        summary: Dict of counts as produced by ``build_summary``.
        results: Sequence of per-case result dicts as produced by
            ``evaluate_case``.
    """
    rule_width = 32

    print("RAG Answer Evaluation Report")
    print("=" * rule_width)
    case_count = summary["cases"]
    print(f"Cases: {case_count}")
    summary_rows = (
        ("Correct Answers", "correct_answers"),
        ("Partially Correct Answers", "partially_correct_answers"),
        ("Grounded Answers", "grounded_answers"),
        ("Safe Fallbacks", "safe_fallbacks"),
        ("Partially Grounded Answers", "partially_grounded_answers"),
    )
    for label, key in summary_rows:
        print(f"{label}: {summary[key]}/{case_count}")
    print()

    print("Question Results")
    print("-" * rule_width)
    leading_rows = (
        ("Answer Verdict", "answer_verdict"),
        ("Grounding Verdict", "grounding_verdict"),
        ("Grounding Reason", "grounding_reason"),
    )
    trailing_rows = (
        ("Cited Sources", "citations"),
        ("Retrieved Chunks", "sources"),
        ("Answer", "answer"),
    )
    for entry in results:
        print(entry["question"])
        for label, key in leading_rows:
            print(f"{label}: {entry[key]}")
        # The concept ratio is only shown when the case defined concepts.
        concept_total = entry["total_concepts"]
        if concept_total:
            print(f"Matched Concepts: {entry['matched_concepts']}/{concept_total}")
        for label, key in trailing_rows:
            print(f"{label}: {entry[key]}")
        notes = entry["notes"]
        if notes:
            print(f"Notes: {notes}")
        print()


def normalize_text(text):
    """Return ``text`` lower-cased, with whitespace runs collapsed and ends trimmed.

    Every run of whitespace becomes a single space, and leading and trailing
    whitespace is removed, so strings can be compared or substring-matched
    regardless of case and spacing.
    """
    lowered = text.lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed.strip()


def main():
    """Evaluate every case from the evaluation file and print the report.

    Calls ``load_dotenv()`` before anything else, then evaluates each loaded
    case in order, summarises the results and prints them.
    """
    load_dotenv()

    results = [evaluate_case(case) for case in load_eval_cases()]
    print_report(build_summary(results), results)


# Direct-execution guard: this block does not call main(). When the module is
# run as a script it aborts with an explanatory message instead. Raising
# SystemExit with a string prints that string to stderr and exits with
# status 1.
if __name__ == "__main__":
    raise SystemExit(
        "Run this from the project root with `python evaluate_retrieval.py`, not `python tests/evaluate_retrieval.py`."
    )