import json
import os
import sys
import time
import uuid

# Put the parent of this file's directory at the front of sys.path so the
# ``src.*`` imports used further down resolve when this file is run directly
# (presumably that parent is the repository root -- confirm against the layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from dotenv import load_dotenv
load_dotenv()

def score_faithfulness(question: str, answer: str, context: str, llm) -> float:
    """Ask ``llm`` to grade how factually consistent ``answer`` is with ``context``.

    The judge is prompted for a 0-10 rating. The first number found in its
    reply (decimals included, e.g. ``"7.5"``) is divided by 10 and clamped to
    ``[0.0, 1.0]``.

    Args:
        question: Unused; it is not included in the prompt.
        answer: The AI answer to grade. Only the first 400 characters are sent.
        context: Reference text to grade against. Only the first 400
            characters are sent.
        llm: Object exposing ``invoke(prompt)`` whose result has a string
            ``content`` attribute.

    Returns:
        A score in ``[0.0, 1.0]``. ``0.5`` is returned when the reply contains
        no number or when the judge call fails.
    """
    import re

    prompt = f"""Rate from 0 to 10 how factually consistent this AI answer is with the reference.
10 = all facts match, 5 = some facts match, 0 = contradicts reference.
Only output a single integer number, nothing else.

Reference: {context[:400]}
AI Answer: {answer[:400]}"""
    try:
        result = llm.invoke(prompt)
        # Accept an optional fractional part so "7.5" is not truncated to 7.
        match = re.search(r'\d+(?:\.\d+)?', result.content.strip())
    except Exception as exc:
        # Best-effort scoring: keep the neutral fallback, but make the failure
        # visible instead of letting it silently skew the averages.
        print(f"  [warn] faithfulness scoring failed: {exc!r}", file=sys.stderr)
        return 0.5

    if match is None:
        return 0.5
    return min(1.0, max(0.0, float(match.group()) / 10.0))

def score_relevancy(question: str, answer: str, llm) -> float:
    """Ask ``llm`` to grade how well ``answer`` addresses ``question``.

    The judge is prompted for a 0-10 rating. The first number found in its
    reply (decimals included, e.g. ``"7.5"``) is divided by 10 and clamped to
    ``[0.0, 1.0]``.

    Args:
        question: The question that was asked; sent in full.
        answer: The answer to grade. Only the first 400 characters are sent.
        llm: Object exposing ``invoke(prompt)`` whose result has a string
            ``content`` attribute.

    Returns:
        A score in ``[0.0, 1.0]``. ``0.5`` is returned when the reply contains
        no number or when the judge call fails.
    """
    import re

    prompt = f"""Rate from 0 to 10 how well this answer addresses the question.
10 = completely answers it, 5 = partially, 0 = off topic.
Only output a single integer number, nothing else.

Question: {question}
Answer: {answer[:400]}"""
    try:
        result = llm.invoke(prompt)
        # Accept an optional fractional part so "7.5" is not truncated to 7.
        match = re.search(r'\d+(?:\.\d+)?', result.content.strip())
    except Exception as exc:
        # Best-effort scoring: keep the neutral fallback, but make the failure
        # visible instead of letting it silently skew the averages.
        print(f"  [warn] relevancy scoring failed: {exc!r}", file=sys.stderr)
        return 0.5

    if match is None:
        return 0.5
    return min(1.0, max(0.0, float(match.group()) / 10.0))

def run_evaluation(qa_path="tests/qa_pairs.json", sample=None):
    """Run QA pairs through the agent graph, score the answers, and save a report.

    Each question is sent to the graph under a fresh thread id, the wall-clock
    time of the graph call is recorded, and the last AI message is scored with
    ``score_faithfulness`` (against the pair's ``ground_truth``) and
    ``score_relevancy``. Averages, latency figures and pass/fail targets are
    printed and written to ``evaluation_report.json`` in the current directory.

    Args:
        qa_path: Path to a JSON file containing a list of objects with a
            ``"question"`` key and an optional ``"ground_truth"`` key.
        sample: When truthy, only the first ``sample`` pairs are evaluated;
            ``None`` (or ``0``) evaluates every pair.

    Returns:
        The report dict that was written to ``evaluation_report.json``.

    Raises:
        ValueError: If there are no QA pairs to evaluate.
    """
    # Load and validate the dataset before the heavy imports and graph
    # construction, so a bad path or an empty file fails fast and clearly.
    with open(qa_path, encoding="utf-8") as f:
        qa_pairs = json.load(f)

    if sample:
        qa_pairs = qa_pairs[:sample]

    if not qa_pairs:
        # Without this guard the averages below divide by zero.
        raise ValueError(f"No QA pairs to evaluate in {qa_path!r}")

    from langchain_groq import ChatGroq
    from langchain_core.messages import HumanMessage, AIMessage
    from src.agents.graph import get_graph
    import src.agents.graph as g

    print("\n" + "="*50)
    print("Evaluation Pipeline")
    print("="*50)

    print(f"\nEvaluating {len(qa_pairs)} questions...\n")

    graph = get_graph()

    # clear stale FAISS stores and disable web search
    g._THREAD_RETRIEVERS.clear()
    g._THREAD_META.clear()
    g.search_tool = None

    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        api_key=os.getenv("GROQ_API_KEY"),
        temperature=0,
    )

    faithfulness_scores = []
    relevancy_scores = []
    latencies = []
    num_failed = 0

    for i, pair in enumerate(qa_pairs, start=1):
        q = pair["question"]
        gt = pair.get("ground_truth", "")
        # A unique thread id per question keeps conversations independent.
        thread_id = f"eval_{uuid.uuid4().hex[:8]}"
        config = {"configurable": {"thread_id": thread_id}}

        t0 = time.perf_counter()
        try:
            result = graph.invoke(
                {"messages": [HumanMessage(content=q)]},
                config=config,
            )
            last_ai = next(
                (m for m in reversed(result["messages"]) if isinstance(m, AIMessage)), None
            )
            answer = last_ai.content if last_ai else ""
        except Exception as e:
            # Best-effort: keep evaluating the remaining questions, but report
            # the failure so it is not mistaken for a merely low-scoring answer.
            num_failed += 1
            print(f"  [warn] graph failed on question {i}: {e!r}", file=sys.stderr)
            answer = ""
        # Latency covers the graph call only (including failed calls), in ms.
        latency = (time.perf_counter() - t0) * 1000
        latencies.append(latency)

        faith = score_faithfulness(q, answer, gt, llm)
        relevancy = score_relevancy(q, answer, llm)

        faithfulness_scores.append(faith)
        relevancy_scores.append(relevancy)

        print(f"  [{i:2d}/{len(qa_pairs)}] F:{faith:.2f} R:{relevancy:.2f} {latency:.0f}ms | {q[:45]}")

    faith_avg = sum(faithfulness_scores) / len(faithfulness_scores)
    rel_avg = sum(relevancy_scores) / len(relevancy_scores)
    # Index-based percentile estimates over the sorted latencies: the element
    # at position n // 2 for P50 and at position int(n * 0.9) for P90.
    latencies_sorted = sorted(latencies)
    p50 = latencies_sorted[len(latencies_sorted) // 2]
    p90 = latencies_sorted[int(len(latencies_sorted) * 0.9)]

    report = {
        "num_questions": len(qa_pairs),
        "num_failed": num_failed,
        "metrics": {
            "faithfulness": round(faith_avg, 4),
            "answer_relevancy": round(rel_avg, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p90": round(p90, 1),
        },
        "targets_met": {
            "faithfulness_gt_085": faith_avg > 0.85,
            "answer_relevancy_gt_080": rel_avg > 0.80,
            "p90_lt_2000ms": p90 < 2000,
        }
    }

    print("\n" + "="*50)
    print("RESULTS")
    print("="*50)
    print(f"Faithfulness:     {faith_avg:.4f}  {'✅' if faith_avg > 0.85 else '❌'} (target >0.85)")
    print(f"Answer Relevancy: {rel_avg:.4f}  {'✅' if rel_avg > 0.80 else '❌'} (target >0.80)")
    print(f"P50 Latency:      {p50:.0f}ms")
    print(f"P90 Latency:      {p90:.0f}ms  {'✅' if p90 < 2000 else '❌'} (target <2000ms)")
    print(f"\nQuestions tested: {len(qa_pairs)}")

    with open("evaluation_report.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print("\nSaved to evaluation_report.json")

    return report

if __name__ == "__main__":
    # Command-line entry point: forward --qa / --sample to run_evaluation().
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument("--sample", type=int, default=None)
    cli.add_argument("--qa", default="tests/qa_pairs.json")
    options = cli.parse_args()

    run_evaluation(sample=options.sample, qa_path=options.qa)