"""
Enhanced evaluation with proper groundedness checking.

This module implements LLM-based groundedness evaluation that checks if
generated answers are factually consistent with and fully supported by
the retrieved evidence, going beyond simple token overlap.
"""

import json
import os
import statistics
import time
from typing import Any, Dict, List

import requests
from tqdm import tqdm

ROOT = os.path.dirname(os.path.abspath(__file__))
EVAL_DIR = os.path.join(ROOT)
QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json")
GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json")
OUT_FILE = os.path.join(EVAL_DIR, "enhanced_results.json")
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results")

# Ensure results directory exists
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)

TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space")
CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat")
TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30"))

# LLM API for groundedness evaluation
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GROUNDEDNESS_MODEL = "microsoft/wizardlm-2-8x22b"


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def evaluate_groundedness_llm(generated_answer: str, retrieved_context: List[str]) -> Dict[str, Any]:
    """
    Use LLM to evaluate if the generated answer is grounded in the retrieved context.

    Args:
        generated_answer: The generated response text
        retrieved_context: List of retrieved document excerpts

    Returns:
        Dictionary with groundedness score and explanation
    """
    if not OPENROUTER_API_KEY:
        # Fallback to token overlap if no API key
        return {
            "grounded": True,
            "confidence": 0.5,
            "explanation": "Using fallback token overlap method - no OpenRouter API key available",
            "method": "token_overlap_fallback",
        }

    # Create context from retrieved documents
    context_text = "\n\n".join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(retrieved_context)])

    # Groundedness evaluation prompt
    prompt = f"""You are an expert evaluator tasked with determining if a generated answer is
factually grounded in the provided context.

CONTEXT (Retrieved Documents):
{context_text}

GENERATED ANSWER:
{generated_answer}

TASK:
Evaluate whether the generated answer is:
1. FACTUALLY CONSISTENT with the context (no contradictions)
2. FULLY SUPPORTED by the context (all claims can be verified)
3. NOT HALLUCINATED (no information absent from context)

Respond with a JSON object containing:
- "grounded": boolean (true if fully grounded, false otherwise)
- "confidence": float 0-1 (how confident you are in this assessment)
- "explanation": string (detailed reasoning for your assessment)
- "unsupported_claims": list of strings (any claims not supported by context)

Be strict: if ANY part of the answer contains information not present in or
contradicted by the context, mark as false."""

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": GROUNDEDNESS_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 500,
            },
            timeout=30,
        )

        if response.status_code == 200:
            result = response.json()
            content = result["choices"][0]["message"]["content"]

            # Try to parse JSON response
            try:
                evaluation = json.loads(content)
                evaluation["method"] = "llm_evaluation"
                return evaluation
            except json.JSONDecodeError:
                # Fallback if LLM didn't return valid JSON
                is_grounded = "true" in content.lower() and "grounded" in content.lower()
                return {
                    "grounded": is_grounded,
                    "confidence": 0.7,
                    "explanation": f"LLM evaluation (non-JSON): {content[:200]}...",
                    "method": "llm_evaluation_parsed",
                }
        else:
            # API error fallback
            return {
                "grounded": True,
                "confidence": 0.3,
                "explanation": f"API error {response.status_code}, using neutral assessment",
                "method": "api_error_fallback",
            }

    except Exception as e:
        # Exception fallback
        return {
            "grounded": True,
            "confidence": 0.3,
            "explanation": f"Evaluation error: {str(e)}, using neutral assessment",
            "method": "exception_fallback",
        }


def evaluate_citation_accuracy_enhanced(
    expected_sources: List[str],
    returned_sources: List[Dict[str, Any]],
    generated_answer: str,
) -> Dict[str, Any]:
    """
    Enhanced citation accuracy that considers both source presence and relevance.

    Args:
        expected_sources: List of expected source filenames
        returned_sources: List of returned source dictionaries
        generated_answer: The generated response text

    Returns:
        Dictionary with citation accuracy metrics
    """
    if not expected_sources:
        return {
            "citation_accuracy": 1.0 if not returned_sources else 0.0,
            "expected_count": 0,
            "returned_count": len(returned_sources),
            "correctly_cited": 0,
            "method": "exact_match",
        }

    # Extract returned filenames
    returned_filenames = {
        s.get("document") or s.get("filename") or s.get("source_file") or s.get("file") for s in returned_sources
    }
    returned_filenames = {f for f in returned_filenames if f}

    # Count correct citations
    correctly_cited = 0
    for expected in expected_sources:
        if expected in returned_filenames:
            correctly_cited += 1

    citation_accuracy = correctly_cited / len(expected_sources) if expected_sources else 0.0

    return {
        "citation_accuracy": citation_accuracy,
        "expected_count": len(expected_sources),
        "returned_count": len(returned_filenames),
        "correctly_cited": correctly_cited,
        "expected_sources": expected_sources,
        "returned_sources": list(returned_filenames),
        "method": "exact_match",
    }


def token_overlap_score(gold: str, response: str) -> float:
    """Simple partial match score based on token overlap."""
    gold_tokens = set(gold.lower().split())
    resp_tokens = set(response.lower().split())
    if not gold_tokens:
        return 0.0
    overlap = gold_tokens & resp_tokens
    return len(overlap) / len(gold_tokens)


def run_enhanced_evaluation(target: str = TARGET_URL):
    """Run enhanced evaluation with proper groundedness checking."""
    questions = load_json(QUESTIONS_FILE)
    golds = load_json(GOLD_FILE)

    results = []
    latencies = []
    groundedness_scores = []
    citation_accuracies = []

    print(f"Running enhanced evaluation against {target}")
    print(f"Using groundedness evaluation: {'LLM-based' if OPENROUTER_API_KEY else 'Token overlap fallback'}")

    for q in tqdm(questions, desc="Enhanced Evaluation"):
        qid = str(q["id"])
        payload = {"message": q["question"], "include_sources": True}
        url = target.rstrip("/") + CHAT_ENDPOINT
        start = time.time()

        try:
            # Add progress info
            print(f"\nEvaluating question {qid}: {q['question'][:50]}...")
            r = requests.post(url, json=payload, timeout=TIMEOUT)
            latency = time.time() - start
            latencies.append(latency)
            print(f"Response received in {latency:.2f}s")

            if r.status_code != 200:
                results.append(
                    {
                        "id": qid,
                        "question": q["question"],
                        "status_code": r.status_code,
                        "error": r.text,
                        "latency_s": latency,
                    }
                )
                continue

            data = r.json()
            response_text = data.get("response", "")
            returned_sources = data.get("sources", []) or []

            gold_answer = golds.get(qid, {}).get("answer", "")
            expected_sources = golds.get(qid, {}).get("expected_sources", [])

            # Enhanced groundedness evaluation
            context_excerpts = [s.get("excerpt", "") for s in returned_sources if s.get("excerpt")]
            groundedness_eval = evaluate_groundedness_llm(response_text, context_excerpts)

            # Enhanced citation accuracy
            citation_eval = evaluate_citation_accuracy_enhanced(expected_sources, returned_sources, response_text)

            # Traditional token overlap for comparison
            overlap_score = token_overlap_score(gold_answer, response_text)

            # Store comprehensive results
            result = {
                "id": qid,
                "question": q["question"],
                "response": response_text,
                "latency_s": latency,
                # Enhanced groundedness metrics
                "groundedness": groundedness_eval,
                # Enhanced citation metrics
                "citation_evaluation": citation_eval,
                # Traditional metrics for comparison
                "overlap_score": overlap_score,
                "citation_accuracy": citation_eval["citation_accuracy"],
                # Source information
                "returned_sources": returned_sources,
                "expected_sources": expected_sources,
                "gold_answer": gold_answer,
            }

            results.append(result)

            # Track metrics for summary
            if groundedness_eval.get("grounded") is not None:
                groundedness_scores.append(1.0 if groundedness_eval["grounded"] else 0.0)
            citation_accuracies.append(citation_eval["citation_accuracy"])

        except Exception as e:
            latency = time.time() - start
            latencies.append(latency)
            results.append(
                {
                    "id": qid,
                    "question": q["question"],
                    "status_code": "error",
                    "error": str(e),
                    "latency_s": latency,
                }
            )

    # Calculate summary metrics
    success_latencies = [lat for lat in latencies if lat is not None]
    p50 = statistics.median(success_latencies) if success_latencies else None
    p95 = sorted(success_latencies)[max(0, int(len(success_latencies) * 0.95) - 1)] if success_latencies else None

    # Enhanced summary metrics
    avg_groundedness = sum(groundedness_scores) / len(groundedness_scores) if groundedness_scores else None
    avg_citation_accuracy = sum(citation_accuracies) / len(citation_accuracies) if citation_accuracies else None

    # Count successful evaluations
    successful_evals = len([r for r in results if r.get("groundedness") is not None])
    total_questions = len(questions)

    summary = {
        "target": target,
        "evaluation_method": "enhanced_llm_based",
        "n_questions": total_questions,
        "successful_evaluations": successful_evals,
        "success_rate": (successful_evals / total_questions if total_questions > 0 else 0),
        # Performance metrics
        "latency_p50_s": p50,
        "latency_p95_s": p95,
        "avg_latency_s": (sum(success_latencies) / len(success_latencies) if success_latencies else None),
        # Quality metrics (enhanced)
        "avg_groundedness_score": avg_groundedness,
        "avg_citation_accuracy": avg_citation_accuracy,
        "groundedness_method": ("llm_evaluation" if OPENROUTER_API_KEY else "token_overlap_fallback"),
        # Additional insights
        "grounded_responses": sum(groundedness_scores),
        "ungrounded_responses": (len(groundedness_scores) - sum(groundedness_scores) if groundedness_scores else 0),
        "perfect_citations": len([c for c in citation_accuracies if c == 1.0]),
        "no_citations": len([c for c in citation_accuracies if c == 0.0]),
    }

    # Save enhanced results
    output = {
        "summary": summary,
        "results": results,
        "metadata": {
            "evaluation_timestamp": time.time(),
            "evaluation_version": "enhanced_v1.0",
            "groundedness_model": (GROUNDEDNESS_MODEL if OPENROUTER_API_KEY else "token_overlap"),
            "target_endpoint": target + CHAT_ENDPOINT,
        },
    }

    # Save to evaluation directory and a centralized evaluation_results folder
    with open(OUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    # Also write a copy into evaluation_results for CI aggregation
    try:
        out_summary_path = os.path.join(EVAL_RESULTS_DIR, "enhanced_results_summary.json")
        with open(out_summary_path, "w", encoding="utf-8") as f2:
            json.dump(output["summary"], f2, indent=2)
    except Exception:
        pass

    print("\nEnhanced Evaluation Complete!")
    print("=" * 50)
    print(json.dumps(summary, indent=2))
    print(f"\nDetailed results saved to {OUT_FILE}")

    return output


if __name__ == "__main__":
    target = os.getenv("EVAL_TARGET_URL", TARGET_URL)
    run_enhanced_evaluation(target)