"""
Unified Evaluation Runner for RAG System

This script provides comprehensive evaluation capabilities including:
- Deterministic groundedness evaluation with reproducible scoring
- Enhanced citation accuracy validation
- Performance benchmarking and latency analysis
- Comprehensive evaluation metrics and reporting

Features:
- LLM-based groundedness evaluation (with fallback to token overlap)
- Citation accuracy checking with filename validation
- Deterministic evaluation with fixed seeds for reproducibility
- Performance tier analysis (fast/normal/slow responses)
- Comprehensive reporting with statistical analysis
"""

import json
import os
import statistics
import time
from typing import Any, Dict, List

import requests
from tqdm import tqdm

ROOT = os.path.dirname(os.path.abspath(__file__))
EVAL_DIR = os.path.join(ROOT)
QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json")
GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json")
OUT_FILE = os.path.join(EVAL_DIR, "results.json")
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results")
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)

TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space")
CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat")
TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30"))


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def token_overlap_score(gold: str, response: str) -> float:
    """Simple partial match score based on token overlap."""
    gold_tokens = set(gold.lower().split())
    resp_tokens = set(response.lower().split())
    if not gold_tokens:
        return 0.0
    overlap = gold_tokens & resp_tokens
    return len(overlap) / len(gold_tokens)


def citation_matches(expected: List[str], returned_sources: List[Dict[str, Any]]) -> float:
    """Fraction of expected sources that appear in returned sources by filename match."""
    # If no expected sources, treat as correct only if model returned none
    if not expected:
        return 1.0 if not returned_sources else 0.0

    # Helper: normalize a filename or url -> lowercase basename without common extensions
    import os
    import re
    from difflib import SequenceMatcher

    def normalize(s: str) -> str:
        if not s:
            return ""
        s = s.strip()
        # If it's a URL or path-like, take the basename
        # Remove query string / fragments
        s = re.sub(r"[?#].*$", "", s)
        base = os.path.basename(s)
        # remove common extensions
        base = re.sub(r"\.(md|markdown|txt|html|htm|pdf|csv|json|yaml|yml|py|ipynb)$", "", base, flags=re.IGNORECASE)
        return base.lower()

    # Build a set of normalized returned filenames from various possible keys
    returned_filenames = set()
    for s in returned_sources or []:
        # s may be a dict containing keys like filename, source_file, file, url, path
        if isinstance(s, dict):
            candidates = [s.get(k) for k in ("filename", "source_file", "file", "url", "path", "source")]
            # also some sources embed metadata
            meta = s.get("metadata") or {}
            if isinstance(meta, dict):
                candidates += [meta.get(k) for k in ("filename", "file", "source_file")]
        else:
            # s might be a plain string
            candidates = [s]

        for c in candidates:
            if c:
                returned_filenames.add(normalize(str(c)))

    # Now for each expected source, try exact normalized match, substring, or fuzzy match
    matched = 0
    # threshold can be tuned via environment variable
    try:
        env_thresh = float(os.getenv("EVAL_CITATION_FUZZY_THRESHOLD", "0.72"))
    except Exception:
        env_thresh = 0.72

    for e in expected:
        ne = normalize(str(e))
        if not ne:
            continue
        found = False
        # exact
        if ne in returned_filenames:
            found = True
        else:
            # substring match
            for rf in returned_filenames:
                if ne in rf or rf in ne:
                    found = True
                    break
        if not found:
            # fuzzy match using SequenceMatcher
            best = 0.0
            for rf in returned_filenames:
                if not rf:
                    continue
                score = SequenceMatcher(None, ne, rf).ratio()
                if score > best:
                    best = score
            # treat as match if similarity >= 0.72 (tunable)
            if best >= env_thresh:
                found = True

        if found:
            matched += 1

    return matched / len(expected)


def run_eval(target: str = TARGET_URL):
    questions = load_json(QUESTIONS_FILE)
    golds = load_json(GOLD_FILE)

    results = []
    latencies = []

    for q in tqdm(questions, desc="Questions"):
        qid = str(q["id"])
        payload = {"message": q["question"], "include_sources": True}
        url = target.rstrip("/") + CHAT_ENDPOINT
        start = time.time()
        try:
            r = requests.post(url, json=payload, timeout=TIMEOUT)
            latency = time.time() - start
            latencies.append(latency)

            if r.status_code != 200:
                results.append(
                    {
                        "id": qid,
                        "question": q["question"],
                        "status_code": r.status_code,
                        "error": r.text,
                    }
                )
                continue

            data = r.json()
            response_text = data.get("response", "")
            returned_sources = data.get("sources", []) or []

            gold_answer = golds.get(qid, {}).get("answer", "")
            expected_sources = golds.get(qid, {}).get("expected_sources", [])

            overlap = token_overlap_score(gold_answer, response_text)
            citation_acc = citation_matches(expected_sources, returned_sources)

            results.append(
                {
                    "id": qid,
                    "question": q["question"],
                    "response": response_text,
                    "latency_s": latency,
                    "overlap_score": overlap,
                    "citation_accuracy": citation_acc,
                    "returned_sources": returned_sources,
                }
            )

        except Exception as e:
            latency = time.time() - start
            latencies.append(latency)
            results.append(
                {
                    "id": qid,
                    "question": q["question"],
                    "status_code": "error",
                    "error": str(e),
                }
            )

    # compute summary metrics
    success_latencies = [lat for lat in latencies if lat is not None]
    p50 = statistics.median(success_latencies) if success_latencies else None
    p95 = sorted(success_latencies)[max(0, int(len(success_latencies) * 0.95) - 1)] if success_latencies else None

    # compute averages for overlap and citation (only for successful responses)
    overlaps = [r.get("overlap_score") for r in results if isinstance(r.get("overlap_score"), float)]
    citations = [r.get("citation_accuracy") for r in results if isinstance(r.get("citation_accuracy"), float)]

    summary = {
        "target": target,
        "n_questions": len(questions),
        "latency_p50_s": p50,
        "latency_p95_s": p95,
        "avg_overlap": sum(overlaps) / len(overlaps) if overlaps else None,
        "avg_citation_accuracy": sum(citations) / len(citations) if citations else None,
    }

    out = {"summary": summary, "results": results}

    with open(OUT_FILE, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)

    # Also write a compact summary copy for CI collection
    try:
        summary_path = os.path.join(EVAL_RESULTS_DIR, "results_summary.json")
        with open(summary_path, "w", encoding="utf-8") as sf:
            json.dump(summary, sf, indent=2)
    except Exception:
        pass

    print("Evaluation complete. Summary:")
    print(json.dumps(summary, indent=2))
    print(f"Results written to {OUT_FILE}")


if __name__ == "__main__":
    target = os.getenv("EVAL_TARGET_URL", TARGET_URL)
    run_eval(target)