"""Module C: Generation Utility — measures improvement in AI responses when using the document."""

from __future__ import annotations
import json
import anthropic
from kvl.ingestor import Document

# User-prompt template for question generation. Filled in with ``str.format``:
# ``{n}`` is the number of questions requested and ``{document}`` is the
# document text. The model is asked to reply with a bare JSON array of strings.
_QA_GEN_PROMPT = """Generate {n} specific questions whose answers can ONLY be found in this document (not general knowledge).
Focus on questions about specific findings, methods, results, or recommendations in the document.

Return ONLY a JSON array of question strings.

Document:
{document}"""

# User-prompt template for the judge call. Filled in with ``str.format``:
# ``{question}``, ``{baseline}`` and ``{rag_answer}`` are substituted, while the
# doubled braces on the last line are escapes that render as literal ``{`` / ``}``
# in the JSON shape shown to the model.
_JUDGE_PROMPT = """Compare two AI-generated answers to the same question. One is generated without any document context (baseline), the other with a relevant document as context (RAG).

Question: {question}

Baseline answer (no document): {baseline}

RAG answer (with document context): {rag_answer}

Rate the RAG answer improvement over baseline on these criteria (1-5 each):
- accuracy: Is the RAG answer more factually correct?
- completeness: Does the RAG answer cover more relevant aspects?
- specificity: Is the RAG answer more specific and precise?

Also provide an overall improvement score (0-100) where:
- 0 = RAG answer is no better or worse than baseline
- 50 = moderate improvement
- 100 = dramatically better, document was essential

Return ONLY JSON: {{"accuracy": int, "completeness": int, "specificity": int, "improvement": int, "reason": "one sentence"}}"""


def _call_claude(client: anthropic.Anthropic, system: str, user: str, model: str = "claude-sonnet-4-6") -> str:
    """Send one single-turn request and return the reply text.

    Args:
        client: Object whose ``messages.create`` method performs the request.
        system: System prompt for the request.
        user: Content of the single user message.
        model: Model identifier passed through unchanged.

    Returns:
        The text of every text-bearing content block in the reply,
        concatenated in order and stripped of surrounding whitespace.
        An empty string when the reply contains no text blocks.
    """
    msg = client.messages.create(
        model=model,
        max_tokens=1024,
        messages=[{"role": "user", "content": user}],
        system=system,
    )
    # Do not assume ``content[0]`` exists or is a text block: the content list
    # may be empty or may lead with a non-text block, either of which would
    # make ``content[0].text`` raise. Keep only blocks that carry text.
    texts = (getattr(block, "text", None) for block in (msg.content or ()))
    return "".join(t for t in texts if isinstance(t, str)).strip()


def _generate_questions(client: anthropic.Anthropic, doc: Document, n: int = 8) -> list[str]:
    """Ask the model for up to *n* document-specific evaluation questions.

    Only the first 5000 whitespace-separated words of ``doc.raw`` are sent.

    Args:
        client: Client passed through to ``_call_claude``.
        doc: Document whose ``raw`` text the questions are generated from.
        n: Maximum number of questions to return.

    Returns:
        A list of at most *n* non-empty question strings. Empty when *n* is
        not positive or when the reply is not a JSON array.
    """
    if n <= 0:
        # Nothing was asked for; skip the API call entirely.
        return []

    text = " ".join(doc.raw.split()[:5000])
    raw = _call_claude(
        client,
        "You generate evaluation questions for knowledge documents.",
        _QA_GEN_PROMPT.format(n=n, document=text),
        model="claude-haiku-4-5-20251001",
    )
    raw = raw.strip()
    if raw.startswith("```"):
        # Drop the opening fence line (e.g. "```json") and anything from the
        # closing fence onwards.
        raw = "\n".join(raw.split("\n")[1:])
        raw = raw.rsplit("```", 1)[0]
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return []
    # Valid JSON is not necessarily an array: iterating a dict would return
    # its keys as "questions", and iterating a number would raise.
    if not isinstance(parsed, list):
        return []
    questions = [q.strip() for q in parsed if isinstance(q, str) and q.strip()]
    return questions[:n]


def _baseline_answer(client: anthropic.Anthropic, question: str) -> str:
    """Answer *question* without any document context.

    The question is sent as the user message, together with a system prompt
    that restricts the model to its pre-trained knowledge.
    """
    system_prompt = "Answer based only on your pre-trained knowledge. Be honest if you don't know."
    answer = _call_claude(client, system_prompt, question, model="claude-haiku-4-5-20251001")
    return answer


def _rag_answer(client: anthropic.Anthropic, question: str, context: str) -> str:
    """Answer *question* with *context* supplied as document context.

    The user message contains the context followed by the question; the
    system prompt instructs the model to answer from that context.
    """
    system_prompt = (
        "Answer the question using the provided document context. Be specific and cite details from the document."
    )
    user_prompt = f"Document context:\n{context}\n\nQuestion: {question}"
    answer = _call_claude(client, system_prompt, user_prompt, model="claude-haiku-4-5-20251001")
    return answer


def _judge(client: anthropic.Anthropic, question: str, baseline: str, rag: str) -> dict:
    """Ask the model to rate the RAG answer against the baseline answer.

    Args:
        client: Client passed through to ``_call_claude``.
        question: The question both answers respond to.
        baseline: Answer produced without document context.
        rag: Answer produced with document context.

    Returns:
        The parsed JSON object from the judge reply. When the reply is not
        valid JSON, or is valid JSON but not an object, a neutral fallback
        is returned (3 for each criterion, 50 improvement, reason
        ``"Parse error."``).
    """
    fallback = {"accuracy": 3, "completeness": 3, "specificity": 3, "improvement": 50, "reason": "Parse error."}
    raw = _call_claude(
        client,
        "You are an expert evaluator comparing AI answer quality.",
        _JUDGE_PROMPT.format(question=question, baseline=baseline, rag_answer=rag),
    )
    raw = raw.strip()
    if raw.startswith("```"):
        # Drop the opening fence line (e.g. "```json") and anything from the
        # closing fence onwards.
        raw = "\n".join(raw.split("\n")[1:])
        raw = raw.rsplit("```", 1)[0]
    try:
        verdict = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    # A JSON list, number or string parses fine but breaks callers that use
    # ``.get`` on the result; treat it the same as an unparseable reply.
    if not isinstance(verdict, dict):
        return fallback
    return verdict


def _coerce_score(value, default, low, high):
    """Return *value* clamped to ``[low, high]``, or *default* if it is not a usable number."""
    if isinstance(value, bool):
        # bool is an int subclass, but True/False is not a meaningful rating.
        return default
    if isinstance(value, str):
        try:
            value = float(value)
        except ValueError:
            return default
    if not isinstance(value, (int, float)) or value != value:  # value != value: NaN
        return default
    return min(max(value, low), high)


def evaluate(client: anthropic.Anthropic, doc: Document, progress_cb=None, max_workers: int = 6) -> dict:
    """Return generation utility score (0-100) and per-question details.

    Questions are generated from the document, each is answered once without
    and once with the document as context, and a judge call rates the
    improvement. The score is the rounded mean of the per-question
    improvement ratings.

    Args:
        client: Client used for every model call.
        doc: Document under evaluation; its ``raw`` text is used.
        progress_cb: Optional callable invoked with a status string before
            question generation and before the parallel evaluation.
        max_workers: Thread-pool size for the per-question evaluation.

    Returns:
        A dict with ``"score"`` (int), ``"details"`` (one dict per question)
        and ``"summary"`` (str). When no questions could be generated the
        score is 50 and ``"details"`` is empty.
    """
    if progress_cb:
        progress_cb("Generating evaluation questions...")

    questions = _generate_questions(client, doc)
    if not questions:
        return {"score": 50, "details": [], "summary": "Could not generate test questions."}

    # Use the document as RAG context, capped for token limits. The cap matches
    # the 5000-word window `_generate_questions` reads, so every generated
    # question is answerable from the context; a shorter cap would penalise the
    # document for questions drawn from text the RAG call never saw.
    context = " ".join(doc.raw.split()[:5000])

    def _evaluate_question(question):
        baseline = _baseline_answer(client, question)
        rag = _rag_answer(client, question, context)
        judgment = _judge(client, question, baseline, rag)
        # The judge reply is model-generated: ratings may be missing, strings,
        # null or out of range. Normalise so the average below cannot crash or
        # be skewed.
        return {
            "question": question,
            "baseline_answer": baseline,
            "rag_answer": rag,
            "accuracy": _coerce_score(judgment.get("accuracy", 3), 3, 1, 5),
            "completeness": _coerce_score(judgment.get("completeness", 3), 3, 1, 5),
            "specificity": _coerce_score(judgment.get("specificity", 3), 3, 1, 5),
            "improvement": _coerce_score(judgment.get("improvement", 50), 50, 0, 100),
            "reason": judgment.get("reason", ""),
        }

    if progress_cb:
        progress_cb(f"Evaluating {len(questions)} questions in parallel (baseline vs RAG)...")

    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() preserves question order; an exception raised in a worker is
        # re-raised here when its result is consumed.
        results = list(pool.map(_evaluate_question, questions))

    improvement_scores = [r["improvement"] for r in results]

    avg_improvement = sum(improvement_scores) / len(improvement_scores)
    score = round(avg_improvement)

    return {
        "score": score,
        "details": results,
        "summary": f"Average RAG improvement: {score}/100 across {len(questions)} questions.",
    }