"""
Evaluation — LLM-as-a-Judge framework.

Uses gpt-4o-mini to evaluate the quality of responses generated by cheaper models.
Only runs when explicitly requested (evaluate=True in generate()).
"""
from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prompt template sent to the judge model. It is filled in with
# ``str.format(query=..., response=...)``, so ``{query}`` and ``{response}``
# are placeholders, while the doubled braces (``{{`` / ``}}``) are escapes
# that render as literal ``{`` / ``}`` around the JSON example.
_JUDGE_PROMPT = """You are an objective AI quality evaluator. 

A user asked:
"{query}"

An AI assistant responded:
"{response}"

Please evaluate this response on the following criteria and provide a score from 1-10 for each:
1. **Accuracy**: Is the information correct and factually accurate?
2. **Completeness**: Does it fully answer the question?
3. **Clarity**: Is it clear and easy to understand?
4. **Conciseness**: Is it appropriately concise without being too brief?

Also provide an **Overall Score** from 1-10.

Respond ONLY in this exact JSON format:
{{
  "accuracy": <1-10>,
  "completeness": <1-10>,
  "clarity": <1-10>,
  "conciseness": <1-10>,
  "overall": <1-10>,
  "feedback": "<one sentence summary of the main strength or weakness>"
}}"""


@dataclass
class EvaluationResult:
    """
    Scores a judge model assigned to a single query/response pair.

    The five numeric fields hold the per-criterion and overall scores,
    ``feedback`` holds the judge's short textual summary, and
    ``judge_model`` names the model that produced the scores.
    """

    accuracy: float
    completeness: float
    clarity: float
    conciseness: float
    overall: float
    feedback: str
    judge_model: str

    def to_dict(self) -> dict:
        """Return a shallow copy of the instance attributes as a plain dict."""
        return dict(vars(self))

    def __str__(self) -> str:
        """Render a multi-line, human-readable report of the scores."""
        report_lines = [
            f"Evaluation (judge={self.judge_model})",
            f"  Overall    : {self.overall}/10",
            f"  Accuracy   : {self.accuracy}/10",
            f"  Completeness: {self.completeness}/10",
            f"  Clarity    : {self.clarity}/10",
            f"  Conciseness: {self.conciseness}/10",
            f"  Feedback   : {self.feedback}",
        ]
        return "\n".join(report_lines)


class LLMJudge:
    """
    Evaluates LLM responses using a judge model (default: gpt-4o-mini).

    The judge is disabled at construction time when ``litellm`` cannot be
    imported. Any failure while evaluating (for example a failed API call or
    an unparsable reply) is logged as a warning and reported as ``None``
    rather than raised.
    """

    def __init__(self, judge_model: str = "gpt-4o-mini"):
        """
        Args:
            judge_model: Model name passed to ``litellm.completion``.
        """
        self.judge_model = judge_model
        self.enabled = True
        try:
            # Imported only to probe availability; the name itself is unused.
            import litellm  # type: ignore # noqa: F401
        except ImportError:
            logger.warning("LLMJudge: litellm not installed. Evaluation disabled.")
            self.enabled = False

    @staticmethod
    def _extract_json_object(text: str) -> str:
        """
        Return the part of ``text`` most likely to be the JSON object.

        Handles replies wrapped in a markdown code fence (with any language
        tag, in any letter case) as well as replies that surround the object
        with prose. When no ``{...}`` span is found the candidate text is
        returned unchanged so the JSON parser reports the error.
        """
        candidate = text.strip()
        if "```" in candidate:
            # Keep only what sits between the first pair of fences.
            candidate = candidate.split("```")[1]

        # Slicing to the outermost braces drops a fence language tag
        # ("json", "JSON", ...) and any prose around the object.
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start != -1 and end > start:
            candidate = candidate[start:end + 1]
        return candidate

    def evaluate(self, query: str, response: str) -> Optional[EvaluationResult]:
        """
        Evaluates a query-response pair using the judge model.

        Args:
            query: The user's original question.
            response: The assistant answer to be scored.

        Returns:
            An ``EvaluationResult`` built from the judge's JSON reply, with
            missing scores defaulting to 0 and missing feedback to "".
            Returns None if evaluation fails or is disabled.
        """
        if not self.enabled:
            return None

        try:
            import json
            import litellm  # type: ignore

            prompt = _JUDGE_PROMPT.format(query=query, response=response)
            result = litellm.completion(
                model=self.judge_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=300,
            )

            content = result.choices[0].message.content  # type: ignore
            if content is None:
                # Avoid parsing the literal string "None" and logging an
                # obscure JSON error.
                raise ValueError("judge model returned no message content")

            scores = json.loads(self._extract_json_object(str(content)))
            if not isinstance(scores, dict):
                raise ValueError("judge reply is not a JSON object")

            return EvaluationResult(
                accuracy=float(scores.get("accuracy", 0)),
                completeness=float(scores.get("completeness", 0)),
                clarity=float(scores.get("clarity", 0)),
                conciseness=float(scores.get("conciseness", 0)),
                overall=float(scores.get("overall", 0)),
                feedback=scores.get("feedback", ""),
                judge_model=self.judge_model,
            )

        except Exception as e:
            # Deliberate best-effort boundary: evaluation is optional, so any
            # failure is logged and reported as None instead of propagating.
            logger.warning("LLMJudge: Evaluation failed: %s", e)
            return None