"""Telecom RAG - Evaluation Module

Implements RAGAS-style evaluation metrics for hallucination detection:
- Faithfulness scoring (is answer grounded in context?)
- Answer relevancy (does answer address the question?)
- Abstention logic (refuse when confidence is low)
"""

import re
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass

from .config import (
    LLM_PROVIDER,
    OPENAI_API_KEY,
    GOOGLE_API_KEY,
    OPENAI_MODEL,
    GEMINI_MODEL
)


@dataclass
class EvaluationResult:
    """Evaluation metrics for a RAG response.

    Attributes:
        faithfulness_score: 0-1, how grounded the answer is in the context.
        relevancy_score: 0-1, how relevant the answer is to the question.
        confidence_score: 0-1, combined confidence.
        should_abstain: True if the confidence is too low.
        abstention_reason: Why the system abstains (if applicable).
        claims: Claims extracted from the answer.
        supported_claims: Number of claims supported by the context.
        total_claims: Total number of claims in the answer.
        context_precision: Relevant chunks / total chunks (target: >0.70);
            context quality metric per architecture doc Section 5.1.
        context_recall: Covered claims / total claims (target: >0.85);
            context quality metric per architecture doc Section 5.1.
        trust_score: Combined reliability metric (TLM trust metrics,
            Section 5.2).
        consistency_score: 0-1 self-consistency agreement (TLM trust
            metrics, Section 5.2).
    """

    # Required metrics
    faithfulness_score: float
    relevancy_score: float
    confidence_score: float
    should_abstain: bool
    abstention_reason: str
    claims: List[str]
    supported_claims: int
    total_claims: int

    # Optional metrics, all defaulting to 0.0
    context_precision: float = 0.0
    context_recall: float = 0.0
    trust_score: float = 0.0
    consistency_score: float = 0.0


class RAGEvaluator:
    """
    Evaluates RAG responses for faithfulness and relevancy.
    Implements abstention logic for low-confidence answers.

    Scores come from keyword-overlap heuristics by default. When an LLM
    client could be constructed, ``evaluate(..., use_llm=True)`` asks it to
    grade faithfulness and relevancy instead.
    """

    # Thresholds tuned for telecom domain with quality built-in KB
    FAITHFULNESS_THRESHOLD = 0.8   # Flag for review if below
    ABSTENTION_THRESHOLD = 0.3     # Refuse only for very low confidence
    MIN_SIMILARITY_THRESHOLD = 0.2 # Allow lower similarity since domain-specific
    HIGH_RELEVANCE_THRESHOLD = 0.5 # Similarity above this counts a chunk as relevant

    # Sentence boundary: a terminator followed by whitespace. Requiring the
    # whitespace keeps decimals and spec numbers ("0.5 ms", "TS 38.211") intact.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
    # Markers that make a sentence look factual (numbers, acronyms, definitions)
    _FACTUAL_MARKER = re.compile(r'\d|[A-Z]{2,}|specifically|defined|means|refers to')
    # Lower-case words of 3+ letters / upper-case acronyms of 2-6 letters
    _WORD = re.compile(r'\b[a-z]{3,}\b')
    _ACRONYM = re.compile(r'\b[A-Z]{2,6}\b')
    # A standalone number in [0, 1]: "0", "0.85", "1", "1.0", ".8". The
    # look-arounds reject digits embedded in larger tokens ("38.101", "10").
    _SCORE = re.compile(r'(?<![\w.])(?:0(?:\.\d+)?|1(?:\.0+)?|\.\d+)(?!\.?\d)(?!\w)')
    # Question words ignored when measuring question/answer overlap
    _QUESTION_STOPWORDS = frozenset(
        {'what', 'how', 'why', 'when', 'where', 'which', 'the', 'and', 'for'}
    )
    # Maximum number of context characters sent to the LLM grader
    _MAX_PROMPT_CONTEXT_CHARS = 2000
    # Score used when the LLM call fails or its reply contains no usable score
    _LLM_FALLBACK_SCORE = 0.5

    def __init__(self):
        """Create the evaluator and, if an LLM is configured, its client."""
        self.llm = None
        self.llm_available = self._check_llm()
        if self.llm_available:
            try:
                # Imported lazily so the heuristic path works without the LLM module
                from .llm import TelecomLLM
                self.llm = TelecomLLM()
            except Exception as e:
                print(f"⚠️ Failed to init LLM for eval: {e}")
                # Keep the flag consistent with self.llm being None
                self.llm_available = False

    def _check_llm(self) -> bool:
        """Check if LLM is available for evaluation.

        Returns:
            True when the configured provider has an API key that is set and
            is not the placeholder value; False otherwise.
        """
        if LLM_PROVIDER == "openai":
            return bool(OPENAI_API_KEY) and OPENAI_API_KEY != "your_openai_api_key_here"
        if LLM_PROVIDER == "gemini":
            return bool(GOOGLE_API_KEY) and GOOGLE_API_KEY != "your_google_api_key_here"
        return False

    def extract_claims(self, answer: str) -> List[str]:
        """
        Extract factual claims from an answer.
        Simple heuristic: split by sentences and filter.

        A sentence is kept when it is longer than 20 characters, does not
        start with "I ", "We " or "You ", and contains a factual marker
        (a digit, an upper-case run, or a definitional keyword).

        Args:
            answer: Generated answer text

        Returns:
            The kept sentences, without their trailing punctuation
        """
        claims = []
        for raw_sentence in self._SENTENCE_BOUNDARY.split(answer):
            sent = raw_sentence.strip().rstrip('.!?').strip()
            # Filter out very short or non-factual sentences
            if len(sent) <= 20 or sent.startswith(('I ', 'We ', 'You ')):
                continue
            if self._FACTUAL_MARKER.search(sent):
                claims.append(sent)
        return claims

    def check_claim_support(self, claim: str, context: str) -> bool:
        """
        Check if a claim is supported by the context.
        Uses simple keyword/phrase overlap heuristic.

        Words are matched case-insensitively and acronyms case-sensitively,
        both by substring containment (so "error" also matches "errors").
        Acronyms carry double weight in the ratio.

        Args:
            claim: One claim extracted from the answer
            context: Retrieved context text

        Returns:
            True if the weighted share of matched terms exceeds 0.3, or if
            the claim contains no terms to verify
        """
        terms = self._WORD.findall(claim.lower())
        technical_terms = self._ACRONYM.findall(claim)
        if not terms and not technical_terms:
            return True  # No specific claims to verify

        context_lower = context.lower()
        term_matches = sum(1 for t in terms if t in context_lower)
        tech_matches = sum(1 for t in technical_terms if t in context)

        weighted_hits = term_matches + 2 * tech_matches
        weighted_total = len(terms) + 2 * len(technical_terms)
        return weighted_hits / weighted_total > 0.3

    def calculate_faithfulness(
        self,
        answer: str,
        context: str
    ) -> Tuple[float, List[str], int, int]:
        """
        Calculate faithfulness score (Heuristic).
        Measures how grounded the answer is in the provided context.

        Args:
            answer: Generated answer
            context: Retrieved context used for generation

        Returns:
            Tuple of (score, claims, supported claim count, total claim
            count). The score is supported / total, or 1.0 when the answer
            contains no claims.
        """
        claims = self.extract_claims(answer)
        if not claims:
            return 1.0, [], 0, 0  # No claims = faithful by default

        supported = sum(
            1 for claim in claims if self.check_claim_support(claim, context)
        )
        return supported / len(claims), claims, supported, len(claims)

    def _score_from_llm(self, prompt: str, metric: str) -> float:
        """Send a grading prompt to the LLM and parse a 0-1 score from the reply.

        Args:
            prompt: Full grading prompt
            metric: Metric name used in the failure message

        Returns:
            The first standalone number in [0, 1] found in the reply, or the
            fallback score when the call raises or no such number is present
        """
        try:
            response = self.llm.simple_generate(prompt).strip()
        except Exception as e:
            print(f"⚠️ LLM {metric} failed: {e}")
            return self._LLM_FALLBACK_SCORE

        match = self._SCORE.search(response)
        return float(match.group()) if match else self._LLM_FALLBACK_SCORE

    def calculate_llm_faithfulness(self, answer: str, context: str) -> float:
        """
        Calculate faithfulness using LLM (More accurate).

        Args:
            answer: Generated answer
            context: Retrieved context; only its first 2000 characters are
                sent to the LLM

        Returns:
            Score parsed from the LLM reply, 0.5 if the call fails or the
            reply has no usable score, 0.0 if no LLM client is available
        """
        if not self.llm:
            return 0.0

        limit = self._MAX_PROMPT_CONTEXT_CHARS
        # Only mark the excerpt as truncated when something was actually cut
        context_excerpt = context[:limit] + ("..." if len(context) > limit else "")

        prompt = f"""Rate the faithfulness of the answer to the context on a scale of 0.0 to 1.0.
Faithfulness measures if the answer is derived solely from the context given.
Return ONLY the float score.

Context:
{context_excerpt}

Answer:
{answer}

Score:"""
        return self._score_from_llm(prompt, "Faithfulness")

    def calculate_relevancy(self, question: str, answer: str) -> float:
        """
        Calculate how relevant the answer is to the question.
        Uses keyword overlap heuristic.

        Args:
            question: User's question
            answer: Generated answer

        Returns:
            Share of question terms found in the answer (acronyms weighted
            double), capped at 1.0; 1.0 when the question has no terms left
            after removing common question words
        """
        question_terms = set(self._WORD.findall(question.lower())) - self._QUESTION_STOPWORDS
        question_tech = set(self._ACRONYM.findall(question))
        if not question_terms and not question_tech:
            return 1.0

        answer_terms = set(self._WORD.findall(answer.lower()))
        answer_tech = set(self._ACRONYM.findall(answer))

        # Weight technical terms higher
        overlap = len(question_terms & answer_terms) + 2 * len(question_tech & answer_tech)
        total_question = len(question_terms) + len(question_tech)
        return min(1.0, overlap / total_question)

    def calculate_llm_relevancy(self, question: str, answer: str) -> float:
        """
        Calculate relevancy using LLM.

        Args:
            question: User's question
            answer: Generated answer

        Returns:
            Score parsed from the LLM reply, 0.5 if the call fails or the
            reply has no usable score, 0.0 if no LLM client is available
        """
        if not self.llm:
            return 0.0

        prompt = f"""Rate the relevancy of the answer to the question on a scale of 0.0 to 1.0.
Relevancy measures if the answer actually answers the question asked.
Return ONLY the float score.

Question:
{question}

Answer:
{answer}

Score:"""
        return self._score_from_llm(prompt, "Relevancy")

    def calculate_retrieval_confidence(
        self,
        similarity_scores: List[float]
    ) -> float:
        """
        Calculate confidence based on retrieval quality.
        Uses average of top similarity scores.

        Args:
            similarity_scores: Similarity scores from retrieval

        Returns:
            Mean of the three highest scores clamped to [0, 1], or 0.0 when
            there are no scores
        """
        if not similarity_scores:
            return 0.0

        # Use top 3 scores
        top_scores = sorted(similarity_scores, reverse=True)[:3]
        avg_score = sum(top_scores) / len(top_scores)

        # Report the real average even for poor retrievals: evaluate()
        # compares this value against MIN_SIMILARITY_THRESHOLD, so a fixed
        # floor above that threshold would prevent abstention exactly when
        # every retrieved chunk is a bad match.
        return min(1.0, max(0.0, avg_score))

    def evaluate(
        self,
        question: str,
        answer: str,
        context: str,
        similarity_scores: List[float],
        use_llm: bool = False
    ) -> "EvaluationResult":
        """
        Full evaluation of a RAG response.

        Args:
            question: User's question
            answer: Generated answer
            context: Retrieved context used for generation
            similarity_scores: Similarity scores from retrieval
            use_llm: Whether to use LLM for evaluation (slower, more accurate)

        Returns:
            EvaluationResult with all metrics
        """
        # The heuristic claim analysis always runs: its claims and counts are
        # reported (and feed context recall) even when the LLM supplies scores.
        heuristic_faithfulness, claims, supported, total = self.calculate_faithfulness(
            answer, context
        )

        if use_llm and self.llm:
            faithfulness = self.calculate_llm_faithfulness(answer, context)
            relevancy = self.calculate_llm_relevancy(question, answer)
        else:
            faithfulness = heuristic_faithfulness
            relevancy = self.calculate_relevancy(question, answer)

        # Calculate retrieval confidence
        retrieval_confidence = self.calculate_retrieval_confidence(similarity_scores)

        # Combined confidence score (weighted average)
        confidence = (
            faithfulness * 0.4 +
            relevancy * 0.3 +
            retrieval_confidence * 0.3
        )

        # Determine abstention; the first failing check supplies the reason
        should_abstain = True
        if retrieval_confidence < self.MIN_SIMILARITY_THRESHOLD:
            abstention_reason = "Retrieved documents have low relevance to the question"
        elif faithfulness < self.ABSTENTION_THRESHOLD:
            abstention_reason = "Answer may not be fully grounded in available information"
        elif confidence < self.ABSTENTION_THRESHOLD:
            abstention_reason = "Insufficient confidence to provide a reliable answer"
        else:
            should_abstain = False
            abstention_reason = ""

        # Calculate context precision (relevant chunks / total)
        # Using similarity scores as proxy for relevance
        high_relevance_count = sum(
            1 for s in similarity_scores if s > self.HIGH_RELEVANCE_THRESHOLD
        )
        context_precision = (
            high_relevance_count / len(similarity_scores) if similarity_scores else 0.0
        )

        # Calculate context recall (supported claims / total claims)
        # Using heuristic supported count even in LLM mode for now as proxy
        context_recall = supported / total if total > 0 else 1.0

        # Calculate Reliability/Trust Score (Section 5.2 TLM)
        # Weighted average of key metrics:
        # - Faithfulness (40%): Is it true?
        # - Relevancy (30%): Is it useful?
        # - Ctx Precision (20%): Was retrieval good?
        # - Confidence (10%): Does model feel sure?
        trust_score = (faithfulness * 0.4) + (relevancy * 0.3) + (context_precision * 0.2) + (confidence * 0.1)

        return EvaluationResult(
            faithfulness_score=faithfulness,
            relevancy_score=relevancy,
            confidence_score=confidence,
            should_abstain=should_abstain,
            abstention_reason=abstention_reason,
            claims=claims,
            supported_claims=supported,
            total_claims=total,
            context_precision=context_precision,
            context_recall=context_recall,
            trust_score=trust_score,
            consistency_score=1.0  # Placeholder: Requires multi-generation logic
        )

    def get_abstention_message(self, reason: str) -> str:
        """Generate a polite abstention message.

        Args:
            reason: Explanation inserted into the message body

        Returns:
            Markdown-formatted message including the reason and suggested
            next steps
        """
        return f"""⚠️ **Unable to provide a confident answer**

{reason}

**What you can do:**
- Try rephrasing your question with more specific terms
- Check if the topic is covered in the knowledge base
- Consult official 3GPP documentation for authoritative information

*This response was withheld because the system could not verify the accuracy of the answer based on available sources.*"""


# Module-wide evaluator, created on first request
_evaluator = None


def get_evaluator() -> RAGEvaluator:
    """Return the shared RAGEvaluator, constructing it on the first call."""
    global _evaluator
    if _evaluator is not None:
        return _evaluator
    _evaluator = RAGEvaluator()
    return _evaluator


if __name__ == "__main__":
    # Manual smoke test: score one hand-written HARQ sample with the
    # heuristic path (use_llm left at its default) and print the key metrics.
    result = RAGEvaluator().evaluate(
        question="What is HARQ in 5G NR?",
        answer="HARQ (Hybrid Automatic Repeat Request) is a error correction mechanism in 5G NR that combines forward error correction with retransmission. It uses soft combining to improve reliability.",
        context="HARQ (Hybrid Automatic Repeat Request) is a combination of high-rate forward error correction (FEC) and ARQ error-control. In 5G NR, HARQ provides reliable data transmission by using incremental redundancy.",
        similarity_scores=[0.85, 0.72, 0.65],
    )

    print("\n📊 Evaluation Results:")
    print(f"  Faithfulness: {result.faithfulness_score:.2f}")
    print(f"  Relevancy: {result.relevancy_score:.2f}")
    print(f"  Confidence: {result.confidence_score:.2f}")
    print(f"  Should Abstain: {result.should_abstain}")
    print(f"  Claims: {result.total_claims} total, {result.supported_claims} supported")