"""
reply_grader.py — Grades the quality of a drafted email reply.

This is the key Round 2 addition. Instead of only classifying emails,
the agent must also DRAFT A REPLY. This is much harder for LLMs and
far more useful for training — it tests tone, resolution, professionalism,
and context understanding simultaneously.

Score breakdown (all 0.0–1.0, weighted to produce final reply_score):
  - resolution_score  (0.40): Does the reply actually address the issue?
  - tone_score        (0.25): Is the tone appropriate for the category/priority?
  - completeness_score(0.20): Does it include required elements (greeting, sign-off, next steps)?
  - length_score      (0.10): Is it an appropriate length (not too short, not a wall of text)?
  - safety_score      (0.05): No promises the company can't keep, no admissions of liability?
"""
from __future__ import annotations

import re
from typing import Any, Dict, Optional, Tuple

# ── Per-category tone profiles ────────────────────────────────────────────────
# Every profile carries the same three keys:
#   "required"  – keywords a good reply in this category is expected to contain
#   "forbidden" – keywords that should not appear in a reply for this category
#   "style"     – label naming the register the reply should be written in

TONE_PROFILES: Dict[str, Dict[str, Any]] = {
    # No reply needed — the agent should skip/trash the message.
    "spam_phishing": {
        "required": [],
        "forbidden": [
            "congratulations",
            "prize",
            "winner",
            "claim",
            "urgent action",
        ],
        "style": "none",
    },
    # Complaints call for an apology and a path to resolution, never blame.
    "customer_complaint": {
        "required": [
            "apologize",
            "sorry",
            "understand",
            "resolve",
            "look into",
        ],
        "forbidden": [
            "your fault",
            "you should have",
            "not our problem",
            "impossible",
        ],
        "style": "empathetic",
    },
    # Billing replies should talk about the account and the money involved.
    "billing_inquiry": {
        "required": [
            "account",
            "payment",
            "invoice",
            "amount",
            "charge",
        ],
        "forbidden": [
            "ignore",
            "not our fault",
        ],
        "style": "professional",
    },
    # Support replies acknowledge the problem and offer help.
    "technical_support": {
        "required": [
            "issue",
            "problem",
            "help",
            "team",
            "investigate",
        ],
        "forbidden": [
            "cannot help",
            "not possible",
        ],
        "style": "helpful",
    },
    # Legal replies acknowledge receipt and avoid liability wording.
    "legal_compliance": {
        "required": [
            "received",
            "review",
            "team",
            "matter",
            "respond",
        ],
        "forbidden": [
            "admit",
            "liable",
            "fault",
            "guilty",
            "settlement",
        ],
        "style": "formal",
    },
    "internal_hr": {
        "required": [
            "team",
            "hr",
            "discuss",
            "meeting",
        ],
        "forbidden": [],
        "style": "professional",
    },
    "sales_lead": {
        "required": [
            "thank",
            "interest",
            "team",
            "discuss",
            "schedule",
        ],
        "forbidden": [],
        "style": "enthusiastic",
    },
    # Also serves as the fallback profile for categories not listed here.
    "general_inquiry": {
        "required": [
            "thank",
            "help",
            "answer",
            "question",
        ],
        "forbidden": [],
        "style": "friendly",
    },
}

# Structural parts a professional reply is expected to contain. Each entry maps
# the element name to a case-insensitive, whole-word pattern that detects it.
REQUIRED_ELEMENTS = {
    element: re.compile(pattern, re.IGNORECASE)
    for element, pattern in (
        (
            "greeting",
            r"\b(hi|hello|dear|good morning|good afternoon|thank you for)\b",
        ),
        (
            "sign_off",
            r"\b(regards|sincerely|best|thanks|thank you|warm regards)\b",
        ),
        (
            "next_steps",
            r"\b(will|shall|team|we'll|contact|reach out|follow up|review|look into|investigate|schedule|call)\b",
        ),
    )
}

# Safety: wording that exposes the company to legal or financial liability.
UNSAFE_PHRASES = [
    # Open-ended commitments made on the company's behalf
    "we guarantee", "we promise", "you will receive",
    # Admissions of responsibility
    "we admit", "our fault", "we are liable",
    # Financial promises
    "full refund", "immediately refund", "we will pay", "no questions asked",
]

# Acceptable reply length for each priority, as inclusive (min_words, max_words).
LENGTH_RANGE: Dict[str, Tuple[int, int]] = {
    "urgent": (50, 200),
    "high": (40, 250),
    "medium": (30, 300),
    "low": (20, 200),
    # Spam should ideally get no reply, or at most a very short rejection.
    "spam": (0, 30),
}


def _count_phrase_hits(phrases: list, text: str) -> int:
    """
    Count how many of *phrases* occur in *text* starting at a word boundary.

    Anchoring the start of each phrase stops short keywords from firing inside
    unrelated words ("hey" in "they", "liable" in "reliable", "our fault" in
    "your fault") while still accepting inflected forms such as "apologized"
    or "thanks". Each phrase counts at most once. *text* is expected to be
    lower-cased by the caller, since the phrases are lower-case.
    """
    return sum(
        1 for phrase in phrases
        if re.search(r"\b" + re.escape(phrase), text)
    )


def grade_reply(
    reply: str,
    category: str,
    priority: str,
    email_body: str,
    email_subject: str,
) -> Tuple[float, Dict[str, Any]]:
    """
    Grade the quality of a drafted reply.

    Five component scores in [0.0, 1.0] are combined with fixed weights:
    resolution 0.40, tone 0.25, completeness 0.20, length 0.10, safety 0.05.

    Args:
        reply:         The agent's drafted reply text. ``None`` is treated as
                       an empty reply.
        category:      Predicted/correct category (selects the tone profile;
                       unknown categories use the "general_inquiry" profile).
        priority:      Predicted/correct priority (selects the word-count
                       range; unknown priorities use 30–300 words).
        email_body:    Original email body (for the keyword overlap check).
                       ``None`` is treated as empty.
        email_subject: Original email subject. Accepted for interface
                       compatibility; the current scoring does not read it.

    Returns:
        (reply_score, detail_dict) where reply_score ∈ [0.0, 1.0].
        For spam the detail dict holds the five component scores plus a
        "note"; otherwise it holds the rounded component scores together with
        the word count, keyword overlap, keyword hit counts, per-element
        completeness flags and the final "reply_score".

    Note:
        Tone and safety only subtract for bad wording, so a blank or very
        short reply keeps full marks on both (0.30 of the total weight).
    """
    # Treat a missing reply/body as empty text instead of raising on None.
    reply = reply or ""
    email_body = email_body or ""

    # Spam needs no reply: staying (almost) silent — under 20 characters —
    # earns 0.8, while actually answering is marked down to 0.3.
    if category == "spam_phishing" or priority == "spam":
        skip_ok = len(reply.strip()) < 20
        score = 0.8 if skip_ok else 0.3
        return score, {
            "resolution_score": score,
            "tone_score": score,
            "completeness_score": score,
            "length_score": score,
            "safety_score": 1.0,
            "note": "spam — no reply expected",
        }

    reply_lower = reply.lower()
    word_count = len(reply.split())

    # ── 1. Resolution score (0.40) ────────────────────────────────────────────
    # Half: share of the email's 5+ character words echoed in the reply.
    # Half: share of the category's required keywords present in the reply.
    body_keywords = set(re.findall(r'\b\w{5,}\b', email_body.lower()))
    reply_keywords = set(re.findall(r'\b\w{5,}\b', reply_lower))
    overlap = body_keywords & reply_keywords
    overlap_ratio = len(overlap) / max(len(body_keywords), 1)

    profile = TONE_PROFILES.get(category, TONE_PROFILES["general_inquiry"])
    required_hits = _count_phrase_hits(profile["required"], reply_lower)
    required_ratio = required_hits / max(len(profile["required"]), 1)

    resolution_score = min(1.0, (overlap_ratio * 0.5) + (required_ratio * 0.5))

    # ── 2. Tone score (0.25) ──────────────────────────────────────────────────
    # Start from 1.0 and subtract 0.3 per forbidden keyword found.
    forbidden_hits = _count_phrase_hits(profile["forbidden"], reply_lower)
    tone_score = max(0.0, 1.0 - (forbidden_hits * 0.3))

    # Empathetic categories need at least one softening word.
    if profile["style"] == "empathetic":
        empathy_words = ["sorry", "apologize", "understand", "frustrating", "concern"]
        if _count_phrase_hits(empathy_words, reply_lower) == 0:
            tone_score *= 0.7

    # Formal categories lose 0.15 per casual word.
    if profile["style"] == "formal":
        casual_words = ["hey", "gonna", "wanna", "asap", "fyi", "lol"]
        casual_hits = _count_phrase_hits(casual_words, reply_lower)
        tone_score = max(0.0, tone_score - casual_hits * 0.15)

    # ── 3. Completeness score (0.20) ─────────────────────────────────────────
    # Fraction of structural elements (greeting, sign-off, next steps) found.
    element_scores = {
        elem: 1.0 if pattern.search(reply) else 0.0
        for elem, pattern in REQUIRED_ELEMENTS.items()
    }
    completeness_score = sum(element_scores.values()) / len(element_scores)

    # ── 4. Length score (0.10) ────────────────────────────────────────────────
    # Full marks inside [lo, hi]; linear fall-off on either side.
    lo, hi = LENGTH_RANGE.get(priority, (30, 300))
    if lo <= word_count <= hi:
        length_score = 1.0
    elif word_count < lo:
        length_score = max(0.0, word_count / lo)                # Too short
    else:
        length_score = max(0.0, 1.0 - (word_count - hi) / hi)   # Too long

    # ── 5. Safety score (0.05) ────────────────────────────────────────────────
    # Each liability-creating phrase costs 0.5, so two or more zero the score.
    unsafe_hits = _count_phrase_hits(UNSAFE_PHRASES, reply_lower)
    safety_score = max(0.0, 1.0 - unsafe_hits * 0.5)

    # ── Final weighted score ──────────────────────────────────────────────────
    reply_score = (
        resolution_score * 0.40
        + tone_score * 0.25
        + completeness_score * 0.20
        + length_score * 0.10
        + safety_score * 0.05
    )
    reply_score = round(min(1.0, max(0.0, reply_score)), 4)

    detail = {
        "resolution_score":    round(resolution_score, 3),
        "tone_score":          round(tone_score, 3),
        "completeness_score":  round(completeness_score, 3),
        "length_score":        round(length_score, 3),
        "safety_score":        round(safety_score, 3),
        "word_count":          word_count,
        "keyword_overlap":     round(overlap_ratio, 3),
        "required_kw_hits":    required_hits,
        "forbidden_kw_hits":   forbidden_hits,
        "element_scores":      element_scores,
        "reply_score":         reply_score,
    }
    return reply_score, detail