Spaces:

Vansh04092003
/

multi-agent-email-env

Runtime error

App Files Files Community

multi-agent-email-env / reply_grader.py

Vansh04092003

Upload folder using huggingface_hub

4bd8a10 verified about 2 months ago

raw

history blame contribute delete

8.87 kB

	"""
	reply_grader.py — Grades the quality of a drafted email reply.

	This is the key Round 2 addition. Instead of only classifying emails,
	the agent must also DRAFT A REPLY. This is much harder for LLMs and
	far more useful for training — it tests tone, resolution, professionalism,
	and context understanding simultaneously.

	Score breakdown (all 0.0–1.0, weighted to produce final reply_score):
	- resolution_score (0.40): Does the reply actually address the issue?
	- tone_score (0.25): Is the tone appropriate for the category/priority?
	- completeness_score (0.20): Does it include required elements (greeting, sign-off, next steps)?
	- length_score (0.10): Is it an appropriate length (not too short, not a wall of text)?
	- safety_score (0.05): No promises the company can't keep, no admissions of liability?
	"""
	from __future__ import annotations

	import re
	from typing import Any, Dict, Optional, Tuple

	# ── Tone profiles per category ────────────────────────────────────────────────
	# One entry per email category. Each profile carries:
	#   required  – keywords a good reply is expected to mention
	#   forbidden – keywords/phrases that lower the tone score when present
	#   style     – label selecting the extra tone rules applied by the grader

	TONE_PROFILES: Dict[str, Dict[str, Any]] = {
	    # No reply needed — agent should skip/trash
	    "spam_phishing": dict(
	        required=[],
	        forbidden=["congratulations", "prize", "winner", "claim", "urgent action"],
	        style="none",
	    ),
	    "customer_complaint": dict(
	        required=["apologize", "sorry", "understand", "resolve", "look into"],
	        forbidden=["your fault", "you should have", "not our problem", "impossible"],
	        style="empathetic",
	    ),
	    "billing_inquiry": dict(
	        required=["account", "payment", "invoice", "amount", "charge"],
	        forbidden=["ignore", "not our fault"],
	        style="professional",
	    ),
	    "technical_support": dict(
	        required=["issue", "problem", "help", "team", "investigate"],
	        forbidden=["cannot help", "not possible"],
	        style="helpful",
	    ),
	    "legal_compliance": dict(
	        required=["received", "review", "team", "matter", "respond"],
	        forbidden=["admit", "liable", "fault", "guilty", "settlement"],
	        style="formal",
	    ),
	    "internal_hr": dict(
	        required=["team", "hr", "discuss", "meeting"],
	        forbidden=[],
	        style="professional",
	    ),
	    "sales_lead": dict(
	        required=["thank", "interest", "team", "discuss", "schedule"],
	        forbidden=[],
	        style="enthusiastic",
	    ),
	    "general_inquiry": dict(
	        required=["thank", "help", "answer", "question"],
	        forbidden=[],
	        style="friendly",
	    ),
	}

	# Structural elements every professional reply should have.
	# Each pattern is a case-insensitive, word-bounded alternation; a reply earns
	# credit for an element when the pattern matches anywhere in its text.
	# NOTE: the alternatives must be separated by a bare `|`. An escaped `\|`
	# is a literal pipe character in Python regex and would never match a
	# normal reply.
	REQUIRED_ELEMENTS: Dict[str, "re.Pattern[str]"] = {
	    "greeting": re.compile(
	        r"\b(hi|hello|dear|good morning|good afternoon|thank you for)\b",
	        re.I,
	    ),
	    "sign_off": re.compile(
	        r"\b(regards|sincerely|best|thanks|thank you|warm regards)\b",
	        re.I,
	    ),
	    "next_steps": re.compile(
	        r"\b(will|shall|team|we'll|contact|reach out|follow up|review"
	        r"|look into|investigate|schedule|call)\b",
	        re.I,
	    ),
	}

	# Safety: phrases that create legal/financial liability.
	# Assembled from three themed groups; the combined order is what the grader
	# iterates over when counting unsafe hits.
	UNSAFE_PHRASES = (
	    # Unconditional promises
	    ["we guarantee", "we promise", "you will receive"]
	    # Admissions of responsibility
	    + ["we admit", "our fault", "we are liable"]
	    # Financial commitments
	    + ["full refund", "immediately refund", "we will pay", "no questions asked"]
	)

	# Ideal reply length per priority, as an inclusive (min_words, max_words) pair.
	LENGTH_RANGE: Dict[str, Tuple[int, int]] = dict(
	    urgent=(50, 200),
	    high=(40, 250),
	    medium=(30, 300),
	    low=(20, 200),
	    spam=(0, 30),  # ideally no reply or very short rejection
	)


	def _count_term_hits(text: str, terms, whole_word: bool = False) -> int:
	    """Count how many of *terms* occur in *text* starting at a word boundary.

	    A plain substring test produces false positives ("hey" inside "they",
	    "liable" inside "reliable", "our fault" inside "your fault"). Anchoring
	    the start of each term with ``\\b`` avoids those while still matching
	    inflected forms ("apologize" -> "apologized"). With ``whole_word=True``
	    the end of the term is anchored as well.

	    Args:
	        text: Lower-cased text to search.
	        terms: Iterable of lower-case words or phrases.
	        whole_word: Also require a word boundary after the term.

	    Returns:
	        Number of distinct terms found (each term counts at most once).
	    """
	    tail = r"\b" if whole_word else ""
	    return sum(
	        1 for term in terms if re.search(r"\b" + re.escape(term) + tail, text)
	    )


	def grade_reply(
	    reply: Optional[str],
	    category: str,
	    priority: str,
	    email_body: Optional[str],
	    email_subject: Optional[str],
	) -> Tuple[float, Dict[str, Any]]:
	    """
	    Grade the quality of a drafted reply.

	    The final score is a weighted sum of five sub-scores:
	    resolution (0.40), tone (0.25), completeness (0.20), length (0.10)
	    and safety (0.05), clamped to [0.0, 1.0] and rounded to 4 decimals.

	    Args:
	        reply: The agent's drafted reply text. ``None`` is treated as "".
	        category: Predicted/correct category (selects the tone profile;
	            unknown categories fall back to "general_inquiry").
	        priority: Predicted/correct priority (selects the word-count range;
	            unknown priorities fall back to 30-300 words).
	        email_body: Original email body (for keyword overlap check).
	            ``None`` is treated as "".
	        email_subject: Original email subject. Accepted for interface
	            compatibility; it is not used by any sub-score.

	    Returns:
	        (reply_score, detail_dict) where reply_score ∈ [0.0, 1.0]. The detail
	        dict always contains the five sub-scores and "reply_score".
	    """
	    # Agents that skip the reply may hand over None; grade it as empty text
	    # instead of crashing on .strip()/.lower().
	    reply = reply or ""
	    email_body = email_body or ""

	    # Spam: no reply is expected. Skipping (or a near-empty reply) earns 0.8,
	    # actually replying to spam earns only 0.3.
	    if category == "spam_phishing" or priority == "spam":
	        skip_ok = len(reply.strip()) < 20  # Agent chose not to reply = good
	        score = 0.8 if skip_ok else 0.3  # Penalise if they replied to spam
	        return score, {
	            "resolution_score": score,
	            "tone_score": score,
	            "completeness_score": score,
	            "length_score": score,
	            "safety_score": 1.0,
	            "note": "spam — no reply expected",
	            # Same key the normal path provides, so callers can read it
	            # without special-casing spam.
	            "reply_score": score,
	        }

	    reply_lower = reply.lower()
	    word_count = len(reply.split())

	    # ── 1. Resolution score (0.40) ────────────────────────────────────────────
	    # Half: share of the email's 5+ letter words echoed in the reply.
	    # Half: share of the category's required keywords present in the reply.
	    body_keywords = set(re.findall(r'\b\w{5,}\b', email_body.lower()))
	    reply_keywords = set(re.findall(r'\b\w{5,}\b', reply_lower))
	    overlap = body_keywords & reply_keywords
	    overlap_ratio = len(overlap) / max(len(body_keywords), 1)

	    profile = TONE_PROFILES.get(category, TONE_PROFILES["general_inquiry"])
	    required_hits = _count_term_hits(reply_lower, profile["required"])
	    required_ratio = required_hits / max(len(profile["required"]), 1)

	    resolution_score = min(1.0, (overlap_ratio * 0.5) + (required_ratio * 0.5))

	    # ── 2. Tone score (0.25) ──────────────────────────────────────────────────
	    # Each forbidden keyword found costs 0.3.
	    forbidden_hits = _count_term_hits(reply_lower, profile["forbidden"])
	    tone_score = max(0.0, 1.0 - (forbidden_hits * 0.3))

	    # Empathetic categories need softening words
	    if profile["style"] == "empathetic":
	        empathy_words = ["sorry", "apologize", "understand", "frustrating", "concern"]
	        if _count_term_hits(reply_lower, empathy_words) == 0:
	            tone_score *= 0.7

	    # Formal categories penalise casual language (whole words only, so that
	    # "they" is not mistaken for "hey").
	    if profile["style"] == "formal":
	        casual_words = ["hey", "gonna", "wanna", "asap", "fyi", "lol"]
	        casual_hits = _count_term_hits(reply_lower, casual_words, whole_word=True)
	        tone_score = max(0.0, tone_score - casual_hits * 0.15)

	    # ── 3. Completeness score (0.20) ─────────────────────────────────────────
	    # Fraction of structural elements whose pattern matches the reply.
	    element_scores = {
	        elem: 1.0 if pattern.search(reply) else 0.0
	        for elem, pattern in REQUIRED_ELEMENTS.items()
	    }
	    completeness_score = sum(element_scores.values()) / max(len(element_scores), 1)

	    # ── 4. Length score (0.10) ────────────────────────────────────────────────
	    lo, hi = LENGTH_RANGE.get(priority, (30, 300))
	    if lo <= word_count <= hi:
	        length_score = 1.0
	    elif word_count < lo:
	        length_score = max(0.0, word_count / lo)  # Too short
	    else:
	        length_score = max(0.0, 1.0 - (word_count - hi) / hi)  # Too long

	    # ── 5. Safety score (0.05) ────────────────────────────────────────────────
	    # Each unsafe phrase found costs 0.5.
	    unsafe_hits = _count_term_hits(reply_lower, UNSAFE_PHRASES)
	    safety_score = max(0.0, 1.0 - unsafe_hits * 0.5)

	    # ── Final weighted score ──────────────────────────────────────────────────
	    reply_score = (
	        resolution_score * 0.40
	        + tone_score * 0.25
	        + completeness_score * 0.20
	        + length_score * 0.10
	        + safety_score * 0.05
	    )
	    reply_score = round(min(1.0, max(0.0, reply_score)), 4)

	    detail = {
	        "resolution_score": round(resolution_score, 3),
	        "tone_score": round(tone_score, 3),
	        "completeness_score": round(completeness_score, 3),
	        "length_score": round(length_score, 3),
	        "safety_score": round(safety_score, 3),
	        "word_count": word_count,
	        "keyword_overlap": round(overlap_ratio, 3),
	        "required_kw_hits": required_hits,
	        "forbidden_kw_hits": forbidden_hits,
	        "element_scores": element_scores,
	        "reply_score": reply_score,
	    }
	    return reply_score, detail