"""Task definitions and graders for Customer Support Environment."""

from typing import Callable, Dict, List, Optional, Tuple
import random
from datetime import datetime, timedelta, timezone

from models import CustomerSupportAction, CustomerSupportObservation


class TicketTemplate:
    """Template for generating a realistic support ticket.

    Attributes:
        category: Ground-truth ticket category (e.g. ``"billing"``).
        priority: Ground-truth priority (``"low"``, ``"medium"``, ``"high"``
            or ``"critical"`` in the templates defined below).
        correct_team: Team the ticket should be routed to.
        message: Customer message; may contain positional ``{}`` placeholders
            that ``generate_ticket`` fills with random IDs.
        keywords: Phrases the hard grader looks for in a response draft.
    """

    def __init__(
        self,
        category: str,
        priority: str,
        correct_team: str,
        message: str,
        keywords: List[str],
    ):
        self.category = category
        self.priority = priority
        self.correct_team = correct_team
        self.message = message
        self.keywords = keywords


# Ticket templates for each category
TICKET_TEMPLATES = [
    # BILLING tickets
    TicketTemplate(
        "billing",
        "high",
        "billing",
        "I was charged twice for my subscription this month. Transaction IDs: TXN-{} and TXN-{}. Please refund the duplicate charge immediately.",
        ["charged twice", "duplicate", "refund", "subscription", "transaction"],
    ),
    TicketTemplate(
        "billing",
        "medium",
        "billing",
        "Can you help me understand why my invoice shows $149 instead of the $99 I was expecting? I have the promotional code SAVE50.",
        ["invoice", "pricing", "promotional code", "expecting", "charge"],
    ),
    TicketTemplate(
        "billing",
        "low",
        "billing",
        "I need a copy of my invoice from last month for accounting purposes. Account number: {}.",
        ["invoice", "copy", "accounting", "receipt", "statement"],
    ),
    # TECHNICAL tickets
    TicketTemplate(
        "technical",
        "critical",
        "technical",
        "URGENT: Our entire team cannot log in. Getting 'Error 500: Internal Server Error' on the login page. This is affecting our business operations!",
        ["cannot log in", "error 500", "urgent", "down", "not working"],
    ),
    TicketTemplate(
        "technical",
        "high",
        "tier2",
        "The mobile app keeps crashing whenever I try to upload a file larger than 10MB. I've tried reinstalling but the issue persists. Using iPhone 14, iOS 17.",
        ["app crashing", "upload", "bug", "error", "not working"],
    ),
    TicketTemplate(
        "technical",
        "medium",
        "tier2",
        "The search feature isn't returning results for queries longer than 3 words. Is this a known limitation or a bug?",
        ["search", "not working", "feature", "bug", "issue"],
    ),
    # ACCOUNT tickets
    TicketTemplate(
        "account",
        "high",
        "tier2",
        "I forgot my password and the reset link isn't working. I've tried three times but never receive the email. I need access urgently for a client meeting.",
        ["forgot password", "reset link", "not receiving", "access", "locked out"],
    ),
    TicketTemplate(
        "account",
        "medium",
        "tier1",
        "How do I change my email address associated with my account? I can't find this option in settings.",
        ["change email", "update", "account settings", "profile"],
    ),
    TicketTemplate(
        "account",
        "low",
        "tier1",
        "I want to update my profile picture and add a bio to my account. Can you guide me through the process?",
        ["profile", "picture", "update", "settings", "account"],
    ),
    # SHIPPING tickets
    TicketTemplate(
        "shipping",
        "high",
        "tier2",
        "My order #{} was supposed to arrive 3 days ago but tracking shows it's still in transit. I need it by tomorrow for an event!",
        ["order", "tracking", "late", "delivery", "shipping"],
    ),
    TicketTemplate(
        "shipping",
        "medium",
        "tier1",
        "I received the wrong item in my order #{}. I ordered a blue medium shirt but got a red large. What's the return process?",
        ["wrong item", "order", "return", "exchange", "mistake"],
    ),
    TicketTemplate(
        "shipping",
        "critical",
        "management",
        "I received my order but the product is completely damaged! The package was crushed and the item is broken. This is unacceptable - I need a refund AND compensation.",
        ["damaged", "broken", "crushed", "unacceptable", "compensation"],
    ),
    # GENERAL tickets
    TicketTemplate(
        "general",
        "low",
        "tier1",
        "What are your business hours? I'd like to know when I can reach someone by phone.",
        ["business hours", "when", "contact", "phone", "hours"],
    ),
    TicketTemplate(
        "general",
        "low",
        "tier1",
        "I love your product! Just wanted to say thanks for the excellent service. Keep up the great work!",
        ["thanks", "love", "great", "excellent", "feedback"],
    ),
    TicketTemplate(
        "general",
        "medium",
        "tier1",
        "Do you have any plans to add a dark mode feature? It would really improve the user experience.",
        ["feature request", "dark mode", "suggestion", "improve", "add"],
    ),
]


def generate_ticket(
    seed: Optional[int] = None, task_id: str = "easy"
) -> Tuple[CustomerSupportObservation, Dict]:
    """Generate a random ticket and its ground truth labels.

    Args:
        seed: When not ``None``, the module-level ``random`` generator is
            re-seeded with this value so every randomly drawn field is
            reproducible. The timestamp still depends on the current time.
        task_id: Task identifier copied onto the observation.

    Returns:
        A ``(observation, ground_truth)`` pair. ``ground_truth`` holds the
        keys ``"category"``, ``"priority"``, ``"team"``, ``"keywords"`` and
        ``"is_premium"`` consumed by the graders in this module.
    """
    if seed is not None:
        random.seed(seed)

    # NOTE: the order of the random draws below determines what a given seed
    # produces; keep it stable.
    template = random.choice(TICKET_TEMPLATES)

    # Generate random customer history
    is_premium = random.choice([True, False])
    account_age_days = random.randint(1, 1825)  # 1 day up to ~5 years
    total_tickets = random.randint(0, 20)
    resolved_tickets = random.randint(0, min(total_tickets, 15))
    satisfaction_score = round(random.uniform(2.0, 5.0), 1)
    lifetime_value = (
        random.uniform(500, 10000) if is_premium else random.uniform(0, 500)
    )

    # Generate ticket metadata
    ticket_id = f"TKT-2025-{random.randint(100000, 999999)}"
    # Use an aware UTC clock: the "Z" suffix asserts UTC, so the value must
    # not be derived from naive local time.
    created_at = datetime.now(timezone.utc) - timedelta(
        minutes=random.randint(0, 1440)
    )
    timestamp = created_at.isoformat().replace("+00:00", "Z")
    customer_id = f"CUST-{random.randint(10000, 99999)}"
    channel = random.choice(["email", "chat", "phone", "social"])

    # Format message with random IDs. Two IDs are always drawn; templates
    # with fewer placeholders simply ignore the surplus arguments.
    message = template.message.format(
        random.randint(1000, 9999), random.randint(1000, 9999)
    )

    observation = CustomerSupportObservation(
        ticket_id=ticket_id,
        timestamp=timestamp,
        customer_id=customer_id,
        channel=channel,
        customer_message=message,
        account_age_days=account_age_days,
        total_tickets=total_tickets,
        resolved_tickets=resolved_tickets,
        satisfaction_score=satisfaction_score,
        is_premium=is_premium,
        lifetime_value=lifetime_value,
        previous_interactions=[],
        attachments=[],
        task_id=task_id,
        done=False,
        reward=0.0,
    )

    # Ground truth for grading
    ground_truth = {
        "category": template.category,
        "priority": template.priority,
        "team": template.correct_team,
        "keywords": template.keywords,
        "is_premium": is_premium,
    }

    return observation, ground_truth


def _draft_text(action: CustomerSupportAction) -> str:
    """Return the action's response draft, treating a missing draft as empty."""
    return action.response_draft or ""


def _escalation_expected(ground_truth: Dict) -> bool:
    """Return whether the ticket described by *ground_truth* should be escalated.

    Critical tickets always escalate; high-priority tickets escalate only for
    premium customers; everything else should not be escalated.
    """
    priority = ground_truth["priority"]
    if priority == "critical":
        return True
    return bool(ground_truth["is_premium"]) and priority == "high"


def grade_easy(
    action: CustomerSupportAction,
    ground_truth: Dict,
    observation: CustomerSupportObservation,
) -> float:
    """
    EASY TASK: Basic Ticket Classification

    Agent must correctly categorize the ticket into one of 5 categories.

    A correct category is worth 0.8; a further 0.2 is granted when the
    category is correct and the response draft is at least 20 characters.
    ``observation`` is accepted for a uniform grader signature and not used.

    Returns:
        Score clamped to ``[0.0, 1.0]``.
    """
    category_correct = 1.0 if action.category == ground_truth["category"] else 0.0

    # Bonus for appropriate response (basic check); only paid out when the
    # classification itself is right.
    response_length_ok = len(_draft_text(action)) >= 20
    response_bonus = 0.2 if (category_correct and response_length_ok) else 0.0

    total_score = category_correct * 0.8 + response_bonus
    return min(max(total_score, 0.0), 1.0)


def grade_medium(
    action: CustomerSupportAction,
    ground_truth: Dict,
    observation: CustomerSupportObservation,
) -> float:
    """
    MEDIUM TASK: Priority Assignment + Team Routing

    Agent must correctly categorize, assign priority, and route to the
    right team.

    Weights: category 0.35, priority 0.30, team 0.25, escalation 0.10.
    The escalation component is earned when the agent's ``escalate`` flag
    matches the expected decision (see ``_escalation_expected``), so every
    ticket has an action that earns it - including non-premium
    high-priority tickets, which should not be escalated.
    ``observation`` is accepted for a uniform grader signature and not used.

    Returns:
        Score clamped to ``[0.0, 1.0]``.
    """
    category_correct = 1.0 if action.category == ground_truth["category"] else 0.0
    priority_correct = 1.0 if action.priority == ground_truth["priority"] else 0.0
    team_correct = 1.0 if action.assigned_team == ground_truth["team"] else 0.0

    # Check if escalation is appropriate
    appropriate_escalation = bool(action.escalate) == _escalation_expected(
        ground_truth
    )
    escalation_score = 1.0 if appropriate_escalation else 0.0

    # Weighted scoring
    score = (
        category_correct * 0.35
        + priority_correct * 0.30
        + team_correct * 0.25
        + escalation_score * 0.10
    )
    return min(max(score, 0.0), 1.0)


def grade_hard(
    action: CustomerSupportAction,
    ground_truth: Dict,
    observation: CustomerSupportObservation,
) -> float:
    """
    HARD TASK: Full Ticket Resolution

    Agent must correctly categorize, prioritize, route, AND draft a
    high-quality response.

    Weights: category 0.25, priority 0.20, team 0.20, response quality 0.25,
    professionalism 0.10, plus a 0.05 bonus for premium customers when at
    least two premium-handling indicators are present. The total is clamped,
    so the bonus cannot push the score above 1.0.
    ``observation`` is accepted for a uniform grader signature and not used.

    Returns:
        Score clamped to ``[0.0, 1.0]``.
    """
    category_correct = 1.0 if action.category == ground_truth["category"] else 0.0
    priority_correct = 1.0 if action.priority == ground_truth["priority"] else 0.0
    team_correct = 1.0 if action.assigned_team == ground_truth["team"] else 0.0

    # Response quality evaluation (case-insensitive)
    draft = _draft_text(action)
    response = draft.lower()
    response_score = _evaluate_response_quality(response, ground_truth["keywords"])

    # Check for professional tone indicators
    professional_indicators = [
        "apologize" in response or "sorry" in response,
        "help" in response or "assist" in response,
        "thank" in response or "appreciate" in response,
        len(draft) >= 50,  # Minimum reasonable length
    ]
    professionalism_score = sum(professional_indicators) / len(professional_indicators)

    # Premium customer handling
    premium_bonus = 0.0
    if ground_truth["is_premium"]:
        premium_indicators = [
            "priority" in response or "expedite" in response,
            action.priority in ["high", "critical"],
            "value" in response or "important" in response,
        ]
        premium_bonus = 0.05 if sum(premium_indicators) >= 2 else 0.0

    # Weighted scoring
    score = (
        category_correct * 0.25
        + priority_correct * 0.20
        + team_correct * 0.20
        + response_score * 0.25
        + professionalism_score * 0.10
        + premium_bonus
    )
    return min(max(score, 0.0), 1.0)


def _evaluate_response_quality(response: str, keywords: List[str]) -> float:
    """Evaluate the quality of the response draft.

    Args:
        response: Response text; ``grade_hard`` passes it already lowercased,
            and keywords are lowercased before the substring check.
        keywords: Phrases expected to appear in a good response.

    Returns:
        ``0.0`` for responses shorter than 30 characters; otherwise the
        fraction of keywords found in the response, scaled by 0.3 when the
        response looks low-effort (no period, contains "lorem ipsum", or has
        fewer than 10 words).
    """
    if len(response) < 30:
        return 0.0

    # Check if response addresses key aspects of the issue
    keyword_matches = sum(1 for kw in keywords if kw.lower() in response)
    # max(..., 1) guards against division by zero for an empty keyword list.
    keyword_score = min(keyword_matches / max(len(keywords), 1), 1.0)

    # Check for generic bad responses
    bad_indicators = [
        response.count(".") == 0,  # No punctuation
        "lorem ipsum" in response,
        len(response.split()) < 10,  # Too short
    ]
    if any(bad_indicators):
        return keyword_score * 0.3

    return keyword_score


# Grader function mapping
GRADERS = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}


def get_grader(task_id: str) -> Callable[..., float]:
    """Get the grader function for a task.

    Args:
        task_id: One of the keys of ``GRADERS``.

    Returns:
        The grader callable registered for ``task_id``.

    Raises:
        ValueError: If ``task_id`` is not a key of ``GRADERS``.
    """
    if task_id not in GRADERS:
        raise ValueError(
            f"Unknown task ID: {task_id}. Available: {list(GRADERS.keys())}"
        )
    return GRADERS[task_id]