Spaces:
Sleeping
Sleeping
| """ | |
| Task definitions and graders for Customer Support Environment | |
| """ | |
| from typing import Dict, List, Tuple | |
| import random | |
| from datetime import datetime, timedelta | |
| from models import CustomerSupportAction, CustomerSupportObservation | |
class TicketTemplate:
    """Template for generating a realistic support ticket.

    Attributes:
        category: Category label the ticket belongs to.
        priority: Priority label expected for the ticket.
        correct_team: Team the ticket is expected to be routed to.
        message: Customer message text; may contain positional ``{}``
            placeholders that are filled in when a ticket is generated.
        keywords: Phrases associated with the ticket's issue.
    """

    def __init__(
        self,
        category: str,
        priority: str,
        correct_team: str,
        message: str,
        keywords: List[str],
    ):
        self.category = category
        self.priority = priority
        self.correct_team = correct_team
        self.message = message
        self.keywords = keywords

    def __repr__(self) -> str:
        # A readable repr makes template lists and failing assertions debuggable
        # instead of showing bare object addresses.
        return (
            f"{type(self).__name__}("
            f"category={self.category!r}, "
            f"priority={self.priority!r}, "
            f"correct_team={self.correct_team!r}, "
            f"message={self.message!r}, "
            f"keywords={self.keywords!r})"
        )
# Catalogue of ticket templates, three per category.
# ``{}`` placeholders in a message are filled positionally via ``str.format``
# when a ticket is generated.
TICKET_TEMPLATES = [
    # ---- Billing ----
    TicketTemplate(
        category="billing",
        priority="high",
        correct_team="billing",
        message="I was charged twice for my subscription this month. Transaction IDs: TXN-{} and TXN-{}. Please refund the duplicate charge immediately.",
        keywords=["charged twice", "duplicate", "refund", "subscription", "transaction"],
    ),
    TicketTemplate(
        category="billing",
        priority="medium",
        correct_team="billing",
        message="Can you help me understand why my invoice shows $149 instead of the $99 I was expecting? I have the promotional code SAVE50.",
        keywords=["invoice", "pricing", "promotional code", "expecting", "charge"],
    ),
    TicketTemplate(
        category="billing",
        priority="low",
        correct_team="billing",
        message="I need a copy of my invoice from last month for accounting purposes. Account number: {}.",
        keywords=["invoice", "copy", "accounting", "receipt", "statement"],
    ),
    # ---- Technical ----
    TicketTemplate(
        category="technical",
        priority="critical",
        correct_team="technical",
        message="URGENT: Our entire team cannot log in. Getting 'Error 500: Internal Server Error' on the login page. This is affecting our business operations!",
        keywords=["cannot log in", "error 500", "urgent", "down", "not working"],
    ),
    TicketTemplate(
        category="technical",
        priority="high",
        correct_team="tier2",
        message="The mobile app keeps crashing whenever I try to upload a file larger than 10MB. I've tried reinstalling but the issue persists. Using iPhone 14, iOS 17.",
        keywords=["app crashing", "upload", "bug", "error", "not working"],
    ),
    TicketTemplate(
        category="technical",
        priority="medium",
        correct_team="tier2",
        message="The search feature isn't returning results for queries longer than 3 words. Is this a known limitation or a bug?",
        keywords=["search", "not working", "feature", "bug", "issue"],
    ),
    # ---- Account ----
    TicketTemplate(
        category="account",
        priority="high",
        correct_team="tier2",
        message="I forgot my password and the reset link isn't working. I've tried three times but never receive the email. I need access urgently for a client meeting.",
        keywords=["forgot password", "reset link", "not receiving", "access", "locked out"],
    ),
    TicketTemplate(
        category="account",
        priority="medium",
        correct_team="tier1",
        message="How do I change my email address associated with my account? I can't find this option in settings.",
        keywords=["change email", "update", "account settings", "profile"],
    ),
    TicketTemplate(
        category="account",
        priority="low",
        correct_team="tier1",
        message="I want to update my profile picture and add a bio to my account. Can you guide me through the process?",
        keywords=["profile", "picture", "update", "settings", "account"],
    ),
    # ---- Shipping ----
    TicketTemplate(
        category="shipping",
        priority="high",
        correct_team="tier2",
        message="My order #{} was supposed to arrive 3 days ago but tracking shows it's still in transit. I need it by tomorrow for an event!",
        keywords=["order", "tracking", "late", "delivery", "shipping"],
    ),
    TicketTemplate(
        category="shipping",
        priority="medium",
        correct_team="tier1",
        message="I received the wrong item in my order #{}. I ordered a blue medium shirt but got a red large. What's the return process?",
        keywords=["wrong item", "order", "return", "exchange", "mistake"],
    ),
    TicketTemplate(
        category="shipping",
        priority="critical",
        correct_team="management",
        message="I received my order but the product is completely damaged! The package was crushed and the item is broken. This is unacceptable - I need a refund AND compensation.",
        keywords=["damaged", "broken", "crushed", "unacceptable", "compensation"],
    ),
    # ---- General ----
    TicketTemplate(
        category="general",
        priority="low",
        correct_team="tier1",
        message="What are your business hours? I'd like to know when I can reach someone by phone.",
        keywords=["business hours", "when", "contact", "phone", "hours"],
    ),
    TicketTemplate(
        category="general",
        priority="low",
        correct_team="tier1",
        message="I love your product! Just wanted to say thanks for the excellent service. Keep up the great work!",
        keywords=["thanks", "love", "great", "excellent", "feedback"],
    ),
    TicketTemplate(
        category="general",
        priority="medium",
        correct_team="tier1",
        message="Do you have any plans to add a dark mode feature? It would really improve the user experience.",
        keywords=["feature request", "dark mode", "suggestion", "improve", "add"],
    ),
]
def generate_ticket(
    seed: int = None, task_id: str = "easy"
) -> Tuple[CustomerSupportObservation, Dict]:
    """Generate a random ticket and its ground truth labels.

    Args:
        seed: When not ``None``, the module-level ``random`` generator is
            reseeded with this value before anything is drawn, so the chosen
            template and all random fields are reproducible. The timestamp
            is still derived from the current clock.
        task_id: Stored unchanged on the returned observation.

    Returns:
        A ``(observation, ground_truth)`` tuple. ``ground_truth`` holds the
        template's ``category``, ``priority``, ``team`` and ``keywords`` plus
        the generated ``is_premium`` flag.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    if seed is not None:
        random.seed(seed)
    template = random.choice(TICKET_TEMPLATES)

    # Generate random customer history
    is_premium = random.choice([True, False])
    account_age_days = random.randint(1, 1825)  # 1 day up to 5 years
    total_tickets = random.randint(0, 20)
    resolved_tickets = random.randint(0, min(total_tickets, 15))
    satisfaction_score = round(random.uniform(2.0, 5.0), 1)
    lifetime_value = (
        random.uniform(500, 10000) if is_premium else random.uniform(0, 500)
    )

    # Generate ticket metadata
    ticket_id = f"TKT-2025-{random.randint(100000, 999999)}"
    # The trailing "Z" declares UTC, so the clock reading must be UTC as well
    # (a naive local ``datetime.now()`` would be mislabelled). The tzinfo is
    # dropped before formatting so the output keeps the "...Z" shape instead
    # of "+00:00Z".
    created_at = datetime.now(timezone.utc) - timedelta(
        minutes=random.randint(0, 1440)
    )
    timestamp = created_at.replace(tzinfo=None).isoformat() + "Z"
    customer_id = f"CUST-{random.randint(10000, 99999)}"
    channel = random.choice(["email", "chat", "phone", "social"])

    # Fill the message placeholders with random IDs. Templates use at most
    # two positional ``{}`` fields; str.format ignores surplus arguments.
    message = template.message.format(
        random.randint(1000, 9999), random.randint(1000, 9999)
    )

    observation = CustomerSupportObservation(
        ticket_id=ticket_id,
        timestamp=timestamp,
        customer_id=customer_id,
        channel=channel,
        customer_message=message,
        account_age_days=account_age_days,
        total_tickets=total_tickets,
        resolved_tickets=resolved_tickets,
        satisfaction_score=satisfaction_score,
        is_premium=is_premium,
        lifetime_value=lifetime_value,
        previous_interactions=[],
        attachments=[],
        task_id=task_id,
        done=False,
        reward=0.0,
    )

    # Ground truth for grading
    ground_truth = {
        "category": template.category,
        "priority": template.priority,
        "team": template.correct_team,
        "keywords": template.keywords,
        "is_premium": is_premium,
    }
    return observation, ground_truth
def grade_easy(
    action: CustomerSupportAction, ground_truth: Dict, observation: CustomerSupportObservation
) -> float:
    """
    EASY TASK: Basic Ticket Classification

    Agent must correctly categorize the ticket into one of 5 categories.

    Scoring: 0.8 for the correct category, plus a 0.2 bonus when the category
    is correct and the response draft is at least 20 characters long. A wrong
    category scores 0.0 regardless of the draft.

    Args:
        action: Agent action; ``category`` and ``response_draft`` are read.
        ground_truth: Labels dict; only ``"category"`` is read.
        observation: Unused; accepted so all graders share one signature.

    Returns:
        Score clamped to ``[0.0, 1.0]``.
    """
    category_correct = 1.0 if action.category == ground_truth["category"] else 0.0

    # A missing draft (None) counts as an empty one instead of raising on len().
    draft = action.response_draft or ""

    # Bonus for appropriate response (basic check), only with a correct category
    response_length_ok = len(draft) >= 20
    response_bonus = 0.2 if (category_correct and response_length_ok) else 0.0

    total_score = category_correct * 0.8 + response_bonus
    return min(max(total_score, 0.0), 1.0)
def grade_medium(
    action: CustomerSupportAction, ground_truth: Dict, observation: CustomerSupportObservation
) -> float:
    """
    MEDIUM TASK: Priority Assignment + Team Routing

    Agent must correctly categorize, assign priority, and route to the right team.

    Weights: category 0.35, priority 0.30, team 0.25, escalation 0.10.

    Escalation is expected when the ground-truth priority is "critical", or
    when the customer is premium and the priority is "high". In every other
    case the agent is expected not to escalate. The escalation credit is
    awarded when the agent's decision matches that expectation.

    Args:
        action: Agent action; ``category``, ``priority``, ``assigned_team``
            and ``escalate`` are read.
        ground_truth: Labels dict; ``"category"``, ``"priority"``, ``"team"``
            and ``"is_premium"`` are read.
        observation: Unused; accepted so all graders share one signature.

    Returns:
        Score clamped to ``[0.0, 1.0]``.
    """
    category_correct = 1.0 if action.category == ground_truth["category"] else 0.0
    priority_correct = 1.0 if action.priority == ground_truth["priority"] else 0.0
    team_correct = 1.0 if action.assigned_team == ground_truth["team"] else 0.0

    # Check if escalation is appropriate.
    # Expressed as "decision matches expectation" so every ticket has one
    # creditable answer; previously a non-premium "high" ticket could never
    # earn the escalation credit, whichever way the agent decided.
    true_priority = ground_truth["priority"]
    should_escalate = true_priority == "critical" or (
        bool(ground_truth["is_premium"]) and true_priority == "high"
    )
    escalation_score = 1.0 if bool(action.escalate) == should_escalate else 0.0

    # Weighted scoring
    score = (
        category_correct * 0.35
        + priority_correct * 0.30
        + team_correct * 0.25
        + escalation_score * 0.10
    )
    return min(max(score, 0.0), 1.0)
def grade_hard(
    action: CustomerSupportAction, ground_truth: Dict, observation: CustomerSupportObservation
) -> float:
    """
    HARD TASK: Full Ticket Resolution

    Agent must correctly categorize, prioritize, route, AND draft a high-quality response.

    Weights: category 0.25, priority 0.20, team 0.20, response quality 0.25,
    professionalism 0.10, plus a 0.05 bonus for premium customers when at
    least two of the premium-handling indicators are present.

    Args:
        action: Agent action; ``category``, ``priority``, ``assigned_team``
            and ``response_draft`` are read.
        ground_truth: Labels dict; ``"category"``, ``"priority"``, ``"team"``,
            ``"keywords"`` and ``"is_premium"`` are read.
        observation: Unused; accepted so all graders share one signature.

    Returns:
        Score clamped to ``[0.0, 1.0]``.
    """
    category_correct = 1.0 if action.category == ground_truth["category"] else 0.0
    priority_correct = 1.0 if action.priority == ground_truth["priority"] else 0.0
    team_correct = 1.0 if action.assigned_team == ground_truth["team"] else 0.0

    # A missing draft (None) is graded as an empty response instead of raising.
    draft = action.response_draft or ""

    # Response quality evaluation (case-insensitive matching)
    response = draft.lower()
    response_score = _evaluate_response_quality(response, ground_truth["keywords"])

    # Check for professional tone indicators; each one is worth an equal share
    professional_indicators = [
        "apologize" in response or "sorry" in response,
        "help" in response or "assist" in response,
        "thank" in response or "appreciate" in response,
        len(draft) >= 50,  # Minimum reasonable length
    ]
    professionalism_score = sum(professional_indicators) / len(professional_indicators)

    # Premium customer handling
    premium_bonus = 0.0
    if ground_truth["is_premium"]:
        premium_indicators = [
            "priority" in response or "expedite" in response,
            action.priority in ["high", "critical"],
            "value" in response or "important" in response,
        ]
        premium_bonus = 0.05 if sum(premium_indicators) >= 2 else 0.0

    # Weighted scoring; the bonus can push the sum past 1.0, hence the clamp
    score = (
        category_correct * 0.25
        + priority_correct * 0.20
        + team_correct * 0.20
        + response_score * 0.25
        + professionalism_score * 0.10
        + premium_bonus
    )
    return min(max(score, 0.0), 1.0)
| def _evaluate_response_quality(response: str, keywords: List[str]) -> float: | |
| """Evaluate the quality of the response draft""" | |
| if len(response) < 30: | |
| return 0.0 | |
| # Check if response addresses key aspects of the issue | |
| keyword_matches = sum(1 for kw in keywords if kw.lower() in response) | |
| keyword_score = min(keyword_matches / max(len(keywords), 1), 1.0) | |
| # Check for generic bad responses | |
| bad_indicators = [ | |
| response.count(".") == 0, # No punctuation | |
| "lorem ipsum" in response, | |
| len(response.split()) < 10, # Too short | |
| ] | |
| if any(bad_indicators): | |
| return keyword_score * 0.3 | |
| return keyword_score | |
# Registry mapping each task ID to the function that grades it
GRADERS = {
    "easy": grade_easy,
    "medium": grade_medium,
    "hard": grade_hard,
}
def get_grader(task_id: str):
    """Return the grader function registered for ``task_id``.

    Raises:
        ValueError: If ``task_id`` is not a key of ``GRADERS``; the message
            lists the available task IDs.
    """
    grader = GRADERS.get(task_id)
    if grader is not None:
        return grader
    raise ValueError(
        f"Unknown task ID: {task_id}. Available: {list(GRADERS.keys())}"
    )