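# NOTE(review): the next three lines appear to be page-status text captured
# together with the file; kept as comments so the module can be parsed.
# Spaces:
# Sleeping
# Sleeping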
from __future__ import annotations

import random
from datetime import datetime, timedelta

from environment.types import Email, GroundTruth, EmailCategory, Team
class DataGenerator:
    """Generates synthetic email datasets for different tasks.

    Three fixed datasets are produced (easy / medium / hard).  Subjects,
    bodies and labels are hard-coded or derived from the keyword heuristics
    below; only the timestamps of tasks 1 and 2 use the random generator.
    """

    # Phrases matched case-insensitively as substrings of "subject body".
    SPAM_PATTERNS = [
        "Click here now!", "LIMITED TIME OFFER", "Act NOW!!!",
        "Free money", "You've won!", "Congratulations",
        "Verify your account", "Confirm identity", "Update payment",
        "urgent action required", "verify credentials",
    ]

    # Lower-case keywords matched as substrings of the lower-cased text.
    URGENCY_KEYWORDS = [
        "urgent", "asap", "critical", "downtime", "affected",
        "production issue", "customer complaint", "emergency",
    ]

    def __init__(self, seed: int = 42):
        """Seed the random generator used for email timestamps.

        Args:
            seed: Value passed to ``random.seed``.
        """
        # NOTE: this seeds the module-level ``random`` generator, so it also
        # affects every other user of ``random`` in the process.
        random.seed(seed)

    def _is_spam(self, subject: str, body: str) -> bool:
        """Return True when at least two SPAM_PATTERNS occur in the email.

        Args:
            subject: Email subject line.
            body: Email body text.
        """
        text = f"{subject} {body}".lower()
        hits = sum(1 for pattern in self.SPAM_PATTERNS if pattern.lower() in text)
        return hits >= 2

    def _is_urgent(self, subject: str, body: str, sla_hours: int | None = None) -> bool:
        """Return True when the email should be treated as urgent.

        An email is urgent when its text contains any URGENCY_KEYWORDS entry
        or when it carries an SLA of four hours or less.

        Args:
            subject: Email subject line.
            body: Email body text.
            sla_hours: SLA in hours, or None when the email has no SLA.
        """
        text = f"{subject} {body}".lower()
        if any(keyword in text for keyword in self.URGENCY_KEYWORDS):
            return True
        # Compare against None explicitly: a truthiness test would treat a
        # zero-hour SLA as "no SLA" and would leak None/0 out of a function
        # that is declared to return bool.
        return sla_hours is not None and sla_hours <= 4

    def _get_category(self, subject: str, body: str, sla_hours: int | None = None) -> EmailCategory:
        """Classify an email; spam wins over urgent, urgent over billing.

        Args:
            subject: Email subject line.
            body: Email body text.
            sla_hours: SLA in hours, or None when the email has no SLA.
        """
        if self._is_spam(subject, body):
            return EmailCategory.SPAM
        if self._is_urgent(subject, body, sla_hours):
            return EmailCategory.URGENT
        # Billing is detected from the subject only; the body is not inspected.
        subject_lower = subject.lower()
        if "billing" in subject_lower or "invoice" in subject_lower:
            return EmailCategory.BILLING
        return EmailCategory.NORMAL

    def _get_team(self, category: EmailCategory, subject: str) -> Team:
        """Pick the team an email is routed to.

        Args:
            category: Category already assigned to the email.
            subject: Email subject line (checked for routing keywords).
        """
        subject_lower = subject.lower()
        if category == EmailCategory.SPAM:
            return Team.NONE
        if category == EmailCategory.BILLING or "billing" in subject_lower:
            return Team.BILLING
        if category == EmailCategory.URGENT:
            # Urgent mail goes to support unless the subject mentions sales;
            # "order" alone does not redirect urgent mail.
            return Team.SALES if "sales" in subject_lower else Team.SUPPORT
        if "sales" in subject_lower or "order" in subject_lower:
            return Team.SALES
        return Team.SUPPORT

    def _get_priority(self, category: EmailCategory, sla_hours: int | None = None) -> int:
        """Return the priority level (0-3) for a category.

        Spam is 0; urgent mail is 3 with an SLA of two hours or less and 2
        otherwise; every other category is 1.

        Args:
            category: Category already assigned to the email.
            sla_hours: SLA in hours, or None when the email has no SLA.
        """
        if category == EmailCategory.SPAM:
            return 0
        if category == EmailCategory.URGENT:
            # Explicit None check so that a zero-hour SLA counts as tight.
            tight_sla = sla_hours is not None and sla_hours <= 2
            return 3 if tight_sla else 2
        return 1

    def generate_task1_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 10 simple spam/not-spam emails (EASY).

        Labels are hand-assigned per record; the keyword heuristics are not
        used here.

        Returns:
            A pair ``(emails, truths)`` of equal length and matching order.
        """
        # One record per email keeps subject, body and label together so the
        # three cannot drift out of alignment.
        records = [
            ("Click here for FREE MONEY now!!!",
             "Click the link to claim your prize! This offer expires in 1 hour!", True),
            ("Verify your PayPal account immediately",
             "We detected unusual login attempts. Verify now: [link]", True),
            ("CONGRATS You've Won $1,000,000",
             "You are a lucky winner! Click to collect your prize!!!", True),
            ("Your AWS account has unusual activity",
             "We noticed some unusual activity on your account. Please review.", False),
            ("Team standup at 10am today",
             "Agenda: Q2 planning, budget review, timeline discussion", False),
            ("Weekly status report submission",
             "Completed: API optimization, 3 new features, 2 bugs fixed", False),
            ("Meeting notes from yesterday",
             "Here are the key points from our 10am sync yesterday.", False),
            ("Can we sync up tomorrow?",
             "Let's discuss the new design for the dashboard", False),
            ("LIMITED TIME: 50% OFF EVERYTHING",
             "SALE: All summer items 50% off! Shop now before supplies run out!", True),
            ("Password reset request - URGENT",
             "Someone requested to reset your password. If this wasn't you, ignore this email.", False),
        ]

        # Single reference instant so all offsets are relative to the same time.
        now = datetime.now()
        emails = []
        truths = []
        for i, (subject, body, is_spam) in enumerate(records):
            email_id = f"task1_{i}"
            emails.append(Email(
                email_id=email_id,
                subject=subject,
                body=body,
                sender_domain="promo.com" if is_spam else "company.com",
                timestamp=now - timedelta(hours=random.randint(1, 24)),
                is_vip_sender=False,
                sla_hours=None,
            ))
            truths.append(GroundTruth(
                email_id=email_id,
                category=EmailCategory.SPAM if is_spam else EmailCategory.NORMAL,
                team=Team.NONE if is_spam else Team.SUPPORT,
                priority=0 if is_spam else 1,
            ))
        return emails, truths

    def generate_task2_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 12 multi-class routing emails (MEDIUM).

        Ground truth is derived from ``_get_category``, ``_get_team`` and
        ``_get_priority`` rather than being hand-assigned.

        Returns:
            A pair ``(emails, truths)`` of equal length and matching order.
        """
        # Each template is (subject, body, sla_hours, is_vip).
        # NOTE(review): templates 2 ("Free Trial Offer ...") and 10 ("CLICK TO
        # CLAIM PRIZE ...") read like promotions but match no SPAM_PATTERNS
        # entry, so the heuristics label them NORMAL - confirm this is intended.
        templates = [
            ("URGENT: Production database down!!", "Our main database is offline. All services affected. This is critical.", 4, True),
            ("Invoice for March 2024", "Please find attached your invoice. Payment due by April 10.", None, False),
            ("Free Trial Offer - 30 Days!", "Get our premium service FREE for 30 days. Click NOW!!!", None, False),
            ("Customer complaint - Order #12345", "Customer reports missing items. Needs urgent resolution.", 2, True),
            ("Team meeting at 2pm", "Just a reminder about our sync at 2pm today in the main conference room.", None, False),
            ("Billing issue - Duplicate charge", "Customer reports being charged twice. Need help resolving.", 6, False),
            ("Sales inquiry: Enterprise plan", "Interest in your enterprise solution. Can we talk pricing?", None, False),
            ("System alert: High memory usage", "Memory utilization at 95%. Recommend immediate investigation.", 1, True),
            ("Password reset link", "You requested a password reset. Click the link below.", None, False),
            ("Feature request from VIP customer", "Our top customer requesting new analytics dashboard.", 8, False),
            ("CLICK TO CLAIM PRIZE NOW!!!!", "You've been selected as today's big winner! Claim prize NOW!", None, False),
            ("Meeting transcript from standup", "Here are the notes from this morning's standup meeting.", None, False),
        ]

        # Single reference instant so all offsets are relative to the same time.
        now = datetime.now()
        emails = []
        truths = []
        for i, (subject, body, sla_hours, is_vip) in enumerate(templates):
            email_id = f"task2_{i}"
            emails.append(Email(
                email_id=email_id,
                subject=subject,
                body=body,
                sender_domain="customer.com" if is_vip else "internal.com",
                timestamp=now - timedelta(hours=random.randint(1, 12)),
                is_vip_sender=is_vip,
                sla_hours=sla_hours,
            ))
            category = self._get_category(subject, body, sla_hours)
            truths.append(GroundTruth(
                email_id=email_id,
                category=category,
                team=self._get_team(category, subject),
                priority=self._get_priority(category, sla_hours),
            ))
        return emails, truths

    def generate_task3_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 20 context-aware emails with escalation (HARD).

        The dataset is five fixed groups (VIP outages, support tickets,
        billing inquiries, sales leads, spam) with hand-assigned labels; ids
        run from ``task3_0`` to ``task3_19`` in that group order.

        Returns:
            A pair ``(emails, truths)`` of equal length and matching order.
        """
        # Each group is (count, subject, body, sender_domain, hours_step,
        # is_vip, sla_hours, category, team, priority).  In subject, "{n}" is
        # the 1-based position within the group; in sender_domain, "{i}" is
        # the 0-based position.  Email number i of a group is timestamped
        # i * hours_step hours before the reference time.
        groups = [
            # VIP customer issues (high priority)
            (3, "VIP Customer Issue #{n}: Service outage",
             "Our VIP enterprise customer reporting service unavailability. Revenue impact potential. Immediate escalation required.",
             "vip_customer.com", 0, True, 1,
             EmailCategory.URGENT, Team.SUPPORT, 3),
            # Standard support cases
            (5, "Support ticket #{n}",
             "Customer issue regarding feature X. Needs resolution within 24 hours.",
             "support.company.com", 2, False, 24,
             EmailCategory.NORMAL, Team.SUPPORT, 1),
            # Billing issues
            (4, "Billing inquiry #{n}",
             "Customer question about invoice or billing. Standard resolution.",
             "billing.com", 3, False, None,
             EmailCategory.BILLING, Team.BILLING, 1),
            # Sales leads
            (3, "Sales inquiry #{n}: Enterprise interest",
             "New company interested in our enterprise solution. High-value potential lead.",
             "company{i}.com", 4, False, None,
             EmailCategory.NORMAL, Team.SALES, 2),
            # Spam emails
            (5, "CLICK HERE NOW !!! Get FREE stuff!!!",
             "Limited time offer expires in 1 hour. Click the link to claim your prize!",
             "spam{i}.com", 5, False, None,
             EmailCategory.SPAM, Team.NONE, 0),
        ]

        # Single reference instant so all offsets are relative to the same time.
        now = datetime.now()
        emails = []
        truths = []
        next_index = 0  # running id across groups: 0-2, 3-7, 8-11, 12-14, 15-19
        for (count, subject_tpl, body, domain_tpl, hours_step,
             is_vip, sla_hours, category, team, priority) in groups:
            for i in range(count):
                email_id = f"task3_{next_index}"
                next_index += 1
                emails.append(Email(
                    email_id=email_id,
                    subject=subject_tpl.format(n=i + 1),
                    body=body,
                    sender_domain=domain_tpl.format(i=i),
                    timestamp=now - timedelta(hours=i * hours_step),
                    is_vip_sender=is_vip,
                    sla_hours=sla_hours,
                ))
                truths.append(GroundTruth(
                    email_id=email_id,
                    category=category,
                    team=team,
                    priority=priority,
                ))
        return emails, truths