Spaces:
Sleeping
Sleeping
File size: 11,972 Bytes
import random
from datetime import datetime, timedelta
from typing import Optional

from environment.types import Email, GroundTruth, EmailCategory, Team
class DataGenerator:
    """Generates synthetic email datasets for different tasks.

    Three datasets are available, each returned as a pair of parallel
    lists ``(emails, truths)`` where ``truths[i]`` is the expected
    classification for ``emails[i]``:

    * ``generate_task1_emails`` - 10 spam / not-spam emails (easy).
    * ``generate_task2_emails`` - 12 emails labelled by the keyword
      heuristics defined on this class (medium).
    * ``generate_task3_emails`` - 20 emails in five fixed groups (hard).
    """

    # Phrases matched case-insensitively as substrings of "subject body".
    # An email is treated as spam when at least two of them match.
    SPAM_PATTERNS = [
        "Click here now!", "LIMITED TIME OFFER", "Act NOW!!!",
        "Free money", "You've won!", "Congratulations",
        "Verify your account", "Confirm identity", "Update payment",
        "urgent action required", "verify credentials",
        # Without the phrases below, the two promotional templates in
        # task 2 ("Free Trial Offer ... Click NOW!!!" and "CLICK TO CLAIM
        # PRIZE NOW!!!!") matched nothing and were labelled NORMAL.
        "Click now", "Free trial", "Claim prize", "winner",
        "You've been selected",
    ]

    # Lower-case substrings; a single match marks the email as urgent.
    URGENCY_KEYWORDS = [
        "urgent", "asap", "critical", "downtime", "affected",
        "production issue", "customer complaint", "emergency",
    ]

    def __init__(self, seed: int = 42):
        """Seed the random source used for timestamp offsets.

        Args:
            seed: Value passed to ``random.seed``.
        """
        # NOTE: this reseeds the process-wide ``random`` module, not a
        # private generator, so it also affects any other code that draws
        # from ``random`` afterwards.
        random.seed(seed)

    def _is_spam(self, subject: str, body: str) -> bool:
        """Return True when at least two spam patterns occur in the email.

        Args:
            subject: Email subject line.
            body: Email body text.
        """
        text = f"{subject} {body}".lower()
        hits = sum(1 for pattern in self.SPAM_PATTERNS if pattern.lower() in text)
        return hits >= 2

    def _is_urgent(self, subject: str, body: str,
                   sla_hours: Optional[int] = None) -> bool:
        """Return True when the email should be treated as urgent.

        An email is urgent if any urgency keyword occurs in its subject or
        body, or if it carries an SLA of four hours or less.

        Args:
            subject: Email subject line.
            body: Email body text.
            sla_hours: SLA in hours, or None when there is no SLA.
        """
        text = f"{subject} {body}".lower()
        if any(keyword in text for keyword in self.URGENCY_KEYWORDS):
            return True
        # Compare against None explicitly: a plain truthiness test leaked
        # None/0 out of a function annotated ``-> bool`` and treated a
        # 0-hour SLA as if no SLA had been given.
        return sla_hours is not None and sla_hours <= 4

    def _get_category(self, subject: str, body: str,
                      sla_hours: Optional[int] = None) -> EmailCategory:
        """Classify an email; spam wins over urgent, urgent over billing.

        Args:
            subject: Email subject line.
            body: Email body text.
            sla_hours: SLA in hours, or None when there is no SLA.

        Returns:
            SPAM, URGENT, BILLING (subject mentions "billing" or
            "invoice") or NORMAL, checked in that order.
        """
        if self._is_spam(subject, body):
            return EmailCategory.SPAM
        if self._is_urgent(subject, body, sla_hours):
            return EmailCategory.URGENT
        subject_lc = subject.lower()
        if "billing" in subject_lc or "invoice" in subject_lc:
            return EmailCategory.BILLING
        return EmailCategory.NORMAL

    def _get_team(self, category: EmailCategory, subject: str) -> Team:
        """Pick the team an email should be routed to.

        Args:
            category: Category already assigned to the email.
            subject: Email subject line (matched case-insensitively).

        Returns:
            ``Team.NONE`` for spam; ``Team.BILLING`` for billing emails or
            any subject containing "billing"; for urgent emails
            ``Team.SALES`` if the subject contains "sales", otherwise
            ``Team.SUPPORT``; for the rest ``Team.SALES`` if the subject
            contains "sales" or "order", otherwise ``Team.SUPPORT``.
        """
        if category == EmailCategory.SPAM:
            return Team.NONE
        subject_lc = subject.lower()
        if category == EmailCategory.BILLING or "billing" in subject_lc:
            return Team.BILLING
        if category == EmailCategory.URGENT:
            # Urgent mail only goes to sales on an explicit "sales" subject;
            # "order" alone is not enough here.
            return Team.SALES if "sales" in subject_lc else Team.SUPPORT
        if "sales" in subject_lc or "order" in subject_lc:
            return Team.SALES
        return Team.SUPPORT

    def _get_priority(self, category: EmailCategory,
                      sla_hours: Optional[int] = None) -> int:
        """Get priority level 0-3.

        Args:
            category: Category already assigned to the email.
            sla_hours: SLA in hours, or None when there is no SLA.

        Returns:
            0 for spam; for urgent emails 3 when the SLA is two hours or
            less and 2 otherwise; 1 for every other category.
        """
        if category == EmailCategory.SPAM:
            return 0
        if category == EmailCategory.URGENT:
            return 3 if sla_hours is not None and sla_hours <= 2 else 2
        return 1

    def _make_record(self, email_id: str, subject: str, body: str, *,
                     sender_domain: str, timestamp: datetime,
                     is_vip_sender: bool, sla_hours: Optional[int],
                     category: EmailCategory, team: Team,
                     priority: int) -> tuple[Email, GroundTruth]:
        """Build one email together with its ground-truth label.

        Both objects are created with the same ``email_id`` so they can be
        matched up later.
        """
        email = Email(
            email_id=email_id,
            subject=subject,
            body=body,
            sender_domain=sender_domain,
            timestamp=timestamp,
            is_vip_sender=is_vip_sender,
            sla_hours=sla_hours,
        )
        truth = GroundTruth(
            email_id=email_id,
            category=category,
            team=team,
            priority=priority,
        )
        return email, truth

    def generate_task1_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 10 simple spam/not-spam emails (EASY).

        Labels are hand-assigned per sample rather than derived from the
        keyword heuristics. Spam is labelled SPAM / ``Team.NONE`` /
        priority 0 and sent from "promo.com"; everything else is NORMAL /
        ``Team.SUPPORT`` / priority 1 and sent from "company.com".

        Returns:
            ``(emails, truths)`` as parallel lists with ids
            ``task1_0`` .. ``task1_9``.
        """
        # (subject, body, is_spam) kept together so the three values for a
        # sample cannot drift out of alignment.
        samples = [
            ("Click here for FREE MONEY now!!!",
             "Click the link to claim your prize! This offer expires in 1 hour!",
             True),
            ("Verify your PayPal account immediately",
             "We detected unusual login attempts. Verify now: [link]",
             True),
            ("CONGRATS You've Won $1,000,000",
             "You are a lucky winner! Click to collect your prize!!!",
             True),
            ("Your AWS account has unusual activity",
             "We noticed some unusual activity on your account. Please review.",
             False),
            ("Team standup at 10am today",
             "Agenda: Q2 planning, budget review, timeline discussion",
             False),
            ("Weekly status report submission",
             "Completed: API optimization, 3 new features, 2 bugs fixed",
             False),
            ("Meeting notes from yesterday",
             "Here are the key points from our 10am sync yesterday.",
             False),
            ("Can we sync up tomorrow?",
             "Let's discuss the new design for the dashboard",
             False),
            ("LIMITED TIME: 50% OFF EVERYTHING",
             "SALE: All summer items 50% off! Shop now before supplies run out!",
             True),
            ("Password reset request - URGENT",
             "Someone requested to reset your password. If this wasn't you, ignore this email.",
             False),
        ]

        emails = []
        truths = []
        # One reference instant so all offsets are relative to the same time.
        now = datetime.now()
        for i, (subject, body, is_spam) in enumerate(samples):
            email, truth = self._make_record(
                f"task1_{i}", subject, body,
                sender_domain="promo.com" if is_spam else "company.com",
                timestamp=now - timedelta(hours=random.randint(1, 24)),
                is_vip_sender=False,
                sla_hours=None,
                category=EmailCategory.SPAM if is_spam else EmailCategory.NORMAL,
                team=Team.NONE if is_spam else Team.SUPPORT,
                priority=0 if is_spam else 1,
            )
            emails.append(email)
            truths.append(truth)
        return emails, truths

    def generate_task2_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 12 multi-class routing emails (MEDIUM).

        Each template is ``(subject, body, sla_hours, is_vip)``. Ground
        truth is computed with ``_get_category``, ``_get_team`` and
        ``_get_priority``. VIP senders use the "customer.com" domain, all
        others "internal.com".

        Returns:
            ``(emails, truths)`` as parallel lists with ids
            ``task2_0`` .. ``task2_11``.
        """
        templates = [
            ("URGENT: Production database down!!", "Our main database is offline. All services affected. This is critical.", 4, True),
            ("Invoice for March 2024", "Please find attached your invoice. Payment due by April 10.", None, False),
            ("Free Trial Offer - 30 Days!", "Get our premium service FREE for 30 days. Click NOW!!!", None, False),
            ("Customer complaint - Order #12345", "Customer reports missing items. Needs urgent resolution.", 2, True),
            ("Team meeting at 2pm", "Just a reminder about our sync at 2pm today in the main conference room.", None, False),
            ("Billing issue - Duplicate charge", "Customer reports being charged twice. Need help resolving.", 6, False),
            ("Sales inquiry: Enterprise plan", "Interest in your enterprise solution. Can we talk pricing?", None, False),
            ("System alert: High memory usage", "Memory utilization at 95%. Recommend immediate investigation.", 1, True),
            ("Password reset link", "You requested a password reset. Click the link below.", None, False),
            ("Feature request from VIP customer", "Our top customer requesting new analytics dashboard.", 8, False),
            ("CLICK TO CLAIM PRIZE NOW!!!!", "You've been selected as today's big winner! Claim prize NOW!", None, False),
            ("Meeting transcript from standup", "Here are the notes from this morning's standup meeting.", None, False),
        ]

        emails = []
        truths = []
        now = datetime.now()
        for i, (subject, body, sla_hours, is_vip) in enumerate(templates):
            category = self._get_category(subject, body, sla_hours)
            email, truth = self._make_record(
                f"task2_{i}", subject, body,
                sender_domain="customer.com" if is_vip else "internal.com",
                timestamp=now - timedelta(hours=random.randint(1, 12)),
                is_vip_sender=is_vip,
                sla_hours=sla_hours,
                category=category,
                team=self._get_team(category, subject),
                priority=self._get_priority(category, sla_hours),
            )
            emails.append(email)
            truths.append(truth)
        return emails, truths

    def generate_task3_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 20 context-aware emails with escalation (HARD).

        The emails come in five fixed groups with hand-assigned labels:
        3 VIP outages, 5 support tickets, 4 billing inquiries, 3 sales
        leads and 5 spam messages. No randomness is involved.

        Returns:
            ``(emails, truths)`` as parallel lists with ids
            ``task3_0`` .. ``task3_19`` in group order.
        """
        emails = []
        truths = []
        now = datetime.now()

        def add(record):
            email, truth = record
            emails.append(email)
            truths.append(truth)

        # VIP customer issues (high priority)
        for i in range(3):
            add(self._make_record(
                f"task3_{i}",
                f"VIP Customer Issue #{i+1}: Service outage",
                "Our VIP enterprise customer reporting service unavailability. Revenue impact potential. Immediate escalation required.",
                sender_domain="vip_customer.com",
                timestamp=now,
                is_vip_sender=True,
                sla_hours=1,
                category=EmailCategory.URGENT,
                team=Team.SUPPORT,
                priority=3,
            ))

        # Standard support cases
        for i in range(5):
            add(self._make_record(
                f"task3_{3+i}",
                f"Support ticket #{i+1}",
                "Customer issue regarding feature X. Needs resolution within 24 hours.",
                sender_domain="support.company.com",
                timestamp=now - timedelta(hours=i*2),
                is_vip_sender=False,
                sla_hours=24,
                category=EmailCategory.NORMAL,
                team=Team.SUPPORT,
                priority=1,
            ))

        # Billing issues
        for i in range(4):
            add(self._make_record(
                f"task3_{8+i}",
                f"Billing inquiry #{i+1}",
                "Customer question about invoice or billing. Standard resolution.",
                sender_domain="billing.com",
                timestamp=now - timedelta(hours=i*3),
                is_vip_sender=False,
                sla_hours=None,
                category=EmailCategory.BILLING,
                team=Team.BILLING,
                priority=1,
            ))

        # Sales leads (NORMAL category but labelled with priority 2)
        for i in range(3):
            add(self._make_record(
                f"task3_{12+i}",
                f"Sales inquiry #{i+1}: Enterprise interest",
                "New company interested in our enterprise solution. High-value potential lead.",
                sender_domain=f"company{i}.com",
                timestamp=now - timedelta(hours=i*4),
                is_vip_sender=False,
                sla_hours=None,
                category=EmailCategory.NORMAL,
                team=Team.SALES,
                priority=2,
            ))

        # Spam emails
        for i in range(5):
            add(self._make_record(
                f"task3_{15+i}",
                "CLICK HERE NOW !!! Get FREE stuff!!!",
                "Limited time offer expires in 1 hour. Click the link to claim your prize!",
                sender_domain=f"spam{i}.com",
                timestamp=now - timedelta(hours=i*5),
                is_vip_sender=False,
                sla_hours=None,
                category=EmailCategory.SPAM,
                team=Team.NONE,
                priority=0,
            ))

        return emails, truths
|