# Meta_Hackathon / environment/data_generator.py
# Commit fee8744 (parthpethia): Add Email Triage OpenEnv environment -
# production-ready with 3 graded tasks and Flask API
import random
from datetime import datetime, timedelta
from typing import Optional

from environment.types import Email, GroundTruth, EmailCategory, Team
class DataGenerator:
    """Generates synthetic email datasets for different tasks.

    Three fixed datasets are available:

    * ``generate_task1_emails`` - 10 hand-labelled spam / not-spam emails.
    * ``generate_task2_emails`` - 12 emails whose labels are derived from the
      keyword heuristics in this class (``_get_category``, ``_get_team``,
      ``_get_priority``).
    * ``generate_task3_emails`` - 20 hand-labelled emails in five groups.

    Every generator returns ``(emails, truths)`` where ``truths[i]`` is the
    ``GroundTruth`` for ``emails[i]`` (both carry the same ``email_id``).
    """

    # Phrases counted (case-insensitively, as substrings) by ``_is_spam``.
    SPAM_PATTERNS = [
        "Click here now!", "LIMITED TIME OFFER", "Act NOW!!!",
        "Free money", "You've won!", "Congratulations",
        "Verify your account", "Confirm identity", "Update payment",
        "urgent action required", "verify credentials",
    ]

    # Substrings that mark an email as urgent in ``_is_urgent``.
    URGENCY_KEYWORDS = [
        "urgent", "asap", "critical", "downtime", "affected",
        "production issue", "customer complaint", "emergency",
    ]

    # Minimum number of distinct SPAM_PATTERNS that must match for spam.
    _SPAM_MATCH_THRESHOLD = 2
    # An SLA at or below this many hours makes an email urgent.
    _URGENT_SLA_HOURS = 4
    # An urgent email with an SLA at or below this many hours gets priority 3.
    _CRITICAL_SLA_HOURS = 2

    def __init__(self, seed: int = 42):
        """Seed the random source used for timestamp offsets.

        NOTE: this reseeds the module-level ``random`` generator, so it also
        affects every other user of ``random`` in the process.  Only the
        hour offsets of task 1 and task 2 timestamps are drawn from it.

        Args:
            seed: Value passed to ``random.seed``.
        """
        random.seed(seed)

    def _is_spam(self, subject: str, body: str) -> bool:
        """Return True when at least two spam patterns occur in the email.

        Subject and body are joined and lower-cased; each entry of
        ``SPAM_PATTERNS`` counts at most once.
        """
        text = f"{subject} {body}".lower()
        matches = sum(
            pattern.lower() in text for pattern in self.SPAM_PATTERNS
        )
        return matches >= self._SPAM_MATCH_THRESHOLD

    def _is_urgent(
        self, subject: str, body: str, sla_hours: Optional[int] = None
    ) -> bool:
        """Return True when the email is urgent.

        An email is urgent if any ``URGENCY_KEYWORDS`` entry occurs in the
        lower-cased subject + body, or if it has an SLA of at most
        ``_URGENT_SLA_HOURS`` hours.

        Args:
            subject: Email subject line.
            body: Email body text.
            sla_hours: SLA deadline in hours, or ``None`` when there is none.
                ``0`` is a real (already due) deadline and counts as urgent.

        Returns:
            A strict ``bool`` (never ``None`` or an int).
        """
        text = f"{subject} {body}".lower()
        # Lower-case the keyword as well so matching is case-insensitive on
        # both sides, consistent with ``_is_spam``.
        if any(keyword.lower() in text for keyword in self.URGENCY_KEYWORDS):
            return True
        # ``is not None`` rather than truthiness: 0 hours is a valid SLA.
        return sla_hours is not None and sla_hours <= self._URGENT_SLA_HOURS

    def _get_category(
        self, subject: str, body: str, sla_hours: Optional[int] = None
    ) -> EmailCategory:
        """Classify an email; the first matching rule wins.

        Order: spam, urgent, billing ("billing" or "invoice" in the subject
        only), otherwise normal.
        """
        if self._is_spam(subject, body):
            return EmailCategory.SPAM
        if self._is_urgent(subject, body, sla_hours):
            return EmailCategory.URGENT
        subject_lc = subject.lower()
        if "billing" in subject_lc or "invoice" in subject_lc:
            return EmailCategory.BILLING
        return EmailCategory.NORMAL

    def _get_team(self, category: EmailCategory, subject: str) -> Team:
        """Pick the team that should receive the email.

        Rules, in order: spam goes to nobody; billing category or a subject
        containing "billing" goes to BILLING (even when urgent); urgent mail
        goes to SALES if the subject mentions "sales", else SUPPORT; any other
        subject mentioning "sales" or "order" goes to SALES; the rest goes to
        SUPPORT.
        """
        if category == EmailCategory.SPAM:
            return Team.NONE
        subject_lc = subject.lower()
        if category == EmailCategory.BILLING or "billing" in subject_lc:
            return Team.BILLING
        if category == EmailCategory.URGENT:
            return Team.SALES if "sales" in subject_lc else Team.SUPPORT
        if "sales" in subject_lc or "order" in subject_lc:
            return Team.SALES
        return Team.SUPPORT

    def _get_priority(
        self, category: EmailCategory, sla_hours: Optional[int] = None
    ) -> int:
        """Get priority level 0-3.

        Spam is 0.  Urgent mail is 3 when its SLA is at most
        ``_CRITICAL_SLA_HOURS`` hours (0 included), otherwise 2.  Everything
        else, billing included, is 1.
        """
        if category == EmailCategory.SPAM:
            return 0
        if category == EmailCategory.URGENT:
            is_critical = (
                sla_hours is not None
                and sla_hours <= self._CRITICAL_SLA_HOURS
            )
            return 3 if is_critical else 2
        return 1

    def generate_task1_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 10 simple spam/not-spam emails (EASY).

        Labels are hand-assigned.  Spam comes from "promo.com" with team NONE
        and priority 0; everything else comes from "company.com" with team
        SUPPORT and priority 1.  Timestamps are 1-24 random hours in the past.
        """
        # (subject, body, is_spam) kept together so the three fields of one
        # sample cannot drift out of step.
        samples = [
            ("Click here for FREE MONEY now!!!",
             "Click the link to claim your prize! This offer expires in 1 hour!",
             True),
            ("Verify your PayPal account immediately",
             "We detected unusual login attempts. Verify now: [link]",
             True),
            ("CONGRATS You've Won $1,000,000",
             "You are a lucky winner! Click to collect your prize!!!",
             True),
            ("Your AWS account has unusual activity",
             "We noticed some unusual activity on your account. Please review.",
             False),
            ("Team standup at 10am today",
             "Agenda: Q2 planning, budget review, timeline discussion",
             False),
            ("Weekly status report submission",
             "Completed: API optimization, 3 new features, 2 bugs fixed",
             False),
            ("Meeting notes from yesterday",
             "Here are the key points from our 10am sync yesterday.",
             False),
            ("Can we sync up tomorrow?",
             "Let's discuss the new design for the dashboard",
             False),
            ("LIMITED TIME: 50% OFF EVERYTHING",
             "SALE: All summer items 50% off! Shop now before supplies run out!",
             True),
            ("Password reset request - URGENT",
             "Someone requested to reset your password. If this wasn't you, ignore this email.",
             False),
        ]

        # One reference time so every offset in the dataset shares a base.
        now = datetime.now()
        emails = []
        truths = []
        for i, (subject, body, is_spam) in enumerate(samples):
            email_id = f"task1_{i}"
            emails.append(Email(
                email_id=email_id,
                subject=subject,
                body=body,
                sender_domain="promo.com" if is_spam else "company.com",
                timestamp=now - timedelta(hours=random.randint(1, 24)),
                is_vip_sender=False,
                sla_hours=None,
            ))
            truths.append(GroundTruth(
                email_id=email_id,
                category=EmailCategory.SPAM if is_spam else EmailCategory.NORMAL,
                team=Team.NONE if is_spam else Team.SUPPORT,
                priority=0 if is_spam else 1,
            ))
        return emails, truths

    def generate_task2_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 12 multi-class routing emails (MEDIUM).

        Ground truth is computed from each template with ``_get_category``,
        ``_get_team`` and ``_get_priority``.  VIP templates are sent from
        "customer.com", the rest from "internal.com".  Timestamps are 1-12
        random hours in the past.
        """
        # (subject, body, sla_hours, is_vip)
        # NOTE(review): "Free Trial Offer - 30 Days!" and
        # "CLICK TO CLAIM PRIZE NOW!!!!" read like spam but match none of
        # SPAM_PATTERNS, so their ground truth is NORMAL / SUPPORT /
        # priority 1.  Confirm this is intended before grading spam on task 2.
        templates = [
            ("URGENT: Production database down!!", "Our main database is offline. All services affected. This is critical.", 4, True),
            ("Invoice for March 2024", "Please find attached your invoice. Payment due by April 10.", None, False),
            ("Free Trial Offer - 30 Days!", "Get our premium service FREE for 30 days. Click NOW!!!", None, False),
            ("Customer complaint - Order #12345", "Customer reports missing items. Needs urgent resolution.", 2, True),
            ("Team meeting at 2pm", "Just a reminder about our sync at 2pm today in the main conference room.", None, False),
            ("Billing issue - Duplicate charge", "Customer reports being charged twice. Need help resolving.", 6, False),
            ("Sales inquiry: Enterprise plan", "Interest in your enterprise solution. Can we talk pricing?", None, False),
            ("System alert: High memory usage", "Memory utilization at 95%. Recommend immediate investigation.", 1, True),
            ("Password reset link", "You requested a password reset. Click the link below.", None, False),
            ("Feature request from VIP customer", "Our top customer requesting new analytics dashboard.", 8, False),
            ("CLICK TO CLAIM PRIZE NOW!!!!", "You've been selected as today's big winner! Claim prize NOW!", None, False),
            ("Meeting transcript from standup", "Here are the notes from this morning's standup meeting.", None, False),
        ]

        now = datetime.now()
        emails = []
        truths = []
        for i, (subject, body, sla_hours, is_vip) in enumerate(templates):
            email_id = f"task2_{i}"
            emails.append(Email(
                email_id=email_id,
                subject=subject,
                body=body,
                sender_domain="customer.com" if is_vip else "internal.com",
                timestamp=now - timedelta(hours=random.randint(1, 12)),
                is_vip_sender=is_vip,
                sla_hours=sla_hours,
            ))
            category = self._get_category(subject, body, sla_hours)
            truths.append(GroundTruth(
                email_id=email_id,
                category=category,
                team=self._get_team(category, subject),
                priority=self._get_priority(category, sla_hours),
            ))
        return emails, truths

    def generate_task3_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Generate 20 context-aware emails with escalation (HARD).

        The dataset is five hand-labelled groups, emitted in this order with
        consecutive ids ``task3_0`` .. ``task3_19``: 3 VIP outages, 5 standard
        support tickets, 4 billing inquiries, 3 sales leads and 5 spam emails.
        Within a group the i-th email (0-based) is ``i * hours_step`` hours
        old.  No randomness is used.
        """
        # Each row: (count, subject template, body, sender-domain template,
        #            hours_step, is_vip, sla_hours, category, team, priority)
        # Templates may use {n} (1-based position in the group) and
        # {i} (0-based position in the group).
        groups = [
            # VIP customer issues (high priority)
            (3, "VIP Customer Issue #{n}: Service outage",
             "Our VIP enterprise customer reporting service unavailability. Revenue impact potential. Immediate escalation required.",
             "vip_customer.com", 0, True, 1,
             EmailCategory.URGENT, Team.SUPPORT, 3),
            # Standard support cases
            (5, "Support ticket #{n}",
             "Customer issue regarding feature X. Needs resolution within 24 hours.",
             "support.company.com", 2, False, 24,
             EmailCategory.NORMAL, Team.SUPPORT, 1),
            # Billing issues
            (4, "Billing inquiry #{n}",
             "Customer question about invoice or billing. Standard resolution.",
             "billing.com", 3, False, None,
             EmailCategory.BILLING, Team.BILLING, 1),
            # Sales leads
            (3, "Sales inquiry #{n}: Enterprise interest",
             "New company interested in our enterprise solution. High-value potential lead.",
             "company{i}.com", 4, False, None,
             EmailCategory.NORMAL, Team.SALES, 2),
            # Spam emails (same subject for every message)
            (5, "CLICK HERE NOW !!! Get FREE stuff!!!",
             "Limited time offer expires in 1 hour. Click the link to claim your prize!",
             "spam{i}.com", 5, False, None,
             EmailCategory.SPAM, Team.NONE, 0),
        ]

        now = datetime.now()
        emails = []
        truths = []
        for (count, subject_tpl, body, domain_tpl, hours_step,
             is_vip, sla_hours, category, team, priority) in groups:
            for i in range(count):
                # Running index keeps ids contiguous across groups without
                # hard-coded offsets.
                email_id = f"task3_{len(emails)}"
                emails.append(Email(
                    email_id=email_id,
                    subject=subject_tpl.format(i=i, n=i + 1),
                    body=body,
                    sender_domain=domain_tpl.format(i=i, n=i + 1),
                    timestamp=now - timedelta(hours=i * hours_step),
                    is_vip_sender=is_vip,
                    sla_hours=sla_hours,
                ))
                truths.append(GroundTruth(
                    email_id=email_id,
                    category=category,
                    team=team,
                    priority=priority,
                ))
        return emails, truths