File size: 11,972 Bytes
fee8744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import random
from datetime import datetime, timedelta
from environment.types import Email, GroundTruth, EmailCategory, Team

class DataGenerator:
    """Generates synthetic email datasets for different tasks"""

    SPAM_PATTERNS = [
        "Click here now!", "LIMITED TIME OFFER", "Act NOW!!!",
        "Free money", "You've won!", "Congratulations",
        "Verify your account", "Confirm identity", "Update payment",
        "urgent action required", "verify credentials"
    ]

    URGENCY_KEYWORDS = [
        "urgent", "asap", "critical", "downtime", "affected",
        "production issue", "customer complaint", "emergency"
    ]

    def __init__(self, seed: int = 42):
        random.seed(seed)

    def _is_spam(self, subject: str, body: str) -> bool:
        """Determine if email is spam based on patterns"""
        text = (subject + " " + body).lower()
        spam_score = sum(1 for pattern in self.SPAM_PATTERNS if pattern.lower() in text)
        return spam_score >= 2

    def _is_urgent(self, subject: str, body: str, sla_hours: int = None) -> bool:
        """Determine if email is urgent"""
        text = (subject + " " + body).lower()
        urgency_score = sum(1 for kw in self.URGENCY_KEYWORDS if kw in text)
        return urgency_score >= 1 or (sla_hours and sla_hours <= 4)

    def _get_category(self, subject: str, body: str, sla_hours: int = None) -> EmailCategory:
        """Classify an email, checking spam first, then urgency, then billing.

        Billing is recognised only from the subject line (the words
        "billing" or "invoice", case-insensitive); the body is not consulted
        for that check.

        Args:
            subject: Email subject line.
            body: Email body text.
            sla_hours: SLA in hours, or None; forwarded to the urgency check.

        Returns:
            SPAM, URGENT, BILLING or NORMAL, in that order of precedence.
        """
        if self._is_spam(subject, body):
            label = EmailCategory.SPAM
        elif self._is_urgent(subject, body, sla_hours):
            label = EmailCategory.URGENT
        else:
            lowered_subject = subject.lower()
            mentions_billing = any(
                word in lowered_subject for word in ("billing", "invoice")
            )
            label = EmailCategory.BILLING if mentions_billing else EmailCategory.NORMAL
        return label

    def _get_team(self, category: EmailCategory, subject: str) -> Team:
        """Pick the team an email should be routed to.

        Args:
            category: Category already assigned to the email.
            subject: Email subject line; matched case-insensitively.

        Returns:
            ``Team.NONE`` for spam; ``Team.BILLING`` for billing mail or any
            subject mentioning "billing"; ``Team.SALES`` when the subject
            mentions "sales" (or "order", for non-urgent mail); otherwise
            ``Team.SUPPORT``.
        """
        if category == EmailCategory.SPAM:
            return Team.NONE
        if category == EmailCategory.BILLING:
            return Team.BILLING

        lowered = subject.lower()
        if "billing" in lowered:
            return Team.BILLING

        # Urgent mail is sent to sales only on an explicit "sales" mention;
        # everything else also counts the word "order" as a sales signal.
        if category == EmailCategory.URGENT:
            sales_words = ("sales",)
        else:
            sales_words = ("sales", "order")

        if any(word in lowered for word in sales_words):
            return Team.SALES
        return Team.SUPPORT

    def _get_priority(self, category: EmailCategory, sla_hours: int = None) -> int:
        """Map a category (and optional SLA) to a priority level from 0 to 3.

        Args:
            category: Category already assigned to the email.
            sla_hours: SLA in hours, or None when the email has no SLA.

        Returns:
            0 for spam, 3 for urgent mail whose SLA is two hours or less,
            2 for any other urgent mail, and 1 for everything else.
        """
        if category == EmailCategory.SPAM:
            return 0
        if category == EmailCategory.URGENT:
            # Compare with None explicitly: a plain truthiness test would read
            # an SLA of 0 hours as "no SLA" and under-prioritise the email.
            tight_sla = sla_hours is not None and sla_hours <= 2
            return 3 if tight_sla else 2
        # Billing and normal mail share the same baseline priority.
        return 1

    def generate_task1_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Build the EASY dataset: 10 emails labelled spam or not-spam.

        Labels are assigned by hand in the table below rather than derived
        from the keyword heuristics. Each email gets a timestamp between 1 and
        24 hours before the current time, drawn from the module-level
        ``random`` generator.

        Returns:
            A pair ``(emails, truths)`` of equal length where ``truths[k]``
            describes ``emails[k]``. Spam maps to SPAM / Team.NONE /
            priority 0; everything else to NORMAL / Team.SUPPORT / priority 1.
        """
        # One row per email: (subject, body, is_spam).
        samples = [
            ("Click here for FREE MONEY now!!!",
             "Click the link to claim your prize! This offer expires in 1 hour!",
             True),
            ("Verify your PayPal account immediately",
             "We detected unusual login attempts. Verify now: [link]",
             True),
            ("CONGRATS You've Won $1,000,000",
             "You are a lucky winner! Click to collect your prize!!!",
             True),
            ("Your AWS account has unusual activity",
             "We noticed some unusual activity on your account. Please review.",
             False),
            ("Team standup at 10am today",
             "Agenda: Q2 planning, budget review, timeline discussion",
             False),
            ("Weekly status report submission",
             "Completed: API optimization, 3 new features, 2 bugs fixed",
             False),
            ("Meeting notes from yesterday",
             "Here are the key points from our 10am sync yesterday.",
             False),
            ("Can we sync up tomorrow?",
             "Let's discuss the new design for the dashboard",
             False),
            ("LIMITED TIME: 50% OFF EVERYTHING",
             "SALE: All summer items 50% off! Shop now before supplies run out!",
             True),
            ("Password reset request - URGENT",
             "Someone requested to reset your password. If this wasn't you, ignore this email.",
             False),
        ]

        emails = []
        truths = []
        for index, (subject, body, spam) in enumerate(samples):
            email_id = f"task1_{index}"
            hours_ago = random.randint(1, 24)
            emails.append(Email(
                email_id=email_id,
                subject=subject,
                body=body,
                sender_domain="promo.com" if spam else "company.com",
                timestamp=datetime.now() - timedelta(hours=hours_ago),
                is_vip_sender=False,
                sla_hours=None
            ))

            if spam:
                category, team, priority = EmailCategory.SPAM, Team.NONE, 0
            else:
                category, team, priority = EmailCategory.NORMAL, Team.SUPPORT, 1
            truths.append(GroundTruth(
                email_id=email_id,
                category=category,
                team=team,
                priority=priority
            ))

        return emails, truths

    def generate_task2_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Build the MEDIUM dataset: 12 emails for multi-class routing.

        Unlike the hand-labelled datasets, the ground truth here is computed
        by ``_get_category``, ``_get_team`` and ``_get_priority`` from each
        template's subject, body and SLA. Timestamps fall 1 to 12 hours before
        the current time, drawn from the module-level ``random`` generator.

        Returns:
            A pair ``(emails, truths)`` of equal length where ``truths[k]``
            describes ``emails[k]``.
        """
        # Each row: (subject, body, sla_hours, is_vip).
        # NOTE(review): rows 2 and 10 read like promotions but match none of
        # SPAM_PATTERNS, so the heuristics label them NORMAL - confirm that
        # this is the intended ground truth.
        templates = [
            ("URGENT: Production database down!!", "Our main database is offline. All services affected. This is critical.", 4, True),
            ("Invoice for March 2024", "Please find attached your invoice. Payment due by April 10.", None, False),
            ("Free Trial Offer - 30 Days!", "Get our premium service FREE for 30 days. Click NOW!!!", None, False),
            ("Customer complaint - Order #12345", "Customer reports missing items. Needs urgent resolution.", 2, True),
            ("Team meeting at 2pm", "Just a reminder about our sync at 2pm today in the main conference room.", None, False),
            ("Billing issue - Duplicate charge", "Customer reports being charged twice. Need help resolving.", 6, False),
            ("Sales inquiry: Enterprise plan", "Interest in your enterprise solution. Can we talk pricing?", None, False),
            ("System alert: High memory usage", "Memory utilization at 95%. Recommend immediate investigation.", 1, True),
            ("Password reset link", "You requested a password reset. Click the link below.", None, False),
            ("Feature request from VIP customer", "Our top customer requesting new analytics dashboard.", 8, False),
            ("CLICK TO CLAIM PRIZE NOW!!!!", "You've been selected as today's big winner! Claim prize NOW!", None, False),
            ("Meeting transcript from standup", "Here are the notes from this morning's standup meeting.", None, False)
        ]

        emails, truths = [], []
        for index, template in enumerate(templates):
            subject, body, sla_hours, is_vip = template
            email_id = f"task2_{index}"
            age = timedelta(hours=random.randint(1, 12))

            emails.append(Email(
                email_id=email_id,
                subject=subject,
                body=body,
                # VIP rows are modelled as external customers.
                sender_domain="customer.com" if is_vip else "internal.com",
                timestamp=datetime.now() - age,
                is_vip_sender=is_vip,
                sla_hours=sla_hours
            ))

            # Team and priority both depend on the category computed first.
            category = self._get_category(subject, body, sla_hours)
            truths.append(GroundTruth(
                email_id=email_id,
                category=category,
                team=self._get_team(category, subject),
                priority=self._get_priority(category, sla_hours)
            ))

        return emails, truths

    def generate_task3_emails(self) -> tuple[list[Email], list[GroundTruth]]:
        """Build the HARD dataset: 20 hand-labelled emails in five groups.

        The groups, in order, are 3 VIP outages, 5 standard support tickets,
        4 billing inquiries, 3 sales leads and 5 spam messages. Email ids run
        consecutively from ``task3_0`` to ``task3_19`` across the groups.
        Within a group, each later email is dated a fixed number of hours
        further back from the current time.

        Returns:
            A pair ``(emails, truths)`` of equal length where ``truths[k]``
            describes ``emails[k]``.
        """
        # One entry per group. In the templates "{n}" is the 1-based position
        # inside the group and "{i}" the 0-based one. "label" holds the
        # (category, team, priority) ground truth shared by the whole group.
        groups = [
            {   # VIP customer issues (high priority)
                "count": 3,
                "subject": "VIP Customer Issue #{n}: Service outage",
                "body": "Our VIP enterprise customer reporting service unavailability. Revenue impact potential. Immediate escalation required.",
                "domain": "vip_customer.com",
                "hours_step": 0,
                "vip": True,
                "sla": 1,
                "label": (EmailCategory.URGENT, Team.SUPPORT, 3),
            },
            {   # Standard support cases
                "count": 5,
                "subject": "Support ticket #{n}",
                "body": "Customer issue regarding feature X. Needs resolution within 24 hours.",
                "domain": "support.company.com",
                "hours_step": 2,
                "vip": False,
                "sla": 24,
                "label": (EmailCategory.NORMAL, Team.SUPPORT, 1),
            },
            {   # Billing issues
                "count": 4,
                "subject": "Billing inquiry #{n}",
                "body": "Customer question about invoice or billing. Standard resolution.",
                "domain": "billing.com",
                "hours_step": 3,
                "vip": False,
                "sla": None,
                "label": (EmailCategory.BILLING, Team.BILLING, 1),
            },
            {   # Sales leads
                "count": 3,
                "subject": "Sales inquiry #{n}: Enterprise interest",
                "body": "New company interested in our enterprise solution. High-value potential lead.",
                "domain": "company{i}.com",
                "hours_step": 4,
                "vip": False,
                "sla": None,
                "label": (EmailCategory.NORMAL, Team.SALES, 2),
            },
            {   # Spam emails
                "count": 5,
                "subject": "CLICK HERE NOW !!! Get FREE stuff!!!",
                "body": "Limited time offer expires in 1 hour. Click the link to claim your prize!",
                "domain": "spam{i}.com",
                "hours_step": 5,
                "vip": False,
                "sla": None,
                "label": (EmailCategory.SPAM, Team.NONE, 0),
            },
        ]

        emails = []
        truths = []
        for group in groups:
            category, team, priority = group["label"]
            # Ids continue from wherever the previous group stopped.
            first_index = len(emails)
            for i in range(group["count"]):
                email_id = f"task3_{first_index + i}"
                emails.append(Email(
                    email_id=email_id,
                    subject=group["subject"].format(n=i + 1),
                    body=group["body"],
                    sender_domain=group["domain"].format(i=i),
                    timestamp=datetime.now() - timedelta(hours=i * group["hours_step"]),
                    is_vip_sender=group["vip"],
                    sla_hours=group["sla"]
                ))
                truths.append(GroundTruth(
                    email_id=email_id,
                    category=category,
                    team=team,
                    priority=priority
                ))

        return emails, truths