File size: 16,138 Bytes
bcd8636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdaf3e1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
"""
Task definitions and automated graders for the DataDetective environment.

Each task has:
  - id, title, difficulty, description
  - A grader function that scores the agent's final answer (0.0 - 1.0)
    based on whether key findings are mentioned.
"""

import re
from typing import Callable


def _has_any(text: str, keywords: list[str]) -> bool:
    """Case-insensitive check: does *text* contain any of *keywords*?"""
    low = text.lower()
    return any(kw.lower() in low for kw in keywords)


def _has_pattern(text: str, pattern: str) -> bool:
    return bool(re.search(pattern, text, re.IGNORECASE))


def _grade_orders_drop(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["drop", "decrease", "decline", "fell", "fewer", "reduction", "lower"]):
        score += 0.20
    if _has_any(answer, ["spring mega sale", "spring sale", "mega sale"]) or (
        _has_any(answer, ["promotion", "promo", "sale", "discount", "campaign"])
    ):
        score += 0.20
    if _has_any(answer, ["ended", "expired", "over", "concluded", "stopped"]) or _has_pattern(
        answer, r"march\s*0?1"
    ):
        score += 0.20
    if _has_any(answer, [
        "caused", "because", "due to", "result of", "led to",
        "when the", "after the", "ending of", "end of the",
        "correlated", "explains",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\d+\s*(orders|transactions)") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
        score += 0.20
    return min(score, 1.0)


def _grade_returns_spike(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["wireless headphones", "headphones pro", "headphone"]):
        score += 0.20
    if _has_any(answer, ["west"]):
        score += 0.20
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20
    if _has_any(answer, [
        "defect", "defective", "faulty", "quality",
        "high return", "return rate", "abnormal",
        "stopped working", "battery issue", "poor audio",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\d+\s*%") or _has_pattern(
        answer, r"\d+\s*(returns|returned|units)"
    ) or _has_any(answer, ["return rate", "compared to"]):
        score += 0.20
    return min(score, 1.0)


def _grade_customer_churn(answer: str) -> float:
    score = 0.0
    if _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "decrease", "drop", "churn", "fewer active",
        "lost customers", "stopped ordering",
    ]):
        score += 0.20
    if _has_any(answer, ["enterprise"]):
        score += 0.20
    if _has_any(answer, ["northeast", "north east", "north-east"]):
        score += 0.20
    if _has_any(answer, [
        "price increase", "price change", "price hike", "pricing",
        "more expensive", "raised price", "cost increase",
    ]):
        score += 0.20
    if _has_any(answer, [
        "laptop pro", "desktop workstation", "office suite",
        "devtools", "external ssd",
    ]) or _has_pattern(answer, r"product.*(1|2|11|15|19)"):
        score += 0.20
    return min(score, 1.0)


def _grade_shipping_delay(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["midwest"]):
        score += 0.20
    if _has_any(answer, ["quickship", "quick ship"]):
        score += 0.20
    if _has_any(answer, [
        "delivery delay", "late delivery", "delayed shipment",
        "shipping delay", "late shipment", "delivery time",
        "delayed delivery", "slow delivery",
    ]):
        score += 0.20
    if _has_pattern(answer, r"feb(ruary)?\s*(10|mid|middle)") or _has_any(answer, [
        "mid-february", "mid february", "around february",
        "starting in february", "beginning of february",
    ]):
        score += 0.20
    if _has_any(answer, [
        "support ticket", "complaint", "ticket volume",
        "customer satisfaction", "support request",
    ]) and _has_any(answer, [
        "delivery", "shipping", "carrier", "quickship",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_revenue_paradox(answer: str) -> float:
    score = 0.0
    if _has_any(answer, [
        "spring mega sale", "mega sale", "25%", "25 percent",
    ]) or (
        _has_any(answer, ["promotion", "promo", "discount", "sale"])
        and _has_any(answer, ["margin", "profit", "cost"])
    ):
        score += 0.20
    if _has_any(answer, [
        "product mix", "category mix", "mix shift", "shifted toward",
        "higher proportion", "more electronics", "low-margin",
        "composition changed",
    ]):
        score += 0.20
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price increase", "price change", "price hike",
        "lost", "churn", "left", "fewer", "decline",
    ]):
        score += 0.20
    if _has_any(answer, ["return", "refund"]) and _has_any(answer, [
        "cost", "expense", "profit", "margin", "loss", "erode",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\$\s*[\d,]+") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\$?[\d,]+.*to\s+\$?[\d,]+"):
        score += 0.20
    return min(score, 1.0)


def _grade_supplier_quality(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20
    if _has_any(answer, ["wireless headphones", "headphones pro", "product 6"]):
        score += 0.20
    if _has_any(answer, ["bluetooth speaker", "product 7"]):
        score += 0.20
    if _has_any(answer, ["return rate", "refund", "return volume"]) or _has_pattern(
        answer, r"\d+\s*%.*return"
    ) or _has_pattern(answer, r"return.*\d+\s*%") or _has_pattern(
        answer, r"\$\s*[\d,]+"
    ):
        score += 0.20
    if _has_any(answer, [
        "support ticket", "defect", "complaint", "product_defect",
        "quality issue", "customer complaint",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_inventory_stockout(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["west"]):
        score += 0.20
    if _has_any(answer, ["monitor", "product 4", "monitor 27"]):
        score += 0.20
    if _has_any(answer, [
        "inventory", "stock", "out of stock", "stockout", "stock-out",
        "zero units", "no inventory", "warehouse",
    ]):
        score += 0.20
    if _has_any(answer, [
        "spring mega sale", "mega sale", "promo", "promotion",
        "february 15", "feb 15", "during the sale",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\d+\s*(units|orders|sales)") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
        score += 0.20
    return min(score, 1.0)


def _grade_fraud_detection(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["southeast"]):
        score += 0.20
    if _has_any(answer, [
        "new account", "recent signup", "recently created",
        "new customer", "account creation", "registered in feb",
        "signed up",
    ]):
        score += 0.20
    if _has_any(answer, [
        "high-value", "high value", "expensive", "laptop pro",
        "desktop workstation", "large order", "electronics",
    ]):
        score += 0.20
    if _has_pattern(answer, r"1[0-5]\s*(account|customer|user)") or _has_pattern(
        answer, r"\$\s*[\d,]+"
    ) or _has_pattern(answer, r"\d+\s*(order|transaction)"):
        score += 0.20
    if _has_any(answer, [
        "pattern", "cluster", "coordinated", "suspicious",
        "same product", "no return", "never returned",
        "concentrated", "anomal", "fraud ring",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_repeat_purchase_decline(answer: str) -> float:
    score = 0.0
    if _has_any(answer, [
        "repeat purchase", "repeat rate", "returning customer",
        "repeat buyer", "repurchase", "order frequency",
        "second order", "came back",
    ]) and (_has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "drop", "decrease", "fell", "collapsed",
    ])):
        score += 0.20
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price", "increase", "hike", "stopped", "left", "churn",
    ]):
        score += 0.20
    if (_has_any(answer, ["midwest"]) or _has_any(answer, [
        "shipping", "delivery", "quickship",
    ])) and _has_any(answer, [
        "repeat", "return", "reorder", "come back", "second order",
    ]):
        score += 0.20
    if _has_any(answer, ["marketing", "acquisition", "spend"]) and _has_any(answer, [
        "retention", "email", "loyalty", "re-engage", "lapsed",
        "shifted", "new customer",
    ]):
        score += 0.20
    if _has_any(answer, [
        "segment", "cohort", "by region", "by segment",
        "enterprise vs", "consumer vs", "smb vs",
    ]) or _has_pattern(answer, r"(enterprise|smb|consumer).*\d+\s*%"):
        score += 0.20
    return min(score, 1.0)


TASKS: dict[str, dict] = {
    "orders_drop": {
        "id": "orders_drop",
        "difficulty": "easy",
        "title": "Weekly Orders Drop Investigation",
        "description": (
            "URGENT -- Our order volume dropped sharply in the first two weeks "
            "of March compared to the last two weeks of February. Leadership "
            "needs to know why.\n\n"
            "Investigate the database, identify the root cause of the drop, "
            "and submit a clear summary of your findings."
        ),
    },
    "returns_spike": {
        "id": "returns_spike",
        "difficulty": "medium",
        "title": "Product Returns Spike Investigation",
        "description": (
            "ALERT -- Our return rate has spiked significantly in recent weeks, "
            "with particular concentration in one geographic region. This is "
            "eating into margins.\n\n"
            "Use the database to identify which product(s) are driving the "
            "spike, which region is most affected, and what the likely root "
            "cause is. Include the supplier if relevant."
        ),
    },
    "customer_churn": {
        "id": "customer_churn",
        "difficulty": "hard",
        "title": "Customer Churn Root Cause Analysis",
        "description": (
            "CRITICAL -- Our monthly active customer count has declined "
            "significantly from January to March. The executive team wants a "
            "full root-cause analysis.\n\n"
            "Determine which customer segments and regions are most affected, "
            "quantify the decline, and identify the most likely causes. "
            "Check all available tables for clues."
        ),
    },
    "shipping_delay": {
        "id": "shipping_delay",
        "difficulty": "medium-hard",
        "title": "Customer Satisfaction Crisis Investigation",
        "description": (
            "ESCALATION -- Customer satisfaction scores have plummeted in one "
            "of our regions. The support team is overwhelmed with complaints "
            "and escalations are piling up.\n\n"
            "Investigate what operational issue is driving the complaints, "
            "identify the responsible party (carrier, warehouse, etc.), "
            "determine when the problem started, and quantify the impact. "
            "Cross-reference multiple data sources for a complete picture."
        ),
    },
    "revenue_paradox": {
        "id": "revenue_paradox",
        "difficulty": "hard",
        "title": "Revenue vs. Profit Paradox Investigation",
        "description": (
            "CRITICAL -- Revenue in February was our highest month ever, yet "
            "gross profit actually *decreased* compared to January. The CFO "
            "wants a full breakdown of why we are selling more but earning "
            "less.\n\n"
            "Analyze revenue, costs, margins, discounts, product mix, customer "
            "segments, and any other relevant factors. This is likely multi-"
            "causal -- identify ALL contributing factors and quantify their "
            "impact. Use the products.cost column to compute margins."
        ),
    },
    "supplier_quality": {
        "id": "supplier_quality",
        "difficulty": "medium",
        "title": "Supplier Quality Crisis Investigation",
        "description": (
            "ESCALATION -- The VP of Merchandising has received escalating "
            "complaints about product quality across multiple SKUs. Quality "
            "Assurance wants a supplier-level analysis.\n\n"
            "Determine which supplier(s) have systemic quality issues, which "
            "of their products are affected, and quantify the total business "
            "impact in returns, refunds, and support ticket volume. Include "
            "return rates by supplier to support a contract renegotiation."
        ),
    },
    "inventory_stockout": {
        "id": "inventory_stockout",
        "difficulty": "medium-hard",
        "title": "Regional Sales Underperformance Investigation",
        "description": (
            "INVESTIGATION -- Our West region was projected to be the top "
            "performer during the Spring Mega Sale based on historical trends "
            "and marketing investment, but actual sales came in significantly "
            "below the other regions.\n\n"
            "The Regional VP demands an explanation. Investigate what caused "
            "the West to underperform during our biggest promotional event. "
            "Check product-level sales, inventory data, and any operational "
            "issues that may have limited fulfillment."
        ),
    },
    "fraud_detection": {
        "id": "fraud_detection",
        "difficulty": "hard",
        "title": "Suspicious Order Pattern Investigation",
        "description": (
            "ALERT -- The Finance team has flagged a suspicious spike in "
            "high-value orders from recently created accounts. Several of "
            "these orders have already shipped.\n\n"
            "Investigate the pattern: identify the suspicious accounts, "
            "determine the scope of potential fraud, estimate the financial "
            "exposure, and describe the behavioral signatures that "
            "distinguish these accounts from legitimate customers. Look at "
            "signup dates, order values, product choices, and geographic "
            "concentration."
        ),
    },
    "repeat_purchase_decline": {
        "id": "repeat_purchase_decline",
        "difficulty": "hard",
        "title": "Customer Retention Crisis Investigation",
        "description": (
            "CRITICAL -- Monthly unique buyer count has held steady around "
            "100, but the Customer Success team reports that repeat purchase "
            "rates have collapsed. In January, roughly 40%% of orders came "
            "from returning customers; by March, it appears to be under 20%%."
            "\n\n"
            "The CEO asks: are we becoming a one-time-purchase business? "
            "Diagnose which customer segments and regions lost repeat buyers, "
            "identify the root causes, and determine whether our marketing "
            "spend strategy is masking a retention problem. Check the "
            "marketing_spend table for clues about acquisition vs. retention "
            "investment."
        ),
    },
}

_GRADERS: dict[str, Callable[[str], float]] = {
    "orders_drop":             _grade_orders_drop,
    "returns_spike":           _grade_returns_spike,
    "customer_churn":          _grade_customer_churn,
    "shipping_delay":          _grade_shipping_delay,
    "revenue_paradox":         _grade_revenue_paradox,
    "supplier_quality":        _grade_supplier_quality,
    "inventory_stockout":      _grade_inventory_stockout,
    "fraud_detection":         _grade_fraud_detection,
    "repeat_purchase_decline": _grade_repeat_purchase_decline,
}


def grade_answer(task_id: str, answer: str) -> float:
    grader = _GRADERS.get(task_id)
    if grader is None:
        return 0.05
    raw = grader(answer)
    return max(0.05, min(0.95, raw))