DataDetective / server /tasks.py
Viani's picture
Fix grading: clamp scores to (0,1) exclusive
fdaf3e1 verified
"""
Task definitions and automated graders for the DataDetective environment.
Each task has:
- id, title, difficulty, description
- A grader function that scores the agent's final answer (0.0 - 1.0)
  based on whether key findings are mentioned; grade_answer() then
  clamps the reported score to the range 0.05 - 0.95.
"""
import re
from typing import Callable
def _has_any(text: str, keywords: list[str]) -> bool:
"""Case-insensitive check: does *text* contain any of *keywords*?"""
low = text.lower()
return any(kw.lower() in low for kw in keywords)
def _has_pattern(text: str, pattern: str) -> bool:
return bool(re.search(pattern, text, re.IGNORECASE))
def _grade_orders_drop(answer: str) -> float:
    """Score a final answer for the ``orders_drop`` task.

    Five criteria are checked independently; each one met adds 0.20:

    1. the answer states that orders went down;
    2. it mentions the promotion / sale;
    3. it says the promotion ended, or cites March 1;
    4. it uses causal language;
    5. it quantifies the change (an order/transaction count, a percentage,
       or a "from N to M" phrase).

    The word "sale", the ended/over words and the March 1 date are matched
    on word boundaries so that "sales", "overall", "extended" or "March 15"
    do not earn credit by accident.

    Args:
        answer: The agent's free-text final answer.

    Returns:
        The accumulated score, capped at 1.0.
    """
    score = 0.0

    # 1. Acknowledges the decline.
    if _has_any(answer, ["drop", "decrease", "decline", "fell", "fewer", "reduction", "lower"]):
        score += 0.20

    # 2. Mentions the promotion. "sale" must be a whole word: as a bare
    #    substring it would also match the ubiquitous "sales".
    if (
        _has_any(answer, ["spring mega sale", "spring sale", "mega sale"])
        or _has_any(answer, ["promotion", "promo", "discount", "campaign"])
        or _has_pattern(answer, r"\bsale\b")
    ):
        score += 0.20

    # 3. Says the promotion ended. Whole words only ("overall", "extended"
    #    and "discover" must not count); the date must be March 1 / 01 / 1st
    #    and not the prefix of a later day such as March 15.
    if _has_pattern(
        answer, r"\b(?:ended|expired|over|concluded|stopped)\b"
    ) or _has_pattern(answer, r"march\s*0?1(?:st)?\b"):
        score += 0.20

    # 4. Uses causal language.
    if _has_any(answer, [
        "caused", "because", "due to", "result of", "led to",
        "when the", "after the", "ending of", "end of the",
        "correlated", "explains",
    ]):
        score += 0.20

    # 5. Quantifies the change.
    if _has_pattern(answer, r"\d+\s*(orders|transactions)") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
        score += 0.20

    return min(score, 1.0)
def _grade_returns_spike(answer: str) -> float:
    """Score a final answer for the ``returns_spike`` task.

    Five criteria are checked independently; each one met adds 0.20:

    1. the answer mentions the headphones product;
    2. it mentions the West region;
    3. it mentions the supplier ("audiotech" / "audio tech");
    4. it mentions a defect / quality / high-return explanation;
    5. it quantifies the issue (a percentage, a count of returns/units, or
       a "return rate" / "compared to" phrase).

    The region check requires "west" at the start of a word, so "Midwest",
    "Northwest", "Southwest" and hyphenated forms such as "north-west" do
    not earn the West credit.

    Args:
        answer: The agent's free-text final answer.

    Returns:
        The accumulated score, capped at 1.0.
    """
    score = 0.0

    # 1. Product.
    if _has_any(answer, ["wireless headphones", "headphones pro", "headphone"]):
        score += 0.20

    # 2. Region: "west" not preceded by a word character or hyphen. A plain
    #    substring test would also accept the other "*west" regions.
    if _has_pattern(answer, r"(?<![\w-])west"):
        score += 0.20

    # 3. Supplier.
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20

    # 4. Root-cause vocabulary.
    if _has_any(answer, [
        "defect", "defective", "faulty", "quality",
        "high return", "return rate", "abnormal",
        "stopped working", "battery issue", "poor audio",
    ]):
        score += 0.20

    # 5. Quantification.
    if _has_pattern(answer, r"\d+\s*%") or _has_pattern(
        answer, r"\d+\s*(returns|returned|units)"
    ) or _has_any(answer, ["return rate", "compared to"]):
        score += 0.20

    return min(score, 1.0)
def _grade_customer_churn(answer: str) -> float:
    """Score a final answer for the ``customer_churn`` task.

    Five criteria are checked independently; each one met adds 0.20:

    1. the answer quantifies or states the decline;
    2. it mentions the "enterprise" segment;
    3. it mentions the Northeast region;
    4. it mentions a price increase / pricing change;
    5. it names an affected product, either by name or as "product ..."
       followed on the same line by one of the ids 1, 2, 11, 15 or 19.

    The product-id check only accepts a standalone id. The previous pattern
    accepted any digit 1 or 2 after the word "product", so text such as
    "products fell 12%" or "products in 2024" earned the credit.

    Args:
        answer: The agent's free-text final answer.

    Returns:
        The accumulated score, capped at 1.0.
    """
    score = 0.0

    # 1. Decline stated or quantified.
    if _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "decrease", "drop", "churn", "fewer active",
        "lost customers", "stopped ordering",
    ]):
        score += 0.20

    # 2. Segment.
    if _has_any(answer, ["enterprise"]):
        score += 0.20

    # 3. Region.
    if _has_any(answer, ["northeast", "north east", "north-east"]):
        score += 0.20

    # 4. Pricing cause.
    if _has_any(answer, [
        "price increase", "price change", "price hike", "pricing",
        "more expensive", "raised price", "cost increase",
    ]):
        score += 0.20

    # 5. Affected products. The id must stand alone: not glued to a word or
    #    another digit, not a decimal or dollar amount, not the leading group
    #    of a thousands-separated number, and not a percentage.
    product_id_pattern = (
        r"product[^\n]*?"
        r"(?<![\w.$])(?:1|2|11|15|19)"
        r"(?!\w|\.\d|,\d{3}\b|\s*%)"
    )
    if _has_any(answer, [
        "laptop pro", "desktop workstation", "office suite",
        "devtools", "external ssd",
    ]) or _has_pattern(answer, product_id_pattern):
        score += 0.20

    return min(score, 1.0)
def _grade_shipping_delay(answer: str) -> float:
    """Score a final answer for the ``shipping_delay`` task.

    Each of the five findings below that is detected in *answer* is worth
    0.20; the total is capped at 1.0.
    """
    # Finding 1: the text mentions "midwest".
    mentions_region = _has_any(answer, ["midwest"])

    # Finding 2: the text mentions the carrier.
    mentions_carrier = _has_any(answer, ["quickship", "quick ship"])

    # Finding 3: the text describes a delivery/shipping delay.
    mentions_delay = _has_any(answer, [
        "delivery delay", "late delivery", "delayed shipment",
        "shipping delay", "late shipment", "delivery time",
        "delayed delivery", "slow delivery",
    ])

    # Finding 4: the text places the problem in (mid-)February.
    february_phrases = [
        "mid-february", "mid february", "around february",
        "starting in february", "beginning of february",
    ]
    mentions_onset = (
        _has_pattern(answer, r"feb(ruary)?\s*(10|mid|middle)")
        or _has_any(answer, february_phrases)
    )

    # Finding 5: support/complaint vocabulary together with shipping vocabulary.
    support_terms = [
        "support ticket", "complaint", "ticket volume",
        "customer satisfaction", "support request",
    ]
    shipping_terms = ["delivery", "shipping", "carrier", "quickship"]
    links_support_to_shipping = (
        _has_any(answer, support_terms) and _has_any(answer, shipping_terms)
    )

    total = 0.0
    for finding_present in (
        mentions_region,
        mentions_carrier,
        mentions_delay,
        mentions_onset,
        links_support_to_shipping,
    ):
        if finding_present:
            total += 0.20
    return min(total, 1.0)
def _grade_revenue_paradox(answer: str) -> float:
    """Score a final answer for the ``revenue_paradox`` task.

    Five criteria are checked independently; each one met adds 0.20:

    1. the answer names the sale / 25% discount, or pairs promotion
       vocabulary with margin/profit/cost vocabulary;
    2. it mentions a product-mix shift;
    3. it mentions "enterprise" together with price-change or loss wording;
    4. it mentions returns/refunds together with cost/profit wording;
    5. it quantifies the effect (a dollar amount, a percentage, or a
       "from X to Y" phrase).

    In criterion 1 the word "sale" must be a whole word. As a bare substring
    it also matched "sales", so any answer containing "sales" and "profit"
    earned the discount credit without mentioning a discount.

    Args:
        answer: The agent's free-text final answer.

    Returns:
        The accumulated score, capped at 1.0.
    """
    score = 0.0

    # 1. Discount / promotion as a margin driver.
    if _has_any(answer, [
        "spring mega sale", "mega sale", "25%", "25 percent",
    ]) or (
        (
            _has_any(answer, ["promotion", "promo", "discount"])
            # Whole word only, so the plural "sales" does not count.
            or _has_pattern(answer, r"\bsale\b")
        )
        and _has_any(answer, ["margin", "profit", "cost"])
    ):
        score += 0.20

    # 2. Product-mix shift.
    if _has_any(answer, [
        "product mix", "category mix", "mix shift", "shifted toward",
        "higher proportion", "more electronics", "low-margin",
        "composition changed",
    ]):
        score += 0.20

    # 3. Enterprise segment combined with price-change or loss wording.
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price increase", "price change", "price hike",
        "lost", "churn", "left", "fewer", "decline",
    ]):
        score += 0.20

    # 4. Returns / refunds combined with cost or profit wording.
    if _has_any(answer, ["return", "refund"]) and _has_any(answer, [
        "cost", "expense", "profit", "margin", "loss", "erode",
    ]):
        score += 0.20

    # 5. Quantification.
    if _has_pattern(answer, r"\$\s*[\d,]+") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\$?[\d,]+.*to\s+\$?[\d,]+"):
        score += 0.20

    return min(score, 1.0)
def _grade_supplier_quality(answer: str) -> float:
    """Score a final answer for the ``supplier_quality`` task.

    Five findings are looked for in *answer*; every finding present adds
    0.20 and the total is capped at 1.0.
    """
    findings = [
        # Supplier is mentioned.
        _has_any(answer, ["audiotech", "audio tech"]),
        # First product is mentioned.
        _has_any(answer, ["wireless headphones", "headphones pro", "product 6"]),
        # Second product is mentioned.
        _has_any(answer, ["bluetooth speaker", "product 7"]),
        # Return/refund impact: a phrase, a percentage near "return", or a
        # dollar amount.
        (
            _has_any(answer, ["return rate", "refund", "return volume"])
            or _has_pattern(answer, r"\d+\s*%.*return")
            or _has_pattern(answer, r"return.*\d+\s*%")
            or _has_pattern(answer, r"\$\s*[\d,]+")
        ),
        # Support-ticket / defect evidence.
        _has_any(answer, [
            "support ticket", "defect", "complaint", "product_defect",
            "quality issue", "customer complaint",
        ]),
    ]

    total = 0.0
    for present in findings:
        if present:
            total += 0.20
    return min(total, 1.0)
def _grade_inventory_stockout(answer: str) -> float:
    """Score a final answer for the ``inventory_stockout`` task.

    Five criteria are checked independently; each one met adds 0.20:

    1. the answer mentions the West region;
    2. it mentions the monitor product;
    3. it mentions inventory / stock-out vocabulary;
    4. it mentions the sale / promotion period;
    5. it quantifies the impact (units/orders/sales counts, a percentage,
       or a "from N to M" phrase).

    The region check requires "west" at the start of a word, so "Midwest",
    "Northwest", "Southwest" and hyphenated forms such as "north-west" do
    not earn the West credit.

    Args:
        answer: The agent's free-text final answer.

    Returns:
        The accumulated score, capped at 1.0.
    """
    score = 0.0

    # 1. Region: "west" not preceded by a word character or hyphen. A plain
    #    substring test would also accept the other "*west" regions.
    if _has_pattern(answer, r"(?<![\w-])west"):
        score += 0.20

    # 2. Product.
    if _has_any(answer, ["monitor", "product 4", "monitor 27"]):
        score += 0.20

    # 3. Inventory / stock-out vocabulary.
    if _has_any(answer, [
        "inventory", "stock", "out of stock", "stockout", "stock-out",
        "zero units", "no inventory", "warehouse",
    ]):
        score += 0.20

    # 4. Sale / promotion period.
    if _has_any(answer, [
        "spring mega sale", "mega sale", "promo", "promotion",
        "february 15", "feb 15", "during the sale",
    ]):
        score += 0.20

    # 5. Quantification.
    if _has_pattern(answer, r"\d+\s*(units|orders|sales)") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
        score += 0.20

    return min(score, 1.0)
def _grade_fraud_detection(answer: str) -> float:
    """Score a final answer for the ``fraud_detection`` task.

    The answer is checked for five findings; each one found contributes
    0.20, and the result never exceeds 1.0.
    """
    # Region mentioned.
    has_region = _has_any(answer, ["southeast"])

    # Newly created accounts mentioned.
    has_new_accounts = _has_any(answer, [
        "new account", "recent signup", "recently created",
        "new customer", "account creation", "registered in feb",
        "signed up",
    ])

    # High-value purchases mentioned.
    has_high_value = _has_any(answer, [
        "high-value", "high value", "expensive", "laptop pro",
        "desktop workstation", "large order", "electronics",
    ])

    # Scope quantified: 10-15 accounts, a dollar amount, or an order count.
    has_scope = (
        _has_pattern(answer, r"1[0-5]\s*(account|customer|user)")
        or _has_pattern(answer, r"\$\s*[\d,]+")
        or _has_pattern(answer, r"\d+\s*(order|transaction)")
    )

    # Behavioural-signature vocabulary.
    has_signature = _has_any(answer, [
        "pattern", "cluster", "coordinated", "suspicious",
        "same product", "no return", "never returned",
        "concentrated", "anomal", "fraud ring",
    ])

    total = 0.0
    for found in (has_region, has_new_accounts, has_high_value, has_scope, has_signature):
        if found:
            total += 0.20
    return min(total, 1.0)
def _grade_repeat_purchase_decline(answer: str) -> float:
    """Score a final answer for the ``repeat_purchase_decline`` task.

    Five findings are evaluated against *answer*. Each finding that is
    present adds 0.20 to the score, which is capped at 1.0.
    """
    # Finding 1: repeat-purchase vocabulary plus a percentage or decline word.
    repeat_terms = [
        "repeat purchase", "repeat rate", "returning customer",
        "repeat buyer", "repurchase", "order frequency",
        "second order", "came back",
    ]
    decline_terms = ["decline", "drop", "decrease", "fell", "collapsed"]
    repeat_decline = _has_any(answer, repeat_terms) and (
        _has_pattern(answer, r"\d+\s*%") or _has_any(answer, decline_terms)
    )

    # Finding 2: "enterprise" together with price/churn wording.
    enterprise_cause = _has_any(answer, ["enterprise"]) and _has_any(
        answer, ["price", "increase", "hike", "stopped", "left", "churn"]
    )

    # Finding 3: Midwest or shipping wording together with repeat wording.
    shipping_cause = _has_any(
        answer, ["midwest", "shipping", "delivery", "quickship"]
    ) and _has_any(
        answer, ["repeat", "return", "reorder", "come back", "second order"]
    )

    # Finding 4: marketing-spend wording together with retention wording.
    marketing_cause = _has_any(
        answer, ["marketing", "acquisition", "spend"]
    ) and _has_any(answer, [
        "retention", "email", "loyalty", "re-engage", "lapsed",
        "shifted", "new customer",
    ])

    # Finding 5: a segment-level breakdown, by phrase or by a segment name
    # followed by a percentage.
    segment_breakdown = _has_any(answer, [
        "segment", "cohort", "by region", "by segment",
        "enterprise vs", "consumer vs", "smb vs",
    ]) or _has_pattern(answer, r"(enterprise|smb|consumer).*\d+\s*%")

    total = 0.0
    for present in (
        repeat_decline,
        enterprise_cause,
        shipping_cause,
        marketing_cause,
        segment_breakdown,
    ):
        if present:
            total += 0.20
    return min(total, 1.0)
# Task catalogue, keyed by task id. Every entry carries its own "id", a
# "difficulty" label, a short "title" and the "description" text.
# The descriptions are plain string literals (nothing in this module
# %-formats them), so percent signs are written as a single "%".
TASKS: dict[str, dict] = {
    "orders_drop": {
        "id": "orders_drop",
        "difficulty": "easy",
        "title": "Weekly Orders Drop Investigation",
        "description": (
            "URGENT -- Our order volume dropped sharply in the first two weeks "
            "of March compared to the last two weeks of February. Leadership "
            "needs to know why.\n\n"
            "Investigate the database, identify the root cause of the drop, "
            "and submit a clear summary of your findings."
        ),
    },
    "returns_spike": {
        "id": "returns_spike",
        "difficulty": "medium",
        "title": "Product Returns Spike Investigation",
        "description": (
            "ALERT -- Our return rate has spiked significantly in recent weeks, "
            "with particular concentration in one geographic region. This is "
            "eating into margins.\n\n"
            "Use the database to identify which product(s) are driving the "
            "spike, which region is most affected, and what the likely root "
            "cause is. Include the supplier if relevant."
        ),
    },
    "customer_churn": {
        "id": "customer_churn",
        "difficulty": "hard",
        "title": "Customer Churn Root Cause Analysis",
        "description": (
            "CRITICAL -- Our monthly active customer count has declined "
            "significantly from January to March. The executive team wants a "
            "full root-cause analysis.\n\n"
            "Determine which customer segments and regions are most affected, "
            "quantify the decline, and identify the most likely causes. "
            "Check all available tables for clues."
        ),
    },
    "shipping_delay": {
        "id": "shipping_delay",
        "difficulty": "medium-hard",
        "title": "Customer Satisfaction Crisis Investigation",
        "description": (
            "ESCALATION -- Customer satisfaction scores have plummeted in one "
            "of our regions. The support team is overwhelmed with complaints "
            "and escalations are piling up.\n\n"
            "Investigate what operational issue is driving the complaints, "
            "identify the responsible party (carrier, warehouse, etc.), "
            "determine when the problem started, and quantify the impact. "
            "Cross-reference multiple data sources for a complete picture."
        ),
    },
    "revenue_paradox": {
        "id": "revenue_paradox",
        "difficulty": "hard",
        "title": "Revenue vs. Profit Paradox Investigation",
        "description": (
            "CRITICAL -- Revenue in February was our highest month ever, yet "
            "gross profit actually *decreased* compared to January. The CFO "
            "wants a full breakdown of why we are selling more but earning "
            "less.\n\n"
            "Analyze revenue, costs, margins, discounts, product mix, customer "
            "segments, and any other relevant factors. This is likely multi-"
            "causal -- identify ALL contributing factors and quantify their "
            "impact. Use the products.cost column to compute margins."
        ),
    },
    "supplier_quality": {
        "id": "supplier_quality",
        "difficulty": "medium",
        "title": "Supplier Quality Crisis Investigation",
        "description": (
            "ESCALATION -- The VP of Merchandising has received escalating "
            "complaints about product quality across multiple SKUs. Quality "
            "Assurance wants a supplier-level analysis.\n\n"
            "Determine which supplier(s) have systemic quality issues, which "
            "of their products are affected, and quantify the total business "
            "impact in returns, refunds, and support ticket volume. Include "
            "return rates by supplier to support a contract renegotiation."
        ),
    },
    "inventory_stockout": {
        "id": "inventory_stockout",
        "difficulty": "medium-hard",
        "title": "Regional Sales Underperformance Investigation",
        "description": (
            "INVESTIGATION -- Our West region was projected to be the top "
            "performer during the Spring Mega Sale based on historical trends "
            "and marketing investment, but actual sales came in significantly "
            "below the other regions.\n\n"
            "The Regional VP demands an explanation. Investigate what caused "
            "the West to underperform during our biggest promotional event. "
            "Check product-level sales, inventory data, and any operational "
            "issues that may have limited fulfillment."
        ),
    },
    "fraud_detection": {
        "id": "fraud_detection",
        "difficulty": "hard",
        "title": "Suspicious Order Pattern Investigation",
        "description": (
            "ALERT -- The Finance team has flagged a suspicious spike in "
            "high-value orders from recently created accounts. Several of "
            "these orders have already shipped.\n\n"
            "Investigate the pattern: identify the suspicious accounts, "
            "determine the scope of potential fraud, estimate the financial "
            "exposure, and describe the behavioral signatures that "
            "distinguish these accounts from legitimate customers. Look at "
            "signup dates, order values, product choices, and geographic "
            "concentration."
        ),
    },
    "repeat_purchase_decline": {
        "id": "repeat_purchase_decline",
        "difficulty": "hard",
        "title": "Customer Retention Crisis Investigation",
        "description": (
            "CRITICAL -- Monthly unique buyer count has held steady around "
            "100, but the Customer Success team reports that repeat purchase "
            "rates have collapsed. In January, roughly 40% of orders came "
            "from returning customers; by March, it appears to be under 20%."
            "\n\n"
            "The CEO asks: are we becoming a one-time-purchase business? "
            "Diagnose which customer segments and regions lost repeat buyers, "
            "identify the root causes, and determine whether our marketing "
            "spend strategy is masking a retention problem. Check the "
            "marketing_spend table for clues about acquisition vs. retention "
            "investment."
        ),
    },
}
# Dispatch table consulted by grade_answer(): task id -> grading function.
_GRADERS: dict[str, Callable[[str], float]] = dict(
    orders_drop=_grade_orders_drop,
    returns_spike=_grade_returns_spike,
    customer_churn=_grade_customer_churn,
    shipping_delay=_grade_shipping_delay,
    revenue_paradox=_grade_revenue_paradox,
    supplier_quality=_grade_supplier_quality,
    inventory_stockout=_grade_inventory_stockout,
    fraud_detection=_grade_fraud_detection,
    repeat_purchase_decline=_grade_repeat_purchase_decline,
)
def grade_answer(task_id: str, answer: str) -> float:
    """Grade *answer* for the task *task_id*.

    The grader registered for *task_id* in ``_GRADERS`` produces a raw
    score, which is then limited to the range 0.05 - 0.95. When no grader
    is registered for *task_id*, 0.05 is returned without grading.
    """
    try:
        grader = _GRADERS[task_id]
    except KeyError:
        # Unknown task: report the lowest score that is ever returned.
        return 0.05

    raw_score = grader(answer)
    capped = min(0.95, raw_score)
    return max(0.05, capped)