Spaces:

Viani
/

DataDetective

Sleeping

File size: 16,138 Bytes

"""
Task definitions and automated graders for the DataDetective environment.

Each task has:
  - id, title, difficulty, description
  - A grader function that scores the agent's final answer (0.0 - 1.0)
    based on whether key findings are mentioned.
"""

import re
from typing import Callable


def _has_any(text: str, keywords: list[str]) -> bool:
    """Case-insensitive check: does *text* contain any of *keywords*?"""
    low = text.lower()
    return any(kw.lower() in low for kw in keywords)


def _has_pattern(text: str, pattern: str) -> bool:
    return bool(re.search(pattern, text, re.IGNORECASE))


def _grade_orders_drop(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["drop", "decrease", "decline", "fell", "fewer", "reduction", "lower"]):
        score += 0.20
    if _has_any(answer, ["spring mega sale", "spring sale", "mega sale"]) or (
        _has_any(answer, ["promotion", "promo", "sale", "discount", "campaign"])
    ):
        score += 0.20
    if _has_any(answer, ["ended", "expired", "over", "concluded", "stopped"]) or _has_pattern(
        answer, r"march\s*0?1"
    ):
        score += 0.20
    if _has_any(answer, [
        "caused", "because", "due to", "result of", "led to",
        "when the", "after the", "ending of", "end of the",
        "correlated", "explains",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\d+\s*(orders|transactions)") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
        score += 0.20
    return min(score, 1.0)


def _grade_returns_spike(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["wireless headphones", "headphones pro", "headphone"]):
        score += 0.20
    if _has_any(answer, ["west"]):
        score += 0.20
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20
    if _has_any(answer, [
        "defect", "defective", "faulty", "quality",
        "high return", "return rate", "abnormal",
        "stopped working", "battery issue", "poor audio",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\d+\s*%") or _has_pattern(
        answer, r"\d+\s*(returns|returned|units)"
    ) or _has_any(answer, ["return rate", "compared to"]):
        score += 0.20
    return min(score, 1.0)


def _grade_customer_churn(answer: str) -> float:
    score = 0.0
    if _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "decrease", "drop", "churn", "fewer active",
        "lost customers", "stopped ordering",
    ]):
        score += 0.20
    if _has_any(answer, ["enterprise"]):
        score += 0.20
    if _has_any(answer, ["northeast", "north east", "north-east"]):
        score += 0.20
    if _has_any(answer, [
        "price increase", "price change", "price hike", "pricing",
        "more expensive", "raised price", "cost increase",
    ]):
        score += 0.20
    if _has_any(answer, [
        "laptop pro", "desktop workstation", "office suite",
        "devtools", "external ssd",
    ]) or _has_pattern(answer, r"product.*(1|2|11|15|19)"):
        score += 0.20
    return min(score, 1.0)


def _grade_shipping_delay(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["midwest"]):
        score += 0.20
    if _has_any(answer, ["quickship", "quick ship"]):
        score += 0.20
    if _has_any(answer, [
        "delivery delay", "late delivery", "delayed shipment",
        "shipping delay", "late shipment", "delivery time",
        "delayed delivery", "slow delivery",
    ]):
        score += 0.20
    if _has_pattern(answer, r"feb(ruary)?\s*(10|mid|middle)") or _has_any(answer, [
        "mid-february", "mid february", "around february",
        "starting in february", "beginning of february",
    ]):
        score += 0.20
    if _has_any(answer, [
        "support ticket", "complaint", "ticket volume",
        "customer satisfaction", "support request",
    ]) and _has_any(answer, [
        "delivery", "shipping", "carrier", "quickship",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_revenue_paradox(answer: str) -> float:
    score = 0.0
    if _has_any(answer, [
        "spring mega sale", "mega sale", "25%", "25 percent",
    ]) or (
        _has_any(answer, ["promotion", "promo", "discount", "sale"])
        and _has_any(answer, ["margin", "profit", "cost"])
    ):
        score += 0.20
    if _has_any(answer, [
        "product mix", "category mix", "mix shift", "shifted toward",
        "higher proportion", "more electronics", "low-margin",
        "composition changed",
    ]):
        score += 0.20
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price increase", "price change", "price hike",
        "lost", "churn", "left", "fewer", "decline",
    ]):
        score += 0.20
    if _has_any(answer, ["return", "refund"]) and _has_any(answer, [
        "cost", "expense", "profit", "margin", "loss", "erode",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\$\s*[\d,]+") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\$?[\d,]+.*to\s+\$?[\d,]+"):
        score += 0.20
    return min(score, 1.0)


def _grade_supplier_quality(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20
    if _has_any(answer, ["wireless headphones", "headphones pro", "product 6"]):
        score += 0.20
    if _has_any(answer, ["bluetooth speaker", "product 7"]):
        score += 0.20
    if _has_any(answer, ["return rate", "refund", "return volume"]) or _has_pattern(
        answer, r"\d+\s*%.*return"
    ) or _has_pattern(answer, r"return.*\d+\s*%") or _has_pattern(
        answer, r"\$\s*[\d,]+"
    ):
        score += 0.20
    if _has_any(answer, [
        "support ticket", "defect", "complaint", "product_defect",
        "quality issue", "customer complaint",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_inventory_stockout(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["west"]):
        score += 0.20
    if _has_any(answer, ["monitor", "product 4", "monitor 27"]):
        score += 0.20
    if _has_any(answer, [
        "inventory", "stock", "out of stock", "stockout", "stock-out",
        "zero units", "no inventory", "warehouse",
    ]):
        score += 0.20
    if _has_any(answer, [
        "spring mega sale", "mega sale", "promo", "promotion",
        "february 15", "feb 15", "during the sale",
    ]):
        score += 0.20
    if _has_pattern(answer, r"\d+\s*(units|orders|sales)") or _has_pattern(
        answer, r"\d+\s*%"
    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
        score += 0.20
    return min(score, 1.0)


def _grade_fraud_detection(answer: str) -> float:
    score = 0.0
    if _has_any(answer, ["southeast"]):
        score += 0.20
    if _has_any(answer, [
        "new account", "recent signup", "recently created",
        "new customer", "account creation", "registered in feb",
        "signed up",
    ]):
        score += 0.20
    if _has_any(answer, [
        "high-value", "high value", "expensive", "laptop pro",
        "desktop workstation", "large order", "electronics",
    ]):
        score += 0.20
    if _has_pattern(answer, r"1[0-5]\s*(account|customer|user)") or _has_pattern(
        answer, r"\$\s*[\d,]+"
    ) or _has_pattern(answer, r"\d+\s*(order|transaction)"):
        score += 0.20
    if _has_any(answer, [
        "pattern", "cluster", "coordinated", "suspicious",
        "same product", "no return", "never returned",
        "concentrated", "anomal", "fraud ring",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_repeat_purchase_decline(answer: str) -> float:
    score = 0.0
    if _has_any(answer, [
        "repeat purchase", "repeat rate", "returning customer",
        "repeat buyer", "repurchase", "order frequency",
        "second order", "came back",
    ]) and (_has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "drop", "decrease", "fell", "collapsed",
    ])):
        score += 0.20
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price", "increase", "hike", "stopped", "left", "churn",
    ]):
        score += 0.20
    if (_has_any(answer, ["midwest"]) or _has_any(answer, [
        "shipping", "delivery", "quickship",
    ])) and _has_any(answer, [
        "repeat", "return", "reorder", "come back", "second order",
    ]):
        score += 0.20
    if _has_any(answer, ["marketing", "acquisition", "spend"]) and _has_any(answer, [
        "retention", "email", "loyalty", "re-engage", "lapsed",
        "shifted", "new customer",
    ]):
        score += 0.20
    if _has_any(answer, [
        "segment", "cohort", "by region", "by segment",
        "enterprise vs", "consumer vs", "smb vs",
    ]) or _has_pattern(answer, r"(enterprise|smb|consumer).*\d+\s*%"):
        score += 0.20
    return min(score, 1.0)


TASKS: dict[str, dict] = {
    "orders_drop": {
        "id": "orders_drop",
        "difficulty": "easy",
        "title": "Weekly Orders Drop Investigation",
        "description": (
            "URGENT -- Our order volume dropped sharply in the first two weeks "
            "of March compared to the last two weeks of February. Leadership "
            "needs to know why.\n\n"
            "Investigate the database, identify the root cause of the drop, "
            "and submit a clear summary of your findings."
        ),
    },
    "returns_spike": {
        "id": "returns_spike",
        "difficulty": "medium",
        "title": "Product Returns Spike Investigation",
        "description": (
            "ALERT -- Our return rate has spiked significantly in recent weeks, "
            "with particular concentration in one geographic region. This is "
            "eating into margins.\n\n"
            "Use the database to identify which product(s) are driving the "
            "spike, which region is most affected, and what the likely root "
            "cause is. Include the supplier if relevant."
        ),
    },
    "customer_churn": {
        "id": "customer_churn",
        "difficulty": "hard",
        "title": "Customer Churn Root Cause Analysis",
        "description": (
            "CRITICAL -- Our monthly active customer count has declined "
            "significantly from January to March. The executive team wants a "
            "full root-cause analysis.\n\n"
            "Determine which customer segments and regions are most affected, "
            "quantify the decline, and identify the most likely causes. "
            "Check all available tables for clues."
        ),
    },
    "shipping_delay": {
        "id": "shipping_delay",
        "difficulty": "medium-hard",
        "title": "Customer Satisfaction Crisis Investigation",
        "description": (
            "ESCALATION -- Customer satisfaction scores have plummeted in one "
            "of our regions. The support team is overwhelmed with complaints "
            "and escalations are piling up.\n\n"
            "Investigate what operational issue is driving the complaints, "
            "identify the responsible party (carrier, warehouse, etc.), "
            "determine when the problem started, and quantify the impact. "
            "Cross-reference multiple data sources for a complete picture."
        ),
    },
    "revenue_paradox": {
        "id": "revenue_paradox",
        "difficulty": "hard",
        "title": "Revenue vs. Profit Paradox Investigation",
        "description": (
            "CRITICAL -- Revenue in February was our highest month ever, yet "
            "gross profit actually *decreased* compared to January. The CFO "
            "wants a full breakdown of why we are selling more but earning "
            "less.\n\n"
            "Analyze revenue, costs, margins, discounts, product mix, customer "
            "segments, and any other relevant factors. This is likely multi-"
            "causal -- identify ALL contributing factors and quantify their "
            "impact. Use the products.cost column to compute margins."
        ),
    },
    "supplier_quality": {
        "id": "supplier_quality",
        "difficulty": "medium",
        "title": "Supplier Quality Crisis Investigation",
        "description": (
            "ESCALATION -- The VP of Merchandising has received escalating "
            "complaints about product quality across multiple SKUs. Quality "
            "Assurance wants a supplier-level analysis.\n\n"
            "Determine which supplier(s) have systemic quality issues, which "
            "of their products are affected, and quantify the total business "
            "impact in returns, refunds, and support ticket volume. Include "
            "return rates by supplier to support a contract renegotiation."
        ),
    },
    "inventory_stockout": {
        "id": "inventory_stockout",
        "difficulty": "medium-hard",
        "title": "Regional Sales Underperformance Investigation",
        "description": (
            "INVESTIGATION -- Our West region was projected to be the top "
            "performer during the Spring Mega Sale based on historical trends "
            "and marketing investment, but actual sales came in significantly "
            "below the other regions.\n\n"
            "The Regional VP demands an explanation. Investigate what caused "
            "the West to underperform during our biggest promotional event. "
            "Check product-level sales, inventory data, and any operational "
            "issues that may have limited fulfillment."
        ),
    },
    "fraud_detection": {
        "id": "fraud_detection",
        "difficulty": "hard",
        "title": "Suspicious Order Pattern Investigation",
        "description": (
            "ALERT -- The Finance team has flagged a suspicious spike in "
            "high-value orders from recently created accounts. Several of "
            "these orders have already shipped.\n\n"
            "Investigate the pattern: identify the suspicious accounts, "
            "determine the scope of potential fraud, estimate the financial "
            "exposure, and describe the behavioral signatures that "
            "distinguish these accounts from legitimate customers. Look at "
            "signup dates, order values, product choices, and geographic "
            "concentration."
        ),
    },
    "repeat_purchase_decline": {
        "id": "repeat_purchase_decline",
        "difficulty": "hard",
        "title": "Customer Retention Crisis Investigation",
        "description": (
            "CRITICAL -- Monthly unique buyer count has held steady around "
            "100, but the Customer Success team reports that repeat purchase "
            "rates have collapsed. In January, roughly 40%% of orders came "
            "from returning customers; by March, it appears to be under 20%%."
            "\n\n"
            "The CEO asks: are we becoming a one-time-purchase business? "
            "Diagnose which customer segments and regions lost repeat buyers, "
            "identify the root causes, and determine whether our marketing "
            "spend strategy is masking a retention problem. Check the "
            "marketing_spend table for clues about acquisition vs. retention "
            "investment."
        ),
    },
}

_GRADERS: dict[str, Callable[[str], float]] = {
    "orders_drop":             _grade_orders_drop,
    "returns_spike":           _grade_returns_spike,
    "customer_churn":          _grade_customer_churn,
    "shipping_delay":          _grade_shipping_delay,
    "revenue_paradox":         _grade_revenue_paradox,
    "supplier_quality":        _grade_supplier_quality,
    "inventory_stockout":      _grade_inventory_stockout,
    "fraud_detection":         _grade_fraud_detection,
    "repeat_purchase_decline": _grade_repeat_purchase_decline,
}


def grade_answer(task_id: str, answer: str) -> float:
    grader = _GRADERS.get(task_id)
    if grader is None:
        return 0.05
    raw = grader(answer)
    return max(0.05, min(0.95, raw))