Spaces:

Viani
/

DataDetective

Sleeping

App Files Files Community

DataDetective / server /tasks.py

Viani

Fix grading: clamp scores to (0,1) exclusive

fdaf3e1 verified about 1 month ago

raw

history blame contribute delete

16.1 kB

	"""
	Task definitions and automated graders for the DataDetective environment.

	Each task has:
	- id, title, difficulty, description
	- A grader function that scores the agent's final answer (0.0 - 1.0)
	based on whether key findings are mentioned.
	"""

	import re
	from typing import Callable


	def _has_any(text: str, keywords: list[str]) -> bool:
	    """Return True if at least one keyword occurs in ``text``, ignoring case.

	    Both ``text`` and every keyword are lower-cased before a plain substring
	    test; an empty ``keywords`` list yields False.
	    """
	    haystack = text.lower()
	    for candidate in keywords:
	        if candidate.lower() in haystack:
	            return True
	    return False


	def _has_pattern(text: str, pattern: str) -> bool:
	    """Return True if regex ``pattern`` matches anywhere in ``text``.

	    The search is case-insensitive (``re.IGNORECASE``).
	    """
	    found = re.search(pattern, text, flags=re.IGNORECASE)
	    return found is not None


	def _grade_orders_drop(answer: str) -> float:
	    """Score an "orders_drop" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    a decline is mentioned; a promotion or sale is mentioned; the answer says
	    something ended or cites March 1; causal wording is present; and a
	    quantity appears (an order/transaction count, a percentage, or a
	    "from N ... to M" comparison).

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. The drop itself is acknowledged.
	    if _has_any(answer, ["drop", "decrease", "decline", "fell", "fewer", "reduction", "lower"]):
	        score += 0.20
	    # 2. A promotion or sale is named (specific or generic wording).
	    if _has_any(answer, ["spring mega sale", "spring sale", "mega sale"]) or (
	        _has_any(answer, ["promotion", "promo", "sale", "discount", "campaign"])
	    ):
	        score += 0.20
	    # 3. Something is said to have ended, or March 1 is cited.
	    if _has_any(answer, ["ended", "expired", "over", "concluded", "stopped"]) or _has_pattern(
	        answer, r"march\s*0?1"
	    ):
	        score += 0.20
	    # 4. Causal language.
	    if _has_any(answer, [
	        "caused", "because", "due to", "result of", "led to",
	        "when the", "after the", "ending of", "end of the",
	        "correlated", "explains",
	    ]):
	        score += 0.20
	    # 5. Quantification. The alternation needs an unescaped "|": an escaped
	    # "\|" only matches a literal pipe, so "120 orders" would never count.
	    if _has_pattern(answer, r"\d+\s*(orders|transactions)") or _has_pattern(
	        answer, r"\d+\s*%"
	    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_returns_spike(answer: str) -> float:
	    """Score a "returns_spike" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    the headphones product is named; "west" is mentioned; the AudioTech
	    supplier is named; defect/quality wording is present; and a quantity or
	    rate comparison appears.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Product.
	    if _has_any(answer, ["wireless headphones", "headphones pro", "headphone"]):
	        score += 0.20
	    # 2. Region.
	    if _has_any(answer, ["west"]):
	        score += 0.20
	    # 3. Supplier.
	    if _has_any(answer, ["audiotech", "audio tech"]):
	        score += 0.20
	    # 4. Defect / quality wording.
	    if _has_any(answer, [
	        "defect", "defective", "faulty", "quality",
	        "high return", "return rate", "abnormal",
	        "stopped working", "battery issue", "poor audio",
	    ]):
	        score += 0.20
	    # 5. Quantification. The alternation needs an unescaped "|": an escaped
	    # "\|" only matches a literal pipe, so "40 returns" would never count.
	    if _has_pattern(answer, r"\d+\s*%") or _has_pattern(
	        answer, r"\d+\s*(returns|returned|units)"
	    ) or _has_any(answer, ["return rate", "compared to"]):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_customer_churn(answer: str) -> float:
	    """Score a "customer_churn" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    the decline is quantified or described; "enterprise" is mentioned; the
	    Northeast is mentioned; pricing wording is present; and an affected
	    product is named or referenced by number.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Decline quantified (a percentage) or described in words.
	    if _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
	        "decline", "decrease", "drop", "churn", "fewer active",
	        "lost customers", "stopped ordering",
	    ]):
	        score += 0.20
	    # 2. Segment.
	    if _has_any(answer, ["enterprise"]):
	        score += 0.20
	    # 3. Region.
	    if _has_any(answer, ["northeast", "north east", "north-east"]):
	        score += 0.20
	    # 4. Pricing wording.
	    if _has_any(answer, [
	        "price increase", "price change", "price hike", "pricing",
	        "more expensive", "raised price", "cost increase",
	    ]):
	        score += 0.20
	    # 5. Affected products, by name or by number. The alternation needs an
	    # unescaped "|": an escaped "\|" only matches a literal pipe, so a
	    # reference such as "product 11" would never count.
	    if _has_any(answer, [
	        "laptop pro", "desktop workstation", "office suite",
	        "devtools", "external ssd",
	    ]) or _has_pattern(answer, r"product.*(1|2|11|15|19)"):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_shipping_delay(answer: str) -> float:
	    """Score a "shipping_delay" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    "midwest" is mentioned; the QuickShip carrier is named; delivery-delay
	    wording is present; a February timing is given; and support-ticket
	    wording appears together with delivery/carrier wording.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Region.
	    if _has_any(answer, ["midwest"]):
	        score += 0.20
	    # 2. Carrier.
	    if _has_any(answer, ["quickship", "quick ship"]):
	        score += 0.20
	    # 3. Delivery-delay wording.
	    if _has_any(answer, [
	        "delivery delay", "late delivery", "delayed shipment",
	        "shipping delay", "late shipment", "delivery time",
	        "delayed delivery", "slow delivery",
	    ]):
	        score += 0.20
	    # 4. Timing. The alternation needs an unescaped "|": an escaped "\|"
	    # only matches a literal pipe, so "Feb 10" would never count.
	    if _has_pattern(answer, r"feb(ruary)?\s*(10|mid|middle)") or _has_any(answer, [
	        "mid-february", "mid february", "around february",
	        "starting in february", "beginning of february",
	    ]):
	        score += 0.20
	    # 5. Support-ticket wording AND delivery/carrier wording must both appear.
	    if _has_any(answer, [
	        "support ticket", "complaint", "ticket volume",
	        "customer satisfaction", "support request",
	    ]) and _has_any(answer, [
	        "delivery", "shipping", "carrier", "quickship",
	    ]):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_revenue_paradox(answer: str) -> float:
	    """Score a "revenue_paradox" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``.
	    The result is capped at 1.0.
	    """
	    # Discounting: the sale is named outright, or promo wording appears
	    # together with margin/profit/cost wording.
	    sale_named = _has_any(answer, [
	        "spring mega sale", "mega sale", "25%", "25 percent",
	    ])
	    promo_hits_margin = (
	        _has_any(answer, ["promotion", "promo", "discount", "sale"])
	        and _has_any(answer, ["margin", "profit", "cost"])
	    )
	    # Product-mix shift.
	    mix_shift = _has_any(answer, [
	        "product mix", "category mix", "mix shift", "shifted toward",
	        "higher proportion", "more electronics", "low-margin",
	        "composition changed",
	    ])
	    # Enterprise segment together with price/churn wording.
	    enterprise_loss = _has_any(answer, ["enterprise"]) and _has_any(answer, [
	        "price increase", "price change", "price hike",
	        "lost", "churn", "left", "fewer", "decline",
	    ])
	    # Returns/refunds together with cost/profit wording.
	    return_cost = _has_any(answer, ["return", "refund"]) and _has_any(answer, [
	        "cost", "expense", "profit", "margin", "loss", "erode",
	    ])
	    # Quantification: a dollar amount, a percentage, or a from/to comparison.
	    quantified = (
	        _has_pattern(answer, r"\$\s*[\d,]+")
	        or _has_pattern(answer, r"\d+\s*%")
	        or _has_pattern(answer, r"from\s+\$?[\d,]+.*to\s+\$?[\d,]+")
	    )
	    criteria = (
	        sale_named or promo_hits_margin,
	        mix_shift,
	        enterprise_loss,
	        return_cost,
	        quantified,
	    )
	    total = 0.0
	    for met in criteria:
	        if met:
	            total += 0.20
	    return min(total, 1.0)


	def _grade_supplier_quality(answer: str) -> float:
	    """Score a "supplier_quality" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    the AudioTech supplier is named; the headphones product is named; the
	    Bluetooth speaker product is named; return/refund impact is described or
	    quantified; and support-ticket/defect wording is present.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Supplier.
	    if _has_any(answer, ["audiotech", "audio tech"]):
	        score += 0.20
	    # 2. First affected product.
	    if _has_any(answer, ["wireless headphones", "headphones pro", "product 6"]):
	        score += 0.20
	    # 3. Second affected product.
	    if _has_any(answer, ["bluetooth speaker", "product 7"]):
	        score += 0.20
	    # 4. Return/refund impact. The percentage patterns use "\s*" and ".*" so
	    # that "12% of units were returned" and "returns reached 12%" both match;
	    # without the quantifiers they demanded exactly one whitespace before
	    # "%" and exactly one character between the number and "return".
	    if _has_any(answer, ["return rate", "refund", "return volume"]) or _has_pattern(
	        answer, r"\d+\s*%.*return"
	    ) or _has_pattern(answer, r"return.*\d+\s*%") or _has_pattern(
	        answer, r"\$\s*[\d,]+"
	    ):
	        score += 0.20
	    # 5. Support-ticket / defect wording.
	    if _has_any(answer, [
	        "support ticket", "defect", "complaint", "product_defect",
	        "quality issue", "customer complaint",
	    ]):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_inventory_stockout(answer: str) -> float:
	    """Score an "inventory_stockout" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    "west" is mentioned; the monitor product is named; inventory/stock
	    wording is present; the sale or its timing is mentioned; and a quantity
	    appears (a unit/order/sales count, a percentage, or a "from N ... to M"
	    comparison).

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Region.
	    if _has_any(answer, ["west"]):
	        score += 0.20
	    # 2. Product.
	    if _has_any(answer, ["monitor", "product 4", "monitor 27"]):
	        score += 0.20
	    # 3. Inventory / stock wording.
	    if _has_any(answer, [
	        "inventory", "stock", "out of stock", "stockout", "stock-out",
	        "zero units", "no inventory", "warehouse",
	    ]):
	        score += 0.20
	    # 4. The sale or its timing.
	    if _has_any(answer, [
	        "spring mega sale", "mega sale", "promo", "promotion",
	        "february 15", "feb 15", "during the sale",
	    ]):
	        score += 0.20
	    # 5. Quantification. The alternation needs an unescaped "|": an escaped
	    # "\|" only matches a literal pipe, so "0 units" would never count.
	    if _has_pattern(answer, r"\d+\s*(units|orders|sales)") or _has_pattern(
	        answer, r"\d+\s*%"
	    ) or _has_pattern(answer, r"from\s+\d+.*to\s+\d+"):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_fraud_detection(answer: str) -> float:
	    """Score a "fraud_detection" answer; each of five criteria met adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    "southeast" is mentioned; new-account wording is present; high-value
	    order wording is present; the scope is quantified (10-15 accounts, a
	    dollar amount, or an order count); and pattern/anomaly wording is
	    present.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Region.
	    if _has_any(answer, ["southeast"]):
	        score += 0.20
	    # 2. Recently created accounts.
	    if _has_any(answer, [
	        "new account", "recent signup", "recently created",
	        "new customer", "account creation", "registered in feb",
	        "signed up",
	    ]):
	        score += 0.20
	    # 3. High-value orders / products.
	    if _has_any(answer, [
	        "high-value", "high value", "expensive", "laptop pro",
	        "desktop workstation", "large order", "electronics",
	    ]):
	        score += 0.20
	    # 4. Scope quantified. Both alternations need an unescaped "|": an
	    # escaped "\|" only matches a literal pipe, so "12 accounts" or
	    # "30 orders" would never count.
	    if _has_pattern(answer, r"1[0-5]\s*(account|customer|user)") or _has_pattern(
	        answer, r"\$\s*[\d,]+"
	    ) or _has_pattern(answer, r"\d+\s*(order|transaction)"):
	        score += 0.20
	    # 5. Behavioural pattern / anomaly wording.
	    if _has_any(answer, [
	        "pattern", "cluster", "coordinated", "suspicious",
	        "same product", "no return", "never returned",
	        "concentrated", "anomal", "fraud ring",
	    ]):
	        score += 0.20
	    return min(score, 1.0)


	def _grade_repeat_purchase_decline(answer: str) -> float:
	    """Score a "repeat_purchase_decline" answer; each of five criteria adds 0.20.

	    All checks are case-insensitive substring or regex tests on ``answer``:
	    repeat-purchase wording with a percentage or decline wording; enterprise
	    with price/churn wording; Midwest or shipping wording with repeat-order
	    wording; marketing/acquisition wording with retention wording; and a
	    segment-level breakdown.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.0
	    # 1. Repeat-purchase metric AND (a percentage OR decline wording).
	    if _has_any(answer, [
	        "repeat purchase", "repeat rate", "returning customer",
	        "repeat buyer", "repurchase", "order frequency",
	        "second order", "came back",
	    ]) and (_has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
	        "decline", "drop", "decrease", "fell", "collapsed",
	    ])):
	        score += 0.20
	    # 2. Enterprise segment AND price/churn wording.
	    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
	        "price", "increase", "hike", "stopped", "left", "churn",
	    ]):
	        score += 0.20
	    # 3. (Midwest OR shipping wording) AND repeat-order wording.
	    if (_has_any(answer, ["midwest"]) or _has_any(answer, [
	        "shipping", "delivery", "quickship",
	    ])) and _has_any(answer, [
	        "repeat", "return", "reorder", "come back", "second order",
	    ]):
	        score += 0.20
	    # 4. Marketing/acquisition wording AND retention wording.
	    if _has_any(answer, ["marketing", "acquisition", "spend"]) and _has_any(answer, [
	        "retention", "email", "loyalty", "re-engage", "lapsed",
	        "shifted", "new customer",
	    ]):
	        score += 0.20
	    # 5. Segment-level breakdown. The alternation needs an unescaped "|" (an
	    # escaped "\|" only matches a literal pipe), and ".*" / "\s*" let a
	    # phrase such as "enterprise repeat rate fell to 12%" match instead of
	    # demanding exactly one character and one whitespace before "%".
	    if _has_any(answer, [
	        "segment", "cohort", "by region", "by segment",
	        "enterprise vs", "consumer vs", "smb vs",
	    ]) or _has_pattern(answer, r"(enterprise|smb|consumer).*\d+\s*%"):
	        score += 0.20
	    return min(score, 1.0)


	# Registry of investigation tasks, keyed by task id. Each entry repeats its
	# id and carries a difficulty label, a title, and the description text.
	TASKS: dict[str, dict] = {
	"orders_drop": {
	"id": "orders_drop",
	"difficulty": "easy",
	"title": "Weekly Orders Drop Investigation",
	"description": (
	"URGENT -- Our order volume dropped sharply in the first two weeks "
	"of March compared to the last two weeks of February. Leadership "
	"needs to know why.\n\n"
	"Investigate the database, identify the root cause of the drop, "
	"and submit a clear summary of your findings."
	),
	},
	"returns_spike": {
	"id": "returns_spike",
	"difficulty": "medium",
	"title": "Product Returns Spike Investigation",
	"description": (
	"ALERT -- Our return rate has spiked significantly in recent weeks, "
	"with particular concentration in one geographic region. This is "
	"eating into margins.\n\n"
	"Use the database to identify which product(s) are driving the "
	"spike, which region is most affected, and what the likely root "
	"cause is. Include the supplier if relevant."
	),
	},
	"customer_churn": {
	"id": "customer_churn",
	"difficulty": "hard",
	"title": "Customer Churn Root Cause Analysis",
	"description": (
	"CRITICAL -- Our monthly active customer count has declined "
	"significantly from January to March. The executive team wants a "
	"full root-cause analysis.\n\n"
	"Determine which customer segments and regions are most affected, "
	"quantify the decline, and identify the most likely causes. "
	"Check all available tables for clues."
	),
	},
	"shipping_delay": {
	"id": "shipping_delay",
	"difficulty": "medium-hard",
	"title": "Customer Satisfaction Crisis Investigation",
	"description": (
	"ESCALATION -- Customer satisfaction scores have plummeted in one "
	"of our regions. The support team is overwhelmed with complaints "
	"and escalations are piling up.\n\n"
	"Investigate what operational issue is driving the complaints, "
	"identify the responsible party (carrier, warehouse, etc.), "
	"determine when the problem started, and quantify the impact. "
	"Cross-reference multiple data sources for a complete picture."
	),
	},
	"revenue_paradox": {
	"id": "revenue_paradox",
	"difficulty": "hard",
	"title": "Revenue vs. Profit Paradox Investigation",
	"description": (
	"CRITICAL -- Revenue in February was our highest month ever, yet "
	"gross profit actually decreased compared to January. The CFO "
	"wants a full breakdown of why we are selling more but earning "
	"less.\n\n"
	"Analyze revenue, costs, margins, discounts, product mix, customer "
	"segments, and any other relevant factors. This is likely multi-"
	"causal -- identify ALL contributing factors and quantify their "
	"impact. Use the products.cost column to compute margins."
	),
	},
	"supplier_quality": {
	"id": "supplier_quality",
	"difficulty": "medium",
	"title": "Supplier Quality Crisis Investigation",
	"description": (
	"ESCALATION -- The VP of Merchandising has received escalating "
	"complaints about product quality across multiple SKUs. Quality "
	"Assurance wants a supplier-level analysis.\n\n"
	"Determine which supplier(s) have systemic quality issues, which "
	"of their products are affected, and quantify the total business "
	"impact in returns, refunds, and support ticket volume. Include "
	"return rates by supplier to support a contract renegotiation."
	),
	},
	"inventory_stockout": {
	"id": "inventory_stockout",
	"difficulty": "medium-hard",
	"title": "Regional Sales Underperformance Investigation",
	"description": (
	"INVESTIGATION -- Our West region was projected to be the top "
	"performer during the Spring Mega Sale based on historical trends "
	"and marketing investment, but actual sales came in significantly "
	"below the other regions.\n\n"
	"The Regional VP demands an explanation. Investigate what caused "
	"the West to underperform during our biggest promotional event. "
	"Check product-level sales, inventory data, and any operational "
	"issues that may have limited fulfillment."
	),
	},
	"fraud_detection": {
	"id": "fraud_detection",
	"difficulty": "hard",
	"title": "Suspicious Order Pattern Investigation",
	"description": (
	"ALERT -- The Finance team has flagged a suspicious spike in "
	"high-value orders from recently created accounts. Several of "
	"these orders have already shipped.\n\n"
	"Investigate the pattern: identify the suspicious accounts, "
	"determine the scope of potential fraud, estimate the financial "
	"exposure, and describe the behavioral signatures that "
	"distinguish these accounts from legitimate customers. Look at "
	"signup dates, order values, product choices, and geographic "
	"concentration."
	),
	},
	"repeat_purchase_decline": {
	"id": "repeat_purchase_decline",
	"difficulty": "hard",
	"title": "Customer Retention Crisis Investigation",
	"description": (
	"CRITICAL -- Monthly unique buyer count has held steady around "
	"100, but the Customer Success team reports that repeat purchase "
	# NOTE(review): "%%" below is a literal double percent sign in a plain
	# string; it only collapses to "%" if this text is %-formatted
	# downstream -- confirm against the consumer of TASKS.
	"rates have collapsed. In January, roughly 40%% of orders came "
	"from returning customers; by March, it appears to be under 20%%."
	"\n\n"
	"The CEO asks: are we becoming a one-time-purchase business? "
	"Diagnose which customer segments and regions lost repeat buyers, "
	"identify the root causes, and determine whether our marketing "
	"spend strategy is masking a retention problem. Check the "
	"marketing_spend table for clues about acquisition vs. retention "
	"investment."
	),
	},
	}

	# Dispatch table from task id to the function that scores an answer for
	# that task. Each grader takes the answer text and returns a float.
	_GRADERS: dict[str, Callable[[str], float]] = {
	"orders_drop": _grade_orders_drop,
	"returns_spike": _grade_returns_spike,
	"customer_churn": _grade_customer_churn,
	"shipping_delay": _grade_shipping_delay,
	"revenue_paradox": _grade_revenue_paradox,
	"supplier_quality": _grade_supplier_quality,
	"inventory_stockout": _grade_inventory_stockout,
	"fraud_detection": _grade_fraud_detection,
	"repeat_purchase_decline": _grade_repeat_purchase_decline,
	}


	def grade_answer(task_id: str, answer: str) -> float:
	    """Grade ``answer`` for ``task_id`` and clamp the result to [0.05, 0.95].

	    An unknown ``task_id`` (no entry in ``_GRADERS``) yields the floor value
	    0.05 without inspecting the answer.
	    """
	    floor, ceiling = 0.05, 0.95
	    scorer = _GRADERS.get(task_id)
	    if scorer is not None:
	        capped = min(ceiling, scorer(answer))
	        return max(floor, capped)
	    return floor