Spaces:

Mohammed-Altaf
/

DataAnalysis_Env

Sleeping

App Files Files Community

DataAnalysis_Env / tasks /task_hard_2.py

Mohammed-Altaf

changes upper and lower bounds for inference grading

19b4563 about 1 month ago

raw

history blame contribute delete

3.55 kB

	import re
	import sqlite3
	from pathlib import Path

	import pandas as pd

	from tasks.base_task import BaseTask

	DB_PATH = Path(__file__).resolve().parent.parent / "datasets" / "store_data.db"


	class CustomerLoyaltyRevenueTask(BaseTask):
	"""Hard task: find the highest-revenue customer loyalty tier using cross-source data.

	The agent must query the customer_profiles table from the SQLite database,
	join it with the sales DataFrame on customer_id, and compute revenue by tier.
	The database is accessible via sqlite3.connect(db_path) in the sandbox.
	"""

	@property
	def task_id(self) -> int:
	return 5

	@property
	def difficulty(self) -> str:
	return "hard"

	@property
	def description(self) -> str:
	return (
	"Using the customer profiles database (connect with sqlite3.connect(db_path)), "
	"which customer loyalty tier generates the highest total revenue? "
	"What percentage of total revenue does it represent? "
	"Round percentage to 2 decimal places. "
	"Submit your answer in the format: "
	"'Top tier: <name>, Revenue: $X.XX, Percentage: X.XX%'"
	)

	def _compute(self) -> tuple:
	"""Compute the top loyalty tier and its revenue share.

	Returns:
	A tuple of (top_tier, tier_revenue, percentage).
	"""
	conn = sqlite3.connect(DB_PATH)
	profiles = pd.read_sql("SELECT customer_id, loyalty_tier FROM customer_profiles", conn)
	conn.close()
	merged = self.df.merge(profiles, on="customer_id", how="left")
	tier_rev = merged.groupby("loyalty_tier")["total_price"].sum()
	total = merged["total_price"].sum()
	top = tier_rev.idxmax()
	rev = tier_rev[top]
	pct = rev / total * 100
	return top, rev, pct

	def expected_answer(self) -> str:
	"""Compute the expected formatted answer.

	Returns:
	Formatted string like 'Top tier: Bronze, Revenue: $97210.91, Percentage: 39.28%'.
	"""
	top, rev, pct = self._compute()
	return f"Top tier: {top}, Revenue: ${rev:.2f}, Percentage: {round(pct, 2)}%"

	def grade(self, answer: str) -> float:
	"""Grade with partial credit for each of the three fields.

	Scoring:
	- 0.33 for correct tier name (case-insensitive)
	- 0.33 for revenue within ±0.5% of expected
	- 0.34 for percentage within ±0.1 of expected

	Args:
	answer: The agent's submitted answer string.

	Returns:
	A score between 0.0 and 1.0.
	"""
	top, expected_rev, expected_pct = self._compute()
	score = 0.0

	tier_match = re.search(r"Top tier:\s*([^,]+)", answer, re.IGNORECASE)
	if tier_match and tier_match.group(1).strip().lower() == top.lower():
	score += 0.33

	rev_match = re.search(r"Revenue:\s*\$?([\d.]+)", answer, re.IGNORECASE)
	if rev_match:
	try:
	submitted = float(rev_match.group(1))
	if abs(submitted - expected_rev) <= expected_rev * 0.005:
	score += 0.33
	except ValueError:
	pass

	pct_match = re.search(r"Percentage:\s*([\d.]+)%?", answer, re.IGNORECASE)
	if pct_match:
	try:
	if abs(float(pct_match.group(1)) - expected_pct) <= 0.1:
	score += 0.34
	except ValueError:
	pass

	return max(0.05, min(0.95, score))