Spaces:

Mohammed-Altaf
/

DataAnalysis_Env

Sleeping

App Files Files Community

DataAnalysis_Env / tasks /task_medium_2.py

Mohammed-Altaf

changes upper and lower bounds for inference grading

19b4563 about 1 month ago

raw

history blame contribute delete

3.07 kB

	import re

	import pandas as pd

	from tasks.base_task import BaseTask


	class MonthlyRevenueRatioTask(BaseTask):
	    """Medium task: find the best and worst months by revenue and compute their ratio.

	    The agent must parse order_date, group by month, find the extremes,
	    and compute how many times larger the best month is versus the worst.
	    """

	    @property
	    def task_id(self) -> int:
	        """Numeric identifier of this task."""
	        return 4

	    @property
	    def difficulty(self) -> str:
	        """Difficulty label of this task."""
	        return "medium"

	    @property
	    def description(self) -> str:
	        """Question text shown to the agent, including the required answer format."""
	        return (
	            "What is the best and worst performing month by total revenue in 2024? "
	            "What is the ratio of best to worst month revenue? Round ratio to 2 decimal places. "
	            "Submit your answer in the format: "
	            "'Best: YYYY-MM ($X.XX), Worst: YYYY-MM ($X.XX), Ratio: X.XX'"
	        )

	    def _compute(self) -> tuple:
	        """Compute the best month, worst month, and their revenue ratio.

	        Reads ``self.df`` (presumably the orders table supplied by the base
	        class -- verify against BaseTask) and sums ``total_price`` per
	        calendar month of ``order_date``.

	        NOTE(review): the task description asks about 2024, but no year
	        filter is applied here -- confirm the dataset only spans 2024.

	        Returns:
	            A tuple of (best_month_str, best_rev, worst_month_str, worst_rev, ratio),
	            where the month strings are formatted 'YYYY-MM' and ratio is
	            rounded to 2 decimal places.
	        """
	        # Work on a copy so the date conversion does not mutate the shared frame.
	        frame = self.df.copy()
	        frame["order_date"] = pd.to_datetime(frame["order_date"])
	        month_key = frame["order_date"].dt.to_period("M")
	        monthly = frame.groupby(month_key)["total_price"].sum()

	        best = monthly.idxmax()
	        worst = monthly.idxmin()
	        best_rev = monthly[best]
	        worst_rev = monthly[worst]
	        ratio = round(best_rev / worst_rev, 2)
	        return str(best), best_rev, str(worst), worst_rev, ratio

	    def expected_answer(self) -> str:
	        """Compute the expected formatted answer.

	        Returns:
	            Formatted string like 'Best: 2024-12 ($23025.82), Worst: 2024-05 ($16871.48), Ratio: 1.36'.
	        """
	        best, best_rev, worst, worst_rev, ratio = self._compute()
	        # Format the ratio with two decimals so values such as 1.5 render as
	        # "1.50", matching the 'Ratio: X.XX' format the description requires.
	        return (
	            f"Best: {best} (${best_rev:.2f}), "
	            f"Worst: {worst} (${worst_rev:.2f}), "
	            f"Ratio: {ratio:.2f}"
	        )

	    def grade(self, answer: str) -> float:
	        """Grade with partial credit for each of the three fields.

	        Scoring (before clamping):
	            - 0.33 for correct best month (exact YYYY-MM match)
	            - 0.33 for correct worst month (exact YYYY-MM match)
	            - 0.34 for ratio within ±0.01 of expected

	        Args:
	            answer: The agent's submitted answer string. A non-string value
	                (for example None) is treated as an empty answer.

	        Returns:
	            The accumulated score clamped to the range [0.05, 0.95].
	        """
	        best, _, worst, _, expected_ratio = self._compute()
	        # re.search raises TypeError on non-strings; grade them as empty instead.
	        text = answer if isinstance(answer, str) else ""
	        score = 0.0

	        best_match = re.search(r"Best:\s*(\d{4}-\d{2})", text, re.IGNORECASE)
	        if best_match and best_match.group(1) == best:
	            score += 0.33

	        worst_match = re.search(r"Worst:\s*(\d{4}-\d{2})", text, re.IGNORECASE)
	        if worst_match and worst_match.group(1) == worst:
	            score += 0.33

	        # Capture exactly one well-formed decimal number. A greedy [\d.]+ would
	        # also swallow a trailing sentence period ("Ratio: 1.36."), which
	        # float() rejects, silently costing a correct answer its credit.
	        ratio_match = re.search(
	            r"Ratio:\s*(\d+(?:\.\d*)?|\.\d+)", text, re.IGNORECASE
	        )
	        if ratio_match:
	            submitted = float(ratio_match.group(1))
	            # Small epsilon absorbs binary floating-point noise so answers
	            # exactly 0.01 away (e.g. 1.37 vs 1.36) still count as within ±0.01.
	            if abs(submitted - expected_ratio) <= 0.01 + 1e-9:
	                score += 0.34

	        # Scores are deliberately kept strictly inside (0, 1).
	        return max(0.05, min(0.95, score))