import logging

from .models import EnvironmentState

logger = logging.getLogger(__name__)

# Ordered (action_type, reward delta) tables. Order is kept identical to the
# sequence of additions the graders perform so float results stay stable.
_EASY_WEIGHTS = (
    ("check_policy", 0.2),
    ("issue_refund", 0.5),
    ("close_ticket", 0.3),
    ("escalate", -0.5),  # penalty for unnecessary escalation
)
_MEDIUM_WEIGHTS = (
    ("check_policy", 0.3),
    ("reply_to_customer", 0.4),
    ("close_ticket", 0.3),
)
_FRAUD_WEIGHTS = (
    ("fetch_user_data", 0.3),
    ("check_policy", 0.4),
    ("close_ticket", 0.5),
)

# Escalation "reason" parameter that the hard task rewards.
_HARD_ESCALATION_REASON = "billing_tier2"


def _clamp_unit(reward: float) -> float:
    """Clamp *reward* to the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, reward))


def _action_types(state: EnvironmentState) -> list:
    """Return the ``action_type`` of every entry in ``state.action_history``."""
    return [a.action_type for a in state.action_history]


def _apply_weights(actions, weights, start: float = 0.0) -> float:
    """Add each weight whose action type occurs in *actions* to *start*.

    Each action type counts at most once, no matter how often it appears.
    """
    reward = start
    for action_type, delta in weights:
        if action_type in actions:
            reward += delta
    return reward


def grade_easy(state: EnvironmentState) -> float:
    """Score the easy task: check_policy, issue_refund, close_ticket.

    Awards 0.2 / 0.5 / 0.3 for the three expected actions and subtracts 0.5
    if an ``escalate`` action is present. The result is clamped to [0, 1].
    """
    return _clamp_unit(_apply_weights(_action_types(state), _EASY_WEIGHTS))


def grade_medium(state: EnvironmentState) -> float:
    """Score the medium task: check_policy, reply_to_customer, close_ticket.

    Awards 0.3 / 0.4 / 0.3 for the three expected actions. No refund should
    be issued: any ``issue_refund`` action makes the score 0.0.
    """
    actions = _action_types(state)
    if "issue_refund" in actions:
        # Fatal mistake: overrides every other reward.
        return 0.0
    return _clamp_unit(_apply_weights(actions, _MEDIUM_WEIGHTS))


def grade_hard(state: EnvironmentState) -> float:
    """Score the hard task: fetch_user_data, escalate to billing_tier2, reply.

    Awards 0.2 for ``fetch_user_data``, 0.5 for an ``escalate`` action whose
    ``reason`` parameter is ``"billing_tier2"`` and 0.3 for
    ``reply_to_customer``. Subtracts 0.5 for ``issue_refund`` and 0.3 for
    ``close_ticket``. The result is clamped to [0, 1].
    """
    actions = _action_types(state)
    reward = 0.0
    if "fetch_user_data" in actions:
        reward += 0.2

    # ``parameters`` may be None/empty; treat that as "no reason given"
    # instead of raising AttributeError.
    escalated = any(
        a.action_type == "escalate"
        and (a.parameters or {}).get("reason") == _HARD_ESCALATION_REASON
        for a in state.action_history
    )
    if escalated:
        reward += 0.5

    if "reply_to_customer" in actions:
        reward += 0.3
    if "issue_refund" in actions:
        reward -= 0.5  # can't refund enterprise double charges directly
    if "close_ticket" in actions:
        reward -= 0.3  # can't close without resolving escalate
    return _clamp_unit(reward)


def grade_fraud_detection(state: EnvironmentState) -> float:
    """Score the fraud-detection task.

    Expected flow: fetch_user_data, check_policy, deny the refund,
    close_ticket. Awards 0.3 / 0.4 / 0.5 for ``fetch_user_data`` /
    ``check_policy`` / ``close_ticket``; the weights sum to 1.2, so the final
    clamp caps a complete run at 1.0. Any ``issue_refund`` action makes the
    score 0.0.

    Diagnostics are emitted through ``logging`` at DEBUG level rather than
    printed to stdout.
    """
    actions = _action_types(state)
    logger.debug("Actions received for grading: %s", actions)
    if "issue_refund" in actions:
        # Fatal mistake: overrides every other reward.
        return 0.0
    reward = _clamp_unit(_apply_weights(actions, _FRAUD_WEIGHTS))
    logger.debug("Fraud-detection reward: %s", reward)
    return reward


def grade(state: EnvironmentState) -> float:
    """Dispatch *state* to the matching grader and return its score.

    The fraud-detection task is selected by ``state.current_task_id`` and
    takes precedence; otherwise the grader is chosen by
    ``state.task_difficulty``. Unknown difficulties score 0.0.
    """
    if state.current_task_id == "task_fraud_detection":
        return grade_fraud_detection(state)

    graders = (
        ("easy", grade_easy),
        ("medium", grade_medium),
        ("hard", grade_hard),
    )
    # Compare with == (not a dict lookup) so the match semantics are the same
    # as a plain if/elif chain whatever type ``task_difficulty`` has.
    for difficulty, grader in graders:
        if state.task_difficulty == difficulty:
            return grader(state)
    return 0.0