Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Files Community

sql-db-engineer-agent / env /reward.py

junaid0600

Round 2: SQL Database Engineer Agent - 24/24 tests passing

8cb206e 2 months ago

Raw

History Blame Contribute Delete

14.3 kB

	from env.models import Action, Reward, DifficultyLevel, ActionType
	from env.graders import grade

	# ─────────────────────────────────────────────
	# CONSTANTS
	# ─────────────────────────────────────────────

	MAX_STEPS = 50 # Round 2: long-horizon episodes; step budget read by is_done and by the budget/efficiency terms
	HINT_PENALTY = -0.10 # Per hint requested (increased from Round 1)
	LOOP_PENALTY = -0.08 # Base penalty for repeating the previous action type; compute_reward scales it by the streak length (capped at 3x)
	INVALID_PENALTY = -0.10 # Null / malformed action. NOTE(review): not referenced in the code visible here — the null-action path returns a fixed 0.001 score
	BACKTRACK_PENALTY = -0.05 # Applied when db_delta < -1.0 (the step lowered the DB performance score)
	BUDGET_EXHAUSTION_PEN = -0.15 # Applied to non-terminal actions once step_count >= MAX_STEPS - 2
	EFFICIENCY_BONUS = 0.10 # Maximum bonus; paid when step_count <= 70% of max_steps, scaled by the unused share of the budget

	# Milestone thresholds: {improvement_fraction: bonus_reward}
	# improvement_fraction is the share of the headroom (100 - baseline score,
	# floored at 1.0) gained so far, as computed in check_milestones. Each bonus
	# is paid at most once per `earned` set; several thresholds crossed in the
	# same call are summed.
	MILESTONE_THRESHOLDS = {
	0.25: 0.15, # 25% improvement → +0.15 bonus
	0.50: 0.25, # 50% improvement → +0.25 bonus
	0.75: 0.40, # 75% improvement → +0.40 bonus
	}

	# Step rewards for Round 2 actions (dense signal)
	# Flat amount added by compute_reward for the action type of each step.
	# Action types missing from this table fall back to 0.05 there.
	STEP_REWARDS = {
	# ── Round 2 actions ──────────────────────────
	ActionType.INSPECT_QUERY: 0.05, # Investigation rewarded
	ActionType.ANALYZE_INDEXES: 0.05, # Investigation rewarded
	ActionType.CREATE_INDEX: 0.10, # Core optimization action
	ActionType.REWRITE_QUERY: 0.15, # High-value rewrite
	ActionType.ADD_COLUMN: 0.08, # Denormalization
	ActionType.DROP_INDEX: 0.05, # Clean up overhead
	ActionType.PARTITION_TABLE: 0.15, # Big structural improvement
	ActionType.ANALYZE_STATS: 0.05, # Maintenance action
	ActionType.SUBMIT_REPORT: 0.00, # Terminal — score is computed from DB performance in compute_reward
	ActionType.REQUEST_HINT: 0.00, # No reward, only penalty
	# ── Round 1 backward compat ──────────────────
	ActionType.IDENTIFY_ERROR: 0.15, # compute_reward adds 20% of the grader score on top
	ActionType.PROPOSE_FIX: 0.25, # compute_reward adds 40% of the grader score on top
	ActionType.SUBMIT_ANSWER: 0.00, # Terminal — score comes from the grader
	ActionType.EXPLAIN_ISSUE: 0.10, # Flat step reward only; no grader call in compute_reward
	ActionType.OPTIMIZE_QUERY: 0.20, # Terminal — full grader score is added on top
	}

	# Terminal actions that end the episode
	# is_done returns True for any of these; compute_reward scores them through
	# the grader (SUBMIT_ANSWER, OPTIMIZE_QUERY) or the DB-performance formula
	# (SUBMIT_REPORT) and never applies the budget-exhaustion penalty to them.
	TERMINAL_ACTIONS = {
	ActionType.SUBMIT_ANSWER,
	ActionType.OPTIMIZE_QUERY,
	ActionType.SUBMIT_REPORT,
	}


	# ─────────────────────────────────────────────
	# MILESTONE TRACKER
	# ─────────────────────────────────────────────

	def check_milestones(
	    baseline_score: float,
	    new_score: float,
	    earned: set,
	) -> tuple[float, list[float]]:
	    """
	    Award one-time milestone bonuses for the current improvement level.

	    Improvement is the fraction of the available headroom
	    (100 - baseline_score, floored at 1.0) that new_score has closed.
	    Every threshold in MILESTONE_THRESHOLDS that has been reached and is
	    not yet in `earned` is paid out and added to `earned` in place, so a
	    later call with the same set does not pay it again.

	    Returns (total_bonus rounded to 4 places, newly_earned_thresholds).
	    """
	    headroom = max(1.0, 100.0 - baseline_score)
	    gain_fraction = (new_score - baseline_score) / headroom

	    # Thresholds crossed for the first time, in table order.
	    fresh = [
	        level
	        for level in MILESTONE_THRESHOLDS
	        if gain_fraction >= level and level not in earned
	    ]
	    earned.update(fresh)

	    # Accumulate from a float zero so an empty payout is still 0.0.
	    payout = 0.0
	    for level in fresh:
	        payout += MILESTONE_THRESHOLDS[level]

	    return round(payout, 4), fresh


	# ─────────────────────────────────────────────
	# LOOP DETECTOR
	# ─────────────────────────────────────────────

	def _detect_loop(previous_actions: list[str], current_action: str) -> bool:
	"""Returns True if agent has done the same action 2+ times in a row."""
	if len(previous_actions) < 1:
	return False
	last = previous_actions[-1]
	return last == current_action


	def _count_consecutive(previous_actions: list[str], current_action: str) -> int:
	count = 1
	for a in reversed(previous_actions):
	if a == current_action:
	count += 1
	else:
	break
	return count


	# ─────────────────────────────────────────────
	# EFFICIENCY BONUS
	# ─────────────────────────────────────────────

	def _efficiency_bonus(step_count: int, max_steps: int) -> float:
	    """Bonus for finishing within 70% of the step budget (boundary included).

	    The payout is EFFICIENCY_BONUS scaled by the unused share of the
	    budget and rounded to 4 places; past the 70% mark it is 0.0.
	    """
	    if not step_count <= max_steps * 0.70:
	        return 0.0
	    # max(1, ...) guards the division when max_steps is 0.
	    used_share = step_count / max(1, max_steps)
	    return round(EFFICIENCY_BONUS * (1.0 - used_share), 4)


	# ─────────────────────────────────────────────
	# MAIN REWARD FUNCTION
	# ─────────────────────────────────────────────

	def compute_reward(
	    action: Action,
	    task_id: str,
	    difficulty: DifficultyLevel,
	    step_count: int,
	    previous_actions: list[str],
	    hints_used: int,
	    estimated_steps: int,
	    action_counts: dict[str, int],
	    # Round 2 extras (optional — backward compatible)
	    db_delta: float = 0.0,  # Performance score delta from DatabaseSimulator
	    baseline_score: float = 0.0,  # Scenario baseline score
	    current_score: float = 0.0,  # Current DB performance score
	    milestones_earned: set = None,  # Set of already-earned milestone thresholds
	) -> Reward:
	    """
	    Compute the dense reward signal for a single step.

	    Components, applied in this order:
	    1. Step reward — flat amount per action type from STEP_REWARDS
	       (0.05 for types not listed there).
	    2. Delta reward — db_delta / 100 * 0.40, clamped to [-0.40, 0.40].
	    3. Milestone bonus — one-time bonuses from check_milestones; only
	       evaluated when baseline_score and current_score are both > 0.
	    4. Grader / terminal score:
	       - SUBMIT_ANSWER / OPTIMIZE_QUERY: full grade() score, plus the
	         efficiency bonus when that score is >= 0.5.
	       - SUBMIT_REPORT: 0.60 * perf improvement + 0.20 * step efficiency
	         + 0.10, clamped to [0.001, 0.999], plus the efficiency bonus;
	         a flat 0.10 when baseline_score or current_score is not > 0.
	       - PROPOSE_FIX / IDENTIFY_ERROR: 40% / 20% of the grade() score.
	    5. Loop penalty — the current action type equals the previous one AND
	       the step brought no improvement (db_delta <= 0); LOOP_PENALTY is
	       multiplied by the streak length minus one, capped at 3.
	    6. Hint penalty — HINT_PENALTY on every REQUEST_HINT.
	    7. Backtrack penalty — BACKTRACK_PENALTY when db_delta < -1.0.
	    8. Budget penalty — BUDGET_EXHAUSTION_PEN for a non-terminal action
	       at step_count >= MAX_STEPS - 2.

	    The efficiency bonus is always measured against MAX_STEPS.

	    Args:
	        action: The agent's action; a None action or None payload yields
	            a fixed 0.001 "invalid" reward.
	        task_id: Passed through to grade().
	        step_count: Steps taken so far in the episode.
	        previous_actions: Action-type strings of earlier steps, oldest
	            first; used for loop detection.
	        db_delta, baseline_score, current_score: Round 2 performance
	            figures; the defaults disable the corresponding components.
	        milestones_earned: Thresholds already paid. It is updated in
	            place by check_milestones; when None, a throwaway set is used.
	        difficulty, hints_used, estimated_steps, action_counts: Accepted
	            for interface compatibility; not read by this function.

	    Returns:
	        Reward whose score is the sum of all components clamped to
	        [0.001, 0.999] and rounded to 4 places, with a per-component
	        breakdown (including "total") and a human-readable feedback string.
	    """
	    if milestones_earned is None:
	        milestones_earned = set()

	    breakdown = {}
	    feedback_parts = []
	    final_score = 0.0

	    # ── Edge case: null action ────────────────────────────────────
	    if action is None or action.payload is None:
	        return Reward(
	            score=0.001,
	            breakdown={"invalid_action": 0.001},
	            feedback="Invalid or null action received.",
	        )

	    action_type_enum = action.action_type
	    # Plain-string form of the action type, compared against previous_actions.
	    action_type_val = (
	        action_type_enum.value
	        if hasattr(action_type_enum, "value")
	        else str(action_type_enum)
	    )

	    # ── 1. Step reward ────────────────────────────────────────────
	    step_reward = STEP_REWARDS.get(action_type_enum, 0.05)
	    breakdown["step_reward"] = round(step_reward, 4)
	    final_score += step_reward
	    if step_reward > 0:
	        feedback_parts.append(f"Action '{action_type_val}' +{step_reward}.")

	    # ── 2. Delta reward (Round 2 DB performance change) ───────────
	    if db_delta != 0.0:
	        delta_reward = round((db_delta / 100.0) * 0.40, 4)
	        delta_reward = max(-0.40, min(0.40, delta_reward))
	        breakdown["delta_reward"] = delta_reward
	        final_score += delta_reward
	        if delta_reward > 0:
	            feedback_parts.append(f"DB improved +{db_delta:.1f} pts. Delta reward +{delta_reward}.")
	        elif delta_reward < 0:
	            feedback_parts.append(f"DB worsened {db_delta:.1f} pts. Penalty {delta_reward}.")

	    # ── 3. Milestone bonuses ──────────────────────────────────────
	    if baseline_score > 0 and current_score > 0:
	        milestone_bonus, newly_earned = check_milestones(
	            baseline_score, current_score, milestones_earned
	        )
	        if milestone_bonus > 0:
	            breakdown["milestone_bonus"] = milestone_bonus
	            final_score += milestone_bonus
	            # Report the highest threshold crossed on this step.
	            pct = int(max(newly_earned) * 100)
	            feedback_parts.append(f"🎯 Milestone! {pct}% improvement. Bonus +{milestone_bonus}!")

	    # ── 4. Grader score for terminal actions (Round 1 compat) ─────
	    grader_score = 0.0
	    is_terminal = action_type_enum in TERMINAL_ACTIONS

	    if is_terminal and action_type_enum != ActionType.SUBMIT_REPORT:
	        raw_score, grader_breakdown, grader_feedback = grade(action, task_id)
	        grader_score = raw_score
	        breakdown["grader_score"] = round(grader_score, 4)
	        breakdown["grader_breakdown"] = grader_breakdown
	        final_score += grader_score
	        feedback_parts.append(grader_feedback)

	        # Speed is only rewarded when the answer itself is at least half right.
	        if grader_score >= 0.5:
	            final_score += _record_efficiency_bonus(step_count, breakdown, feedback_parts)

	    elif is_terminal and action_type_enum == ActionType.SUBMIT_REPORT:
	        # Round 2 terminal: compute from DB performance
	        if baseline_score > 0 and current_score > 0:
	            perf_improvement = (current_score - baseline_score) / max(1.0, 100.0 - baseline_score)
	            step_efficiency = 1.0 - (step_count / max(1, MAX_STEPS))
	            terminal_score = round(
	                (perf_improvement * 0.60) + (step_efficiency * 0.20) + 0.10, 4
	            )
	            terminal_score = max(0.001, min(0.999, terminal_score))
	            breakdown["terminal_score"] = terminal_score
	            breakdown["perf_improvement"] = round(perf_improvement, 4)
	            breakdown["step_efficiency"] = round(step_efficiency, 4)
	            final_score += terminal_score
	            feedback_parts.append(
	                f"Report submitted. Performance: {baseline_score:.1f} → {current_score:.1f}. "
	                f"Terminal score: {terminal_score}."
	            )
	            # Efficiency bonus on submit_report too
	            final_score += _record_efficiency_bonus(step_count, breakdown, feedback_parts)
	        else:
	            # No performance figures supplied: flat participation score.
	            breakdown["terminal_score"] = 0.10
	            final_score += 0.10
	            feedback_parts.append("Report submitted.")

	    elif action_type_enum == ActionType.PROPOSE_FIX:
	        raw_score, grader_breakdown, _ = grade(action, task_id)
	        partial = round(raw_score * 0.4, 4)
	        breakdown["partial_grader_score"] = partial
	        final_score += partial

	    elif action_type_enum == ActionType.IDENTIFY_ERROR:
	        raw_score, _, _ = grade(action, task_id)
	        partial = round(raw_score * 0.2, 4)
	        breakdown["identification_score"] = partial
	        final_score += partial

	    # ── 5. Loop penalty ───────────────────────────────────────────
	    # Only charged when the step brought no improvement (db_delta <= 0):
	    # repeating an action type that keeps raising the DB score — e.g.
	    # several useful indexes in a row — is progress, not a loop.
	    if db_delta <= 0.0 and _detect_loop(previous_actions, action_type_val):
	        consecutive = _count_consecutive(previous_actions, action_type_val)
	        loop_pen = LOOP_PENALTY * min(consecutive - 1, 3)
	        final_score += loop_pen
	        breakdown["loop_penalty"] = round(loop_pen, 4)
	        feedback_parts.append(f"Loop detected ({consecutive}x). Penalty {loop_pen}.")

	    # ── 6. Hint penalty ───────────────────────────────────────────
	    if action_type_enum == ActionType.REQUEST_HINT:
	        final_score += HINT_PENALTY
	        breakdown["hint_penalty"] = HINT_PENALTY
	        feedback_parts.append(f"Hint requested. Penalty {HINT_PENALTY}.")

	    # ── 7. Backtrack penalty ──────────────────────────────────────
	    if db_delta < -1.0:
	        final_score += BACKTRACK_PENALTY
	        breakdown["backtrack_penalty"] = BACKTRACK_PENALTY
	        feedback_parts.append(f"Performance regressed. Backtrack penalty {BACKTRACK_PENALTY}.")

	    # ── 8. Budget exhaustion penalty ─────────────────────────────
	    if step_count >= MAX_STEPS - 2 and not is_terminal:
	        final_score += BUDGET_EXHAUSTION_PEN
	        breakdown["budget_penalty"] = BUDGET_EXHAUSTION_PEN
	        feedback_parts.append("Budget nearly exhausted. Submit report now!")

	    # ── Clamp to (0.001, 0.999) ───────────────────────────────────
	    final_score = round(max(0.001, min(0.999, final_score)), 4)
	    breakdown["total"] = final_score

	    feedback = " ".join(feedback_parts) if feedback_parts else "Step processed."

	    return Reward(score=final_score, breakdown=breakdown, feedback=feedback)


	def _record_efficiency_bonus(step_count: int, breakdown: dict, feedback_parts: list) -> float:
	    """Log the efficiency bonus (if any) and return the amount to add to the score.

	    Writes "efficiency_bonus" into `breakdown` and a message into
	    `feedback_parts` only when the bonus is positive; returns 0.0 otherwise.
	    """
	    eff_bonus = _efficiency_bonus(step_count, MAX_STEPS)
	    if eff_bonus > 0:
	        breakdown["efficiency_bonus"] = round(eff_bonus, 4)
	        feedback_parts.append(f"Efficiency bonus +{eff_bonus}.")
	        return eff_bonus
	    return 0.0


	# ─────────────────────────────────────────────
	# EPISODE DONE CONDITION
	# ─────────────────────────────────────────────

	def is_done(
	    action_type: ActionType,
	    step_count: int,
	    grader_score: float = 0.0,
	    target_reached: bool = False,
	) -> bool:
	    """
	    Decide whether the episode is over.

	    Any one of these ends it:
	    - the action is in TERMINAL_ACTIONS (report / final answer submitted)
	    - step_count has reached MAX_STEPS
	    - grader_score is at least 1.0
	    - target_reached is truthy
	    """
	    return bool(
	        action_type in TERMINAL_ACTIONS
	        or step_count >= MAX_STEPS
	        or grader_score >= 1.0
	        or target_reached
	    )