Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

File size: 14,341 Bytes

3c1b0c7
 
 
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
8cb206e
 
 
 
 
 
3c1b0c7
 
8cb206e
 
 
3c1b0c7
8cb206e
 
 
 
 
3c1b0c7
8cb206e
 
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
8cb206e
 
3c1b0c7
 
 
 
 
 
 
 
 
 
 
 
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
3c1b0c7
 
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
 
 
 
 
 
 
 
 
8cb206e
 
 
 
 
3c1b0c7
 
8cb206e
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
8cb206e
 
 
 
3c1b0c7
8cb206e
3c1b0c7
 
 
 
ef20791
 
 
3c1b0c7
8cb206e
 
3c1b0c7
 
8cb206e
3c1b0c7
 
 
 
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
8cb206e
3c1b0c7
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
 
 
 
 
8cb206e
3c1b0c7
 
 
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
 
8cb206e
3c1b0c7
 
 
 
 
8cb206e
3c1b0c7
 
 
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef20791
3c1b0c7
 
 
 
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
8cb206e
 
 
 
3c1b0c7
 
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
 
 
 
8cb206e

from env.models import Action, Reward, DifficultyLevel, ActionType
from env.graders import grade

# ─────────────────────────────────────────────
#  CONSTANTS
# ─────────────────────────────────────────────

MAX_STEPS             = 50    # Round 2: long-horizon episodes
HINT_PENALTY          = -0.10  # Per hint requested (increased from Round 1)
LOOP_PENALTY          = -0.08  # Same action on same target 2+ times, no improvement
INVALID_PENALTY       = -0.10  # Null / malformed action
BACKTRACK_PENALTY     = -0.05  # Action makes score worse than previous best
BUDGET_EXHAUSTION_PEN = -0.15  # Reaching max_steps without submitting report
EFFICIENCY_BONUS      =  0.10  # Solved in < 70% of max_steps

# Milestone thresholds: {improvement_fraction: bonus_reward}
MILESTONE_THRESHOLDS = {
    0.25: 0.15,   # 25% improvement → +0.15 bonus
    0.50: 0.25,   # 50% improvement → +0.25 bonus
    0.75: 0.40,   # 75% improvement → +0.40 bonus
}

# Step rewards for Round 2 actions (dense signal)
STEP_REWARDS = {
    # ── Round 2 actions ──────────────────────────
    ActionType.INSPECT_QUERY:    0.05,   # Investigation rewarded
    ActionType.ANALYZE_INDEXES:  0.05,   # Investigation rewarded
    ActionType.CREATE_INDEX:     0.10,   # Core optimization action
    ActionType.REWRITE_QUERY:    0.15,   # High-value rewrite
    ActionType.ADD_COLUMN:       0.08,   # Denormalization
    ActionType.DROP_INDEX:       0.05,   # Clean up overhead
    ActionType.PARTITION_TABLE:  0.15,   # Big structural improvement
    ActionType.ANALYZE_STATS:    0.05,   # Maintenance action
    ActionType.SUBMIT_REPORT:    0.00,   # Terminal — score comes from grader
    ActionType.REQUEST_HINT:     0.00,   # No reward, only penalty
    # ── Round 1 backward compat ──────────────────
    ActionType.IDENTIFY_ERROR:   0.15,
    ActionType.PROPOSE_FIX:      0.25,
    ActionType.SUBMIT_ANSWER:    0.00,
    ActionType.EXPLAIN_ISSUE:    0.10,
    ActionType.OPTIMIZE_QUERY:   0.20,
}

# Terminal actions that end the episode
TERMINAL_ACTIONS = {
    ActionType.SUBMIT_ANSWER,
    ActionType.OPTIMIZE_QUERY,
    ActionType.SUBMIT_REPORT,
}


# ─────────────────────────────────────────────
#  MILESTONE TRACKER
# ─────────────────────────────────────────────

def check_milestones(
    baseline_score: float,
    new_score:      float,
    earned:         set,
) -> tuple[float, list[float]]:
    """
    Returns (total_bonus, newly_earned_thresholds).
    One-time bonuses — each milestone only paid once per episode.
    """
    max_possible   = max(1.0, 100.0 - baseline_score)
    improvement    = (new_score - baseline_score) / max_possible
    bonus          = 0.0
    newly_earned   = []

    for threshold, reward in MILESTONE_THRESHOLDS.items():
        if improvement >= threshold and threshold not in earned:
            bonus        += reward
            newly_earned.append(threshold)
            earned.add(threshold)

    return round(bonus, 4), newly_earned


# ─────────────────────────────────────────────
#  LOOP DETECTOR
# ─────────────────────────────────────────────

def _detect_loop(previous_actions: list[str], current_action: str) -> bool:
    """Returns True if agent has done the same action 2+ times in a row."""
    if len(previous_actions) < 1:
        return False
    last = previous_actions[-1]
    return last == current_action


def _count_consecutive(previous_actions: list[str], current_action: str) -> int:
    count = 1
    for a in reversed(previous_actions):
        if a == current_action:
            count += 1
        else:
            break
    return count


# ─────────────────────────────────────────────
#  EFFICIENCY BONUS
# ─────────────────────────────────────────────

def _efficiency_bonus(step_count: int, max_steps: int) -> float:
    """Bonus if agent finishes in < 70% of budget."""
    threshold = max_steps * 0.70
    if step_count <= threshold:
        ratio = step_count / max(1, max_steps)
        return round(EFFICIENCY_BONUS * (1.0 - ratio), 4)
    return 0.0


# ─────────────────────────────────────────────
#  MAIN REWARD FUNCTION
# ─────────────────────────────────────────────

def compute_reward(
    action:           Action,
    task_id:          str,
    difficulty:       DifficultyLevel,
    step_count:       int,
    previous_actions: list[str],
    hints_used:       int,
    estimated_steps:  int,
    action_counts:    dict[str, int],
    # Round 2 extras (optional — backward compatible)
    db_delta:         float = 0.0,     # Performance score delta from DatabaseSimulator
    baseline_score:   float = 0.0,     # Scenario baseline score
    current_score:    float = 0.0,     # Current DB performance score
    milestones_earned: set  = None,    # Set of already-earned milestone thresholds
) -> Reward:
    """
    Computes dense reward signal for every step.

    Components:
    1. Step reward     — small reward for valid action type
    2. Delta reward    — proportional to DB performance improvement (Round 2)
    3. Milestone bonus — one-time bonus at 25%/50%/75% improvement
    4. Grader score    — full score on terminal actions (Round 1 compat)
    5. Loop penalty    — repeated same action with no improvement
    6. Hint penalty    — cost per hint
    7. Backtrack penalty — action made things worse
    8. Budget penalty  — approaching max_steps without submitting
    9. Efficiency bonus — solved fast
    """

    if milestones_earned is None:
        milestones_earned = set()

    breakdown      = {}
    feedback_parts = []
    final_score    = 0.0

    # ── Edge case: null action ────────────────────────────────────
    if action is None or action.payload is None:
        return Reward(
            score=0.001,
            breakdown={"invalid_action": 0.001},
            feedback="Invalid or null action received."
        )

    action_type_val  = action.action_type.value if hasattr(action.action_type, "value") else str(action.action_type)
    action_type_enum = action.action_type

    # ── 1. Step reward ────────────────────────────────────────────
    step_reward = STEP_REWARDS.get(action_type_enum, 0.05)
    breakdown["step_reward"] = round(step_reward, 4)
    final_score += step_reward
    if step_reward > 0:
        feedback_parts.append(f"Action '{action_type_val}' +{step_reward}.")

    # ── 2. Delta reward (Round 2 DB performance change) ───────────
    if db_delta != 0.0:
        delta_reward = round((db_delta / 100.0) * 0.40, 4)
        delta_reward = max(-0.40, min(0.40, delta_reward))
        breakdown["delta_reward"] = delta_reward
        final_score += delta_reward
        if delta_reward > 0:
            feedback_parts.append(f"DB improved +{db_delta:.1f} pts. Delta reward +{delta_reward}.")
        elif delta_reward < 0:
            feedback_parts.append(f"DB worsened {db_delta:.1f} pts. Penalty {delta_reward}.")

    # ── 3. Milestone bonuses ──────────────────────────────────────
    if baseline_score > 0 and current_score > 0:
        milestone_bonus, newly_earned = check_milestones(
            baseline_score, current_score, milestones_earned
        )
        if milestone_bonus > 0:
            breakdown["milestone_bonus"] = milestone_bonus
            final_score += milestone_bonus
            pct = int(max(newly_earned) * 100)
            feedback_parts.append(f"🎯 Milestone! {pct}% improvement. Bonus +{milestone_bonus}!")

    # ── 4. Grader score for terminal actions (Round 1 compat) ─────
    grader_score = 0.0
    is_terminal  = action_type_enum in TERMINAL_ACTIONS

    if is_terminal and action_type_enum != ActionType.SUBMIT_REPORT:
        raw_score, grader_breakdown, grader_feedback = grade(action, task_id)
        grader_score = raw_score
        breakdown["grader_score"]     = round(grader_score, 4)
        breakdown["grader_breakdown"] = grader_breakdown
        final_score += grader_score
        feedback_parts.append(grader_feedback)

        if grader_score >= 0.5:
            eff_bonus = _efficiency_bonus(step_count, MAX_STEPS)
            if eff_bonus > 0:
                final_score += eff_bonus
                breakdown["efficiency_bonus"] = round(eff_bonus, 4)
                feedback_parts.append(f"Efficiency bonus +{eff_bonus}.")

    elif is_terminal and action_type_enum == ActionType.SUBMIT_REPORT:
        # Round 2 terminal: compute from DB performance
        if baseline_score > 0 and current_score > 0:
            perf_improvement = (current_score - baseline_score) / max(1.0, 100.0 - baseline_score)
            step_efficiency  = 1.0 - (step_count / max(1, MAX_STEPS))
            terminal_score   = round(
                (perf_improvement * 0.60) + (step_efficiency * 0.20) + 0.10, 4
            )
            terminal_score = max(0.001, min(0.999, terminal_score))
            breakdown["terminal_score"]    = terminal_score
            breakdown["perf_improvement"]  = round(perf_improvement, 4)
            breakdown["step_efficiency"]   = round(step_efficiency, 4)
            final_score += terminal_score
            feedback_parts.append(
                f"Report submitted. Performance: {baseline_score:.1f} → {current_score:.1f}. "
                f"Terminal score: {terminal_score}."
            )
            # Efficiency bonus on submit_report too
            eff_bonus = _efficiency_bonus(step_count, MAX_STEPS)
            if eff_bonus > 0:
                final_score += eff_bonus
                breakdown["efficiency_bonus"] = round(eff_bonus, 4)
                feedback_parts.append(f"Efficiency bonus +{eff_bonus}.")
        else:
            breakdown["terminal_score"] = 0.10
            final_score += 0.10
            feedback_parts.append("Report submitted.")

    elif action_type_enum == ActionType.PROPOSE_FIX:
        raw_score, grader_breakdown, _ = grade(action, task_id)
        partial = round(raw_score * 0.4, 4)
        breakdown["partial_grader_score"] = partial
        final_score += partial

    elif action_type_enum == ActionType.IDENTIFY_ERROR:
        raw_score, _, _ = grade(action, task_id)
        partial = round(raw_score * 0.2, 4)
        breakdown["identification_score"] = partial
        final_score += partial

    # ── 5. Loop penalty ───────────────────────────────────────────
    if _detect_loop(previous_actions, action_type_val):
        consecutive = _count_consecutive(previous_actions, action_type_val)
        loop_pen    = LOOP_PENALTY * min(consecutive - 1, 3)
        final_score += loop_pen
        breakdown["loop_penalty"] = round(loop_pen, 4)
        feedback_parts.append(f"Loop detected ({consecutive}x). Penalty {loop_pen}.")

    # ── 6. Hint penalty ───────────────────────────────────────────
    if action_type_enum == ActionType.REQUEST_HINT:
        final_score += HINT_PENALTY
        breakdown["hint_penalty"] = HINT_PENALTY
        feedback_parts.append(f"Hint requested. Penalty {HINT_PENALTY}.")

    # ── 7. Backtrack penalty ──────────────────────────────────────
    if db_delta < -1.0:
        final_score += BACKTRACK_PENALTY
        breakdown["backtrack_penalty"] = BACKTRACK_PENALTY
        feedback_parts.append(f"Performance regressed. Backtrack penalty {BACKTRACK_PENALTY}.")

    # ── 8. Budget exhaustion penalty ─────────────────────────────
    if step_count >= MAX_STEPS - 2 and not is_terminal:
        final_score += BUDGET_EXHAUSTION_PEN
        breakdown["budget_penalty"] = BUDGET_EXHAUSTION_PEN
        feedback_parts.append("Budget nearly exhausted. Submit report now!")

    # ── Clamp to (0.001, 0.999) ───────────────────────────────────
    final_score = round(max(0.001, min(0.999, final_score)), 4)
    breakdown["total"] = final_score

    feedback = " ".join(feedback_parts) if feedback_parts else "Step processed."

    return Reward(score=final_score, breakdown=breakdown, feedback=feedback)


# ─────────────────────────────────────────────
#  EPISODE DONE CONDITION
# ─────────────────────────────────────────────

def is_done(
    action_type:  ActionType,
    step_count:   int,
    grader_score: float = 0.0,
    target_reached: bool = False,
) -> bool:
    """
    Episode ends when:
    1. Agent submits report / final answer
    2. Max steps reached
    3. Perfect score / target reached
    """
    if action_type in TERMINAL_ACTIONS:
        return True
    if step_count >= MAX_STEPS:
        return True
    if grader_score >= 1.0:
        return True
    if target_reached:
        return True
    return False