File size: 14,341 Bytes
3c1b0c7
 
 
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
8cb206e
 
 
 
 
 
3c1b0c7
 
8cb206e
 
 
3c1b0c7
8cb206e
 
 
 
 
3c1b0c7
8cb206e
 
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
8cb206e
 
3c1b0c7
 
 
 
 
 
 
 
 
 
 
 
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
3c1b0c7
 
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
 
 
 
 
 
 
 
 
8cb206e
 
 
 
 
3c1b0c7
 
8cb206e
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
8cb206e
 
 
 
3c1b0c7
8cb206e
3c1b0c7
 
 
 
ef20791
 
 
3c1b0c7
8cb206e
 
3c1b0c7
 
8cb206e
3c1b0c7
 
 
 
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
8cb206e
3c1b0c7
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
 
 
 
 
8cb206e
3c1b0c7
 
 
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c1b0c7
 
 
8cb206e
3c1b0c7
 
 
 
 
8cb206e
3c1b0c7
 
 
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
8cb206e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef20791
3c1b0c7
 
 
 
8cb206e
3c1b0c7
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
8cb206e
 
 
 
3c1b0c7
 
 
8cb206e
3c1b0c7
8cb206e
3c1b0c7
8cb206e
3c1b0c7
 
 
 
 
8cb206e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
from env.models import Action, Reward, DifficultyLevel, ActionType
from env.graders import grade

# ─────────────────────────────────────────────
#  CONSTANTS
# ─────────────────────────────────────────────

MAX_STEPS             = 50    # Step budget per episode (Round 2: long-horizon episodes)
HINT_PENALTY          = -0.10  # Added to the step score for every REQUEST_HINT action
LOOP_PENALTY          = -0.08  # Per consecutive repeat of the same action type (multiplier capped at 3)
INVALID_PENALTY       = -0.10  # Null / malformed action (NOTE: not referenced in the visible code)
BACKTRACK_PENALTY     = -0.05  # Applied when the step's db_delta is below -1.0
BUDGET_EXHAUSTION_PEN = -0.15  # Non-terminal action taken at step >= MAX_STEPS - 2
EFFICIENCY_BONUS      =  0.10  # Maximum bonus; scaled down by the fraction of the budget used (<= 70%)

# Milestone thresholds: {improvement_fraction: bonus_reward}
MILESTONE_THRESHOLDS = {
    0.25: 0.15,   # 25% improvement → +0.15 bonus
    0.50: 0.25,   # 50% improvement → +0.25 bonus
    0.75: 0.40,   # 75% improvement → +0.40 bonus
}

# Step rewards for Round 2 actions (dense signal).
# Action types missing from this table fall back to 0.05 in compute_reward.
STEP_REWARDS = {
    # ── Round 2 actions ──────────────────────────
    ActionType.INSPECT_QUERY:    0.05,   # Investigation rewarded
    ActionType.ANALYZE_INDEXES:  0.05,   # Investigation rewarded
    ActionType.CREATE_INDEX:     0.10,   # Core optimization action
    ActionType.REWRITE_QUERY:    0.15,   # High-value rewrite
    ActionType.ADD_COLUMN:       0.08,   # Denormalization
    ActionType.DROP_INDEX:       0.05,   # Clean up overhead
    ActionType.PARTITION_TABLE:  0.15,   # Big structural improvement
    ActionType.ANALYZE_STATS:    0.05,   # Maintenance action
    ActionType.SUBMIT_REPORT:    0.00,   # Terminal — score is computed from DB performance
    ActionType.REQUEST_HINT:     0.00,   # No reward, only penalty
    # ── Round 1 backward compat ──────────────────
    ActionType.IDENTIFY_ERROR:   0.15,
    ActionType.PROPOSE_FIX:      0.25,
    ActionType.SUBMIT_ANSWER:    0.00,
    ActionType.EXPLAIN_ISSUE:    0.10,
    ActionType.OPTIMIZE_QUERY:   0.20,
}

# Terminal actions that end the episode
TERMINAL_ACTIONS = {
    ActionType.SUBMIT_ANSWER,
    ActionType.OPTIMIZE_QUERY,
    ActionType.SUBMIT_REPORT,
}


# ─────────────────────────────────────────────
#  MILESTONE TRACKER
# ─────────────────────────────────────────────

def check_milestones(
    baseline_score: float,
    new_score:      float,
    earned:         set,
) -> tuple[float, list[float]]:
    """
    Award one-time milestone bonuses for relative score improvement.

    Improvement is measured as the gain over ``baseline_score`` divided by
    the remaining headroom to 100 (with the headroom floored at 1.0).

    Args:
        baseline_score: Score at the start of the episode.
        new_score:      Score after the current step.
        earned:         Thresholds already paid out; newly reached
                        thresholds are added to it in place.

    Returns:
        ``(total_bonus, newly_earned_thresholds)`` where ``total_bonus`` is
        rounded to 4 decimal places.
    """
    headroom = max(1.0, 100.0 - baseline_score)
    gain_fraction = (new_score - baseline_score) / headroom

    total_bonus = 0.0
    fresh_thresholds = []
    for cutoff, payout in MILESTONE_THRESHOLDS.items():
        # Skip milestones not reached yet or already paid this episode.
        if gain_fraction < cutoff or cutoff in earned:
            continue
        total_bonus += payout
        fresh_thresholds.append(cutoff)
        earned.add(cutoff)

    return round(total_bonus, 4), fresh_thresholds


# ─────────────────────────────────────────────
#  LOOP DETECTOR
# ─────────────────────────────────────────────

def _detect_loop(previous_actions: list[str], current_action: str) -> bool:
    """Returns True if agent has done the same action 2+ times in a row."""
    if len(previous_actions) < 1:
        return False
    last = previous_actions[-1]
    return last == current_action


def _count_consecutive(previous_actions: list[str], current_action: str) -> int:
    count = 1
    for a in reversed(previous_actions):
        if a == current_action:
            count += 1
        else:
            break
    return count


# ─────────────────────────────────────────────
#  EFFICIENCY BONUS
# ─────────────────────────────────────────────

def _efficiency_bonus(step_count: int, max_steps: int) -> float:
    """
    Scaled bonus for finishing at or below 70% of the step budget.

    Returns ``EFFICIENCY_BONUS * (1 - step_count / max_steps)`` rounded to
    4 decimal places when ``step_count <= 0.70 * max_steps``, else 0.0.
    """
    cutoff = max_steps * 0.70
    # max(1, ...) keeps the division defined when max_steps is 0.
    used_fraction = step_count / max(1, max_steps)
    return (
        round(EFFICIENCY_BONUS * (1.0 - used_fraction), 4)
        if step_count <= cutoff
        else 0.0
    )


# ─────────────────────────────────────────────
#  MAIN REWARD FUNCTION
# ─────────────────────────────────────────────

def compute_reward(
    action:           Action,
    task_id:          str,
    difficulty:       DifficultyLevel,
    step_count:       int,
    previous_actions: list[str],
    hints_used:       int,
    estimated_steps:  int,
    action_counts:    dict[str, int],
    # Round 2 extras (optional — backward compatible)
    db_delta:         float = 0.0,     # Performance score delta from DatabaseSimulator
    baseline_score:   float = 0.0,     # Scenario baseline score
    current_score:    float = 0.0,     # Current DB performance score
    milestones_earned: set  = None,    # Set of already-earned milestone thresholds
) -> Reward:
    """
    Computes dense reward signal for every step.

    Components (added in this order, then clamped to [0.001, 0.999]):
    1. Step reward       — STEP_REWARDS entry for the action type (0.05 fallback)
    2. Delta reward      — db_delta / 100 * 0.40, clamped to [-0.40, 0.40]
    3. Milestone bonus   — one-time bonus at 25%/50%/75% improvement
    4. Terminal / grader — grader score for Round 1 terminal actions,
                           DB-performance score for SUBMIT_REPORT, partial
                           grader credit for PROPOSE_FIX / IDENTIFY_ERROR,
                           plus an efficiency bonus on terminal actions
    5. Loop penalty      — same action type as the previous step
    6. Hint penalty      — cost per REQUEST_HINT
    7. Backtrack penalty — db_delta below -1.0
    8. Budget penalty    — non-terminal action at step >= MAX_STEPS - 2

    Args:
        action:            Action taken this step; a None action or None
                           payload short-circuits to a 0.001 score.
        task_id:           Task identifier forwarded to the grader.
        difficulty:        Accepted for interface compatibility; not read here.
        step_count:        Current step number within the episode.
        previous_actions:  Action-type values of earlier steps, oldest first.
        hints_used:        Accepted for interface compatibility; not read here.
        estimated_steps:   Accepted for interface compatibility; not read here.
        action_counts:     Accepted for interface compatibility; not read here.
        db_delta:          Change in DB performance score caused by this step.
        baseline_score:    Scenario baseline performance score.
        current_score:     Performance score after this step.
        milestones_earned: Thresholds already paid out; updated in place when
                           new milestones are reached. A fresh set is used
                           when omitted.

    Returns:
        Reward with the clamped score, a per-component breakdown (including
        a "total" entry) and a human-readable feedback string.
    """

    if milestones_earned is None:
        milestones_earned = set()

    breakdown      = {}
    feedback_parts = []
    final_score    = 0.0

    # ── Edge case: null action ────────────────────────────────────
    if action is None or action.payload is None:
        return Reward(
            score=0.001,
            breakdown={"invalid_action": 0.001},
            feedback="Invalid or null action received."
        )

    # Plain strings are tolerated for action_type; enums contribute .value.
    action_type_val  = action.action_type.value if hasattr(action.action_type, "value") else str(action.action_type)
    action_type_enum = action.action_type

    # ── 1. Step reward ────────────────────────────────────────────
    step_reward = STEP_REWARDS.get(action_type_enum, 0.05)
    breakdown["step_reward"] = round(step_reward, 4)
    final_score += step_reward
    if step_reward > 0:
        feedback_parts.append(f"Action '{action_type_val}' +{step_reward}.")

    # ── 2. Delta reward (Round 2 DB performance change) ───────────
    if db_delta != 0.0:
        delta_reward = round((db_delta / 100.0) * 0.40, 4)
        delta_reward = max(-0.40, min(0.40, delta_reward))
        breakdown["delta_reward"] = delta_reward
        final_score += delta_reward
        if delta_reward > 0:
            feedback_parts.append(f"DB improved +{db_delta:.1f} pts. Delta reward +{delta_reward}.")
        elif delta_reward < 0:
            feedback_parts.append(f"DB worsened {db_delta:.1f} pts. Penalty {delta_reward}.")

    # ── 3. Milestone bonuses ──────────────────────────────────────
    if baseline_score > 0 and current_score > 0:
        milestone_bonus, newly_earned = check_milestones(
            baseline_score, current_score, milestones_earned
        )
        if milestone_bonus > 0:
            breakdown["milestone_bonus"] = milestone_bonus
            final_score += milestone_bonus
            # Report the highest threshold crossed on this step.
            pct = int(max(newly_earned) * 100)
            feedback_parts.append(f"🎯 Milestone! {pct}% improvement. Bonus +{milestone_bonus}!")

    # ── 4. Grader score for terminal actions (Round 1 compat) ─────
    is_terminal = action_type_enum in TERMINAL_ACTIONS

    if is_terminal and action_type_enum != ActionType.SUBMIT_REPORT:
        grader_score, grader_breakdown, grader_feedback = grade(action, task_id)
        breakdown["grader_score"]     = round(grader_score, 4)
        breakdown["grader_breakdown"] = grader_breakdown
        final_score += grader_score
        feedback_parts.append(grader_feedback)

        # Efficiency only counts when the answer was at least half right.
        if grader_score >= 0.5:
            eff_bonus = _efficiency_bonus(step_count, MAX_STEPS)
            if eff_bonus > 0:
                final_score += eff_bonus
                breakdown["efficiency_bonus"] = round(eff_bonus, 4)
                feedback_parts.append(f"Efficiency bonus +{eff_bonus}.")

    elif is_terminal:
        # Only SUBMIT_REPORT reaches this branch.
        # Round 2 terminal: compute from DB performance
        if baseline_score > 0 and current_score > 0:
            perf_improvement = (current_score - baseline_score) / max(1.0, 100.0 - baseline_score)
            step_efficiency  = 1.0 - (step_count / max(1, MAX_STEPS))
            terminal_score   = round(
                (perf_improvement * 0.60) + (step_efficiency * 0.20) + 0.10, 4
            )
            terminal_score = max(0.001, min(0.999, terminal_score))
            breakdown["terminal_score"]    = terminal_score
            breakdown["perf_improvement"]  = round(perf_improvement, 4)
            breakdown["step_efficiency"]   = round(step_efficiency, 4)
            final_score += terminal_score
            feedback_parts.append(
                f"Report submitted. Performance: {baseline_score:.1f} → {current_score:.1f}. "
                f"Terminal score: {terminal_score}."
            )
            # Efficiency bonus on submit_report too
            eff_bonus = _efficiency_bonus(step_count, MAX_STEPS)
            if eff_bonus > 0:
                final_score += eff_bonus
                breakdown["efficiency_bonus"] = round(eff_bonus, 4)
                feedback_parts.append(f"Efficiency bonus +{eff_bonus}.")
        else:
            # No performance data available: flat credit for submitting.
            breakdown["terminal_score"] = 0.10
            final_score += 0.10
            feedback_parts.append("Report submitted.")

    elif action_type_enum == ActionType.PROPOSE_FIX:
        # Non-terminal fix proposals earn 40% of the grader score.
        raw_score, _, _ = grade(action, task_id)
        partial = round(raw_score * 0.4, 4)
        breakdown["partial_grader_score"] = partial
        final_score += partial

    elif action_type_enum == ActionType.IDENTIFY_ERROR:
        # Error identification earns 20% of the grader score.
        raw_score, _, _ = grade(action, task_id)
        partial = round(raw_score * 0.2, 4)
        breakdown["identification_score"] = partial
        final_score += partial

    # ── 5. Loop penalty ───────────────────────────────────────────
    if _detect_loop(previous_actions, action_type_val):
        consecutive = _count_consecutive(previous_actions, action_type_val)
        # Penalty grows with the run length but is capped at 3x; rounded so
        # the feedback text and breakdown show the same value.
        loop_pen    = round(LOOP_PENALTY * min(consecutive - 1, 3), 4)
        final_score += loop_pen
        breakdown["loop_penalty"] = loop_pen
        feedback_parts.append(f"Loop detected ({consecutive}x). Penalty {loop_pen}.")

    # ── 6. Hint penalty ───────────────────────────────────────────
    if action_type_enum == ActionType.REQUEST_HINT:
        final_score += HINT_PENALTY
        breakdown["hint_penalty"] = HINT_PENALTY
        feedback_parts.append(f"Hint requested. Penalty {HINT_PENALTY}.")

    # ── 7. Backtrack penalty ──────────────────────────────────────
    # Applied on top of the negative delta reward from step 2.
    if db_delta < -1.0:
        final_score += BACKTRACK_PENALTY
        breakdown["backtrack_penalty"] = BACKTRACK_PENALTY
        feedback_parts.append(f"Performance regressed. Backtrack penalty {BACKTRACK_PENALTY}.")

    # ── 8. Budget exhaustion penalty ─────────────────────────────
    if step_count >= MAX_STEPS - 2 and not is_terminal:
        final_score += BUDGET_EXHAUSTION_PEN
        breakdown["budget_penalty"] = BUDGET_EXHAUSTION_PEN
        feedback_parts.append("Budget nearly exhausted. Submit report now!")

    # ── Clamp to (0.001, 0.999) ───────────────────────────────────
    final_score = round(max(0.001, min(0.999, final_score)), 4)
    breakdown["total"] = final_score

    feedback = " ".join(feedback_parts) if feedback_parts else "Step processed."

    return Reward(score=final_score, breakdown=breakdown, feedback=feedback)


# ─────────────────────────────────────────────
#  EPISODE DONE CONDITION
# ─────────────────────────────────────────────

def is_done(
    action_type:  ActionType,
    step_count:   int,
    grader_score: float = 0.0,
    target_reached: bool = False,
) -> bool:
    """
    Decide whether the episode is over.

    The episode ends as soon as any of these holds:
    1. ``action_type`` is one of TERMINAL_ACTIONS
    2. ``step_count`` has reached MAX_STEPS
    3. ``grader_score`` is at least 1.0
    4. ``target_reached`` is truthy
    """
    # Conditions are evaluated left to right and short-circuit on the first hit.
    return (
        action_type in TERMINAL_ACTIONS
        or step_count >= MAX_STEPS
        or grader_score >= 1.0
        or bool(target_reached)
    )