| { |
| "base_model": "Qwen/Qwen2.5-1.5B-Instruct", |
| "dataset_rows": 680, |
| "episodes_per_task": 8, |
| "random_rewards": [ |
| -5.96, |
| -11.48, |
| -12.5 |
| ], |
| "heuristic_rewards": [ |
| -4.72, |
| -0.87, |
| 5.89 |
| ], |
| "base_model_rewards": [ |
| -2.92, |
| -4.0, |
| -4.28 |
| ], |
| "sft_model_rewards": [ |
| -4.72, |
| -0.87, |
| 5.89 |
| ], |
| "improvement_sft_over_base": [ |
| -1.8, |
| 3.13, |
| 10.17 |
| ], |
| "improvement_heuristic_over_random": [ |
| 1.24, |
| 10.61, |
| 18.39 |
| ], |
| "reward_components_by_policy": { |
| "random": { |
| "wrong_actor_penalty": -3.12, |
| "closure_wrong": -17.82, |
| "step_cost": -2.61, |
| "postmortem_empty": -1.0, |
| "escalation_not_needed": -0.3, |
| "clue_bonus": 0.48, |
| "handoff_wrong": -0.8, |
| "mitigation_wrong": -2.1, |
| "rollback_ineffective": -1.65, |
| "sla_exhausted": -1.2, |
| "repeated_lookup_penalty": -0.02, |
| "escalation_needed": 0.2 |
| }, |
| "heuristic": { |
| "step_cost": -2.02, |
| "clue_bonus": 2.52, |
| "handoff_wrong": -0.8, |
| "mitigation_wrong": -2.1, |
| "closure_wrong": -9.9, |
| "repeated_lookup_penalty": -0.16, |
| "handoff_correct": 0.75, |
| "postmortem_logged": 0.35, |
| "mitigation_correct": 2.1, |
| "closure_correct": 7.36, |
| "closure_mitigation_bonus": 1.8, |
| "speed_bonus": 0.6, |
| "postmortem_bonus": 0.6, |
| "closure_under_investigated": -0.8 |
| }, |
| "base_model": { |
| "step_cost": -5.16, |
| "clue_bonus": 0.24, |
| "repeated_lookup_penalty": -1.24, |
| "sla_exhausted": -5.04 |
| }, |
| "sft_model": { |
| "step_cost": -2.02, |
| "clue_bonus": 2.52, |
| "handoff_wrong": -0.8, |
| "mitigation_wrong": -2.1, |
| "closure_wrong": -9.9, |
| "repeated_lookup_penalty": -0.16, |
| "handoff_correct": 0.75, |
| "postmortem_logged": 0.35, |
| "mitigation_correct": 2.1, |
| "closure_correct": 7.36, |
| "closure_mitigation_bonus": 1.8, |
| "speed_bonus": 0.6, |
| "postmortem_bonus": 0.6, |
| "closure_under_investigated": -0.8 |
| } |
| } |
| } |