{ "base_model": "Qwen/Qwen2.5-1.5B-Instruct", "dataset_rows": 680, "episodes_per_task": 8, "random_rewards": [ -5.96, -11.48, -12.5 ], "heuristic_rewards": [ -4.72, -0.87, 5.89 ], "base_model_rewards": [ -2.92, -4.0, -4.28 ], "sft_model_rewards": [ -4.72, -0.87, 5.89 ], "improvement_sft_over_base": [ -1.8, 3.13, 10.17 ], "improvement_heuristic_over_random": [ 1.24, 10.61, 18.39 ], "reward_components_by_policy": { "random": { "wrong_actor_penalty": -3.12, "closure_wrong": -17.82, "step_cost": -2.61, "postmortem_empty": -1.0, "escalation_not_needed": -0.3, "clue_bonus": 0.48, "handoff_wrong": -0.8, "mitigation_wrong": -2.1, "rollback_ineffective": -1.65, "sla_exhausted": -1.2, "repeated_lookup_penalty": -0.02, "escalation_needed": 0.2 }, "heuristic": { "step_cost": -2.02, "clue_bonus": 2.52, "handoff_wrong": -0.8, "mitigation_wrong": -2.1, "closure_wrong": -9.9, "repeated_lookup_penalty": -0.16, "handoff_correct": 0.75, "postmortem_logged": 0.35, "mitigation_correct": 2.1, "closure_correct": 7.36, "closure_mitigation_bonus": 1.8, "speed_bonus": 0.6, "postmortem_bonus": 0.6, "closure_under_investigated": -0.8 }, "base_model": { "step_cost": -5.16, "clue_bonus": 0.24, "repeated_lookup_penalty": -1.24, "sla_exhausted": -5.04 }, "sft_model": { "step_cost": -2.02, "clue_bonus": 2.52, "handoff_wrong": -0.8, "mitigation_wrong": -2.1, "closure_wrong": -9.9, "repeated_lookup_penalty": -0.16, "handoff_correct": 0.75, "postmortem_logged": 0.35, "mitigation_correct": 2.1, "closure_correct": 7.36, "closure_mitigation_bonus": 1.8, "speed_bonus": 0.6, "postmortem_bonus": 0.6, "closure_under_investigated": -0.8 } } }