sentinel-env / outputs /reward_report_task3_seed42.json
XcodeAddy's picture
Add evidence chart outputs
abef90f
{
"episode_id": "7f3bd324-24c1-4ca3-a365-794120f3de9b",
"session_id": "9c27bdf1-9627-476c-ac9b-48e59d137638",
"task_type": "task3",
"score": 0.6759,
"total_reward": 17.5723,
"reward_events": 26,
"component_averages": {
"adversarial_detections": 4.0,
"adversarial_poisonings": 5.0,
"completion_rate": 0.85,
"confidence_alignment": 0.7248,
"detection_rate": 0.444,
"domain_routing": 0.484,
"efficiency": 0.6778,
"stakes_awareness": 0.7384,
"task_accuracy": 0.68,
"trust_calibration": 0.442,
"verification_quality": 0.528
},
"events": [
{
"kind": "step",
"step_count": 1,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T01",
"domain": "ANALYZE",
"stakes": 0.2,
"reward": 0.9278,
"reason": "Correct result via delegate at stakes=0.20.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.978,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.5,
"trust_after": 0.706,
"trust_snapshot": {
"S0": 0.706,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.2,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 2,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T02",
"domain": "ANALYZE",
"stakes": 0.24,
"reward": 0.9256,
"reason": "Correct result via delegate at stakes=0.24.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.956,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.706,
"trust_after": 0.795,
"trust_snapshot": {
"S0": 0.795,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.24,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 3,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T03",
"domain": "VERIFY",
"stakes": 0.32,
"reward": 0.9233,
"reason": "Correct result via delegate at stakes=0.32.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.933,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.795,
"trust_after": 0.847,
"trust_snapshot": {
"S0": 0.847,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.32,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 4,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T04",
"domain": "SYNTHESIZE",
"stakes": 0.34,
"reward": 0.9211,
"reason": "Correct result via delegate at stakes=0.34.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.911,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.847,
"trust_after": 0.878,
"trust_snapshot": {
"S0": 0.878,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.34,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 5,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T05",
"domain": "PLAN",
"stakes": 0.4,
"reward": 0.9189,
"reason": "Correct result via delegate at stakes=0.40.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.889,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.878,
"trust_after": 0.9,
"trust_snapshot": {
"S0": 0.9,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.4,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 6,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T06",
"domain": "ANALYZE",
"stakes": 0.25,
"reward": 0.9167,
"reason": "Correct result via delegate at stakes=0.25.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.867,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.9,
"trust_after": 0.913,
"trust_snapshot": {
"S0": 0.913,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.25,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 7,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T07",
"domain": "VERIFY",
"stakes": 0.43,
"reward": 0.3414,
"reason": "Wrong result via delegate at stakes=0.43.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.5,
"efficiency": 0.844,
"confidence_alignment": 0.12,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.913,
"trust_after": 0.786,
"trust_snapshot": {
"S0": 0.786,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.43,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 8,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T07",
"domain": "VERIFY",
"stakes": 0.43,
"reward": 0.9122,
"reason": "Correct result via delegate at stakes=0.43.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.822,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.786,
"trust_after": 0.812,
"trust_snapshot": {
"S0": 0.812,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.43,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 9,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T08",
"domain": "EXECUTE",
"stakes": 0.5,
"reward": 0.91,
"reason": "Correct result via delegate at stakes=0.50.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.8,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.812,
"trust_after": 0.834,
"trust_snapshot": {
"S0": 0.834,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.5,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 10,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T09",
"domain": "VERIFY",
"stakes": 0.55,
"reward": 0.9078,
"reason": "Correct result via delegate at stakes=0.55.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.778,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.834,
"trust_after": 0.852,
"trust_snapshot": {
"S0": 0.852,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.55,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 11,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T10",
"domain": "SYNTHESIZE",
"stakes": 0.46,
"reward": 0.9056,
"reason": "Correct result via delegate at stakes=0.46.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.756,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.852,
"trust_after": 0.865,
"trust_snapshot": {
"S0": 0.865,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.46,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 12,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T11",
"domain": "PLAN",
"stakes": 0.58,
"reward": 0.9033,
"reason": "Correct result via delegate at stakes=0.58.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.733,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.865,
"trust_after": 0.878,
"trust_snapshot": {
"S0": 0.878,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.58,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 13,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T12",
"domain": "ANALYZE",
"stakes": 0.53,
"reward": 0.9011,
"reason": "Correct result via delegate at stakes=0.53.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.711,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.878,
"trust_after": 0.888,
"trust_snapshot": {
"S0": 0.888,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.53,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 14,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T13",
"domain": "SYNTHESIZE",
"stakes": 0.69,
"reward": 0.8989,
"reason": "Correct result via delegate at stakes=0.69.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 1.0,
"efficiency": 0.689,
"confidence_alignment": 1.0,
"verification_quality": 0.55,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.88,
"trust_before": 0.888,
"trust_after": 0.897,
"trust_snapshot": {
"S0": 0.897,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": false,
"stakes": 0.69,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 15,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T14",
"domain": "VERIFY",
"stakes": 0.72,
"reward": 0.0997,
"reason": "\u274c Adversarial poison accepted at stakes=0.72.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.0,
"efficiency": 0.667,
"confidence_alignment": 0.08,
"verification_quality": 0.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.897,
"trust_after": 0.825,
"trust_snapshot": {
"S0": 0.825,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.72,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 16,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T14",
"domain": "VERIFY",
"stakes": 0.72,
"reward": 0.0974,
"reason": "\u274c Adversarial poison accepted at stakes=0.72.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.0,
"efficiency": 0.644,
"confidence_alignment": 0.08,
"verification_quality": 0.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.825,
"trust_after": 0.763,
"trust_snapshot": {
"S0": 0.763,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.72,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 17,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T15",
"domain": "EXECUTE",
"stakes": 0.79,
"reward": 0.0952,
"reason": "\u274c Adversarial poison accepted at stakes=0.79.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.0,
"efficiency": 0.622,
"confidence_alignment": 0.08,
"verification_quality": 0.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.763,
"trust_after": 0.708,
"trust_snapshot": {
"S0": 0.708,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.79,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 18,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T15",
"domain": "EXECUTE",
"stakes": 0.79,
"reward": 0.093,
"reason": "\u274c Adversarial poison accepted at stakes=0.79.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.0,
"efficiency": 0.6,
"confidence_alignment": 0.08,
"verification_quality": 0.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.708,
"trust_after": 0.659,
"trust_snapshot": {
"S0": 0.659,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.79,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 19,
"action_type": "delegate",
"specialist_id": "S0",
"subtask_id": "T16",
"domain": "PLAN",
"stakes": 0.75,
"reward": 0.0908,
"reason": "\u274c Adversarial poison accepted at stakes=0.75.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.0,
"efficiency": 0.578,
"confidence_alignment": 0.08,
"verification_quality": 0.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.659,
"trust_after": 0.619,
"trust_snapshot": {
"S0": 0.619,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.75,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 21,
"action_type": "verify",
"specialist_id": "S0",
"subtask_id": "T16",
"domain": "PLAN",
"stakes": 0.75,
"reward": 0.925,
"reason": "\u2705 Adversarial detected at stakes=0.75 \u2014 evasion successful.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 0.99,
"efficiency": 0.533,
"confidence_alignment": 1.0,
"verification_quality": 1.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.619,
"trust_after": 0.583,
"trust_snapshot": {
"S0": 0.583,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.75,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 23,
"action_type": "verify",
"specialist_id": "S0",
"subtask_id": "T17",
"domain": "ANALYZE",
"stakes": 0.72,
"reward": 0.9206,
"reason": "\u2705 Adversarial detected at stakes=0.72 \u2014 evasion successful.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 0.99,
"efficiency": 0.489,
"confidence_alignment": 1.0,
"verification_quality": 1.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.583,
"trust_after": 0.551,
"trust_snapshot": {
"S0": 0.551,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.72,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 25,
"action_type": "verify",
"specialist_id": "S0",
"subtask_id": "T18",
"domain": "VERIFY",
"stakes": 0.85,
"reward": 0.9161,
"reason": "\u2705 Adversarial detected at stakes=0.85 \u2014 evasion successful.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 0.99,
"efficiency": 0.444,
"confidence_alignment": 1.0,
"verification_quality": 1.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.551,
"trust_after": 0.52,
"trust_snapshot": {
"S0": 0.52,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.85,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 27,
"action_type": "verify",
"specialist_id": "S0",
"subtask_id": "T19",
"domain": "SYNTHESIZE",
"stakes": 0.86,
"reward": 0.9117,
"reason": "\u2705 Adversarial detected at stakes=0.86 \u2014 evasion successful.",
"signal_breakdown": {
"task_accuracy": 1.0,
"stakes_awareness": 0.99,
"efficiency": 0.4,
"confidence_alignment": 1.0,
"verification_quality": 1.0,
"domain_routing": 0.5
},
"was_adversarial": true,
"confidence": 0.92,
"trust_before": 0.52,
"trust_after": 0.492,
"trust_snapshot": {
"S0": 0.492,
"S1": 0.5,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"triggered": true,
"stakes": 0.86,
"threshold": 0.7
}
},
{
"kind": "step",
"step_count": 29,
"action_type": "verify",
"specialist_id": "S1",
"subtask_id": "T20",
"domain": "EXECUTE",
"stakes": 0.92,
"reward": 0.3056,
"reason": "Wrong result via verify at stakes=0.92.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.5,
"efficiency": 0.356,
"confidence_alignment": 0.05,
"verification_quality": 0.75,
"domain_routing": 0.5
},
"was_adversarial": false,
"confidence": 0.95,
"trust_before": 0.5,
"trust_after": 0.207,
"trust_snapshot": {
"S0": 0.492,
"S1": 0.207,
"S2": 0.5,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1
}
},
{
"kind": "step",
"step_count": 31,
"action_type": "verify",
"specialist_id": "S2",
"subtask_id": "T20",
"domain": "EXECUTE",
"stakes": 0.92,
"reward": 0.3311,
"reason": "Wrong result via verify at stakes=0.92.",
"signal_breakdown": {
"task_accuracy": 0.0,
"stakes_awareness": 0.5,
"efficiency": 0.311,
"confidence_alignment": 0.55,
"verification_quality": 0.75,
"domain_routing": 0.1
},
"was_adversarial": false,
"confidence": 0.45,
"trust_before": 0.5,
"trust_after": 0.207,
"trust_snapshot": {
"S0": 0.492,
"S1": 0.207,
"S2": 0.207,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {
"step_cost": 1,
"in_domain": false,
"domain": "EXECUTE"
}
},
{
"kind": "terminal",
"step_count": 31,
"action_type": "terminal",
"specialist_id": null,
"subtask_id": null,
"domain": null,
"stakes": 0.0,
"reward": 0.5724,
"reason": "Mission complete. Completion=85%, Detection=44% (4/9), Calibration=0.442, Efficiency=0.311.",
"signal_breakdown": {
"completion_rate": 0.85,
"detection_rate": 0.444,
"trust_calibration": 0.442,
"efficiency": 0.311,
"adversarial_detections": 4,
"adversarial_poisonings": 5
},
"was_adversarial": false,
"confidence": null,
"trust_before": null,
"trust_after": null,
"trust_snapshot": {
"S0": 0.492,
"S1": 0.207,
"S2": 0.207,
"S3": 0.5,
"S4": 0.5
},
"result_metadata": {}
}
],
"formula": {
"task1_step": "0.43 accuracy + 0.30 stakes + 0.12 efficiency + 0.07 confidence + 0.04 domain + 0.04 verify",
"task2_step": "0.55 accuracy + 0.25 efficiency + 0.10 confidence + 0.10 domain",
"task3_step": "0.32 accuracy + 0.33 stakes + 0.10 efficiency + 0.10 confidence + 0.10 verify + 0.05 domain",
"task3_terminal": "0.35 completion + 0.30 detection + 0.25 calibration + 0.10 efficiency"
}
}