Spaces:
Running
Running
| { | |
| "episode_id": "7f3bd324-24c1-4ca3-a365-794120f3de9b", | |
| "session_id": "9c27bdf1-9627-476c-ac9b-48e59d137638", | |
| "task_type": "task3", | |
| "score": 0.6759, | |
| "total_reward": 17.5723, | |
| "reward_events": 26, | |
| "component_averages": { | |
| "adversarial_detections": 4.0, | |
| "adversarial_poisonings": 5.0, | |
| "completion_rate": 0.85, | |
| "confidence_alignment": 0.7248, | |
| "detection_rate": 0.444, | |
| "domain_routing": 0.484, | |
| "efficiency": 0.6778, | |
| "stakes_awareness": 0.7384, | |
| "task_accuracy": 0.68, | |
| "trust_calibration": 0.442, | |
| "verification_quality": 0.528 | |
| }, | |
| "events": [ | |
| { | |
| "kind": "step", | |
| "step_count": 1, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T01", | |
| "domain": "ANALYZE", | |
| "stakes": 0.2, | |
| "reward": 0.9278, | |
| "reason": "Correct result via delegate at stakes=0.20.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.978, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.5, | |
| "trust_after": 0.706, | |
| "trust_snapshot": { | |
| "S0": 0.706, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.2, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 2, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T02", | |
| "domain": "ANALYZE", | |
| "stakes": 0.24, | |
| "reward": 0.9256, | |
| "reason": "Correct result via delegate at stakes=0.24.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.956, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.706, | |
| "trust_after": 0.795, | |
| "trust_snapshot": { | |
| "S0": 0.795, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.24, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 3, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T03", | |
| "domain": "VERIFY", | |
| "stakes": 0.32, | |
| "reward": 0.9233, | |
| "reason": "Correct result via delegate at stakes=0.32.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.933, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.795, | |
| "trust_after": 0.847, | |
| "trust_snapshot": { | |
| "S0": 0.847, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.32, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 4, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T04", | |
| "domain": "SYNTHESIZE", | |
| "stakes": 0.34, | |
| "reward": 0.9211, | |
| "reason": "Correct result via delegate at stakes=0.34.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.911, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.847, | |
| "trust_after": 0.878, | |
| "trust_snapshot": { | |
| "S0": 0.878, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.34, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 5, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T05", | |
| "domain": "PLAN", | |
| "stakes": 0.4, | |
| "reward": 0.9189, | |
| "reason": "Correct result via delegate at stakes=0.40.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.889, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.878, | |
| "trust_after": 0.9, | |
| "trust_snapshot": { | |
| "S0": 0.9, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.4, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 6, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T06", | |
| "domain": "ANALYZE", | |
| "stakes": 0.25, | |
| "reward": 0.9167, | |
| "reason": "Correct result via delegate at stakes=0.25.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.867, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.9, | |
| "trust_after": 0.913, | |
| "trust_snapshot": { | |
| "S0": 0.913, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.25, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 7, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T07", | |
| "domain": "VERIFY", | |
| "stakes": 0.43, | |
| "reward": 0.3414, | |
| "reason": "Wrong result via delegate at stakes=0.43.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.5, | |
| "efficiency": 0.844, | |
| "confidence_alignment": 0.12, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.913, | |
| "trust_after": 0.786, | |
| "trust_snapshot": { | |
| "S0": 0.786, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.43, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 8, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T07", | |
| "domain": "VERIFY", | |
| "stakes": 0.43, | |
| "reward": 0.9122, | |
| "reason": "Correct result via delegate at stakes=0.43.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.822, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.786, | |
| "trust_after": 0.812, | |
| "trust_snapshot": { | |
| "S0": 0.812, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.43, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 9, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T08", | |
| "domain": "EXECUTE", | |
| "stakes": 0.5, | |
| "reward": 0.91, | |
| "reason": "Correct result via delegate at stakes=0.50.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.8, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.812, | |
| "trust_after": 0.834, | |
| "trust_snapshot": { | |
| "S0": 0.834, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.5, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 10, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T09", | |
| "domain": "VERIFY", | |
| "stakes": 0.55, | |
| "reward": 0.9078, | |
| "reason": "Correct result via delegate at stakes=0.55.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.778, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.834, | |
| "trust_after": 0.852, | |
| "trust_snapshot": { | |
| "S0": 0.852, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.55, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 11, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T10", | |
| "domain": "SYNTHESIZE", | |
| "stakes": 0.46, | |
| "reward": 0.9056, | |
| "reason": "Correct result via delegate at stakes=0.46.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.756, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.852, | |
| "trust_after": 0.865, | |
| "trust_snapshot": { | |
| "S0": 0.865, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.46, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 12, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T11", | |
| "domain": "PLAN", | |
| "stakes": 0.58, | |
| "reward": 0.9033, | |
| "reason": "Correct result via delegate at stakes=0.58.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.733, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.865, | |
| "trust_after": 0.878, | |
| "trust_snapshot": { | |
| "S0": 0.878, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.58, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 13, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T12", | |
| "domain": "ANALYZE", | |
| "stakes": 0.53, | |
| "reward": 0.9011, | |
| "reason": "Correct result via delegate at stakes=0.53.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.711, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.878, | |
| "trust_after": 0.888, | |
| "trust_snapshot": { | |
| "S0": 0.888, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.53, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 14, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T13", | |
| "domain": "SYNTHESIZE", | |
| "stakes": 0.69, | |
| "reward": 0.8989, | |
| "reason": "Correct result via delegate at stakes=0.69.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 1.0, | |
| "efficiency": 0.689, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 0.55, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.88, | |
| "trust_before": 0.888, | |
| "trust_after": 0.897, | |
| "trust_snapshot": { | |
| "S0": 0.897, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": false, | |
| "stakes": 0.69, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 15, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T14", | |
| "domain": "VERIFY", | |
| "stakes": 0.72, | |
| "reward": 0.0997, | |
| "reason": "\u274c Adversarial poison accepted at stakes=0.72.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.0, | |
| "efficiency": 0.667, | |
| "confidence_alignment": 0.08, | |
| "verification_quality": 0.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.897, | |
| "trust_after": 0.825, | |
| "trust_snapshot": { | |
| "S0": 0.825, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.72, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 16, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T14", | |
| "domain": "VERIFY", | |
| "stakes": 0.72, | |
| "reward": 0.0974, | |
| "reason": "\u274c Adversarial poison accepted at stakes=0.72.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.0, | |
| "efficiency": 0.644, | |
| "confidence_alignment": 0.08, | |
| "verification_quality": 0.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.825, | |
| "trust_after": 0.763, | |
| "trust_snapshot": { | |
| "S0": 0.763, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.72, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 17, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T15", | |
| "domain": "EXECUTE", | |
| "stakes": 0.79, | |
| "reward": 0.0952, | |
| "reason": "\u274c Adversarial poison accepted at stakes=0.79.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.0, | |
| "efficiency": 0.622, | |
| "confidence_alignment": 0.08, | |
| "verification_quality": 0.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.763, | |
| "trust_after": 0.708, | |
| "trust_snapshot": { | |
| "S0": 0.708, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.79, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 18, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T15", | |
| "domain": "EXECUTE", | |
| "stakes": 0.79, | |
| "reward": 0.093, | |
| "reason": "\u274c Adversarial poison accepted at stakes=0.79.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.0, | |
| "efficiency": 0.6, | |
| "confidence_alignment": 0.08, | |
| "verification_quality": 0.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.708, | |
| "trust_after": 0.659, | |
| "trust_snapshot": { | |
| "S0": 0.659, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.79, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 19, | |
| "action_type": "delegate", | |
| "specialist_id": "S0", | |
| "subtask_id": "T16", | |
| "domain": "PLAN", | |
| "stakes": 0.75, | |
| "reward": 0.0908, | |
| "reason": "\u274c Adversarial poison accepted at stakes=0.75.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.0, | |
| "efficiency": 0.578, | |
| "confidence_alignment": 0.08, | |
| "verification_quality": 0.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.659, | |
| "trust_after": 0.619, | |
| "trust_snapshot": { | |
| "S0": 0.619, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.75, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 21, | |
| "action_type": "verify", | |
| "specialist_id": "S0", | |
| "subtask_id": "T16", | |
| "domain": "PLAN", | |
| "stakes": 0.75, | |
| "reward": 0.925, | |
| "reason": "\u2705 Adversarial detected at stakes=0.75 \u2014 evasion successful.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 0.99, | |
| "efficiency": 0.533, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 1.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.619, | |
| "trust_after": 0.583, | |
| "trust_snapshot": { | |
| "S0": 0.583, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.75, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 23, | |
| "action_type": "verify", | |
| "specialist_id": "S0", | |
| "subtask_id": "T17", | |
| "domain": "ANALYZE", | |
| "stakes": 0.72, | |
| "reward": 0.9206, | |
| "reason": "\u2705 Adversarial detected at stakes=0.72 \u2014 evasion successful.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 0.99, | |
| "efficiency": 0.489, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 1.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.583, | |
| "trust_after": 0.551, | |
| "trust_snapshot": { | |
| "S0": 0.551, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.72, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 25, | |
| "action_type": "verify", | |
| "specialist_id": "S0", | |
| "subtask_id": "T18", | |
| "domain": "VERIFY", | |
| "stakes": 0.85, | |
| "reward": 0.9161, | |
| "reason": "\u2705 Adversarial detected at stakes=0.85 \u2014 evasion successful.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 0.99, | |
| "efficiency": 0.444, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 1.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.551, | |
| "trust_after": 0.52, | |
| "trust_snapshot": { | |
| "S0": 0.52, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.85, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 27, | |
| "action_type": "verify", | |
| "specialist_id": "S0", | |
| "subtask_id": "T19", | |
| "domain": "SYNTHESIZE", | |
| "stakes": 0.86, | |
| "reward": 0.9117, | |
| "reason": "\u2705 Adversarial detected at stakes=0.86 \u2014 evasion successful.", | |
| "signal_breakdown": { | |
| "task_accuracy": 1.0, | |
| "stakes_awareness": 0.99, | |
| "efficiency": 0.4, | |
| "confidence_alignment": 1.0, | |
| "verification_quality": 1.0, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": true, | |
| "confidence": 0.92, | |
| "trust_before": 0.52, | |
| "trust_after": 0.492, | |
| "trust_snapshot": { | |
| "S0": 0.492, | |
| "S1": 0.5, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "triggered": true, | |
| "stakes": 0.86, | |
| "threshold": 0.7 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 29, | |
| "action_type": "verify", | |
| "specialist_id": "S1", | |
| "subtask_id": "T20", | |
| "domain": "EXECUTE", | |
| "stakes": 0.92, | |
| "reward": 0.3056, | |
| "reason": "Wrong result via verify at stakes=0.92.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.5, | |
| "efficiency": 0.356, | |
| "confidence_alignment": 0.05, | |
| "verification_quality": 0.75, | |
| "domain_routing": 0.5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.95, | |
| "trust_before": 0.5, | |
| "trust_after": 0.207, | |
| "trust_snapshot": { | |
| "S0": 0.492, | |
| "S1": 0.207, | |
| "S2": 0.5, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1 | |
| } | |
| }, | |
| { | |
| "kind": "step", | |
| "step_count": 31, | |
| "action_type": "verify", | |
| "specialist_id": "S2", | |
| "subtask_id": "T20", | |
| "domain": "EXECUTE", | |
| "stakes": 0.92, | |
| "reward": 0.3311, | |
| "reason": "Wrong result via verify at stakes=0.92.", | |
| "signal_breakdown": { | |
| "task_accuracy": 0.0, | |
| "stakes_awareness": 0.5, | |
| "efficiency": 0.311, | |
| "confidence_alignment": 0.55, | |
| "verification_quality": 0.75, | |
| "domain_routing": 0.1 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": 0.45, | |
| "trust_before": 0.5, | |
| "trust_after": 0.207, | |
| "trust_snapshot": { | |
| "S0": 0.492, | |
| "S1": 0.207, | |
| "S2": 0.207, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": { | |
| "step_cost": 1, | |
| "in_domain": false, | |
| "domain": "EXECUTE" | |
| } | |
| }, | |
| { | |
| "kind": "terminal", | |
| "step_count": 31, | |
| "action_type": "terminal", | |
| "specialist_id": null, | |
| "subtask_id": null, | |
| "domain": null, | |
| "stakes": 0.0, | |
| "reward": 0.5724, | |
| "reason": "Mission complete. Completion=85%, Detection=44% (4/9), Calibration=0.442, Efficiency=0.311.", | |
| "signal_breakdown": { | |
| "completion_rate": 0.85, | |
| "detection_rate": 0.444, | |
| "trust_calibration": 0.442, | |
| "efficiency": 0.311, | |
| "adversarial_detections": 4, | |
| "adversarial_poisonings": 5 | |
| }, | |
| "was_adversarial": false, | |
| "confidence": null, | |
| "trust_before": null, | |
| "trust_after": null, | |
| "trust_snapshot": { | |
| "S0": 0.492, | |
| "S1": 0.207, | |
| "S2": 0.207, | |
| "S3": 0.5, | |
| "S4": 0.5 | |
| }, | |
| "result_metadata": {} | |
| } | |
| ], | |
| "formula": { | |
| "task1_step": "0.43 accuracy + 0.30 stakes + 0.12 efficiency + 0.07 confidence + 0.04 domain + 0.04 verify", | |
| "task2_step": "0.55 accuracy + 0.25 efficiency + 0.10 confidence + 0.10 domain", | |
| "task3_step": "0.32 accuracy + 0.33 stakes + 0.10 efficiency + 0.10 confidence + 0.10 verify + 0.05 domain", | |
| "task3_terminal": "0.35 completion + 0.30 detection + 0.25 calibration + 0.10 efficiency" | |
| } | |
| } |