Spaces:
Sleeping
Sleeping
| { | |
| "baseline": "rule_based", | |
| "average_score": 1.0, | |
| "results": [ | |
| { | |
| "task_id": "easy_account_takeover", | |
| "difficulty": "easy", | |
| "score": 1.0, | |
| "steps": 7, | |
| "transcript": [ | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T1", | |
| "value": "account_security" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.1, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T1", | |
| "value": "billing_activity" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.2, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_priority", | |
| "target": "T1", | |
| "value": "urgent" | |
| }, | |
| "reward": { | |
| "value": 0.07, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "priority_match": 0.08 | |
| }, | |
| "rationale": "Processed set_priority." | |
| }, | |
| "task_score": 0.4, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_route", | |
| "target": "T1", | |
| "value": "account_security" | |
| }, | |
| "reward": { | |
| "value": 0.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "route_match": 0.1 | |
| }, | |
| "rationale": "Processed set_route." | |
| }, | |
| "task_score": 0.65, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_resolution", | |
| "target": "T1", | |
| "value": "temporary_lock_and_manual_recovery" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "resolution_match": 0.12 | |
| }, | |
| "rationale": "Processed set_resolution." | |
| }, | |
| "task_score": 0.85, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "escalate", | |
| "target": "T1", | |
| "value": "security_specialist" | |
| }, | |
| "reward": { | |
| "value": 0.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "correct_escalation": 0.1 | |
| }, | |
| "rationale": "Processed escalate." | |
| }, | |
| "task_score": 1.0, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "finalize", | |
| "target": "T1", | |
| "value": null | |
| }, | |
| "reward": { | |
| "value": 0.99, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "terminal_grade": 1.0 | |
| }, | |
| "rationale": "Final task grade applied." | |
| }, | |
| "task_score": 1.0, | |
| "done": true | |
| } | |
| ] | |
| }, | |
| { | |
| "task_id": "medium_payout_hold", | |
| "difficulty": "medium", | |
| "score": 1.0, | |
| "steps": 6, | |
| "transcript": [ | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T1", | |
| "value": "tax_status" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.225, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T1", | |
| "value": "payout_hold" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.35, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_priority", | |
| "target": "T1", | |
| "value": "high" | |
| }, | |
| "reward": { | |
| "value": 0.07, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "priority_match": 0.08 | |
| }, | |
| "rationale": "Processed set_priority." | |
| }, | |
| "task_score": 0.5, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_route", | |
| "target": "T1", | |
| "value": "monetization_compliance" | |
| }, | |
| "reward": { | |
| "value": 0.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "route_match": 0.1 | |
| }, | |
| "rationale": "Processed set_route." | |
| }, | |
| "task_score": 0.75, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_resolution", | |
| "target": "T1", | |
| "value": "request_tax_renewal" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "resolution_match": 0.12 | |
| }, | |
| "rationale": "Processed set_resolution." | |
| }, | |
| "task_score": 1.0, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "finalize", | |
| "target": "T1", | |
| "value": null | |
| }, | |
| "reward": { | |
| "value": 0.99, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "terminal_grade": 1.0 | |
| }, | |
| "rationale": "Final task grade applied." | |
| }, | |
| "task_score": 1.0, | |
| "done": true | |
| } | |
| ] | |
| }, | |
| { | |
| "task_id": "hard_queue_triage", | |
| "difficulty": "hard", | |
| "score": 1.0, | |
| "steps": 16, | |
| "transcript": [ | |
| { | |
| "action": { | |
| "action_type": "rank_queue", | |
| "target": "T1", | |
| "value": "T2,T3,T1" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "queue_progress": 0.12 | |
| }, | |
| "rationale": "Processed rank_queue." | |
| }, | |
| "task_score": 0.2167, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T1", | |
| "value": "payment_status" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.25, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T2", | |
| "value": "account_security" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.2667, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T2", | |
| "value": "billing_activity" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.2834, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T3", | |
| "value": "appeal_state" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.3, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "request_context", | |
| "target": "T3", | |
| "value": "campaign_deadline" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "required_context_found": 0.12 | |
| }, | |
| "rationale": "Processed request_context." | |
| }, | |
| "task_score": 0.3167, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_priority", | |
| "target": "T1", | |
| "value": "normal" | |
| }, | |
| "reward": { | |
| "value": 0.07, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "priority_match": 0.08 | |
| }, | |
| "rationale": "Processed set_priority." | |
| }, | |
| "task_score": 0.3834, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_priority", | |
| "target": "T2", | |
| "value": "urgent" | |
| }, | |
| "reward": { | |
| "value": 0.07, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "priority_match": 0.08 | |
| }, | |
| "rationale": "Processed set_priority." | |
| }, | |
| "task_score": 0.45, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_priority", | |
| "target": "T3", | |
| "value": "high" | |
| }, | |
| "reward": { | |
| "value": 0.07, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "priority_match": 0.08 | |
| }, | |
| "rationale": "Processed set_priority." | |
| }, | |
| "task_score": 0.5167, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_route", | |
| "target": "T1", | |
| "value": "billing_refunds" | |
| }, | |
| "reward": { | |
| "value": 0.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "route_match": 0.1 | |
| }, | |
| "rationale": "Processed set_route." | |
| }, | |
| "task_score": 0.6, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_route", | |
| "target": "T2", | |
| "value": "account_security" | |
| }, | |
| "reward": { | |
| "value": 0.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "route_match": 0.1 | |
| }, | |
| "rationale": "Processed set_route." | |
| }, | |
| "task_score": 0.6834, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_route", | |
| "target": "T3", | |
| "value": "policy_appeals" | |
| }, | |
| "reward": { | |
| "value": 0.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "route_match": 0.1 | |
| }, | |
| "rationale": "Processed set_route." | |
| }, | |
| "task_score": 0.7667, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_resolution", | |
| "target": "T1", | |
| "value": "approve_refund" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "resolution_match": 0.12 | |
| }, | |
| "rationale": "Processed set_resolution." | |
| }, | |
| "task_score": 0.8334, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_resolution", | |
| "target": "T2", | |
| "value": "temporary_lock_and_manual_recovery" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "resolution_match": 0.12 | |
| }, | |
| "rationale": "Processed set_resolution." | |
| }, | |
| "task_score": 0.9, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "set_resolution", | |
| "target": "T3", | |
| "value": "expedited_human_review" | |
| }, | |
| "reward": { | |
| "value": 0.11, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "resolution_match": 0.12 | |
| }, | |
| "rationale": "Processed set_resolution." | |
| }, | |
| "task_score": 0.9667, | |
| "done": false | |
| }, | |
| { | |
| "action": { | |
| "action_type": "escalate", | |
| "target": "T2", | |
| "value": "security_specialist" | |
| }, | |
| "reward": { | |
| "value": 1.09, | |
| "components": { | |
| "step_penalty": -0.01, | |
| "correct_escalation": 0.1, | |
| "timeout_grade": 1.0 | |
| }, | |
| "rationale": "Processed escalate." | |
| }, | |
| "task_score": 1.0, | |
| "done": true | |
| } | |
| ] | |
| } | |
| ] | |
| } |