fsds_cleaning_env / results_random.json
israaaML's picture
v3: benchmark results, final report, agent/eval improvements, smoke test fixes
b3fc5ee
{
"agent": "RandomAgent",
"base_url": "https://israaaML-fsds-cleaning-env.hf.space",
"n_episodes": 45,
"aggregate": {
"episodes": 45,
"success_rate": 0.0,
"avg_return": -0.09121333333333337,
"avg_steps": 3.1777777777777776,
"avg_invalid_actions": 0.0
},
"episodes": [
{
"task_name": "ecommerce_mobile_baseline_seed0",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": -0.1,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed0",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.42000000000000004,
"steps": 6,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed0",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.1,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed1",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": -0.1,
"steps": 6,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed1",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.04,
"steps": 4,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed1",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.28,
"steps": 10,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed2",
"task_id": "ecommerce_mobile",
"episode": 0,
"error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "ecommerce_mobile_baseline_seed2",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.02,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed2",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.12000000000000001,
"steps": 4,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9728
},
{
"task_name": "ecommerce_mobile_baseline_seed3",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed3",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed3",
"task_id": "ecommerce_mobile",
"episode": 2,
"error": "Tool 'preview_data' failed: Error calling tool 'preview_data': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "ecommerce_mobile_baseline_seed4",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": 0.005000000000000001,
"steps": 4,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed4",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.1,
"steps": 4,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "ecommerce_mobile_baseline_seed4",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.135,
"steps": 7,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9806
},
{
"task_name": "subscription_churn_baseline_seed0",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed0",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed0",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed1",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.34,
"steps": 7,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "subscription_churn_baseline_seed1",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.12000000000000001,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed1",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": 0.0,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed2",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.1,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed2",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.12000000000000001,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed2",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.1,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed3",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.34,
"steps": 6,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed3",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.06,
"steps": 5,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "subscription_churn_baseline_seed3",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.02,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed4",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.12000000000000001,
"steps": 5,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed4",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.02,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "subscription_churn_baseline_seed4",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.2,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed0",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": -0.4800000000000001,
"steps": 11,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9825
},
{
"task_name": "delivery_eta_baseline_seed0",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.21730000000000002,
"steps": 6,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed0",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.02,
"steps": 4,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed1",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed1",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": 0.0,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed1",
"task_id": "delivery_eta",
"episode": 2,
"error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "delivery_eta_baseline_seed2",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": 0.0,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed2",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed2",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed3",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": 0.002700000000000001,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed3",
"task_id": "delivery_eta",
"episode": 1,
"error": "Tool 'preview_data' failed: Error calling tool 'preview_data': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "delivery_eta_baseline_seed3",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.1,
"steps": 2,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed4",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": 0.0,
"steps": 1,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed4",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.12000000000000001,
"steps": 3,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
},
{
"task_name": "delivery_eta_baseline_seed4",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.22000000000000003,
"steps": 6,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 1.0
}
]
}