fsds_cleaning_env / results_heuristic.json
israaaML's picture
v3: benchmark results, final report, agent/eval improvements, smoke test fixes
b3fc5ee
{
"agent": "HeuristicAgent",
"base_url": "https://israaaML-fsds-cleaning-env.hf.space",
"n_episodes": 45,
"aggregate": {
"episodes": 45,
"success_rate": 0.0,
"avg_return": -0.08783333333333322,
"avg_steps": 12.2,
"avg_invalid_actions": 0.0
},
"episodes": [
{
"task_name": "ecommerce_mobile_baseline_seed0",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9864
},
{
"task_name": "ecommerce_mobile_baseline_seed0",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9864
},
{
"task_name": "ecommerce_mobile_baseline_seed0",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9864
},
{
"task_name": "ecommerce_mobile_baseline_seed1",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "ecommerce_mobile_baseline_seed1",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "ecommerce_mobile_baseline_seed1",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "ecommerce_mobile_baseline_seed2",
"task_id": "ecommerce_mobile",
"episode": 0,
"error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "ecommerce_mobile_baseline_seed2",
"task_id": "ecommerce_mobile",
"episode": 1,
"error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "ecommerce_mobile_baseline_seed2",
"task_id": "ecommerce_mobile",
"episode": 2,
"error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)",
"success": false,
"total_return": 0.0,
"steps": 0,
"invalid_actions": 0
},
{
"task_name": "ecommerce_mobile_baseline_seed3",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "ecommerce_mobile_baseline_seed3",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "ecommerce_mobile_baseline_seed3",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "ecommerce_mobile_baseline_seed4",
"task_id": "ecommerce_mobile",
"episode": 0,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9806
},
{
"task_name": "ecommerce_mobile_baseline_seed4",
"task_id": "ecommerce_mobile",
"episode": 1,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9806
},
{
"task_name": "ecommerce_mobile_baseline_seed4",
"task_id": "ecommerce_mobile",
"episode": 2,
"success": false,
"total_return": -0.07499999999999998,
"steps": 12,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9806
},
{
"task_name": "subscription_churn_baseline_seed0",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9864
},
{
"task_name": "subscription_churn_baseline_seed0",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9864
},
{
"task_name": "subscription_churn_baseline_seed0",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9864
},
{
"task_name": "subscription_churn_baseline_seed1",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "subscription_churn_baseline_seed1",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "subscription_churn_baseline_seed1",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "subscription_churn_baseline_seed2",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9845
},
{
"task_name": "subscription_churn_baseline_seed2",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9845
},
{
"task_name": "subscription_churn_baseline_seed2",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9845
},
{
"task_name": "subscription_churn_baseline_seed3",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "subscription_churn_baseline_seed3",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "subscription_churn_baseline_seed3",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9786
},
{
"task_name": "subscription_churn_baseline_seed4",
"task_id": "subscription_churn",
"episode": 0,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9825
},
{
"task_name": "subscription_churn_baseline_seed4",
"task_id": "subscription_churn",
"episode": 1,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9825
},
{
"task_name": "subscription_churn_baseline_seed4",
"task_id": "subscription_churn",
"episode": 2,
"success": false,
"total_return": -0.11079999999999998,
"steps": 14,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9825
},
{
"task_name": "delivery_eta_baseline_seed0",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9845
},
{
"task_name": "delivery_eta_baseline_seed0",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9845
},
{
"task_name": "delivery_eta_baseline_seed0",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9845
},
{
"task_name": "delivery_eta_baseline_seed1",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9883
},
{
"task_name": "delivery_eta_baseline_seed1",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9883
},
{
"task_name": "delivery_eta_baseline_seed1",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9883
},
{
"task_name": "delivery_eta_baseline_seed2",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9748
},
{
"task_name": "delivery_eta_baseline_seed2",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9748
},
{
"task_name": "delivery_eta_baseline_seed2",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9748
},
{
"task_name": "delivery_eta_baseline_seed3",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "delivery_eta_baseline_seed3",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "delivery_eta_baseline_seed3",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9767
},
{
"task_name": "delivery_eta_baseline_seed4",
"task_id": "delivery_eta",
"episode": 0,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9748
},
{
"task_name": "delivery_eta_baseline_seed4",
"task_id": "delivery_eta",
"episode": 1,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9748
},
{
"task_name": "delivery_eta_baseline_seed4",
"task_id": "delivery_eta",
"episode": 2,
"success": false,
"total_return": -0.09269999999999995,
"steps": 13,
"invalid_actions": 0,
"quality_gate_passed": false,
"retention_ratio": 0.9748
}
]
}