Spaces:
Sleeping
Sleeping
| { | |
| "agent": "HeuristicAgent", | |
| "base_url": "https://israaaML-fsds-cleaning-env.hf.space", | |
| "n_episodes": 45, | |
| "aggregate": { | |
| "episodes": 45, | |
| "success_rate": 0.0, | |
| "avg_return": -0.08783333333333322, | |
| "avg_steps": 12.2, | |
| "avg_invalid_actions": 0.0 | |
| }, | |
| "episodes": [ | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed0", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9864 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed0", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9864 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed0", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9864 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed1", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed1", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed1", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed2", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed2", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed2", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed3", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed3", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed3", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed4", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9806 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed4", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9806 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed4", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.07499999999999998, | |
| "steps": 12, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9806 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed0", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9864 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed0", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9864 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed0", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9864 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed1", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed1", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed1", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed2", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9845 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed2", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9845 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed2", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9845 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed3", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed3", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed3", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed4", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9825 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed4", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9825 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed4", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.11079999999999998, | |
| "steps": 14, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9825 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed0", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9845 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed0", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9845 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed0", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9845 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed1", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9883 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed1", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9883 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed1", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9883 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed2", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9748 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed2", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9748 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed2", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9748 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed3", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed3", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed3", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed4", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9748 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed4", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9748 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed4", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.09269999999999995, | |
| "steps": 13, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9748 | |
| } | |
| ] | |
| } |