Spaces:
Sleeping
Sleeping
| { | |
| "agent": "RandomAgent", | |
| "base_url": "https://israaaML-fsds-cleaning-env.hf.space", | |
| "n_episodes": 45, | |
| "aggregate": { | |
| "episodes": 45, | |
| "success_rate": 0.0, | |
| "avg_return": -0.09121333333333337, | |
| "avg_steps": 3.1777777777777776, | |
| "avg_invalid_actions": 0.0 | |
| }, | |
| "episodes": [ | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed0", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed0", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.42000000000000004, | |
| "steps": 6, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed0", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed1", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 6, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed1", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.04, | |
| "steps": 4, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed1", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.28, | |
| "steps": 10, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed2", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed2", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.02, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed2", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.12000000000000001, | |
| "steps": 4, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9728 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed3", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed3", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed3", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "error": "Tool 'preview_data' failed: Error calling tool 'preview_data': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed4", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.005000000000000001, | |
| "steps": 4, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed4", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 4, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "ecommerce_mobile_baseline_seed4", | |
| "task_id": "ecommerce_mobile", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.135, | |
| "steps": 7, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9806 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed0", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed0", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed0", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed1", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.34, | |
| "steps": 7, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9767 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed1", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.12000000000000001, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed1", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed2", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed2", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.12000000000000001, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed2", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed3", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.34, | |
| "steps": 6, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed3", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.06, | |
| "steps": 5, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9786 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed3", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.02, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed4", | |
| "task_id": "subscription_churn", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.12000000000000001, | |
| "steps": 5, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed4", | |
| "task_id": "subscription_churn", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.02, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "subscription_churn_baseline_seed4", | |
| "task_id": "subscription_churn", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.2, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed0", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": -0.4800000000000001, | |
| "steps": 11, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 0.9825 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed0", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.21730000000000002, | |
| "steps": 6, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed0", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.02, | |
| "steps": 4, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed1", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed1", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed1", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "error": "Tool 'submit_solution' failed: Error calling tool 'submit_solution': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed2", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed2", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed2", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed3", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.002700000000000001, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed3", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "error": "Tool 'preview_data' failed: Error calling tool 'preview_data': Error serializing to JSON: TypeError: 'float' object cannot be interpreted as an integer (type: execution_error)", | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 0, | |
| "invalid_actions": 0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed3", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.1, | |
| "steps": 2, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed4", | |
| "task_id": "delivery_eta", | |
| "episode": 0, | |
| "success": false, | |
| "total_return": 0.0, | |
| "steps": 1, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed4", | |
| "task_id": "delivery_eta", | |
| "episode": 1, | |
| "success": false, | |
| "total_return": -0.12000000000000001, | |
| "steps": 3, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| }, | |
| { | |
| "task_name": "delivery_eta_baseline_seed4", | |
| "task_id": "delivery_eta", | |
| "episode": 2, | |
| "success": false, | |
| "total_return": -0.22000000000000003, | |
| "steps": 6, | |
| "invalid_actions": 0, | |
| "quality_gate_passed": false, | |
| "retention_ratio": 1.0 | |
| } | |
| ] | |
| } |