diff --git a/.gitattributes b/.gitattributes index 7c997f5173693385cf8f34df08526d910db5986d..ec645e70edaa1c0663dd5b1921f3bf52b115fb6d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -176,3 +176,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b04dfbb8a08c1dffed33929fe7c79a26bfc9eec --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95af77c4d005c6313bfda251ea72e8f78077efdc496424adc38b2ccc523d6af7 +size 171335 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1887da130aab67c6a2c573bc84e37897b68da1ea Binary files 
/dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/curriculum_learning.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/curriculum_learning.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef110b064bd31d1a83ab2780125e572575af9a87 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/curriculum_learning.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..d08cc35c224ff85b5a19d854942465ae127a6809 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py @@ -0,0 +1,241 @@ +"""Example of using an env-task curriculum by implementing a custom callback. + +This example: + - demonstrates how to define your own curriculum-capable environments using + gymnasium's FrozenLake env. + - defines a custom callback that gets called once per iteration and - if necessary - + changes the maps used by FrozenLake on all EnvRunners to a new task (by moving the + goal position further and further away from the starting position). + - also demonstrates an alternative approach via reloading/recreating an entirely new + env inside all EnvRunners. + - uses Tune and RLlib to curriculum-learn the env described above and compares 2 + algorithms, one that does use curriculum learning vs one that does not. + +We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step +limit of 16 to make it almost impossible for a non-curriculum policy to learn. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--no-curriculum` flag to disable curriculum learning and force your policy +to be trained on the hardest task right away. With this option, the algorithm should NOT +succeed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that only PPO policy that uses a curriculum can +actually learn, whereas the one that is thrown into the toughest task right from the +start never learns anything. + +Policy using the curriculum: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-------------------------------+------------+-----------------+--------+ +| PPO_FrozenLake-v1_93ca4_00000 | TERMINATED | 127.0.0.1:73318 | 41 | ++-------------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 97.652 | 164000 | 1 | 14.0348 | ++------------------+--------+----------+--------------------+ + +Policy NOT using the curriculum (trying to solve the hardest task right away): +[DOES NOT LEARN AT ALL] +""" +from functools import partial + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.connectors.env_to_module import FlattenObservations +from 
ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args(default_iters=100, default_timesteps=600000) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument( + "--upgrade-task-threshold", + type=float, + default=0.99, + help="The mean episode return, upon reaching of which we increase the task by one.", +) +parser.add_argument( + "--no-curriculum", + action="store_true", + help="Whether to NOT use curriculum learning (and instead trying to solve the " + "hardest task right away).", +) + +# __curriculum_learning_example_env_options__ +ENV_OPTIONS = { + "is_slippery": False, + # Limit the number of steps the agent is allowed to make in the env to + # make it almost impossible to learn without the curriculum. + "max_episode_steps": 16, +} + +# Our 3 tasks: 0=easiest, 1=medium, 2=hard +ENV_MAPS = [ + # 0 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFGFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFFFHF", + "FFFFFHHF", + "FHFFFFFF", + ], + # 1 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFFFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFGFHF", + "FFFFFHHF", + "FHFFFFFF", + ], + # 2 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFFFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFFFHF", + "FFFFFHHF", + "FHFFFFFG", + ], +] +# __END_curriculum_learning_example_env_options__ + + +# Simple function sent to an EnvRunner to change the map of all its gym. Envs from +# the current one to a new (tougher) one, in which the goal position is further away +# from the starting position. Note that a map is a list of strings, each one +# representing one row in the map. 
Each character in the strings represent a single +# field (S=starting position, H=hole (bad), F=frozen/free field (ok), G=goal (great!)). +def _remote_fn(env_runner, new_task: int): + # We recreate the entire env object by changing the env_config on the worker, + # then calling its `make_env()` method. + env_runner.config.environment(env_config={"desc": ENV_MAPS[new_task]}) + env_runner.make_env() + + +class EnvTaskCallback(RLlibCallback): + """Custom callback implementing `on_train_result()` for changing the envs' maps.""" + + def on_train_result( + self, + *, + algorithm: Algorithm, + metrics_logger=None, + result: dict, + **kwargs, + ) -> None: + # Hack: Store the current task inside a counter in our Algorithm. + # W/o a curriculum, the task is always 2 (hardest). + if args.no_curriculum: + algorithm._counters["current_env_task"] = 2 + current_task = algorithm._counters["current_env_task"] + + # If episode return is consistently `args.upgrade_task_threshold`, we switch + # to a more difficult task (if possible). If we already mastered the most + # difficult task, we publish our victory in the result dict. + result["task_solved"] = 0.0 + current_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + if current_return > args.upgrade_task_threshold: + if current_task < 2: + new_task = current_task + 1 + print( + f"Switching task/map on all EnvRunners to #{new_task} (0=easiest, " + f"2=hardest), b/c R={current_return} on current task." + ) + algorithm.env_runner_group.foreach_env_runner( + func=partial(_remote_fn, new_task=new_task) + ) + algorithm._counters["current_env_task"] = new_task + + # Hardest task was solved (1.0) -> report this in the results dict. + elif current_return == 1.0: + result["task_solved"] = 1.0 + # Emergency brake: If return is 0.0 AND we are already at a harder task (1 or + # 2), we go back to task=0. 
+ elif current_return == 0.0 and current_task > 0: + print( + "Emergency brake: Our policy seemed to have collapsed -> Setting task " + "back to 0." + ) + algorithm.env_runner_group.foreach_env_runner( + func=partial(_remote_fn, new_task=0) + ) + algorithm._counters["current_env_task"] = 0 + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + # Plug in our curriculum callbacks that controls when we should upgrade the env + # task based on the received return for the current task. + .callbacks(EnvTaskCallback) + .environment( + "FrozenLake-v1", + env_config={ + # w/ curriculum: start with task=0 (easiest) + # w/o curriculum: start directly with hardest task 2. + "desc": ENV_MAPS[2 if args.no_curriculum else 0], + **ENV_OPTIONS, + }, + ) + .env_runners( + num_envs_per_env_runner=5, + env_to_module_connector=lambda env: FlattenObservations(), + ) + .training( + num_epochs=6, + vf_loss_coeff=0.01, + lr=0.0002, + ) + .rl_module(model_config=DefaultModelConfig(vf_share_layers=True)) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + # Reward directly does not matter to us as we would like to continue + # after the policy reaches a return of ~1.0 on the 0-task (easiest). + # But we DO want to stop, once the entire task is learned (policy achieves + # return of 1.0 on the most difficult task=2). 
+ "task_solved": 1.0, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, args, stop=stop, success_metric={"task_solved": 1.0} + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f81be9aed26d68a88668b19b5942ee3cb1886ef5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_in_sequence.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_in_sequence.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42098212a26579db6b3e361158ddf1cdba56208d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_in_sequence.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_simultaneously.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_simultaneously.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72481e2c700643d2648b0fc2597f68684b05eeca Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_simultaneously.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/async_gym_env_vectorization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/async_gym_env_vectorization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6df42079bdea8a76efc00031ef217a5b409d51ec Binary 
files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/async_gym_env_vectorization.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_env_render_method.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_env_render_method.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..977b9d7e92867b8da8bc850a3a19748d72bdbe85 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_env_render_method.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_gym_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_gym_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac045a29111694d1205563ced257e5c0c60e1077 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_gym_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_connecting_to_rllib_w_tcp_client.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_connecting_to_rllib_w_tcp_client.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4bea66b30566d1f3e1a3a443f553a1c0de32660 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_connecting_to_rllib_w_tcp_client.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_rendering_and_recording.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_rendering_and_recording.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..096ff061a6feece80c30718179e48968e5b5bb54 
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_rendering_and_recording.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_w_protobuf_observations.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_w_protobuf_observations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40586d831225ceb8f3f794e095b7de48f17f2020 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_w_protobuf_observations.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/greyscale_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/greyscale_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a552734bef76ac02c9f710e6fb3cff760c2fea1e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/greyscale_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/unity3d_env_local.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/unity3d_env_local.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85baecec9f16e1cd5293d69dd816cb58ca81d71b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/unity3d_env_local.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2bd6417923f8ac6ca62ca8b6172eb870f3c22ef Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/action_mask_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/action_mask_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98f279b2e1a66ac90f03af28be932c99d41d46a1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/action_mask_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_crashing.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_crashing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ca98e8880ff45f523129945347b6313c8ae8b71 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_crashing.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_sparse_rewards.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_sparse_rewards.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84a58ad159bb6184b81b592b00d1854f04425168 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_sparse_rewards.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_dict_observation_space.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_dict_observation_space.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6ec84727b993ae90be8548731fdfa2eb4a079e0f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_dict_observation_space.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_large_observation_space.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_large_observation_space.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75b4a5cfadb0394f6d7cf315f18e547b00defe64 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_large_observation_space.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_protobuf_observation_space.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_protobuf_observation_space.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d2780a8e4831838aa192ed510107bcd3c8ad174 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_protobuf_observation_space.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cliff_walking_wall_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cliff_walking_wall_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87b754cc886e6213e1c8c3de2461e5c5a31afda0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cliff_walking_wall_env.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/correlated_actions_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/correlated_actions_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..599f3508b833ecedbe10f78e54f84236cbd87d35 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/correlated_actions_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/d4rl_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/d4rl_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dd0507c4a9ef46a21e9ad8dcae504d157196e8b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/d4rl_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/debug_counter_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/debug_counter_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed37a8dc452d72468435eb5f0f9c9b07539498fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/debug_counter_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/deterministic_envs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/deterministic_envs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..603718eaf7e770b11a12221f9154e188dc4f8612 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/deterministic_envs.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/dm_control_suite.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/dm_control_suite.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..707ebf9b87305a1dde7ed2686c007f53e2945d3e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/dm_control_suite.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_using_remote_actor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_using_remote_actor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d623674f65480df7dd1f4b0c21349245f379f487 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_using_remote_actor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_with_subprocess.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_with_subprocess.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d8580fd9958e961485a66c3673e22d613f63887 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_with_subprocess.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/fast_image_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/fast_image_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c5728ef75df33ab0930bfc1e356342ba4ed9088 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/fast_image_env.cpython-311.pyc differ 
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/gpu_requiring_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/gpu_requiring_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5c1a2422f1ed7cfd11b41b43714ce8668346b8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/gpu_requiring_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/look_and_push.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/look_and_push.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fd7bf9602dbe2f01ad7d9631d1baae14fb2caa6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/look_and_push.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/memory_leaking_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/memory_leaking_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ec4a3e7de5a8cf557241bb2ddae9da4d31af519 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/memory_leaking_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/mock_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/mock_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0a41463cf6fdcb29515372800a50ce336a8024b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/mock_env.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/nested_space_repeat_after_me_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/nested_space_repeat_after_me_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45c55b5cedc185d059044c2d34cc4598dc0b3f98 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/nested_space_repeat_after_me_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/parametric_actions_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/parametric_actions_cartpole.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e602845a956faf98077c425960a7291b0dbf0ac2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/parametric_actions_cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/random_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/random_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75f962a7e88f0be7e983a7cd5311ba87676ea3ec Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/random_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/recommender_system_envs_with_recsim.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/recommender_system_envs_with_recsim.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8986e41eeb8212d70fe3f5ccbb80a6ebb76587e3 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/recommender_system_envs_with_recsim.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_after_me_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_after_me_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42a65ccacb41a11499bccd048a42ca1078bb19cd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_after_me_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_initial_obs_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_initial_obs_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fad10cce2af701a0ff6bebc0ea89413f96287afd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_initial_obs_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_corridor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_corridor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c665aacd5553753ec9ee14905e18ef56e9be962b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_corridor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_rpg.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_rpg.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6475c164a1793f511a23004aefaab3f08f9812b Binary 
files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_rpg.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/six_room_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/six_room_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27258650a7f6d3dbdbfca24e88479214ba46a5ae Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/six_room_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_cartpole.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da91cd07e69c142b9b5cd09558c82a1c3ee6ecb7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_pendulum.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_pendulum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b271367bdd472e41ca33770f1424575df36db00 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_pendulum.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/transformed_action_space_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/transformed_action_space_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cceda54841019922d62cacb859d2aa938546f0e3 
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/transformed_action_space_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/windy_maze_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/windy_maze_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b13e87e07521ee871e660e3f01b1a854a8732e36 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/windy_maze_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b7fb660ccd462981f7ace066fceacb8556f8a356 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__init__.py @@ -0,0 +1,35 @@ +from ray.rllib.env.multi_agent_env import make_multi_agent +from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( + CartPoleWithDictObservationSpace, +) +from ray.rllib.examples.envs.classes.multi_agent.guess_the_number_game import ( + GuessTheNumberGame, +) +from ray.rllib.examples.envs.classes.multi_agent.two_step_game import ( + TwoStepGame, + TwoStepGameWithGroupedAgents, +) +from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( + NestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole + + +# Backward compatibility. 
__all__ = [
    "GuessTheNumberGame",
    "TwoStepGame",
    "TwoStepGameWithGroupedAgents",
]


def _create_stateless_cartpole(config):
    """Env creator: builds a `StatelessCartPole` from the given config."""
    return StatelessCartPole(config)


def _create_cartpole_with_dict_obs_space(config):
    """Env creator: builds a `CartPoleWithDictObservationSpace` from the config."""
    return CartPoleWithDictObservationSpace(config)


def _create_nested_space_repeat_after_me(config):
    """Env creator: builds a `NestedSpaceRepeatAfterMeEnv` from the given config."""
    return NestedSpaceRepeatAfterMeEnv(config)


# Ready-made env classes produced by `make_multi_agent`. The first three are
# created from registered gymnasium env IDs, the remaining ones from the
# creator callables above.
MultiAgentCartPole = make_multi_agent("CartPole-v1")
MultiAgentMountainCar = make_multi_agent("MountainCarContinuous-v0")
MultiAgentPendulum = make_multi_agent("Pendulum-v1")
MultiAgentStatelessCartPole = make_multi_agent(_create_stateless_cartpole)
MultiAgentCartPoleWithDictObservationSpace = make_multi_agent(
    _create_cartpole_with_dict_obs_space
)
MultiAgentNestedSpaceRepeatAfterMeEnv = make_multi_agent(
    _create_nested_space_repeat_after_me
)
0000000000000000000000000000000000000000..d398a40c0466533e3c7db06b3cb3e84afe2c1200 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_recommender_system.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/guess_the_number_game.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/guess_the_number_game.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdb2e28f630565de10f26b765fc339af1f18aa1b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/guess_the_number_game.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_chess.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_chess.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc8f4b127e2dec7d59b978d3c334b41fc08574c2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_chess.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_connect4.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_connect4.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88d5c774b0296898181a1d7bd980a112190353ea Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_connect4.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/rock_paper_scissors.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/rock_paper_scissors.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..deb3c6734428d5cb1a338a6163e23c4e22ae224a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/rock_paper_scissors.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/tic_tac_toe.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/tic_tac_toe.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ca364d09c4a4a01eef87b5fc175c20ef29148a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/tic_tac_toe.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/two_step_game.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/two_step_game.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5d0c3aa4467e15950c3a97f9a2719006f28239f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/two_step_game.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_recommender_system.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_recommender_system.py new file mode 100644 index 0000000000000000000000000000000000000000..05a29082a0d3352d96dd7cbace9b46a88f24e225 --- /dev/null +++ 
class ParametricRecSys(gym.Env):
    """A recommendation environment which generates items with visible features
    randomly (parametric actions).

    The environment can be configured to be multi-user, i.e. different models
    will be learned independently for each user, by setting the
    `num_users_in_db` parameter.
    To enable slate recommendation, the `slate_size` config parameter can be
    set to a value > 1.

    Randomness: As long as `reset()` is never called with a `seed`, users, docs
    and clicks are sampled from the global `np.random` state. Once a seed has been
    passed to `reset()`, they are sampled from the env's own seeded generator
    instead.
    """

    def __init__(
        self,
        embedding_size: int = 20,
        num_docs_to_select_from: int = 10,
        slate_size: int = 1,
        num_docs_in_db: Optional[int] = None,
        num_users_in_db: Optional[int] = None,
        user_time_budget: float = 60.0,
    ):
        """Initializes a ParametricRecSys instance.

        Args:
            embedding_size: Embedding size for both users and docs.
                Each value in the user/doc embeddings can have values between
                -1.0 and 1.0.
            num_docs_to_select_from: The number of documents to present to the
                agent each timestep. The agent will then have to pick a slate
                out of these.
            slate_size: The size of the slate to recommend to the user at each
                timestep.
            num_docs_in_db: The total number of documents in the DB. Set this
                to None, in case you would like to resample docs from an
                infinite pool.
            num_users_in_db: The total number of users in the DB. Set this to
                None, in case you would like to resample users from an infinite
                pool.
            user_time_budget: The total time budget a user has throughout an
                episode. Once this time budget is used up (through engagements
                with clicked/selected documents), the episode ends.
        """
        self.embedding_size = embedding_size
        self.num_docs_to_select_from = num_docs_to_select_from
        self.slate_size = slate_size

        self.num_docs_in_db = num_docs_in_db
        self.docs_db = None
        self.num_users_in_db = num_users_in_db
        self.users_db = None
        self.current_user = None

        self.user_time_budget = user_time_budget
        self.current_user_budget = user_time_budget

        # Only set once `reset(seed=...)` has been called with an actual seed.
        # While None, sampling keeps using the global `np.random` state.
        self._seeded_rng = None

        self.observation_space = gym.spaces.Dict(
            {
                # The D docs our agent sees at each timestep.
                # It has to select a k-slate out of these.
                "doc": gym.spaces.Dict(
                    {
                        str(i): gym.spaces.Box(
                            -1.0, 1.0, shape=(self.embedding_size,), dtype=np.float32
                        )
                        for i in range(self.num_docs_to_select_from)
                    }
                ),
                # The user engaging in this timestep/episode.
                "user": gym.spaces.Box(
                    -1.0, 1.0, shape=(self.embedding_size,), dtype=np.float32
                ),
                # For each item in the previous slate, was it clicked?
                # If yes, how long was it being engaged with (e.g. watched)?
                "response": gym.spaces.Tuple(
                    [
                        gym.spaces.Dict(
                            {
                                # Clicked or not?
                                "click": gym.spaces.Discrete(2),
                                # Engagement time (how many minutes watched?).
                                "engagement": gym.spaces.Box(
                                    0.0, 100.0, shape=(), dtype=np.float32
                                ),
                            }
                        )
                        for _ in range(self.slate_size)
                    ]
                ),
            }
        )
        # Our action space is a slate: one doc index (into the currently
        # suggested docs) per slate position.
        self.action_space = gym.spaces.MultiDiscrete(
            [self.num_docs_to_select_from for _ in range(self.slate_size)]
        )

    def _rng(self):
        """Returns the random source to sample from.

        This is the global `np.random` module unless `reset()` has been given a
        seed, in which case it is the env's seeded generator.
        """
        return np.random if self._seeded_rng is None else self._seeded_rng

    def _get_embedding(self):
        """Samples a float32 embedding vector with values in [-1, 1)."""
        return (
            self._rng().uniform(-1, 1, size=(self.embedding_size,)).astype(np.float32)
        )

    def reset(self, *, seed=None, options=None):
        """Starts a new session: restores the time budget and picks a user.

        Args:
            seed: Optional seed. If provided, all subsequent user/doc/click
                sampling of this env uses a generator seeded with it.
            options: Unused.

        Returns:
            Tuple of the first observation and an empty info dict.
        """
        # Honor the `seed` argument (it used to be silently ignored).
        super().reset(seed=seed)
        if seed is not None:
            self._seeded_rng = self.np_random

        # Reset the current user's time budget.
        self.current_user_budget = self.user_time_budget

        # Sample a user for the next episode/session.
        # Pick from an only-once-sampled user DB.
        if self.num_users_in_db is not None:
            if self.users_db is None:
                self.users_db = [
                    self._get_embedding() for _ in range(self.num_users_in_db)
                ]
            self.current_user = self.users_db[
                self._rng().choice(self.num_users_in_db)
            ]
        # Pick from an infinite pool of users.
        else:
            self.current_user = self._get_embedding()

        return self._get_obs(), {}

    def step(self, action):
        """Applies the suggested slate and simulates the user's click.

        Args:
            action: The suggested slate (indices of the docs in the currently
                suggested ones).

        Returns:
            Tuple of (next observation, reward, done, truncated, empty info dict).
            `done` and `truncated` always carry the same value.
        """
        # We calculate scores as the dot product between document features and user
        # features. The softmax ensures regret<1 further down.
        scores = softmax(
            [np.dot(self.current_user, doc) for doc in self.currently_suggested_docs]
        )
        best_reward = np.max(scores)

        # User choice model: User picks a doc stochastically,
        # where probs are dot products between user- and doc feature
        # (categories) vectors (rewards).
        # There is also a no-click doc whose weight is 0.0.
        user_doc_overlaps = np.array([scores[a] for a in action] + [0.0])
        # We have to softmax again so that probabilities add up to 1
        probabilities = softmax(user_doc_overlaps)
        which_clicked = self._rng().choice(
            np.arange(self.slate_size + 1), p=probabilities
        )

        reward = 0.0
        if which_clicked < self.slate_size:
            # Reward is 1.0 - regret if clicked. 0.0 if not clicked.
            regret = best_reward - user_doc_overlaps[which_clicked]
            # The reward also represents the user engagement that we define to be
            # within the range [0...100].
            reward = (1 - regret) * 100
            # If anything clicked, deduct from the current user's time budget.
            self.current_user_budget -= 1.0
        done = truncated = self.current_user_budget <= 0.0

        # Compile response (one entry per slate position; the trailing no-click
        # pseudo-doc is left out).
        response = tuple(
            {
                "click": int(idx == which_clicked),
                "engagement": reward if idx == which_clicked else 0.0,
            }
            for idx in range(len(user_doc_overlaps) - 1)
        )

        return self._get_obs(response=response), reward, done, truncated, {}

    def _get_obs(self, response=None):
        """Samples the next set of suggested docs and assembles the observation.

        Args:
            response: The per-slate-position click/engagement response of the
                previous step. If None (e.g. right after `reset()`), a placeholder
                is sampled from the "response" observation sub-space (using that
                space's own random state).

        Returns:
            The observation dict with keys "user", "doc", and "response".
        """
        # Sample D docs from infinity or our pre-existing docs.
        # Pick from an only-once-sampled docs DB.
        if self.num_docs_in_db is not None:
            if self.docs_db is None:
                self.docs_db = [
                    self._get_embedding() for _ in range(self.num_docs_in_db)
                ]
            self.currently_suggested_docs = [
                self.docs_db[doc_idx].astype(np.float32)
                for doc_idx in self._rng().choice(
                    self.num_docs_in_db,
                    size=(self.num_docs_to_select_from,),
                    replace=False,
                )
            ]
        # Pick from an infinite pool of docs.
        else:
            self.currently_suggested_docs = [
                self._get_embedding() for _ in range(self.num_docs_to_select_from)
            ]

        doc = {str(i): d for i, d in enumerate(self.currently_suggested_docs)}

        # Explicit None-check: only a missing response gets replaced.
        if response is None:
            response = self.observation_space["response"].sample()

        return {
            "user": self.current_user.astype(np.float32),
            "doc": doc,
            "response": response,
        }


if __name__ == "__main__":
    # Test RecommSys env with random actions for baseline performance.
    env = ParametricRecSys(
        num_docs_in_db=100,
        num_users_in_db=1,
    )
    obs, info = env.reset()
    num_episodes = 0
    episode_rewards = []
    episode_reward = 0.0

    while num_episodes < 100:
        action = env.action_space.sample()
        obs, reward, done, truncated, _ = env.step(action)

        episode_reward += reward
        if done:
            print(f"episode reward = {episode_reward}")
            env.reset()
            num_episodes += 1
            episode_rewards.append(episode_reward)
            episode_reward = 0.0

    print(f"Avg reward={np.mean(episode_rewards)}")
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da1c6386f6c9bc5ddd8bd467aaab583d3d4d9f60 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/cartpole_observations_proto.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/cartpole_observations_proto.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebc58ec49eb381010cc3aeff8b1157339569ece5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/cartpole_observations_proto.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/cartpole_observations_proto.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/cartpole_observations_proto.py new file mode 100644 index 0000000000000000000000000000000000000000..15b30f5b0b13680f3c4c4e8eba039cd038accd23 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/cartpole_observations_proto.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: cartpole_observations.proto
# Protobuf Python Version: 5.26.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder

# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()

# Adds the serialized file descriptor of `cartpole_observations.proto` to the
# default descriptor pool. The payload names a single message,
# `CartPoleObservation`, with the fields `x_pos`, `x_veloc`, `angle_pos`, and
# `angle_veloc`.
# NOTE(review): this file is compiler output; regenerate it from the .proto
# rather than editing the bytes below by hand.
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
    b'\n\x1b\x63\x61rtpole_observations.proto"]\n\x13\x43\x61rtPoleObservation\x12\r\n\x05x_pos\x18\x01 \x01(\x01\x12\x0f\n\x07x_veloc\x18\x02 \x01(\x01\x12\x11\n\tangle_pos\x18\x03 \x01(\x01\x12\x13\n\x0b\x61ngle_veloc\x18\x04 \x01(\x01\x62\x06proto3'  # noqa
)

# The builder calls receive this module's globals dict; presumably they insert
# the generated descriptor/message objects into it (verify against the
# protobuf `builder` module).
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(
    DESCRIPTOR, "cartpole_observations_proto", _globals
)
# Only when the C descriptor implementation is not in use: clear the loaded
# options and record the start/end offsets of the `CartPoleObservation`
# descriptor (looked up under the `_CARTPOLEOBSERVATION` globals key).
if not _descriptor._USE_C_DESCRIPTORS:
    DESCRIPTOR._loaded_options = None
    _globals["_CARTPOLEOBSERVATION"]._serialized_start = 31
    _globals["_CARTPOLEOBSERVATION"]._serialized_end = 124
# @@protoc_insertion_point(module_scope)
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/crashing_and_stalling_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/crashing_and_stalling_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41e6febe68ed39736784091afae50aa9a5c898df Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/crashing_and_stalling_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/crashing_and_stalling_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/crashing_and_stalling_env.py new file mode 100644 index 0000000000000000000000000000000000000000..4425d51d5d9e59e7ef41137d826885bf53f58976 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/crashing_and_stalling_env.py @@ -0,0 +1,176 @@ +"""Example demonstrating that RLlib can learn (at scale) in unstable environments. + +This script uses the `CartPoleCrashing` environment, an adapted cartpole env whose +instability is configurable through setting the probability of a crash and/or stall +(sleep for a configurable amount of time) during `reset()` and/or `step()`. + +RLlib has two major flags for EnvRunner fault tolerance, which can be independently +set to True: +1) `config.fault_tolerance(restart_failed_sub_environments=True)` causes only the +(gymnasium) environment object on an EnvRunner to be closed (try calling `close()` on +the faulty object), garbage collected, and finally recreated from scratch. Note that +during this process, the containing EnvRunner remains up and running, and sampling +simply continues after the env recycling. This is the lightest and fastest form of +fault tolerance and should be attempted first.
+2) `config.fault_tolerance(restart_failed_env_runners=True)` causes the entire +EnvRunner (a Ray remote actor) to be restarted. This restart logically includes the +gymnasium environment, the RLModule, and all connector pipelines on the EnvRunner. +Use this option only if you face problems with the first option +(restart_failed_sub_environments=True), such as incomplete cleanups and memory leaks. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +You can switch on the fault tolerant behavior (1) (restart_failed_sub_environments) +through the `--restart-failed-envs` flag. If this flag is not set, the script will +recreate the entire (faulty) EnvRunner. + +You can switch on stalling (besides crashing) through the `--stall` command line flag. +If set, besides crashing on `reset()` and/or `step()`, there is also a chance of +stalling for a few seconds on each of these events. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging.
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following (or very similar) console output when running this script +with: +`--algo=PPO --stall --restart-failed-envs --stop-reward=450.0` ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_ba39b_00000 | TERMINATED | 127.0.0.1:1401 | 22 | 133.497 | ++---------------------+------------+----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_episodes_lifetim | num_env_steps_traine | +| | e | d_lifetime | +|------------------------+------------------------+------------------------| +| 450.24 | 542 | 88628 | ++------------------------+------------------------+------------------------+ + +For APPO and testing restarting the entire EnvRunners, you could run the script with: +`--algo=APPO --stall --stop-reward=450.0` ++----------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|----------------------+------------+----------------+--------+------------------+ +| APPO_env_ba39b_00000 | TERMINATED | 127.0.0.1:4653 | 10 | 101.531 | ++----------------------+------------+----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_episodes_lifetim | num_env_steps_traine | +| | e | d_lifetime | +|------------------------+------------------------+------------------------| +| 478.85 | 2546 | 321500 | 
parser = add_rllib_example_script_args(
    default_reward=450.0,
    default_timesteps=2000000,
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_env_runners=4,
)
# Script-specific command line options; their values are used to set up the
# config further below.
parser.add_argument(
    "--stall",
    action="store_true",
    help="Whether to also stall the env from time to time",
)
parser.add_argument(
    "--restart-failed-envs",
    action="store_true",
    help="Whether to restart a failed environment (vs restarting the entire EnvRunner).",
)


if __name__ == "__main__":
    args = parser.parse_args()
    is_multi_agent = args.num_agents > 0

    def _make_multi_agent_env(cfg):
        """Creator for the multi-agent crashing CartPole."""
        return MultiAgentCartPoleCrashing(cfg)

    def _make_single_agent_env(cfg):
        """Creator for the single-agent crashing CartPole (max. 500 steps)."""
        return TimeLimit(CartPoleCrashing(cfg), max_episode_steps=500)

    # Register the matching env creator with tune under the name "env".
    if is_multi_agent:
        tune.register_env("env", _make_multi_agent_env)
    else:
        tune.register_env("env", _make_single_agent_env)

    # Settings handed to the env through `env_config`.
    env_config = {
        "num_agents": args.num_agents,
        # Probability to crash during step().
        "p_crash": 0.0001,
        # Probability to crash during reset().
        "p_crash_reset": 0.001,
        "crash_on_worker_indices": [1, 2],
        "init_time_s": 2.0,
        # Probability to stall during step().
        "p_stall": 0.0005,
        # Probability to stall during reset().
        "p_stall_reset": 0.001,
        # Stall from 2 to 5sec (or 0.0 if --stall not set).
        "stall_time_sec": (2, 5) if args.stall else 0.0,
        # EnvRunner indices to stall on.
        "stall_on_worker_indices": [2, 3],
    }

    algo_cls = tune.registry.get_trainable_cls(args.algo)
    base_config = (
        algo_cls.get_default_config()
        .environment("env", env_config=env_config)
        # Switch on resiliency.
        .fault_tolerance(
            # Recreate any failed EnvRunners.
            restart_failed_env_runners=True,
            # Restart any failed environment (w/o recreating the EnvRunner). Note
            # that this is the much faster option.
            restart_failed_sub_environments=args.restart_failed_envs,
        )
    )

    # Use more stabilizing hyperparams for APPO.
    if args.algo == "APPO":
        base_config.training(
            grad_clip=40.0,
            entropy_coeff=0.0,
            vf_loss_coeff=0.05,
        )
        base_config.rl_module(
            model_config=DefaultModelConfig(vf_share_layers=True),
        )

    # Add a simple multi-agent setup: one policy "p<i>" per agent index.
    if is_multi_agent:
        policy_ids = {f"p{i}" for i in range(args.num_agents)}
        base_config.multi_agent(
            policies=policy_ids,
            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
        )

    run_rllib_example_script_experiment(base_config, args=args)
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/ray_serve_with_rllib.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72995fb401e47b9d387f589d16eb02f7025fc8a0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/cartpole_deployment.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/cartpole_deployment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79b9b82e1ad446c77fd1253fdee8a8f7408d7567 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/cartpole_deployment.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/cartpole_deployment.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/cartpole_deployment.py new file mode 100644 index 0000000000000000000000000000000000000000..41686306c09577bb1f41f1205e0f96cde6f8100c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/cartpole_deployment.py @@ -0,0 +1,50 @@ +import json +from typing import Dict + +import numpy as np +from starlette.requests import Request +import torch + +from ray 
@serve.deployment(logging_config=LoggingConfig(log_level="WARN"))
class ServeRLlibRLModule:
    """Callable class used by Ray Serve to handle async requests.

    All the necessary serving logic is implemented in here:
    - Restoring of the (already trained) RLModule from a checkpoint.
    - Calls to `RLModule.forward_inference()` upon receiving an action request
      (with a current observation); the greedy action is sent back.
    """

    def __init__(self, rl_module_checkpoint):
        """Restores the RLModule to serve.

        Args:
            rl_module_checkpoint: Checkpoint location handed to
                `RLModule.from_checkpoint()`.
        """
        self.rl_module = RLModule.from_checkpoint(rl_module_checkpoint)

    async def __call__(self, starlette_request: Request) -> Dict:
        """Computes the greedy action for the observation in the request.

        Args:
            starlette_request: Request whose UTF-8 encoded JSON body has an
                "observation" key holding a single (unbatched) observation.

        Returns:
            A dict of the form `{"action": <int>}`.

        Raises:
            KeyError: If the request body has no "observation" key.
        """
        body = await starlette_request.body()
        payload = json.loads(body.decode("utf-8"))
        obs = payload["observation"]

        # Compute and return the action for the given observation (create a batch
        # with B=1 and convert to torch).
        output = self.rl_module.forward_inference(
            batch={"obs": torch.from_numpy(np.array([obs], np.float32))}
        )
        # Extract action logits and unbatch.
        logits = output[Columns.ACTION_DIST_INPUTS][0]
        # Act greedily (argmax). Stay in torch rather than passing a tensor to
        # NumPy, which depends on an implicit tensor -> ndarray conversion.
        action = int(torch.argmax(torch.as_tensor(logits)).item())

        return {"action": action}


# Defining the builder function.
# This is so we can start our deployment via:
# `serve run [this py module]:rl_module rl_module_checkpoint=[some RLModule
# checkpoint path] port=[optional: HTTP port]`
def rl_module(args: Dict[str, str]):
    """Starts Ray Serve and returns the bound `ServeRLlibRLModule` application.

    Args:
        args: The key/value arguments of the `serve run` command. Must contain
            "rl_module_checkpoint"; may contain "port" (defaults to 12345).

    Returns:
        The `ServeRLlibRLModule` deployment bound to the given checkpoint.

    Raises:
        KeyError: If "rl_module_checkpoint" is missing from `args`.
        ValueError: If "port" can't be interpreted as an integer.
    """
    # Values in `args` are annotated as strings, so normalize the port to an int
    # before handing it to Serve.
    port = int(args.get("port", 12345))
    serve.start(http_options={"host": "0.0.0.0", "port": port})
    return ServeRLlibRLModule.bind(args["rl_module_checkpoint"])
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- + +You should see something similar to the following on the command line when using the +options: `--stop-reward=250.0`, `--num-episodes-served=2`, and `--port=12345`: + +[First, the RLModule is trained through PPO] + ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_84778_00000 | TERMINATED | 127.0.0.1:40411 | 1 | ++-----------------------------+------------+-----------------+--------+ ++------------------+---------------------+------------------------+ +| total time (s) | episode_return_mean | num_env_steps_sample | +| | | d_lifetime | +|------------------+---------------------|------------------------| +| 2.87052 | 253.2 | 12000 | ++------------------+---------------------+------------------------+ + +[The RLModule is deployed through Ray Serve on port 12345] + +Started Ray Serve with PID: 40458 + +[A few episodes are played through using the policy service (w/ greedy, non-exploratory +actions)] + +Episode R=500.0 +Episode R=500.0 +""" + +import atexit +import os + +import requests +import subprocess +import time + +import gymnasium as gym +from pathlib import Path + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core import ( + COMPONENT_LEARNER_GROUP, + COMPONENT_LEARNER, + COMPONENT_RL_MODULE, + DEFAULT_MODULE_ID, +) +from ray.rllib.utils.metrics 
parser = add_rllib_example_script_args()
parser.set_defaults(
    enable_new_api_stack=True,
    checkpoint_freq=1,
    # NOTE: This used to be spelled `checkpoint_at_and`. argparse accepts unknown
    # names in `set_defaults()` silently, so the misspelled default had no effect.
    checkpoint_at_end=True,
)
parser.add_argument("--num-episodes-served", type=int, default=2)
parser.add_argument("--no-render", action="store_true")
parser.add_argument("--port", type=int, default=12345)


def kill_proc(proc):
    """Stop the given subprocess, escalating to SIGKILL if it does not exit.

    Args:
        proc: The `subprocess.Popen` object to stop.
    """
    try:
        proc.terminate()  # Send SIGTERM
        proc.wait(timeout=5)  # Wait for process to terminate
    except subprocess.TimeoutExpired:
        proc.kill()  # Send SIGKILL
        proc.wait()  # Ensure process is dead


if __name__ == "__main__":
    args = parser.parse_args()

    # Config for the served RLlib RLModule/Algorithm.
    base_config = PPOConfig().environment("CartPole-v1")

    results = run_rllib_example_script_experiment(base_config, args)
    algo_checkpoint = results.get_best_result(
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
    ).checkpoint.path
    # We only need the RLModule component from the algorithm checkpoint. It's located
    # under "[algo checkpoint dir]/learner_group/learner/rl_module/[default policy ID]
    rl_module_checkpoint = (
        Path(algo_checkpoint)
        / COMPONENT_LEARNER_GROUP
        / COMPONENT_LEARNER
        / COMPONENT_RL_MODULE
        / DEFAULT_MODULE_ID
    )

    # `serve run` is started from this file's directory (the import path
    # "classes.cartpole_deployment" below is given relative to it).
    path_of_this_file = Path(__file__).parent
    os.chdir(path_of_this_file)
    # Start the serve app with the trained checkpoint.
    serve_proc = subprocess.Popen(
        [
            "serve",
            "run",
            "classes.cartpole_deployment:rl_module",
            f"rl_module_checkpoint={rl_module_checkpoint}",
            f"port={args.port}",
            "route_prefix=/rllib-rlmodule",
        ]
    )
    # Register our `kill_proc` function to be called on exit to stop Ray Serve again.
    atexit.register(kill_proc, serve_proc)
    # Wait a while to make sure the app is ready to serve.
    time.sleep(20)
    print(f"Started Ray Serve with PID: {serve_proc.pid}")

    env = None
    try:
        # Create the environment that we would like to receive
        # served actions for. Only ask for a "human" render window if rendering
        # has not been switched off via `--no-render`.
        env = gym.make(
            "CartPole-v1",
            render_mode=None if args.no_render else "human",
        )
        obs, _ = env.reset()

        num_episodes = 0
        episode_return = 0.0

        while num_episodes < args.num_episodes_served:
            # Render env if necessary.
            if not args.no_render:
                env.render()

            # Send a request to serve. Use a timeout so that an unresponsive
            # server cannot block this script forever.
            resp = requests.get(
                f"http://localhost:{args.port}/rllib-rlmodule",
                json={"observation": obs.tolist()},
                timeout=30,
            )
            # Fail with the HTTP error itself rather than with a confusing
            # JSON/KeyError further down.
            resp.raise_for_status()
            response = resp.json()

            # Apply the action in the env.
            action = response["action"]
            obs, reward, terminated, truncated, _ = env.step(action)
            episode_return += reward

            # If episode done -> reset to get initial observation of new episode.
            if terminated or truncated:
                print(f"Episode R={episode_return}")
                obs, _ = env.reset()
                num_episodes += 1
                episode_return = 0.0

    finally:
        try:
            # Release the env's resources (e.g. the render window).
            if env is not None:
                env.close()
        finally:
            # Make sure to kill the process on script termination
            kill_proc(serve_proc)