ledmands committed on
Commit ·
c37ff18
1
Parent(s): ebb75df
Modified watch_agent.py to accept a command-line argument that adjusts the repeat action probability.
Browse files- agents/version_2/watch_agent.py +16 -10
agents/version_2/watch_agent.py
CHANGED
|
@@ -3,20 +3,26 @@ from stable_baselines3.common.evaluation import evaluate_policy
|
|
| 3 |
from stable_baselines3.common.monitor import Monitor
|
| 4 |
import gymnasium as gym
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
# Retrieve the environment
|
| 17 |
-
eval_env = Monitor(gym.make("ALE/Pacman-v5", render_mode="
|
| 18 |
|
| 19 |
# Evaluate the policy
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
print(
|
|
|
|
| 3 |
from stable_baselines3.common.monitor import Monitor
|
| 4 |
import gymnasium as gym
|
| 5 |
|
| 6 |
+
import argparse
|
| 7 |
|
| 8 |
+
# This script should have some options
|
| 9 |
+
# 1. Turn off the stochasticity as determined by the ALEv5
|
| 10 |
+
# Even if deterministic is set to true in evaluate policy, the environment will ignore this 25% of the time
|
| 11 |
+
# To compensate for this, we can set the repeat action probability to 0
|
| 12 |
+
|
| 13 |
+
parser = argparse.ArgumentParser()
|
| 14 |
+
parser.add_argument("-r", "--repeat_action_probability", help="repeat action probability", type=float, default=0.25)
|
| 15 |
+
args = parser.parse_args()
|
| 16 |
|
| 17 |
+
MODEL_NAME = "ALE-Pacman-v5"
|
| 18 |
+
rpt_act_prob = args.repeat_action_probability
|
| 19 |
|
| 20 |
+
loaded_model = DQN.load(MODEL_NAME)
|
| 21 |
|
| 22 |
# Retrieve the environment
|
| 23 |
+
eval_env = Monitor(gym.make("ALE/Pacman-v5", render_mode="rgb_array", repeat_action_probability=rpt_act_prob))
|
| 24 |
|
| 25 |
# Evaluate the policy
|
| 26 |
+
mean_rwd, std_rwd = evaluate_policy(loaded_model.policy, eval_env, n_eval_episodes=1)
|
| 27 |
+
print("mean rwd: ", mean_rwd)
|
| 28 |
+
print("std rwd: ", std_rwd)
|