File size: 8,118 Bytes
93155de a26f93a 78a8904 93155de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 |
from symphony import Symphony
import gymnasium as gym
import logging
logging.getLogger().setLevel(logging.CRITICAL)
import torch
import numpy as np
import random
import pickle
import time
import os, re
#############################################
# -----------Helper Functions---------------#
#############################################
# random seeds for reproducing the experiment
def seed_reset():
    """Draw three fresh 32-bit seeds and reseed torch, numpy and random.

    Returns the (torch_seed, numpy_seed, python_seed) triple so the caller
    can log them and later reproduce the run.
    """
    seeds = [random.randint(0, 2**32 - 1) for _ in range(3)]
    torch.manual_seed(seeds[0])
    np.random.seed(seeds[1])
    random.seed(seeds[2])
    return seeds[0], seeds[1], seeds[2]
def extract_r1_r2_r3():
    """Recover the run's seeds from an existing history log in the CWD.

    Scans the current directory for a file named
    ``history_<r1>_<r2>_<r3>.csv`` and returns ``(r1, r2, r3)`` as a tuple
    of ints, or ``None`` when no such file exists (fresh run).
    """
    pattern = re.compile(r'history_(\d+)_(\d+)_(\d+)\.csv')
    for filename in os.listdir():
        match = pattern.match(filename)
        if match:
            # Return a concrete tuple (the original returned a lazy `map`
            # object) so callers can compare, reuse and unpack it safely.
            return tuple(map(int, match.groups()))
    return None
#write or append to the history log file
class LogFile(object):
    """Appends rows to the two run logs: the main step/return history CSV
    and the per-episode optimisation CSV."""

    def __init__(self, log_name_main, log_name_opt):
        self.log_name_main = log_name_main
        self.log_name_opt = log_name_opt

    def write(self, text):
        # Append a row to the main history log (step,return).
        with open(self.log_name_main, 'a+') as f:
            f.write(text)

    def write_opt(self, text):
        # Append a row to the per-episode log (ep,return,steps,scale).
        with open(self.log_name_opt, 'a+') as f:
            f.write(text)

    def clean(self):
        # Truncate both logs and rewrite their CSV headers.
        for path, header in ((self.log_name_main, "step,return\n"),
                             (self.log_name_opt, "ep,return,steps,scale\n")):
            with open(path, 'w') as f:
                f.write(header)
# Reuse the seeds embedded in an existing history file so a restarted run
# appends to the same log pair; otherwise draw fresh seeds for a new run.
numbers = extract_r1_r2_r3()
if numbers is not None:
    # derive random numbers from history file
    r1, r2, r3 = numbers
else:
    # generate new random seeds
    r1, r2, r3 = seed_reset()
print(r1, ", ", r2, ", ", r3)
log_name_main = f"history_{r1}_{r2}_{r3}.csv"
log_name_opt = f"episodes_{r1}_{r2}_{r3}.csv"
log_file = LogFile(log_name_main, log_name_opt)
def save(algo, total_rewards, total_steps):
    """Checkpoint the networks, optimizer state and replay buffer to disk."""
    checkpoints = (
        (algo.nets.online.state_dict(), 'nets_online_model.pt'),
        (algo.nets.target.state_dict(), 'nets_target_model.pt'),
        (algo.nets_optimizer.state_dict(), 'nets_optimizer.pt'),
    )
    for state, path in checkpoints:
        torch.save(state, path)
    print("saving... the buffer length = ", algo.replay_buffer.length, end="")
    # Replay buffer plus training counters go into one pickle file so a
    # restarted run can resume exactly where it stopped.
    payload = {
        'buffer': algo.replay_buffer,
        'q_next_ema': algo.nets.q_next_ema,
        'total_rewards': total_rewards,
        'total_steps': total_steps,
    }
    with open('data', 'wb') as file:
        pickle.dump(payload, file)
    print(" > done")
def load(algo, Q_learning):
    """Best-effort restore of networks, optimizer, replay buffer and counters.

    Missing or unreadable files are tolerated (fresh run); each stage prints
    a diagnostic and continues. Reads the module-level global `explore_time`.

    Returns:
        (Q_learning, total_rewards, total_steps) — the flag is switched to
        True when the restored buffer already holds >= explore_time samples.
    """
    total_rewards, total_steps = [], 0
    try:
        print("loading models...")
        algo.nets.online.load_state_dict(torch.load('nets_online_model.pt', weights_only=True))
        algo.nets.target.load_state_dict(torch.load('nets_target_model.pt', weights_only=True))
        algo.nets_optimizer.load_state_dict(torch.load('nets_optimizer.pt', weights_only=True))
        print('models loaded')
        #sim_loop(env_valid, 100, True, False, algo, [], total_steps=0)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        print("problem during loading models")
    try:
        print("loading buffer...")
        # NOTE(review): pickle.load executes arbitrary code if 'data' is
        # untrusted — only load checkpoints produced by this script.
        with open('data', 'rb') as file:
            checkpoint = pickle.load(file)  # renamed: `dict` shadowed the builtin
        algo.replay_buffer = checkpoint['buffer']
        algo.nets.q_next_ema = checkpoint['q_next_ema']
        total_rewards = checkpoint['total_rewards']
        total_steps = checkpoint['total_steps']
        # Resume training immediately if the restored buffer is already warm.
        if algo.replay_buffer.length >= explore_time and not Q_learning:
            Q_learning = True
        print('buffer loaded, Q_ema', round(algo.nets.q_next_ema.item(), 2), ', average_reward = ', round(np.mean(total_rewards[-300:]), 2))
    except Exception:
        print("problem during loading buffer")
    return Q_learning, total_rewards, total_steps
#############################################
# ---------------Parametres-----------------#
#############################################
# global parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
G = 3  # gradient updates per environment step
learning_rate = 1e-4
explore_time, times = 10240, 50  # pure-exploration steps; buffer size multiplier
capacity = explore_time * times
h_dim = capacity // 1000  # hidden width derived from buffer capacity
limit_step = 1000  # max steps per episode
limit_eval = 1000  # max steps per evaluation
num_episodes = 1000000
start_episode = 1  # number for the identification of the current episode
episode_rewards_all, episode_steps_all, test_rewards, Q_learning, total_steps = [], [], [], False, 0
# environment type: `option` indexes the table below
option = 3
pre_valid = True
env_names = {
    0: 'BipedalWalker-v3',  # bug fix: the original string had a stray '"' that broke gym.make
    1: 'HalfCheetah-v4',
    2: 'Walker2d-v4',
    3: 'Humanoid-v4',
    4: 'Ant-v4',
    5: 'Swimmer-v4',
    6: 'Hopper-v4',
    7: 'Pusher-v4',
}
env_name = env_names[option]
env = gym.make(env_name)
env_test = gym.make(env_name)
env_valid = gym.make(env_name, render_mode="human")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
#max_action = torch.FloatTensor(env.action_space.high) if env.action_space.is_bounded() else torch.ones(action_dim)
max_action = torch.ones(action_dim)
print("action_dim: ", action_dim, "state_dim: ", state_dim, "max_action:", max_action)
algo = Symphony(capacity, state_dim, action_dim, h_dim, device, max_action, learning_rate)
# Loop for episodes:[ State -> Loop for one episode: [ Action, Next State, Reward, Done, State = Next State ] ]
def sim_loop(env, episodes, testing, Q_learning, algo, total_rewards, total_steps):
    """Run episodes on `env` and return the mean episode return.

    In training mode (testing=False) every transition is stored in the
    replay buffer; once the buffer holds `explore_time` samples, training
    starts (G gradient updates per step) and every 2500 total steps a
    checkpoint is saved and a 25-episode evaluation is run recursively on
    `env_test`. In testing mode no storage, training or logging happens.

    Uses module-level globals: limit_step, explore_time, times, G,
    env_test, log_file, save, seed_reset.
    """
    scale = None  # last training scale tensor; set once algo.train() has run
    start_episode = len(total_rewards) + 1
    for episode in range(start_episode, episodes + 1):
        Return = 0.0
        state = env.reset()[0]
        for steps in range(1, limit_step + 1):
            seed_reset()  # re-randomize the global seeds every step
            total_steps += 1
            # Activate training if explore time is reached and it is not testing mode:
            if testing:
                Q_learning = False
            else:
                if algo.replay_buffer.length >= explore_time and not Q_learning:
                    Q_learning = True
                    algo.replay_buffer.norm_fill(times)
                    print("started training")
            # Every 2500 total steps: save models, pause training, evaluate, resume:
            if Q_learning and total_steps >= 2500 and total_steps % 2500 == 0:
                save(algo, total_rewards, total_steps)
                print("start testing")
                test_return = sim_loop(env_test, 25, True, Q_learning, algo, [], total_steps=0)
                log_file.write(str(total_steps) + "," + str(round(test_return, 2)) + "\n")
                print("end of testing")
            # Near the episode limit (last 50 steps) shut down deterministic
            # actions and leave only noise to collect a terminal transition:
            active = steps < (limit_step - 50) if Q_learning else True
            action = algo.select_action(state, action=active, noise=not testing)
            next_state, reward, done, truncated, info = env.step(action)
            if not testing:
                algo.replay_buffer.add(state, action, reward, next_state, done)
            Return += reward
            # Actual training: plain loop instead of the original
            # side-effect-only walrus list comprehension.
            if Q_learning:
                for _ in range(G):
                    scale = algo.train()
            if done or truncated:
                break
            state = next_state
        total_rewards.append(Return)
        average_reward = np.mean(total_rewards[-300:])
        print(f"Ep {episode}: Rtrn = {Return:.2f}, Avg = {average_reward:.2f}| ep steps = {steps} | total_steps = {total_steps}")
        if not testing and Q_learning:
            log_file.write_opt(str(episode) + "," + str(round(Return, 2)) + "," + str(total_steps) + "," + str(round(scale.mean().item(), 4)) + "\n")
    return np.mean(total_rewards).item()
# Restore any previous checkpoint (models, buffer, counters) before training.
Q_learning, total_rewards, total_steps = load(algo, Q_learning)
# A cold start (buffer not yet warm) gets fresh, header-only log files.
if not Q_learning:
    log_file.clean()
# Main training loop.
sim_loop(env, num_episodes, False, Q_learning, algo, total_rewards, total_steps)
|