# RL_Project20 / ppo_main.py
import argparse
import gymnasium as gym
import sys
import matplotlib.pyplot as plt
import ale_py  # registers the ALE/* Atari environments with Gymnasium
import pandas as pd
from ppo_helpers_cnn import *
from gymnasium.spaces import Box
import cv2
import logging
import numpy as np
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
# Preprocess environment
def preprocess(obs):
# Convert to grayscale
obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
# Resize
obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
return np.expand_dims(obs, axis=0).astype(np.float32) / 255.0
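# A minimal usage sketch (hypothetical values): a raw ALE frame of shape
# (210, 160, 3) uint8 comes out as a (1, 84, 84) float32 array scaled to [0, 1]:
#   frame = np.zeros((210, 160, 3), dtype=np.uint8)
#   preprocess(frame).shape  # -> (1, 84, 84)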
def df_ops(lst_df, seeds):
    """Append per-row 'Avg', 'High' and 'Low' columns computed across the seed columns of each DataFrame."""
for df in lst_df:
seed_data = df[seeds]
df['Avg'] = seed_data.mean(axis=1)
df['High'] = seed_data.max(axis=1)
df['Low'] = seed_data.min(axis=1)
return lst_df
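# Example sketch (hypothetical data), assuming seed columns 10 and 20:
#   df = pd.DataFrame({10: [1.0, 2.0], 20: [3.0, 4.0]})
#   [df] = df_ops([df], [10, 20])
#   df['Avg'] -> [2.0, 3.0]; df['High'] -> [3.0, 4.0]; df['Low'] -> [1.0, 2.0]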
# Main loop
def main() -> int:
# Initialize variables
"""
batches = 5
steps = 5
clip_interval = 2
seeds = [10, 20]
ep_per_batch = 2
"""
    batches = 1000  # number of PPO updates per seed
    steps = 5  # unused here; re-initialised as a per-seed checkpoint list below
    clip_interval = 2
    seeds = [10, 20, 30, 40, 50]
    ep_per_batch = 5  # episodes collected per PPO update
    # Available methods: 'vanilla', 'reward_clip', 'rbs', 'grad_clip',
    # 'obs_norm', 'adv_norm', 'return_norm', 'reward_norm'
    """
    Usage examples:
        python3 ppo_main.py --method vanilla
        python3 ppo_main.py --method grad_clip
        python3 ppo_main.py --method rbs
        python Poster/ppo_main.py --method vanilla --env ALE/Pacman-v5
    """
parser = argparse.ArgumentParser(description='PPO Training')
parser.add_argument('--method', type=str, choices=['vanilla', 'reward_clip', 'rbs', 'grad_clip',
'obs_norm', 'adv_norm', 'return_norm', 'reward_norm'],
default='vanilla', help='PPO update method')
parser.add_argument('--env', type=str, default='ALE/Pacman-v5',
help='Gym environment name (e.g., ALE/Pacman-v5, ALE/SpaceInvaders-v5, ALE/BattleZone-v5)')
parser.add_argument('--render', action='store_true', help='Enable rendering')
parser.add_argument('--clip_window', type=int, default=clip_interval,
help='Number of batches to collect rewards for clipping range update')
args = parser.parse_args()
# Set up environment
if args.render:
env = gym.make(args.env, render_mode='human')
else:
env = gym.make(args.env)
logger.info(f"Observation space: {env.observation_space}")
logger.info(f"Action space: {env.action_space}")
logger.info(f'Method: {args.method}')
# Initialize CNN with a dummy observation to get correct input shape
obs, _ = env.reset()
dummy_obs_space = Box(low=0.0, high=1.0, shape=preprocess(obs).shape)
# Initialize PPO agent
agent = Agent(obs_space=dummy_obs_space, action_space=env.action_space,
hidden=64, lr=0.00001, gamma=0.997, clip_coef=0.2,
entropy_coef=0.01, value_coef=0.5, seed=70,
batch_size=64, ppo_epochs=32, lam=0.95)
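    # Note: agent hyperparameters (lr, gamma, clip_coef, entropy/value coefficients, ...) are
    # fixed here rather than exposed as CLI flags; only --method, --env, --render and
    # --clip_window are configurable from the command line.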
# === Return-Based Scaling stats (for RBS method) ===
r_mean, r_var = 0.0, 1e-8
g2_mean = 1.0
agent.r_var = r_var
agent.g2_mean = g2_mean
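    # Running statistics for Return-Based Scaling; they are attached to the agent and are
    # presumably consumed by agent.update_rbs() in ppo_helpers_cnn, so only initial values
    # are set here. (r_mean is initialised but never attached to the agent.)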
# Initialize data structure outside the loop
    all_reward_histories = pd.DataFrame(columns=seeds, index=range(1, batches + 1))
    all_loss_histories = pd.DataFrame(columns=seeds, index=range(1, batches + 1))
    all_policy_loss = pd.DataFrame(columns=seeds)
    all_value_loss = pd.DataFrame(columns=seeds)
    # Main update loop
    episode, total_return = 0, 0.0  # defined up front so the summary in `finally` always has values
    try:
for seed in seeds:
obs, info = env.reset(seed=seed)
state = preprocess(obs)
loss_history = []
reward_history = []
policy_loss_history = []
value_loss_history = []
episode = 0
total_return = 0
            steps = [0]  # checkpoints into the agent's loss histories; one entry is appended per update
""" Update loop: Gradient, Reward Normalization """
if args.method == 'reward_clip':
alpha = np.random.uniform(1, 2)
logger.info(f"α sampled = {alpha:.3f} seed = {seed}")
clip_low, clip_high = None, None
ep_reward_history = []
obs, info = env.reset()
state = preprocess(obs)
for update in range(1, batches + 1):
batch_episode_returns = [] # used for μ, σ
for _ in range(ep_per_batch):
ep_rewards = []
done = False
while not done:
action, logp, value = agent.choose_action(state)
next_obs, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated
next_state = preprocess(next_obs)
ep_rewards.append(reward)
agent.remember(state, action, reward, done, logp, value, next_state)
state = next_state
if done:
ep_return = sum(ep_rewards)
if clip_low is not None:
clipped_return = np.clip(ep_return, clip_low, clip_high)
else:
clipped_return = ep_return
ep_reward_history.append(clipped_return)
batch_episode_returns.append(clipped_return)
episode += 1
total_return += clipped_return
logger.info(f"Episode {episode} return: {clipped_return:.2f}")
obs, info = env.reset()
state = preprocess(obs)
                    # === Compute clipping bounds for the next batch: mu +/- alpha*sigma over this batch's returns ===
mu = np.mean(batch_episode_returns)
                    sigma = np.std(batch_episode_returns)
                    sigma = sigma + 1e-8 if sigma != 0 else 1.0
clip_low = mu - alpha * sigma
clip_high = mu + alpha * sigma
logger.info(
f"[UPDATE {update}] New Reward Clip Range: "
f"[{clip_low:.4f}, {clip_high:.4f}]"
)
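                    # Worked example (hypothetical numbers): batch returns [10, 20, 30] with
                    # alpha = 1.5 give mu = 20 and sigma ~ 8.16, so the next batch's episode
                    # returns are clipped to roughly [7.75, 32.25].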
# === PPO UPDATE ===
avg_loss = agent.vanilla_ppo_update()
loss_history.append(avg_loss)
avg_ret = np.mean(batch_episode_returns)
reward_history.append(avg_ret)
logger.info(
f"Update {update}: batch_mean={avg_ret:.4f}, "
f"batch_std={np.std(batch_episode_returns):.4f}, "
f"episodes={episode}, avg_loss={avg_loss:.4f}"
)
                    # Record the mean value/policy loss accumulated during this update
                    current_steps = len(agent.value_loss_history)
                    steps.append(current_steps)
                    x = len(steps) - 1
                    n_new = max(steps[x] - steps[x - 1], 1)
                    value_loss_history.append(
                        sum(agent.value_loss_history[steps[x - 1]:steps[x]]) / n_new)
                    policy_loss_history.append(
                        sum(agent.policy_loss_history[steps[x - 1]:steps[x]]) / n_new)
""" Update loop: Other Normalization Methods """
else:
for update in range(1, batches + 1):
                    batch_episode_rewards = []  # episode returns collected for this update
for _ in range(ep_per_batch):
ep_rewards = []
done = False
while not done:
action, logp, value = agent.choose_action(state)
next_obs, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated
next_state = preprocess(next_obs)
                            ep_rewards.append(reward)
agent.remember(state, action, reward, done, logp, value, next_state)
state = next_state
if done:
ep_return = sum(ep_rewards)
episode += 1
total_return += ep_return
batch_episode_rewards.append(ep_return)
logger.info(f"Episode {episode} return: {ep_return:.2f}")
obs, info = env.reset()
state = preprocess(obs)
# Choose normalization method
if args.method == 'vanilla':
avg_loss = agent.vanilla_ppo_update()
elif args.method == 'grad_clip':
avg_loss = agent.update_gradient_clipping()
elif args.method == 'obs_norm':
avg_loss = agent.update_obs_norm()
elif args.method == 'return_norm':
avg_loss = agent.update_return_norm()
elif args.method == 'reward_norm':
avg_loss = agent.update_reward_norm()
                    else:  # 'rbs' (note: 'adv_norm' has no dedicated branch and also lands here)
avg_loss = agent.update_rbs()
loss_history.append(avg_loss)
avg_ret = (total_return / episode) if episode else 0
reward_history.append(avg_ret)
logger.info(
f"Update {update}: episodes={episode}, avg_return={avg_ret:.2f}, avg_loss={avg_loss:.4f}")
                    # Record the mean value/policy loss accumulated during this update
                    current_steps = len(agent.value_loss_history)
                    steps.append(current_steps)
                    x = len(steps) - 1
                    n_new = max(steps[x] - steps[x - 1], 1)
                    value_loss_history.append(
                        sum(agent.value_loss_history[steps[x - 1]:steps[x]]) / n_new)
                    policy_loss_history.append(
                        sum(agent.policy_loss_history[steps[x - 1]:steps[x]]) / n_new)
all_reward_histories[seed] = reward_history
all_loss_histories[seed] = loss_history
all_value_loss[seed] = value_loss_history[1:]
all_policy_loss[seed] = policy_loss_history[1:]
[all_reward_histories, all_loss_histories, all_value_loss, all_policy_loss] = df_ops([all_reward_histories,
all_loss_histories,
all_value_loss,
all_policy_loss], seeds)
all_policy_loss.to_csv(args.method + '_policy_loss.csv')
all_reward_histories.to_csv(args.method + '_reward_history.csv')
all_loss_histories.to_csv(args.method + '_loss_history.csv')
all_value_loss.to_csv(args.method + '_value_loss.csv')
fig = plt.figure(figsize=(15, 10))
# --- Subplot 1: Average PPO Loss ---
ax2 = plt.subplot(221)
# Plot the shaded High-Low Range
ax2.fill_between(
all_loss_histories.index,
all_loss_histories['Low'],
all_loss_histories['High'],
color='#A8DADC', # Light blue for aesthetic shading
alpha=0.5,
label="High-Low Range"
)
# Plot the Average Line
ax2.plot(all_loss_histories['Avg'], label="Avg Loss", color='#1D3557', linewidth=2)
ax2.set_ylabel("Average PPO Loss")
ax2.set_xlabel("PPO Update")
ax2.legend()
# --- Subplot 2: Reward ---
ax3 = plt.subplot(222)
# Plot the shaded High-Low Range
ax3.fill_between(
all_reward_histories.index,
all_reward_histories['Low'],
all_reward_histories['High'],
color='#FEDCC8', # Light orange/peach
alpha=0.5,
label="High-Low Range"
)
# Plot the Average Line
ax3.plot(all_reward_histories['Avg'], label="Avg Reward", color='#E63946', linewidth=2)
ax3.set_ylabel("Average Reward")
ax3.set_xlabel("PPO Update")
ax3.legend()
# --- Subplot 3: Policy Loss ---
ax4 = plt.subplot(223)
# Plot the shaded High-Low Range
ax4.fill_between(
all_policy_loss.index,
all_policy_loss['Low'],
all_policy_loss['High'],
color='#B0E0A0', # Light green
alpha=0.5,
label="High-Low Range"
)
# Plot the Average Line
ax4.plot(all_policy_loss['Avg'], label="Policy Loss", color='#38B000', linewidth=2)
ax4.set_ylabel("Average Policy Loss")
ax4.set_xlabel("PPO Update")
ax4.legend()
# --- Subplot 4: Value Loss ---
ax5 = plt.subplot(224)
# Plot the shaded High-Low Range
ax5.fill_between(
all_value_loss.index,
all_value_loss['Low'],
all_value_loss['High'],
color='#D7BDE2', # Light purple
alpha=0.5,
label="High-Low Range"
)
# Plot the Average Line
ax5.plot(all_value_loss['Avg'], label="Value Loss", color='#8E44AD', linewidth=2)
ax5.set_ylabel("Average Value Loss")
ax5.set_xlabel("PPO Update")
ax5.legend()
# --- Figure Settings ---
fig.suptitle(f"PPO Training Stability - {args.method}", fontsize=16, fontweight='bold')
# fig.tight_layout() # Adjust layout to make room for suptitle
plt.show()
except Exception as e:
logger.error(f"Error: {e}", exc_info=True)
return 1
finally:
avg = total_return / episode if episode else 0
logger.info(f"\nEpisodes: {episode}, Avg return: {avg:.3f}")
env.close()
return 0
if __name__ == "__main__":
raise SystemExit(main())