# Car_Race_AI_V5 / Train.py
# privateboss's picture
# Upload 5 files
# 8ef51bf verified
import tensorflow as tf
import os
import json
from PPO_Model import PPOAgent
from datetime import datetime
# GPU setup: ask TensorFlow to grow GPU memory on demand instead of reserving
# it all up front. This has to run before anything initializes the GPUs.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth is requested for every visible GPU in turn.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Use the stable (non-experimental) listing API, matching the
        # list_physical_devices call above.
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Per the TensorFlow GPU guide, memory growth cannot be changed once
        # the GPUs have been initialized; report the problem and carry on
        # with TensorFlow's default allocation behaviour.
        print(e)
# Keyword arguments forwarded verbatim to PPOAgent(**agent_config).
# NOTE(review): the meaning of each entry is defined by PPOAgent, which is not
# visible from this file; the group headings below follow the usual PPO
# vocabulary suggested by the key names -- confirm against PPO_Model.
agent_config = {
    # --- Environment ---
    "env_id": "CarRacing-v3",
    "num_envs": 21,

    # --- Return / advantage estimation ---
    "gamma": 0.99,
    "lam": 0.95,

    # --- Optimisation ---
    "clip_epsilon": 0.2,
    "actor_lr": 3e-4,
    "critic_lr": 3e-4,
    "ppo_epochs": 10,
    "minibatches": 4,
    "steps_per_batch": 1024,

    # --- Observation preprocessing ---
    "num_stack_frames": 4,
    "resize_dim": (84, 84),
    "grayscale": True,

    # --- Reproducibility and logging ---
    "seed": 42,
    # Base directory only: the entry point overwrites this with the per-run
    # directory before the agent is constructed.
    "log_dir": "./ppo_car_racing_logs",

    # --- Regularisation, checkpointing, network size ---
    "entropy_coeff": 0.001,
    # 537600 == 21 * 1024 * 25, i.e. 25 * num_envs * steps_per_batch.
    "save_interval_timesteps": 537600,
    "hidden_layer_sizes": [512, 512, 512],
}

# When True and RESUME_CONFIG_FILE exists, the entry point tries to continue
# from the checkpoint recorded in that file instead of starting a new run.
RESUME_TRAINING_FLAG = True
# JSON file read at start-up for the last timestep, checkpoint path and log
# directory of the previous run.
RESUME_CONFIG_FILE = "resume_config.json"
def _discard_resume_config(config_path):
    """Delete an unusable resume file; warn instead of failing if it cannot be removed."""
    try:
        os.remove(config_path)
    except FileNotFoundError:
        pass  # Already gone, nothing to clean up.
    except OSError as e:
        print(f"WARNING: Could not remove resume config file '{config_path}': {e}")


def load_resume_state(config_path):
    """Read the resume file and return the state to continue training from.

    Args:
        config_path: Path of the JSON resume file. It is expected to hold an
            object with the keys ``last_global_timestep``,
            ``last_checkpoint_path`` and ``run_log_directory``.

    Returns:
        A ``(resume_from_timestep, resume_model_path, run_log_directory)``
        tuple. ``(0, None, None)`` is returned (meaning "start a new run")
        when the file does not exist, cannot be read or parsed, is not a JSON
        object, or names a checkpoint path that is not an existing path. In
        every one of those cases except "file does not exist", the unusable
        file is deleted so the next start does not trip over it again.
    """
    fresh_start = (0, None, None)
    if not os.path.exists(config_path):
        return fresh_start

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            resume_info = json.load(f)
        # Valid JSON is not necessarily an object ("[]", "null", "3" all
        # parse); anything else has no .get() and cannot be a resume record.
        if not isinstance(resume_info, dict):
            raise ValueError(
                f"expected a JSON object, got {type(resume_info).__name__}"
            )
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"WARNING: Failed to read or parse resume config file. Starting a new run. Error: {e}")
        _discard_resume_config(config_path)
        return fresh_start

    resume_from_timestep = resume_info.get("last_global_timestep", 0)
    resume_model_path = resume_info.get("last_checkpoint_path", None)
    run_log_directory = resume_info.get("run_log_directory", None)
    print(f"Found resume config: Will attempt to resume from timestep {resume_from_timestep}")
    print(f"Loading model from: {resume_model_path}")
    print(f"Continuing logging in directory: {run_log_directory}")

    # Require a real string path: os.path.exists() would raise on some
    # non-string JSON values and treat integers as file descriptors.
    checkpoint_usable = (
        isinstance(resume_model_path, str)
        and bool(resume_model_path)
        and os.path.exists(resume_model_path)
    )
    if not checkpoint_usable:
        print("WARNING: Resume model path invalid or not found. Starting a new run.")
        _discard_resume_config(config_path)
        return fresh_start

    return resume_from_timestep, resume_model_path, run_log_directory


def main():
    """Build the PPO agent and train it, resuming a previous run when possible.

    Resuming is attempted only when RESUME_TRAINING_FLAG is set. When no log
    directory is carried over from a previous run, a new timestamped
    directory below ``agent_config["log_dir"]`` is used. The chosen directory
    is written back into ``agent_config`` before the agent is constructed.

    Raises:
        Exception: Any error raised by ``agent.train`` other than
            KeyboardInterrupt is reported and then re-raised, so the process
            exits with a traceback and a non-zero status.
    """
    resume_from_timestep, resume_model_path, run_log_directory = 0, None, None
    if RESUME_TRAINING_FLAG:
        resume_from_timestep, resume_model_path, run_log_directory = load_resume_state(
            RESUME_CONFIG_FILE
        )

    if run_log_directory is None:
        current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
        run_log_directory = os.path.join(agent_config["log_dir"], current_time)
        if resume_model_path is None:
            print(f"No valid resume config found. Starting a new run in: {run_log_directory}")
        else:
            # The checkpoint is valid, only the log directory was missing from
            # the resume file: training still resumes, with fresh logs.
            print(f"Resume config has no log directory. Resuming with new logs in: {run_log_directory}")

    agent_config["log_dir"] = run_log_directory
    print("Initializing PPO Agent...")
    agent = PPOAgent(**agent_config)
    total_timesteps = 30_000_000

    try:
        agent.train(total_timesteps,
                    resume_from_timestep=resume_from_timestep,
                    resume_model_path=resume_model_path,
                    run_log_dir=run_log_directory)
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop a long run, so exit quietly.
        print("\nTraining interrupted by user. Saving current state for resume...")
        print("State likely saved by periodic checkpointing. Exiting.")
    except Exception as e:
        print(f"\nAn error occurred during training: {e}")
        # Nothing is saved at this point, so say so instead of claiming a
        # save attempt, and re-raise to keep the traceback and exit status.
        print("State likely saved by periodic checkpointing. Re-raising the error.")
        raise
    finally:
        print(f"\nTraining session ended. TensorBoard logs available at: tensorboard --logdir {agent.train_log_dir}")
        print("To view logs, run the above command in your terminal.")


if __name__ == "__main__":
    main()
#I