import gymnasium as gym
import numpy
import numpy as np
from Qlearning_pole import Qlearning
import os

# Rendering the environment
# env=gym.make('CartPole-v1',render_mode='human')


Q1 = Qlearning()
# run the Q-Learning algorithm
Q1.train()
# simulate the learned strategy
(obtainedRewardsOptimal, env1) = Q1.simulateLearnedStrategy()

# close the environment
env1.close()
# get the sum of rewards
np.sum(obtainedRewardsOptimal)
import matplotlib.pyplot as plt
# now simulate a random strategy
(obtainedRewardsRandom, env2) = Q1.simulateRandomStrategy()
plt.figure(figsize=(12, 5))
# plot the figure and adjust the plot parameters
numpy.save("Qmatrix_new.npy",Q1.Q)
plt.plot(Q1.sumRewardsEpisode, color='blue', linewidth=1)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.yscale('log')
plt.savefig('convergence.png')
plt.title("Convergence of rewards")
plt.show()


# close the environment
env1.close()
# get the sum of rewards
np.sum(obtainedRewardsOptimal)

# now simulate a random strategy
obtainedRewardsRandom = []
for i in range(50):
    (rewardsRandom, env2) = Q1.simulateRandomStrategy()
    obtainedRewardsRandom.append(rewardsRandom)
plt.title("Rewards with random strategy")
plt.hist(obtainedRewardsRandom)
plt.xlabel('Sum of rewards')
plt.ylabel('Percentage')
plt.savefig('histogram.png')
plt.show()

# run this several times and compare with a random learning strategy
(obtainedRewardsOptimal, env1) = Q1.simulateLearnedStrategy()