File size: 9,496 Bytes
5960497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize
import numpy as np
from itertools import combinations

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None


class Model(object):
    """
    We use this object to :
    __init__:
    - Creates the step_model
    - Creates the train_model

    train():
    - Make the training part (feedforward and retropropagation of gradients)

    save/load():
    - Save load the model
    """

    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, skill_coef, vq_coef, max_grad_norm, alpha1=20, alpha2=1, beta=0.25,
                 Cd=5, mpi_rank_weight=1, comm=None,
                 microbatch_size=None):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.compat.v1.variable_scope('sppo2_model', reuse=tf.compat.v1.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.compat.v1.placeholder(tf.float32, [None])
        self.R = R = tf.compat.v1.placeholder(tf.float32, [None])
        self.skill_weight = skill_weight = tf.compat.v1.placeholder(tf.float32, [])

        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None])
        self.LR = LR = tf.compat.v1.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [])

        self.train_skill = train_skill = tf.compat.v1.placeholder(tf.bool, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # calculating the skill loss
        # To avoid the GPU memory exploded, please choose the proper chunk size for your GPUs
        chunk_size = 64

        def pairwise_sqd_distance(x):
            if x.shape[-1] <= chunk_size:
                tiled = tf.tile(tf.expand_dims(x, axis=1), tf.stack([1, x.shape[0], 1]))
                tiled_trans = tf.transpose(tiled, perm=[1, 0, 2])
                diffs = tiled - tiled_trans
                sqd_dist_mat = tf.reduce_sum(tf.square(diffs), axis=2)
            else:
                sqd_dist_mat = []
                for i in range(x.shape[-1] // chunk_size):
                    tiled = tf.tile(tf.expand_dims(x[:, i * chunk_size:i * chunk_size + chunk_size], axis=1),
                                    tf.stack([1, x.shape[0], 1]))

                    tiled_trans = tf.transpose(tiled, perm=[1, 0, 2])

                    diffs = tiled - tiled_trans
                    if sqd_dist_mat != []:
                        sqd_dist_mat = tf.add(sqd_dist_mat, tf.reduce_sum(tf.square(diffs), axis=2))
                    else:
                        sqd_dist_mat = tf.reduce_sum(tf.square(diffs), axis=2)

            return sqd_dist_mat

        def make_q(z, alpha):
            sqd_dist_mat = pairwise_sqd_distance(z)
            q = tf.pow((1 + sqd_dist_mat / alpha), -(alpha + 1) / 2)
            q = tf.linalg.set_diag(q, np.zeros(shape=[z.shape[0]]))
            q = q / tf.reduce_sum(q, axis=0, keepdims=True)
            q = tf.clip_by_value(q, 1e-10, 1.0)

            return q

        p = make_q(train_model.pure_latent, alpha=alpha1)
        q = make_q(train_model.skill_latent, alpha=alpha2)
        skill_losses = tf.reduce_sum(-(tf.multiply(p, tf.math.log(q)))) / (p.shape[0] * p.shape[1])
        skill_loss_ = tf.reduce_mean(skill_losses)

        # calculating the vq loss
        if beta != 0:
            commitment_loss = beta * tf.reduce_mean(
                input_tensor=(tf.stop_gradient(train_model.pure_vq_latent) - train_model.skill_latent) ** 2, axis=1
            )
        else:
            commitment_loss = 0
        codebook_loss = tf.reduce_mean(
            input_tensor=(train_model.pure_vq_latent - tf.stop_gradient(train_model.skill_latent)) ** 2, axis=1)

        vq_losses = commitment_loss + codebook_loss
        vq_loss_ = tf.reduce_mean(vq_losses)

        # Final PG loss
        pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE), dtype=tf.float32))

        # Total loss

        loss = tf.cond(train_skill, lambda: pg_loss - entropy * ent_coef + vf_loss * vf_coef + skill_weight * (
                skill_loss_ * skill_coef + vq_loss_ * vq_coef),
                       lambda: pg_loss - entropy * ent_coef + vf_loss * vf_coef)

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.compat.v1.trainable_variables('sppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['loss', 'policy_loss', 'value_loss', 'skill_loss', 'vq_loss',
                           'policy_entropy',
                           'approxkl',
                           'clipfrac']
        self.stats_list = [loss, pg_loss, vf_loss, skill_loss_, vq_loss_, entropy, approxkl,
                           clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.skill_step = act_model.skill_step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

    def train(self, lr, cliprange, train_skill, skill_weight, obs, returns,
              masks, actions,
              values,
              neglogpacs,
              states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X: obs,
            self.A: actions,
            self.ADV: advs,
            self.R: returns,
            self.LR: lr,
            self.train_skill: train_skill,
            self.skill_weight: skill_weight,
            self.CLIPRANGE: cliprange,
            self.OLDNEGLOGPAC: neglogpacs,
            self.OLDVPRED: values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]