[ { "train_loss": -0.15511070310068362, "train_policy_loss": 0.0, "train_kl_loss": -3.102214025259018, "train_reward": 0.30753333568573, "baseline": 0.3100000023841858, "epoch": 1 }, { "train_loss": -0.24350257394835353, "train_policy_loss": 0.0, "train_kl_loss": -5.263461359739304, "train_reward": 0.300500001758337, "baseline": 0.3100000023841858, "epoch": 2 } ]