const AgentSac = (() => {

  const assertShape = (tensor, shape, msg = '') => {
    console.assert(
      JSON.stringify(tensor.shape) === JSON.stringify(shape),
      msg + ' shape ' + tensor.shape + ' is not ' + shape)
  }

  const VERSION = 84

  const LOG_STD_MIN = -20
  const LOG_STD_MAX = 2
  const EPSILON = 1e-8
  const NAME = {
    ACTOR: 'actor',
    Q1: 'q1',
    Q2: 'q2',
    Q1_TARGET: 'q1-target',
    Q2_TARGET: 'q2-target',
    ALPHA: 'alpha'
  }

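  /**
   * Soft Actor-Critic (SAC) agent built on the TensorFlow.js layers API.
   * It owns the stochastic actor, the twin critics (Q1, Q2), their target
   * copies and the entropy temperature α, and can checkpoint all of them
   * to IndexedDB.
   */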
  return class AgentSac {
    constructor({
      batchSize = 1,
      frameShape = [25, 25, 3],
      nFrames = 1,       // number of stacked frames per state
      nActions = 3,      // 3 - impulse, 3 - RGB color
      nTelemetry = 10,   // 3 - linear velocity, 3 - acceleration, 3 - collision point, 1 - lidar (tanh of distance)
      gamma = 0.99,      // discount factor (γ)
      tau = 5e-3,        // target smoothing coefficient (τ)
      trainable = true,  // whether the actor is trainable
      verbose = false,
      forced = false,    // force creation of fresh models (ignore any checkpoint)
      prefix = '',       // used by tests
      sighted = true,    // whether the models receive camera frames in addition to telemetry
      rewardScale = 10
    } = {}) {
      this._batchSize = batchSize
      this._frameShape = frameShape
      this._nFrames = nFrames
      this._nActions = nActions
      this._nTelemetry = nTelemetry
      this._gamma = gamma
      this._tau = tau
      this._trainable = trainable
      this._verbose = verbose
      this._inited = false
      this._prefix = (prefix === '' ? '' : prefix + '-')
      this._forced = forced
      this._sighted = sighted
      this._rewardScale = rewardScale

      this._frameStackShape = [...this._frameShape.slice(0, 2), this._frameShape[2] * this._nFrames]

      // Target entropy heuristic: −dim(A)
      this._targetEntropy = -nActions
    }

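    /**
     * Builds the networks (actor, critics, targets) and their optimizers,
     * loading them from a checkpoint when one exists. Must be called once
     * before sampling actions or training.
     */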
    async init() {
      if (this._inited) throw Error('щ(゚Д゚щ)')

      this._frameInputL = tf.input({batchShape: [null, ...this._frameStackShape]})
      this._frameInputR = tf.input({batchShape: [null, ...this._frameStackShape]})

      this._telemetryInput = tf.input({batchShape: [null, this._nTelemetry]})

      this.actor = await this._getActor(this._prefix + NAME.ACTOR, this._trainable)

      if (!this._trainable)
        return

      this.actorOptimizer = tf.train.adam()

      this._actionInput = tf.input({batchShape: [null, this._nActions]})

      this.q1 = await this._getCritic(this._prefix + NAME.Q1)
      this.q1Optimizer = tf.train.adam()

      this.q2 = await this._getCritic(this._prefix + NAME.Q2)
      this.q2Optimizer = tf.train.adam()

      this.q1Targ = await this._getCritic(this._prefix + NAME.Q1_TARGET, true)
      this.q2Targ = await this._getCritic(this._prefix + NAME.Q2_TARGET, true)

      this._logAlpha = await this._getLogAlpha(this._prefix + NAME.ALPHA)
      this.alphaOptimizer = tf.train.adam()

      // Hard-copy the online critics into the targets (τ = 1).
      this.updateTargets(1)

      this._inited = true
    }

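    /**
     * Performs one SAC update step on a batch:
     *   critics:     J_Q = 1/2 · E[(Q(s,a) − y)²]   for Q1 and Q2
     *   actor:       J_π = E[α·log π(ã|s) − min(Q1, Q2)(s, ã)]
     *   temperature: J_α = E[−α·(log π(ã|s) + targetEntropy)]
     * followed by a Polyak update of the target critics.
     * `state` / `nextState` are arrays of input tensors: telemetry first,
     * followed by the stacked-frame tensors when the agent is sighted.
     */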
    train({ state, action, reward, nextState }) {
      if (!this._trainable)
        throw new Error('Actor is not trainable')

      return tf.tidy(() => {
        assertShape(state[0], [this._batchSize, this._nTelemetry], 'telemetry')
        assertShape(state[1], [this._batchSize, ...this._frameStackShape], 'frames')
        assertShape(action, [this._batchSize, this._nActions], 'action')
        assertShape(reward, [this._batchSize, 1], 'reward')
        assertShape(nextState[0], [this._batchSize, this._nTelemetry], 'nextState telemetry')
        assertShape(nextState[1], [this._batchSize, ...this._frameStackShape], 'nextState frames')

        this._trainCritics({ state, action, reward, nextState })
        this._trainActor(state)
        this._trainAlpha(state)

        this.updateTargets()
      })
    }

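    /**
     * Updates both critics towards the soft Bellman target
     *   y = rewardScale·r + γ·( min(Q1targ, Q2targ)(s', ã') − α·log π(ã'|s') )
     * where ã' is a fresh action sampled from the current policy at s'.
     * Note there is no terminal-state mask in this target.
     */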
    _trainCritics({ state, action, reward, nextState }) {
      const getQLossFunction = (() => {
        const [nextFreshAction, logPi] = this.sampleAction(nextState, true)

        const q1TargValue = this.q1Targ.predict(
          this._sighted ? [...nextState, nextFreshAction] : [nextState[0], nextFreshAction],
          {batchSize: this._batchSize})
        const q2TargValue = this.q2Targ.predict(
          this._sighted ? [...nextState, nextFreshAction] : [nextState[0], nextFreshAction],
          {batchSize: this._batchSize})

        const qTargValue = tf.minimum(q1TargValue, q2TargValue)

        const alpha = this._getAlpha()
        const target = reward.mul(tf.scalar(this._rewardScale)).add(
          tf.scalar(this._gamma).mul(
            qTargValue.sub(alpha.mul(logPi))
          )
        )

        assertShape(nextFreshAction, [this._batchSize, this._nActions], 'nextFreshAction')
        assertShape(logPi, [this._batchSize, 1], 'logPi')
        assertShape(qTargValue, [this._batchSize, 1], 'qTargValue')
        assertShape(target, [this._batchSize, 1], 'target')

        return (q) => () => {
          const qValue = q.predict(
            this._sighted ? [...state, action] : [state[0], action],
            {batchSize: this._batchSize})

          const loss = tf.scalar(0.5).mul(tf.mean(qValue.sub(target).square()))

          assertShape(qValue, [this._batchSize, 1], 'qValue')

          return loss
        }
      })()

      for (const [q, optimizer] of [
        [this.q1, this.q1Optimizer],
        [this.q2, this.q2Optimizer]
      ]) {
        const qLossFunction = getQLossFunction(q)

        const { value, grads } = tf.variableGrads(qLossFunction, q.getWeights(true))

        optimizer.applyGradients(grads)

        if (this._verbose) console.log(q.name + ' Loss: ' + value.arraySync())
      }
    }

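    /**
     * Updates the actor by minimising
     *   J_π = E[ α·log π(ã|s) − min(Q1, Q2)(s, ã) ]
     * with ã sampled via the reparameterisation trick, so gradients flow
     * through the sampled action into the actor weights.
     */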
    _trainActor(state) {
      const actorLossFunction = () => {
        const [freshAction, logPi] = this.sampleAction(state, true)

        const q1Value = this.q1.predict(
          this._sighted ? [...state, freshAction] : [state[0], freshAction],
          {batchSize: this._batchSize})
        const q2Value = this.q2.predict(
          this._sighted ? [...state, freshAction] : [state[0], freshAction],
          {batchSize: this._batchSize})

        const criticValue = tf.minimum(q1Value, q2Value)

        const alpha = this._getAlpha()
        const loss = alpha.mul(logPi).sub(criticValue)

        assertShape(freshAction, [this._batchSize, this._nActions], 'freshAction')
        assertShape(logPi, [this._batchSize, 1], 'logPi')
        assertShape(q1Value, [this._batchSize, 1], 'q1Value')
        assertShape(criticValue, [this._batchSize, 1], 'criticValue')
        assertShape(loss, [this._batchSize, 1], 'actor loss')

        return tf.mean(loss)
      }

      const { value, grads } = tf.variableGrads(actorLossFunction, this.actor.getWeights(true))

      this.actorOptimizer.applyGradients(grads)

      if (this._verbose) console.log('Actor Loss: ' + value.arraySync())
    }

    _trainAlpha(state) {
      const alphaLossFunction = () => {
        const [, logPi] = this.sampleAction(state, true)

        const alpha = this._getAlpha()
        const loss = tf.scalar(-1).mul(
          alpha.mul(
            logPi.add(tf.scalar(this._targetEntropy))
          )
        )

        assertShape(loss, [this._batchSize, 1], 'alpha loss')

        return tf.mean(loss)
      }

      const { value, grads } = tf.variableGrads(alphaLossFunction, [this._logAlpha])

      this.alphaOptimizer.applyGradients(grads)

      if (this._verbose) console.log('Alpha Loss: ' + value.arraySync(), tf.exp(this._logAlpha).arraySync())
    }

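    /**
     * Polyak-averages the online critics into the target critics:
     *   θ_targ ← (1 − τ)·θ_targ + τ·θ
     * Called with τ = 1 at init to hard-copy the weights, and with the
     * default τ after every training step.
     */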
    updateTargets(tau = this._tau) {
      tau = tf.scalar(tau)

      const
        q1W = this.q1.getWeights(),
        q2W = this.q2.getWeights(),
        q1WTarg = this.q1Targ.getWeights(),
        q2WTarg = this.q2Targ.getWeights(),
        len = q1W.length

      const calc = (w, wTarg) => wTarg.mul(tf.scalar(1).sub(tau)).add(w.mul(tau))

      const w1 = [], w2 = []
      for (let i = 0; i < len; i++) {
        w1.push(calc(q1W[i], q1WTarg[i]))
        w2.push(calc(q2W[i], q2WTarg[i]))
      }

      this.q1Targ.setWeights(w1)
      this.q2Targ.setWeights(w2)
    }

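    /**
     * Samples an action from the squashed Gaussian policy using the
     * reparameterisation trick: u = μ + σ·ε with ε ~ N(0, 1), a = tanh(u).
     * Returns the action, and optionally its log-probability (needed for
     * the critic, actor and α updates).
     */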
    sampleAction(state, withLogProbs = false) {
      return tf.tidy(() => {
        let [mu, logStd] = this.actor.predict(this._sighted ? state : state[0], {batchSize: this._batchSize})

        logStd = tf.clipByValue(logStd, LOG_STD_MIN, LOG_STD_MAX)

        const std = tf.exp(logStd)

        const normal = tf.randomNormal(mu.shape, 0, 1.0)

        let pi = mu.add(std.mul(normal))

        let logPi = this._gaussianLikelihood(pi, mu, logStd)

        ;({ pi, logPi } = this._applySquashing(pi, mu, logPi))

        if (!withLogProbs)
          return pi

        return [pi, logPi]
      })
    }

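    /**
     * Element-wise log-density of a diagonal Gaussian:
     *   log N(x; μ, σ) = −1/2·((x − μ)/σ)² − log σ − 1/2·log(2π)
     * (Currently unused; _gaussianLikelihood below is used instead.)
     */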
    _logProb(x, mu, std) {
      const logUnnormalized = tf.scalar(-0.5).mul(
        tf.squaredDifference(x.div(std), mu.div(std))
      )
      const logNormalization = tf.scalar(0.5 * Math.log(2 * Math.PI)).add(tf.log(std))

      return logUnnormalized.sub(logNormalization)
    }

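    /**
     * Log-likelihood of x under a diagonal Gaussian with mean μ and
     * log standard deviation logStd, summed over the action dimensions:
     *   log N(x; μ, σ) = Σ_i −1/2·( ((x_i − μ_i)/σ_i)² + 2·logStd_i + log(2π) )
     * EPSILON guards against division by a zero standard deviation.
     * Returns a [batchSize, 1] tensor.
     */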
    _gaussianLikelihood(x, mu, logStd) {
      const preSum = tf.scalar(-0.5).mul(
        x.sub(mu).div(
          tf.exp(logStd).add(tf.scalar(EPSILON))
        ).square()
        .add(tf.scalar(2).mul(logStd))
        .add(tf.scalar(Math.log(2 * Math.PI)))
      )

      return tf.sum(preSum, 1, true)
    }

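    /**
     * Squashes the pre-activation sample and mean through tanh and applies
     * the change-of-variables correction to the log-probability:
     *   log π(a|s) = log μ(u|s) − Σ_i log(1 − tanh(u_i)²)
     * computed in the numerically stable form
     *   log(1 − tanh(u)²) = 2·(log 2 − u − softplus(−2u))
     */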
    _applySquashing(pi, mu, logPi) {
      const adj = tf.scalar(2).mul(
        tf.scalar(Math.log(2))
          .sub(pi)
          .sub(tf.softplus(
            tf.scalar(-2).mul(pi)
          ))
      )

      logPi = logPi.sub(tf.sum(adj, 1, true))
      mu = tf.tanh(mu)
      pi = tf.tanh(pi)

      return { pi, mu, logPi }
    }

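    /**
     * Builds (or restores from a checkpoint) the actor network. Telemetry
     * and, when sighted, the two convolutional frame encodings are
     * concatenated and passed through two dense layers; the heads output
     * μ and log σ for each action dimension.
     */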
    async _getActor(name = 'actor', trainable = true) {
      const checkpoint = await this._loadCheckpoint(name)
      if (checkpoint) return checkpoint

      let outputs = this._telemetryInput

      if (this._sighted) {
        let convOutputL = this._getConvEncoder(this._frameInputL)
        let convOutputR = this._getConvEncoder(this._frameInputR)

        outputs = tf.layers.concatenate().apply([convOutputL, convOutputR, outputs])
      }

      outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)
      outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)

      const mu = tf.layers.dense({units: this._nActions}).apply(outputs)
      const logStd = tf.layers.dense({units: this._nActions}).apply(outputs)

      const model = tf.model({
        inputs: this._sighted
          ? [this._telemetryInput, this._frameInputL, this._frameInputR]
          : [this._telemetryInput],
        outputs: [mu, logStd],
        name
      })
      model.trainable = trainable

      if (this._verbose) {
        console.log('==========================')
        console.log('==========================')
        console.log('Actor ' + name + ': ')

        model.summary()
      }

      return model
    }

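    /**
     * Builds (or restores from a checkpoint) a critic network. The action
     * is concatenated with telemetry (and the frame encodings when sighted)
     * and passed through two dense layers to a single Q-value output.
     */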
    async _getCritic(name = 'critic', trainable = true) {
      const checkpoint = await this._loadCheckpoint(name)
      if (checkpoint) return checkpoint

      let outputs = tf.layers.concatenate().apply([this._telemetryInput, this._actionInput])

      if (this._sighted) {
        let convOutputL = this._getConvEncoder(this._frameInputL)
        let convOutputR = this._getConvEncoder(this._frameInputR)

        outputs = tf.layers.concatenate().apply([convOutputL, convOutputR, outputs])
      }

      outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)
      outputs = tf.layers.dense({units: 256, activation: 'relu'}).apply(outputs)

      outputs = tf.layers.dense({units: 1}).apply(outputs)

      const model = tf.model({
        inputs: this._sighted
          ? [this._telemetryInput, this._frameInputL, this._frameInputR, this._actionInput]
          : [this._telemetryInput, this._actionInput],
        outputs, name
      })

      model.trainable = trainable

      if (this._verbose) {
        console.log('==========================')
        console.log('==========================')
        console.log('CRITIC ' + name + ': ')

        model.summary()
      }

      return model
    }

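    /**
     * Small convolutional encoder for a stack of frames: two conv + max-pool
     * blocks followed by a flatten, producing a feature vector that is
     * concatenated with the other inputs.
     */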
    _getConvEncoder(inputs) {
      const kernelSize = 3
      const padding = 'valid'
      const poolSize = 3
      const strides = 1

      const kernelInitializer = 'glorotNormal'
      const biasInitializer = 'glorotNormal'

      let outputs = inputs

      outputs = tf.layers.conv2d({
        filters: 16,
        kernelSize: 5,
        strides: 2,
        padding,
        kernelInitializer,
        biasInitializer,
        activation: 'relu',
        trainable: true
      }).apply(outputs)
      outputs = tf.layers.maxPooling2d({poolSize: 2}).apply(outputs)

      outputs = tf.layers.conv2d({
        filters: 16,
        kernelSize: 3,
        strides: 1,
        padding,
        kernelInitializer,
        biasInitializer,
        activation: 'relu',
        trainable: true
      }).apply(outputs)
      outputs = tf.layers.maxPooling2d({poolSize: 2}).apply(outputs)

      outputs = tf.layers.flatten().apply(outputs)

      return outputs
    }

    _getAlpha() {
      return tf.exp(this._logAlpha)
    }

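    /**
     * Returns log α as a trainable tf.variable, restoring its value from a
     * checkpoint when one exists. The scalar is mirrored into a tiny
     * single-weight dense model (this._logAlphaPlaceholder) so it can be
     * saved and loaded through the same checkpoint path as the other models.
     */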
    async _getLogAlpha(name = 'alpha') {
      let logAlpha = 0.0

      const checkpoint = await this._loadCheckpoint(name)
      if (checkpoint) {
        logAlpha = checkpoint.getWeights()[0].arraySync()[0][0]

        if (this._verbose)
          console.log('Checkpoint alpha: ', logAlpha)

        this._logAlphaPlaceholder = checkpoint
      } else {
        const model = tf.sequential({ name })
        model.add(tf.layers.dense({ units: 1, inputShape: [1], useBias: false }))
        model.setWeights([tf.tensor([logAlpha], [1, 1])])

        this._logAlphaPlaceholder = model
      }

      return tf.variable(tf.scalar(logAlpha), true)
    }

    async checkpoint() {
      if (!this._trainable) throw new Error('(╭ರ_ ⊙ )')

      // Sync the current log α into its placeholder model before saving.
      this._logAlphaPlaceholder.setWeights([tf.tensor([this._logAlpha.arraySync()], [1, 1])])

      await Promise.all([
        this._saveCheckpoint(this.actor),
        this._saveCheckpoint(this.q1),
        this._saveCheckpoint(this.q2),
        this._saveCheckpoint(this.q1Targ),
        this._saveCheckpoint(this.q2Targ),
        this._saveCheckpoint(this._logAlphaPlaceholder)
      ])

      if (this._verbose)
        console.log('Checkpoint successfully saved')
    }

    async _saveCheckpoint(model) {
      const key = this._getChKey(model.name)
      const saveResults = await model.save(key)

      if (this._verbose)
        console.log('Checkpoint saveResults', model.name, saveResults)
    }

    async _loadCheckpoint(name) {
      if (this._forced) {
        console.log('Forced to not load from the checkpoint ' + name)
        return
      }

      const key = this._getChKey(name)
      const modelsInfo = await tf.io.listModels()

      if (key in modelsInfo) {
        const model = await tf.loadLayersModel(key)

        if (this._verbose)
          console.log('Loaded checkpoint for ' + name)

        return model
      }

      if (this._verbose)
        console.log('Checkpoint not found for ' + name)
    }

    _getChKey(name) {
      return 'indexeddb://' + name + '-' + VERSION
    }
  }
})()

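// Minimal usage sketch (illustration only, not part of the original tests):
// `tf` is assumed to be the global TensorFlow.js namespace, and the tensors
// are assumed to match the shapes asserted in train().
//
//   const agent = new AgentSac({ batchSize: 1 })
//   await agent.init()
//   const state = [
//     tf.randomNormal([1, 10]),        // telemetry
//     tf.randomNormal([1, 25, 25, 3]), // left frame stack
//     tf.randomNormal([1, 25, 25, 3])  // right frame stack
//   ]
//   const action = agent.sampleAction(state)                 // [1, nActions]
//   const reward = tf.randomNormal([1, 1])
//   agent.train({ state, action, reward, nextState: state }) // one SAC step
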
;(async () => {
  // Smoke tests. The early return below disables them; remove it to run.
  return

  ;(() => {
    const agent = new AgentSac()

    const
      mu = tf.tensor([0], [1, 1]),
      logStd = tf.tensor([0], [1, 1]),
      std = tf.exp(logStd),
      normal = tf.tensor([0], [1, 1]),
      pi = mu.add(std.mul(normal))

    const log = agent._gaussianLikelihood(pi, mu, logStd)

    console.assert(log.arraySync()[0][0].toFixed(5) === '-0.91894',
      'test Gaussian Likelihood for μ=0, σ=1, x=0')
  })()

  ;(() => {
    const agent = new AgentSac()

    const
      mu = tf.tensor([1], [1, 1]),
      logStd = tf.tensor([1], [1, 1]),
      std = tf.exp(logStd),
      normal = tf.tensor([0], [1, 1]),
      pi = mu.add(std.mul(normal))

    const log = agent._gaussianLikelihood(pi, mu, logStd)

    console.assert(log.arraySync()[0][0].toFixed(5) === '-1.91894',
      'test Gaussian Likelihood for μ=1, σ=e, x=1')
  })()

  ;(() => {
    const agent = new AgentSac()

    const
      mu = tf.tensor([1], [1, 1]),
      logStd = tf.tensor([1], [1, 1]),
      std = tf.exp(logStd),
      normal = tf.tensor([0.1], [1, 1]),
      pi = mu.add(std.mul(normal))

    const logPi = agent._gaussianLikelihood(pi, mu, logStd)
    const { pi: piSquashed, logPi: logPiSquashed } = agent._applySquashing(pi, mu, logPi)

    // Reference value: the exact (less numerically stable) tanh correction.
    const logProbBounded = logPi.sub(
      tf.log(
        tf.scalar(1)
          .sub(tf.tanh(pi).pow(tf.scalar(2)))
      )
    ).sum(1, true)

    console.assert(logPi.arraySync()[0][0].toFixed(5) === '-1.92394',
      'test Gaussian Likelihood for μ=1, σ=e, x≈1.27182818')

    console.assert(logPiSquashed.arraySync()[0][0].toFixed(5) === logProbBounded.arraySync()[0][0].toFixed(5),
      'test logPiSquashed for μ=1, σ=e, x≈1.27182818')

    console.assert(piSquashed.arraySync()[0][0].toFixed(5) === tf.tanh(pi).arraySync()[0][0].toFixed(5),
      'test piSquashed for μ=1, σ=e, x≈1.27182818')
  })()

  await (async () => {
    const state = tf.tensor([
      0.5, 0.3, -0.9,
      0, -0.8, 1,
      -0.3, 0.04, 0.02,
      0.9
    ], [1, 10])

    const action = tf.tensor([
      0.1, -1, -0.4,
      1, -0.8, -0.8, -0.2,
      0.04, 0.02, 0.001
    ], [1, 10])

    const fresh = new AgentSac({ prefix: 'test', forced: true })
    await fresh.init()
    await fresh.checkpoint()

    const saved = new AgentSac({ prefix: 'test' })
    await saved.init()

    let frPred, saPred

    frPred = fresh.actor.predict(state, {batchSize: 1})
    saPred = saved.actor.predict(state, {batchSize: 1})
    console.assert(
      frPred[0].arraySync().length > 0 &&
      frPred[1].arraySync().length > 0 &&
      frPred[0].arraySync().join(';') === saPred[0].arraySync().join(';') &&
      frPred[1].arraySync().join(';') === saPred[1].arraySync().join(';'),
      'Models loaded from the checkpoint should be the same')

    frPred = fresh.q1.predict([state, action], {batchSize: 1})
    saPred = fresh.q1Targ.predict([state, action], {batchSize: 1})
    console.assert(
      frPred.arraySync()[0][0] !== undefined &&
      frPred.arraySync()[0][0] === saPred.arraySync()[0][0],
      'Q1 and Q1-target should be the same')

    frPred = fresh.q2.predict([state, action], {batchSize: 1})
    saPred = saved.q2.predict([state, action], {batchSize: 1})
    console.assert(
      frPred.arraySync()[0][0] !== undefined &&
      frPred.arraySync()[0][0] === saPred.arraySync()[0][0],
      'Q2 and Q2 restored should be the same')

    console.assert(
      fresh._logAlpha.arraySync() !== undefined &&
      fresh._logAlpha.arraySync() === saved._logAlpha.arraySync(),
      'Alpha and alpha restored should be the same')
  })()
})()