Spaces:
Runtime error
Runtime error
| # Copyright 2017 Google, Inc. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ============================================================================== | |
| """Generates toy optimization problems. | |
| This module contains a base class, Problem, that defines a minimal interface | |
| for optimization problems, and a few specific problem types that subclass it. | |
| Test functions for optimization: http://www.sfu.ca/~ssurjano/optimization.html | |
| """ | |
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| import numpy as np | |
| import tensorflow as tf | |
| from learned_optimizer.problems import problem_spec as prob_spec | |
| tf.app.flags.DEFINE_float("l2_reg_scale", 1e-3, | |
| """Scaling factor for parameter value regularization | |
| in softmax classifier problems.""") | |
| FLAGS = tf.app.flags.FLAGS | |
| EPSILON = 1e-6 | |
| MAX_SEED = 4294967295 | |
| PARAMETER_SCOPE = "parameters" | |
| _Spec = prob_spec.Spec | |
| class Problem(object): | |
| """Base class for optimization problems. | |
| This defines an interface for optimization problems, including objective and | |
| gradients functions and a feed_generator function that yields data to pass to | |
| feed_dict in tensorflow. | |
| Subclasses of Problem must (at the minimum) override the objective method, | |
| which computes the objective/loss/cost to minimize, and specify the desired | |
| shape of the parameters in a list in the param_shapes attribute. | |
| """ | |
| def __init__(self, param_shapes, random_seed, noise_stdev, init_fn=None): | |
| """Initializes a global random seed for the problem. | |
| Args: | |
| param_shapes: A list of tuples defining the expected shapes of the | |
| parameters for this problem | |
| random_seed: Either an integer (or None, in which case the seed is | |
| randomly drawn) | |
| noise_stdev: Strength (standard deviation) of added gradient noise | |
| init_fn: A function taking a tf.Session object that is used to | |
| initialize the problem's variables. | |
| Raises: | |
| ValueError: If the random_seed is not an integer and not None | |
| """ | |
| if random_seed is not None and not isinstance(random_seed, int): | |
| raise ValueError("random_seed must be an integer or None") | |
| # Pick a random seed. | |
| self.random_seed = (np.random.randint(MAX_SEED) if random_seed is None | |
| else random_seed) | |
| # Store the noise level. | |
| self.noise_stdev = noise_stdev | |
| # Set the random seed to ensure any random data in the problem is the same. | |
| np.random.seed(self.random_seed) | |
| # Store the parameter shapes. | |
| self.param_shapes = param_shapes | |
| if init_fn is not None: | |
| self.init_fn = init_fn | |
| else: | |
| self.init_fn = lambda _: None | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_normal(shape, seed=seed) for shape in self.param_shapes] | |
| def init_variables(self, seed=None): | |
| """Returns a list of variables with the given shape.""" | |
| with tf.variable_scope(PARAMETER_SCOPE): | |
| params = [tf.Variable(param) for param in self.init_tensors(seed)] | |
| return params | |
| def objective(self, parameters, data=None, labels=None): | |
| """Computes the objective given a list of parameters. | |
| Args: | |
| parameters: The parameters to optimize (as a list of tensors) | |
| data: An optional batch of data for calculating objectives | |
| labels: An optional batch of corresponding labels | |
| Returns: | |
| A scalar tensor representing the objective value | |
| """ | |
| raise NotImplementedError | |
| def gradients(self, objective, parameters): | |
| """Compute gradients of the objective with respect to the parameters. | |
| Args: | |
| objective: The objective op (e.g. output of self.objective()) | |
| parameters: A list of tensors (the parameters to optimize) | |
| Returns: | |
| A list of tensors representing the gradient for each parameter, | |
| returned in the same order as the given list | |
| """ | |
| grads = tf.gradients(objective, list(parameters)) | |
| noisy_grads = [] | |
| for grad in grads: | |
| if isinstance(grad, tf.IndexedSlices): | |
| noise = self.noise_stdev * tf.random_normal(tf.shape(grad.values)) | |
| new_grad = tf.IndexedSlices(grad.values + noise, grad.indices) | |
| else: | |
| new_grad = grad + self.noise_stdev * tf.random_normal(grad.get_shape()) | |
| noisy_grads.append(new_grad) | |
| return noisy_grads | |
| class Quadratic(Problem): | |
| """Optimizes a random quadratic function. | |
| The objective is: f(x) = (1/2) ||Wx - y||_2^2 | |
| where W is a random Gaussian matrix and y is a random Gaussian vector. | |
| """ | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.0): | |
| """Initializes a random quadratic problem.""" | |
| param_shapes = [(ndim, 1)] | |
| super(Quadratic, self).__init__(param_shapes, random_seed, noise_stdev) | |
| # Generate a random problem instance. | |
| self.w = np.random.randn(ndim, ndim).astype("float32") | |
| self.y = np.random.randn(ndim, 1).astype("float32") | |
| def objective(self, params, data=None, labels=None): | |
| """Quadratic objective (see base class for details).""" | |
| return tf.nn.l2_loss(tf.matmul(self.w, params[0]) - self.y) | |
| class SoftmaxClassifier(Problem): | |
| """Helper functions for supervised softmax classification problems.""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_normal(shape, seed=seed) * 1.2 / np.sqrt(shape[0]) | |
| for shape in self.param_shapes] | |
| def inference(self, params, data): | |
| """Computes logits given parameters and data. | |
| Args: | |
| params: List of parameter tensors or variables | |
| data: Batch of features with samples along the first dimension | |
| Returns: | |
| logits: Un-normalized logits with shape (num_samples, num_classes) | |
| """ | |
| raise NotImplementedError | |
| def objective(self, params, data, labels): | |
| """Computes the softmax cross entropy. | |
| Args: | |
| params: List of parameter tensors or variables | |
| data: Batch of features with samples along the first dimension | |
| labels: Vector of labels with the same number of samples as the data | |
| Returns: | |
| loss: Softmax cross entropy loss averaged over the samples in the batch | |
| Raises: | |
| ValueError: If the objective is to be computed over >2 classes, because | |
| this operation is broken in tensorflow at the moment. | |
| """ | |
| # Forward pass. | |
| logits = self.inference(params, data) | |
| # Compute the loss. | |
| l2reg = [tf.reduce_sum(param ** 2) for param in params] | |
| if int(logits.get_shape()[1]) == 2: | |
| labels = tf.cast(labels, tf.float32) | |
| losses = tf.nn.sigmoid_cross_entropy_with_logits( | |
| labels=labels, logits=logits[:, 0]) | |
| else: | |
| raise ValueError("Unable to compute softmax cross entropy for more than" | |
| " 2 classes.") | |
| return tf.reduce_mean(losses) + tf.reduce_mean(l2reg) * FLAGS.l2_reg_scale | |
| def argmax(self, logits): | |
| """Samples the most likely class label given the logits. | |
| Args: | |
| logits: Un-normalized logits with shape (num_samples, num_classes) | |
| Returns: | |
| predictions: Predicted class labels, has shape (num_samples,) | |
| """ | |
| return tf.cast(tf.argmax(tf.nn.softmax(logits), 1), tf.int32) | |
| def accuracy(self, params, data, labels): | |
| """Computes the accuracy (fraction of correct classifications). | |
| Args: | |
| params: List of parameter tensors or variables | |
| data: Batch of features with samples along the first dimension | |
| labels: Vector of labels with the same number of samples as the data | |
| Returns: | |
| accuracy: Fraction of correct classifications across the batch | |
| """ | |
| predictions = self.argmax(self.inference(params, data)) | |
| return tf.contrib.metrics.accuracy(predictions, tf.cast(labels, tf.int32)) | |
| class SoftmaxRegression(SoftmaxClassifier): | |
| """Builds a softmax regression problem.""" | |
| def __init__(self, n_features, n_classes, activation=tf.identity, | |
| random_seed=None, noise_stdev=0.0): | |
| self.activation = activation | |
| self.n_features = n_features | |
| param_shapes = [(n_features, n_classes), (n_classes,)] | |
| super(SoftmaxRegression, self).__init__(param_shapes, | |
| random_seed, | |
| noise_stdev) | |
| def inference(self, params, data): | |
| features = tf.reshape(data, (-1, self.n_features)) | |
| return tf.matmul(features, params[0]) + params[1] | |
| class SparseSoftmaxRegression(SoftmaxClassifier): | |
| """Builds a sparse input softmax regression problem.""" | |
| def __init__(self, | |
| n_features, | |
| n_classes, | |
| activation=tf.identity, | |
| random_seed=None, | |
| noise_stdev=0.0): | |
| self.activation = activation | |
| self.n_features = n_features | |
| param_shapes = [(n_classes, n_features), (n_features, n_classes), ( | |
| n_classes,)] | |
| super(SparseSoftmaxRegression, self).__init__(param_shapes, random_seed, | |
| noise_stdev) | |
| def inference(self, params, data): | |
| all_embeddings, softmax_weights, softmax_bias = params | |
| embeddings = tf.nn.embedding_lookup(all_embeddings, tf.cast(data, tf.int32)) | |
| embeddings = tf.reduce_sum(embeddings, 1) | |
| return tf.matmul(embeddings, softmax_weights) + softmax_bias | |
| class OneHotSparseSoftmaxRegression(SoftmaxClassifier): | |
| """Builds a sparse input softmax regression problem. | |
| This is identical to SparseSoftmaxRegression, but without using embedding | |
| ops. | |
| """ | |
| def __init__(self, | |
| n_features, | |
| n_classes, | |
| activation=tf.identity, | |
| random_seed=None, | |
| noise_stdev=0.0): | |
| self.activation = activation | |
| self.n_features = n_features | |
| self.n_classes = n_classes | |
| param_shapes = [(n_classes, n_features), (n_features, n_classes), ( | |
| n_classes,)] | |
| super(OneHotSparseSoftmaxRegression, self).__init__(param_shapes, | |
| random_seed, | |
| noise_stdev) | |
| def inference(self, params, data): | |
| all_embeddings, softmax_weights, softmax_bias = params | |
| num_ids = tf.shape(data)[1] | |
| one_hot_embeddings = tf.one_hot(tf.cast(data, tf.int32), self.n_classes) | |
| one_hot_embeddings = tf.reshape(one_hot_embeddings, [-1, self.n_classes]) | |
| embeddings = tf.matmul(one_hot_embeddings, all_embeddings) | |
| embeddings = tf.reshape(embeddings, [-1, num_ids, self.n_features]) | |
| embeddings = tf.reduce_sum(embeddings, 1) | |
| return tf.matmul(embeddings, softmax_weights) + softmax_bias | |
| class FullyConnected(SoftmaxClassifier): | |
| """Builds a multi-layer perceptron classifier.""" | |
| def __init__(self, n_features, n_classes, hidden_sizes=(32, 64), | |
| activation=tf.nn.sigmoid, random_seed=None, noise_stdev=0.0): | |
| """Initializes an multi-layer perceptron classification problem.""" | |
| # Store the number of features and activation function. | |
| self.n_features = n_features | |
| self.activation = activation | |
| # Define the network as a list of weight + bias shapes for each layer. | |
| param_shapes = [] | |
| for ix, sz in enumerate(hidden_sizes + (n_classes,)): | |
| # The previous layer"s size (n_features if input). | |
| prev_size = n_features if ix == 0 else hidden_sizes[ix - 1] | |
| # Weight shape for this layer. | |
| param_shapes.append((prev_size, sz)) | |
| # Bias shape for this layer. | |
| param_shapes.append((sz,)) | |
| super(FullyConnected, self).__init__(param_shapes, random_seed, noise_stdev) | |
| def inference(self, params, data): | |
| # Flatten the features into a vector. | |
| features = tf.reshape(data, (-1, self.n_features)) | |
| # Pass the data through the network. | |
| preactivations = tf.matmul(features, params[0]) + params[1] | |
| for layer in range(2, len(self.param_shapes), 2): | |
| net = self.activation(preactivations) | |
| preactivations = tf.matmul(net, params[layer]) + params[layer + 1] | |
| return preactivations | |
| def accuracy(self, params, data, labels): | |
| """Computes the accuracy (fraction of correct classifications). | |
| Args: | |
| params: List of parameter tensors or variables | |
| data: Batch of features with samples along the first dimension | |
| labels: Vector of labels with the same number of samples as the data | |
| Returns: | |
| accuracy: Fraction of correct classifications across the batch | |
| """ | |
| predictions = self.argmax(self.activation(self.inference(params, data))) | |
| return tf.contrib.metrics.accuracy(predictions, tf.cast(labels, tf.int32)) | |
| class ConvNet(SoftmaxClassifier): | |
| """Builds an N-layer convnet for image classification.""" | |
| def __init__(self, | |
| image_shape, | |
| n_classes, | |
| filter_list, | |
| activation=tf.nn.relu, | |
| random_seed=None, | |
| noise_stdev=0.0): | |
| # Number of channels, number of pixels in x- and y- dimensions. | |
| n_channels, px, py = image_shape | |
| # Store the activation. | |
| self.activation = activation | |
| param_shapes = [] | |
| input_size = n_channels | |
| for fltr in filter_list: | |
| # Add conv2d filters. | |
| param_shapes.append((fltr[0], fltr[1], input_size, fltr[2])) | |
| input_size = fltr[2] | |
| # Number of units in the final (dense) layer. | |
| self.affine_size = input_size * px * py | |
| param_shapes.append((self.affine_size, n_classes)) # affine weights | |
| param_shapes.append((n_classes,)) # affine bias | |
| super(ConvNet, self).__init__(param_shapes, random_seed, noise_stdev) | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_normal(shape, mean=0., stddev=0.01, seed=seed) | |
| for shape in self.param_shapes] | |
| def inference(self, params, data): | |
| # Unpack. | |
| w_conv_list = params[:-2] | |
| output_w, output_b = params[-2:] | |
| conv_input = data | |
| for w_conv in w_conv_list: | |
| layer = tf.nn.conv2d(conv_input, w_conv, strides=[1] * 4, padding="SAME") | |
| output = self.activation(layer) | |
| conv_input = output | |
| # Flatten. | |
| flattened = tf.reshape(conv_input, (-1, self.affine_size)) | |
| # Fully connected layer. | |
| return tf.matmul(flattened, output_w) + output_b | |
| class Bowl(Problem): | |
| """A 2D quadratic bowl.""" | |
| def __init__(self, condition_number, angle=0.0, | |
| random_seed=None, noise_stdev=0.0): | |
| assert condition_number > 0, "Condition number must be positive." | |
| # Define parameter shapes. | |
| param_shapes = [(2, 1)] | |
| super(Bowl, self).__init__(param_shapes, random_seed, noise_stdev) | |
| self.condition_number = condition_number | |
| self.angle = angle | |
| self._build_matrix(condition_number, angle) | |
| def _build_matrix(self, condition_number, angle): | |
| """Builds the Hessian matrix.""" | |
| hessian = np.array([[condition_number, 0.], [0., 1.]], dtype="float32") | |
| # Build the rotation matrix. | |
| rotation_matrix = np.array([ | |
| [np.cos(angle), -np.sin(angle)], | |
| [np.sin(angle), np.cos(angle)] | |
| ]) | |
| # The objective is 0.5 * || Ax ||_2^2 | |
| # where the data matrix (A) is: sqrt(Hessian).dot(rotation_matrix). | |
| self.matrix = np.sqrt(hessian).dot(rotation_matrix) | |
| def objective(self, params, data=None, labels=None): | |
| mtx = tf.constant(self.matrix, dtype=tf.float32) | |
| return tf.nn.l2_loss(tf.matmul(mtx, params[0])) | |
| def surface(self, xlim=5, ylim=5, n=50): | |
| xm, ym = _mesh(xlim, ylim, n) | |
| pts = np.vstack([xm.ravel(), ym.ravel()]) | |
| zm = 0.5 * np.linalg.norm(self.matrix.dot(pts), axis=0) ** 2 | |
| return xm, ym, zm.reshape(n, n) | |
| class Problem2D(Problem): | |
| def __init__(self, random_seed=None, noise_stdev=0.0): | |
| param_shapes = [(2,)] | |
| super(Problem2D, self).__init__(param_shapes, random_seed, noise_stdev) | |
| def surface(self, n=50, xlim=5, ylim=5): | |
| """Computes the objective surface over a 2d mesh.""" | |
| # Create a mesh over the given coordinate ranges. | |
| xm, ym = _mesh(xlim, ylim, n) | |
| with tf.Graph().as_default(), tf.Session() as sess: | |
| # Ops to compute the objective at every (x, y) point. | |
| x = tf.placeholder(tf.float32, shape=xm.shape) | |
| y = tf.placeholder(tf.float32, shape=ym.shape) | |
| obj = self.objective([[x, y]]) | |
| # Run the computation. | |
| zm = sess.run(obj, feed_dict={x: xm, y: ym}) | |
| return xm, ym, zm | |
| class Rosenbrock(Problem2D): | |
| """See https://en.wikipedia.org/wiki/Rosenbrock_function. | |
| This function has a single global minima at [1, 1] | |
| The objective value at this point is zero. | |
| """ | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=-5., maxval=10., seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = (1 - x)**2 + 100 * (y - x**2)**2 | |
| return tf.squeeze(obj) | |
| def make_rosenbrock_loss_and_init(device=None): | |
| """A variable-backed version of Rosenbrock problem. | |
| See the Rosenbrock class for details. | |
| Args: | |
| device: Where to place the ops of this problem. | |
| Returns: | |
| A tuple of two callables, first of which creates the loss and the second | |
| creates the parameter initializer function. | |
| """ | |
| def make_rosenbrock_loss(): | |
| with tf.name_scope("optimizee"): | |
| with tf.device(device): | |
| x = tf.get_variable("x", [1]) | |
| y = tf.get_variable("y", [1]) | |
| c = tf.get_variable( | |
| "c", [1], | |
| initializer=tf.constant_initializer(100.0), | |
| trainable=False) | |
| obj = (1 - x)**2 + c * (y - x**2)**2 | |
| return tf.squeeze(obj) | |
| def make_init_fn(parameters): | |
| with tf.device(device): | |
| init_op = tf.variables_initializer(parameters) | |
| def init_fn(sess): | |
| tf.logging.info("Initializing model parameters.") | |
| sess.run(init_op) | |
| return init_fn | |
| return make_rosenbrock_loss, make_init_fn | |
| class Saddle(Problem2D): | |
| """Loss surface around a saddle point.""" | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = x ** 2 - y ** 2 | |
| return tf.squeeze(obj) | |
| class LogSumExp(Problem2D): | |
| """2D function defined by the log of the sum of exponentials.""" | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = tf.log(tf.exp(x + 3. * y - 0.1) + | |
| tf.exp(x - 3. * y - 0.1) + | |
| tf.exp(-x - 0.1) + 1.0) | |
| return tf.squeeze(obj) | |
| class Ackley(Problem2D): | |
| """Ackley's function (contains many local minima).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=-32.768, maxval=32.768, seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = (-20 * tf.exp(-0.2 * tf.sqrt(0.5 * (x ** 2 + y ** 2))) - | |
| tf.exp(0.5 * (tf.cos(2 * np.pi * x) + tf.cos(2 * np.pi * y))) + | |
| tf.exp(1.0) + 20.) | |
| return tf.squeeze(obj) | |
| class Beale(Problem2D): | |
| """Beale function (a multimodal function with sharp peaks).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=-4.5, maxval=4.5, seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = ((1.5 - x + x * y) ** 2 + | |
| (2.25 - x + x * y ** 2) ** 2 + | |
| (2.625 - x + x * y ** 3) ** 2) | |
| return tf.squeeze(obj) | |
| class Booth(Problem2D): | |
| """Booth's function (has a long valley along one dimension).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=-10., maxval=10., seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = (x + 2 * y - 7) ** 2 + (2 * x + y - 5) ** 2 | |
| return tf.squeeze(obj) | |
| class StyblinskiTang(Problem2D): | |
| """Styblinski-Tang function (a bumpy function in two dimensions).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=-5., maxval=5., seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| params = tf.split(params[0], 2, axis=0) | |
| obj = 0.5 * tf.reduce_sum([x ** 4 - 16 * x ** 2 + 5 * x | |
| for x in params], 0) + 80. | |
| return tf.squeeze(obj) | |
| class Matyas(Problem2D): | |
| """Matyas function (a function with a single global minimum in a valley).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=-10, maxval=10, seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| obj = 0.26 * (x ** 2 + y ** 2) - 0.48 * x * y | |
| return tf.squeeze(obj) | |
| class Branin(Problem2D): | |
| """Branin function (a function with three global minima).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| x1 = tf.random_uniform((1,), minval=-5., maxval=10., | |
| seed=seed) | |
| x2 = tf.random_uniform((1,), minval=0., maxval=15., | |
| seed=seed) | |
| return [tf.concat([x1, x2], 0)] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| # Define some constants. | |
| a = 1. | |
| b = 5.1 / (4. * np.pi ** 2) | |
| c = 5 / np.pi | |
| r = 6. | |
| s = 10. | |
| t = 1 / (8. * np.pi) | |
| # Evaluate the function. | |
| obj = a * (y - b * x ** 2 + c * x - r) ** 2 + s * (1 - t) * tf.cos(x) + s | |
| return tf.squeeze(obj) | |
| class Michalewicz(Problem2D): | |
| """Michalewicz function (has steep ridges and valleys).""" | |
| def init_tensors(self, seed=None): | |
| """Returns a list of tensors with the given shape.""" | |
| return [tf.random_uniform(shape, minval=0., maxval=np.pi, seed=seed) | |
| for shape in self.param_shapes] | |
| def objective(self, params, data=None, labels=None): | |
| x, y = tf.split(params[0], 2, axis=0) | |
| m = 5 # Defines how steep the ridges are (larger m => steeper ridges). | |
| obj = 2. - (tf.sin(x) * tf.sin(x ** 2 / np.pi) ** (2 * m) + | |
| tf.sin(y) * tf.sin(2 * y ** 2 / np.pi) ** (2 * m)) | |
| return tf.squeeze(obj) | |
| class Rescale(Problem): | |
| """Takes an existing problem, and rescales all the parameters.""" | |
| def __init__(self, problem_spec, scale=10., noise_stdev=0.0): | |
| self.problem = problem_spec.build() | |
| self.param_shapes = self.problem.param_shapes | |
| self.scale = scale | |
| super(Rescale, self).__init__(self.param_shapes, random_seed=None, | |
| noise_stdev=noise_stdev) | |
| def init_tensors(self, seed=None): | |
| params_raw = self.problem.init_tensors(seed=seed) | |
| params = [t * self.scale for t in params_raw] | |
| return params | |
| def objective(self, params, data=None, labels=None): | |
| params_raw = [t/self.scale for t in params] | |
| problem_obj = self.problem.objective(params_raw, data, labels) | |
| return problem_obj | |
| class SumTask(Problem): | |
| """Takes a list of problems and modifies the objective to be their sum.""" | |
| def __init__(self, problem_specs, noise_stdev=0.0): | |
| self.problems = [ps.build() for ps in problem_specs] | |
| self.param_shapes = [] | |
| for prob in self.problems: | |
| self.param_shapes += prob.param_shapes | |
| super(SumTask, self).__init__(self.param_shapes, random_seed=None, | |
| noise_stdev=noise_stdev) | |
| def init_tensors(self, seed=None): | |
| tensors = [] | |
| for prob in self.problems: | |
| tensors += prob.init_tensors(seed=seed) | |
| return tensors | |
| def objective(self, params, data=None, labels=None): | |
| obj = 0. | |
| index = 0 | |
| for prob in self.problems: | |
| num_params = len(prob.param_shapes) | |
| obj += prob.objective(params[index:index + num_params]) | |
| index += num_params | |
| return obj | |
| class IsotropicQuadratic(Problem): | |
| """An isotropic quadratic problem.""" | |
| def objective(self, params, data=None, labels=None): | |
| return sum([tf.reduce_sum(param ** 2) for param in params]) | |
| class Norm(Problem): | |
| """Takes an existing problem and modifies the objective to be its N-norm.""" | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.0, norm_power=2.): | |
| param_shapes = [(ndim, 1)] | |
| super(Norm, self).__init__(param_shapes, random_seed, noise_stdev) | |
| # Generate a random problem instance. | |
| self.w = np.random.randn(ndim, ndim).astype("float32") | |
| self.y = np.random.randn(ndim, 1).astype("float32") | |
| self.norm_power = norm_power | |
| def objective(self, params, data=None, labels=None): | |
| diff = tf.matmul(self.w, params[0]) - self.y | |
| exp = 1. / self.norm_power | |
| loss = tf.reduce_sum((tf.abs(diff) + EPSILON) ** self.norm_power) ** exp | |
| return loss | |
| class LogObjective(Problem): | |
| """Takes an existing problem and modifies the objective to be its log.""" | |
| def __init__(self, problem_spec): | |
| self.problem = problem_spec.build() | |
| self.param_shapes = self.problem.param_shapes | |
| super(LogObjective, self).__init__(self.param_shapes, | |
| random_seed=None, | |
| noise_stdev=0.0) | |
| def objective(self, params, data=None, labels=None): | |
| problem_obj = self.problem.objective(params, data, labels) | |
| return tf.log(problem_obj + EPSILON) - tf.log(EPSILON) | |
| class SparseProblem(Problem): | |
| """Takes a problem and sets gradients to 0 with the given probability.""" | |
| def __init__(self, | |
| problem_spec, | |
| zero_probability=0.99, | |
| random_seed=None, | |
| noise_stdev=0.0): | |
| self.problem = problem_spec.build() | |
| self.param_shapes = self.problem.param_shapes | |
| self.zero_prob = zero_probability | |
| super(SparseProblem, self).__init__(self.param_shapes, | |
| random_seed=random_seed, | |
| noise_stdev=noise_stdev) | |
| def objective(self, parameters, data=None, labels=None): | |
| return self.problem.objective(parameters, data, labels) | |
| def gradients(self, objective, parameters): | |
| grads = tf.gradients(objective, list(parameters)) | |
| new_grads = [] | |
| for grad in grads: | |
| mask = tf.greater(self.zero_prob, tf.random_uniform(grad.get_shape())) | |
| zero_grad = tf.zeros_like(grad, dtype=tf.float32) | |
| noisy_grad = grad + self.noise_stdev * tf.random_normal(grad.get_shape()) | |
| new_grads.append(tf.where(mask, zero_grad, noisy_grad)) | |
| return new_grads | |
| class DependencyChain(Problem): | |
| """A problem in which parameters must be optimized in order. | |
| A sequence of parameters which all need to be brought to 0, but where each | |
| parameter in the sequence can't be brought to 0 until the preceding one | |
| has been. This should take a long time to optimize, with steady | |
| (or accelerating) progress throughout the entire process. | |
| """ | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.): | |
| param_shapes = [(ndim + 1,)] | |
| self.ndim = ndim | |
| super(DependencyChain, self).__init__( | |
| param_shapes, random_seed, noise_stdev) | |
| def objective(self, params, data=None, labels=None): | |
| terms = params[0][0]**2 + params[0][1:]**2 / (params[0][:-1]**2 + EPSILON) | |
| return tf.reduce_sum(terms) | |
| class MinMaxWell(Problem): | |
| """Problem with global min when both the min and max (absolute) params are 1. | |
| The gradient for all but two parameters (the min and max) is zero. This | |
| should therefore encourage the optimizer to behave sensible even when | |
| parameters have zero gradients, as is common eg for some deep neural nets. | |
| """ | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.): | |
| param_shapes = [(ndim,)] | |
| self.ndim = ndim | |
| super(MinMaxWell, self).__init__(param_shapes, random_seed, noise_stdev) | |
| def objective(self, params, data=None, labels=None): | |
| params_sqr = params[0]**2 | |
| min_sqr = tf.reduce_min(params_sqr) | |
| max_sqr = tf.reduce_max(params_sqr) | |
| epsilon = 1e-12 | |
| return max_sqr + 1./min_sqr - 2. + epsilon | |
| class OutwardSnake(Problem): | |
| """A winding path out to infinity. | |
| Ideal step length stays constant along the entire path. | |
| """ | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.): | |
| param_shapes = [(ndim,)] | |
| self.ndim = ndim | |
| super(OutwardSnake, self).__init__(param_shapes, random_seed, noise_stdev) | |
| def objective(self, params, data, labels=None): | |
| radius = tf.sqrt(tf.reduce_sum(params[0]**2)) | |
| rad_loss = tf.reduce_sum(1. / (radius + 1e-6) * data[:, 0]) | |
| sin_dist = params[0][1:] - tf.cos(params[0][:-1]) * np.pi | |
| sin_loss = tf.reduce_sum((sin_dist * data[:, 1:])**2) | |
| return rad_loss + sin_loss | |
| class ProjectionQuadratic(Problem): | |
| """Dataset consists of different directions to probe. Global min is at 0.""" | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.): | |
| param_shapes = [(1, ndim)] | |
| super(ProjectionQuadratic, self).__init__( | |
| param_shapes, random_seed, noise_stdev) | |
| def objective(self, params, data, labels=None): | |
| return tf.reduce_sum((params[0] * data)**2) | |
| class SumOfQuadratics(Problem): | |
| def __init__(self, ndim, random_seed=None, noise_stdev=0.): | |
| param_shapes = [(1, ndim)] | |
| super(SumOfQuadratics, self).__init__( | |
| param_shapes, random_seed, noise_stdev) | |
| def objective(self, params, data, labels=None): | |
| epsilon = 1e-12 | |
| # Assume dataset is designed so that the global minimum is at params=0. | |
| # Subtract loss at params=0, so that global minimum has objective value | |
| # epsilon (added to avoid floating point issues). | |
| return (tf.reduce_sum((params[0] - data)**2) - tf.reduce_sum(data**2) + | |
| epsilon) | |
| class MatMulAlgorithm(Problem): | |
| """A 6-th order polynomial optimization problem. | |
| This problem is parametrized by n and k. A solution to this problem with | |
| objective value exactly zero defines a matrix multiplication algorithm of | |
| n x n matrices using k multiplications between matrices. When applied | |
| recursively, such an algorithm has complexity O(n^(log_n(k))). | |
| Given n, it is not known in general which values of k in [n^2, n^3] have a | |
| solution. There is always a solution with k = n^3 (this is the naive | |
| algorithm). | |
| In the special case n = 2, it is known that there are solutions for k = {7, 8} | |
| but not for k <= 6. For n = 3, it is known that there are exact solutions for | |
| 23 <= k <= 27, and there are asymptotic solutions for k = {21, 22}, but the | |
| other cases are unknown. | |
| For a given n and k, if one solution exists then infinitely many solutions | |
| exist due to permutation and scaling symmetries in the parameters. | |
| This is a very hard problem for some values of n and k (e.g. n = 3, k = 21), | |
| but very easy for other values (e.g. n = 2, k = 7). | |
| For a given n and k, the specific formulation of this problem is as follows. | |
| Let theta_a, theta_b, theta_c be parameter matrices with respective dimensions | |
| [n**2, k], [n**2, k], [k, n**2]. Then for any matrices a, b with shape [n, n], | |
| we can form the matrix c with shape [n, n] via the operation: | |
| ((vec(a) * theta_a) .* (vec(b) * theta_b)) * theta_c = vec(c), (#) | |
| where vec(x) is the operator that flattens a matrix with shape [n, n] into a | |
| row vector with shape [1, n**2], * denotes matrix multiplication and .* | |
| denotes elementwise multiplication. | |
| This operation, parameterized by theta_a, theta_b, theta_c, is a matrix | |
| multiplication algorithm iff c = a*b for all [n, n] matrices a and b. But | |
| actually it suffices to verify all combinations of one-hot matrices a and b, | |
| of which there are n**4 such combinations. This gives a batch of n**4 matrix | |
| triplets (a, b, c) such that equation (#) must hold for each triplet. We solve | |
| for theta_a, theta_b, theta_c by minimizing the sum of squares of errors | |
| across this batch. | |
| Finally, theta_c can be computed from theta_a and theta_b. Therefore it | |
| suffices to learn theta_a and theta_b, from which theta_c and therefore the | |
| objective value can be computed. | |
| """ | |
| def __init__(self, n, k): | |
| assert isinstance(n, int), "n must be an integer" | |
| assert isinstance(k, int), "k must be an integer" | |
| assert n >= 2, "Must have n >= 2" | |
| assert k >= n**2 and k <= n**3, "Must have n**2 <= k <= n**3" | |
| param_shapes = [(n**2, k), (n**2, k)] # theta_a, theta_b | |
| super(MatMulAlgorithm, self).__init__( | |
| param_shapes, random_seed=None, noise_stdev=0.0) | |
| self.n = n | |
| self.k = k | |
| # Build a batch of all combinations of one-hot matrices a, b, and their | |
| # respective products c. Correctness on this batch is a necessary and | |
| # sufficient condition for the algorithm to be valid. The number of matrices | |
| # in {a, b, c}_3d is n**4 and each matrix is n x n. | |
| onehots = np.identity(n**2).reshape(n**2, n, n) | |
| a_3d = np.repeat(onehots, n**2, axis=0) | |
| b_3d = np.tile(onehots, [n**2, 1, 1]) | |
| c_3d = np.matmul(a_3d, b_3d) | |
| # Convert the batch to 2D Tensors. | |
| self.a = tf.constant(a_3d.reshape(n**4, n**2), tf.float32, name="a") | |
| self.b = tf.constant(b_3d.reshape(n**4, n**2), tf.float32, name="b") | |
| self.c = tf.constant(c_3d.reshape(n**4, n**2), tf.float32, name="c") | |
| def init_tensors(self, seed=None): | |
| # Initialize params such that the columns of theta_a and theta_b have L2 | |
| # norm 1. | |
| def _param_initializer(shape, seed=None): | |
| x = tf.random_normal(shape, dtype=tf.float32, seed=seed) | |
| return tf.transpose(tf.nn.l2_normalize(tf.transpose(x), 1)) | |
| return [_param_initializer(shape, seed) for shape in self.param_shapes] | |
| def objective(self, parameters, data=None, labels=None): | |
| theta_a = parameters[0] | |
| theta_b = parameters[1] | |
| # Compute theta_c from theta_a and theta_b. | |
| p = tf.matmul(self.a, theta_a) * tf.matmul(self.b, theta_b) | |
| p_trans = tf.transpose(p, name="p_trans") | |
| p_inv = tf.matmul( | |
| tf.matrix_inverse(tf.matmul(p_trans, p)), p_trans, name="p_inv") | |
| theta_c = tf.matmul(p_inv, self.c, name="theta_c") | |
| # Compute the "predicted" value of c. | |
| c_hat = tf.matmul(p, theta_c, name="c_hat") | |
| # Compute the loss (sum of squared errors). | |
| loss = tf.reduce_sum((c_hat - self.c)**2, name="loss") | |
| return loss | |
| def matmul_problem_sequence(n, k_min, k_max): | |
| """Helper to generate a sequence of matrix multiplication problems.""" | |
| return [(_Spec(MatMulAlgorithm, (n, k), {}), None, None) | |
| for k in range(k_min, k_max + 1)] | |
| def init_fixed_variables(arrays): | |
| with tf.variable_scope(PARAMETER_SCOPE): | |
| params = [tf.Variable(arr.astype("float32")) for arr in arrays] | |
| return params | |
| def _mesh(xlim, ylim, n): | |
| """Creates a 2D meshgrid covering the given ranges. | |
| Args: | |
| xlim: int that defines the desired x-range (-xlim, xlim) | |
| ylim: int that defines the desired y-range (-ylim, ylim) | |
| n: number of points in each dimension of the mesh | |
| Returns: | |
| xm: 2D array of x-values in the mesh | |
| ym: 2D array of y-values in the mesh | |
| """ | |
| return np.meshgrid(np.linspace(-xlim, xlim, n), | |
| np.linspace(-ylim, ylim, n)) | |