Spaces:
Runtime error
Runtime error
| # Copyright 2017 Google, Inc. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ============================================================================== | |
| """Functions to generate or load datasets for supervised learning.""" | |
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| from collections import namedtuple | |
| import numpy as np | |
| from sklearn.datasets import make_classification | |
# Exclusive upper bound used when this module draws a fresh seed with
# np.random.randint before passing it to np.random.seed (2**32 - 1).
MAX_SEED = 2**32 - 1
class Dataset(namedtuple("Dataset", "data labels")):
    """Immutable (data, labels) pair describing a supervised learning dataset.

    Args:
      data: an array of type float32 with N samples, each of which is the set
        of features for that sample. (Shape (N, D_i), where N is the number of
        samples and D_i is the number of features for that sample.)
      labels: an array of type int32 or int64 with N elements, indicating the
        class label for the corresponding set of features in data.
    """

    # The tuple fields hold all state, so no extra instance attributes
    # are declared.
    __slots__ = ()

    def size(self):
        """Returns the number of samples, i.e. len(data)."""
        return len(self.data)

    def batch_indices(self, num_batches, batch_size):
        """Builds index lists for a sequence of shuffled minibatches.

        The sample indices are shuffled once up front with NumPy's global
        random state and consumed in consecutive windows of `batch_size`.
        When the next window would run past the end of the dataset, the
        indices are reshuffled and the window restarts at position 0, so any
        leftover samples of that epoch are skipped. If `batch_size` exceeds
        the dataset size, every batch is a freshly shuffled copy of the whole
        dataset (and therefore shorter than `batch_size`).

        Args:
          num_batches: the number of batches to generate
          batch_size: the size of each batch

        Returns:
          A list with `num_batches` entries, each a list of integer indices
          into the dataset.

        Raises:
          ValueError: if the data and labels have different lengths
        """
        n_samples = len(self.data)
        if n_samples != len(self.labels):
            raise ValueError(
                "Labels and data must have the same number of samples.")

        order = np.arange(n_samples)
        np.random.shuffle(order)

        batches = []
        cursor = 0
        for _ in range(num_batches):
            stop = cursor + batch_size
            if stop > n_samples:
                # Epoch exhausted: reshuffle and restart from the beginning.
                np.random.shuffle(order)
                cursor, stop = 0, batch_size
            batches.append(order[cursor:stop].tolist())
            cursor = stop
        return batches
def noisy_parity_class(n_samples,
                       n_classes=2,
                       n_context_ids=5,
                       noise_prob=0.25,
                       random_seed=None):
    """Generates a dataset whose label is a noisy parity of its features.

    Every sample consists of `n_context_ids` integers drawn uniformly from
    [0, n_classes). Its label is the sum of those integers, plus a 0/1 noise
    term that equals 1 with probability `noise_prob`, taken modulo
    `n_classes`.

    Note that this reseeds NumPy's global random state with `random_seed`.

    Args:
      n_samples: number of samples (data points)
      n_classes: number of class labels (default: 2)
      n_context_ids: how many classes to take the parity of (default: 5).
      noise_prob: how often to corrupt the label (default: 0.25)
      random_seed: seed used for drawing the random data (default: None)

    Returns:
      dataset: A Dataset namedtuple containing the generated data (float32)
        and labels (int32)
    """
    np.random.seed(random_seed)

    # Draw order matters for reproducibility: contexts first, then noise.
    contexts = np.random.randint(
        0, n_classes, size=(n_samples, n_context_ids))
    flips = np.random.binomial(1, noise_prob, size=(n_samples,))

    parity = (contexts.sum(axis=1) + flips) % n_classes
    return Dataset(contexts.astype("float32"), parity.astype("int32"))
def random(n_features, n_samples, n_classes=2, sep=1.0, random_seed=None):
    """Generates a random classification dataset via make_classification.

    All `n_features` features are requested as informative, with no redundant
    features.

    Args:
      n_features: number of features per sample
      n_samples: number of samples (data points)
      n_classes: number of class labels (default: 2)
      sep: class separation passed on as `class_sep`; a higher value
        corresponds to an easier classification problem (default: 1.0)
      random_seed: seed used for drawing the random data (default: None)

    Returns:
      dataset: A Dataset namedtuple containing the generated data (float32)
        and labels (int32)
    """
    generator_options = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        n_redundant=0,
        n_classes=n_classes,
        class_sep=sep,
        random_state=random_seed,
    )
    features, targets = make_classification(**generator_options)
    return Dataset(features.astype("float32"), targets.astype("int32"))
def random_binary(n_features, n_samples, random_seed=None):
    """Generates a dataset of random 0/1 features with all-zero labels.

    When no seed is given, one is drawn from NumPy's global random state and
    then used to reseed it.

    Args:
      n_features: number of features per sample
      n_samples: number of samples (data points)
      random_seed: seed used for drawing the random data (default: None)

    Returns:
      dataset: A Dataset namedtuple containing the generated data, of shape
        (n_samples, n_features), and zero labels of shape (n_samples, 1)
    """
    if random_seed is None:
        random_seed = np.random.randint(MAX_SEED)
    np.random.seed(random_seed)

    bits = np.random.randint(2, size=(n_samples, n_features))
    labels = np.zeros((n_samples, 1), dtype="int32")
    return Dataset(bits.astype("float32"), labels)
def random_symmetric(n_features, n_samples, random_seed=None):
    """Returns a randomly generated dataset of values and their negatives.

    The first half of the data is drawn from a standard normal distribution
    and the second half is its element-wise negation. Because samples come in
    (value, negative) pairs, an odd `n_samples` yields `n_samples - 1`
    samples. All labels are zero.

    When no seed is given, one is drawn from NumPy's global random state and
    then used to reseed it.

    Args:
      n_features: number of features per sample
      n_samples: number of samples (data points)
      random_seed: seed used for drawing the random data (default: None)

    Returns:
      dataset: A Dataset namedtuple containing the generated data, of shape
        (2 * (n_samples // 2), n_features), and zero labels with one row per
        data row
    """
    if random_seed is None:
        random_seed = np.random.randint(MAX_SEED)
    np.random.seed(random_seed)

    half = np.random.normal(size=(n_samples // 2, n_features))
    x = np.concatenate((half, -half), axis=0)

    # Size the labels from the data actually generated: for odd n_samples the
    # data has one row fewer than requested, and a length mismatch would make
    # Dataset.batch_indices raise ValueError.
    y = np.zeros((x.shape[0], 1))
    return Dataset(x.astype("float32"), y.astype("int32"))
def random_mlp(n_features, n_samples, random_seed=None, n_layers=6, width=20):
    """Returns a dataset labelled by an MLP with random weights.

    Normally distributed inputs are passed through `n_layers` random linear
    layers, each followed by a ReLU (clip at zero). A sample's label is 1 if
    the first unit of the final layer is strictly positive and 0 otherwise.

    When no seed is given, one is drawn from NumPy's global random state and
    then used to reseed it.

    Args:
      n_features: number of features per sample
      n_samples: number of samples (data points)
      random_seed: seed used for drawing the random data (default: None)
      n_layers: number of layers in random MLP
      width: width of the layers in random MLP

    Returns:
      dataset: A Dataset namedtuple containing the generated data (float32)
        and labels (int32)
    """
    if random_seed is None:
        random_seed = np.random.randint(MAX_SEED)
    np.random.seed(random_seed)

    x = np.random.normal(size=(n_samples, n_features))

    activations = x
    n_in = n_features
    # The same scale is applied to every layer's weights.
    scale_factor = np.sqrt(2.) / np.sqrt(n_features)
    for _ in range(n_layers):
        weights = np.random.normal(size=(n_in, width)) * scale_factor
        activations = np.dot(activations, weights).clip(min=0)
        n_in = width

    # Threshold with a comparison rather than in-place assignment: with
    # n_layers == 0 `activations` is `x` itself, and writing through a slice
    # of it would corrupt the returned features. For n_layers >= 1 the result
    # is the same 0/1 labels as before, since activations are non-negative.
    labels = (activations[:, 0] > 0).astype("int32")
    return Dataset(x.astype("float32"), labels)
# Placeholder dataset with zero samples: empty float32 data, empty int32
# labels.
EMPTY_DATASET = Dataset(
    data=np.array([], dtype="float32"),
    labels=np.array([], dtype="int32"),
)