Spaces:
Running
Running
| """ | |
| Title: Sequence to sequence learning for performing number addition | |
| Author: [Smerity](https://twitter.com/Smerity) and others | |
| Date created: 2015/08/17 | |
| Last modified: 2024/02/13 | |
| Description: A model that learns to add strings of numbers, e.g. "535+61" -> "596". | |
| Accelerator: GPU | |
| """ | |
| """ | |
| ## Introduction | |
| In this example, we train a model to learn to add two numbers, provided as strings. | |
| **Example:** | |
| - Input: "535+61" | |
| - Output: "596" | |
| Input may optionally be reversed, which was shown to increase performance in many tasks | |
| in: [Learning to Execute](http://arxiv.org/abs/1410.4615) and | |
| [Sequence to Sequence Learning with Neural Networks](http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf). | |
| Theoretically, sequence order inversion introduces shorter term dependencies between | |
| source and target for this problem. | |
| **Results:** | |
| For two digits (reversed): | |
| + One layer LSTM (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs | |
| Three digits (reversed): | |
| + One layer LSTM (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs | |
| Four digits (reversed): | |
| + One layer LSTM (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs | |
| Five digits (reversed): | |
| + One layer LSTM (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs | |
| """ | |
| """ | |
| ## Setup | |
| """ | |
| import keras | |
| from keras import layers | |
| import numpy as np | |
# Parameters for the model and dataset.
TRAINING_SIZE = 50000  # Number of unique "a+b" question/answer pairs to generate.
DIGITS = 3  # Maximum number of digits in each operand.
REVERSE = True  # If True, feed the input string reversed (see the introduction above).
# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS
| """ | |
| ## Generate the data | |
| """ | |
class CharacterTable:
    """Bidirectional mapping between characters and one-hot vectors.

    Given a set of characters:
    + Encode them to a one-hot integer representation
    + Decode the one-hot or integer representation to their character output
    + Decode a vector of probabilities to their character output
    """

    def __init__(self, chars):
        """Initialize character table.

        # Arguments
            chars: Characters that can appear in the input.
        """
        # Sort for a deterministic character ordering regardless of input order.
        self.chars = sorted(set(chars))
        self.char_indices = {ch: idx for idx, ch in enumerate(self.chars)}
        self.indices_char = {idx: ch for idx, ch in enumerate(self.chars)}

    def encode(self, C, num_rows):
        """One-hot encode given string C.

        # Arguments
            C: string, to be encoded.
            num_rows: Number of rows in the returned one-hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        one_hot = np.zeros((num_rows, len(self.chars)))
        for row, ch in enumerate(C):
            one_hot[row, self.char_indices[ch]] = 1
        return one_hot

    def decode(self, x, calc_argmax=True):
        """Decode the given vector or 2D array to their character output.

        # Arguments
            x: A vector or a 2D array of probabilities or one-hot representations;
                or a vector of character indices (used with `calc_argmax=False`).
            calc_argmax: Whether to find the character index with maximum
                probability, defaults to `True`.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return "".join(self.indices_char[idx] for idx in indices)
# All the numbers, plus sign and space for padding.
chars = "0123456789+ "
ctable = CharacterTable(chars)

questions = []
expected = []
seen = set()
print("Generating data...")


def _random_operand():
    """Return a random non-negative int with 1 to DIGITS digits."""
    num_digits = np.random.randint(1, DIGITS + 1)
    return int(
        "".join(np.random.choice(list("0123456789")) for _ in range(num_digits))
    )


while len(questions) < TRAINING_SIZE:
    a, b = _random_operand(), _random_operand()
    # Skip any addition questions we've already seen.
    # Also skip any such that X+Y == Y+X (hence the sorting).
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    # Pad the data with spaces such that it is always MAXLEN.
    q = "{}+{}".format(a, b)
    query = q + " " * (MAXLEN - len(q))
    ans = str(a + b)
    # Answers can be of maximum size DIGITS + 1.
    ans += " " * (DIGITS + 1 - len(ans))
    if REVERSE:
        # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the
        # space used for padding.)
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print("Total questions:", len(questions))
| """ | |
| ## Vectorize the data | |
| """ | |
| print("Vectorization...") | |
| x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=bool) | |
| y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=bool) | |
| for i, sentence in enumerate(questions): | |
| x[i] = ctable.encode(sentence, MAXLEN) | |
| for i, sentence in enumerate(expected): | |
| y[i] = ctable.encode(sentence, DIGITS + 1) | |
| # Shuffle (x, y) in unison as the later parts of x will almost all be larger | |
| # digits. | |
| indices = np.arange(len(y)) | |
| np.random.shuffle(indices) | |
| x = x[indices] | |
| y = y[indices] | |
| # Explicitly set apart 10% for validation data that we never train over. | |
| split_at = len(x) - len(x) // 10 | |
| (x_train, x_val) = x[:split_at], x[split_at:] | |
| (y_train, y_val) = y[:split_at], y[split_at:] | |
| print("Training Data:") | |
| print(x_train.shape) | |
| print(y_train.shape) | |
| print("Validation Data:") | |
| print(x_val.shape) | |
| print(y_val.shape) | |
| """ | |
| ## Build the model | |
| """ | |
| print("Build model...") | |
| num_layers = 1 # Try to add more LSTM layers! | |
| model = keras.Sequential() | |
| # "Encode" the input sequence using a LSTM, producing an output of size 128. | |
| # Note: In a situation where your input sequences have a variable length, | |
| # use input_shape=(None, num_feature). | |
| model.add(layers.Input((MAXLEN, len(chars)))) | |
| model.add(layers.LSTM(128)) | |
| # As the decoder RNN's input, repeatedly provide with the last output of | |
| # RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum | |
| # length of output, e.g., when DIGITS=3, max output is 999+999=1998. | |
| model.add(layers.RepeatVector(DIGITS + 1)) | |
| # The decoder RNN could be multiple layers stacked or a single layer. | |
| for _ in range(num_layers): | |
| # By setting return_sequences to True, return not only the last output but | |
| # all the outputs so far in the form of (num_samples, timesteps, | |
| # output_dim). This is necessary as TimeDistributed in the below expects | |
| # the first dimension to be the timesteps. | |
| model.add(layers.LSTM(128, return_sequences=True)) | |
| # Apply a dense layer to the every temporal slice of an input. For each of step | |
| # of the output sequence, decide which character should be chosen. | |
| model.add(layers.Dense(len(chars), activation="softmax")) | |
| model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) | |
| model.summary() | |
| """ | |
| ## Train the model | |
| """ | |
| # Training parameters. | |
| epochs = 30 | |
| batch_size = 32 | |
| # Formatting characters for results display. | |
| green_color = "\033[92m" | |
| red_color = "\033[91m" | |
| end_char = "\033[0m" | |
| # Train the model each generation and show predictions against the validation | |
| # dataset. | |
| for epoch in range(1, epochs): | |
| print() | |
| print("Iteration", epoch) | |
| model.fit( | |
| x_train, | |
| y_train, | |
| batch_size=batch_size, | |
| epochs=1, | |
| validation_data=(x_val, y_val), | |
| ) | |
| # Select 10 samples from the validation set at random so we can visualize | |
| # errors. | |
| for i in range(10): | |
| ind = np.random.randint(0, len(x_val)) | |
| rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])] | |
| preds = np.argmax(model.predict(rowx, verbose=0), axis=-1) | |
| q = ctable.decode(rowx[0]) | |
| correct = ctable.decode(rowy[0]) | |
| guess = ctable.decode(preds[0], calc_argmax=False) | |
| print("Q", q[::-1] if REVERSE else q, end=" ") | |
| print("T", correct, end=" ") | |
| if correct == guess: | |
| print(f"{green_color}☑ {guess}{end_char}") | |
| else: | |
| print(f"{red_color}☒ {guess}{end_char}") | |
| """ | |
| You'll get to 99+% validation accuracy after ~30 epochs. | |
| """ | |