Spaces:
Running
Running
| """ | |
| Title: Sequence to sequence learning for performing number addition | |
| Author: [Smerity](https://twitter.com/Smerity) and others | |
| Date created: 2015/08/17 | |
| Last modified: 2024/02/13 | |
| Description: A model that learns to add strings of numbers, e.g. "535+61" -> "596". | |
| Accelerator: GPU | |
| """ | |
| """ | |
| ## Introduction | |
| In this example, we train a model to learn to add two numbers, provided as strings. | |
| **Example:** | |
| - Input: "535+61" | |
| - Output: "596" | |
| Input may optionally be reversed, which was shown to increase performance in many tasks | |
| in: [Learning to Execute](http://arxiv.org/abs/1410.4615) and | |
| [Sequence to Sequence Learning with Neural Networks](http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf). | |
| Theoretically, sequence order inversion introduces shorter term dependencies between | |
| source and target for this problem. | |
| **Results:** | |
| For two digits (reversed): | |
| + One layer LSTM (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs | |
| Three digits (reversed): | |
| + One layer LSTM (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs | |
| Four digits (reversed): | |
| + One layer LSTM (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs | |
| Five digits (reversed): | |
| + One layer LSTM (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs | |
| """ | |
| """ | |
| ## Setup | |
| """ | |
| import keras | |
| from keras import layers | |
| import numpy as np | |
# Parameters for the model and dataset.
TRAINING_SIZE = 50000  # Number of unique "a+b" question/answer pairs to generate.
DIGITS = 3  # Maximum number of digits in each operand.
REVERSE = True  # If True, feed the input string reversed (see the introduction above).
# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS
| """ | |
| ## Generate the data | |
| """ | |
class CharacterTable:
    """Bidirectional mapping between characters and one-hot vectors.

    Given a set of characters:
    + Encode them to a one-hot integer representation
    + Decode the one-hot or integer representation to their character output
    + Decode a vector of probabilities to their character output
    """

    def __init__(self, chars):
        """Initialize character table.

        # Arguments
            chars: Characters that can appear in the input.
        """
        # Sort for a deterministic character ordering regardless of input order.
        self.chars = sorted(set(chars))
        self.char_indices = {ch: idx for idx, ch in enumerate(self.chars)}
        self.indices_char = {idx: ch for idx, ch in enumerate(self.chars)}

    def encode(self, C, num_rows):
        """One-hot encode given string C.

        # Arguments
            C: string, to be encoded.
            num_rows: Number of rows in the returned one-hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        one_hot = np.zeros((num_rows, len(self.chars)))
        for row, ch in enumerate(C):
            one_hot[row, self.char_indices[ch]] = 1
        return one_hot

    def decode(self, x, calc_argmax=True):
        """Decode the given vector or 2D array to their character output.

        # Arguments
            x: A vector or a 2D array of probabilities or one-hot representations;
                or a vector of character indices (used with `calc_argmax=False`).
            calc_argmax: Whether to find the character index with maximum
                probability, defaults to `True`.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return "".join(self.indices_char[idx] for idx in indices)
# All the numbers, plus sign and space for padding.
chars = "0123456789+ "
ctable = CharacterTable(chars)

questions = []
expected = []
seen = set()
print("Generating data...")


def _random_operand():
    """Return a random non-negative int with 1 to DIGITS digits."""
    num_digits = np.random.randint(1, DIGITS + 1)
    return int(
        "".join(np.random.choice(list("0123456789")) for _ in range(num_digits))
    )


while len(questions) < TRAINING_SIZE:
    a, b = _random_operand(), _random_operand()
    # Skip any addition questions we've already seen.
    # Also skip any such that X+Y == Y+X (hence the sorting).
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    # Pad the data with spaces such that it is always MAXLEN.
    q = "{}+{}".format(a, b)
    query = q + " " * (MAXLEN - len(q))
    ans = str(a + b)
    # Answers can be of maximum size DIGITS + 1.
    ans += " " * (DIGITS + 1 - len(ans))
    if REVERSE:
        # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the
        # space used for padding.)
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print("Total questions:", len(questions))
| """ | |
| ## Vectorize the data | |
| """ | |
| print("Vectorization...") | |
| x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=bool) | |
| y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=bool) | |
| for i, sentence in enumerate(questions): | |
| x[i] = ctable.encode(sentence, MAXLEN) | |
| for i, sentence in enumerate(expected): | |
| y[i] = ctable.encode(sentence, DIGITS + 1) | |
| # Shuffle (x, y) in unison as the later parts of x will almost all be larger | |
| # digits. | |
| indices = np.arange(len(y)) | |
| np.random.shuffle(indices) | |
| x = x[indices] | |
| y = y[indices] | |
| # Explicitly set apart 10% for validation data that we never train over. | |
| split_at = len(x) - len(x) // 10 | |
| (x_train, x_val) = x[:split_at], x[split_at:] | |
| (y_train, y_val) = y[:split_at], y[split_at:] | |
| print("Training Data:") | |
| print(x_train.shape) | |
| print(y_train.shape) | |
| print("Validation Data:") | |
| print(x_val.shape) | |
| print(y_val.shape) | |
| """ | |
| ## Build the model | |
| """ | |
| print("Build model...") | |
| num_layers = 1 # Try to add more LSTM layers! | |
| model = keras.Sequential() | |
| # "Encode" the input sequence using a LSTM, producing an output of size 128. | |
| # Note: In a situation where your input sequences have a variable length, | |
| # use input_shape=(None, num_feature). | |
| model.add(layers.Input((MAXLEN, len(chars)))) | |
| model.add(layers.LSTM(128)) | |
| # As the decoder RNN's input, repeatedly provide with the last output of | |
| # RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum | |
| # length of output, e.g., when DIGITS=3, max output is 999+999=1998. | |
| model.add(layers.RepeatVector(DIGITS + 1)) | |
| # The decoder RNN could be multiple layers stacked or a single layer. | |
| for _ in range(num_layers): | |
| # By setting return_sequences to True, return not only the last output but | |
| # all the outputs so far in the form of (num_samples, timesteps, | |
| # output_dim). This is necessary as TimeDistributed in the below expects | |
| # the first dimension to be the timesteps. | |
| model.add(layers.LSTM(128, return_sequences=True)) | |
| # Apply a dense layer to the every temporal slice of an input. For each of step | |
| # of the output sequence, decide which character should be chosen. | |
| model.add(layers.Dense(len(chars), activation="softmax")) | |
| model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) | |
| model.summary() | |
| """ | |
| ## Train the model | |
| """ | |
| # Training parameters. | |
| epochs = 30 | |
| batch_size = 32 | |
| # Formatting characters for results display. | |
| green_color = "\033[92m" | |
| red_color = "\033[91m" | |
| end_char = "\033[0m" | |
| # Train the model each generation and show predictions against the validation | |
| # dataset. | |
| for epoch in range(1, epochs): | |
| print() | |
| print("Iteration", epoch) | |
| model.fit( | |
| x_train, | |
| y_train, | |
| batch_size=batch_size, | |
| epochs=1, | |
| validation_data=(x_val, y_val), | |
| ) | |
| # Select 10 samples from the validation set at random so we can visualize | |
| # errors. | |
| for i in range(10): | |
| ind = np.random.randint(0, len(x_val)) | |
| rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])] | |
| preds = np.argmax(model.predict(rowx, verbose=0), axis=-1) | |
| q = ctable.decode(rowx[0]) | |
| correct = ctable.decode(rowy[0]) | |
| guess = ctable.decode(preds[0], calc_argmax=False) | |
| print("Q", q[::-1] if REVERSE else q, end=" ") | |
| print("T", correct, end=" ") | |
| if correct == guess: | |
| print(f"{green_color}☑ {guess}{end_char}") | |
| else: | |
| print(f"{red_color}☒ {guess}{end_char}") | |
| """ | |
| You'll get to 99+% validation accuracy after ~30 epochs. | |
| """ | |