Spaces:

Pendrokar
/

xVASynth-TTS

Running on CPU Upgrade

DeepMoji xVASynth Plugin

e1c08c5 almost 2 years ago

1.71 kB

	'''
	Split a given dataset into three different datasets: training, validation and
	testing.

	This is achieved by splitting the given list of sentences into three separate
	lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
	explicit enumeration. The sentences are also tokenised using the given
	vocabulary.

	Also splits a given list of dictionaries containing information about
	each sentence.

	An additional parameter, 'extend_with', can be set; it extends the given
	vocabulary with up to 'extend_with' tokens taken from the training dataset.
	'''
	from __future__ import print_function, unicode_literals
	import example_helper
	import json

	from torchmoji.sentence_tokenizer import SentenceTokenizer

	# Ten toy sentences numbered 0-9; only the final one carries the extra
	# token 'newword'.
	DATASET = ['I am sentence {}'.format(n) for n in range(9)]
	DATASET.append('I am sentence 9 newword')

	# One info dict per sentence in DATASET, in the same order, each holding a
	# single 'label' entry.
	INFO_DICTS = [{'label': 'sentence {}'.format(n)} for n in range(10)]

	# Load the vocabulary from its JSON file. The path is relative to the
	# current working directory, so run this script from its own folder.
	with open('../model/vocabulary.json', 'r') as f:
		vocab = json.load(f)

	# Build the tokenizer on top of the loaded vocabulary.
	# NOTE(review): 30 is presumably the fixed length of each tokenised
	# sentence -- confirm against SentenceTokenizer's signature.
	st = SentenceTokenizer(vocab, 30)

	# Split using the default split ratio
	print(st.split_train_val_test(DATASET, INFO_DICTS))

	# Split explicitly: the three inner lists enumerate DATASET indices for the
	# three resulting datasets (presumably training, validation, testing in that
	# order -- confirm). Index 9, the only sentence containing 'newword', sits
	# in the first list, and extend_with=1 allows the vocabulary to grow by up
	# to one token taken from the training data.
	print(st.split_train_val_test(DATASET,
		INFO_DICTS,
		[[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
		extend_with=1))