Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| ''' | |
| Split a given dataset into three different datasets: training, validation and | |
| testing. | |
| This is achieved by splitting the given list of sentences into three separate | |
| lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an | |
| explicit enumeration. The sentences are also tokenised using the given | |
| vocabulary. | |
| Also splits a given list of dictionaries containing information about | |
| each sentence. | |
| An additional parameter can be set 'extend_with', which will extend the given | |
| vocabulary with up to 'extend_with' tokens, taken from the training dataset. | |
| ''' | |
| from __future__ import print_function, unicode_literals | |
| import example_helper | |
| import json | |
| from torchmoji.sentence_tokenizer import SentenceTokenizer | |
| DATASET = [ | |
| 'I am sentence 0', | |
| 'I am sentence 1', | |
| 'I am sentence 2', | |
| 'I am sentence 3', | |
| 'I am sentence 4', | |
| 'I am sentence 5', | |
| 'I am sentence 6', | |
| 'I am sentence 7', | |
| 'I am sentence 8', | |
| 'I am sentence 9 newword', | |
| ] | |
| INFO_DICTS = [ | |
| {'label': 'sentence 0'}, | |
| {'label': 'sentence 1'}, | |
| {'label': 'sentence 2'}, | |
| {'label': 'sentence 3'}, | |
| {'label': 'sentence 4'}, | |
| {'label': 'sentence 5'}, | |
| {'label': 'sentence 6'}, | |
| {'label': 'sentence 7'}, | |
| {'label': 'sentence 8'}, | |
| {'label': 'sentence 9'}, | |
| ] | |
| with open('../model/vocabulary.json', 'r') as f: | |
| vocab = json.load(f) | |
| st = SentenceTokenizer(vocab, 30) | |
| # Split using the default split ratio | |
| print(st.split_train_val_test(DATASET, INFO_DICTS)) | |
| # Split explicitly | |
| print(st.split_train_val_test(DATASET, | |
| INFO_DICTS, | |
| [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]], | |
| extend_with=1)) | |