# -*- coding: utf-8 -*-
''' Provides functionality for converting a given list of tokens (words) into
    numbers, according to the given vocabulary.
'''
from __future__ import print_function, division, unicode_literals

import numbers
from copy import deepcopy

import numpy as np
from sklearn.model_selection import train_test_split

from torchmoji.create_vocab import extend_vocab, VocabBuilder
from torchmoji.word_generator import WordGenerator
from torchmoji.global_variables import SPECIAL_TOKENS


class SentenceTokenizer():
    """ Create numpy array of tokens corresponding to input sentences.
        The vocabulary can include Unicode tokens.
    """
    def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
                 ignore_sentences_with_only_custom=False, masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.
        """
        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))

        # Copy the vocabulary so the caller's dict cannot be modified in place
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Use the custom word generator if one is given, otherwise fall back
        # to the default one. The generator has no stream yet; the stream is
        # assigned later in tokenize_sentences.
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False
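
    # A minimal usage sketch (illustrative only; `vocab` is a placeholder for
    # a word -> index dict whose values fit in uint16, such as one loaded from
    # a torchmoji vocabulary JSON file):
    #
    #   st = SentenceTokenizer(vocab, fixed_length=30)
    #   tokens, infos, stats = st.tokenize_sentences([u"Testing. 1. 2. 3."])
    #   # tokens has shape (1, 30); unused positions hold masking_value (0).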

    def tokenize_sentences(self, sentences, reset_stats=True, max_sentences=None):
        """ Converts a given list of sentences into a numpy array according to
            its vocabulary.

        # Arguments:
            sentences: List of sentences to be tokenized.
            reset_stats: Whether the word generator's stats should be reset.
            max_sentences: Maximum number of sentences. Must be set if the
                number of sentences cannot be inferred from the input
                (e.g. a generator).

        # Returns:
            Numpy array of the tokenized sentences with masking,
            infos,
            stats

        # Raises:
            ValueError: When the maximum number of sentences is not set and
                cannot be inferred.
        """
        if max_sentences is None and not hasattr(sentences, '__len__'):
            raise ValueError('Either you must provide an array with a length '
                             'attribute (e.g. a list) or specify the maximum '
                             'number of sentences yourself using '
                             '`max_sentences`!')
        n_sentences = (max_sentences if max_sentences is not None
                       else len(sentences))

        if self.masking_value == 0:
            tokens = np.zeros((n_sentences, self.fixed_length), dtype='uint16')
        else:
            tokens = (np.ones((n_sentences, self.fixed_length), dtype='uint16')
                      * self.masking_value)

        if reset_stats:
            self.wordgen.reset_stats()

        # With a custom word generator, additional info can be extracted from
        # each sentence (e.g. labels)
        infos = []

        # The word generator yields a list of words and an info dict for each
        # sentence in the stream
        self.wordgen.stream = sentences
        next_insert = 0
        n_ignored_unknowns = 0
        for s_words, s_info in self.wordgen:
            s_tokens = self.find_tokens(s_words)

            # Optionally skip sentences that consist solely of special tokens
            if (self.ignore_sentences_with_only_custom and
                    np.all([t < len(SPECIAL_TOKENS) for t in s_tokens])):
                n_ignored_unknowns += 1
                continue
            if len(s_tokens) > self.fixed_length:
                s_tokens = s_tokens[:self.fixed_length]
            tokens[next_insert, :len(s_tokens)] = s_tokens
            infos.append(s_info)
            next_insert += 1

        # With the default word generator every sentence should have been
        # inserted; custom generators and the ignore flag above may skip
        # sentences, so trim the outputs to the number actually inserted.
        if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
            assert len(sentences) == next_insert
        else:
            tokens = tokens[:next_insert]
            infos = infos[:next_insert]

        return tokens, infos, self.wordgen.stats
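
    # Sketch of tokenizing from a stream without a length (hypothetical
    # `sentence_gen` generator): max_sentences must then be given explicitly.
    #
    #   tokens, infos, stats = st.tokenize_sentences(sentence_gen,
    #                                                max_sentences=10000)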

    def find_tokens(self, words):
        """ Maps each word to its index in the vocabulary, falling back to
            the unknown token for out-of-vocabulary words.
        """
        assert len(words) > 0
        tokens = []
        for w in words:
            try:
                tokens.append(self.vocabulary[w])
            except KeyError:
                tokens.append(self.unknown_value)
        return tokens
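
    # For example, with a vocabulary {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1,
    # ..., 'hello': 6}, find_tokens(['hello', 'stranger']) returns [6, 1].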

    def split_train_val_test(self, sentences, info_dicts,
                             split_parameter=[0.7, 0.1, 0.2], extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If three lists are passed instead of
                three values, these lists specify which observations belong to
                which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """
        # If three lists of indices were passed, use them directly
        if isinstance(split_parameter, list) and \
           all(isinstance(x, list) for x in split_parameter) and \
           len(split_parameter) == 3:

            # Helper that discards indices which are not numbers or are out
            # of range
            def verify_indices(inds):
                return list(filter(lambda i: isinstance(i, numbers.Number) and
                                             i < len(sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Otherwise draw a random split with the given proportions
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

        # Map the indices to the actual sentences and info dicts
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend the vocabulary with the most frequent tokens of the training set
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Tokenize each split with the (possibly extended) vocabulary
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]

        return result, result_infos, added
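
    # Sketch of a random split that also extends the vocabulary (`sentences`
    # and `labels` are placeholders, not part of this module):
    #
    #   info_dicts = [{'label': l} for l in labels]
    #   (train_X, val_X, test_X), infos, added = st.split_train_val_test(
    #       sentences, info_dicts, extend_with=10000)
    #   # `added` is how many rows the model's embedding layer must grow by.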

    def to_sentence(self, sentence_idx):
        """ Converts a tokenized sentence back to a list of words.

        # Arguments:
            sentence_idx: List of numbers, representing a tokenized sentence
                given the current vocabulary.

        # Returns:
            String created by converting all numbers back to words and joined
            together with spaces.
        """
        # Recompute the reverse mapping in case the vocabulary was extended
        ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

        sentence_as_list = [ind_to_word[x] for x in sentence_idx]
        cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
        return " ".join(cleaned_list)
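
    # Round-trip sketch: tokenize a sentence, then map it back to text. The
    # exact output depends on the vocabulary and on the WordGenerator's
    # preprocessing (e.g. special-token replacement):
    #
    #   tokens, _, _ = st.tokenize_sentences([u"hello world"])
    #   st.to_sentence(tokens[0])  # e.g. "hello world"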


def coverage(dataset, verbose=False):
    """ Computes the fraction of words in a given tokenized dataset that are
        covered by the vocabulary, i.e. not mapped to the unknown token.

    # Arguments:
        dataset: Tokenized dataset to be checked.
        verbose: Verbosity flag.

    # Returns:
        Fraction of known (non-unknown) tokens.
    """
    # Assumes the default masking_value=0 and unknown_value=1: masked entries
    # are excluded from the total and tokens equal to 1 count as unknown.
    n_total = np.count_nonzero(dataset)
    n_unknown = np.sum(dataset == 1)
    coverage = 1.0 - float(n_unknown) / n_total

    if verbose:
        print("Unknown words: {}".format(n_unknown))
        print("Total words: {}".format(n_total))
        print("Coverage: {}".format(coverage))
    return coverage
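
# Sketch of checking vocabulary coverage on a tokenized split (assumes the
# default unknown_value=1 and masking_value=0 used above):
#
#   known_fraction = coverage(train_X, verbose=True)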