#!/usr/bin/env python
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# NOTE: This requires PyTorch! We do not provide installation scripts to install PyTorch.
# It is up to you to install this dependency if you want to execute this example.
# PyTorch's website should give you clear instructions on this: http://pytorch.org/
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from torch.nn.modules.sparse import EmbeddingBag
import numpy as np
import torch
import random
import string
import time
from fasttext import load_model
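

# FastTextEmbeddingBag exposes a trained fastText model as a PyTorch
# EmbeddingBag: the model's input (word + subword) matrix becomes the bag's
# weight, and forward() averages the subword vectors of each input word,
# which is how fastText itself builds word vectors.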
class FastTextEmbeddingBag(EmbeddingBag):
    def __init__(self, model_path):
        # Load the fastText model and copy its input matrix into the
        # EmbeddingBag weights so lookups can run entirely in PyTorch.
        self.model = load_model(model_path)
        input_matrix = self.model.get_input_matrix()
        input_matrix_shape = input_matrix.shape
        super().__init__(input_matrix_shape[0], input_matrix_shape[1])
        self.weight.data.copy_(torch.FloatTensor(input_matrix))

    def forward(self, words):
        # Flatten the subword indices of all words into one index vector and
        # record, for each word, the offset where its subwords start.
        word_subinds = np.empty([0], dtype=np.int64)
        word_offsets = [0]
        for word in words:
            _, subinds = self.model.get_subwords(word)
            word_subinds = np.concatenate((word_subinds, subinds))
            word_offsets.append(word_offsets[-1] + len(subinds))
        word_offsets = word_offsets[:-1]
        # torch.autograd.Variable is deprecated; plain tensors work here.
        ind = torch.LongTensor(word_subinds)
        offsets = torch.LongTensor(word_offsets)
        return super().forward(ind, offsets)
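

# Helper used below to generate a random alphanumeric word of length N.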
def random_word(N):
    return ''.join(
        random.choices(
            string.ascii_uppercase + string.ascii_lowercase + string.digits,
            k=N
        )
    )
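

# The block below is a quick consistency check: it loads the same model through
# both FastTextEmbeddingBag and fastText directly, embeds batches of random
# words, and asserts that the two sets of vectors match. It assumes a trained
# model file "fil9.bin" is available in the working directory (for example, one
# trained on the fil9 corpus).
#
# Minimal usage sketch (the model path is a placeholder, not a file shipped
# with this example):
#     ft_emb = FastTextEmbeddingBag("path/to/model.bin")
#     vectors = ft_emb(["hello", "world"])  # shape: (2, embedding dimension)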
if __name__ == "__main__":
    ft_emb = FastTextEmbeddingBag("fil9.bin")
    model = load_model("fil9.bin")
    num_lines = 200
    total_seconds = 0.0
    total_words = 0
    for _ in range(num_lines):
        # Build a "line" of 15-25 random words, each 1-10 characters long.
        words = [
            random_word(random.randint(1, 10))
            for _ in range(random.randint(15, 25))
        ]
        total_words += len(words)
        words_average_length = sum([len(word) for word in words]) / len(words)
        # time.clock() was removed in Python 3.8; use perf_counter() instead.
        start = time.perf_counter()
        words_emb = ft_emb(words)
        total_seconds += time.perf_counter() - start
        # Every PyTorch embedding must match fastText's own word vector.
        for i in range(len(words)):
            word = words[i]
            ft_word_emb = model.get_word_vector(word)
            py_emb = np.array(words_emb[i].data)
            assert np.isclose(ft_word_emb, py_emb).all()
    print(
        "Took {:2.5f} seconds to build embeddings for {} lines with a total of {} words.".format(
            total_seconds, num_lines, total_words
        )
    )