Spaces:
Sleeping
Sleeping
| from collections import Counter | |
| from nltk.tokenize import RegexpTokenizer | |
| from source.config import Config | |
class Vocab:
    """
    Offers word2index and index2word functionality after counting words in input sentences.
    The word counts make it possible to restrict the vocabulary to the most common words.
    Explicitly reserves four indices: <pad>, <sos>, <eos> and <unk>.
    """

    # Text returned for an unknown index when the loaded table has no entry
    # stored at UNKNOWN_WORD_INDEX (e.g. nothing was loaded yet).
    UNKNOWN_TOKEN = '<unk>'

    def __init__(self, sentence_splitter=None):
        """
        Args:
            sentence_splitter: tokenizing function taking a sentence string and returning its words.
                Defaults to an NLTK ``RegexpTokenizer`` matching plain words and ``<tag>``-style tokens.
        """
        self.config = Config()
        self.counter = Counter()
        self.word2index = dict()
        self.index2word = dict()
        self.size = 0

        # predefined tokens
        self.PADDING_INDEX = 0
        self.SOS = 1
        self.EOS = 2
        self.UNKNOWN_WORD_INDEX = 3

        if sentence_splitter is None:
            # matches sequences of word characters, including ones between < >
            word_regex = r'(?:\w+|<\w+>)'
            # tokenize the string into words
            sentence_splitter = RegexpTokenizer(word_regex).tokenize

        self.splitter = sentence_splitter

    def add_sentence(self, sentence: str):
        """
        Update word counts from sentence after tokenizing it into words
        """
        self.counter.update(self.splitter(sentence))

    def word_to_index(self, word: str) -> int:
        """ Map word to index from word2index dictionary in vocabulary

        Args:
            word (str): word to be mapped

        Returns:
            int: index matched to the word, or UNKNOWN_WORD_INDEX when the word is not in the vocabulary
        """
        return self.word2index.get(word, self.UNKNOWN_WORD_INDEX)

    def index_to_word(self, index: int) -> str:
        """ Map index to word from index2word dictionary in vocabulary

        Args:
            index (int): index to be mapped

        Returns:
            str: word matched to the index. For an index that is not in the vocabulary the word stored
                at UNKNOWN_WORD_INDEX is returned, or '<unk>' when that entry is missing as well.
        """
        # Resolve the fallback with .get so an empty / partially loaded table
        # yields the unknown token instead of a second KeyError.
        unknown_word = self.index2word.get(self.UNKNOWN_WORD_INDEX, self.UNKNOWN_TOKEN)
        return self.index2word.get(index, unknown_word)

    def load_vocab(self, filepath: str):
        """ Load the word2index and index2word dictionaries from a text file.

        Both dictionaries are reset before reading. Errors raised while opening or reading the
        file are printed, not raised; whatever was loaded up to that point is kept.

        Args:
            filepath (str): path of the text file where the vocabulary is saved (i.e 'word2index.txt')

        Note: the lines in file are assumed to be in form: 'word SPACE index'. Lines that do not have
            that form (a header line, blank lines) are skipped.
        """
        self.word2index = dict()
        self.index2word = dict()

        try:
            with open(filepath) as file:
                for line in file:
                    fields = line.strip().split(' ')
                    if len(fields) < 2:
                        # blank or single-token line: nothing to map
                        continue
                    word, raw_index = fields[0], fields[1]
                    try:
                        index = int(raw_index)
                    except ValueError:
                        # non-numeric index, e.g. the header line; skip it
                        # instead of aborting the whole load
                        continue
                    self.word2index[word] = index
                    self.index2word[index] = word
        except Exception as e:
            # best-effort load: report the problem and keep going
            print(f"Error loading vocabulary from file {filepath}: {e}")