# -*- coding: utf-8 -*-
"""final-maybe

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UueiutTkBBB9Gb2Brp4mQVUvw7cjXTwo
"""

import nltk

SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"
| """Add Sentence Tokens: | |
| To identify the beginning and end of the sentence | |
| add the StartOfSentence and EndOfSentence tokens. | |
| The argument 'sentences' takes a list of str and 'n' is the order of the model. | |
| The function returns the list of generated of sentences. | |
| For bigram models (or greater) both tokens are added otherwise or only one is added. | |
| """ | |
def add_sentence_tokens(sentences, n):
    sos = SOS * (n-1) if n > 1 else SOS
    return ['{}{} {}'.format(sos, s, EOS) for s in sentences]
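"""Illustrative example (added; not part of the original notebook): for a
trigram model each sentence is wrapped with two start tokens and one end token.
"""
print(add_sentence_tokens(['the cat sat'], 3))
# expected: ['<s> <s> the cat sat </s>']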
| """Replace singletons: | |
| For the tokens appearing only ones in the corpus, replace it with <UNK> | |
| The argument 'tokens' takes input of the tokens comprised in the corpus. | |
| The function returns list of tokens after replacing each singleton with <UNK> | |
| """ | |
def replace_singletons(tokens):
    vocab = nltk.FreqDist(tokens)
    return [token if vocab[token] > 1 else UNK for token in tokens]
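"""Illustrative example (added): tokens occurring once become <UNK>, repeated
tokens are kept.
"""
print(replace_singletons(['a', 'b', 'a', 'c']))
# expected: ['a', '<UNK>', 'a', '<UNK>']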
| """Preprocess: | |
| The function takes the argument 'sentences' that takes the list of str of | |
| preprocess. The argument 'n' is the order of the model. | |
| Adds the above three tokens to the sentences and tokenize. | |
| The function returns preprocessed sentences. | |
| """ | |
def preprocess(sentences, n):
    sentences = add_sentence_tokens(sentences, n)
    tokens = ' '.join(sentences).split(' ')
    tokens = replace_singletons(tokens)
    return tokens
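"""Illustrative example (added): preprocessing two sentences for a bigram model.
Tokens that appear only once across the corpus ('cat' and 'dog' here) are mapped
to <UNK>.
"""
print(preprocess(['the cat sat', 'the dog sat'], 2))
# expected: ['<s>', 'the', '<UNK>', 'sat', '</s>', '<s>', 'the', '<UNK>', 'sat', '</s>']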
from itertools import product
import math
from pathlib import Path
| """ This function loads training and testing corpus from a directory. | |
| The argument 'data_dir' contains path of the directory. The directory should contain files: 'train.txt' and 'test.txt' | |
| Function will return train and test sets as lists of sentences. | |
| """ | |
def load_data(data_dir):
    # Path handles directories given with or without a trailing slash.
    train_path = Path(data_dir) / 'train.txt'
    test_path = Path(data_dir) / 'test.txt'
    with open(train_path, 'r') as f:
        train = [l.strip() for l in f.readlines()]
    with open(test_path, 'r') as f:
        test = [l.strip() for l in f.readlines()]
    return train, test
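"""Usage note (added for illustration): the data directory is expected to hold
one sentence per line in each file, e.g.

    data/
        train.txt
        test.txt
"""
# Hypothetical call (the path below is an example, not the original dataset location):
# train, test = load_data('data/')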
| """Trained N-gram model: | |
| A trained model for the given corpus is constructed by preprocessing the | |
| corpus and calculating the smoothed probabilities of each n-gram. | |
| The arguments contains training data (list of strings), n (integer; order of the model), | |
| and an integer used for laplace smoothing. | |
| Further, the model has a method for calculating perplexity. | |
| """ | |
class LanguageModel(object):

    def __init__(self, train_data, n, laplace=1):
        self.n = n
        self.laplace = laplace
        self.tokens = preprocess(train_data, n)
        self.vocab = nltk.FreqDist(self.tokens)
        self.model = self._create_model()
        self.masks = list(reversed(list(product((0, 1), repeat=n))))
    def _smooth(self):
        """
        The count of each n-gram in the training corpus and the count of its
        first (n-1) tokens (its context) give the Laplace-smoothed probability:
        (count(n-gram) + laplace) / (count(context) + laplace * vocab_size).
        The function returns each smoothed probability mapped to its n-gram.
        """
        vocab_size = len(self.vocab)
        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)
        m_grams = nltk.ngrams(self.tokens, self.n - 1)
        m_vocab = nltk.FreqDist(m_grams)

        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return {n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items()}
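    # Worked example (illustrative numbers, assuming laplace=1 and a vocabulary
    # of 6 types): if the trigram ('the', 'cat', 'sat') occurs 2 times and its
    # context ('the', 'cat') occurs 3 times, its smoothed probability is
    # (2 + 1) / (3 + 1 * 6) = 3 / 9 ≈ 0.333.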
    def _create_model(self):
        """
        This function creates a probability distribution over the vocabulary of
        the training corpus.
        In a unigram model the probabilities are simply the relative frequencies
        of each token over the whole corpus.
        Otherwise, the probabilities are Laplace-smoothed relative frequencies.
        The function returns a dictionary mapping each n-gram (a tuple of strings)
        to its probability (float).
        """
        if self.n == 1:
            num_tokens = len(self.tokens)
            return {(unigram,): count / num_tokens for unigram, count in self.vocab.items()}
        else:
            return self._smooth()
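    # Illustrative shape of the returned mapping for a trigram model (the
    # probabilities below are made up):
    # {('the', 'cat', 'sat'): 0.0012, ('cat', 'sat', 'on'): 0.0009, ...}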
    def _convert_oov(self, ngram):
        """
        This function handles words encountered in the test data but unseen in
        training by converting the given n-gram to one that is known to the model.
        Tokens are replaced with <UNK> position by position, trying the most
        specific masks first, and the search stops at the first masked n-gram
        for which the model has an entry.
        The function returns that n-gram with <UNK> tokens in certain positions.
        """
        mask = lambda ngram, bitmask: tuple(
            (token if flag == 1 else UNK for token, flag in zip(ngram, bitmask)))
        ngram = (ngram,) if type(ngram) is str else ngram
        for possible_known in [mask(ngram, bitmask) for bitmask in self.masks]:
            if possible_known in self.model:
                return possible_known
    def perplexity(self, test_data):
        """
        Calculates the perplexity of the model on the given test sentences,
        exp(-(1/N) * sum(log p(n-gram))), where N is the number of test tokens.
        The function returns a float value.
        """
        test_tokens = preprocess(test_data, self.n)
        test_ngrams = nltk.ngrams(test_tokens, self.n)
        N = len(test_tokens)
        known_ngrams = (self._convert_oov(ngram) for ngram in test_ngrams)
        probabilities = [self.model[ngram] for ngram in known_ngrams]
        return math.exp((-1 / N) * sum(map(math.log, probabilities)))
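    # Worked example (illustrative): with N = 4 and n-gram probabilities
    # [0.25, 0.5, 0.125, 0.25], the perplexity is
    # exp(-(1/4) * log(0.25 * 0.5 * 0.125 * 0.25)) = 256 ** 0.25 = 4.0.
    # Lower perplexity means the model assigns higher probability to the test data.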
    def _best_candidate(self, prev, i, without=[]):
        """
        Selects the most probable next token given the previous (n-1) tokens.
        The arguments are 'prev' (tuple of the previous (n-1) tokens), 'i' (rank
        of the candidate to pick when the context is empty or ends with a start
        token), and 'without' (tokens to exclude from the candidate list).
        The function returns the chosen token and its probability.
        """
| blacklist = ["<UNK>"] + without | |
| candidates = ((ngram[-1],prob) for ngram,prob in self.model.items() if ngram[:-1]==prev) | |
| candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates) | |
| candidates = sorted(candidates, key=lambda candidate: candidate[1], reverse=True) | |
| if len(candidates) == 0: | |
| return ("</s>", 1) | |
| else: | |
| return candidates[0 if prev != () and prev[-1] != "<s>" else i] | |
# data_path = '/content/drive/Shareddrives/MathProject22/Dataset/data/'
# train, test = load_data(data_path)
# # if __name__ == '__main__':
# model_instance = LanguageModel(train[0:100], 3, 0)
# # the first number (3) is the n of the n-gram model
# # the second number (0) is the Laplace coefficient (0 means no smoothing)
# print(model_instance.perplexity(test))
# prev = ('I', 'love',)
# print(model_instance._best_candidate(prev, 1)[0])
# # 1 is the rank i of the candidate to pick
# import pickle
# filename = 'without_laplace.sav'
# pickle.dump(model_instance, open(filename, 'wb'))
# len(train)
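"""Minimal self-contained demo (added for illustration; the toy corpus below is
made up and is not part of the original dataset). It trains a bigram model with
Laplace smoothing and evaluates perplexity on the training sentences themselves,
since test n-grams without an <UNK>-masked counterpart in training would raise a
KeyError in perplexity().
"""
toy_train = ['the cat sat on the mat',
             'the dog sat on the log',
             'the cat chased the dog']
toy_model = LanguageModel(toy_train, 2, laplace=1)
print(toy_model.perplexity(toy_train))            # perplexity on the training corpus
print(toy_model._best_candidate(('the',), 0)[0])  # most probable token after 'the'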