import numpy as np
from collections import defaultdict
|
|
|
|
class NgramLanguageModel:
    """Add-one (Laplace) smoothed n-gram language model.

    Counts n-grams over the training sentences once at construction time
    and then answers smoothed probability, sentence-probability, and
    perplexity queries.
    """

    def __init__(self, n, sentences, vocabulary):
        """Build the model.

        Args:
            n: n-gram order (1 = unigram, 2 = bigram, ...).
            sentences: iterable of token lists, without boundary markers.
            vocabulary: set of word types; the boundary markers are added
                to the model's own copy.
        """
        self.START = '<s>'
        self.END = '</s>'

        # Work on a copy so the caller's set is not mutated in place
        # (the original implementation called vocabulary.add(...) on the
        # caller's object — a shared-argument mutation).
        self.vocabulary = set(vocabulary) | {self.START, self.END}

        self.n = n
        self.sentences = sentences
        self.vocab_size = len(self.vocabulary)

        # ngram_counts: tuple of n tokens -> occurrence count
        # context_counts: (n-1)-token prefix -> occurrence count (n > 1 only)
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)

        self.build_model()

    def build_model(self):
        """Count every n-gram (and, for n > 1, every context) in the corpus."""
        for sentence in self.sentences:
            # Pad with n-1 start markers so the first real word has a
            # full-length context; one end marker closes the sentence.
            padded_sentence = [self.START] * (self.n - 1) + sentence + [self.END]

            for i in range(len(padded_sentence) - self.n + 1):
                ngram = tuple(padded_sentence[i:i + self.n])
                self.ngram_counts[ngram] += 1
                if self.n > 1:
                    self.context_counts[ngram[:-1]] += 1

        # Cache the corpus token total once. The original recomputed
        # sum(self.ngram_counts.values()) inside get_probability for every
        # unigram query — O(vocabulary) work per call.
        self.total_words = sum(self.ngram_counts.values())

        print(f"{self.n}-gram model built!")
        print(f"Unique {self.n}-grams: {len(self.ngram_counts):,}")

    def get_probability(self, ngram):
        """Return the add-one smoothed probability of *ngram*.

        Args:
            ngram: sequence of n tokens (list or tuple).
        """
        ngram = tuple(ngram)
        count = self.ngram_counts.get(ngram, 0)

        if self.n == 1:
            denominator = self.total_words
        else:
            denominator = self.context_counts.get(ngram[:-1], 0)

        # Laplace smoothing: an unseen event gets 1 / (denominator + V),
        # so no n-gram ever has probability zero.
        return (count + 1) / (denominator + self.vocab_size)

    def get_sentence_probability(self, sentence):
        """Return ``(probability, log2_probability)`` of a token-list sentence."""
        padded_sentence = [self.START] * (self.n - 1) + sentence + [self.END]

        log_prob = 0.0
        for i in range(len(padded_sentence) - self.n + 1):
            log_prob += np.log2(self.get_probability(padded_sentence[i:i + self.n]))

        # NOTE: 2 ** log_prob can underflow to 0.0 for long sentences;
        # the log-probability is the numerically reliable value.
        return 2 ** log_prob, log_prob

    def get_perplexity(self, sentence):
        """Return the per-token perplexity of a token-list sentence.

        N counts the real tokens plus the end marker; the start padding is
        context only and is not a predicted event.
        """
        _, log_prob = self.get_sentence_probability(sentence)
        N = len(sentence) + 1
        return 2 ** (-log_prob / N)
|
|
|
|