hatakekksheeshh
/

spelling-error-correction

Text Generation

Model card Files Files and versions

xet

Community

hatakekksheeshh committed on Feb 4

Commit

e8f701d

verified ·

1 Parent(s): e2db6d8

Upload ngram_model.py with huggingface_hub

Browse files

Files changed (1) hide show

ngram_model.py +77 -0

ngram_model.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import numpy as np
+from collections import defaultdict
class NgramLanguageModel:
    """Count-based n-gram language model with add-one (Laplace) smoothing.

    Sentences are padded with ``n - 1`` start tokens and one end token, every
    window of ``n`` tokens is counted, and probabilities are estimated as
    ``(count + 1) / (denominator + V)`` where ``V`` is the vocabulary size
    including the two special tokens.

    Attributes:
        START, END: the padding tokens ``'<s>'`` and ``'</s>'``.
        n: order of the model (1 = unigram, 2 = bigram, ...).
        sentences: the training sentences, stored as passed in.
        vocabulary: a private set holding the given vocabulary plus START/END.
        vocab_size: ``len(vocabulary)``, used as ``V`` in the smoothing.
        ngram_counts: maps each n-gram tuple to its number of occurrences.
        context_counts: maps each (n-1)-token context tuple to its number of
            occurrences; only filled when ``n > 1``.
    """

    def __init__(self, n, sentences, vocabulary):
        """Build the model from tokenized training sentences.

        Args:
            n: order of the n-gram model; must be at least 1.
            sentences: iterable of token sequences (each a list/tuple/iterable
                of tokens, not a bare string).
            vocabulary: iterable of known tokens. It is copied, so the
                caller's object is left untouched.

        Raises:
            ValueError: if ``n`` is smaller than 1.
            TypeError: if a training sentence is a ``str``.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")

        # Special padding tokens
        self.START = '<s>'
        self.END = '</s>'

        self.n = n
        self.sentences = sentences

        # Work on a copy: adding the special tokens must not leak into the
        # caller's vocabulary (which may be shared between several models).
        self.vocabulary = set(vocabulary)
        self.vocabulary.update((self.START, self.END))
        self.vocab_size = len(self.vocabulary)

        # N-gram and context counters, filled by build_model()
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)
        # Cached sum of all n-gram counts (unigram denominator)
        self._total_count = None
        self.build_model()

    def _pad(self, sentence):
        """Return ``sentence`` as a list framed by START padding and END.

        A bare string is rejected because it would otherwise be split into
        single characters instead of tokens.
        """
        if isinstance(sentence, str):
            raise TypeError("sentence must be a sequence of tokens, not a str")
        return [self.START] * (self.n - 1) + list(sentence) + [self.END]

    def _total_ngrams(self):
        """Return the sum of all n-gram counts, computing it at most once.

        The attribute is looked up with ``getattr`` so that instances created
        without going through ``__init__`` (e.g. restored from an older
        pickle) still work.
        """
        total = getattr(self, "_total_count", None)
        if total is None:
            total = sum(self.ngram_counts.values())
            self._total_count = total
        return total

    def build_model(self):
        """Count all n-grams (and their contexts) in ``self.sentences``.

        Counts are added on top of whatever is already stored in
        ``ngram_counts`` / ``context_counts``. Prints a short summary when
        done.
        """
        n = self.n
        for sentence in self.sentences:
            padded = self._pad(sentence)
            # Slide a window of n tokens over the padded sentence
            for start in range(len(padded) - n + 1):
                ngram = tuple(padded[start:start + n])
                self.ngram_counts[ngram] += 1
                if n > 1:
                    self.context_counts[ngram[:-1]] += 1

        # Refresh the cached unigram denominator after (re)counting
        self._total_count = sum(self.ngram_counts.values())

        print(f"{self.n}-gram model built!")
        print(f"Unique {self.n}-grams: {len(self.ngram_counts):,}")

    def get_probability(self, ngram):
        """Return the add-one smoothed probability of ``ngram``.

        Args:
            ngram: sequence of tokens; it is converted to a tuple for lookup.
                An n-gram that was never counted gets count 0.

        Returns:
            For ``n == 1``: ``(count(w) + 1) / (total_words + V)``.
            Otherwise: ``(count(context, w) + 1) / (count(context) + V)``
            where the context is every token but the last.
        """
        ngram = tuple(ngram)
        count = self.ngram_counts.get(ngram, 0)
        if self.n == 1:
            denominator = self._total_ngrams()
        else:
            denominator = self.context_counts.get(ngram[:-1], 0)
        return (count + 1) / (denominator + self.vocab_size)

    def get_sentence_probability(self, sentence):
        """Score a tokenized sentence.

        The sentence is padded the same way as the training data and the
        base-2 log probabilities of all its n-grams are summed.

        Args:
            sentence: iterable of tokens (not a bare string).

        Returns:
            A ``(probability, log2_probability)`` tuple, where
            ``probability == 2 ** log2_probability``.

        Raises:
            TypeError: if ``sentence`` is a ``str``.
        """
        padded = self._pad(sentence)
        n = self.n
        log_prob = 0.0
        for start in range(len(padded) - n + 1):
            log_prob += np.log2(self.get_probability(padded[start:start + n]))
        return 2 ** log_prob, log_prob

    def get_perplexity(self, sentence):
        """Return the perplexity of a tokenized sentence.

        Computed as ``2 ** (-log2_probability / N)`` with
        ``N = len(sentence) + 1`` (every token plus the END token).
        """
        if isinstance(sentence, str):
            raise TypeError("sentence must be a sequence of tokens, not a str")
        # Materialize once so iterators can be both scored and measured
        tokens = list(sentence)
        _, log_prob = self.get_sentence_probability(tokens)
        return 2 ** (-log_prob / (len(tokens) + 1))