hatakekksheeshh commited on
Commit
e8f701d
·
verified ·
1 Parent(s): e2db6d8

Upload ngram_model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ngram_model.py +77 -0
ngram_model.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from collections import defaultdict
3
+
4
+
5
class NgramLanguageModel:
    """N-gram language model with add-one (Laplace) smoothing.

    Counts n-grams over `sentences` (each a list of string tokens),
    padding every sentence with (n - 1) start tokens and one end token,
    and exposes smoothed probabilities, sentence (log-)probability, and
    per-token perplexity.
    """

    def __init__(self, n, sentences, vocabulary):
        # Special boundary tokens; they become part of the model vocabulary.
        self.START = '<s>'
        self.END = '</s>'

        self.n = n
        self.sentences = sentences
        # Copy the caller's set instead of mutating it in place
        # (the original added START/END into the argument itself).
        self.vocabulary = set(vocabulary) | {self.START, self.END}
        self.vocab_size = len(self.vocabulary)

        # ngram_counts: tuple of n tokens -> occurrence count
        # context_counts: tuple of n-1 tokens -> count (unused when n == 1)
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)

        self.build_model()

    def build_model(self):
        """Populate n-gram and context counts from the training sentences."""
        for sentence in self.sentences:
            # (n - 1) start pads give the first real token a full context;
            # the end pad lets the model assign probability to stopping.
            padded_sentence = [self.START] * (self.n - 1) + sentence + [self.END]

            for i in range(len(padded_sentence) - self.n + 1):
                ngram = tuple(padded_sentence[i:i + self.n])
                self.ngram_counts[ngram] += 1
                if self.n > 1:
                    self.context_counts[ngram[:-1]] += 1

        # Cache the corpus-wide unigram total once; the original recomputed
        # sum(self.ngram_counts.values()) on every unigram probability query.
        self._total_unigrams = sum(self.ngram_counts.values())

        print(f"{self.n}-gram model built!")
        print(f"Unique {self.n}-grams: {len(self.ngram_counts):,}")

    def get_probability(self, ngram):
        """Return the add-one smoothed probability of `ngram`.

        `ngram` is any sequence of exactly n tokens; unseen n-grams get
        the smoothed floor 1 / (denominator + V), never zero.
        """
        ngram = tuple(ngram)
        count = self.ngram_counts.get(ngram, 0)

        if self.n == 1:
            # Unigram: P(w) = (count(w) + 1) / (total_words + V)
            denominator = self._total_unigrams
        else:
            # N-gram: P(w_n | context) = (count(context, w_n) + 1) / (count(context) + V)
            denominator = self.context_counts.get(ngram[:-1], 0)

        return (count + 1) / (denominator + self.vocab_size)

    def get_sentence_probability(self, sentence):
        """Return (probability, log2_probability) of a token-list sentence.

        NOTE: the linear probability 2 ** log_prob can underflow to 0.0
        for long sentences; prefer the log value for comparisons.
        """
        padded_sentence = [self.START] * (self.n - 1) + sentence + [self.END]

        log_prob = 0.0
        for i in range(len(padded_sentence) - self.n + 1):
            log_prob += np.log2(self.get_probability(padded_sentence[i:i + self.n]))

        return 2 ** log_prob, log_prob

    def get_perplexity(self, sentence):
        """Return the per-token perplexity of `sentence` (lower is better)."""
        _, log_prob = self.get_sentence_probability(sentence)
        # Number of predicted tokens: every sentence token plus the end token.
        N = len(sentence) + 1
        return 2 ** (-log_prob / N)