Mayanand committed on
Commit
6557c9c
·
1 Parent(s): 555b54d

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +62 -1
utils.py CHANGED
@@ -25,4 +25,65 @@ def return_user_agent():
25
  headers = {
26
  'User-Agent': ua
27
  }
28
- return headers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  headers = {
26
  'User-Agent': ua
27
  }
28
+ return headers
29
+
30
+
31
def return2():
    """Return the constant ``2``.

    Used as the ``default_factory`` of the ``defaultdict`` that
    ``Tokenizer.complete`` builds for ``val2idx``; 2 is the index that
    ``Tokenizer`` assigns to the ``'<unk>'`` token, so unknown words map
    to it.
    """
    # NOTE(review): presumably a named function rather than a lambda so the
    # defaultdict can be pickled by Tokenizer.pickle_tokenizer - confirm.
    unk_index = 2
    return unk_index
34
+
35
class Tokenizer:
    """Tokenizer class for tokenizing captions in the Flicker8k dataset.

    Builds a word-level vocabulary: every distinct lower-cased,
    whitespace-separated word gets an integer index. Indices 0-3 are
    reserved for the special tokens ``<start>``, ``<end>``, ``<unk>`` and
    ``<pad>``.

    Parameters
    ----------
    root : str
        root directory where dataset is stored

    Attributes
    ----------
    vocab : list of str
        Words in index order (``vocab[i]`` is the word with index ``i``).
    count : int
        Highest index handed out so far (starts at 3, the index of ``<pad>``).
    val2idx : dict
        Word -> index mapping. Replaced by a ``defaultdict`` in ``complete``.
    idx2val : dict
        Index -> word mapping. Only populated by ``complete`` or
        ``load_tokenizer``.
    """

    def __init__(self, root):
        self.vocab = ['<start>', '<end>', '<unk>', '<pad>']
        # Index of the last token assigned; the next new word gets count + 1.
        self.count = 3
        self.idx2val = {}
        self.val2idx = {'<start>': 0, '<end>': 1, '<unk>': 2, '<pad>': 3}
        self.root = root

    def add(self, text):
        """Add every previously unseen word of ``text`` to the vocabulary.

        ``text`` is lower-cased, stripped and split on whitespace. Only
        ``vocab``, ``val2idx`` and ``count`` are updated; ``idx2val`` is
        rebuilt by ``complete``.
        """
        for word in text.lower().strip().split():
            # Plain membership test: does not trigger the default factory
            # when val2idx has already been turned into a defaultdict.
            if word not in self.val2idx:
                self.count += 1
                self.vocab.append(word)
                self.val2idx[word] = self.count

    def tokenize(self, fname):
        """Add all captions of the ids listed in ``fname`` to the vocabulary.

        ``fname`` is resolved relative to ``self.root`` and read with
        ``read_file``. Calls ``complete`` when done.
        """
        # NOTE(review): self.caption_df is not assigned anywhere in this
        # class; it must be set on the instance before calling tokenize(),
        # presumably as a DataFrame with 'id' and 'caption' columns - confirm.
        print(f'tokenizing file {fname}...')
        temp = read_file(os.path.join(self.root, fname))
        df = pd.DataFrame(temp, columns=['id'])
        for image_id in df['id']:
            matches = self.caption_df[self.caption_df['id'] == image_id]
            captions = matches.reset_index(drop=True)['caption']
            for caption in captions:
                self.add(caption)

        self.complete()

    def complete(self):
        """Finalize the vocabulary.

        Builds the index -> word mapping and wraps ``val2idx`` in a
        ``defaultdict`` whose factory is ``return2``, so looking up an unknown
        word yields 2 (the ``<unk>`` index).
        """
        self.idx2val = {idx: word for word, idx in self.val2idx.items()}
        self.val2idx = defaultdict(return2, self.val2idx)

    def pickle_tokenizer(self, fname):
        """Pickle ``idx2val``, ``val2idx`` and ``vocab`` to the file ``fname``."""
        print(f"saving to file {fname}")
        state_dict = {'idx2val': self.idx2val, 'val2idx': self.val2idx, 'vocab': self.vocab}
        with open(fname, 'wb') as f:
            pickle.dump(state_dict, f)

    def load_tokenizer(self, fname):
        """Restore the state written by ``pickle_tokenizer`` from ``fname``.

        ``count`` is re-derived from the loaded vocabulary so that later
        ``add`` calls continue after the last used index instead of reusing
        indices that are already taken.
        """
        print(f"loading from file {fname}...")
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # from a trusted source.
        with open(fname, 'rb') as f:
            state_dict = pickle.load(f)
        self.vocab = state_dict['vocab']
        self.val2idx = state_dict['val2idx']
        self.idx2val = state_dict['idx2val']
        # vocab is stored in index order, so the last used index is len - 1.
        self.count = len(self.vocab) - 1

    def __len__(self):
        """Return the vocabulary size, including the four special tokens."""
        return len(self.vocab)