import os
import pickle
import random
from collections import defaultdict
from urllib.request import urlretrieve

import pandas as pd
import requests

user_agents_url = 'https://raw.githubusercontent.com/danielmiessler/SecLists/master/Fuzzing/User-Agents/UserAgents-IE.txt'

os.makedirs('./tmp', exist_ok=True)
download_path = './tmp/user_agents.txt'

# download the user-agent list once and cache it locally
if not os.path.exists(download_path):
    urlretrieve(user_agents_url, download_path)

# read the cached user-agent list, one agent per line
with open(download_path, 'r') as f:
    user_agents = f.read().splitlines()

def return_user_agent():
    """Return a headers dict with a randomly chosen User-Agent string."""
    ua = random.choice(user_agents)
    # rotating the User-Agent makes requests look like regular browser
    # traffic, so the scraper is less likely to be blocked by the website
    headers = {
        'User-Agent': ua
    }
    return headers
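
# example usage of return_user_agent (the URL below is a placeholder for
# illustration, not part of this project):
#
#     response = requests.get('https://example.com', headers=return_user_agent())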


def return2():
    """Default factory for the vocab defaultdict: unknown words map to
    index 2 ('<unk>'). A named function is used so the dict stays picklable."""
    return 2

class Tokenizer:
    """Tokenizer for captions in the Flickr8k dataset.

    Builds word-to-index (val2idx) and index-to-word (idx2val) mappings
    over the caption vocabulary.

    Parameters
    ----------
    root : str
        root directory where the dataset is stored

    """

    def __init__(self, root):
        self.vocab = ['<start>', '<end>', '<unk>', '<pad>']
        self.count = 3  # index of the last token added to the vocabulary
        self.idx2val = {}
        self.val2idx = {'<start>': 0, '<end>': 1, '<unk>': 2, '<pad>': 3}
        self.root = root

    def add(self, text):
        """Add every previously unseen word in `text` to the vocabulary."""
        for word in text.lower().strip().split():
            if word not in self.val2idx:
                self.count += 1
                self.vocab.append(word)
                self.val2idx[word] = self.count

    def tokenize(self, fname):
        """Build the vocabulary from the captions of the images listed in `fname`."""
        print(f'tokenizing file {fname}...')
        # read_file is assumed to be defined elsewhere in this project and to
        # return the image ids listed in fname; self.caption_df must also be
        # assigned (a DataFrame with 'id' and 'caption' columns) before calling
        temp = read_file(os.path.join(self.root, fname))
        df = pd.DataFrame(temp, columns=['id'])
        for i in df['id']:
            captions = self.caption_df[self.caption_df['id'] == i].reset_index(drop=True)['caption']
            for caption in captions:
                self.add(caption)

        self.complete()

    def complete(self):
        # invert word -> index into index -> word, then let lookups of
        # unseen words fall back to index 2 ('<unk>') via return2
        self.idx2val = {idx: word for word, idx in self.val2idx.items()}
        self.val2idx = defaultdict(return2, self.val2idx)

    def pickle_tokenizer(self, fname):
        """Serialize the vocabulary and both mappings to `fname`."""
        print(f"saving to file {fname}")
        with open(fname, 'wb') as f:
            state_dict = {'idx2val': self.idx2val, 'val2idx': self.val2idx, 'vocab': self.vocab}
            pickle.dump(state_dict, f)

    def load_tokenizer(self, fname):
        """Restore state previously saved by pickle_tokenizer."""
        print(f"loading from file {fname}...")
        with open(fname, 'rb') as f:
            state_dict = pickle.load(f)
            self.vocab = state_dict['vocab']
            self.val2idx = state_dict['val2idx']
            self.idx2val = state_dict['idx2val']

    def __len__(self):
        return len(self.vocab)
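
# a minimal, self-contained sanity check (illustrative sketch, not part of the
# training pipeline): it bypasses tokenize(), which needs the Flickr8k files on
# disk, and exercises add()/complete() directly on raw strings
if __name__ == '__main__':
    tok = Tokenizer(root='.')  # root is unused by add()/complete()
    tok.add('A dog runs across the grass')
    tok.add('A child plays in the park')
    tok.complete()
    print(len(tok))              # 14: 4 special tokens + 10 distinct words
    print(tok.val2idx['dog'])    # 5: first caption's words get indices 4-9
    print(tok.val2idx['zebra'])  # 2: unseen words fall back to '<unk>'
    print(tok.idx2val[5])        # 'dog'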