# Image-Captioning / preprocess.py (renamed from utils.py, commit 077eba9)
import os
import pickle
import random
from collections import defaultdict
from urllib.request import urlretrieve

import pandas as pd
import requests
# Source list of IE User-Agent strings used to spoof request headers.
user_agents_url = 'https://raw.githubusercontent.com/danielmiessler/SecLists/master/Fuzzing/User-Agents/UserAgents-IE.txt'

os.makedirs('./tmp', exist_ok=True)
download_path = './tmp/user_agents.txt'

# Download the user-agent list once and cache it locally for later runs.
if not os.path.exists(download_path):
    urlretrieve(user_agents_url, download_path)

# Read one user-agent string per line. Using strip() instead of strip('\n')
# also removes the trailing '\r' of CRLF files, and blank lines are skipped
# so random.choice never picks an empty header value.
with open(download_path, 'r') as f:
    user_agents = [line.strip() for line in f if line.strip()]
def return_user_agent():
    """Pick a random User-Agent string and wrap it in a request-headers dict.

    Rotating the User-Agent makes scripted requests resemble ordinary
    browser traffic, reducing the chance of being blocked by the website.
    """
    # uniformly random choice from the module-level user_agents list
    return {'User-Agent': random.choice(user_agents)}
# Module-level on purpose: it serves as the default_factory of the
# tokenizer's val2idx defaultdict, and pickle can only serialize a
# defaultdict whose factory is importable by name.
def return2():
    """Return 2, the vocabulary index of the '<unk>' token."""
    return 2
class Tokenizer:
    """Tokenizer class for tokenizing captions in the Flicker8k dataset.

    Maintains a vocabulary and word<->index lookup tables. Indices 0-3 are
    reserved for the special tokens '<start>', '<end>', '<unk>', '<pad>'.

    Parameters
    ----------
    root : str
        root directory where dataset is stored
    """

    def __init__(self, root):
        self.root = root
        # The four special tokens occupy the first four indices.
        self.vocab = ['<start>', '<end>', '<unk>', '<pad>']
        self.val2idx = {'<start>': 0, '<end>': 1, '<unk>': 2, '<pad>': 3}
        self.idx2val = {}
        # Index of the most recently assigned vocabulary entry.
        self.count = 3

    def add(self, text):
        """Add every previously unseen word of *text* to the vocabulary."""
        for word in text.lower().strip().split():
            if word in self.val2idx:
                continue
            self.count += 1
            self.vocab.append(word)
            self.val2idx[word] = self.count

    def tokenize(self, fname):
        """Build the vocabulary from every caption of the images listed in *fname*.

        NOTE(review): depends on ``read_file``, ``pd`` and ``self.caption_df``
        being provided elsewhere in the project — confirm before running.
        """
        print(f'tokenizing file {fname}...')
        ids = pd.DataFrame(read_file(os.path.join(self.root, fname)), columns=['id'])
        for image_id in ids['id']:
            matches = self.caption_df[self.caption_df['id'] == image_id]
            for caption in matches.reset_index(drop=True)['caption']:
                self.add(caption)
        self.complete()

    def complete(self):
        """Finalize lookups: build idx2val and make val2idx default to '<unk>'."""
        # Invert word->index into index->word.
        self.idx2val = {index: word for word, index in self.val2idx.items()}
        # From now on, unknown words resolve to index 2 ('<unk>').
        self.val2idx = defaultdict(return2, self.val2idx)

    def pickle_tokenizer(self, fname):
        """Serialize the vocabulary and both lookup tables to *fname*."""
        print(f"saving to file {fname}")
        state_dict = {'idx2val': self.idx2val, 'val2idx': self.val2idx, 'vocab': self.vocab}
        with open(fname, 'wb') as f:
            pickle.dump(state_dict, f)

    def load_tokenizer(self, fname):
        """Restore state previously written by :meth:`pickle_tokenizer`."""
        print(f"loading from file {fname}...")
        with open(fname, 'rb') as f:
            state_dict = pickle.load(f)
        self.vocab = state_dict['vocab']
        self.val2idx = state_dict['val2idx']
        self.idx2val = state_dict['idx2val']

    def __len__(self):
        """Vocabulary size, special tokens included."""
        return len(self.vocab)