# Image-Captioning / preprocess.py (renamed from utils.py, commit 077eba9)
import os
import pickle
import random
from collections import defaultdict
from urllib.request import urlretrieve

import pandas as pd
import requests
# Source list of IE User-Agent strings used to spoof request headers.
user_agents_url = 'https://raw.githubusercontent.com/danielmiessler/SecLists/master/Fuzzing/User-Agents/UserAgents-IE.txt'

os.makedirs('./tmp', exist_ok=True)
download_path = './tmp/user_agents.txt'

# Download the user-agent list once and cache it locally for later runs.
if not os.path.exists(download_path):
    urlretrieve(user_agents_url, download_path)

# Read one user-agent string per line. Using strip() instead of strip('\n')
# also removes the trailing '\r' of CRLF files, and blank lines are skipped
# so random.choice never picks an empty header value.
with open(download_path, 'r') as f:
    user_agents = [line.strip() for line in f if line.strip()]
def return_user_agent():
    """Pick a random User-Agent string and wrap it in a request-headers dict.

    Rotating the User-Agent makes scripted requests resemble ordinary
    browser traffic, reducing the chance of being blocked by the website.
    """
    # uniformly random choice from the module-level user_agents list
    return {'User-Agent': random.choice(user_agents)}
# Module-level on purpose: it serves as the default_factory of the
# tokenizer's val2idx defaultdict, and pickle can only serialize a
# defaultdict whose factory is importable by name.
def return2():
    """Return 2, the vocabulary index of the '<unk>' token."""
    return 2
class Tokenizer:
    """Tokenizer class for tokenizing captions in the Flicker8k dataset.

    Maintains a vocabulary and word<->index lookup tables. Indices 0-3 are
    reserved for the special tokens '<start>', '<end>', '<unk>', '<pad>'.

    Parameters
    ----------
    root : str
        root directory where dataset is stored
    """

    def __init__(self, root):
        self.root = root
        # The four special tokens occupy the first four indices.
        self.vocab = ['<start>', '<end>', '<unk>', '<pad>']
        self.val2idx = {'<start>': 0, '<end>': 1, '<unk>': 2, '<pad>': 3}
        self.idx2val = {}
        # Index of the most recently assigned vocabulary entry.
        self.count = 3

    def add(self, text):
        """Add every previously unseen word of *text* to the vocabulary."""
        for word in text.lower().strip().split():
            if word in self.val2idx:
                continue
            self.count += 1
            self.vocab.append(word)
            self.val2idx[word] = self.count

    def tokenize(self, fname):
        """Build the vocabulary from every caption of the images listed in *fname*.

        NOTE(review): depends on ``read_file``, ``pd`` and ``self.caption_df``
        being provided elsewhere in the project — confirm before running.
        """
        print(f'tokenizing file {fname}...')
        ids = pd.DataFrame(read_file(os.path.join(self.root, fname)), columns=['id'])
        for image_id in ids['id']:
            matches = self.caption_df[self.caption_df['id'] == image_id]
            for caption in matches.reset_index(drop=True)['caption']:
                self.add(caption)
        self.complete()

    def complete(self):
        """Finalize lookups: build idx2val and make val2idx default to '<unk>'."""
        # Invert word->index into index->word.
        self.idx2val = {index: word for word, index in self.val2idx.items()}
        # From now on, unknown words resolve to index 2 ('<unk>').
        self.val2idx = defaultdict(return2, self.val2idx)

    def pickle_tokenizer(self, fname):
        """Serialize the vocabulary and both lookup tables to *fname*."""
        print(f"saving to file {fname}")
        state_dict = {'idx2val': self.idx2val, 'val2idx': self.val2idx, 'vocab': self.vocab}
        with open(fname, 'wb') as f:
            pickle.dump(state_dict, f)

    def load_tokenizer(self, fname):
        """Restore state previously written by :meth:`pickle_tokenizer`."""
        print(f"loading from file {fname}...")
        with open(fname, 'rb') as f:
            state_dict = pickle.load(f)
        self.vocab = state_dict['vocab']
        self.val2idx = state_dict['val2idx']
        self.idx2val = state_dict['idx2val']

    def __len__(self):
        """Vocabulary size, special tokens included."""
        return len(self.vocab)