# NOTE: the original paste began with non-Python export artifacts
# ("Spaces:" and two "Build error" lines); converted to this comment
# so the module parses.
import os
import pickle
import random
from collections import defaultdict
from urllib.request import urlretrieve

import pandas as pd
import requests
# Public list of IE user-agent strings (SecLists project).
user_agents_url = 'https://raw.githubusercontent.com/danielmiessler/SecLists/master/Fuzzing/User-Agents/UserAgents-IE.txt'

os.makedirs('./tmp', exist_ok=True)
download_path = './tmp/user_agents.txt'

# Fetch the list only once; later runs reuse the cached copy on disk.
if not os.path.exists(download_path):
    urlretrieve(user_agents_url, download_path)

# Load the cached file, dropping the trailing newline from every entry.
with open(download_path, 'r') as f:
    user_agents = [line.strip('\n') for line in f]
def return_user_agent():
    """Build a request-headers dict carrying a randomly chosen User-Agent.

    Picks one entry from the module-level ``user_agents`` list on each
    call, so repeated requests do not all present the same identity.
    """
    # Rotating the User-Agent makes traffic look like ordinary browser
    # visits, reducing the chance of being blocked by the target site.
    return {'User-Agent': random.choice(user_agents)}
def return2():
    """Return 2, the vocabulary index of the '<unk>' token.

    Used as the default factory when ``val2idx`` is wrapped in a
    ``defaultdict``, so unseen words resolve to '<unk>'.
    """
    return 2
class Tokenizer:
    """Tokenizer for captions in the Flickr8k dataset.

    Builds a word-level vocabulary from caption text.  Four special
    tokens occupy fixed indices: '<start>' (0), '<end>' (1),
    '<unk>' (2) and '<pad>' (3).

    Parameters
    ----------
    root : str
        Root directory where the dataset split files are stored.
    """

    def __init__(self, root):
        # `count` always holds the highest index assigned so far, so the
        # next new word receives count + 1 (invariant: count == len(vocab) - 1).
        self.vocab = ['<start>', '<end>', '<unk>', '<pad>']
        self.count = 3
        self.idx2val = {}
        self.val2idx = {'<start>': 0, '<end>': 1, '<unk>': 2, '<pad>': 3}
        self.root = root

    def add(self, text):
        """Add every unseen word of `text` to the vocabulary.

        Text is lower-cased, stripped and whitespace-split; each new
        word is assigned the next free index.
        """
        for word in text.lower().strip().split():
            if word not in self.val2idx:
                self.count += 1
                self.vocab.append(word)
                self.val2idx[word] = self.count

    def tokenize(self, fname):
        """Build the vocabulary from all captions of the images listed
        in `fname` (resolved against `self.root`), then finalize.

        NOTE(review): relies on a `read_file` helper and on
        `self.caption_df` being assigned elsewhere — neither is defined
        in this class; confirm against the rest of the module.
        """
        print(f'tokenizing file {fname}...')
        temp = read_file(os.path.join(self.root, fname))
        df = pd.DataFrame(temp, columns=['id'])
        for i in df['id']:
            captions = self.caption_df[self.caption_df['id'] == i].reset_index(drop=True)['caption']
            for caption in captions:
                self.add(caption)
        self.complete()

    def complete(self):
        """Finalize the vocabulary.

        Builds the reverse index->word mapping and makes word->index
        lookups default to index 2 ('<unk>') for unseen words.
        """
        self.idx2val = {index: word for word, index in self.val2idx.items()}
        # return2() yields 2, the index of '<unk>'.
        self.val2idx = defaultdict(return2, self.val2idx)

    def pickle_tokenizer(self, fname):
        """Serialize the tokenizer state (including `count`) to `fname`."""
        print(f"saving to file {fname}")
        state_dict = {
            'idx2val': self.idx2val,
            'val2idx': self.val2idx,
            'vocab': self.vocab,
            'count': self.count,
        }
        with open(fname, 'wb') as f:
            pickle.dump(state_dict, f)

    def load_tokenizer(self, fname):
        """Restore tokenizer state from `fname`.

        Also restores `count` (bug fix: it was previously left stale
        after loading, so a later add() would reuse already-assigned
        indices).  Falls back to len(vocab) - 1 for pickles written
        before `count` was saved, relying on the class invariant.
        """
        print(f"loading from file {fname}...")
        with open(fname, 'rb') as f:
            state_dict = pickle.load(f)
        self.vocab = state_dict['vocab']
        self.val2idx = state_dict['val2idx']
        self.idx2val = state_dict['idx2val']
        self.count = state_dict.get('count', len(self.vocab) - 1)

    def __len__(self):
        """Return the vocabulary size (special tokens included)."""
        return len(self.vocab)